From 9326926bef943245c244eb0e6129ae046a3719a9 Mon Sep 17 00:00:00 2001 From: Taras Glek Date: Wed, 26 Feb 2020 09:39:52 -0800 Subject: [PATCH 01/42] NFS engine --- HOWTO | 13 +- Makefile | 6 + configure | 28 ++++ engines/nfs.c | 351 +++++++++++++++++++++++++++++++++++++++++++++++ examples/nfs.fio | 23 ++++ fio.1 | 10 ++ optgroup.c | 4 + optgroup.h | 2 + options.c | 5 + 9 files changed, 441 insertions(+), 1 deletion(-) create mode 100644 engines/nfs.c create mode 100644 examples/nfs.fio diff --git a/HOWTO b/HOWTO index 2788670ddb..367164b117 100644 --- a/HOWTO +++ b/HOWTO @@ -1168,7 +1168,7 @@ I/O type **1** Backward-compatible alias for **mixed**. - + **2** Alias for **both**. @@ -2091,6 +2091,12 @@ I/O engine I/O engine supporting asynchronous read and write operations to the DAOS File System (DFS) via libdfs. + **nfs** + I/O engine supporting asynchronous read and write operations to + NFS filesystems from userspace via libnfs. This is useful for + achieving higher concurrency and thus throughput than is possible + via kernel NFS. + I/O engine specific parameters ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2508,6 +2514,11 @@ with the caveat that when used on the command line, they must come after the Specificy a different object class for the dfs file. Use DAOS container's object class by default. +.. option:: nfs_url=str : [nfs] + + URL in libnfs format, eg nfs:///path[?arg=val[&arg=val]*] + Refer to the libnfs README for more details. + I/O depth ~~~~~~~~~ diff --git a/Makefile b/Makefile index fce3d0d134..78a369eb3e 100644 --- a/Makefile +++ b/Makefile @@ -79,6 +79,12 @@ ifdef CONFIG_LIBNBD ENGINES += nbd endif +ifdef CONFIG_LIBNFS + CFLAGS += $(LIBNFS_CFLAGS) + LIBS += $(LIBNFS_LIBS) + SOURCE += engines/nfs.c +endif + ifdef CONFIG_64BIT CPPFLAGS += -DBITS_PER_LONG=64 else ifdef CONFIG_32BIT diff --git a/configure b/configure index a7d82be06b..a9f0c033e4 100755 --- a/configure +++ b/configure @@ -172,6 +172,7 @@ libiscsi="no" libnbd="no" libzbc="" dfs="" +libnfs="no" dynamic_engines="no" prefix=/usr/local @@ -241,6 +242,8 @@ for opt do ;; --disable-tcmalloc) disable_tcmalloc="yes" ;; + --enable-libnfs) libnfs="yes" + ;; --dynamic-libengines) dynamic_engines="yes" ;; --disable-dfs) dfs="no" @@ -273,6 +276,7 @@ if test "$show_help" = "yes" ; then echo "--disable-http Disable HTTP support even if found" echo "--disable-gfapi Disable gfapi" echo "--enable-libhdfs Enable hdfs support" + echo "--enable-libnfs Enable nfs support" echo "--disable-lex Disable use of lex/yacc for math" echo "--disable-pmem Disable pmem based engines even if found" echo "--enable-lex Enable use of lex/yacc for math" @@ -2276,6 +2280,21 @@ EOF fi fi print_config "DAOS File System (dfs) Engine" "$dfs" +# Check if we have libnfs (for nfs support). 
+if test "$libnfs" = "yes" ; then + if $(pkg-config libnfs); then + libnfs="yes" + libnfs_cflags=$(pkg-config --cflags libnfs) + # libnfs_libs=$(pkg-config --libs libnfs) + libnfs_libs=/usr/local/lib/libnfs.a + else + if test "$libnfs" = "yes" ; then + echo "libnfs" "Install libnfs" + fi + libnfs="no" + fi +fi +print_config "nfs engine" "$libnfs" ########################################## # Check if we have lex/yacc available @@ -3101,6 +3120,9 @@ fi if test "$dfs" = "yes" ; then output_sym "CONFIG_DFS" fi +if test "$libnfs" = "yes" ; then + output_sym "CONFIG_NFS" +fi if test "$march_set" = "no" && test "$build_native" = "yes" ; then output_sym "CONFIG_BUILD_NATIVE" fi @@ -3140,6 +3162,12 @@ if test "$libnbd" = "yes" ; then echo "LIBNBD_CFLAGS=$libnbd_cflags" >> $config_host_mak echo "LIBNBD_LIBS=$libnbd_libs" >> $config_host_mak fi +if test "$libnfs" = "yes" ; then + output_sym "CONFIG_LIBNFS" + echo "CONFIG_LIBNFS=m" >> $config_host_mak + echo "LIBNFS_CFLAGS=$libnfs_cflags" >> $config_host_mak + echo "LIBNFS_LIBS=$libnfs_libs" >> $config_host_mak +fi if test "$dynamic_engines" = "yes" ; then output_sym "CONFIG_DYNAMIC_ENGINES" fi diff --git a/engines/nfs.c b/engines/nfs.c new file mode 100644 index 0000000000..df09477600 --- /dev/null +++ b/engines/nfs.c @@ -0,0 +1,351 @@ +// https://github.com/axboe/fio/pull/762 sample pull req for new engine +#include +#include +#include +#include +#include + +#include "../fio.h" +#include "../optgroup.h" + +enum nfs_op_type { + NFS_READ_WRITE = 0, + NFS_STAT_MKDIR_RMDIR, + NFS_STAT_TOUCH_RM, +}; + +struct fio_libnfs_options { + struct nfs_context *context; + char *nfs_url; + // the following implements a circular queue of outstanding IOs + int outstanding_events; // IOs issued to libnfs, that have not returned yet + int prev_requested_event_index; // event last returned via fio_libnfs_event + int next_buffered_event; // round robin-pointer within events[] + int buffered_event_count; // IOs completed by libnfs faiting for FIO + int free_event_buffer_index; // next empty buffer + unsigned int queue_depth; // nfs_callback needs this info, but doesn't have fio td structure to pull it from + struct io_u**events; +}; + +struct nfs_data { + struct nfsfh *nfsfh; + struct fio_libnfs_options *options; +}; + +static struct fio_option options[] = { + { + .name = "nfs_url", + .lname = "nfs_url", + .type = FIO_OPT_STR_STORE, + .help = "URL in libnfs format, eg nfs:///path[?arg=val[&arg=val]*]", + .off1 = offsetof(struct fio_libnfs_options, nfs_url), + .category = FIO_OPT_C_ENGINE, + .group = __FIO_OPT_G_NFS, + }, + { + .name = NULL, + }, +}; + +/* + * The ->event() hook is called to match an event number with an io_u. + * After the core has called ->getevents() and it has returned eg 3, + * the ->event() hook must return the 3 events that have completed for + * subsequent calls to ->event() with [0-2]. Required. 
+ */ +static struct io_u *fio_libnfs_event(struct thread_data *td, int event) +{ + struct fio_libnfs_options *o = td->eo; + struct io_u *io_u = o->events[o->next_buffered_event]; + assert(o->events[o->next_buffered_event]); + o->events[o->next_buffered_event] = NULL; + o->next_buffered_event = (o->next_buffered_event + 1) % td->o.iodepth; + // validate our state machine + assert(o->buffered_event_count); + o->buffered_event_count--; + assert(io_u); + // assert that fio_libnfs_event is being called in sequential fashion + assert(event == 0 || o->prev_requested_event_index + 1 == event); + if (o->buffered_event_count == 0) { + o->prev_requested_event_index = -1; + } else { + o->prev_requested_event_index = event; + } + return io_u; +} + +static int nfs_event_loop(struct thread_data *td, bool flush) { + struct fio_libnfs_options *o = td->eo; + struct pollfd pfds[1]; /* nfs:0 */ + // we already have stuff queued for fio, no need to waste cpu on poll() + if (o->buffered_event_count) { + return o->buffered_event_count; + } + // fio core logic seems to stop calling this event-loop if we ever return with 0 events + #define SHOULD_WAIT() (o->outstanding_events == td->o.iodepth || (flush && o->outstanding_events)) + + do { + int timeout = SHOULD_WAIT() ? -1 : 0; + int ret = 0; + pfds[0].fd = nfs_get_fd(o->context); + pfds[0].events = nfs_which_events(o->context); + ret = poll(&pfds[0], 1, timeout); + if (ret < 0) { + if (errno == EINTR || errno == EAGAIN) { + continue; + } + log_err("nfs: failed to poll events: %s.\n", + strerror(errno)); + break; + } + + ret = nfs_service(o->context, pfds[0].revents); + if (ret < 0) { + log_err("nfs: socket is in an unrecoverable error state.\n"); + break; + } + } while (SHOULD_WAIT()); + return o->buffered_event_count; +} +#undef SHOULD_WAIT + +/* + * The ->getevents() hook is used to reap completion events from an async + * io engine. It returns the number of completed events since the last call, + * which may then be retrieved by calling the ->event() hook with the event + * numbers. Required. 
+ */ +static int fio_libnfs_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + return nfs_event_loop(td, false); +} + +static void nfs_callback(int res, struct nfs_context *nfs, void *data, + void *private_data) +{ + struct io_u *io_u = private_data; + struct nfs_data *nfs_data = io_u->file->engine_data; + struct fio_libnfs_options *o = nfs_data->options; + if (res < 0) { + log_err("Failed NFS operation(code:%d): %s\n", res, nfs_get_error(o->context)); + io_u->error = -res; + // res is used for read math below, don't wanna pass negative there + res = 0; + } else if (io_u->ddir == DDIR_READ) { + memcpy(io_u->buf, data, res); + if (res == 0) { + log_err("Got NFS EOF, this is probably not expected\n"); + } + } + // fio uses resid to track remaining data + io_u->resid = io_u->xfer_buflen - res; + + assert(!o->events[o->free_event_buffer_index]); + o->events[o->free_event_buffer_index] = io_u; + o->free_event_buffer_index = (o->free_event_buffer_index + 1) % o->queue_depth; + o->outstanding_events--; + o->buffered_event_count++; +} + +static int queue_write(struct fio_libnfs_options *o, struct io_u *io_u) { + struct nfs_data *nfs_data = io_u->engine_data; + return nfs_pwrite_async(o->context, nfs_data->nfsfh, + io_u->offset, io_u->buflen, io_u->buf, nfs_callback, + io_u); +} + +static int queue_read(struct fio_libnfs_options *o, struct io_u *io_u) { + struct nfs_data *nfs_data = io_u->engine_data; + return nfs_pread_async(o->context, nfs_data->nfsfh, io_u->offset, io_u->buflen, nfs_callback, io_u); +} + +/* + * The ->queue() hook is responsible for initiating io on the io_u + * being passed in. If the io engine is a synchronous one, io may complete + * before ->queue() returns. Required. + * + * The io engine must transfer in the direction noted by io_u->ddir + * to the buffer pointed to by io_u->xfer_buf for as many bytes as + * io_u->xfer_buflen. Residual data count may be set in io_u->resid + * for a short read/write. 
+ */ +static enum fio_q_status fio_libnfs_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct nfs_data *nfs_data = io_u->file->engine_data; + struct fio_libnfs_options *o = nfs_data->options; + struct nfs_context *nfs = o->context; + int err; + enum fio_q_status ret = FIO_Q_QUEUED; + + io_u->engine_data = nfs_data; + switch(io_u->ddir) { + case DDIR_WRITE: + err = queue_write(o, io_u); + break; + case DDIR_READ: + err = queue_read(o, io_u); + break; + case DDIR_TRIM: + log_err("nfs: trim is not supported"); + err = -1; + break; + default: + log_err("nfs: unhandled io %d\n", io_u->ddir); + err = -1; + } + if (err) { + log_err("nfs: Failed to queue nfs op: %s\n", nfs_get_error(nfs)); + td->error = 1; + return FIO_Q_COMPLETED; + } + o->outstanding_events++; + return ret; +} + +/** Do a mount if one has not been done before */ +static int do_mount(struct thread_data *td, const char *url) +{ + size_t event_size = sizeof(struct io_u **) * td->o.iodepth; + struct fio_libnfs_options *options = td->eo; + struct nfs_url *nfs_url = NULL; + int ret = 0; + int path_len = 0; + char *mnt_dir = NULL; + + if (options->context) { + return 0; + } + + options->context = nfs_init_context(); + if (options->context == NULL) { + log_err("nfs: failed to init nfs context\n"); + return -1; + } + + options->events = malloc(event_size); + memset(options->events, 0, event_size); + + options->prev_requested_event_index = -1; + options->queue_depth = td->o.iodepth; + + nfs_url = nfs_parse_url_full(options->context, url); + path_len = strlen(nfs_url->path); + mnt_dir = malloc(path_len + strlen(nfs_url->file) + 1); + strcpy(mnt_dir, nfs_url->path); + strcpy(mnt_dir + strlen(nfs_url->path), nfs_url->file); + ret = nfs_mount(options->context, nfs_url->server, mnt_dir); + free(mnt_dir); + nfs_destroy_url(nfs_url); + return ret; +} + +/* + * The init function is called once per thread/process, and should set up + * any structures that this io engine requires to keep track of io. Not + * required. + */ +static int fio_libnfs_setup(struct thread_data *td) +{ + // flipping this makes using gdb easier, but tends to hang fio on exit + td->o.use_thread = 0; + return 0; +} + +/* + * This is paired with the ->init() function and is called when a thread is + * done doing io. Should tear down anything setup by the ->init() function. + * Not required. 
+ */ +static void fio_libnfs_cleanup(struct thread_data *td) +{ + struct fio_libnfs_options *o = td->eo; + nfs_umount(o->context); + nfs_destroy_context(o->context); + free(o->events); +} + +static int fio_libnfs_open(struct thread_data *td, struct fio_file *f) +{ + int ret; + struct fio_libnfs_options *options = td->eo; + struct nfs_data *nfs_data = NULL; + int flags = 0; + + if (!options->nfs_url) { + log_err("nfs: nfs_url is a required parameter\n"); + return -1; + } + + ret = do_mount(td, options->nfs_url); + + if (ret != 0) { + log_err("nfs: Failed to mount %s with code %d: %s\n", options->nfs_url, ret, nfs_get_error(options->context)); + return ret; + } + nfs_data = malloc(sizeof(struct nfs_data)); + memset(nfs_data, 0, sizeof(struct nfs_data)); + nfs_data->options = options; + + if (td->o.td_ddir == TD_DDIR_WRITE) { + flags |= O_CREAT | O_RDWR; + } else { + flags |= O_RDWR; + } + ret = nfs_open(options->context, f->file_name, flags, &nfs_data->nfsfh); + + if (ret != 0) { + log_err("Failed to open %s: %s\n", f->file_name, nfs_get_error(options->context)); + } + f->engine_data = nfs_data; + return ret; +} + +static int fio_libnfs_close(struct thread_data *td, struct fio_file *f) +{ + struct nfs_data *nfs_data = f->engine_data; + struct fio_libnfs_options *o = nfs_data->options; + int ret = 0; + if (nfs_data->nfsfh) { + ret = nfs_close(o->context, nfs_data->nfsfh); + } + free(nfs_data); + f->engine_data = NULL; + return ret; +} + +/* + * Hook for writing out outstanding data. + */ +static int fio_libnfs_commit(struct thread_data *td) { + nfs_event_loop(td, true); + return 0; +} + +struct ioengine_ops ioengine = { + .name = "nfs", + .version = FIO_IOOPS_VERSION, + .setup = fio_libnfs_setup, + .queue = fio_libnfs_queue, + .getevents = fio_libnfs_getevents, + .event = fio_libnfs_event, + .cleanup = fio_libnfs_cleanup, + .open_file = fio_libnfs_open, + .close_file = fio_libnfs_close, + .commit = fio_libnfs_commit, + .flags = FIO_DISKLESSIO | FIO_NOEXTEND | FIO_NODISKUTIL, + .options = options, + .option_struct_size = sizeof(struct fio_libnfs_options), +}; + +static void fio_init fio_nfs_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_nfs_unregister(void) +{ + unregister_ioengine(&ioengine); +} + diff --git a/examples/nfs.fio b/examples/nfs.fio new file mode 100644 index 0000000000..2449f4154d --- /dev/null +++ b/examples/nfs.fio @@ -0,0 +1,23 @@ +[global] +nfs_url=nfs://127.0.0.1/nfs +blocksize=524288 +iodepth=10 +ioengine=nfs +size=104857600 +lat_percentiles=1 +group_reporting +numjobs=10 +direct=1 +ramp_time=5s +filename_format=myfiles.$clientuid.$jobnum.$filenum +time_based=1 + +[write] +rw=write +runtime=10s +stonewall + +[read] +wait_for=write +rw=randread +runtime=10s \ No newline at end of file diff --git a/fio.1 b/fio.1 index f959e00d01..b12381b584 100644 --- a/fio.1 +++ b/fio.1 @@ -1882,6 +1882,12 @@ not be \fBcudamalloc\fR. This ioengine defines engine specific options. .B dfs I/O engine supporting asynchronous read and write operations to the DAOS File System (DFS) via libdfs. +.TP +.B nfs +I/O engine supporting asynchronous read and write operations to +NFS filesystems from userspace via libnfs. This is useful for +achieving higher concurrency and thus throughput than is possible +via kernel NFS. .SS "I/O engine specific parameters" In addition, there are some parameters which are only valid when a specific \fBioengine\fR is in use. 
These are used identically to normal parameters, @@ -2260,6 +2266,10 @@ Use DAOS container's chunk size by default. .BI (dfs)object_class Specificy a different object class for the dfs file. Use DAOS container's object class by default. +.TP +.BI (nfs)nfs_url +URL in libnfs format, eg nfs:///path[?arg=val[&arg=val]*] +Refer to the libnfs README for more details. .SS "I/O depth" .TP .BI iodepth \fR=\fPint diff --git a/optgroup.c b/optgroup.c index 15a16229ef..bebb4a5133 100644 --- a/optgroup.c +++ b/optgroup.c @@ -185,6 +185,10 @@ static const struct opt_group fio_opt_cat_groups[] = { .name = "DAOS File System (dfs) I/O engine", /* dfs */ .mask = FIO_OPT_G_DFS, }, + { + .name = "NFS I/O engine", /* nfs */ + .mask = FIO_OPT_G_NFS, + }, { .name = NULL, }, diff --git a/optgroup.h b/optgroup.h index ff74862968..1fb84a296b 100644 --- a/optgroup.h +++ b/optgroup.h @@ -70,6 +70,7 @@ enum opt_category_group { __FIO_OPT_G_NR, __FIO_OPT_G_LIBCUFILE, __FIO_OPT_G_DFS, + __FIO_OPT_G_NFS, FIO_OPT_G_RATE = (1ULL << __FIO_OPT_G_RATE), FIO_OPT_G_ZONE = (1ULL << __FIO_OPT_G_ZONE), @@ -110,6 +111,7 @@ enum opt_category_group { FIO_OPT_G_INVALID = (1ULL << __FIO_OPT_G_NR), FIO_OPT_G_ISCSI = (1ULL << __FIO_OPT_G_ISCSI), FIO_OPT_G_NBD = (1ULL << __FIO_OPT_G_NBD), + FIO_OPT_G_NFS = (1ULL << __FIO_OPT_G_NFS), FIO_OPT_G_IOURING = (1ULL << __FIO_OPT_G_IOURING), FIO_OPT_G_FILESTAT = (1ULL << __FIO_OPT_G_FILESTAT), FIO_OPT_G_LIBCUFILE = (1ULL << __FIO_OPT_G_LIBCUFILE), diff --git a/options.c b/options.c index ddabaa82d2..b82a10aa44 100644 --- a/options.c +++ b/options.c @@ -2025,6 +2025,11 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { { .ival = "dfs", .help = "DAOS File System (dfs) IO engine", }, +#endif +#ifdef CONFIG_NFS + { .ival = "nfs", + .help = "NFS IO engine", + }, #endif }, }, From ebcdccdeeec1673b8f7b12c4176d19982ddad7cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Stolarczuk?= Date: Mon, 11 Jan 2021 13:41:54 +0100 Subject: [PATCH 02/42] engines/libpmem: set file open/create mode always to RW MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit previously, when created file with a 'write' job it couldn't be open later on, when a 'read' job was ran. Signed-off-by: Łukasz Stolarczuk --- engines/libpmem.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/engines/libpmem.c b/engines/libpmem.c index 2338f0fa24..364e384de7 100644 --- a/engines/libpmem.c +++ b/engines/libpmem.c @@ -2,7 +2,7 @@ * libpmem: IO engine that uses PMDK libpmem to read and write data * * Copyright (C) 2017 Nippon Telegraph and Telephone Corporation. 
- * Copyright 2018-2020, Intel Corporation + * Copyright 2018-2021, Intel Corporation * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License, @@ -97,17 +97,10 @@ static int fio_libpmem_file(struct thread_data *td, struct fio_file *f, size_t length, off_t off) { struct fio_libpmem_data *fdd = FILE_ENG_DATA(f); - mode_t mode = 0; + mode_t mode = S_IWUSR | S_IRUSR; size_t mapped_len; int is_pmem; - if(td_rw(td)) - mode = S_IWUSR | S_IRUSR; - else if (td_write(td)) - mode = S_IWUSR; - else - mode = S_IRUSR; - dprint(FD_IO, "DEBUG fio_libpmem_file\n"); dprint(FD_IO, "f->file_name = %s td->o.verify = %d \n", f->file_name, td->o.verify); From 0e684e9d0e1605ce31977f697c97e0b78d393638 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Stolarczuk?= Date: Wed, 13 Jan 2021 17:43:03 +0100 Subject: [PATCH 03/42] engines/libpmem: cleanup a little code, comments and example MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Łukasz Stolarczuk --- engines/libpmem.c | 49 ++++++++++++++++++-------------------------- examples/libpmem.fio | 35 ++++++++++++++++--------------- 2 files changed, 38 insertions(+), 46 deletions(-) diff --git a/engines/libpmem.c b/engines/libpmem.c index 364e384de7..3502911257 100644 --- a/engines/libpmem.c +++ b/engines/libpmem.c @@ -18,7 +18,8 @@ /* * libpmem engine * - * IO engine that uses libpmem to write data (and memcpy to read) + * IO engine that uses libpmem (part of PMDK collection) to write data + * and libc's memcpy to read. It requires PMDK >= 1.5. * * To use: * ioengine=libpmem @@ -43,25 +44,13 @@ * mkdir /mnt/pmem0 * mount -o dax /dev/pmem0 /mnt/pmem0 * - * See examples/libpmem.fio for more. - * - * - * libpmem.so - * By default, the libpmem engine will let the system find the libpmem.so - * that it uses. You can use an alternative libpmem by setting the - * FIO_PMEM_LIB environment variable to the full path to the desired - * libpmem.so. This engine requires PMDK >= 1.5. + * See examples/libpmem.fio for complete usage example. */ #include -#include #include #include #include -#include -#include -#include -#include #include #include "../fio.h" @@ -77,8 +66,8 @@ static int fio_libpmem_init(struct thread_data *td) { struct thread_options *o = &td->o; - dprint(FD_IO,"o->rw_min_bs %llu \n o->fsync_blocks %u \n o->fdatasync_blocks %u \n", - o->rw_min_bs,o->fsync_blocks,o->fdatasync_blocks); + dprint(FD_IO, "o->rw_min_bs %llu\n o->fsync_blocks %u\n o->fdatasync_blocks %u\n", + o->rw_min_bs, o->fsync_blocks, o->fdatasync_blocks); dprint(FD_IO, "DEBUG fio_libpmem_init\n"); if ((o->rw_min_bs & page_mask) && @@ -91,7 +80,8 @@ static int fio_libpmem_init(struct thread_data *td) } /* - * This is the pmem_map_file execution function + * This is the pmem_map_file execution function, a helper to + * fio_libpmem_open_file function. 
*/ static int fio_libpmem_file(struct thread_data *td, struct fio_file *f, size_t length, off_t off) @@ -135,11 +125,11 @@ static int fio_libpmem_open_file(struct thread_data *td, struct fio_file *f) { struct fio_libpmem_data *fdd; - dprint(FD_IO,"DEBUG fio_libpmem_open_file\n"); - dprint(FD_IO,"f->io_size=%ld \n",f->io_size); - dprint(FD_IO,"td->o.size=%lld \n",td->o.size); - dprint(FD_IO,"td->o.iodepth=%d\n",td->o.iodepth); - dprint(FD_IO,"td->o.iodepth_batch=%d \n",td->o.iodepth_batch); + dprint(FD_IO, "DEBUG fio_libpmem_open_file\n"); + dprint(FD_IO, "f->io_size=%ld\n", f->io_size); + dprint(FD_IO, "td->o.size=%lld\n", td->o.size); + dprint(FD_IO, "td->o.iodepth=%d\n", td->o.iodepth); + dprint(FD_IO, "td->o.iodepth_batch=%d\n", td->o.iodepth_batch); if (fio_file_open(f)) td_io_close_file(td, f); @@ -160,8 +150,8 @@ static int fio_libpmem_prep(struct thread_data *td, struct io_u *io_u) struct fio_file *f = io_u->file; struct fio_libpmem_data *fdd = FILE_ENG_DATA(f); - dprint(FD_IO, "DEBUG fio_libpmem_prep\n" ); - dprint(FD_IO," io_u->offset %llu : fdd->libpmem_off %ld : " + dprint(FD_IO, "DEBUG fio_libpmem_prep\n"); + dprint(FD_IO, "io_u->offset %llu : fdd->libpmem_off %ld : " "io_u->buflen %llu : fdd->libpmem_sz %ld\n", io_u->offset, fdd->libpmem_off, io_u->buflen, fdd->libpmem_sz); @@ -185,8 +175,9 @@ static enum fio_q_status fio_libpmem_queue(struct thread_data *td, io_u->error = 0; dprint(FD_IO, "DEBUG fio_libpmem_queue\n"); - dprint(FD_IO,"td->o.odirect %d td->o.sync_io %d \n",td->o.odirect, td->o.sync_io); - /* map both O_SYNC / DSYNC to not using NODRAIN */ + dprint(FD_IO, "td->o.odirect %d td->o.sync_io %d\n", + td->o.odirect, td->o.sync_io); + /* map both O_SYNC / DSYNC to not use NODRAIN */ flags = td->o.sync_io ? 0 : PMEM_F_MEM_NODRAIN; flags |= td->o.odirect ? PMEM_F_MEM_NONTEMPORAL : PMEM_F_MEM_TEMPORAL; @@ -196,7 +187,7 @@ static enum fio_q_status fio_libpmem_queue(struct thread_data *td, break; case DDIR_WRITE: dprint(FD_IO, "DEBUG mmap_data=%p, xfer_buf=%p\n", - io_u->mmap_data, io_u->xfer_buf ); + io_u->mmap_data, io_u->xfer_buf); pmem_memcpy(io_u->mmap_data, io_u->xfer_buf, io_u->xfer_buflen, @@ -220,8 +211,8 @@ static int fio_libpmem_close_file(struct thread_data *td, struct fio_file *f) struct fio_libpmem_data *fdd = FILE_ENG_DATA(f); int ret = 0; - dprint(FD_IO,"DEBUG fio_libpmem_close_file\n"); - dprint(FD_IO,"td->o.odirect %d \n",td->o.odirect); + dprint(FD_IO, "DEBUG fio_libpmem_close_file\n"); + dprint(FD_IO, "td->o.odirect %d\n", td->o.odirect); if (!td->o.odirect) { dprint(FD_IO,"pmem_drain\n"); diff --git a/examples/libpmem.fio b/examples/libpmem.fio index 0ff681f071..3b854a32bf 100644 --- a/examples/libpmem.fio +++ b/examples/libpmem.fio @@ -1,6 +1,6 @@ [global] bs=4k -size=8g +size=10g ioengine=libpmem norandommap time_based @@ -17,16 +17,6 @@ thread numjobs=1 runtime=300 -# -# In case of 'scramble_buffers=1', the source buffer -# is rewritten with a random value every write operations. -# -# But when 'scramble_buffers=0' is set, the source buffer isn't -# rewritten. So it will be likely that the source buffer is in CPU -# cache and it seems to be high performance. -# -scramble_buffers=0 - # # depends on direct option, flags are set for pmem_memcpy() call: # direct=1 - PMEM_F_MEM_NONTEMPORAL, @@ -39,9 +29,19 @@ direct=1 # sync=1 +# +# In case of 'scramble_buffers=1', the source buffer +# is rewritten with a random value every write operation. +# +# But when 'scramble_buffers=0' is set, the source buffer isn't +# rewritten. 
So it will be likely that the source buffer is in CPU +# cache and it seems to be high write performance. +# +scramble_buffers=1 # -# Setting for fio process's CPU Node and Memory Node +# Setting for fio process's CPU Node and Memory Node. +# Set proper node below or use `numactl` command along with FIO. # numa_cpu_nodes=0 numa_mem_policy=bind:0 @@ -53,21 +53,22 @@ cpus_allowed_policy=split # # The libpmem engine does IO to files in a DAX-mounted filesystem. -# The filesystem should be created on an NVDIMM (e.g /dev/pmem0) +# The filesystem should be created on a Non-Volatile DIMM (e.g /dev/pmem0) # and then mounted with the '-o dax' option. Note that the engine # accesses the underlying NVDIMM directly, bypassing the kernel block # layer, so the usual filesystem/disk performance monitoring tools such # as iostat will not provide useful data. # -directory=/mnt/pmem0 +#filename=/mnt/pmem/somefile +directory=/mnt/pmem [libpmem-seqwrite] rw=write stonewall -#[libpmem-seqread] -#rw=read -#stonewall +[libpmem-seqread] +rw=read +stonewall #[libpmem-randwrite] #rw=randwrite From 94c0b971d5e535e6b991899a57f88b6512412e58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Stolarczuk?= Date: Thu, 14 Jan 2021 18:19:43 +0100 Subject: [PATCH 04/42] engines/libpmem: do not call drain on close MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit no matter if direct was 1 or 0, it's not necessary. It's either covered by non-temporal stores or it's not desired by user (if 0 was set). Signed-off-by: Łukasz Stolarczuk --- engines/libpmem.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/engines/libpmem.c b/engines/libpmem.c index 3502911257..ab29a45383 100644 --- a/engines/libpmem.c +++ b/engines/libpmem.c @@ -212,12 +212,6 @@ static int fio_libpmem_close_file(struct thread_data *td, struct fio_file *f) int ret = 0; dprint(FD_IO, "DEBUG fio_libpmem_close_file\n"); - dprint(FD_IO, "td->o.odirect %d\n", td->o.odirect); - - if (!td->o.odirect) { - dprint(FD_IO,"pmem_drain\n"); - pmem_drain(); - } if (fdd->libpmem_ptr) ret = pmem_unmap(fdd->libpmem_ptr, fdd->libpmem_sz); From 165b8a70f919eb8858a9109f5d0db6548df2822c Mon Sep 17 00:00:00 2001 From: Taras Glek Date: Tue, 20 Apr 2021 11:02:18 -0700 Subject: [PATCH 05/42] NFS configure fixes --- configure | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/configure b/configure index a9f0c033e4..fd675d930b 100755 --- a/configure +++ b/configure @@ -170,9 +170,9 @@ disable_native="no" march_set="no" libiscsi="no" libnbd="no" +libnfs="no" libzbc="" dfs="" -libnfs="no" dynamic_engines="no" prefix=/usr/local @@ -242,7 +242,7 @@ for opt do ;; --disable-tcmalloc) disable_tcmalloc="yes" ;; - --enable-libnfs) libnfs="yes" + --disable-nfs) disable_nfs="yes" ;; --dynamic-libengines) dynamic_engines="yes" ;; @@ -274,6 +274,7 @@ if test "$show_help" = "yes" ; then echo "--disable-rados Disable Rados support even if found" echo "--disable-rbd Disable Rados Block Device even if found" echo "--disable-http Disable HTTP support even if found" + echo "--disable-nfs Disable userspace NFS support even if found" echo "--disable-gfapi Disable gfapi" echo "--enable-libhdfs Enable hdfs support" echo "--enable-libnfs Enable nfs support" @@ -2280,21 +2281,21 @@ EOF fi fi print_config "DAOS File System (dfs) Engine" "$dfs" -# Check if we have libnfs (for nfs support). -if test "$libnfs" = "yes" ; then + +########################################## +# Check if we have libnfs (for userspace nfs support). 
+if test "$disable_nfs" != "yes"; then if $(pkg-config libnfs); then libnfs="yes" libnfs_cflags=$(pkg-config --cflags libnfs) - # libnfs_libs=$(pkg-config --libs libnfs) - libnfs_libs=/usr/local/lib/libnfs.a + libnfs_libs=$(pkg-config --libs libnfs) else if test "$libnfs" = "yes" ; then echo "libnfs" "Install libnfs" fi - libnfs="no" fi fi -print_config "nfs engine" "$libnfs" +print_config "NFS engine" "$libnfs" ########################################## # Check if we have lex/yacc available From 4662c206a07e408b1970a577fda107e4f9397a68 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 22 Apr 2021 11:17:58 +0200 Subject: [PATCH 06/42] init: zonemode=zbd does not work with create_serialize=0 zbd_init_zone_info() has a comment that it only works correctly if it called before the first fio fork() call. However, right now, there is nothing that ensures this. If the user specifies --create_serialize=0 and --numjobs=2, each thread will get their own version of zbd_info. zbd_info contains one mutex per zone, so if the threads get different zbd_info, two threads can manage to lock the same zone at the same time, which will lead to I/O errors. Explicitly disallow --zonemode=zbd together with --create_serialize=0, so that we know that all threads will use the same zbd_info, instead of silently misbehaving. Analysis: setup_files() calls zbd_init_files() which calls zbd_init_zone_info(). zbd_init_zone_info() does a for_each_td(), where it checks if zbd_info (for the same filename) has already been allocated by another thread. This only works if create_serialize=1 (default). If create_serialize=0, zbd_init_zone_info() will get called in parallel, and in this case when the second thread checks if any other thread has allocated zbd_info, the check will fail, since the first thread has not yet been running long enough to allocate zbd_info. Signed-off-by: Niklas Cassel --- init.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/init.c b/init.c index 37bff8763c..60c7cff405 100644 --- a/init.c +++ b/init.c @@ -633,6 +633,11 @@ static int fixup_options(struct thread_data *td) ret |= 1; } + if (o->zone_mode == ZONE_MODE_ZBD && !o->create_serialize) { + log_err("fio: --zonemode=zbd and --create_serialize=0 are not compatible.\n"); + ret |= 1; + } + if (o->zone_mode == ZONE_MODE_STRIDED && !o->zone_size) { log_err("fio: --zonesize must be specified when using --zonemode=strided.\n"); ret |= 1; From 6a2299789dccdd24351744476586e7d562a3940d Mon Sep 17 00:00:00 2001 From: Oksana Salyk Date: Fri, 23 Apr 2021 08:09:44 +0200 Subject: [PATCH 07/42] rpma: gpspm: introduce the busy_wait_polling toggle The performance of the librpma_gpspm engine depends heavily on how much CPU power it can use to its work. One can want either to take all available CPU power and see what the maximum possible performance is or configure it less aggressively and collect the results when the CPU is not solely dedicated to doing this one task. The librpma_gpspm engine allows toggling between one and another by either waiting for incoming requests in the kernel using rpma_conn_completion_wait() (busy_wait_polling=0) or trying to collect the completion as soon as it appears by polling all the time using rpma_conn_completion_get() (busy_wait_polling=1). 
Signed-off-by: Oksana Salyk --- HOWTO | 5 +++++ engines/librpma_fio.c | 11 +++++++++++ engines/librpma_fio.h | 2 ++ engines/librpma_gpspm.c | 25 +++++++++++++++++++++++-- examples/librpma_gpspm-server.fio | 2 ++ fio.1 | 4 ++++ 6 files changed, 47 insertions(+), 2 deletions(-) diff --git a/HOWTO b/HOWTO index e6078c5f1e..889526d921 100644 --- a/HOWTO +++ b/HOWTO @@ -2237,6 +2237,11 @@ with the caveat that when used on the command line, they must come after the Set to 1 only when Direct Write to PMem from the remote host is possible. Otherwise, set to 0. +.. option:: busy_wait_polling=bool : [librpma_*_server] + + Set to 0 to wait for completion instead of busy-wait polling completion. + Default: 1. + .. option:: interface=str : [netsplice] [net] The IP address of the network interface used to send or receive UDP diff --git a/engines/librpma_fio.c b/engines/librpma_fio.c index 810b55e23d..3d605ed6c3 100644 --- a/engines/librpma_fio.c +++ b/engines/librpma_fio.c @@ -49,6 +49,17 @@ struct fio_option librpma_fio_options[] = { .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_LIBRPMA, }, + { + .name = "busy_wait_polling", + .lname = "Set to 0 to wait for completion instead of busy-wait polling completion.", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct librpma_fio_options_values, + busy_wait_polling), + .help = "Set to false if you want to reduce CPU usage", + .def = "1", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_LIBRPMA, + }, { .name = NULL, }, diff --git a/engines/librpma_fio.h b/engines/librpma_fio.h index 8cfb2e2d1a..fb89d99d69 100644 --- a/engines/librpma_fio.h +++ b/engines/librpma_fio.h @@ -41,6 +41,8 @@ struct librpma_fio_options_values { char *port; /* Direct Write to PMem is possible */ unsigned int direct_write_to_pmem; + /* Set to 0 to wait for completion instead of busy-wait polling completion. 
*/ + unsigned int busy_wait_polling; }; extern struct fio_option librpma_fio_options[]; diff --git a/engines/librpma_gpspm.c b/engines/librpma_gpspm.c index ac614f462a..7414770971 100644 --- a/engines/librpma_gpspm.c +++ b/engines/librpma_gpspm.c @@ -683,12 +683,33 @@ static int server_cmpl_process(struct thread_data *td) struct librpma_fio_server_data *csd = td->io_ops_data; struct server_data *sd = csd->server_data; struct rpma_completion *cmpl = &sd->msgs_queued[sd->msg_queued_nr]; + struct librpma_fio_options_values *o = td->eo; int ret; ret = rpma_conn_completion_get(csd->conn, cmpl); if (ret == RPMA_E_NO_COMPLETION) { - /* lack of completion is not an error */ - return 0; + if (o->busy_wait_polling == 0) { + ret = rpma_conn_completion_wait(csd->conn); + if (ret == RPMA_E_NO_COMPLETION) { + /* lack of completion is not an error */ + return 0; + } else if (ret != 0) { + librpma_td_verror(td, ret, "rpma_conn_completion_wait"); + goto err_terminate; + } + + ret = rpma_conn_completion_get(csd->conn, cmpl); + if (ret == RPMA_E_NO_COMPLETION) { + /* lack of completion is not an error */ + return 0; + } else if (ret != 0) { + librpma_td_verror(td, ret, "rpma_conn_completion_get"); + goto err_terminate; + } + } else { + /* lack of completion is not an error */ + return 0; + } } else if (ret != 0) { librpma_td_verror(td, ret, "rpma_conn_completion_get"); goto err_terminate; diff --git a/examples/librpma_gpspm-server.fio b/examples/librpma_gpspm-server.fio index d618f2db21..67e92a28ad 100644 --- a/examples/librpma_gpspm-server.fio +++ b/examples/librpma_gpspm-server.fio @@ -20,6 +20,8 @@ thread # set to 1 (true) ONLY when Direct Write to PMem from the remote host is possible # (https://pmem.io/rpma/documentation/basic-direct-write-to-pmem.html) direct_write_to_pmem=0 +# set to 0 (false) to wait for completion instead of busy-wait polling completion. +busy_wait_polling=1 numjobs=1 # number of expected incomming connections iodepth=2 # number of parallel GPSPM requests size=100MiB # size of workspace for a single connection diff --git a/fio.1 b/fio.1 index 18dc156ad0..c3916168f2 100644 --- a/fio.1 +++ b/fio.1 @@ -1999,6 +1999,10 @@ The IP address to be used for RDMA-CM based I/O. .BI (librpma_*_server)direct_write_to_pmem \fR=\fPbool Set to 1 only when Direct Write to PMem from the remote host is possible. Otherwise, set to 0. .TP +.BI (librpma_*_server)busy_wait_polling \fR=\fPbool +Set to 0 to wait for completion instead of busy-wait polling completion. +Default: 1. +.TP .BI (netsplice,net)interface \fR=\fPstr The IP address of the network interface used to send or receive UDP multicast. From e9d2a04d1278ce02140a8b8da4d5aede7a6ad39d Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Mon, 26 Apr 2021 00:10:40 +0900 Subject: [PATCH 08/42] gettime: Fix compilation on non-Linux with pthread_getaffinity_np() 874d55e50c("os/os-linux: add pthread CPU affinity helper") and a few commits after that broke compilation on non-Linux platforms which support pthread_getaffinity_np(). Define fio_get_thread_affinity() on non-Linux platforms, and make gettime test FIO_HAVE_GET_THREAD_AFFINITY which may or may not depend on pthread. FIO_HAVE_GET_THREAD_AFFINITY is currently not defined on Windows. 
Signed-off-by: Tomohiro Kusumi --- gettime.c | 2 +- os/os-aix.h | 6 ++++++ os/os-android.h | 6 ++++++ os/os-dragonfly.h | 6 ++++++ os/os-freebsd.h | 6 ++++++ os/os-hpux.h | 7 +++++++ os/os-linux.h | 3 +++ os/os-mac.h | 6 ++++++ os/os-netbsd.h | 6 ++++++ os/os-openbsd.h | 6 ++++++ os/os-solaris.h | 6 ++++++ 11 files changed, 59 insertions(+), 1 deletion(-) diff --git a/gettime.c b/gettime.c index e3f483a700..099e9d9f6c 100644 --- a/gettime.c +++ b/gettime.c @@ -679,7 +679,7 @@ int fio_monotonic_clocktest(int debug) unsigned int i; os_cpu_mask_t mask; -#ifdef CONFIG_PTHREAD_GETAFFINITY +#ifdef FIO_HAVE_GET_THREAD_AFFINITY fio_get_thread_affinity(mask); #else memset(&mask, 0, sizeof(mask)); diff --git a/os/os-aix.h b/os/os-aix.h index 1aab96e08d..db99eef4ce 100644 --- a/os/os-aix.h +++ b/os/os-aix.h @@ -18,6 +18,12 @@ #define FIO_USE_GENERIC_SWAP +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY +#define fio_get_thread_affinity(mask) \ + pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif + static inline int blockdev_invalidate_cache(struct fio_file *f) { return ENOTSUP; diff --git a/os/os-android.h b/os/os-android.h index 3c05077624..3f1aa9d30a 100644 --- a/os/os-android.h +++ b/os/os-android.h @@ -58,6 +58,12 @@ #define MAP_HUGETLB 0x40000 /* arch specific */ #endif +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY +#define fio_get_thread_affinity(mask) \ + pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif + #ifndef CONFIG_NO_SHM /* * Bionic doesn't support SysV shared memeory, so implement it using ashmem diff --git a/os/os-dragonfly.h b/os/os-dragonfly.h index 44bfcd5d06..6e46589450 100644 --- a/os/os-dragonfly.h +++ b/os/os-dragonfly.h @@ -92,6 +92,12 @@ typedef cpumask_t os_cpu_mask_t; /* No CPU_COUNT(), but use the default function defined in os/os.h */ #define fio_cpu_count(mask) CPU_COUNT((mask)) +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY +#define fio_get_thread_affinity(mask) \ + pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif + static inline int fio_cpuset_init(os_cpu_mask_t *mask) { CPUMASK_ASSZERO(*mask); diff --git a/os/os-freebsd.h b/os/os-freebsd.h index b3addf981f..1b24fa022a 100644 --- a/os/os-freebsd.h +++ b/os/os-freebsd.h @@ -37,6 +37,12 @@ typedef cpuset_t os_cpu_mask_t; #define fio_cpu_isset(mask, cpu) (CPU_ISSET((cpu), (mask)) != 0) #define fio_cpu_count(mask) CPU_COUNT((mask)) +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY +#define fio_get_thread_affinity(mask) \ + pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif + static inline int fio_cpuset_init(os_cpu_mask_t *mask) { CPU_ZERO(mask); diff --git a/os/os-hpux.h b/os/os-hpux.h index c1dafe42ee..a80cb2bc47 100644 --- a/os/os-hpux.h +++ b/os/os-hpux.h @@ -38,6 +38,13 @@ #define FIO_USE_GENERIC_SWAP #define FIO_OS_HAVE_AIOCB_TYPEDEF + +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY +#define fio_get_thread_affinity(mask) \ + pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif + typedef struct aiocb64 os_aiocb_t; static inline int blockdev_invalidate_cache(struct fio_file *f) diff --git a/os/os-linux.h b/os/os-linux.h index ea8d79221c..f7137abe1b 100644 --- a/os/os-linux.h +++ b/os/os-linux.h @@ -74,8 +74,11 @@ typedef cpu_set_t os_cpu_mask_t; sched_getaffinity((pid), (ptr)) #endif +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY #define fio_get_thread_affinity(mask) \ 
pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif #define fio_cpu_clear(mask, cpu) (void) CPU_CLR((cpu), (mask)) #define fio_cpu_set(mask, cpu) (void) CPU_SET((cpu), (mask)) diff --git a/os/os-mac.h b/os/os-mac.h index 683aab3220..ec2cc1e555 100644 --- a/os/os-mac.h +++ b/os/os-mac.h @@ -27,6 +27,12 @@ #define fio_swap32(x) OSSwapInt32(x) #define fio_swap64(x) OSSwapInt64(x) +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY +#define fio_get_thread_affinity(mask) \ + pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif + #ifndef CONFIG_CLOCKID_T typedef unsigned int clockid_t; #endif diff --git a/os/os-netbsd.h b/os/os-netbsd.h index abc1d3cb70..624c7fa509 100644 --- a/os/os-netbsd.h +++ b/os/os-netbsd.h @@ -35,6 +35,12 @@ #define fio_swap32(x) bswap32(x) #define fio_swap64(x) bswap64(x) +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY +#define fio_get_thread_affinity(mask) \ + pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif + static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) { struct disklabel dl; diff --git a/os/os-openbsd.h b/os/os-openbsd.h index 994bf078c9..f1bad67165 100644 --- a/os/os-openbsd.h +++ b/os/os-openbsd.h @@ -35,6 +35,12 @@ #define fio_swap32(x) swap32(x) #define fio_swap64(x) swap64(x) +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY +#define fio_get_thread_affinity(mask) \ + pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif + static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) { struct disklabel dl; diff --git a/os/os-solaris.h b/os/os-solaris.h index f1966f449d..ea1f081c89 100644 --- a/os/os-solaris.h +++ b/os/os-solaris.h @@ -46,6 +46,12 @@ struct solaris_rand_seed { #define os_ctime_r(x, y, z) ctime_r((x), (y), (z)) #define FIO_OS_HAS_CTIME_R +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY +#define fio_get_thread_affinity(mask) \ + pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif + typedef psetid_t os_cpu_mask_t; static inline int chardev_size(struct fio_file *f, unsigned long long *bytes) From 3277b7e48e9d3600d4a33a652e8c2a20e59f2f37 Mon Sep 17 00:00:00 2001 From: Rebecca Cran Date: Wed, 21 Apr 2021 20:32:25 -0600 Subject: [PATCH 09/42] The GPL isn't a EULA: remove it and introduce WixUI_Minimal_NoEULA The GPL shouldn't be used as a EULA in an installer. Remove it, and since the WixUI_Minimal dialog set requires a EULA create a custom WixUI_Minimal_NoEULA set. 
Signed-off-by: Rebecca Cran Signed-off-by: Jens Axboe --- os/windows/WixUI_Minimal_NoEULA.wxs | 96 ++++++++++++++++++++++++++++ os/windows/WixUI_fio.wxl | 12 ++++ os/windows/dobuild.cmd | 5 +- os/windows/eula.rtf | Bin 1075 -> 0 bytes os/windows/install.wxs | 2 +- 5 files changed, 113 insertions(+), 2 deletions(-) create mode 100755 os/windows/WixUI_Minimal_NoEULA.wxs create mode 100755 os/windows/WixUI_fio.wxl delete mode 100755 os/windows/eula.rtf diff --git a/os/windows/WixUI_Minimal_NoEULA.wxs b/os/windows/WixUI_Minimal_NoEULA.wxs new file mode 100755 index 0000000000..48391186eb --- /dev/null +++ b/os/windows/WixUI_Minimal_NoEULA.wxs @@ -0,0 +1,96 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1 + + + + + + NOT Installed OR NOT PATCH + Installed AND PATCH + + + Installed AND PATCH + NOT Installed OR NOT PATCH + + + + + 1 + + 1 + + 1 + + 1 + 1 + 1 + + 1 + Installed AND PATCH + + + 0 + NOT Installed + + + + + + + + \ No newline at end of file diff --git a/os/windows/WixUI_fio.wxl b/os/windows/WixUI_fio.wxl new file mode 100755 index 0000000000..11ec736a55 --- /dev/null +++ b/os/windows/WixUI_fio.wxl @@ -0,0 +1,12 @@ + + + + + + + + + +The Setup Wizard will install [ProductName] on your computer. Click Install to continue or Cancel to exit the Setup Wizard. + + \ No newline at end of file diff --git a/os/windows/dobuild.cmd b/os/windows/dobuild.cmd index 08df3e876d..7b9cb1ddad 100644 --- a/os/windows/dobuild.cmd +++ b/os/windows/dobuild.cmd @@ -44,7 +44,10 @@ if exist ..\..\fio.pdb ( @if ERRORLEVEL 1 goto end "%WIX%bin\candle" -nologo -arch %FIO_ARCH% examples.wxs @if ERRORLEVEL 1 goto end -"%WIX%bin\light" -nologo -sice:ICE61 install.wixobj examples.wixobj -ext WixUIExtension -out %FIO_VERSION%-%FIO_ARCH%.msi +"%WIX%bin\candle" -nologo -arch %FIO_ARCH% WixUI_Minimal_NoEULA.wxs +@if ERRORLEVEL 1 goto end + +"%WIX%bin\light" -nologo -sice:ICE61 install.wixobj examples.wixobj WixUI_Minimal_NoEULA.wixobj -loc WixUI_fio.wxl -ext WixUIExtension -out %FIO_VERSION%-%FIO_ARCH%.msi :end if defined SIGN_FIO ( diff --git a/os/windows/eula.rtf b/os/windows/eula.rtf deleted file mode 100755 index a931017cd644cf9f1723705abb785fad694e113c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1075 zcmaJ=O>g2b5Z!a7{)aiXD+*2u^n(*yl@curD1a&?UraJdtTlFIJ5Yr9?>kA0s=bhd zzw&10z31_fTb~77XeXN?Df1xgg*?S9V_cg`CT3d-f8?3b-ft8w7|+~ZI>)o55Z0k* zFh;dQFB361ovyErJTb~xLc@StOc~EDqGo)fDhzwQL4*0pbJS>sHx^=<6qtq#0@h~j z;E8pE6W>YP|CKuJ@+t}7`ihsrrrdu#gp9_Z+pVWC=r_~jV?EvXSv_^{8%U2eklsTB z>316lMc0srrv~!aCrG1+wo2&y2}aQj?qHnudiNAd9X4sJlwq8 zLdXJk7cvTNetwS2LOLj|$*tG|Ei+ON&SZWsEIz}*RFDV_og15#x1@mg7WC9bJpmqp?>eeTY+=R`cfsg9yi?8Z1 zGL%(#X>=7w2t-=o(p_sYXPyj5xypD!IX8-EwMa+5=7`6*S;kETVV=PH+%|H zlAwa}lEl4K3N{Fp!>m#*ATQuOT8?L{B@Cx)cpt{`aJpO%2 z_!cWG+WF&nJ%5cS(eyVI-k0vV)7kI$?Oa!FWAhI6M%@>#1E&)O ctJFH>j~9`=Qj8}VJJEN38312;>EZtP2jP%zcmMzZ diff --git a/os/windows/install.wxs b/os/windows/install.wxs index f73ec5e251..7773bb3b86 100755 --- a/os/windows/install.wxs +++ b/os/windows/install.wxs @@ -107,7 +107,7 @@ - + From 6308ef297145e73add65ba86bfdbeaf967957d1f Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 27 Apr 2021 17:41:14 +0000 Subject: [PATCH 10/42] ioengines: don't call zbd_put_io_u() for engines not implementing commit Commit d9ed3e63e528 ("zbd: Fix zone locking for async I/O engines") added a call to zbd_put_io_u() in the case where td->io_ops->commit callback is not implemented on an ioengine. 
The commit in question fails
to mention why this zbd_put_io_u() call was added for ioengines not
implementing the commit callback.

The code in td_io_queue() looks like this:

	ret = td->io_ops->queue(td, io_u);
	zbd_queue_io_u(td, io_u, ret);

	if (!td->io_ops->commit) {
		io_u_mark_submit(td, 1);
		io_u_mark_complete(td, 1);
		zbd_put_io_u(td, io_u);
	}

SYNC I/O engine case (e.g. psync):
The zone will be locked by zbd_adjust_block(),
td->io_ops->queue(td, io_u), which for a sync I/O engine will return
FIO_Q_COMPLETED. This return value will be sent in to zbd_queue_io_u(),
which, at the end of the function, unlocks the zone if the return value
from ->queue() differs from FIO_Q_QUEUED.
For a sync I/O engine, the zone will be unlocked here, and the
io_u->zbd_put_io function pointer will be set to NULL.
psync does not implement the ->commit() callback, so it will call
zbd_put_io_u(), which will do nothing, because the io_u->zbd_put_io
pointer is NULL.

ASYNC I/O engine case (e.g. io_uring):
The zone will be locked by zbd_adjust_block(),
td->io_ops->queue(td, io_u), which for an async I/O engine will return
FIO_Q_QUEUED. This return value will be sent in to zbd_queue_io_u(),
which, at the end of the function, unlocks the zone if the return value
from ->queue() differs from FIO_Q_QUEUED.
For an async I/O engine, the zone will not be unlocked here, so the
io_u->zbd_put_io function pointer will still be set.
io_uring does implement the ->commit() callback, so it will not call
zbd_put_io_u() here at all. Instead zbd_put_io_u() will be called by
do_io() -> wait_for_completions() -> io_u_queued_complete() ->
ios_completed() -> put_io_u() -> zbd_put_io_u(), which will unlock the
zone and set the io_u->zbd_put_io function pointer to NULL.

In conclusion, the zbd_put_io_u() call should never have been added in
the case where the ->commit() callback wasn't implemented in the first
place, and removing it shouldn't affect the psync or io_uring ioengines.

Commit d9ed3e63e528 ("zbd: Fix zone locking for async I/O engines")
probably made the assumption that an async I/O engine == the ->commit()
callback is implemented. However, this is not true: there are async I/O
engines in tree (and out of tree) that do not implement the ->commit()
callback. Instead, an async I/O engine is recognized by the ->queue()
callback returning FIO_Q_QUEUED.

Removing the invalid zbd_put_io_u() call will ensure that a zone is not
prematurely unlocked for async I/O engines that do not implement the
->commit() callback. Unlocking a zone prematurely leads to I/O errors.
Fixes: d9ed3e63e528 ("zbd: Fix zone locking for async I/O engines") Signed-off-by: Niklas Cassel Signed-off-by: Jens Axboe --- ioengines.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ioengines.c b/ioengines.c index f88b0537f1..3561bb4e6e 100644 --- a/ioengines.c +++ b/ioengines.c @@ -414,7 +414,6 @@ enum fio_q_status td_io_queue(struct thread_data *td, struct io_u *io_u) if (!td->io_ops->commit) { io_u_mark_submit(td, 1); io_u_mark_complete(td, 1); - zbd_put_io_u(td, io_u); } if (ret == FIO_Q_COMPLETED) { From 1fb2bc2f73579bf4b9eb92c54a8479ccc204720c Mon Sep 17 00:00:00 2001 From: Taras Glek Date: Wed, 5 May 2021 09:00:13 -0700 Subject: [PATCH 11/42] C-style comments --- engines/nfs.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/engines/nfs.c b/engines/nfs.c index df09477600..70bfd24e27 100644 --- a/engines/nfs.c +++ b/engines/nfs.c @@ -1,4 +1,3 @@ -// https://github.com/axboe/fio/pull/762 sample pull req for new engine #include #include #include @@ -17,13 +16,13 @@ enum nfs_op_type { struct fio_libnfs_options { struct nfs_context *context; char *nfs_url; - // the following implements a circular queue of outstanding IOs - int outstanding_events; // IOs issued to libnfs, that have not returned yet - int prev_requested_event_index; // event last returned via fio_libnfs_event - int next_buffered_event; // round robin-pointer within events[] - int buffered_event_count; // IOs completed by libnfs faiting for FIO - int free_event_buffer_index; // next empty buffer - unsigned int queue_depth; // nfs_callback needs this info, but doesn't have fio td structure to pull it from + unsigned int queue_depth; /* nfs_callback needs this info, but doesn't have fio td structure to pull it from */ + /* the following implement a circular queue of outstanding IOs */ + int outstanding_events; /* IOs issued to libnfs, that have not returned yet */ + int prev_requested_event_index; /* event last returned via fio_libnfs_event */ + int next_buffered_event; /* round robin-pointer within events[] */ + int buffered_event_count; /* IOs completed by libnfs, waiting for FIO */ + int free_event_buffer_index; /* next free buffer */ struct io_u**events; }; @@ -60,11 +59,11 @@ static struct io_u *fio_libnfs_event(struct thread_data *td, int event) assert(o->events[o->next_buffered_event]); o->events[o->next_buffered_event] = NULL; o->next_buffered_event = (o->next_buffered_event + 1) % td->o.iodepth; - // validate our state machine + /* validate our state machine */ assert(o->buffered_event_count); o->buffered_event_count--; assert(io_u); - // assert that fio_libnfs_event is being called in sequential fashion + /* assert that fio_libnfs_event is being called in sequential fashion */ assert(event == 0 || o->prev_requested_event_index + 1 == event); if (o->buffered_event_count == 0) { o->prev_requested_event_index = -1; @@ -77,11 +76,11 @@ static struct io_u *fio_libnfs_event(struct thread_data *td, int event) static int nfs_event_loop(struct thread_data *td, bool flush) { struct fio_libnfs_options *o = td->eo; struct pollfd pfds[1]; /* nfs:0 */ - // we already have stuff queued for fio, no need to waste cpu on poll() + /* we already have stuff queued for fio, no need to waste cpu on poll() */ if (o->buffered_event_count) { return o->buffered_event_count; } - // fio core logic seems to stop calling this event-loop if we ever return with 0 events + /* fio core logic seems to stop calling this event-loop if we ever return with 0 events */ #define SHOULD_WAIT() 
(o->outstanding_events == td->o.iodepth || (flush && o->outstanding_events)) do { @@ -130,7 +129,7 @@ static void nfs_callback(int res, struct nfs_context *nfs, void *data, if (res < 0) { log_err("Failed NFS operation(code:%d): %s\n", res, nfs_get_error(o->context)); io_u->error = -res; - // res is used for read math below, don't wanna pass negative there + /* res is used for read math below, don't wanna pass negative there */ res = 0; } else if (io_u->ddir == DDIR_READ) { memcpy(io_u->buf, data, res); @@ -138,7 +137,7 @@ static void nfs_callback(int res, struct nfs_context *nfs, void *data, log_err("Got NFS EOF, this is probably not expected\n"); } } - // fio uses resid to track remaining data + /* fio uses resid to track remaining data */ io_u->resid = io_u->xfer_buflen - res; assert(!o->events[o->free_event_buffer_index]); @@ -248,7 +247,7 @@ static int do_mount(struct thread_data *td, const char *url) */ static int fio_libnfs_setup(struct thread_data *td) { - // flipping this makes using gdb easier, but tends to hang fio on exit + /* Using threads with libnfs causes fio to hang on exit, lower performance */ td->o.use_thread = 0; return 0; } From 7654a8d5e4d20e88556e427d6cc5944bcf042e9b Mon Sep 17 00:00:00 2001 From: Taras Glek Date: Wed, 5 May 2021 09:02:20 -0700 Subject: [PATCH 12/42] single line bodies --- engines/nfs.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/engines/nfs.c b/engines/nfs.c index 70bfd24e27..6d4ad7b1f2 100644 --- a/engines/nfs.c +++ b/engines/nfs.c @@ -77,9 +77,8 @@ static int nfs_event_loop(struct thread_data *td, bool flush) { struct fio_libnfs_options *o = td->eo; struct pollfd pfds[1]; /* nfs:0 */ /* we already have stuff queued for fio, no need to waste cpu on poll() */ - if (o->buffered_event_count) { + if (o->buffered_event_count) return o->buffered_event_count; - } /* fio core logic seems to stop calling this event-loop if we ever return with 0 events */ #define SHOULD_WAIT() (o->outstanding_events == td->o.iodepth || (flush && o->outstanding_events)) @@ -105,8 +104,8 @@ static int nfs_event_loop(struct thread_data *td, bool flush) { } } while (SHOULD_WAIT()); return o->buffered_event_count; -} #undef SHOULD_WAIT +} /* * The ->getevents() hook is used to reap completion events from an async @@ -133,9 +132,8 @@ static void nfs_callback(int res, struct nfs_context *nfs, void *data, res = 0; } else if (io_u->ddir == DDIR_READ) { memcpy(io_u->buf, data, res); - if (res == 0) { + if (res == 0) log_err("Got NFS EOF, this is probably not expected\n"); - } } /* fio uses resid to track remaining data */ io_u->resid = io_u->xfer_buflen - res; @@ -213,9 +211,8 @@ static int do_mount(struct thread_data *td, const char *url) int path_len = 0; char *mnt_dir = NULL; - if (options->context) { + if (options->context) return 0; - } options->context = nfs_init_context(); if (options->context == NULL) { @@ -294,9 +291,8 @@ static int fio_libnfs_open(struct thread_data *td, struct fio_file *f) } ret = nfs_open(options->context, f->file_name, flags, &nfs_data->nfsfh); - if (ret != 0) { + if (ret != 0) log_err("Failed to open %s: %s\n", f->file_name, nfs_get_error(options->context)); - } f->engine_data = nfs_data; return ret; } @@ -306,9 +302,8 @@ static int fio_libnfs_close(struct thread_data *td, struct fio_file *f) struct nfs_data *nfs_data = f->engine_data; struct fio_libnfs_options *o = nfs_data->options; int ret = 0; - if (nfs_data->nfsfh) { + if (nfs_data->nfsfh) ret = nfs_close(o->context, nfs_data->nfsfh); - } free(nfs_data); 
f->engine_data = NULL; return ret; From 388f111191981b7162ce3283bc33afbe6ca7dc79 Mon Sep 17 00:00:00 2001 From: Taras Glek Date: Wed, 5 May 2021 09:04:56 -0700 Subject: [PATCH 13/42] skip skeleton comments --- engines/nfs.c | 37 +++---------------------------------- 1 file changed, 3 insertions(+), 34 deletions(-) diff --git a/engines/nfs.c b/engines/nfs.c index 6d4ad7b1f2..21be88334d 100644 --- a/engines/nfs.c +++ b/engines/nfs.c @@ -46,12 +46,6 @@ static struct fio_option options[] = { }, }; -/* - * The ->event() hook is called to match an event number with an io_u. - * After the core has called ->getevents() and it has returned eg 3, - * the ->event() hook must return the 3 events that have completed for - * subsequent calls to ->event() with [0-2]. Required. - */ static struct io_u *fio_libnfs_event(struct thread_data *td, int event) { struct fio_libnfs_options *o = td->eo; @@ -107,12 +101,6 @@ static int nfs_event_loop(struct thread_data *td, bool flush) { #undef SHOULD_WAIT } -/* - * The ->getevents() hook is used to reap completion events from an async - * io engine. It returns the number of completed events since the last call, - * which may then be retrieved by calling the ->event() hook with the event - * numbers. Required. - */ static int fio_libnfs_getevents(struct thread_data *td, unsigned int min, unsigned int max, const struct timespec *t) { @@ -157,16 +145,6 @@ static int queue_read(struct fio_libnfs_options *o, struct io_u *io_u) { return nfs_pread_async(o->context, nfs_data->nfsfh, io_u->offset, io_u->buflen, nfs_callback, io_u); } -/* - * The ->queue() hook is responsible for initiating io on the io_u - * being passed in. If the io engine is a synchronous one, io may complete - * before ->queue() returns. Required. - * - * The io engine must transfer in the direction noted by io_u->ddir - * to the buffer pointed to by io_u->xfer_buf for as many bytes as - * io_u->xfer_buflen. Residual data count may be set in io_u->resid - * for a short read/write. - */ static enum fio_q_status fio_libnfs_queue(struct thread_data *td, struct io_u *io_u) { @@ -201,7 +179,9 @@ static enum fio_q_status fio_libnfs_queue(struct thread_data *td, return ret; } -/** Do a mount if one has not been done before */ +/* + * Do a mount if one has not been done before + */ static int do_mount(struct thread_data *td, const char *url) { size_t event_size = sizeof(struct io_u **) * td->o.iodepth; @@ -237,11 +217,6 @@ static int do_mount(struct thread_data *td, const char *url) return ret; } -/* - * The init function is called once per thread/process, and should set up - * any structures that this io engine requires to keep track of io. Not - * required. - */ static int fio_libnfs_setup(struct thread_data *td) { /* Using threads with libnfs causes fio to hang on exit, lower performance */ @@ -249,11 +224,6 @@ static int fio_libnfs_setup(struct thread_data *td) return 0; } -/* - * This is paired with the ->init() function and is called when a thread is - * done doing io. Should tear down anything setup by the ->init() function. - * Not required. 
- */ static void fio_libnfs_cleanup(struct thread_data *td) { struct fio_libnfs_options *o = td->eo; @@ -342,4 +312,3 @@ static void fio_exit fio_nfs_unregister(void) { unregister_ioengine(&ioengine); } - From c94b8f18181f2aca2e5ad25aa66cb1e354570e9f Mon Sep 17 00:00:00 2001 From: Taras Glek Date: Wed, 5 May 2021 09:11:06 -0700 Subject: [PATCH 14/42] clean up nfs example --- examples/nfs.fio | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/nfs.fio b/examples/nfs.fio index 2449f4154d..f856cebfbe 100644 --- a/examples/nfs.fio +++ b/examples/nfs.fio @@ -7,7 +7,6 @@ size=104857600 lat_percentiles=1 group_reporting numjobs=10 -direct=1 ramp_time=5s filename_format=myfiles.$clientuid.$jobnum.$filenum time_based=1 @@ -20,4 +19,4 @@ stonewall [read] wait_for=write rw=randread -runtime=10s \ No newline at end of file +runtime=10s From cffe80a41cbf9b26446c803177a27f7695f94a31 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 6 May 2021 17:23:31 +0100 Subject: [PATCH 15/42] configure: fix check_min_lib_version() eval The following shell statement: if eval "echo \$$_feature" = "yes" ; then executes: echo $... = "yes" It does not actually compare the variable named by $_feature to the string "yes". Add the missing "test" call so the comparison happens as intended and wrap the eval so it doesn't include the = "yes". Fixes: 3e48f7c9de61 ("configure: fix syntax error with NetBSD") Cc: Dmitry Fomichev Signed-off-by: Stefan Hajnoczi Signed-off-by: Jens Axboe --- configure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure b/configure index a7d82be06b..e886bdc8c0 100755 --- a/configure +++ b/configure @@ -142,7 +142,7 @@ check_min_lib_version() { fi : "${_feature:=${1}}" if "${cross_prefix}"pkg-config --version > /dev/null 2>&1; then - if eval "echo \$$_feature" = "yes" ; then + if test "$(eval echo \"\$$_feature\")" = "yes" ; then feature_not_found "$_feature" "$1 >= $2" fi else From 193aaf6a41329b1858d75970cdc4e1777b87c07a Mon Sep 17 00:00:00 2001 From: Gonzalez Date: Thu, 6 May 2021 11:15:41 -0700 Subject: [PATCH 16/42] Add Documentation for z unit --- HOWTO | 14 ++++++++++---- fio.1 | 26 +++++++++++++++++++------- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/HOWTO b/HOWTO index 889526d921..177310f64f 100644 --- a/HOWTO +++ b/HOWTO @@ -544,6 +544,9 @@ Parameter types * *Ti* -- means tebi (Ti) or 1024**4 * *Pi* -- means pebi (Pi) or 1024**5 + For Zone Block Device Mode: + * *z* -- means Zone + With :option:`kb_base`\=1024 (the default), the unit prefixes are opposite from those specified in the SI and IEC 80000-13 standards to provide compatibility with old scripts. For example, 4k means 4096. @@ -1277,13 +1280,14 @@ I/O type .. option:: offset=int Start I/O at the provided offset in the file, given as either a fixed size in - bytes or a percentage. If a percentage is given, the generated offset will be + bytes, zones or a percentage. If a percentage is given, the generated offset will be aligned to the minimum ``blocksize`` or to the value of ``offset_align`` if provided. Data before the given offset will not be touched. This effectively caps the file size at `real_size - offset`. Can be combined with :option:`size` to constrain the start and end range of the I/O workload. A percentage can be specified by a number between 1 and 100 followed by '%', - for example, ``offset=20%`` to specify 20%. + for example, ``offset=20%`` to specify 20%. In ZBD mode, value can be set as + number of zones using 'z'. .. 
option:: offset_align=int @@ -1300,7 +1304,8 @@ I/O type intended to operate on a file in parallel disjoint segments, with even spacing between the starting points. Percentages can be used for this option. If a percentage is given, the generated offset will be aligned to the minimum - ``blocksize`` or to the value of ``offset_align`` if provided. + ``blocksize`` or to the value of ``offset_align`` if provided. In ZBD mode, value can + also be set as number of zones using 'z'. .. option:: number_ios=int @@ -1818,7 +1823,8 @@ I/O size If this option is not specified, fio will use the full size of the given files or devices. If the files do not exist, size must be given. It is also possible to give size as a percentage between 1 and 100. If ``size=20%`` is - given, fio will use 20% of the full size of the given files or devices. + given, fio will use 20% of the full size of the given files or devices. + In ZBD mode, value can also be set as number of zones using 'z'. Can be combined with :option:`offset` to constrain the start and end range that I/O will be done within. diff --git a/fio.1 b/fio.1 index c3916168f2..e7da5c6826 100644 --- a/fio.1 +++ b/fio.1 @@ -288,6 +288,15 @@ Pi means pebi (Pi) or 1024**5 .PD .RE .P +For Zone Block Device Mode: +.RS +.P +.PD 0 +z means Zone +.P +.PD +.RE +.P With `kb_base=1024' (the default), the unit prefixes are opposite from those specified in the SI and IEC 80000-13 standards to provide compatibility with old scripts. For example, 4k means 4096. @@ -1061,13 +1070,14 @@ should be associated with them. .TP .BI offset \fR=\fPint[%|z] Start I/O at the provided offset in the file, given as either a fixed size in -bytes or a percentage. If a percentage is given, the generated offset will be +bytes, zones or a percentage. If a percentage is given, the generated offset will be aligned to the minimum \fBblocksize\fR or to the value of \fBoffset_align\fR if provided. Data before the given offset will not be touched. This effectively caps the file size at `real_size \- offset'. Can be combined with \fBsize\fR to constrain the start and end range of the I/O workload. A percentage can be specified by a number between 1 and 100 followed by '%', -for example, `offset=20%' to specify 20%. +for example, `offset=20%' to specify 20%. In ZBD mode, value can be set as +number of zones using 'z'. .TP .BI offset_align \fR=\fPint If set to non-zero value, the byte offset generated by a percentage \fBoffset\fR @@ -1082,7 +1092,8 @@ specified). This option is useful if there are several jobs which are intended to operate on a file in parallel disjoint segments, with even spacing between the starting points. Percentages can be used for this option. If a percentage is given, the generated offset will be aligned to the minimum -\fBblocksize\fR or to the value of \fBoffset_align\fR if provided. +\fBblocksize\fR or to the value of \fBoffset_align\fR if provided.In ZBD mode, value +can be set as number of zones using 'z'. .TP .BI number_ios \fR=\fPint Fio will normally perform I/Os until it has exhausted the size of the region @@ -1607,9 +1618,9 @@ set to the physical size of the given files or devices if they exist. If this option is not specified, fio will use the full size of the given files or devices. If the files do not exist, size must be given. It is also possible to give size as a percentage between 1 and 100. If `size=20%' is -given, fio will use 20% of the full size of the given files or devices. 
-Can be combined with \fBoffset\fR to constrain the start and end range -that I/O will be done within. +given, fio will use 20% of the full size of the given files or devices. In ZBD mode, +size can be given in units of number of zones using 'z'. Can be combined with \fBoffset\fR to +constrain the start and end range that I/O will be done within. .TP .BI io_size \fR=\fPint[%|z] "\fR,\fB io_limit" \fR=\fPint[%|z] Normally fio operates within the region set by \fBsize\fR, which means @@ -1621,7 +1632,8 @@ will perform I/O within the first 20GiB but exit when 5GiB have been done. The opposite is also possible \-\- if \fBsize\fR is set to 20GiB, and \fBio_size\fR is set to 40GiB, then fio will do 40GiB of I/O within the 0..20GiB region. Value can be set as percentage: \fBio_size\fR=N%. -In this case \fBio_size\fR multiplies \fBsize\fR= value. +In this case \fBio_size\fR multiplies \fBsize\fR= value. In ZBD mode, value can +also be set as number of zones using 'z'. .TP .BI filesize \fR=\fPirange(int) Individual file sizes. May be a range, in which case fio will select sizes From 79f488cbd95ca6989031a7ace5ec382313d31b3c Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 7 May 2021 16:13:05 -0500 Subject: [PATCH 17/42] don't access dlclose'd dynamic ioengine object after close Alexey reported this bug when using dynamically loaded IO engines; a segfault on the line where we set the dlhandle to NULL after the dlclose. I think this is because ops points to the thing we obtained from dlsym: ops = dlsym(dlhandle, engine_lib); and after the final dlclose, the object no longer exists and efforts to set the handle within it will fail for obvious reasons. I'm not sure why I hadn't seen this before. Fixes-RH-Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1956963 Reported-by: Alexey Dobriyan Fixes: f6931a1 ("fio: move dynamic library handle to io_ops structure") Tested-by: Alexey Dobriyan Signed-off-by: Eric Sandeen Signed-off-by: Jens Axboe --- ioengines.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ioengines.c b/ioengines.c index 3561bb4e6e..dd61af07a4 100644 --- a/ioengines.c +++ b/ioengines.c @@ -234,7 +234,6 @@ void free_ioengine(struct thread_data *td) if (td->io_ops->dlhandle) { dprint(FD_IO, "dlclose ioengine %s\n", td->io_ops->name); dlclose(td->io_ops->dlhandle); - td->io_ops->dlhandle = NULL; } td->io_ops = NULL; From 6ee607ba9c5129ebf0bac1c42fa0a4700456cb88 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 6 May 2021 13:18:45 +0000 Subject: [PATCH 18/42] oslib/linux-blkzoned: make sure that we always support zone capacity A common problem is that users upgrade their kernel to support NVMe ZNS devices, however, they still use the kernel uapi headers provided by their distro. This means that even if the kernel will populate the zone capacity fields for each zone in the zone report returned by the ioctl, fio will not know how to interpret that data. This leads to fio writing past the zone capacity, which will lead to I/O errors. It is not trivial for a user to realize that the kernel uapi headers provided by their distro is the reason for these I/O errors. In order to make it easier for these users, provide a copy of the current zoned block device kernel uapi structs. If the kernel uapi headers installed on the system are too old to support zone capacity, use the locally defined structs instead. If the installed headers are new enough to support zone capacity, use the installed headers. This way, fio will always be able to handle zone capacity (if the kernel supports it). 
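For reference, the CONFIG_HAVE_REP_CAPACITY symbol used below would normally come from a configure-time compile probe along these lines (a minimal sketch under the assumption of a probe program in the style of the existing configure checks; it is not part of this patch):

	#include <linux/blkzoned.h>

	int main(int argc, char **argv)
	{
		/* Compiles only if the installed uapi header already provides
		 * the zone capacity field and the capacity report flag. */
		struct blk_zone z;

		z.capacity = 0;
		return BLK_ZONE_REP_CAPACITY;
	}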
At the same time, we will not redefine any structs from the installed headers if they are newer than our locally defined structs. Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel Signed-off-by: Jens Axboe --- oslib/linux-blkzoned.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/oslib/linux-blkzoned.c b/oslib/linux-blkzoned.c index f37c67fc86..81e4e7f0d5 100644 --- a/oslib/linux-blkzoned.c +++ b/oslib/linux-blkzoned.c @@ -23,6 +23,37 @@ #include +/* + * If the uapi headers installed on the system lacks zone capacity support, + * use our local versions. If the installed headers are recent enough to + * support zone capacity, do not redefine any structs. + */ +#ifndef CONFIG_HAVE_REP_CAPACITY +#define BLK_ZONE_REP_CAPACITY (1 << 0) + +struct blk_zone_v2 { + __u64 start; /* Zone start sector */ + __u64 len; /* Zone length in number of sectors */ + __u64 wp; /* Zone write pointer position */ + __u8 type; /* Zone type */ + __u8 cond; /* Zone condition */ + __u8 non_seq; /* Non-sequential write resources active */ + __u8 reset; /* Reset write pointer recommended */ + __u8 resv[4]; + __u64 capacity; /* Zone capacity in number of sectors */ + __u8 reserved[24]; +}; +#define blk_zone blk_zone_v2 + +struct blk_zone_report_v2 { + __u64 sector; + __u32 nr_zones; + __u32 flags; +struct blk_zone zones[0]; +}; +#define blk_zone_report blk_zone_report_v2 +#endif /* CONFIG_HAVE_REP_CAPACITY */ + /* * Read up to 255 characters from the first line of a file. Strip the trailing * newline. @@ -116,10 +147,8 @@ int blkzoned_get_zoned_model(struct thread_data *td, struct fio_file *f, static uint64_t zone_capacity(struct blk_zone_report *hdr, struct blk_zone *blkz) { -#ifdef CONFIG_HAVE_REP_CAPACITY if (hdr->flags & BLK_ZONE_REP_CAPACITY) return blkz->capacity << 9; -#endif return blkz->len << 9; } From 418f53993b07e48b5a69db84c9c7209acd53eac3 Mon Sep 17 00:00:00 2001 From: Martin Bukatovic Date: Tue, 11 May 2021 09:38:55 +0200 Subject: [PATCH 19/42] Make fill_device to stop writing on EDQUOT Option fill_device stops writing when we run out of quota as well. Signed-off-by: Martin Bukatovic Signed-off-by: Jens Axboe --- HOWTO | 3 ++- backend.c | 7 ++++--- filesetup.c | 11 ++++++++--- fio.1 | 3 ++- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/HOWTO b/HOWTO index 177310f64f..f5681c0dca 100644 --- a/HOWTO +++ b/HOWTO @@ -1858,7 +1858,8 @@ I/O size .. option:: fill_device=bool, fill_fs=bool Sets size to something really large and waits for ENOSPC (no space left on - device) as the terminating condition. Only makes sense with sequential + device) or EDQUOT (disk quota exceeded) + as the terminating condition. Only makes sense with sequential write. For a read workload, the mount point will be filled first then I/O started on the result. This option doesn't make sense if operating on a raw device node, since the size of that is already known by the file system. diff --git a/backend.c b/backend.c index 399c299e14..6290e0d652 100644 --- a/backend.c +++ b/backend.c @@ -393,7 +393,7 @@ static bool break_on_this_error(struct thread_data *td, enum fio_ddir ddir, td_clear_error(td); *retptr = 0; return false; - } else if (td->o.fill_device && err == ENOSPC) { + } else if (td->o.fill_device && (err == ENOSPC || err == EDQUOT)) { /* * We expect to hit this error if * fill_device option is set. 
@@ -1105,7 +1105,7 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done) if (td->trim_entries) log_err("fio: %lu trim entries leaked?\n", td->trim_entries); - if (td->o.fill_device && td->error == ENOSPC) { + if (td->o.fill_device && (td->error == ENOSPC || td->error == EDQUOT)) { td->error = 0; fio_mark_td_terminate(td); } @@ -1120,7 +1120,8 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done) if (i) { ret = io_u_queued_complete(td, i); - if (td->o.fill_device && td->error == ENOSPC) + if (td->o.fill_device && + (td->error == ENOSPC || td->error == EDQUOT)) td->error = 0; } diff --git a/filesetup.c b/filesetup.c index e664f8b42f..296de5a11a 100644 --- a/filesetup.c +++ b/filesetup.c @@ -226,11 +226,16 @@ static int extend_file(struct thread_data *td, struct fio_file *f) if (r < 0) { int __e = errno; - if (__e == ENOSPC) { + if (__e == ENOSPC || __e == EDQUOT) { + const char *__e_name; if (td->o.fill_device) break; - log_info("fio: ENOSPC on laying out " - "file, stopping\n"); + if (__e == ENOSPC) + __e_name = "ENOSPC"; + else + __e_name = "EDQUOT"; + log_info("fio: %s on laying out " + "file, stopping\n", __e_name); } td_verror(td, errno, "write"); } else diff --git a/fio.1 b/fio.1 index e7da5c6826..533bcf6a52 100644 --- a/fio.1 +++ b/fio.1 @@ -1650,7 +1650,8 @@ of a file. This option is ignored on non-regular files. .TP .BI fill_device \fR=\fPbool "\fR,\fB fill_fs" \fR=\fPbool Sets size to something really large and waits for ENOSPC (no space left on -device) as the terminating condition. Only makes sense with sequential +device) or EDQUOT (disk quota exceeded) +as the terminating condition. Only makes sense with sequential write. For a read workload, the mount point will be filled first then I/O started on the result. This option doesn't make sense if operating on a raw device node, since the size of that is already known by the file system. From 30bec59eab3908b681cbc2866179f7166a849c83 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 May 2021 07:58:03 -0600 Subject: [PATCH 20/42] os: define EDQUOT to EIO if the OS doesn't provide it Fixes: 418f53993b07 ("Make fill_device to stop writing on EDQUOT") Signed-off-by: Jens Axboe --- os/os.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/os/os.h b/os/os.h index b46f416400..e47d3d9706 100644 --- a/os/os.h +++ b/os/os.h @@ -7,6 +7,7 @@ #include #include #include +#include #include "../arch/arch.h" /* IWYU pragma: export */ #include "../lib/types.h" @@ -58,6 +59,10 @@ typedef enum { #error "unsupported os" #endif +#ifndef EDQUOT +#define EDQUOT EIO +#endif + #ifdef CONFIG_POSIXAIO #include #ifndef FIO_OS_HAVE_AIOCB_TYPEDEF From 2984a4fcedcdc5536b2559d634694fb8fecf40c4 Mon Sep 17 00:00:00 2001 From: Lars Kellogg-Stedman Date: Wed, 12 May 2021 18:41:43 -0400 Subject: [PATCH 21/42] fix fio2gnuplot to work with new logging format The logging format updates documented in 1a953d97 were never propagated to fio2gnuplot, which since then has been failing with a ValueError exception. This commit explicits limits fio2gnuplot to only reading the first 4 columns in the log file. 
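For context: after the referenced log format update each entry carries more than four comma-separated fields, so unpacking a whole entry into the four names time, perf, x and block_size raises ValueError; taking only the first four fields of each entry restores the old behaviour and simply ignores the extra trailing columns.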
Closes #928 --- tools/plot/fio2gnuplot | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/plot/fio2gnuplot b/tools/plot/fio2gnuplot index 78ee82fb80..d2dc81df9b 100755 --- a/tools/plot/fio2gnuplot +++ b/tools/plot/fio2gnuplot @@ -198,7 +198,7 @@ def compute_temp_file(fio_data_file,disk_perf,gnuplot_output_dir, min_time, max_ # Index will be used to remember what file was featuring what value index=index+1 - time, perf, x, block_size = line[1] + time, perf, x, block_size = line[1][:4] if (blk_size == 0): try: blk_size=int(block_size) From 106e14ce87c5b1984727aabf9a48f7284bff21c1 Mon Sep 17 00:00:00 2001 From: Felix Abecassis Date: Thu, 13 May 2021 17:02:40 -0700 Subject: [PATCH 22/42] stat: fix integer overflow in convert_agg_kbytes_percent Assuming that "int" is 32-bit, for high bandwidth values (> 21.5 GB/s) the expression "mean * 100" will cause an integer overflow before the conversion to "double" happens. Signed-off-by: Felix Abecassis --- stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stat.c b/stat.c index b7222f465f..a8a96c85a4 100644 --- a/stat.c +++ b/stat.c @@ -462,7 +462,7 @@ static double convert_agg_kbytes_percent(struct group_run_stats *rs, int ddir, i { double p_of_agg = 100.0; if (rs && rs->agg[ddir] > 1024) { - p_of_agg = mean * 100 / (double) (rs->agg[ddir] / 1024.0); + p_of_agg = mean * 100.0 / (double) (rs->agg[ddir] / 1024.0); if (p_of_agg > 100.0) p_of_agg = 100.0; From 6399ab79bf410ac317260614c36f60ad76e5aa35 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 14 May 2021 12:52:51 +0000 Subject: [PATCH 23/42] zbd: only put an upper limit on max open zones once There is an upper limit that is checked for each td, and for each file, even though a file has a pointer to a zoned_block_device_info that has already been created. Multiple files, from the same or from another td can point to the same zoned_block_device_info. All zoned_block_device_info:s have already been created earlier in the call chain. Simplify this by only checking the upper limit on max open zones when a zoned_block_device_info is created. This way, max_open_zones is handled from a single location, instead of potentially being reassigned from a completely different location. Signed-off-by: Niklas Cassel Signed-off-by: Jens Axboe --- zbd.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/zbd.c b/zbd.c index eed796b321..46ff9aeb04 100644 --- a/zbd.c +++ b/zbd.c @@ -588,7 +588,8 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f) if (ret == 0) { f->zbd_info->model = zbd_model; - f->zbd_info->max_open_zones = td->o.max_open_zones; + f->zbd_info->max_open_zones = + min_not_zero(td->o.max_open_zones, ZBD_MAX_OPEN_ZONES); } return ret; } @@ -726,8 +727,6 @@ int zbd_setup_files(struct thread_data *td) if (zbd_is_seq_job(f)) assert(f->min_zone < f->max_zone); - zbd->max_open_zones = zbd->max_open_zones ?: ZBD_MAX_OPEN_ZONES; - if (td->o.max_open_zones > 0 && zbd->max_open_zones != td->o.max_open_zones) { log_err("Different 'max_open_zones' values\n"); From eaa45783ef5079884f96813e74c6b450dc52d0f0 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 14 May 2021 12:52:51 +0000 Subject: [PATCH 24/42] oslib/linux-blkzoned: move sysfs reading into its own function Move the sysfs reading into its own function so that it can be reused. This new function will be reused in a following patch. No functional change intended. 
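To sketch the intended reuse (a hypothetical fragment that assumes the surrounding fio context; the actual consumer, and the attribute name it reads, are introduced in the following patch):

	char *str;

	str = blkzoned_get_sysfs_attr(f->file_name, "queue/max_open_zones");
	if (str) {
		dprint(FD_ZBD, "%s: max_open_zones: %s\n", f->file_name, str);
		free(str);
	}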
Signed-off-by: Niklas Cassel Signed-off-by: Jens Axboe --- oslib/linux-blkzoned.c | 62 +++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/oslib/linux-blkzoned.c b/oslib/linux-blkzoned.c index 81e4e7f0d5..84a64ed301 100644 --- a/oslib/linux-blkzoned.c +++ b/oslib/linux-blkzoned.c @@ -74,12 +74,16 @@ static char *read_file(const char *path) return strdup(line); } -int blkzoned_get_zoned_model(struct thread_data *td, struct fio_file *f, - enum zbd_zoned_model *model) +/* + * Get the value of a sysfs attribute for a block device. + * + * Returns NULL on failure. + * Returns a pointer to a string on success. + * The caller is responsible for freeing the memory. + */ +static char *blkzoned_get_sysfs_attr(const char *file_name, const char *attr) { - const char *file_name = f->file_name; - char *zoned_attr_path = NULL; - char *model_str = NULL; + char *attr_path = NULL; struct stat statbuf; char *sys_devno_path = NULL; char *part_attr_path = NULL; @@ -87,13 +91,7 @@ int blkzoned_get_zoned_model(struct thread_data *td, struct fio_file *f, char sys_path[PATH_MAX]; ssize_t sz; char *delim = NULL; - - if (f->filetype != FIO_TYPE_BLOCK) { - *model = ZBD_IGNORE; - return 0; - } - - *model = ZBD_NONE; + char *attr_str = NULL; if (stat(file_name, &statbuf) < 0) goto out; @@ -123,24 +121,44 @@ int blkzoned_get_zoned_model(struct thread_data *td, struct fio_file *f, *delim = '\0'; } - if (asprintf(&zoned_attr_path, - "/sys/dev/block/%s/queue/zoned", sys_path) < 0) + if (asprintf(&attr_path, + "/sys/dev/block/%s/%s", sys_path, attr) < 0) goto out; - model_str = read_file(zoned_attr_path); + attr_str = read_file(attr_path); +out: + free(attr_path); + free(part_str); + free(part_attr_path); + free(sys_devno_path); + + return attr_str; +} + +int blkzoned_get_zoned_model(struct thread_data *td, struct fio_file *f, + enum zbd_zoned_model *model) +{ + char *model_str = NULL; + + if (f->filetype != FIO_TYPE_BLOCK) { + *model = ZBD_IGNORE; + return 0; + } + + *model = ZBD_NONE; + + model_str = blkzoned_get_sysfs_attr(f->file_name, "queue/zoned"); if (!model_str) - goto out; - dprint(FD_ZBD, "%s: zbd model string: %s\n", file_name, model_str); + return 0; + + dprint(FD_ZBD, "%s: zbd model string: %s\n", f->file_name, model_str); if (strcmp(model_str, "host-aware") == 0) *model = ZBD_HOST_AWARE; else if (strcmp(model_str, "host-managed") == 0) *model = ZBD_HOST_MANAGED; -out: + free(model_str); - free(zoned_attr_path); - free(part_str); - free(part_attr_path); - free(sys_devno_path); + return 0; } From d2f442bc0bd507510089d56cd510616093415702 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 14 May 2021 12:53:14 +0000 Subject: [PATCH 25/42] ioengines: add get_max_open_zones zoned block device operation Define a new IO engine operation to get the maximum number of open zones. Like the existing IO engine operations: .get_zoned_model, .report_zones, and .reset_wp, this new IO engine operation is only valid for zoned block devices. Similarly to the other zbd IO engine operations, also provide a default implementation inside oslib/linux-blkzoned.c that will be used if the ioengine does not override it. The default Linux oslib implementation is implemented similarly to blkzoned_get_zoned_model(), i.e. it will return a successful error code even when the sysfs attribute does not exist. This is because the sysfs max_open_zones attribute was introduced first in Linux v5.9. 
All error handling is still there, so an ioengine that provides its own implementation will still have its error code respected properly. Signed-off-by: Niklas Cassel Signed-off-by: Jens Axboe --- engines/skeleton_external.c | 13 ++++++ ioengines.h | 4 +- oslib/blkzoned.h | 7 +++ oslib/linux-blkzoned.c | 21 +++++++++ zbd.c | 90 ++++++++++++++++++++++++++++++++++--- 5 files changed, 129 insertions(+), 6 deletions(-) diff --git a/engines/skeleton_external.c b/engines/skeleton_external.c index 7f3e4cb3a1..c79b6f1114 100644 --- a/engines/skeleton_external.c +++ b/engines/skeleton_external.c @@ -193,6 +193,18 @@ static int fio_skeleton_reset_wp(struct thread_data *td, struct fio_file *f, return 0; } +/* + * Hook called for getting the maximum number of open zones for a + * ZBD_HOST_MANAGED zoned block device. + * A @max_open_zones value set to zero means no limit. + */ +static int fio_skeleton_get_max_open_zones(struct thread_data *td, + struct fio_file *f, + unsigned int *max_open_zones) +{ + return 0; +} + /* * Note that the structure is exported, so that fio can get it via * dlsym(..., "ioengine"); for (and only for) external engines. @@ -212,6 +224,7 @@ struct ioengine_ops ioengine = { .get_zoned_model = fio_skeleton_get_zoned_model, .report_zones = fio_skeleton_report_zones, .reset_wp = fio_skeleton_reset_wp, + .get_max_open_zones = fio_skeleton_get_max_open_zones, .options = options, .option_struct_size = sizeof(struct fio_skeleton_options), }; diff --git a/ioengines.h b/ioengines.h index 1d01ab0a6d..b3f755b477 100644 --- a/ioengines.h +++ b/ioengines.h @@ -8,7 +8,7 @@ #include "io_u.h" #include "zbd_types.h" -#define FIO_IOOPS_VERSION 29 +#define FIO_IOOPS_VERSION 30 #ifndef CONFIG_DYNAMIC_ENGINES #define FIO_STATIC static @@ -59,6 +59,8 @@ struct ioengine_ops { uint64_t, struct zbd_zone *, unsigned int); int (*reset_wp)(struct thread_data *, struct fio_file *, uint64_t, uint64_t); + int (*get_max_open_zones)(struct thread_data *, struct fio_file *, + unsigned int *); int option_struct_size; struct fio_option *options; }; diff --git a/oslib/blkzoned.h b/oslib/blkzoned.h index 4cc071dc6a..719b041d12 100644 --- a/oslib/blkzoned.h +++ b/oslib/blkzoned.h @@ -16,6 +16,8 @@ extern int blkzoned_report_zones(struct thread_data *td, struct zbd_zone *zones, unsigned int nr_zones); extern int blkzoned_reset_wp(struct thread_data *td, struct fio_file *f, uint64_t offset, uint64_t length); +extern int blkzoned_get_max_open_zones(struct thread_data *td, struct fio_file *f, + unsigned int *max_open_zones); #else /* * Define stubs for systems that do not have zoned block device support. 
@@ -44,6 +46,11 @@ static inline int blkzoned_reset_wp(struct thread_data *td, struct fio_file *f, { return -EIO; } +static inline int blkzoned_get_max_open_zones(struct thread_data *td, struct fio_file *f, + unsigned int *max_open_zones) +{ + return -EIO; +} #endif #endif /* FIO_BLKZONED_H */ diff --git a/oslib/linux-blkzoned.c b/oslib/linux-blkzoned.c index 84a64ed301..6f89ec6f41 100644 --- a/oslib/linux-blkzoned.c +++ b/oslib/linux-blkzoned.c @@ -162,6 +162,27 @@ int blkzoned_get_zoned_model(struct thread_data *td, struct fio_file *f, return 0; } +int blkzoned_get_max_open_zones(struct thread_data *td, struct fio_file *f, + unsigned int *max_open_zones) +{ + char *max_open_str; + + if (f->filetype != FIO_TYPE_BLOCK) + return -EIO; + + max_open_str = blkzoned_get_sysfs_attr(f->file_name, "queue/max_open_zones"); + if (!max_open_str) + return 0; + + dprint(FD_ZBD, "%s: max open zones supported by device: %s\n", + f->file_name, max_open_str); + *max_open_zones = atoll(max_open_str); + + free(max_open_str); + + return 0; +} + static uint64_t zone_capacity(struct blk_zone_report *hdr, struct blk_zone *blkz) { diff --git a/zbd.c b/zbd.c index 46ff9aeb04..68cd58e1b9 100644 --- a/zbd.c +++ b/zbd.c @@ -113,6 +113,34 @@ int zbd_reset_wp(struct thread_data *td, struct fio_file *f, return ret; } +/** + * zbd_get_max_open_zones - Get the maximum number of open zones + * @td: FIO thread data + * @f: FIO file for which to get max open zones + * @max_open_zones: Upon success, result will be stored here. + * + * A @max_open_zones value set to zero means no limit. + * + * Returns 0 upon success and a negative error code upon failure. + */ +int zbd_get_max_open_zones(struct thread_data *td, struct fio_file *f, + unsigned int *max_open_zones) +{ + int ret; + + if (td->io_ops && td->io_ops->get_max_open_zones) + ret = td->io_ops->get_max_open_zones(td, f, max_open_zones); + else + ret = blkzoned_get_max_open_zones(td, f, max_open_zones); + if (ret < 0) { + td_verror(td, errno, "get max open zones failed"); + log_err("%s: get max open zones failed (%d).\n", + f->file_name, errno); + } + + return ret; +} + /** * zbd_zone_idx - convert an offset into a zone number * @f: file pointer. @@ -554,6 +582,51 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f) return ret; } +static int zbd_set_max_open_zones(struct thread_data *td, struct fio_file *f) +{ + struct zoned_block_device_info *zbd = f->zbd_info; + unsigned int max_open_zones; + int ret; + + if (zbd->model != ZBD_HOST_MANAGED) { + /* Only host-managed devices have a max open limit */ + zbd->max_open_zones = td->o.max_open_zones; + goto out; + } + + /* If host-managed, get the max open limit */ + ret = zbd_get_max_open_zones(td, f, &max_open_zones); + if (ret) + return ret; + + if (!max_open_zones) { + /* No device limit */ + zbd->max_open_zones = td->o.max_open_zones; + } else if (!td->o.max_open_zones) { + /* No user limit. Set limit to device limit */ + zbd->max_open_zones = max_open_zones; + } else if (td->o.max_open_zones <= max_open_zones) { + /* Both user limit and dev limit. User limit not too large */ + zbd->max_open_zones = td->o.max_open_zones; + } else { + /* Both user limit and dev limit. 
User limit too large */ + td_verror(td, EINVAL, + "Specified --max_open_zones is too large"); + log_err("Specified --max_open_zones (%d) is larger than max (%u)\n", + td->o.max_open_zones, max_open_zones); + return -EINVAL; + } + +out: + /* Ensure that the limit is not larger than FIO's internal limit */ + zbd->max_open_zones = min_not_zero(zbd->max_open_zones, + (uint32_t) ZBD_MAX_OPEN_ZONES); + dprint(FD_ZBD, "%s: using max open zones limit: %"PRIu32"\n", + f->file_name, zbd->max_open_zones); + + return 0; +} + /* * Allocate zone information and store it into f->zbd_info if zonemode=zbd. * @@ -576,9 +649,13 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f) case ZBD_HOST_AWARE: case ZBD_HOST_MANAGED: ret = parse_zone_info(td, f); + if (ret) + return ret; break; case ZBD_NONE: ret = init_zone_info(td, f); + if (ret) + return ret; break; default: td_verror(td, EINVAL, "Unsupported zoned model"); @@ -586,12 +663,15 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f) return -EINVAL; } - if (ret == 0) { - f->zbd_info->model = zbd_model; - f->zbd_info->max_open_zones = - min_not_zero(td->o.max_open_zones, ZBD_MAX_OPEN_ZONES); + f->zbd_info->model = zbd_model; + + ret = zbd_set_max_open_zones(td, f); + if (ret) { + zbd_free_zone_info(f); + return ret; } - return ret; + + return 0; } void zbd_free_zone_info(struct fio_file *f) From e8267436fd7a02d819f3d0a2a77527d2f942e08b Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 14 May 2021 12:53:15 +0000 Subject: [PATCH 26/42] engines/libzbc: add support for the get_max_open_zones io op Add support for the new .get_max_open_zones io operation. zbc.c will only ever call this callback for host-managed devices. Signed-off-by: Niklas Cassel Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- engines/libzbc.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/engines/libzbc.c b/engines/libzbc.c index 2aacf7bbeb..3dde93db54 100644 --- a/engines/libzbc.c +++ b/engines/libzbc.c @@ -19,6 +19,7 @@ struct libzbc_data { struct zbc_device *zdev; enum zbc_dev_model model; uint64_t nr_sectors; + uint32_t max_open_seq_req; }; static int libzbc_get_dev_info(struct libzbc_data *ld, struct fio_file *f) @@ -32,6 +33,7 @@ static int libzbc_get_dev_info(struct libzbc_data *ld, struct fio_file *f) zbc_get_device_info(ld->zdev, zinfo); ld->model = zinfo->zbd_model; ld->nr_sectors = zinfo->zbd_sectors; + ld->max_open_seq_req = zinfo->zbd_max_nr_open_seq_req; dprint(FD_ZBD, "%s: vendor_id:%s, type: %s, model: %s\n", f->file_name, zinfo->zbd_vendor_id, @@ -335,6 +337,24 @@ static int libzbc_reset_wp(struct thread_data *td, struct fio_file *f, return -ret; } +static int libzbc_get_max_open_zones(struct thread_data *td, struct fio_file *f, + unsigned int *max_open_zones) +{ + struct libzbc_data *ld; + int ret; + + ret = libzbc_open_dev(td, f, &ld); + if (ret) + return ret; + + if (ld->max_open_seq_req == ZBC_NO_LIMIT) + *max_open_zones = 0; + else + *max_open_zones = ld->max_open_seq_req; + + return 0; +} + ssize_t libzbc_rw(struct thread_data *td, struct io_u *io_u) { struct libzbc_data *ld = td->io_ops_data; @@ -414,6 +434,7 @@ FIO_STATIC struct ioengine_ops ioengine = { .get_zoned_model = libzbc_get_zoned_model, .report_zones = libzbc_report_zones, .reset_wp = libzbc_reset_wp, + .get_max_open_zones = libzbc_get_max_open_zones, .queue = libzbc_queue, .flags = FIO_SYNCIO | FIO_NOEXTEND | FIO_RAWIO, }; From d7e3adb683f85e49e078599a08aec7cd7c32d977 Mon Sep 17 00:00:00 2001 From: DevriesL Date: 
Tue, 25 May 2021 23:45:11 +0800 Subject: [PATCH 27/42] android: add support for NDK sharedmem Android add support for NDK sharedmem since API level 26 and prohibit the directly use of ashmem since API level 29, so we can use sharedmem if targeting API level is higher than 26. Signed-off-by: DevriesL --- os/os-android.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/os/os-android.h b/os/os-android.h index 3f1aa9d30a..a81cd815e1 100644 --- a/os/os-android.h +++ b/os/os-android.h @@ -71,11 +71,15 @@ #include #include #include +#include +#if __ANDROID_API__ >= __ANDROID_API_O__ +#include +#else +#define ASHMEM_DEVICE "/dev/ashmem" +#endif #define shmid_ds shmid64_ds #define SHM_HUGETLB 04000 -#define ASHMEM_DEVICE "/dev/ashmem" - static inline int shmctl(int __shmid, int __cmd, struct shmid_ds *__buf) { int ret=0; @@ -89,6 +93,16 @@ static inline int shmctl(int __shmid, int __cmd, struct shmid_ds *__buf) return ret; } +#if __ANDROID_API__ >= __ANDROID_API_O__ +static inline int shmget(key_t __key, size_t __size, int __shmflg) +{ + char keybuf[11]; + + sprintf(keybuf, "%d", __key); + + return ASharedMemory_create(keybuf, __size + sizeof(uint64_t)); +} +#else static inline int shmget(key_t __key, size_t __size, int __shmflg) { int fd,ret; @@ -114,6 +128,7 @@ static inline int shmget(key_t __key, size_t __size, int __shmflg) close(fd); return ret; } +#endif static inline void *shmat(int __shmid, const void *__shmaddr, int __shmflg) { From 0313e938c9c8bb37d71dade239f1f5326677b079 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 26 May 2021 10:10:32 -0600 Subject: [PATCH 28/42] Fio 3.27 Signed-off-by: Jens Axboe --- FIO-VERSION-GEN | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FIO-VERSION-GEN b/FIO-VERSION-GEN index 294860716c..47af94e9de 100755 --- a/FIO-VERSION-GEN +++ b/FIO-VERSION-GEN @@ -1,7 +1,7 @@ #!/bin/sh GVF=FIO-VERSION-FILE -DEF_VER=fio-3.26 +DEF_VER=fio-3.27 LF=' ' From 6df25f781e07e373833ec1629e005d36474c3b67 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 27 May 2021 11:12:31 +0000 Subject: [PATCH 29/42] zbd: add missing client/server support for option max_open_zones Ensure that we convert the max_open_zones option for client/server. Use __cpu_to_le32()/__le32_to_cpu() rather than cpu_to_le32()/le32_to_cpu(), since max_open_zones is defined as int rather than unsigned int in thread_options.h. 
Signed-off-by: Niklas Cassel Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- cconv.c | 2 ++ server.h | 2 +- thread_options.h | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/cconv.c b/cconv.c index aa06e3ea6e..d4dfb81b3f 100644 --- a/cconv.c +++ b/cconv.c @@ -231,6 +231,7 @@ void convert_thread_options_to_cpu(struct thread_options *o, o->zone_capacity = le64_to_cpu(top->zone_capacity); o->zone_skip = le64_to_cpu(top->zone_skip); o->zone_mode = le32_to_cpu(top->zone_mode); + o->max_open_zones = __le32_to_cpu(top->max_open_zones); o->lockmem = le64_to_cpu(top->lockmem); o->offset_increment_percent = le32_to_cpu(top->offset_increment_percent); o->offset_increment = le64_to_cpu(top->offset_increment); @@ -573,6 +574,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top, top->zone_capacity = __cpu_to_le64(o->zone_capacity); top->zone_skip = __cpu_to_le64(o->zone_skip); top->zone_mode = __cpu_to_le32(o->zone_mode); + top->max_open_zones = __cpu_to_le32(o->max_open_zones); top->lockmem = __cpu_to_le64(o->lockmem); top->ddir_seq_add = __cpu_to_le64(o->ddir_seq_add); top->file_size_low = __cpu_to_le64(o->file_size_low); diff --git a/server.h b/server.h index b45b319ba2..8cf3a60b4b 100644 --- a/server.h +++ b/server.h @@ -48,7 +48,7 @@ struct fio_net_cmd_reply { }; enum { - FIO_SERVER_VER = 89, + FIO_SERVER_VER = 90, FIO_SERVER_MAX_FRAGMENT_PDU = 1024, FIO_SERVER_MAX_CMD_MB = 2048, diff --git a/thread_options.h b/thread_options.h index 5ecc72d7b5..4d48e46299 100644 --- a/thread_options.h +++ b/thread_options.h @@ -656,6 +656,7 @@ struct thread_options_pack { uint32_t allow_mounted_write; uint32_t zone_mode; + int32_t max_open_zones; } __attribute__((packed)); extern void convert_thread_options_to_cpu(struct thread_options *o, struct thread_options_pack *top); From 575686bb85fa36f326524c505e83c54abc0d2f2b Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 27 May 2021 11:12:32 +0000 Subject: [PATCH 30/42] zbd: add a new --ignore_zone_limits option In commit d2f442bc0bd5 ("ioengines: add get_max_open_zones zoned block device operation") we added a check that verifies that the specified --max_open_zones value is lower than the max value reported by the device. For ZNS devices there is a max open zones and a max active zones limit. For ZAC/ZBC devices there is only a max open zones limit. On ZAC/ZBC, there is thus no limit on the amount of zones that can be in zone state closed. When doing a write to an empty or closed zone, a ZAC/ZBC drive will close an arbitrary implicit open zone in order to handle the write. The ZNS specification has no requirement on closing a zone in order to handle a write to an empty or closed zone. The drive is free to return an error. Even on ZAC/ZBC, you do not want to exceed the max open zones limit, since it will lead to additional implicit close zone and implicit open zone operations, which may degrade performance. However, it seems that this is sometimes done on purpose, in order to measure the overhead of these additional operations. Therefore, add an option that allows the user to ignore the reported device limits. 
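As a usage sketch (the device path and values are purely illustrative), a run that deliberately exceeds the device limit in order to measure that overhead could look like: fio --name=zbd-overcommit --filename=/dev/sdX --direct=1 --zonemode=zbd --rw=write --bs=128k --max_open_zones=64 --ignore_zone_limits=1. Without --ignore_zone_limits=1, the same job would exit with an error whenever the drive reports a smaller max open zones limit.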
Signed-off-by: Niklas Cassel Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- cconv.c | 2 ++ fio.1 | 5 +++++ options.c | 10 ++++++++++ server.h | 2 +- thread_options.h | 2 ++ zbd.c | 2 +- 6 files changed, 21 insertions(+), 2 deletions(-) diff --git a/cconv.c b/cconv.c index d4dfb81b3f..74c241063a 100644 --- a/cconv.c +++ b/cconv.c @@ -232,6 +232,7 @@ void convert_thread_options_to_cpu(struct thread_options *o, o->zone_skip = le64_to_cpu(top->zone_skip); o->zone_mode = le32_to_cpu(top->zone_mode); o->max_open_zones = __le32_to_cpu(top->max_open_zones); + o->ignore_zone_limits = le32_to_cpu(top->ignore_zone_limits); o->lockmem = le64_to_cpu(top->lockmem); o->offset_increment_percent = le32_to_cpu(top->offset_increment_percent); o->offset_increment = le64_to_cpu(top->offset_increment); @@ -575,6 +576,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top, top->zone_skip = __cpu_to_le64(o->zone_skip); top->zone_mode = __cpu_to_le32(o->zone_mode); top->max_open_zones = __cpu_to_le32(o->max_open_zones); + top->ignore_zone_limits = cpu_to_le32(o->ignore_zone_limits); top->lockmem = __cpu_to_le64(o->lockmem); top->ddir_seq_add = __cpu_to_le64(o->ddir_seq_add); top->file_size_low = __cpu_to_le64(o->file_size_low); diff --git a/fio.1 b/fio.1 index ab08cb0120..5aa54a4d04 100644 --- a/fio.1 +++ b/fio.1 @@ -835,6 +835,11 @@ threads/processes. .BI job_max_open_zones \fR=\fPint Limit on the number of simultaneously opened zones per single thread/process. .TP +.BI ignore_zone_limits \fR=\fPbool +If this isn't set, fio will query the max open zones limit from the zoned block +device, and exit if the specified \fBmax_open_zones\fR value is larger than the +limit reported by the device. Default: false. +.TP .BI zone_reset_threshold \fR=\fPfloat A number between zero and one that indicates the ratio of logical blocks with data to the total number of logical blocks in the test above which zones diff --git a/options.c b/options.c index b82a10aa44..a8986d1167 100644 --- a/options.c +++ b/options.c @@ -3492,6 +3492,16 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_INVALID, }, + { + .name = "ignore_zone_limits", + .lname = "Ignore zone resource limits", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, ignore_zone_limits), + .def = "0", + .help = "Ignore the zone resource limits (max open/active zones) reported by the device", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, { .name = "zone_reset_threshold", .lname = "Zone reset threshold", diff --git a/server.h b/server.h index 8cf3a60b4b..c128df28ad 100644 --- a/server.h +++ b/server.h @@ -48,7 +48,7 @@ struct fio_net_cmd_reply { }; enum { - FIO_SERVER_VER = 90, + FIO_SERVER_VER = 91, FIO_SERVER_MAX_FRAGMENT_PDU = 1024, FIO_SERVER_MAX_CMD_MB = 2048, diff --git a/thread_options.h b/thread_options.h index 4d48e46299..05c2d1383e 100644 --- a/thread_options.h +++ b/thread_options.h @@ -355,6 +355,7 @@ struct thread_options { unsigned int read_beyond_wp; int max_open_zones; unsigned int job_max_open_zones; + unsigned int ignore_zone_limits; fio_fp64_t zrt; fio_fp64_t zrf; }; @@ -657,6 +658,7 @@ struct thread_options_pack { uint32_t zone_mode; int32_t max_open_zones; + uint32_t ignore_zone_limits; } __attribute__((packed)); extern void convert_thread_options_to_cpu(struct thread_options *o, struct thread_options_pack *top); diff --git a/zbd.c b/zbd.c index 68cd58e1b9..5d9e331ac9 100644 --- a/zbd.c +++ b/zbd.c @@ -588,7 +588,7 @@ static int 
zbd_set_max_open_zones(struct thread_data *td, struct fio_file *f) unsigned int max_open_zones; int ret; - if (zbd->model != ZBD_HOST_MANAGED) { + if (zbd->model != ZBD_HOST_MANAGED || td->o.ignore_zone_limits) { /* Only host-managed devices have a max open limit */ zbd->max_open_zones = td->o.max_open_zones; goto out; From f34b0a0320e0511c5de7f41c1496f11708ff64c1 Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Wed, 2 Jun 2021 15:05:17 +0200 Subject: [PATCH 31/42] ci: Installing missing toolchain When trying to rebuild a failed build on a real windows system, the toolchain is missing. Let's add the toolchain here so we can reuse the script locally too. Signed-off-by: Erwan Velu --- ci/appveyor-install.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/appveyor-install.sh b/ci/appveyor-install.sh index c73e4cb53b..5f873a20e0 100755 --- a/ci/appveyor-install.sh +++ b/ci/appveyor-install.sh @@ -31,6 +31,7 @@ case "${DISTRO}" in pacman.exe --noconfirm -S \ mingw-w64-${PACKAGE_ARCH}-clang \ mingw-w64-${PACKAGE_ARCH}-cunit \ + mingw-w64-${PACKAGE_ARCH}-toolchain \ mingw-w64-${PACKAGE_ARCH}-lld ;; esac From 1a1e8144846b175a5858a92a68bc8e6279a549e4 Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Wed, 2 Jun 2021 15:52:06 +0200 Subject: [PATCH 32/42] ci: Reporting installed msys2 packages When reproducing a build locally, it's important to be on the same release as the CI. So let's listi the installed packages so we can compare the two builds more easily. Signed-off-by: Erwan Velu --- ci/appveyor-install.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/appveyor-install.sh b/ci/appveyor-install.sh index 5f873a20e0..3137f39ebe 100755 --- a/ci/appveyor-install.sh +++ b/ci/appveyor-install.sh @@ -33,6 +33,7 @@ case "${DISTRO}" in mingw-w64-${PACKAGE_ARCH}-cunit \ mingw-w64-${PACKAGE_ARCH}-toolchain \ mingw-w64-${PACKAGE_ARCH}-lld + pacman.exe -Q # List installed packages ;; esac From 4b0e335a05f3a082a4f051304ba9bb6f36af4432 Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Wed, 2 Jun 2021 16:15:59 +0200 Subject: [PATCH 33/42] Makefile: Avoid using built-in stpcpy during clang build Since clang 12, during the clang build, noticed by the CI, the linking fails as clang optimize some string functions to stpcpy. LINK fio lld-link: error: undefined symbol: stpcpy >>> referenced by C:\projects\fio\options.c:5305 >>> options.o:(fio_options_parse) Two possible implementations : - Adding stpcpy in fio as the kernel did : https://lore.kernel.org/lkml/20200815002417.1512973-1-ndesaulniers@google.com/T/ - Disable the implicit stpcpy To avoid adding code into fio, the latter option was used. Signed-off-by: Erwan Velu --- Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile b/Makefile index ef31737371..f57569d5f6 100644 --- a/Makefile +++ b/Makefile @@ -40,6 +40,11 @@ ifdef CONFIG_PDB LDFLAGS += -fuse-ld=lld $(LINK_PDBFILE) endif +# If clang, do not use builtin stpcpy as it breaks the build +ifeq ($(CC),clang) + FIO_CFLAGS += -fno-builtin-stpcpy +endif + ifdef CONFIG_GFIO PROGS += gfio endif From e1315822835ceaa976a2b8ac6a74ce7bb46b079f Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Fri, 4 Jun 2021 20:32:50 +0900 Subject: [PATCH 34/42] t/zbd: Use max_open_zones that fio fetched from device Recent commit d2f442bc0bd5 ("ioengines: add get_max_open_zones zoned block device operation") modified fio to compare --max_open_zones option value and max_open_zones reported by the device. The device limit is fetched through sysfs or through an ioengine specific implementation. 
The test script currently try to fetch the max open zones limit using libzbc tools or sg_inq. If either of these fail, default value 128 is supplied. This default value can be too high when the test script is run for certain zoned block devices, and can therefore result in fio error and test case failure. To avoid the failure, modify the default value used in the test script from 128 to 0. With this, --max_open_zones=0 is passed to fio, and it makes fio use the max_open_zones reported by the device. Also add comments to describe why the test script gets max_open_zones with tools. Reviewed-by: Niklas Cassel Signed-off-by: Shin'ichiro Kawasaki Signed-off-by: Jens Axboe --- t/zbd/functions | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/t/zbd/functions b/t/zbd/functions index 40ffe1deeb..08a2c629e8 100644 --- a/t/zbd/functions +++ b/t/zbd/functions @@ -173,15 +173,23 @@ last_online_zone() { fi } +# Get max_open_zones of SMR drives using sg_inq or libzbc tools. Two test cases +# 31 and 32 use this max_open_zones value. The test case 31 uses max_open_zones +# to decide number of write target zones. The test case 32 passes max_open_zones +# value to fio with --max_open_zones option. Of note is that fio itself has the +# feature to get max_open_zones from the device through sysfs or ioengine +# specific implementation. This max_open_zones fetch by test script is required +# in case fio is running on an old Linux kernel version which lacks +# max_open_zones in sysfs, or which lacks zoned block device support completely. max_open_zones() { local dev=$1 if [ -n "${sg_inq}" ] && [ ! -n "${use_libzbc}" ]; then if ! ${sg_inq} -e --page=0xB6 --len=20 --hex "$dev" \ > /dev/null 2>&1; then - # Non scsi device such as null_blk can not return max open zones. - # Use default value. - echo 128 + # When sg_inq can not get max open zones, specify 0 which indicates + # fio to get max open zones limit from the device. + echo 0 else ${sg_inq} -e --page=0xB6 --len=20 --hex "$dev" | tail -1 | { From 351fe91089c3babb06ae421a1abce3632f42b672 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Fri, 4 Jun 2021 20:32:51 +0900 Subject: [PATCH 35/42] t/zbd: Add ignore_zone_limit option to test with special max_open_zones Recent commit d2f442bc0bd5 ("ioengines: add get_max_open_zones zoned block device operation") modified fio to compare --max_open_zones option value and max_open_zones reported by the device. When the option --max_open_zones is larger than the device limit, fio exits with an error. However, sometimes it is useful to run fio with --max_open_zones larger than the device limit to check performance impact of implicit zone open and close by the zoned block devices. The test script t/zbd/test-zbd-support has an option -o so that users can specify such larger max_open_zones value. After the commit, such test runs fail with the fio error. To avoid the failure, modify the test script to specify another option --ignore_zone_limits to fio command, which was added by the commit 575686bb85fa (zbd: add a new --ignore_zone_limits option). This option is added to fio command only when users specify -o option and special max_open_zones value to the test script. This change does not affect default test conditions. 
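For illustration (the device path is made up and the exact invocation is assumed from the -o option described above): a run such as t/zbd/test-zbd-support -o 128 /dev/sdX, where 128 exceeds the drive's reported limit, would otherwise make fio exit with an error; with the script adding --ignore_zone_limits=1 in that case the run can proceed, while the default invocation without -o keeps honouring the device limit.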
Signed-off-by: Shin'ichiro Kawasaki Reviewed-by: Damien Le Moal Reviewed-by: Niklas Cassel Signed-off-by: Jens Axboe --- t/zbd/test-zbd-support | 1 + 1 file changed, 1 insertion(+) diff --git a/t/zbd/test-zbd-support b/t/zbd/test-zbd-support index 26aff3731b..015fa1dc35 100755 --- a/t/zbd/test-zbd-support +++ b/t/zbd/test-zbd-support @@ -1348,6 +1348,7 @@ fi if [[ -n ${max_open_zones_opt} ]]; then # Override max_open_zones with the script option value max_open_zones="${max_open_zones_opt}" + global_var_opts+=("--ignore_zone_limits=1") job_var_opts+=("--max_open_zones=${max_open_zones_opt}") fi From 40d0b84220f7c0ff9c3874656db7f0f8cb6a85e6 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Fri, 4 Jun 2021 20:32:52 +0900 Subject: [PATCH 36/42] t/zbd: Fix write target zones counting in test case #31 The test case #31 in t/zbd/test-zbd-support writes 128KB data to sequential write required zones as the preparation for the following random read test. The data write leaves the target zones in open status. The test case refers the variable 'nz', which has max_open_zones value, to decide how many zones to write the data. However, the end condition of the write target zone loop has a bug. The disk end offset is used as the loop end condition, which does not match the last target zone when number of sequential write required zones divided by nz has remainder. This results in write to more zones than nz=max_open_zones limit and the test case failure. To fix the bug and to simplify the script, avoid the loop and utilize zonemode strided to achieve the same data write pattern. Also specify size and io_size using nz to reliably count the write target zones. Even with the fix above, still the number of open zones may exceed max_open_zones since other test cases executed before the test case 31 may leave open zones on the test target device. To avoid this failure, reset all zones before the data write. The failures were observed with libzbc I/O engine after the commit e8267436fd7a ("engines/libzbc: add support for the get_max_open_zones io op"), which changed the max_open_zones value fio refers. Signed-off-by: Shin'ichiro Kawasaki Reviewed-by: Damien Le Moal Reviewed-by: Niklas Cassel Signed-off-by: Jens Axboe --- t/zbd/test-zbd-support | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/t/zbd/test-zbd-support b/t/zbd/test-zbd-support index 015fa1dc35..a684f98807 100755 --- a/t/zbd/test-zbd-support +++ b/t/zbd/test-zbd-support @@ -731,32 +731,28 @@ test30() { test31() { local bs inc nz off opts size - prep_write - # Start with writing 128 KB to max_open_zones sequential zones. - bs=128K + [ -n "$is_zbd" ] && reset_zone "$dev" -1 + + # As preparation, write 128 KB to sequential write required zones. Limit + # write target zones up to max_open_zones to keep test time reasonable. + # To distribute the write target zones evenly, skip certain zones for every + # write. Utilize zonemode strided for such write patterns. 
+ bs=$((128 * 1024)) nz=$((max_open_zones)) if [[ $nz -eq 0 ]]; then nz=128 fi - # shellcheck disable=SC2017 - inc=$(((disk_size - (first_sequential_zone_sector * 512)) / (nz * zone_size) - * zone_size)) - if [ "$inc" -eq 0 ]; then - require_seq_zones $nz || return $SKIP_TESTCASE - fi - opts=() - for ((off = first_sequential_zone_sector * 512; off < disk_size; - off += inc)); do - opts+=("--name=$dev" "--filename=$dev" "--offset=$off" "--io_size=$bs") - opts+=("--bs=$bs" "--size=$zone_size" "$(ioengine "libaio")") - opts+=("--rw=write" "--direct=1" "--thread=1" "--stats=0") - opts+=("--zonemode=zbd" "--zonesize=${zone_size}") - opts+=(${job_var_opts[@]}) - done - "$(dirname "$0")/../../fio" "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 - # Next, run the test. off=$((first_sequential_zone_sector * 512)) size=$((disk_size - off)) + inc=$(((size / nz / zone_size) * zone_size)) + opts=("--name=$dev" "--filename=$dev" "--rw=write" "--bs=${bs}") + opts+=("--offset=$off" "--size=$((inc * nz))" "--io_size=$((bs * nz))") + opts+=("--zonemode=strided" "--zonesize=${bs}" "--zonerange=${inc}") + opts+=("--direct=1") + echo "fio ${opts[@]}" >> "${logfile}.${test_number}" + "$(dirname "$0")/../../fio" "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 + + # Next, run the test. opts=("--name=$dev" "--filename=$dev" "--offset=$off" "--size=$size") opts+=("--bs=$bs" "$(ioengine "psync")" "--rw=randread" "--direct=1") opts+=("--thread=1" "--time_based" "--runtime=30" "--zonemode=zbd") From dd4620b7f9171edaa10955c4826454a05af27c85 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 10 Jun 2021 16:55:39 +0100 Subject: [PATCH 37/42] io_uring: drop redundant IO_MODE_OFFLOAD check check_engine_ops() already returns an error if io_submit_mode is IO_MODE_OFFLOAD and the engine is marked FIO_NO_OFFLOAD. Signed-off-by: Stefan Hajnoczi Signed-off-by: Jens Axboe --- engines/io_uring.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/engines/io_uring.c b/engines/io_uring.c index b962e8041b..9c091e37e6 100644 --- a/engines/io_uring.c +++ b/engines/io_uring.c @@ -728,12 +728,6 @@ static int fio_ioring_init(struct thread_data *td) struct ioring_data *ld; struct thread_options *to = &td->o; - if (to->io_submit_mode == IO_MODE_OFFLOAD) { - log_err("fio: io_submit_mode=offload is not compatible (or " - "useful) with io_uring\n"); - return 1; - } - /* sqthread submission requires registered files */ if (o->sqpoll_thread) o->registerfiles = 1; From 50cc48d52fec6c74a46e377b23f19ebed532125a Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Mon, 14 Jun 2021 13:49:03 +0000 Subject: [PATCH 38/42] zbd: disallow pipes for zonemode=zbd zoned block device support in fio cannot handle pipes, so simply reject them and give a clear error message. 
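For concreteness (an assumed scenario, not taken from the patch itself): this covers cases such as pointing --filename at a named FIFO created with mkfifo, which fio classifies as a pipe-type file; with zonemode=zbd such a setup is now rejected up front with the error message added below.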
Signed-off-by: Niklas Cassel Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- zbd.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/zbd.c b/zbd.c index 5d9e331ac9..60325d28fa 100644 --- a/zbd.c +++ b/zbd.c @@ -32,6 +32,11 @@ int zbd_get_zoned_model(struct thread_data *td, struct fio_file *f, { int ret; + if (f->filetype == FIO_TYPE_PIPE) { + log_err("zonemode=zbd does not support pipes\n"); + return -EINVAL; + } + if (td->io_ops && td->io_ops->get_zoned_model) ret = td->io_ops->get_zoned_model(td, f, model); else From 9db0cde87d1c928b9d629c6f1b0f8f2ed729d908 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Mon, 14 Jun 2021 13:49:04 +0000 Subject: [PATCH 39/42] zbd: allow zonemode=zbd with regular files by emulating zones Currently when using zonemode=zbd and running against a regular file, fio will fail with: fio: file hash not empty on exit Treat regular files just like how we treat regular (non-zoned) block devices: return ZBD_NONE and let zbd.c emulate zones inside the regular file/block device. Signed-off-by: Niklas Cassel Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- zbd.c | 14 +++++++++++++- zbd_types.h | 2 +- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/zbd.c b/zbd.c index 60325d28fa..d1db9adc29 100644 --- a/zbd.c +++ b/zbd.c @@ -37,6 +37,12 @@ int zbd_get_zoned_model(struct thread_data *td, struct fio_file *f, return -EINVAL; } + /* If regular file, always emulate zones inside the file. */ + if (f->filetype == FIO_TYPE_FILE) { + *model = ZBD_NONE; + return 0; + } + if (td->io_ops && td->io_ops->get_zoned_model) ret = td->io_ops->get_zoned_model(td, f, model); else @@ -414,7 +420,7 @@ static int init_zone_info(struct thread_data *td, struct fio_file *f) int i; if (zone_size == 0) { - log_err("%s: Specifying the zone size is mandatory for regular block devices with --zonemode=zbd\n\n", + log_err("%s: Specifying the zone size is mandatory for regular file/block device with --zonemode=zbd\n\n", f->file_name); return 1; } @@ -435,6 +441,12 @@ static int init_zone_info(struct thread_data *td, struct fio_file *f) return 1; } + if (f->real_file_size < zone_size) { + log_err("%s: file/device size %"PRIu64" is smaller than zone size %"PRIu64"\n", + f->file_name, f->real_file_size, zone_size); + return -EINVAL; + } + nr_zones = (f->real_file_size + zone_size - 1) / zone_size; zbd_info = scalloc(1, sizeof(*zbd_info) + (nr_zones + 1) * sizeof(zbd_info->zone_info[0])); diff --git a/zbd_types.h b/zbd_types.h index 5ed41aa06c..d0f4c44e23 100644 --- a/zbd_types.h +++ b/zbd_types.h @@ -15,7 +15,7 @@ */ enum zbd_zoned_model { ZBD_IGNORE, /* Ignore file */ - ZBD_NONE, /* Regular block device */ + ZBD_NONE, /* No zone support. Emulate zones. */ ZBD_HOST_AWARE, /* Host-aware zoned block device */ ZBD_HOST_MANAGED, /* Host-managed zoned block device */ }; From 2c7dd23e5142e421723ede2557fe868ac32c8265 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Mon, 14 Jun 2021 13:49:04 +0000 Subject: [PATCH 40/42] zbd: remove zbd_zoned_model ZBD_IGNORE For a job with zonemode=zbd, we do not want any file to be ignored. Each file's file type in that job should be supported by either zbd.c or the ioengine. If not, we should return an error. This way, ZBD_IGNORE becomes redundant and can be removed. By removing ZBD_IGNORE, we know that all files belonging to a job that has zonemode=zbd set, will either be a zoned block device, or emulate a zoned block device. This means that for jobs that have zonemode=zbd, f->zbd_info will always be non-NULL. 
From 2c7dd23e5142e421723ede2557fe868ac32c8265 Mon Sep 17 00:00:00 2001
From: Niklas Cassel
Date: Mon, 14 Jun 2021 13:49:04 +0000
Subject: [PATCH 40/42] zbd: remove zbd_zoned_model ZBD_IGNORE

For a job with zonemode=zbd, we do not want any file to be ignored.
Each file's file type in that job should be supported by either zbd.c
or the ioengine. If not, we should return an error.

This way, ZBD_IGNORE becomes redundant and can be removed.

By removing ZBD_IGNORE, we know that all files belonging to a job that
has zonemode=zbd set, will either be a zoned block device, or emulate a
zoned block device. This means that for jobs that have zonemode=zbd,
f->zbd_info will always be non-NULL.

This will make the zbd code slightly easier to reason about and to
maintain.

When removing zbd_zoned_model ZBD_IGNORE, define the new first enum
value as 0x1, so that we avoid potential ABI problems with existing
binaries.

Signed-off-by: Niklas Cassel
Reviewed-by: Damien Le Moal
Signed-off-by: Jens Axboe
---
 engines/libzbc.c            | 6 ++----
 engines/skeleton_external.c | 1 -
 oslib/linux-blkzoned.c      | 6 ++----
 zbd.c                       | 3 +--
 zbd_types.h                 | 7 +++----
 5 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/engines/libzbc.c b/engines/libzbc.c
index 3dde93db54..7f2bc431b4 100644
--- a/engines/libzbc.c
+++ b/engines/libzbc.c
@@ -180,10 +180,8 @@ static int libzbc_get_zoned_model(struct thread_data *td, struct fio_file *f,
 	struct libzbc_data *ld;
 	int ret;
 
-	if (f->filetype != FIO_TYPE_BLOCK && f->filetype != FIO_TYPE_CHAR) {
-		*model = ZBD_IGNORE;
-		return 0;
-	}
+	if (f->filetype != FIO_TYPE_BLOCK && f->filetype != FIO_TYPE_CHAR)
+		return -EINVAL;
 
 	ret = libzbc_open_dev(td, f, &ld);
 	if (ret)
diff --git a/engines/skeleton_external.c b/engines/skeleton_external.c
index c79b6f1114..cff83a10ef 100644
--- a/engines/skeleton_external.c
+++ b/engines/skeleton_external.c
@@ -156,7 +156,6 @@ static int fio_skeleton_close(struct thread_data *td, struct fio_file *f)
 /*
  * Hook for getting the zoned model of a zoned block device for zonemode=zbd.
  * The zoned model can be one of (see zbd_types.h):
- * - ZBD_IGNORE: skip regular files
  * - ZBD_NONE: regular block device (zone emulation will be used)
  * - ZBD_HOST_AWARE: host aware zoned block device
  * - ZBD_HOST_MANAGED: host managed zoned block device
diff --git a/oslib/linux-blkzoned.c b/oslib/linux-blkzoned.c
index 6f89ec6f41..4e441d29b8 100644
--- a/oslib/linux-blkzoned.c
+++ b/oslib/linux-blkzoned.c
@@ -140,10 +140,8 @@ int blkzoned_get_zoned_model(struct thread_data *td, struct fio_file *f,
 {
 	char *model_str = NULL;
 
-	if (f->filetype != FIO_TYPE_BLOCK) {
-		*model = ZBD_IGNORE;
-		return 0;
-	}
+	if (f->filetype != FIO_TYPE_BLOCK)
+		return -EINVAL;
 
 	*model = ZBD_NONE;
 
diff --git a/zbd.c b/zbd.c
index d1db9adc29..aab4d74136 100644
--- a/zbd.c
+++ b/zbd.c
@@ -661,8 +661,6 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f)
 		return ret;
 
 	switch (zbd_model) {
-	case ZBD_IGNORE:
-		return 0;
 	case ZBD_HOST_AWARE:
 	case ZBD_HOST_MANAGED:
 		ret = parse_zone_info(td, f);
@@ -680,6 +678,7 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f)
 		return -EINVAL;
 	}
 
+	assert(f->zbd_info);
 	f->zbd_info->model = zbd_model;
 
 	ret = zbd_set_max_open_zones(td, f);
diff --git a/zbd_types.h b/zbd_types.h
index d0f4c44e23..0a8630cb71 100644
--- a/zbd_types.h
+++ b/zbd_types.h
@@ -14,10 +14,9 @@
  * Zoned block device models.
  */
 enum zbd_zoned_model {
-	ZBD_IGNORE,		/* Ignore file */
-	ZBD_NONE,		/* No zone support. Emulate zones. */
-	ZBD_HOST_AWARE,		/* Host-aware zoned block device */
-	ZBD_HOST_MANAGED,	/* Host-managed zoned block device */
+	ZBD_NONE = 0x1,		/* No zone support. Emulate zones. */
+	ZBD_HOST_AWARE = 0x2,	/* Host-aware zoned block device */
+	ZBD_HOST_MANAGED = 0x3,	/* Host-managed zoned block device */
 };
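
Note: one visible consequence of dropping ZBD_IGNORE is that a zonemode=zbd job
whose file type is supported neither by zbd.c nor by the ioengine now fails at
setup instead of silently skipping zone handling for that file. A rough
illustration (the character-device target is an assumption, not taken from the
patch):

  # Sketch: /dev/zero is a character device, so with the default ioengine the
  # blkzoned backend should now reject it (-EINVAL) instead of returning
  # ZBD_IGNORE and letting the job run without zone accounting.
  fio --name=reject --filename=/dev/zero --rw=read --bs=4k --size=1M \
      --zonemode=zbd --zonesize=1M
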
From 5ddf46d0b2dfe10b9a518db1f936c81e099b2646 Mon Sep 17 00:00:00 2001
From: Niklas Cassel
Date: Mon, 14 Jun 2021 13:49:05 +0000
Subject: [PATCH 41/42] zbd: change some f->zbd_info conditionals to asserts

Unfortunately, generic fio code calls some zbd_* functions
unconditionally. These functions will be called regardless if
zonemode == ZONE_MODE_NONE, ZONE_MODE_STRIDED or ZONE_MODE_ZBD,
and cannot be optimized.

However, some functions are only called when zonemode == ZONE_MODE_ZBD.
Since f->zbd_info will always be non-NULL for a job with zonemode=zbd,
these functions can be optimized to not check if f->zbd_info is set.

Signed-off-by: Niklas Cassel
Reviewed-by: Damien Le Moal
Signed-off-by: Jens Axboe
---
 zbd.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/zbd.c b/zbd.c
index aab4d74136..8e99eb95dc 100644
--- a/zbd.c
+++ b/zbd.c
@@ -808,8 +808,7 @@ int zbd_setup_files(struct thread_data *td)
 		struct fio_zone_info *z;
 		int zi;
 
-		if (!zbd)
-			continue;
+		assert(zbd);
 
 		f->min_zone = zbd_zone_idx(f, f->file_offset);
 		f->max_zone = zbd_zone_idx(f, f->file_offset + f->io_size);
@@ -1470,8 +1469,7 @@ static void zbd_queue_io(struct thread_data *td, struct io_u *io_u, int q,
 	uint32_t zone_idx;
 	uint64_t zone_end;
 
-	if (!zbd_info)
-		return;
+	assert(zbd_info);
 
 	zone_idx = zbd_zone_idx(f, io_u->offset);
 	assert(zone_idx < zbd_info->nr_zones);
@@ -1531,8 +1529,7 @@ static void zbd_put_io(struct thread_data *td, const struct io_u *io_u)
 	struct fio_zone_info *z;
 	uint32_t zone_idx;
 
-	if (!zbd_info)
-		return;
+	assert(zbd_info);
 
 	zone_idx = zbd_zone_idx(f, io_u->offset);
 	assert(zone_idx < zbd_info->nr_zones);
@@ -1588,6 +1585,7 @@ void setup_zbd_zone_mode(struct thread_data *td, struct io_u *io_u)
 
 	assert(td->o.zone_mode == ZONE_MODE_ZBD);
 	assert(td->o.zone_size);
+	assert(f->zbd_info);
 
 	zone_idx = zbd_zone_idx(f, f->last_pos[ddir]);
 	z = get_zone(f, zone_idx);
@@ -1662,6 +1660,7 @@ enum fio_ddir zbd_adjust_ddir(struct thread_data *td, struct io_u *io_u,
 	 * devices with all empty zones. Overwrite the first I/O direction as
 	 * write to make sure data to read exists.
 	 */
+	assert(io_u->file->zbd_info);
 	if (ddir != DDIR_READ || !td_rw(td))
 		return ddir;
 
@@ -1691,9 +1690,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 	uint64_t new_len;
 	int64_t range;
 
-	if (!f->zbd_info)
-		return io_u_accept;
-
+	assert(f->zbd_info);
 	assert(min_bs);
 	assert(is_valid_offset(f, io_u->offset));
 	assert(io_u->buflen);

From a59b12d2a5eb92c1128a5d8ebcd03b1831962ce5 Mon Sep 17 00:00:00 2001
From: Niklas Cassel
Date: Mon, 14 Jun 2021 13:49:05 +0000
Subject: [PATCH 42/42] t/zbd: update test case 42

Update test case 42 to grep for the new string printed by fio when
--zonesize=0 is supplied.

Signed-off-by: Niklas Cassel
Signed-off-by: Jens Axboe
---
 t/zbd/test-zbd-support | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t/zbd/test-zbd-support b/t/zbd/test-zbd-support
index a684f98807..57e6d05ea7 100755
--- a/t/zbd/test-zbd-support
+++ b/t/zbd/test-zbd-support
@@ -922,7 +922,7 @@ test41() {
 test42() {
 	require_regular_block_dev || return $SKIP_TESTCASE
 	read_one_block --zonemode=zbd --zonesize=0 |
-		grep -q 'Specifying the zone size is mandatory for regular block devices with --zonemode=zbd'
+		grep -q 'Specifying the zone size is mandatory for regular file/block device with --zonemode=zbd'
 }
 
 # Check whether fio handles --zonesize=1 correctly for regular block devices.
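
Note: the message that test case 42 now greps for can be reproduced by hand with
an invocation along these lines (the scratch file path is an assumption; the
test itself runs against a regular, non-zoned block device):

  # Sketch: --zonesize=0 provides no usable zone size, so zone emulation cannot
  # be set up and fio should print the updated
  # "Specifying the zone size is mandatory ..." error.
  fio --name=t42 --filename=/var/tmp/zbd-t42.img --size=8M --rw=read --bs=4k \
      --zonemode=zbd --zonesize=0 2>&1 |
      grep 'Specifying the zone size is mandatory'
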