From 9326926bef943245c244eb0e6129ae046a3719a9 Mon Sep 17 00:00:00 2001 From: Taras Glek Date: Wed, 26 Feb 2020 09:39:52 -0800 Subject: [PATCH 01/42] NFS engine --- HOWTO | 13 +- Makefile | 6 + configure | 28 ++++ engines/nfs.c | 351 +++++++++++++++++++++++++++++++++++++++++++++++ examples/nfs.fio | 23 ++++ fio.1 | 10 ++ optgroup.c | 4 + optgroup.h | 2 + options.c | 5 + 9 files changed, 441 insertions(+), 1 deletion(-) create mode 100644 engines/nfs.c create mode 100644 examples/nfs.fio diff --git a/HOWTO b/HOWTO index 2788670ddb..367164b117 100644 --- a/HOWTO +++ b/HOWTO @@ -1168,7 +1168,7 @@ I/O type **1** Backward-compatible alias for **mixed**. - + **2** Alias for **both**. @@ -2091,6 +2091,12 @@ I/O engine I/O engine supporting asynchronous read and write operations to the DAOS File System (DFS) via libdfs. + **nfs** + I/O engine supporting asynchronous read and write operations to + NFS filesystems from userspace via libnfs. This is useful for + achieving higher concurrency and thus throughput than is possible + via kernel NFS. + I/O engine specific parameters ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2508,6 +2514,11 @@ with the caveat that when used on the command line, they must come after the Specificy a different object class for the dfs file. Use DAOS container's object class by default. +.. option:: nfs_url=str : [nfs] + + URL in libnfs format, eg nfs:///path[?arg=val[&arg=val]*] + Refer to the libnfs README for more details. + I/O depth ~~~~~~~~~ diff --git a/Makefile b/Makefile index fce3d0d134..78a369eb3e 100644 --- a/Makefile +++ b/Makefile @@ -79,6 +79,12 @@ ifdef CONFIG_LIBNBD ENGINES += nbd endif +ifdef CONFIG_LIBNFS + CFLAGS += $(LIBNFS_CFLAGS) + LIBS += $(LIBNFS_LIBS) + SOURCE += engines/nfs.c +endif + ifdef CONFIG_64BIT CPPFLAGS += -DBITS_PER_LONG=64 else ifdef CONFIG_32BIT diff --git a/configure b/configure index a7d82be06b..a9f0c033e4 100755 --- a/configure +++ b/configure @@ -172,6 +172,7 @@ libiscsi="no" libnbd="no" libzbc="" dfs="" +libnfs="no" dynamic_engines="no" prefix=/usr/local @@ -241,6 +242,8 @@ for opt do ;; --disable-tcmalloc) disable_tcmalloc="yes" ;; + --enable-libnfs) libnfs="yes" + ;; --dynamic-libengines) dynamic_engines="yes" ;; --disable-dfs) dfs="no" @@ -273,6 +276,7 @@ if test "$show_help" = "yes" ; then echo "--disable-http Disable HTTP support even if found" echo "--disable-gfapi Disable gfapi" echo "--enable-libhdfs Enable hdfs support" + echo "--enable-libnfs Enable nfs support" echo "--disable-lex Disable use of lex/yacc for math" echo "--disable-pmem Disable pmem based engines even if found" echo "--enable-lex Enable use of lex/yacc for math" @@ -2276,6 +2280,21 @@ EOF fi fi print_config "DAOS File System (dfs) Engine" "$dfs" +# Check if we have libnfs (for nfs support). 
+if test "$libnfs" = "yes" ; then + if $(pkg-config libnfs); then + libnfs="yes" + libnfs_cflags=$(pkg-config --cflags libnfs) + # libnfs_libs=$(pkg-config --libs libnfs) + libnfs_libs=/usr/local/lib/libnfs.a + else + if test "$libnfs" = "yes" ; then + echo "libnfs" "Install libnfs" + fi + libnfs="no" + fi +fi +print_config "nfs engine" "$libnfs" ########################################## # Check if we have lex/yacc available @@ -3101,6 +3120,9 @@ fi if test "$dfs" = "yes" ; then output_sym "CONFIG_DFS" fi +if test "$libnfs" = "yes" ; then + output_sym "CONFIG_NFS" +fi if test "$march_set" = "no" && test "$build_native" = "yes" ; then output_sym "CONFIG_BUILD_NATIVE" fi @@ -3140,6 +3162,12 @@ if test "$libnbd" = "yes" ; then echo "LIBNBD_CFLAGS=$libnbd_cflags" >> $config_host_mak echo "LIBNBD_LIBS=$libnbd_libs" >> $config_host_mak fi +if test "$libnfs" = "yes" ; then + output_sym "CONFIG_LIBNFS" + echo "CONFIG_LIBNFS=m" >> $config_host_mak + echo "LIBNFS_CFLAGS=$libnfs_cflags" >> $config_host_mak + echo "LIBNFS_LIBS=$libnfs_libs" >> $config_host_mak +fi if test "$dynamic_engines" = "yes" ; then output_sym "CONFIG_DYNAMIC_ENGINES" fi diff --git a/engines/nfs.c b/engines/nfs.c new file mode 100644 index 0000000000..df09477600 --- /dev/null +++ b/engines/nfs.c @@ -0,0 +1,351 @@ +// https://github.com/axboe/fio/pull/762 sample pull req for new engine +#include +#include +#include +#include +#include + +#include "../fio.h" +#include "../optgroup.h" + +enum nfs_op_type { + NFS_READ_WRITE = 0, + NFS_STAT_MKDIR_RMDIR, + NFS_STAT_TOUCH_RM, +}; + +struct fio_libnfs_options { + struct nfs_context *context; + char *nfs_url; + // the following implements a circular queue of outstanding IOs + int outstanding_events; // IOs issued to libnfs, that have not returned yet + int prev_requested_event_index; // event last returned via fio_libnfs_event + int next_buffered_event; // round robin-pointer within events[] + int buffered_event_count; // IOs completed by libnfs faiting for FIO + int free_event_buffer_index; // next empty buffer + unsigned int queue_depth; // nfs_callback needs this info, but doesn't have fio td structure to pull it from + struct io_u**events; +}; + +struct nfs_data { + struct nfsfh *nfsfh; + struct fio_libnfs_options *options; +}; + +static struct fio_option options[] = { + { + .name = "nfs_url", + .lname = "nfs_url", + .type = FIO_OPT_STR_STORE, + .help = "URL in libnfs format, eg nfs:///path[?arg=val[&arg=val]*]", + .off1 = offsetof(struct fio_libnfs_options, nfs_url), + .category = FIO_OPT_C_ENGINE, + .group = __FIO_OPT_G_NFS, + }, + { + .name = NULL, + }, +}; + +/* + * The ->event() hook is called to match an event number with an io_u. + * After the core has called ->getevents() and it has returned eg 3, + * the ->event() hook must return the 3 events that have completed for + * subsequent calls to ->event() with [0-2]. Required. 
+ */ +static struct io_u *fio_libnfs_event(struct thread_data *td, int event) +{ + struct fio_libnfs_options *o = td->eo; + struct io_u *io_u = o->events[o->next_buffered_event]; + assert(o->events[o->next_buffered_event]); + o->events[o->next_buffered_event] = NULL; + o->next_buffered_event = (o->next_buffered_event + 1) % td->o.iodepth; + // validate our state machine + assert(o->buffered_event_count); + o->buffered_event_count--; + assert(io_u); + // assert that fio_libnfs_event is being called in sequential fashion + assert(event == 0 || o->prev_requested_event_index + 1 == event); + if (o->buffered_event_count == 0) { + o->prev_requested_event_index = -1; + } else { + o->prev_requested_event_index = event; + } + return io_u; +} + +static int nfs_event_loop(struct thread_data *td, bool flush) { + struct fio_libnfs_options *o = td->eo; + struct pollfd pfds[1]; /* nfs:0 */ + // we already have stuff queued for fio, no need to waste cpu on poll() + if (o->buffered_event_count) { + return o->buffered_event_count; + } + // fio core logic seems to stop calling this event-loop if we ever return with 0 events + #define SHOULD_WAIT() (o->outstanding_events == td->o.iodepth || (flush && o->outstanding_events)) + + do { + int timeout = SHOULD_WAIT() ? -1 : 0; + int ret = 0; + pfds[0].fd = nfs_get_fd(o->context); + pfds[0].events = nfs_which_events(o->context); + ret = poll(&pfds[0], 1, timeout); + if (ret < 0) { + if (errno == EINTR || errno == EAGAIN) { + continue; + } + log_err("nfs: failed to poll events: %s.\n", + strerror(errno)); + break; + } + + ret = nfs_service(o->context, pfds[0].revents); + if (ret < 0) { + log_err("nfs: socket is in an unrecoverable error state.\n"); + break; + } + } while (SHOULD_WAIT()); + return o->buffered_event_count; +} +#undef SHOULD_WAIT + +/* + * The ->getevents() hook is used to reap completion events from an async + * io engine. It returns the number of completed events since the last call, + * which may then be retrieved by calling the ->event() hook with the event + * numbers. Required. 
+ */ +static int fio_libnfs_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + return nfs_event_loop(td, false); +} + +static void nfs_callback(int res, struct nfs_context *nfs, void *data, + void *private_data) +{ + struct io_u *io_u = private_data; + struct nfs_data *nfs_data = io_u->file->engine_data; + struct fio_libnfs_options *o = nfs_data->options; + if (res < 0) { + log_err("Failed NFS operation(code:%d): %s\n", res, nfs_get_error(o->context)); + io_u->error = -res; + // res is used for read math below, don't wanna pass negative there + res = 0; + } else if (io_u->ddir == DDIR_READ) { + memcpy(io_u->buf, data, res); + if (res == 0) { + log_err("Got NFS EOF, this is probably not expected\n"); + } + } + // fio uses resid to track remaining data + io_u->resid = io_u->xfer_buflen - res; + + assert(!o->events[o->free_event_buffer_index]); + o->events[o->free_event_buffer_index] = io_u; + o->free_event_buffer_index = (o->free_event_buffer_index + 1) % o->queue_depth; + o->outstanding_events--; + o->buffered_event_count++; +} + +static int queue_write(struct fio_libnfs_options *o, struct io_u *io_u) { + struct nfs_data *nfs_data = io_u->engine_data; + return nfs_pwrite_async(o->context, nfs_data->nfsfh, + io_u->offset, io_u->buflen, io_u->buf, nfs_callback, + io_u); +} + +static int queue_read(struct fio_libnfs_options *o, struct io_u *io_u) { + struct nfs_data *nfs_data = io_u->engine_data; + return nfs_pread_async(o->context, nfs_data->nfsfh, io_u->offset, io_u->buflen, nfs_callback, io_u); +} + +/* + * The ->queue() hook is responsible for initiating io on the io_u + * being passed in. If the io engine is a synchronous one, io may complete + * before ->queue() returns. Required. + * + * The io engine must transfer in the direction noted by io_u->ddir + * to the buffer pointed to by io_u->xfer_buf for as many bytes as + * io_u->xfer_buflen. Residual data count may be set in io_u->resid + * for a short read/write. 
+ */ +static enum fio_q_status fio_libnfs_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct nfs_data *nfs_data = io_u->file->engine_data; + struct fio_libnfs_options *o = nfs_data->options; + struct nfs_context *nfs = o->context; + int err; + enum fio_q_status ret = FIO_Q_QUEUED; + + io_u->engine_data = nfs_data; + switch(io_u->ddir) { + case DDIR_WRITE: + err = queue_write(o, io_u); + break; + case DDIR_READ: + err = queue_read(o, io_u); + break; + case DDIR_TRIM: + log_err("nfs: trim is not supported"); + err = -1; + break; + default: + log_err("nfs: unhandled io %d\n", io_u->ddir); + err = -1; + } + if (err) { + log_err("nfs: Failed to queue nfs op: %s\n", nfs_get_error(nfs)); + td->error = 1; + return FIO_Q_COMPLETED; + } + o->outstanding_events++; + return ret; +} + +/** Do a mount if one has not been done before */ +static int do_mount(struct thread_data *td, const char *url) +{ + size_t event_size = sizeof(struct io_u **) * td->o.iodepth; + struct fio_libnfs_options *options = td->eo; + struct nfs_url *nfs_url = NULL; + int ret = 0; + int path_len = 0; + char *mnt_dir = NULL; + + if (options->context) { + return 0; + } + + options->context = nfs_init_context(); + if (options->context == NULL) { + log_err("nfs: failed to init nfs context\n"); + return -1; + } + + options->events = malloc(event_size); + memset(options->events, 0, event_size); + + options->prev_requested_event_index = -1; + options->queue_depth = td->o.iodepth; + + nfs_url = nfs_parse_url_full(options->context, url); + path_len = strlen(nfs_url->path); + mnt_dir = malloc(path_len + strlen(nfs_url->file) + 1); + strcpy(mnt_dir, nfs_url->path); + strcpy(mnt_dir + strlen(nfs_url->path), nfs_url->file); + ret = nfs_mount(options->context, nfs_url->server, mnt_dir); + free(mnt_dir); + nfs_destroy_url(nfs_url); + return ret; +} + +/* + * The init function is called once per thread/process, and should set up + * any structures that this io engine requires to keep track of io. Not + * required. + */ +static int fio_libnfs_setup(struct thread_data *td) +{ + // flipping this makes using gdb easier, but tends to hang fio on exit + td->o.use_thread = 0; + return 0; +} + +/* + * This is paired with the ->init() function and is called when a thread is + * done doing io. Should tear down anything setup by the ->init() function. + * Not required. 
+ */ +static void fio_libnfs_cleanup(struct thread_data *td) +{ + struct fio_libnfs_options *o = td->eo; + nfs_umount(o->context); + nfs_destroy_context(o->context); + free(o->events); +} + +static int fio_libnfs_open(struct thread_data *td, struct fio_file *f) +{ + int ret; + struct fio_libnfs_options *options = td->eo; + struct nfs_data *nfs_data = NULL; + int flags = 0; + + if (!options->nfs_url) { + log_err("nfs: nfs_url is a required parameter\n"); + return -1; + } + + ret = do_mount(td, options->nfs_url); + + if (ret != 0) { + log_err("nfs: Failed to mount %s with code %d: %s\n", options->nfs_url, ret, nfs_get_error(options->context)); + return ret; + } + nfs_data = malloc(sizeof(struct nfs_data)); + memset(nfs_data, 0, sizeof(struct nfs_data)); + nfs_data->options = options; + + if (td->o.td_ddir == TD_DDIR_WRITE) { + flags |= O_CREAT | O_RDWR; + } else { + flags |= O_RDWR; + } + ret = nfs_open(options->context, f->file_name, flags, &nfs_data->nfsfh); + + if (ret != 0) { + log_err("Failed to open %s: %s\n", f->file_name, nfs_get_error(options->context)); + } + f->engine_data = nfs_data; + return ret; +} + +static int fio_libnfs_close(struct thread_data *td, struct fio_file *f) +{ + struct nfs_data *nfs_data = f->engine_data; + struct fio_libnfs_options *o = nfs_data->options; + int ret = 0; + if (nfs_data->nfsfh) { + ret = nfs_close(o->context, nfs_data->nfsfh); + } + free(nfs_data); + f->engine_data = NULL; + return ret; +} + +/* + * Hook for writing out outstanding data. + */ +static int fio_libnfs_commit(struct thread_data *td) { + nfs_event_loop(td, true); + return 0; +} + +struct ioengine_ops ioengine = { + .name = "nfs", + .version = FIO_IOOPS_VERSION, + .setup = fio_libnfs_setup, + .queue = fio_libnfs_queue, + .getevents = fio_libnfs_getevents, + .event = fio_libnfs_event, + .cleanup = fio_libnfs_cleanup, + .open_file = fio_libnfs_open, + .close_file = fio_libnfs_close, + .commit = fio_libnfs_commit, + .flags = FIO_DISKLESSIO | FIO_NOEXTEND | FIO_NODISKUTIL, + .options = options, + .option_struct_size = sizeof(struct fio_libnfs_options), +}; + +static void fio_init fio_nfs_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_nfs_unregister(void) +{ + unregister_ioengine(&ioengine); +} + diff --git a/examples/nfs.fio b/examples/nfs.fio new file mode 100644 index 0000000000..2449f4154d --- /dev/null +++ b/examples/nfs.fio @@ -0,0 +1,23 @@ +[global] +nfs_url=nfs://127.0.0.1/nfs +blocksize=524288 +iodepth=10 +ioengine=nfs +size=104857600 +lat_percentiles=1 +group_reporting +numjobs=10 +direct=1 +ramp_time=5s +filename_format=myfiles.$clientuid.$jobnum.$filenum +time_based=1 + +[write] +rw=write +runtime=10s +stonewall + +[read] +wait_for=write +rw=randread +runtime=10s \ No newline at end of file diff --git a/fio.1 b/fio.1 index f959e00d01..b12381b584 100644 --- a/fio.1 +++ b/fio.1 @@ -1882,6 +1882,12 @@ not be \fBcudamalloc\fR. This ioengine defines engine specific options. .B dfs I/O engine supporting asynchronous read and write operations to the DAOS File System (DFS) via libdfs. +.TP +.B nfs +I/O engine supporting asynchronous read and write operations to +NFS filesystems from userspace via libnfs. This is useful for +achieving higher concurrency and thus throughput than is possible +via kernel NFS. .SS "I/O engine specific parameters" In addition, there are some parameters which are only valid when a specific \fBioengine\fR is in use. 
These are used identically to normal parameters, @@ -2260,6 +2266,10 @@ Use DAOS container's chunk size by default. .BI (dfs)object_class Specificy a different object class for the dfs file. Use DAOS container's object class by default. +.TP +.BI (nfs)nfs_url +URL in libnfs format, eg nfs:///path[?arg=val[&arg=val]*] +Refer to the libnfs README for more details. .SS "I/O depth" .TP .BI iodepth \fR=\fPint diff --git a/optgroup.c b/optgroup.c index 15a16229ef..bebb4a5133 100644 --- a/optgroup.c +++ b/optgroup.c @@ -185,6 +185,10 @@ static const struct opt_group fio_opt_cat_groups[] = { .name = "DAOS File System (dfs) I/O engine", /* dfs */ .mask = FIO_OPT_G_DFS, }, + { + .name = "NFS I/O engine", /* nfs */ + .mask = FIO_OPT_G_NFS, + }, { .name = NULL, }, diff --git a/optgroup.h b/optgroup.h index ff74862968..1fb84a296b 100644 --- a/optgroup.h +++ b/optgroup.h @@ -70,6 +70,7 @@ enum opt_category_group { __FIO_OPT_G_NR, __FIO_OPT_G_LIBCUFILE, __FIO_OPT_G_DFS, + __FIO_OPT_G_NFS, FIO_OPT_G_RATE = (1ULL << __FIO_OPT_G_RATE), FIO_OPT_G_ZONE = (1ULL << __FIO_OPT_G_ZONE), @@ -110,6 +111,7 @@ enum opt_category_group { FIO_OPT_G_INVALID = (1ULL << __FIO_OPT_G_NR), FIO_OPT_G_ISCSI = (1ULL << __FIO_OPT_G_ISCSI), FIO_OPT_G_NBD = (1ULL << __FIO_OPT_G_NBD), + FIO_OPT_G_NFS = (1ULL << __FIO_OPT_G_NFS), FIO_OPT_G_IOURING = (1ULL << __FIO_OPT_G_IOURING), FIO_OPT_G_FILESTAT = (1ULL << __FIO_OPT_G_FILESTAT), FIO_OPT_G_LIBCUFILE = (1ULL << __FIO_OPT_G_LIBCUFILE), diff --git a/options.c b/options.c index ddabaa82d2..b82a10aa44 100644 --- a/options.c +++ b/options.c @@ -2025,6 +2025,11 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { { .ival = "dfs", .help = "DAOS File System (dfs) IO engine", }, +#endif +#ifdef CONFIG_NFS + { .ival = "nfs", + .help = "NFS IO engine", + }, #endif }, }, From ebcdccdeeec1673b8f7b12c4176d19982ddad7cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Stolarczuk?= Date: Mon, 11 Jan 2021 13:41:54 +0100 Subject: [PATCH 02/42] engines/libpmem: set file open/create mode always to RW MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit previously, when created file with a 'write' job it couldn't be open later on, when a 'read' job was ran. Signed-off-by: Łukasz Stolarczuk --- engines/libpmem.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/engines/libpmem.c b/engines/libpmem.c index 2338f0fa24..364e384de7 100644 --- a/engines/libpmem.c +++ b/engines/libpmem.c @@ -2,7 +2,7 @@ * libpmem: IO engine that uses PMDK libpmem to read and write data * * Copyright (C) 2017 Nippon Telegraph and Telephone Corporation. 
- * Copyright 2018-2020, Intel Corporation + * Copyright 2018-2021, Intel Corporation * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License, @@ -97,17 +97,10 @@ static int fio_libpmem_file(struct thread_data *td, struct fio_file *f, size_t length, off_t off) { struct fio_libpmem_data *fdd = FILE_ENG_DATA(f); - mode_t mode = 0; + mode_t mode = S_IWUSR | S_IRUSR; size_t mapped_len; int is_pmem; - if(td_rw(td)) - mode = S_IWUSR | S_IRUSR; - else if (td_write(td)) - mode = S_IWUSR; - else - mode = S_IRUSR; - dprint(FD_IO, "DEBUG fio_libpmem_file\n"); dprint(FD_IO, "f->file_name = %s td->o.verify = %d \n", f->file_name, td->o.verify); From 0e684e9d0e1605ce31977f697c97e0b78d393638 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Stolarczuk?= Date: Wed, 13 Jan 2021 17:43:03 +0100 Subject: [PATCH 03/42] engines/libpmem: cleanup a little code, comments and example MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Łukasz Stolarczuk --- engines/libpmem.c | 49 ++++++++++++++++++-------------------------- examples/libpmem.fio | 35 ++++++++++++++++--------------- 2 files changed, 38 insertions(+), 46 deletions(-) diff --git a/engines/libpmem.c b/engines/libpmem.c index 364e384de7..3502911257 100644 --- a/engines/libpmem.c +++ b/engines/libpmem.c @@ -18,7 +18,8 @@ /* * libpmem engine * - * IO engine that uses libpmem to write data (and memcpy to read) + * IO engine that uses libpmem (part of PMDK collection) to write data + * and libc's memcpy to read. It requires PMDK >= 1.5. * * To use: * ioengine=libpmem @@ -43,25 +44,13 @@ * mkdir /mnt/pmem0 * mount -o dax /dev/pmem0 /mnt/pmem0 * - * See examples/libpmem.fio for more. - * - * - * libpmem.so - * By default, the libpmem engine will let the system find the libpmem.so - * that it uses. You can use an alternative libpmem by setting the - * FIO_PMEM_LIB environment variable to the full path to the desired - * libpmem.so. This engine requires PMDK >= 1.5. + * See examples/libpmem.fio for complete usage example. */ #include -#include #include #include #include -#include -#include -#include -#include #include #include "../fio.h" @@ -77,8 +66,8 @@ static int fio_libpmem_init(struct thread_data *td) { struct thread_options *o = &td->o; - dprint(FD_IO,"o->rw_min_bs %llu \n o->fsync_blocks %u \n o->fdatasync_blocks %u \n", - o->rw_min_bs,o->fsync_blocks,o->fdatasync_blocks); + dprint(FD_IO, "o->rw_min_bs %llu\n o->fsync_blocks %u\n o->fdatasync_blocks %u\n", + o->rw_min_bs, o->fsync_blocks, o->fdatasync_blocks); dprint(FD_IO, "DEBUG fio_libpmem_init\n"); if ((o->rw_min_bs & page_mask) && @@ -91,7 +80,8 @@ static int fio_libpmem_init(struct thread_data *td) } /* - * This is the pmem_map_file execution function + * This is the pmem_map_file execution function, a helper to + * fio_libpmem_open_file function. 
*/ static int fio_libpmem_file(struct thread_data *td, struct fio_file *f, size_t length, off_t off) @@ -135,11 +125,11 @@ static int fio_libpmem_open_file(struct thread_data *td, struct fio_file *f) { struct fio_libpmem_data *fdd; - dprint(FD_IO,"DEBUG fio_libpmem_open_file\n"); - dprint(FD_IO,"f->io_size=%ld \n",f->io_size); - dprint(FD_IO,"td->o.size=%lld \n",td->o.size); - dprint(FD_IO,"td->o.iodepth=%d\n",td->o.iodepth); - dprint(FD_IO,"td->o.iodepth_batch=%d \n",td->o.iodepth_batch); + dprint(FD_IO, "DEBUG fio_libpmem_open_file\n"); + dprint(FD_IO, "f->io_size=%ld\n", f->io_size); + dprint(FD_IO, "td->o.size=%lld\n", td->o.size); + dprint(FD_IO, "td->o.iodepth=%d\n", td->o.iodepth); + dprint(FD_IO, "td->o.iodepth_batch=%d\n", td->o.iodepth_batch); if (fio_file_open(f)) td_io_close_file(td, f); @@ -160,8 +150,8 @@ static int fio_libpmem_prep(struct thread_data *td, struct io_u *io_u) struct fio_file *f = io_u->file; struct fio_libpmem_data *fdd = FILE_ENG_DATA(f); - dprint(FD_IO, "DEBUG fio_libpmem_prep\n" ); - dprint(FD_IO," io_u->offset %llu : fdd->libpmem_off %ld : " + dprint(FD_IO, "DEBUG fio_libpmem_prep\n"); + dprint(FD_IO, "io_u->offset %llu : fdd->libpmem_off %ld : " "io_u->buflen %llu : fdd->libpmem_sz %ld\n", io_u->offset, fdd->libpmem_off, io_u->buflen, fdd->libpmem_sz); @@ -185,8 +175,9 @@ static enum fio_q_status fio_libpmem_queue(struct thread_data *td, io_u->error = 0; dprint(FD_IO, "DEBUG fio_libpmem_queue\n"); - dprint(FD_IO,"td->o.odirect %d td->o.sync_io %d \n",td->o.odirect, td->o.sync_io); - /* map both O_SYNC / DSYNC to not using NODRAIN */ + dprint(FD_IO, "td->o.odirect %d td->o.sync_io %d\n", + td->o.odirect, td->o.sync_io); + /* map both O_SYNC / DSYNC to not use NODRAIN */ flags = td->o.sync_io ? 0 : PMEM_F_MEM_NODRAIN; flags |= td->o.odirect ? PMEM_F_MEM_NONTEMPORAL : PMEM_F_MEM_TEMPORAL; @@ -196,7 +187,7 @@ static enum fio_q_status fio_libpmem_queue(struct thread_data *td, break; case DDIR_WRITE: dprint(FD_IO, "DEBUG mmap_data=%p, xfer_buf=%p\n", - io_u->mmap_data, io_u->xfer_buf ); + io_u->mmap_data, io_u->xfer_buf); pmem_memcpy(io_u->mmap_data, io_u->xfer_buf, io_u->xfer_buflen, @@ -220,8 +211,8 @@ static int fio_libpmem_close_file(struct thread_data *td, struct fio_file *f) struct fio_libpmem_data *fdd = FILE_ENG_DATA(f); int ret = 0; - dprint(FD_IO,"DEBUG fio_libpmem_close_file\n"); - dprint(FD_IO,"td->o.odirect %d \n",td->o.odirect); + dprint(FD_IO, "DEBUG fio_libpmem_close_file\n"); + dprint(FD_IO, "td->o.odirect %d\n", td->o.odirect); if (!td->o.odirect) { dprint(FD_IO,"pmem_drain\n"); diff --git a/examples/libpmem.fio b/examples/libpmem.fio index 0ff681f071..3b854a32bf 100644 --- a/examples/libpmem.fio +++ b/examples/libpmem.fio @@ -1,6 +1,6 @@ [global] bs=4k -size=8g +size=10g ioengine=libpmem norandommap time_based @@ -17,16 +17,6 @@ thread numjobs=1 runtime=300 -# -# In case of 'scramble_buffers=1', the source buffer -# is rewritten with a random value every write operations. -# -# But when 'scramble_buffers=0' is set, the source buffer isn't -# rewritten. So it will be likely that the source buffer is in CPU -# cache and it seems to be high performance. -# -scramble_buffers=0 - # # depends on direct option, flags are set for pmem_memcpy() call: # direct=1 - PMEM_F_MEM_NONTEMPORAL, @@ -39,9 +29,19 @@ direct=1 # sync=1 +# +# In case of 'scramble_buffers=1', the source buffer +# is rewritten with a random value every write operation. +# +# But when 'scramble_buffers=0' is set, the source buffer isn't +# rewritten. 
So it will be likely that the source buffer is in CPU +# cache and it seems to be high write performance. +# +scramble_buffers=1 # -# Setting for fio process's CPU Node and Memory Node +# Setting for fio process's CPU Node and Memory Node. +# Set proper node below or use `numactl` command along with FIO. # numa_cpu_nodes=0 numa_mem_policy=bind:0 @@ -53,21 +53,22 @@ cpus_allowed_policy=split # # The libpmem engine does IO to files in a DAX-mounted filesystem. -# The filesystem should be created on an NVDIMM (e.g /dev/pmem0) +# The filesystem should be created on a Non-Volatile DIMM (e.g /dev/pmem0) # and then mounted with the '-o dax' option. Note that the engine # accesses the underlying NVDIMM directly, bypassing the kernel block # layer, so the usual filesystem/disk performance monitoring tools such # as iostat will not provide useful data. # -directory=/mnt/pmem0 +#filename=/mnt/pmem/somefile +directory=/mnt/pmem [libpmem-seqwrite] rw=write stonewall -#[libpmem-seqread] -#rw=read -#stonewall +[libpmem-seqread] +rw=read +stonewall #[libpmem-randwrite] #rw=randwrite From 94c0b971d5e535e6b991899a57f88b6512412e58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Stolarczuk?= Date: Thu, 14 Jan 2021 18:19:43 +0100 Subject: [PATCH 04/42] engines/libpmem: do not call drain on close MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit no matter if direct was 1 or 0, it's not necessary. It's either covered by non-temporal stores or it's not desired by user (if 0 was set). Signed-off-by: Łukasz Stolarczuk --- engines/libpmem.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/engines/libpmem.c b/engines/libpmem.c index 3502911257..ab29a45383 100644 --- a/engines/libpmem.c +++ b/engines/libpmem.c @@ -212,12 +212,6 @@ static int fio_libpmem_close_file(struct thread_data *td, struct fio_file *f) int ret = 0; dprint(FD_IO, "DEBUG fio_libpmem_close_file\n"); - dprint(FD_IO, "td->o.odirect %d\n", td->o.odirect); - - if (!td->o.odirect) { - dprint(FD_IO,"pmem_drain\n"); - pmem_drain(); - } if (fdd->libpmem_ptr) ret = pmem_unmap(fdd->libpmem_ptr, fdd->libpmem_sz); From 165b8a70f919eb8858a9109f5d0db6548df2822c Mon Sep 17 00:00:00 2001 From: Taras Glek Date: Tue, 20 Apr 2021 11:02:18 -0700 Subject: [PATCH 05/42] NFS configure fixes --- configure | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/configure b/configure index a9f0c033e4..fd675d930b 100755 --- a/configure +++ b/configure @@ -170,9 +170,9 @@ disable_native="no" march_set="no" libiscsi="no" libnbd="no" +libnfs="no" libzbc="" dfs="" -libnfs="no" dynamic_engines="no" prefix=/usr/local @@ -242,7 +242,7 @@ for opt do ;; --disable-tcmalloc) disable_tcmalloc="yes" ;; - --enable-libnfs) libnfs="yes" + --disable-nfs) disable_nfs="yes" ;; --dynamic-libengines) dynamic_engines="yes" ;; @@ -274,6 +274,7 @@ if test "$show_help" = "yes" ; then echo "--disable-rados Disable Rados support even if found" echo "--disable-rbd Disable Rados Block Device even if found" echo "--disable-http Disable HTTP support even if found" + echo "--disable-nfs Disable userspace NFS support even if found" echo "--disable-gfapi Disable gfapi" echo "--enable-libhdfs Enable hdfs support" echo "--enable-libnfs Enable nfs support" @@ -2280,21 +2281,21 @@ EOF fi fi print_config "DAOS File System (dfs) Engine" "$dfs" -# Check if we have libnfs (for nfs support). -if test "$libnfs" = "yes" ; then + +########################################## +# Check if we have libnfs (for userspace nfs support). 
+if test "$disable_nfs" != "yes"; then if $(pkg-config libnfs); then libnfs="yes" libnfs_cflags=$(pkg-config --cflags libnfs) - # libnfs_libs=$(pkg-config --libs libnfs) - libnfs_libs=/usr/local/lib/libnfs.a + libnfs_libs=$(pkg-config --libs libnfs) else if test "$libnfs" = "yes" ; then echo "libnfs" "Install libnfs" fi - libnfs="no" fi fi -print_config "nfs engine" "$libnfs" +print_config "NFS engine" "$libnfs" ########################################## # Check if we have lex/yacc available From 4662c206a07e408b1970a577fda107e4f9397a68 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 22 Apr 2021 11:17:58 +0200 Subject: [PATCH 06/42] init: zonemode=zbd does not work with create_serialize=0 zbd_init_zone_info() has a comment that it only works correctly if it called before the first fio fork() call. However, right now, there is nothing that ensures this. If the user specifies --create_serialize=0 and --numjobs=2, each thread will get their own version of zbd_info. zbd_info contains one mutex per zone, so if the threads get different zbd_info, two threads can manage to lock the same zone at the same time, which will lead to I/O errors. Explicitly disallow --zonemode=zbd together with --create_serialize=0, so that we know that all threads will use the same zbd_info, instead of silently misbehaving. Analysis: setup_files() calls zbd_init_files() which calls zbd_init_zone_info(). zbd_init_zone_info() does a for_each_td(), where it checks if zbd_info (for the same filename) has already been allocated by another thread. This only works if create_serialize=1 (default). If create_serialize=0, zbd_init_zone_info() will get called in parallel, and in this case when the second thread checks if any other thread has allocated zbd_info, the check will fail, since the first thread has not yet been running long enough to allocate zbd_info. Signed-off-by: Niklas Cassel --- init.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/init.c b/init.c index 37bff8763c..60c7cff405 100644 --- a/init.c +++ b/init.c @@ -633,6 +633,11 @@ static int fixup_options(struct thread_data *td) ret |= 1; } + if (o->zone_mode == ZONE_MODE_ZBD && !o->create_serialize) { + log_err("fio: --zonemode=zbd and --create_serialize=0 are not compatible.\n"); + ret |= 1; + } + if (o->zone_mode == ZONE_MODE_STRIDED && !o->zone_size) { log_err("fio: --zonesize must be specified when using --zonemode=strided.\n"); ret |= 1; From 6a2299789dccdd24351744476586e7d562a3940d Mon Sep 17 00:00:00 2001 From: Oksana Salyk Date: Fri, 23 Apr 2021 08:09:44 +0200 Subject: [PATCH 07/42] rpma: gpspm: introduce the busy_wait_polling toggle The performance of the librpma_gpspm engine depends heavily on how much CPU power it can use to its work. One can want either to take all available CPU power and see what the maximum possible performance is or configure it less aggressively and collect the results when the CPU is not solely dedicated to doing this one task. The librpma_gpspm engine allows toggling between one and another by either waiting for incoming requests in the kernel using rpma_conn_completion_wait() (busy_wait_polling=0) or trying to collect the completion as soon as it appears by polling all the time using rpma_conn_completion_get() (busy_wait_polling=1). 
Signed-off-by: Oksana Salyk --- HOWTO | 5 +++++ engines/librpma_fio.c | 11 +++++++++++ engines/librpma_fio.h | 2 ++ engines/librpma_gpspm.c | 25 +++++++++++++++++++++++-- examples/librpma_gpspm-server.fio | 2 ++ fio.1 | 4 ++++ 6 files changed, 47 insertions(+), 2 deletions(-) diff --git a/HOWTO b/HOWTO index e6078c5f1e..889526d921 100644 --- a/HOWTO +++ b/HOWTO @@ -2237,6 +2237,11 @@ with the caveat that when used on the command line, they must come after the Set to 1 only when Direct Write to PMem from the remote host is possible. Otherwise, set to 0. +.. option:: busy_wait_polling=bool : [librpma_*_server] + + Set to 0 to wait for completion instead of busy-wait polling completion. + Default: 1. + .. option:: interface=str : [netsplice] [net] The IP address of the network interface used to send or receive UDP diff --git a/engines/librpma_fio.c b/engines/librpma_fio.c index 810b55e23d..3d605ed6c3 100644 --- a/engines/librpma_fio.c +++ b/engines/librpma_fio.c @@ -49,6 +49,17 @@ struct fio_option librpma_fio_options[] = { .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_LIBRPMA, }, + { + .name = "busy_wait_polling", + .lname = "Set to 0 to wait for completion instead of busy-wait polling completion.", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct librpma_fio_options_values, + busy_wait_polling), + .help = "Set to false if you want to reduce CPU usage", + .def = "1", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_LIBRPMA, + }, { .name = NULL, }, diff --git a/engines/librpma_fio.h b/engines/librpma_fio.h index 8cfb2e2d1a..fb89d99d69 100644 --- a/engines/librpma_fio.h +++ b/engines/librpma_fio.h @@ -41,6 +41,8 @@ struct librpma_fio_options_values { char *port; /* Direct Write to PMem is possible */ unsigned int direct_write_to_pmem; + /* Set to 0 to wait for completion instead of busy-wait polling completion. 
*/ + unsigned int busy_wait_polling; }; extern struct fio_option librpma_fio_options[]; diff --git a/engines/librpma_gpspm.c b/engines/librpma_gpspm.c index ac614f462a..7414770971 100644 --- a/engines/librpma_gpspm.c +++ b/engines/librpma_gpspm.c @@ -683,12 +683,33 @@ static int server_cmpl_process(struct thread_data *td) struct librpma_fio_server_data *csd = td->io_ops_data; struct server_data *sd = csd->server_data; struct rpma_completion *cmpl = &sd->msgs_queued[sd->msg_queued_nr]; + struct librpma_fio_options_values *o = td->eo; int ret; ret = rpma_conn_completion_get(csd->conn, cmpl); if (ret == RPMA_E_NO_COMPLETION) { - /* lack of completion is not an error */ - return 0; + if (o->busy_wait_polling == 0) { + ret = rpma_conn_completion_wait(csd->conn); + if (ret == RPMA_E_NO_COMPLETION) { + /* lack of completion is not an error */ + return 0; + } else if (ret != 0) { + librpma_td_verror(td, ret, "rpma_conn_completion_wait"); + goto err_terminate; + } + + ret = rpma_conn_completion_get(csd->conn, cmpl); + if (ret == RPMA_E_NO_COMPLETION) { + /* lack of completion is not an error */ + return 0; + } else if (ret != 0) { + librpma_td_verror(td, ret, "rpma_conn_completion_get"); + goto err_terminate; + } + } else { + /* lack of completion is not an error */ + return 0; + } } else if (ret != 0) { librpma_td_verror(td, ret, "rpma_conn_completion_get"); goto err_terminate; diff --git a/examples/librpma_gpspm-server.fio b/examples/librpma_gpspm-server.fio index d618f2db21..67e92a28ad 100644 --- a/examples/librpma_gpspm-server.fio +++ b/examples/librpma_gpspm-server.fio @@ -20,6 +20,8 @@ thread # set to 1 (true) ONLY when Direct Write to PMem from the remote host is possible # (https://pmem.io/rpma/documentation/basic-direct-write-to-pmem.html) direct_write_to_pmem=0 +# set to 0 (false) to wait for completion instead of busy-wait polling completion. +busy_wait_polling=1 numjobs=1 # number of expected incomming connections iodepth=2 # number of parallel GPSPM requests size=100MiB # size of workspace for a single connection diff --git a/fio.1 b/fio.1 index 18dc156ad0..c3916168f2 100644 --- a/fio.1 +++ b/fio.1 @@ -1999,6 +1999,10 @@ The IP address to be used for RDMA-CM based I/O. .BI (librpma_*_server)direct_write_to_pmem \fR=\fPbool Set to 1 only when Direct Write to PMem from the remote host is possible. Otherwise, set to 0. .TP +.BI (librpma_*_server)busy_wait_polling \fR=\fPbool +Set to 0 to wait for completion instead of busy-wait polling completion. +Default: 1. +.TP .BI (netsplice,net)interface \fR=\fPstr The IP address of the network interface used to send or receive UDP multicast. From e9d2a04d1278ce02140a8b8da4d5aede7a6ad39d Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Mon, 26 Apr 2021 00:10:40 +0900 Subject: [PATCH 08/42] gettime: Fix compilation on non-Linux with pthread_getaffinity_np() 874d55e50c("os/os-linux: add pthread CPU affinity helper") and a few commits after that broke compilation on non-Linux platforms which support pthread_getaffinity_np(). Define fio_get_thread_affinity() on non-Linux platforms, and make gettime test FIO_HAVE_GET_THREAD_AFFINITY which may or may not depend on pthread. FIO_HAVE_GET_THREAD_AFFINITY is currently not defined on Windows. 
Signed-off-by: Tomohiro Kusumi --- gettime.c | 2 +- os/os-aix.h | 6 ++++++ os/os-android.h | 6 ++++++ os/os-dragonfly.h | 6 ++++++ os/os-freebsd.h | 6 ++++++ os/os-hpux.h | 7 +++++++ os/os-linux.h | 3 +++ os/os-mac.h | 6 ++++++ os/os-netbsd.h | 6 ++++++ os/os-openbsd.h | 6 ++++++ os/os-solaris.h | 6 ++++++ 11 files changed, 59 insertions(+), 1 deletion(-) diff --git a/gettime.c b/gettime.c index e3f483a700..099e9d9f6c 100644 --- a/gettime.c +++ b/gettime.c @@ -679,7 +679,7 @@ int fio_monotonic_clocktest(int debug) unsigned int i; os_cpu_mask_t mask; -#ifdef CONFIG_PTHREAD_GETAFFINITY +#ifdef FIO_HAVE_GET_THREAD_AFFINITY fio_get_thread_affinity(mask); #else memset(&mask, 0, sizeof(mask)); diff --git a/os/os-aix.h b/os/os-aix.h index 1aab96e08d..db99eef4ce 100644 --- a/os/os-aix.h +++ b/os/os-aix.h @@ -18,6 +18,12 @@ #define FIO_USE_GENERIC_SWAP +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY +#define fio_get_thread_affinity(mask) \ + pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif + static inline int blockdev_invalidate_cache(struct fio_file *f) { return ENOTSUP; diff --git a/os/os-android.h b/os/os-android.h index 3c05077624..3f1aa9d30a 100644 --- a/os/os-android.h +++ b/os/os-android.h @@ -58,6 +58,12 @@ #define MAP_HUGETLB 0x40000 /* arch specific */ #endif +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY +#define fio_get_thread_affinity(mask) \ + pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif + #ifndef CONFIG_NO_SHM /* * Bionic doesn't support SysV shared memeory, so implement it using ashmem diff --git a/os/os-dragonfly.h b/os/os-dragonfly.h index 44bfcd5d06..6e46589450 100644 --- a/os/os-dragonfly.h +++ b/os/os-dragonfly.h @@ -92,6 +92,12 @@ typedef cpumask_t os_cpu_mask_t; /* No CPU_COUNT(), but use the default function defined in os/os.h */ #define fio_cpu_count(mask) CPU_COUNT((mask)) +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY +#define fio_get_thread_affinity(mask) \ + pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif + static inline int fio_cpuset_init(os_cpu_mask_t *mask) { CPUMASK_ASSZERO(*mask); diff --git a/os/os-freebsd.h b/os/os-freebsd.h index b3addf981f..1b24fa022a 100644 --- a/os/os-freebsd.h +++ b/os/os-freebsd.h @@ -37,6 +37,12 @@ typedef cpuset_t os_cpu_mask_t; #define fio_cpu_isset(mask, cpu) (CPU_ISSET((cpu), (mask)) != 0) #define fio_cpu_count(mask) CPU_COUNT((mask)) +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY +#define fio_get_thread_affinity(mask) \ + pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif + static inline int fio_cpuset_init(os_cpu_mask_t *mask) { CPU_ZERO(mask); diff --git a/os/os-hpux.h b/os/os-hpux.h index c1dafe42ee..a80cb2bc47 100644 --- a/os/os-hpux.h +++ b/os/os-hpux.h @@ -38,6 +38,13 @@ #define FIO_USE_GENERIC_SWAP #define FIO_OS_HAVE_AIOCB_TYPEDEF + +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY +#define fio_get_thread_affinity(mask) \ + pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif + typedef struct aiocb64 os_aiocb_t; static inline int blockdev_invalidate_cache(struct fio_file *f) diff --git a/os/os-linux.h b/os/os-linux.h index ea8d79221c..f7137abe1b 100644 --- a/os/os-linux.h +++ b/os/os-linux.h @@ -74,8 +74,11 @@ typedef cpu_set_t os_cpu_mask_t; sched_getaffinity((pid), (ptr)) #endif +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY #define fio_get_thread_affinity(mask) \ 
pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif #define fio_cpu_clear(mask, cpu) (void) CPU_CLR((cpu), (mask)) #define fio_cpu_set(mask, cpu) (void) CPU_SET((cpu), (mask)) diff --git a/os/os-mac.h b/os/os-mac.h index 683aab3220..ec2cc1e555 100644 --- a/os/os-mac.h +++ b/os/os-mac.h @@ -27,6 +27,12 @@ #define fio_swap32(x) OSSwapInt32(x) #define fio_swap64(x) OSSwapInt64(x) +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY +#define fio_get_thread_affinity(mask) \ + pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif + #ifndef CONFIG_CLOCKID_T typedef unsigned int clockid_t; #endif diff --git a/os/os-netbsd.h b/os/os-netbsd.h index abc1d3cb70..624c7fa509 100644 --- a/os/os-netbsd.h +++ b/os/os-netbsd.h @@ -35,6 +35,12 @@ #define fio_swap32(x) bswap32(x) #define fio_swap64(x) bswap64(x) +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY +#define fio_get_thread_affinity(mask) \ + pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif + static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) { struct disklabel dl; diff --git a/os/os-openbsd.h b/os/os-openbsd.h index 994bf078c9..f1bad67165 100644 --- a/os/os-openbsd.h +++ b/os/os-openbsd.h @@ -35,6 +35,12 @@ #define fio_swap32(x) swap32(x) #define fio_swap64(x) swap64(x) +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY +#define fio_get_thread_affinity(mask) \ + pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif + static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) { struct disklabel dl; diff --git a/os/os-solaris.h b/os/os-solaris.h index f1966f449d..ea1f081c89 100644 --- a/os/os-solaris.h +++ b/os/os-solaris.h @@ -46,6 +46,12 @@ struct solaris_rand_seed { #define os_ctime_r(x, y, z) ctime_r((x), (y), (z)) #define FIO_OS_HAS_CTIME_R +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY +#define fio_get_thread_affinity(mask) \ + pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif + typedef psetid_t os_cpu_mask_t; static inline int chardev_size(struct fio_file *f, unsigned long long *bytes) From 3277b7e48e9d3600d4a33a652e8c2a20e59f2f37 Mon Sep 17 00:00:00 2001 From: Rebecca Cran Date: Wed, 21 Apr 2021 20:32:25 -0600 Subject: [PATCH 09/42] The GPL isn't a EULA: remove it and introduce WixUI_Minimal_NoEULA The GPL shouldn't be used as a EULA in an installer. Remove it, and since the WixUI_Minimal dialog set requires a EULA create a custom WixUI_Minimal_NoEULA set. 
Signed-off-by: Rebecca Cran Signed-off-by: Jens Axboe --- os/windows/WixUI_Minimal_NoEULA.wxs | 96 ++++++++++++++++++++++++++++ os/windows/WixUI_fio.wxl | 12 ++++ os/windows/dobuild.cmd | 5 +- os/windows/eula.rtf | Bin 1075 -> 0 bytes os/windows/install.wxs | 2 +- 5 files changed, 113 insertions(+), 2 deletions(-) create mode 100755 os/windows/WixUI_Minimal_NoEULA.wxs create mode 100755 os/windows/WixUI_fio.wxl delete mode 100755 os/windows/eula.rtf diff --git a/os/windows/WixUI_Minimal_NoEULA.wxs b/os/windows/WixUI_Minimal_NoEULA.wxs new file mode 100755 index 0000000000..48391186eb --- /dev/null +++ b/os/windows/WixUI_Minimal_NoEULA.wxs @@ -0,0 +1,96 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1 + + + + + + NOT Installed OR NOT PATCH + Installed AND PATCH + + + Installed AND PATCH + NOT Installed OR NOT PATCH + + + + + 1 + + 1 + + 1 + + 1 + 1 + 1 + + 1 + Installed AND PATCH + + + 0 + NOT Installed + + + + + + + + \ No newline at end of file diff --git a/os/windows/WixUI_fio.wxl b/os/windows/WixUI_fio.wxl new file mode 100755 index 0000000000..11ec736a55 --- /dev/null +++ b/os/windows/WixUI_fio.wxl @@ -0,0 +1,12 @@ + + + + + + + + + +The Setup Wizard will install [ProductName] on your computer. Click Install to continue or Cancel to exit the Setup Wizard. + + \ No newline at end of file diff --git a/os/windows/dobuild.cmd b/os/windows/dobuild.cmd index 08df3e876d..7b9cb1ddad 100644 --- a/os/windows/dobuild.cmd +++ b/os/windows/dobuild.cmd @@ -44,7 +44,10 @@ if exist ..\..\fio.pdb ( @if ERRORLEVEL 1 goto end "%WIX%bin\candle" -nologo -arch %FIO_ARCH% examples.wxs @if ERRORLEVEL 1 goto end -"%WIX%bin\light" -nologo -sice:ICE61 install.wixobj examples.wixobj -ext WixUIExtension -out %FIO_VERSION%-%FIO_ARCH%.msi +"%WIX%bin\candle" -nologo -arch %FIO_ARCH% WixUI_Minimal_NoEULA.wxs +@if ERRORLEVEL 1 goto end + +"%WIX%bin\light" -nologo -sice:ICE61 install.wixobj examples.wixobj WixUI_Minimal_NoEULA.wixobj -loc WixUI_fio.wxl -ext WixUIExtension -out %FIO_VERSION%-%FIO_ARCH%.msi :end if defined SIGN_FIO ( diff --git a/os/windows/eula.rtf b/os/windows/eula.rtf deleted file mode 100755 index a931017cd644cf9f1723705abb785fad694e113c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1075 zcmaJ=O>g2b5Z!a7{)aiXD+*2u^n(*yl@curD1a&?UraJdtTlFIJ5Yr9?>kA0s=bhd zzw&10z31_fTb~77XeXN?Df1xgg*?S9V_cg`CT3d-f8?3b-ft8w7|+~ZI>)o55Z0k* zFh;dQFB361ovyErJTb~xLc@StOc~EDqGo)fDhzwQL4*0pbJS>sHx^=<6qtq#0@h~j z;E8pE6W>YP|CKuJ@+t}7`ihsrrrdu#gp9_Z+pVWC=r_~jV?EvXSv_^{8%U2eklsTB z>316lMc0srrv~!aCrG1+wo2&y2}aQj?qHnudiNAd9X4sJlwq8 zLdXJk7cvTNetwS2LOLj|$*tG|Ei+ON&SZWsEIz}*RFDV_og15#x1@mg7WC9bJpmqp?>eeTY+=R`cfsg9yi?8Z1 zGL%(#X>=7w2t-=o(p_sYXPyj5xypD!IX8-EwMa+5=7`6*S;kETVV=PH+%|H zlAwa}lEl4K3N{Fp!>m#*ATQuOT8?L{B@Cx)cpt{`aJpO%2 z_!cWG+WF&nJ%5cS(eyVI-k0vV)7kI$?Oa!FWAhI6M%@>#1E&)O ctJFH>j~9`=Qj8}VJJEN38312;>EZtP2jP%zcmMzZ diff --git a/os/windows/install.wxs b/os/windows/install.wxs index f73ec5e251..7773bb3b86 100755 --- a/os/windows/install.wxs +++ b/os/windows/install.wxs @@ -107,7 +107,7 @@ - + From 6308ef297145e73add65ba86bfdbeaf967957d1f Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 27 Apr 2021 17:41:14 +0000 Subject: [PATCH 10/42] ioengines: don't call zbd_put_io_u() for engines not implementing commit Commit d9ed3e63e528 ("zbd: Fix zone locking for async I/O engines") added a call to zbd_put_io_u() in the case where td->io_ops->commit callback is not implemented on an ioengine. 
The commit in question fails
to mention why this zbd_put_io_u() call was added for ioengines not
implementing the commit callback.

The code in td_io_queue() looks like this:

	ret = td->io_ops->queue(td, io_u);
	zbd_queue_io_u(td, io_u, ret);

	if (!td->io_ops->commit) {
		io_u_mark_submit(td, 1);
		io_u_mark_complete(td, 1);
		zbd_put_io_u(td, io_u);
	}

SYNC I/O engine case (e.g. psync):
The zone will be locked by zbd_adjust_block(),
td->io_ops->queue(td, io_u), which for a sync I/O engine will return
FIO_Q_COMPLETED. This return value will be sent in to zbd_queue_io_u(),
which, at the end of the function, unlocks the zone if the return value
from ->queue() differs from FIO_Q_QUEUED.
For a sync I/O engine, the zone will be unlocked here, and the
io_u->zbd_put_io function pointer will be set to NULL.
psync does not implement the ->commit() callback, so it will call
zbd_put_io_u(), which will do nothing, because the io_u->zbd_put_io
pointer is NULL.

ASYNC I/O engine case (e.g. io_uring):
The zone will be locked by zbd_adjust_block(),
td->io_ops->queue(td, io_u), which for an async I/O engine will return
FIO_Q_QUEUED. This return value will be sent in to zbd_queue_io_u(),
which, at the end of the function, unlocks the zone if the return value
from ->queue() differs from FIO_Q_QUEUED.
For an async I/O engine, the zone will not be unlocked here, so the
io_u->zbd_put_io function pointer will still be set.
io_uring does implement the ->commit() callback, so it will not call
zbd_put_io_u() here at all. Instead zbd_put_io_u() will be called by
do_io() -> wait_for_completions() -> io_u_queued_complete() ->
ios_completed() -> put_io_u() -> zbd_put_io_u(), which will unlock the
zone and set the io_u->zbd_put_io function pointer to NULL.

In conclusion, the zbd_put_io_u() call should never have been added in
the case where the ->commit() callback wasn't implemented in the first
place, and removing it shouldn't affect the psync or io_uring ioengines.

Commit d9ed3e63e528 ("zbd: Fix zone locking for async I/O engines")
probably made the assumption that an async I/O engine == the ->commit()
callback is implemented. However, this is not true: there are async I/O
engines in tree (and out of tree) that do not implement the ->commit()
callback. Instead, an async I/O engine is recognized by the ->queue()
callback returning FIO_Q_QUEUED.

Removing the invalid zbd_put_io_u() call will ensure that a zone is not
prematurely unlocked for async I/O engines that do not implement the
->commit() callback. Unlocking a zone prematurely leads to I/O errors.
Fixes: d9ed3e63e528 ("zbd: Fix zone locking for async I/O engines") Signed-off-by: Niklas Cassel Signed-off-by: Jens Axboe --- ioengines.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ioengines.c b/ioengines.c index f88b0537f1..3561bb4e6e 100644 --- a/ioengines.c +++ b/ioengines.c @@ -414,7 +414,6 @@ enum fio_q_status td_io_queue(struct thread_data *td, struct io_u *io_u) if (!td->io_ops->commit) { io_u_mark_submit(td, 1); io_u_mark_complete(td, 1); - zbd_put_io_u(td, io_u); } if (ret == FIO_Q_COMPLETED) { From 1fb2bc2f73579bf4b9eb92c54a8479ccc204720c Mon Sep 17 00:00:00 2001 From: Taras Glek Date: Wed, 5 May 2021 09:00:13 -0700 Subject: [PATCH 11/42] C-style comments --- engines/nfs.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/engines/nfs.c b/engines/nfs.c index df09477600..70bfd24e27 100644 --- a/engines/nfs.c +++ b/engines/nfs.c @@ -1,4 +1,3 @@ -// https://github.com/axboe/fio/pull/762 sample pull req for new engine #include #include #include @@ -17,13 +16,13 @@ enum nfs_op_type { struct fio_libnfs_options { struct nfs_context *context; char *nfs_url; - // the following implements a circular queue of outstanding IOs - int outstanding_events; // IOs issued to libnfs, that have not returned yet - int prev_requested_event_index; // event last returned via fio_libnfs_event - int next_buffered_event; // round robin-pointer within events[] - int buffered_event_count; // IOs completed by libnfs faiting for FIO - int free_event_buffer_index; // next empty buffer - unsigned int queue_depth; // nfs_callback needs this info, but doesn't have fio td structure to pull it from + unsigned int queue_depth; /* nfs_callback needs this info, but doesn't have fio td structure to pull it from */ + /* the following implement a circular queue of outstanding IOs */ + int outstanding_events; /* IOs issued to libnfs, that have not returned yet */ + int prev_requested_event_index; /* event last returned via fio_libnfs_event */ + int next_buffered_event; /* round robin-pointer within events[] */ + int buffered_event_count; /* IOs completed by libnfs, waiting for FIO */ + int free_event_buffer_index; /* next free buffer */ struct io_u**events; }; @@ -60,11 +59,11 @@ static struct io_u *fio_libnfs_event(struct thread_data *td, int event) assert(o->events[o->next_buffered_event]); o->events[o->next_buffered_event] = NULL; o->next_buffered_event = (o->next_buffered_event + 1) % td->o.iodepth; - // validate our state machine + /* validate our state machine */ assert(o->buffered_event_count); o->buffered_event_count--; assert(io_u); - // assert that fio_libnfs_event is being called in sequential fashion + /* assert that fio_libnfs_event is being called in sequential fashion */ assert(event == 0 || o->prev_requested_event_index + 1 == event); if (o->buffered_event_count == 0) { o->prev_requested_event_index = -1; @@ -77,11 +76,11 @@ static struct io_u *fio_libnfs_event(struct thread_data *td, int event) static int nfs_event_loop(struct thread_data *td, bool flush) { struct fio_libnfs_options *o = td->eo; struct pollfd pfds[1]; /* nfs:0 */ - // we already have stuff queued for fio, no need to waste cpu on poll() + /* we already have stuff queued for fio, no need to waste cpu on poll() */ if (o->buffered_event_count) { return o->buffered_event_count; } - // fio core logic seems to stop calling this event-loop if we ever return with 0 events + /* fio core logic seems to stop calling this event-loop if we ever return with 0 events */ #define SHOULD_WAIT() 
(o->outstanding_events == td->o.iodepth || (flush && o->outstanding_events)) do { @@ -130,7 +129,7 @@ static void nfs_callback(int res, struct nfs_context *nfs, void *data, if (res < 0) { log_err("Failed NFS operation(code:%d): %s\n", res, nfs_get_error(o->context)); io_u->error = -res; - // res is used for read math below, don't wanna pass negative there + /* res is used for read math below, don't wanna pass negative there */ res = 0; } else if (io_u->ddir == DDIR_READ) { memcpy(io_u->buf, data, res); @@ -138,7 +137,7 @@ static void nfs_callback(int res, struct nfs_context *nfs, void *data, log_err("Got NFS EOF, this is probably not expected\n"); } } - // fio uses resid to track remaining data + /* fio uses resid to track remaining data */ io_u->resid = io_u->xfer_buflen - res; assert(!o->events[o->free_event_buffer_index]); @@ -248,7 +247,7 @@ static int do_mount(struct thread_data *td, const char *url) */ static int fio_libnfs_setup(struct thread_data *td) { - // flipping this makes using gdb easier, but tends to hang fio on exit + /* Using threads with libnfs causes fio to hang on exit, lower performance */ td->o.use_thread = 0; return 0; } From 7654a8d5e4d20e88556e427d6cc5944bcf042e9b Mon Sep 17 00:00:00 2001 From: Taras Glek Date: Wed, 5 May 2021 09:02:20 -0700 Subject: [PATCH 12/42] single line bodies --- engines/nfs.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/engines/nfs.c b/engines/nfs.c index 70bfd24e27..6d4ad7b1f2 100644 --- a/engines/nfs.c +++ b/engines/nfs.c @@ -77,9 +77,8 @@ static int nfs_event_loop(struct thread_data *td, bool flush) { struct fio_libnfs_options *o = td->eo; struct pollfd pfds[1]; /* nfs:0 */ /* we already have stuff queued for fio, no need to waste cpu on poll() */ - if (o->buffered_event_count) { + if (o->buffered_event_count) return o->buffered_event_count; - } /* fio core logic seems to stop calling this event-loop if we ever return with 0 events */ #define SHOULD_WAIT() (o->outstanding_events == td->o.iodepth || (flush && o->outstanding_events)) @@ -105,8 +104,8 @@ static int nfs_event_loop(struct thread_data *td, bool flush) { } } while (SHOULD_WAIT()); return o->buffered_event_count; -} #undef SHOULD_WAIT +} /* * The ->getevents() hook is used to reap completion events from an async @@ -133,9 +132,8 @@ static void nfs_callback(int res, struct nfs_context *nfs, void *data, res = 0; } else if (io_u->ddir == DDIR_READ) { memcpy(io_u->buf, data, res); - if (res == 0) { + if (res == 0) log_err("Got NFS EOF, this is probably not expected\n"); - } } /* fio uses resid to track remaining data */ io_u->resid = io_u->xfer_buflen - res; @@ -213,9 +211,8 @@ static int do_mount(struct thread_data *td, const char *url) int path_len = 0; char *mnt_dir = NULL; - if (options->context) { + if (options->context) return 0; - } options->context = nfs_init_context(); if (options->context == NULL) { @@ -294,9 +291,8 @@ static int fio_libnfs_open(struct thread_data *td, struct fio_file *f) } ret = nfs_open(options->context, f->file_name, flags, &nfs_data->nfsfh); - if (ret != 0) { + if (ret != 0) log_err("Failed to open %s: %s\n", f->file_name, nfs_get_error(options->context)); - } f->engine_data = nfs_data; return ret; } @@ -306,9 +302,8 @@ static int fio_libnfs_close(struct thread_data *td, struct fio_file *f) struct nfs_data *nfs_data = f->engine_data; struct fio_libnfs_options *o = nfs_data->options; int ret = 0; - if (nfs_data->nfsfh) { + if (nfs_data->nfsfh) ret = nfs_close(o->context, nfs_data->nfsfh); - } free(nfs_data); 
f->engine_data = NULL; return ret; From 388f111191981b7162ce3283bc33afbe6ca7dc79 Mon Sep 17 00:00:00 2001 From: Taras Glek Date: Wed, 5 May 2021 09:04:56 -0700 Subject: [PATCH 13/42] skip skeleton comments --- engines/nfs.c | 37 +++---------------------------------- 1 file changed, 3 insertions(+), 34 deletions(-) diff --git a/engines/nfs.c b/engines/nfs.c index 6d4ad7b1f2..21be88334d 100644 --- a/engines/nfs.c +++ b/engines/nfs.c @@ -46,12 +46,6 @@ static struct fio_option options[] = { }, }; -/* - * The ->event() hook is called to match an event number with an io_u. - * After the core has called ->getevents() and it has returned eg 3, - * the ->event() hook must return the 3 events that have completed for - * subsequent calls to ->event() with [0-2]. Required. - */ static struct io_u *fio_libnfs_event(struct thread_data *td, int event) { struct fio_libnfs_options *o = td->eo; @@ -107,12 +101,6 @@ static int nfs_event_loop(struct thread_data *td, bool flush) { #undef SHOULD_WAIT } -/* - * The ->getevents() hook is used to reap completion events from an async - * io engine. It returns the number of completed events since the last call, - * which may then be retrieved by calling the ->event() hook with the event - * numbers. Required. - */ static int fio_libnfs_getevents(struct thread_data *td, unsigned int min, unsigned int max, const struct timespec *t) { @@ -157,16 +145,6 @@ static int queue_read(struct fio_libnfs_options *o, struct io_u *io_u) { return nfs_pread_async(o->context, nfs_data->nfsfh, io_u->offset, io_u->buflen, nfs_callback, io_u); } -/* - * The ->queue() hook is responsible for initiating io on the io_u - * being passed in. If the io engine is a synchronous one, io may complete - * before ->queue() returns. Required. - * - * The io engine must transfer in the direction noted by io_u->ddir - * to the buffer pointed to by io_u->xfer_buf for as many bytes as - * io_u->xfer_buflen. Residual data count may be set in io_u->resid - * for a short read/write. - */ static enum fio_q_status fio_libnfs_queue(struct thread_data *td, struct io_u *io_u) { @@ -201,7 +179,9 @@ static enum fio_q_status fio_libnfs_queue(struct thread_data *td, return ret; } -/** Do a mount if one has not been done before */ +/* + * Do a mount if one has not been done before + */ static int do_mount(struct thread_data *td, const char *url) { size_t event_size = sizeof(struct io_u **) * td->o.iodepth; @@ -237,11 +217,6 @@ static int do_mount(struct thread_data *td, const char *url) return ret; } -/* - * The init function is called once per thread/process, and should set up - * any structures that this io engine requires to keep track of io. Not - * required. - */ static int fio_libnfs_setup(struct thread_data *td) { /* Using threads with libnfs causes fio to hang on exit, lower performance */ @@ -249,11 +224,6 @@ static int fio_libnfs_setup(struct thread_data *td) return 0; } -/* - * This is paired with the ->init() function and is called when a thread is - * done doing io. Should tear down anything setup by the ->init() function. - * Not required. 
- */ static void fio_libnfs_cleanup(struct thread_data *td) { struct fio_libnfs_options *o = td->eo; @@ -342,4 +312,3 @@ static void fio_exit fio_nfs_unregister(void) { unregister_ioengine(&ioengine); } - From c94b8f18181f2aca2e5ad25aa66cb1e354570e9f Mon Sep 17 00:00:00 2001 From: Taras Glek Date: Wed, 5 May 2021 09:11:06 -0700 Subject: [PATCH 14/42] clean up nfs example --- examples/nfs.fio | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/nfs.fio b/examples/nfs.fio index 2449f4154d..f856cebfbe 100644 --- a/examples/nfs.fio +++ b/examples/nfs.fio @@ -7,7 +7,6 @@ size=104857600 lat_percentiles=1 group_reporting numjobs=10 -direct=1 ramp_time=5s filename_format=myfiles.$clientuid.$jobnum.$filenum time_based=1 @@ -20,4 +19,4 @@ stonewall [read] wait_for=write rw=randread -runtime=10s \ No newline at end of file +runtime=10s From cffe80a41cbf9b26446c803177a27f7695f94a31 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 6 May 2021 17:23:31 +0100 Subject: [PATCH 15/42] configure: fix check_min_lib_version() eval The following shell statement: if eval "echo \$$_feature" = "yes" ; then executes: echo $... = "yes" It does not actually compare the variable named by $_feature to the string "yes". Add the missing "test" call so the comparison happens as intended and wrap the eval so it doesn't include the = "yes". Fixes: 3e48f7c9de61 ("configure: fix syntax error with NetBSD") Cc: Dmitry Fomichev Signed-off-by: Stefan Hajnoczi Signed-off-by: Jens Axboe --- configure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure b/configure index a7d82be06b..e886bdc8c0 100755 --- a/configure +++ b/configure @@ -142,7 +142,7 @@ check_min_lib_version() { fi : "${_feature:=${1}}" if "${cross_prefix}"pkg-config --version > /dev/null 2>&1; then - if eval "echo \$$_feature" = "yes" ; then + if test "$(eval echo \"\$$_feature\")" = "yes" ; then feature_not_found "$_feature" "$1 >= $2" fi else From 193aaf6a41329b1858d75970cdc4e1777b87c07a Mon Sep 17 00:00:00 2001 From: Gonzalez Date: Thu, 6 May 2021 11:15:41 -0700 Subject: [PATCH 16/42] Add Documentation for z unit --- HOWTO | 14 ++++++++++---- fio.1 | 26 +++++++++++++++++++------- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/HOWTO b/HOWTO index 889526d921..177310f64f 100644 --- a/HOWTO +++ b/HOWTO @@ -544,6 +544,9 @@ Parameter types * *Ti* -- means tebi (Ti) or 1024**4 * *Pi* -- means pebi (Pi) or 1024**5 + For Zone Block Device Mode: + * *z* -- means Zone + With :option:`kb_base`\=1024 (the default), the unit prefixes are opposite from those specified in the SI and IEC 80000-13 standards to provide compatibility with old scripts. For example, 4k means 4096. @@ -1277,13 +1280,14 @@ I/O type .. option:: offset=int Start I/O at the provided offset in the file, given as either a fixed size in - bytes or a percentage. If a percentage is given, the generated offset will be + bytes, zones or a percentage. If a percentage is given, the generated offset will be aligned to the minimum ``blocksize`` or to the value of ``offset_align`` if provided. Data before the given offset will not be touched. This effectively caps the file size at `real_size - offset`. Can be combined with :option:`size` to constrain the start and end range of the I/O workload. A percentage can be specified by a number between 1 and 100 followed by '%', - for example, ``offset=20%`` to specify 20%. + for example, ``offset=20%`` to specify 20%. In ZBD mode, value can be set as + number of zones using 'z'. .. 
option:: offset_align=int @@ -1300,7 +1304,8 @@ I/O type intended to operate on a file in parallel disjoint segments, with even spacing between the starting points. Percentages can be used for this option. If a percentage is given, the generated offset will be aligned to the minimum - ``blocksize`` or to the value of ``offset_align`` if provided. + ``blocksize`` or to the value of ``offset_align`` if provided. In ZBD mode, value can + also be set as number of zones using 'z'. .. option:: number_ios=int @@ -1818,7 +1823,8 @@ I/O size If this option is not specified, fio will use the full size of the given files or devices. If the files do not exist, size must be given. It is also possible to give size as a percentage between 1 and 100. If ``size=20%`` is - given, fio will use 20% of the full size of the given files or devices. + given, fio will use 20% of the full size of the given files or devices. + In ZBD mode, value can also be set as number of zones using 'z'. Can be combined with :option:`offset` to constrain the start and end range that I/O will be done within. diff --git a/fio.1 b/fio.1 index c3916168f2..e7da5c6826 100644 --- a/fio.1 +++ b/fio.1 @@ -288,6 +288,15 @@ Pi means pebi (Pi) or 1024**5 .PD .RE .P +For Zone Block Device Mode: +.RS +.P +.PD 0 +z means Zone +.P +.PD +.RE +.P With `kb_base=1024' (the default), the unit prefixes are opposite from those specified in the SI and IEC 80000-13 standards to provide compatibility with old scripts. For example, 4k means 4096. @@ -1061,13 +1070,14 @@ should be associated with them. .TP .BI offset \fR=\fPint[%|z] Start I/O at the provided offset in the file, given as either a fixed size in -bytes or a percentage. If a percentage is given, the generated offset will be +bytes, zones or a percentage. If a percentage is given, the generated offset will be aligned to the minimum \fBblocksize\fR or to the value of \fBoffset_align\fR if provided. Data before the given offset will not be touched. This effectively caps the file size at `real_size \- offset'. Can be combined with \fBsize\fR to constrain the start and end range of the I/O workload. A percentage can be specified by a number between 1 and 100 followed by '%', -for example, `offset=20%' to specify 20%. +for example, `offset=20%' to specify 20%. In ZBD mode, value can be set as +number of zones using 'z'. .TP .BI offset_align \fR=\fPint If set to non-zero value, the byte offset generated by a percentage \fBoffset\fR @@ -1082,7 +1092,8 @@ specified). This option is useful if there are several jobs which are intended to operate on a file in parallel disjoint segments, with even spacing between the starting points. Percentages can be used for this option. If a percentage is given, the generated offset will be aligned to the minimum -\fBblocksize\fR or to the value of \fBoffset_align\fR if provided. +\fBblocksize\fR or to the value of \fBoffset_align\fR if provided.In ZBD mode, value +can be set as number of zones using 'z'. .TP .BI number_ios \fR=\fPint Fio will normally perform I/Os until it has exhausted the size of the region @@ -1607,9 +1618,9 @@ set to the physical size of the given files or devices if they exist. If this option is not specified, fio will use the full size of the given files or devices. If the files do not exist, size must be given. It is also possible to give size as a percentage between 1 and 100. If `size=20%' is -given, fio will use 20% of the full size of the given files or devices. 
-Can be combined with \fBoffset\fR to constrain the start and end range -that I/O will be done within. +given, fio will use 20% of the full size of the given files or devices. In ZBD mode, +size can be given in units of number of zones using 'z'. Can be combined with \fBoffset\fR to +constrain the start and end range that I/O will be done within. .TP .BI io_size \fR=\fPint[%|z] "\fR,\fB io_limit" \fR=\fPint[%|z] Normally fio operates within the region set by \fBsize\fR, which means @@ -1621,7 +1632,8 @@ will perform I/O within the first 20GiB but exit when 5GiB have been done. The opposite is also possible \-\- if \fBsize\fR is set to 20GiB, and \fBio_size\fR is set to 40GiB, then fio will do 40GiB of I/O within the 0..20GiB region. Value can be set as percentage: \fBio_size\fR=N%. -In this case \fBio_size\fR multiplies \fBsize\fR= value. +In this case \fBio_size\fR multiplies \fBsize\fR= value. In ZBD mode, value can +also be set as number of zones using 'z'. .TP .BI filesize \fR=\fPirange(int) Individual file sizes. May be a range, in which case fio will select sizes From 79f488cbd95ca6989031a7ace5ec382313d31b3c Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 7 May 2021 16:13:05 -0500 Subject: [PATCH 17/42] don't access dlclose'd dynamic ioengine object after close Alexey reported this bug when using dynamically loaded IO engines; a segfault on the line where we set the dlhandle to NULL after the dlclose. I think this is because ops points to the thing we obtained from dlsym: ops = dlsym(dlhandle, engine_lib); and after the final dlclose, the object no longer exists and efforts to set the handle within it will fail for obvious reasons. I'm not sure why I hadn't seen this before. Fixes-RH-Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1956963 Reported-by: Alexey Dobriyan Fixes: f6931a1 ("fio: move dynamic library handle to io_ops structure") Tested-by: Alexey Dobriyan Signed-off-by: Eric Sandeen Signed-off-by: Jens Axboe --- ioengines.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ioengines.c b/ioengines.c index 3561bb4e6e..dd61af07a4 100644 --- a/ioengines.c +++ b/ioengines.c @@ -234,7 +234,6 @@ void free_ioengine(struct thread_data *td) if (td->io_ops->dlhandle) { dprint(FD_IO, "dlclose ioengine %s\n", td->io_ops->name); dlclose(td->io_ops->dlhandle); - td->io_ops->dlhandle = NULL; } td->io_ops = NULL; From 6ee607ba9c5129ebf0bac1c42fa0a4700456cb88 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 6 May 2021 13:18:45 +0000 Subject: [PATCH 18/42] oslib/linux-blkzoned: make sure that we always support zone capacity A common problem is that users upgrade their kernel to support NVMe ZNS devices, however, they still use the kernel uapi headers provided by their distro. This means that even if the kernel will populate the zone capacity fields for each zone in the zone report returned by the ioctl, fio will not know how to interpret that data. This leads to fio writing past the zone capacity, which will lead to I/O errors. It is not trivial for a user to realize that the kernel uapi headers provided by their distro is the reason for these I/O errors. In order to make it easier for these users, provide a copy of the current zoned block device kernel uapi structs. If the kernel uapi headers installed on the system are too old to support zone capacity, use the locally defined structs instead. If the installed headers are new enough to support zone capacity, use the installed headers. This way, fio will always be able to handle zone capacity (if the kernel supports it). 
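For reference, the CONFIG_HAVE_REP_CAPACITY symbol used below would normally come from a configure-time compile probe along these lines (a minimal sketch under the assumption of a probe program in the style of the existing configure checks; it is not part of this patch):

	#include <linux/blkzoned.h>

	int main(int argc, char **argv)
	{
		/* Compiles only if the installed uapi header already provides
		 * the zone capacity field and the capacity report flag. */
		struct blk_zone z;

		z.capacity = 0;
		return BLK_ZONE_REP_CAPACITY;
	}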
At the same time, we will not redefine any structs from the installed headers if they are newer than our locally defined structs. Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel Signed-off-by: Jens Axboe --- oslib/linux-blkzoned.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/oslib/linux-blkzoned.c b/oslib/linux-blkzoned.c index f37c67fc86..81e4e7f0d5 100644 --- a/oslib/linux-blkzoned.c +++ b/oslib/linux-blkzoned.c @@ -23,6 +23,37 @@ #include +/* + * If the uapi headers installed on the system lacks zone capacity support, + * use our local versions. If the installed headers are recent enough to + * support zone capacity, do not redefine any structs. + */ +#ifndef CONFIG_HAVE_REP_CAPACITY +#define BLK_ZONE_REP_CAPACITY (1 << 0) + +struct blk_zone_v2 { + __u64 start; /* Zone start sector */ + __u64 len; /* Zone length in number of sectors */ + __u64 wp; /* Zone write pointer position */ + __u8 type; /* Zone type */ + __u8 cond; /* Zone condition */ + __u8 non_seq; /* Non-sequential write resources active */ + __u8 reset; /* Reset write pointer recommended */ + __u8 resv[4]; + __u64 capacity; /* Zone capacity in number of sectors */ + __u8 reserved[24]; +}; +#define blk_zone blk_zone_v2 + +struct blk_zone_report_v2 { + __u64 sector; + __u32 nr_zones; + __u32 flags; +struct blk_zone zones[0]; +}; +#define blk_zone_report blk_zone_report_v2 +#endif /* CONFIG_HAVE_REP_CAPACITY */ + /* * Read up to 255 characters from the first line of a file. Strip the trailing * newline. @@ -116,10 +147,8 @@ int blkzoned_get_zoned_model(struct thread_data *td, struct fio_file *f, static uint64_t zone_capacity(struct blk_zone_report *hdr, struct blk_zone *blkz) { -#ifdef CONFIG_HAVE_REP_CAPACITY if (hdr->flags & BLK_ZONE_REP_CAPACITY) return blkz->capacity << 9; -#endif return blkz->len << 9; } From 418f53993b07e48b5a69db84c9c7209acd53eac3 Mon Sep 17 00:00:00 2001 From: Martin Bukatovic Date: Tue, 11 May 2021 09:38:55 +0200 Subject: [PATCH 19/42] Make fill_device to stop writing on EDQUOT Option fill_device stops writing when we run out of quota as well. Signed-off-by: Martin Bukatovic Signed-off-by: Jens Axboe --- HOWTO | 3 ++- backend.c | 7 ++++--- filesetup.c | 11 ++++++++--- fio.1 | 3 ++- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/HOWTO b/HOWTO index 177310f64f..f5681c0dca 100644 --- a/HOWTO +++ b/HOWTO @@ -1858,7 +1858,8 @@ I/O size .. option:: fill_device=bool, fill_fs=bool Sets size to something really large and waits for ENOSPC (no space left on - device) as the terminating condition. Only makes sense with sequential + device) or EDQUOT (disk quota exceeded) + as the terminating condition. Only makes sense with sequential write. For a read workload, the mount point will be filled first then I/O started on the result. This option doesn't make sense if operating on a raw device node, since the size of that is already known by the file system. diff --git a/backend.c b/backend.c index 399c299e14..6290e0d652 100644 --- a/backend.c +++ b/backend.c @@ -393,7 +393,7 @@ static bool break_on_this_error(struct thread_data *td, enum fio_ddir ddir, td_clear_error(td); *retptr = 0; return false; - } else if (td->o.fill_device && err == ENOSPC) { + } else if (td->o.fill_device && (err == ENOSPC || err == EDQUOT)) { /* * We expect to hit this error if * fill_device option is set. 
@@ -1105,7 +1105,7 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done) if (td->trim_entries) log_err("fio: %lu trim entries leaked?\n", td->trim_entries); - if (td->o.fill_device && td->error == ENOSPC) { + if (td->o.fill_device && (td->error == ENOSPC || td->error == EDQUOT)) { td->error = 0; fio_mark_td_terminate(td); } @@ -1120,7 +1120,8 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done) if (i) { ret = io_u_queued_complete(td, i); - if (td->o.fill_device && td->error == ENOSPC) + if (td->o.fill_device && + (td->error == ENOSPC || td->error == EDQUOT)) td->error = 0; } diff --git a/filesetup.c b/filesetup.c index e664f8b42f..296de5a11a 100644 --- a/filesetup.c +++ b/filesetup.c @@ -226,11 +226,16 @@ static int extend_file(struct thread_data *td, struct fio_file *f) if (r < 0) { int __e = errno; - if (__e == ENOSPC) { + if (__e == ENOSPC || __e == EDQUOT) { + const char *__e_name; if (td->o.fill_device) break; - log_info("fio: ENOSPC on laying out " - "file, stopping\n"); + if (__e == ENOSPC) + __e_name = "ENOSPC"; + else + __e_name = "EDQUOT"; + log_info("fio: %s on laying out " + "file, stopping\n", __e_name); } td_verror(td, errno, "write"); } else diff --git a/fio.1 b/fio.1 index e7da5c6826..533bcf6a52 100644 --- a/fio.1 +++ b/fio.1 @@ -1650,7 +1650,8 @@ of a file. This option is ignored on non-regular files. .TP .BI fill_device \fR=\fPbool "\fR,\fB fill_fs" \fR=\fPbool Sets size to something really large and waits for ENOSPC (no space left on -device) as the terminating condition. Only makes sense with sequential +device) or EDQUOT (disk quota exceeded) +as the terminating condition. Only makes sense with sequential write. For a read workload, the mount point will be filled first then I/O started on the result. This option doesn't make sense if operating on a raw device node, since the size of that is already known by the file system. From 30bec59eab3908b681cbc2866179f7166a849c83 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 May 2021 07:58:03 -0600 Subject: [PATCH 20/42] os: define EDQUOT to EIO if the OS doesn't provide it Fixes: 418f53993b07 ("Make fill_device to stop writing on EDQUOT") Signed-off-by: Jens Axboe --- os/os.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/os/os.h b/os/os.h index b46f416400..e47d3d9706 100644 --- a/os/os.h +++ b/os/os.h @@ -7,6 +7,7 @@ #include #include #include +#include #include "../arch/arch.h" /* IWYU pragma: export */ #include "../lib/types.h" @@ -58,6 +59,10 @@ typedef enum { #error "unsupported os" #endif +#ifndef EDQUOT +#define EDQUOT EIO +#endif + #ifdef CONFIG_POSIXAIO #include #ifndef FIO_OS_HAVE_AIOCB_TYPEDEF From 2984a4fcedcdc5536b2559d634694fb8fecf40c4 Mon Sep 17 00:00:00 2001 From: Lars Kellogg-Stedman Date: Wed, 12 May 2021 18:41:43 -0400 Subject: [PATCH 21/42] fix fio2gnuplot to work with new logging format The logging format updates documented in 1a953d97 were never propagated to fio2gnuplot, which since then has been failing with a ValueError exception. This commit explicits limits fio2gnuplot to only reading the first 4 columns in the log file. 
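For context: after the referenced log format update each entry carries more than four comma-separated fields, so unpacking a whole entry into the four names time, perf, x and block_size raises ValueError; taking only the first four fields of each entry restores the old behaviour and simply ignores the extra trailing columns.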
Closes #928 --- tools/plot/fio2gnuplot | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/plot/fio2gnuplot b/tools/plot/fio2gnuplot index 78ee82fb80..d2dc81df9b 100755 --- a/tools/plot/fio2gnuplot +++ b/tools/plot/fio2gnuplot @@ -198,7 +198,7 @@ def compute_temp_file(fio_data_file,disk_perf,gnuplot_output_dir, min_time, max_ # Index will be used to remember what file was featuring what value index=index+1 - time, perf, x, block_size = line[1] + time, perf, x, block_size = line[1][:4] if (blk_size == 0): try: blk_size=int(block_size) From 106e14ce87c5b1984727aabf9a48f7284bff21c1 Mon Sep 17 00:00:00 2001 From: Felix Abecassis Date: Thu, 13 May 2021 17:02:40 -0700 Subject: [PATCH 22/42] stat: fix integer overflow in convert_agg_kbytes_percent Assuming that "int" is 32-bit, for high bandwidth values (> 21.5 GB/s) the expression "mean * 100" will cause an integer overflow before the conversion to "double" happens. Signed-off-by: Felix Abecassis --- stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stat.c b/stat.c index b7222f465f..a8a96c85a4 100644 --- a/stat.c +++ b/stat.c @@ -462,7 +462,7 @@ static double convert_agg_kbytes_percent(struct group_run_stats *rs, int ddir, i { double p_of_agg = 100.0; if (rs && rs->agg[ddir] > 1024) { - p_of_agg = mean * 100 / (double) (rs->agg[ddir] / 1024.0); + p_of_agg = mean * 100.0 / (double) (rs->agg[ddir] / 1024.0); if (p_of_agg > 100.0) p_of_agg = 100.0; From 6399ab79bf410ac317260614c36f60ad76e5aa35 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 14 May 2021 12:52:51 +0000 Subject: [PATCH 23/42] zbd: only put an upper limit on max open zones once There is an upper limit that is checked for each td, and for each file, even though a file has a pointer to a zoned_block_device_info that has already been created. Multiple files, from the same or from another td can point to the same zoned_block_device_info. All zoned_block_device_info:s have already been created earlier in the call chain. Simplify this by only checking the upper limit on max open zones when a zoned_block_device_info is created. This way, max_open_zones is handled from a single location, instead of potentially being reassigned from a completely different location. Signed-off-by: Niklas Cassel Signed-off-by: Jens Axboe --- zbd.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/zbd.c b/zbd.c index eed796b321..46ff9aeb04 100644 --- a/zbd.c +++ b/zbd.c @@ -588,7 +588,8 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f) if (ret == 0) { f->zbd_info->model = zbd_model; - f->zbd_info->max_open_zones = td->o.max_open_zones; + f->zbd_info->max_open_zones = + min_not_zero(td->o.max_open_zones, ZBD_MAX_OPEN_ZONES); } return ret; } @@ -726,8 +727,6 @@ int zbd_setup_files(struct thread_data *td) if (zbd_is_seq_job(f)) assert(f->min_zone < f->max_zone); - zbd->max_open_zones = zbd->max_open_zones ?: ZBD_MAX_OPEN_ZONES; - if (td->o.max_open_zones > 0 && zbd->max_open_zones != td->o.max_open_zones) { log_err("Different 'max_open_zones' values\n"); From eaa45783ef5079884f96813e74c6b450dc52d0f0 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 14 May 2021 12:52:51 +0000 Subject: [PATCH 24/42] oslib/linux-blkzoned: move sysfs reading into its own function Move the sysfs reading into its own function so that it can be reused. This new function will be reused in a following patch. No functional change intended. 
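To sketch the intended reuse (a hypothetical fragment that assumes the surrounding fio context; the actual consumer, and the attribute name it reads, are introduced in the following patch):

	char *str;

	str = blkzoned_get_sysfs_attr(f->file_name, "queue/max_open_zones");
	if (str) {
		dprint(FD_ZBD, "%s: max_open_zones: %s\n", f->file_name, str);
		free(str);
	}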
Signed-off-by: Niklas Cassel Signed-off-by: Jens Axboe --- oslib/linux-blkzoned.c | 62 +++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/oslib/linux-blkzoned.c b/oslib/linux-blkzoned.c index 81e4e7f0d5..84a64ed301 100644 --- a/oslib/linux-blkzoned.c +++ b/oslib/linux-blkzoned.c @@ -74,12 +74,16 @@ static char *read_file(const char *path) return strdup(line); } -int blkzoned_get_zoned_model(struct thread_data *td, struct fio_file *f, - enum zbd_zoned_model *model) +/* + * Get the value of a sysfs attribute for a block device. + * + * Returns NULL on failure. + * Returns a pointer to a string on success. + * The caller is responsible for freeing the memory. + */ +static char *blkzoned_get_sysfs_attr(const char *file_name, const char *attr) { - const char *file_name = f->file_name; - char *zoned_attr_path = NULL; - char *model_str = NULL; + char *attr_path = NULL; struct stat statbuf; char *sys_devno_path = NULL; char *part_attr_path = NULL; @@ -87,13 +91,7 @@ int blkzoned_get_zoned_model(struct thread_data *td, struct fio_file *f, char sys_path[PATH_MAX]; ssize_t sz; char *delim = NULL; - - if (f->filetype != FIO_TYPE_BLOCK) { - *model = ZBD_IGNORE; - return 0; - } - - *model = ZBD_NONE; + char *attr_str = NULL; if (stat(file_name, &statbuf) < 0) goto out; @@ -123,24 +121,44 @@ int blkzoned_get_zoned_model(struct thread_data *td, struct fio_file *f, *delim = '\0'; } - if (asprintf(&zoned_attr_path, - "/sys/dev/block/%s/queue/zoned", sys_path) < 0) + if (asprintf(&attr_path, + "/sys/dev/block/%s/%s", sys_path, attr) < 0) goto out; - model_str = read_file(zoned_attr_path); + attr_str = read_file(attr_path); +out: + free(attr_path); + free(part_str); + free(part_attr_path); + free(sys_devno_path); + + return attr_str; +} + +int blkzoned_get_zoned_model(struct thread_data *td, struct fio_file *f, + enum zbd_zoned_model *model) +{ + char *model_str = NULL; + + if (f->filetype != FIO_TYPE_BLOCK) { + *model = ZBD_IGNORE; + return 0; + } + + *model = ZBD_NONE; + + model_str = blkzoned_get_sysfs_attr(f->file_name, "queue/zoned"); if (!model_str) - goto out; - dprint(FD_ZBD, "%s: zbd model string: %s\n", file_name, model_str); + return 0; + + dprint(FD_ZBD, "%s: zbd model string: %s\n", f->file_name, model_str); if (strcmp(model_str, "host-aware") == 0) *model = ZBD_HOST_AWARE; else if (strcmp(model_str, "host-managed") == 0) *model = ZBD_HOST_MANAGED; -out: + free(model_str); - free(zoned_attr_path); - free(part_str); - free(part_attr_path); - free(sys_devno_path); + return 0; } From d2f442bc0bd507510089d56cd510616093415702 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 14 May 2021 12:53:14 +0000 Subject: [PATCH 25/42] ioengines: add get_max_open_zones zoned block device operation Define a new IO engine operation to get the maximum number of open zones. Like the existing IO engine operations: .get_zoned_model, .report_zones, and .reset_wp, this new IO engine operation is only valid for zoned block devices. Similarly to the other zbd IO engine operations, also provide a default implementation inside oslib/linux-blkzoned.c that will be used if the ioengine does not override it. The default Linux oslib implementation is implemented similarly to blkzoned_get_zoned_model(), i.e. it will return a successful error code even when the sysfs attribute does not exist. This is because the sysfs max_open_zones attribute was introduced first in Linux v5.9. 
All error handling is still there, so an ioengine that provides its own implementation will still have its error code respected properly. Signed-off-by: Niklas Cassel Signed-off-by: Jens Axboe --- engines/skeleton_external.c | 13 ++++++ ioengines.h | 4 +- oslib/blkzoned.h | 7 +++ oslib/linux-blkzoned.c | 21 +++++++++ zbd.c | 90 ++++++++++++++++++++++++++++++++++--- 5 files changed, 129 insertions(+), 6 deletions(-) diff --git a/engines/skeleton_external.c b/engines/skeleton_external.c index 7f3e4cb3a1..c79b6f1114 100644 --- a/engines/skeleton_external.c +++ b/engines/skeleton_external.c @@ -193,6 +193,18 @@ static int fio_skeleton_reset_wp(struct thread_data *td, struct fio_file *f, return 0; } +/* + * Hook called for getting the maximum number of open zones for a + * ZBD_HOST_MANAGED zoned block device. + * A @max_open_zones value set to zero means no limit. + */ +static int fio_skeleton_get_max_open_zones(struct thread_data *td, + struct fio_file *f, + unsigned int *max_open_zones) +{ + return 0; +} + /* * Note that the structure is exported, so that fio can get it via * dlsym(..., "ioengine"); for (and only for) external engines. @@ -212,6 +224,7 @@ struct ioengine_ops ioengine = { .get_zoned_model = fio_skeleton_get_zoned_model, .report_zones = fio_skeleton_report_zones, .reset_wp = fio_skeleton_reset_wp, + .get_max_open_zones = fio_skeleton_get_max_open_zones, .options = options, .option_struct_size = sizeof(struct fio_skeleton_options), }; diff --git a/ioengines.h b/ioengines.h index 1d01ab0a6d..b3f755b477 100644 --- a/ioengines.h +++ b/ioengines.h @@ -8,7 +8,7 @@ #include "io_u.h" #include "zbd_types.h" -#define FIO_IOOPS_VERSION 29 +#define FIO_IOOPS_VERSION 30 #ifndef CONFIG_DYNAMIC_ENGINES #define FIO_STATIC static @@ -59,6 +59,8 @@ struct ioengine_ops { uint64_t, struct zbd_zone *, unsigned int); int (*reset_wp)(struct thread_data *, struct fio_file *, uint64_t, uint64_t); + int (*get_max_open_zones)(struct thread_data *, struct fio_file *, + unsigned int *); int option_struct_size; struct fio_option *options; }; diff --git a/oslib/blkzoned.h b/oslib/blkzoned.h index 4cc071dc6a..719b041d12 100644 --- a/oslib/blkzoned.h +++ b/oslib/blkzoned.h @@ -16,6 +16,8 @@ extern int blkzoned_report_zones(struct thread_data *td, struct zbd_zone *zones, unsigned int nr_zones); extern int blkzoned_reset_wp(struct thread_data *td, struct fio_file *f, uint64_t offset, uint64_t length); +extern int blkzoned_get_max_open_zones(struct thread_data *td, struct fio_file *f, + unsigned int *max_open_zones); #else /* * Define stubs for systems that do not have zoned block device support. 
@@ -44,6 +46,11 @@ static inline int blkzoned_reset_wp(struct thread_data *td, struct fio_file *f, { return -EIO; } +static inline int blkzoned_get_max_open_zones(struct thread_data *td, struct fio_file *f, + unsigned int *max_open_zones) +{ + return -EIO; +} #endif #endif /* FIO_BLKZONED_H */ diff --git a/oslib/linux-blkzoned.c b/oslib/linux-blkzoned.c index 84a64ed301..6f89ec6f41 100644 --- a/oslib/linux-blkzoned.c +++ b/oslib/linux-blkzoned.c @@ -162,6 +162,27 @@ int blkzoned_get_zoned_model(struct thread_data *td, struct fio_file *f, return 0; } +int blkzoned_get_max_open_zones(struct thread_data *td, struct fio_file *f, + unsigned int *max_open_zones) +{ + char *max_open_str; + + if (f->filetype != FIO_TYPE_BLOCK) + return -EIO; + + max_open_str = blkzoned_get_sysfs_attr(f->file_name, "queue/max_open_zones"); + if (!max_open_str) + return 0; + + dprint(FD_ZBD, "%s: max open zones supported by device: %s\n", + f->file_name, max_open_str); + *max_open_zones = atoll(max_open_str); + + free(max_open_str); + + return 0; +} + static uint64_t zone_capacity(struct blk_zone_report *hdr, struct blk_zone *blkz) { diff --git a/zbd.c b/zbd.c index 46ff9aeb04..68cd58e1b9 100644 --- a/zbd.c +++ b/zbd.c @@ -113,6 +113,34 @@ int zbd_reset_wp(struct thread_data *td, struct fio_file *f, return ret; } +/** + * zbd_get_max_open_zones - Get the maximum number of open zones + * @td: FIO thread data + * @f: FIO file for which to get max open zones + * @max_open_zones: Upon success, result will be stored here. + * + * A @max_open_zones value set to zero means no limit. + * + * Returns 0 upon success and a negative error code upon failure. + */ +int zbd_get_max_open_zones(struct thread_data *td, struct fio_file *f, + unsigned int *max_open_zones) +{ + int ret; + + if (td->io_ops && td->io_ops->get_max_open_zones) + ret = td->io_ops->get_max_open_zones(td, f, max_open_zones); + else + ret = blkzoned_get_max_open_zones(td, f, max_open_zones); + if (ret < 0) { + td_verror(td, errno, "get max open zones failed"); + log_err("%s: get max open zones failed (%d).\n", + f->file_name, errno); + } + + return ret; +} + /** * zbd_zone_idx - convert an offset into a zone number * @f: file pointer. @@ -554,6 +582,51 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f) return ret; } +static int zbd_set_max_open_zones(struct thread_data *td, struct fio_file *f) +{ + struct zoned_block_device_info *zbd = f->zbd_info; + unsigned int max_open_zones; + int ret; + + if (zbd->model != ZBD_HOST_MANAGED) { + /* Only host-managed devices have a max open limit */ + zbd->max_open_zones = td->o.max_open_zones; + goto out; + } + + /* If host-managed, get the max open limit */ + ret = zbd_get_max_open_zones(td, f, &max_open_zones); + if (ret) + return ret; + + if (!max_open_zones) { + /* No device limit */ + zbd->max_open_zones = td->o.max_open_zones; + } else if (!td->o.max_open_zones) { + /* No user limit. Set limit to device limit */ + zbd->max_open_zones = max_open_zones; + } else if (td->o.max_open_zones <= max_open_zones) { + /* Both user limit and dev limit. User limit not too large */ + zbd->max_open_zones = td->o.max_open_zones; + } else { + /* Both user limit and dev limit. 
User limit too large */ + td_verror(td, EINVAL, + "Specified --max_open_zones is too large"); + log_err("Specified --max_open_zones (%d) is larger than max (%u)\n", + td->o.max_open_zones, max_open_zones); + return -EINVAL; + } + +out: + /* Ensure that the limit is not larger than FIO's internal limit */ + zbd->max_open_zones = min_not_zero(zbd->max_open_zones, + (uint32_t) ZBD_MAX_OPEN_ZONES); + dprint(FD_ZBD, "%s: using max open zones limit: %"PRIu32"\n", + f->file_name, zbd->max_open_zones); + + return 0; +} + /* * Allocate zone information and store it into f->zbd_info if zonemode=zbd. * @@ -576,9 +649,13 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f) case ZBD_HOST_AWARE: case ZBD_HOST_MANAGED: ret = parse_zone_info(td, f); + if (ret) + return ret; break; case ZBD_NONE: ret = init_zone_info(td, f); + if (ret) + return ret; break; default: td_verror(td, EINVAL, "Unsupported zoned model"); @@ -586,12 +663,15 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f) return -EINVAL; } - if (ret == 0) { - f->zbd_info->model = zbd_model; - f->zbd_info->max_open_zones = - min_not_zero(td->o.max_open_zones, ZBD_MAX_OPEN_ZONES); + f->zbd_info->model = zbd_model; + + ret = zbd_set_max_open_zones(td, f); + if (ret) { + zbd_free_zone_info(f); + return ret; } - return ret; + + return 0; } void zbd_free_zone_info(struct fio_file *f) From e8267436fd7a02d819f3d0a2a77527d2f942e08b Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 14 May 2021 12:53:15 +0000 Subject: [PATCH 26/42] engines/libzbc: add support for the get_max_open_zones io op Add support for the new .get_max_open_zones io operation. zbc.c will only ever call this callback for host-managed devices. Signed-off-by: Niklas Cassel Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- engines/libzbc.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/engines/libzbc.c b/engines/libzbc.c index 2aacf7bbeb..3dde93db54 100644 --- a/engines/libzbc.c +++ b/engines/libzbc.c @@ -19,6 +19,7 @@ struct libzbc_data { struct zbc_device *zdev; enum zbc_dev_model model; uint64_t nr_sectors; + uint32_t max_open_seq_req; }; static int libzbc_get_dev_info(struct libzbc_data *ld, struct fio_file *f) @@ -32,6 +33,7 @@ static int libzbc_get_dev_info(struct libzbc_data *ld, struct fio_file *f) zbc_get_device_info(ld->zdev, zinfo); ld->model = zinfo->zbd_model; ld->nr_sectors = zinfo->zbd_sectors; + ld->max_open_seq_req = zinfo->zbd_max_nr_open_seq_req; dprint(FD_ZBD, "%s: vendor_id:%s, type: %s, model: %s\n", f->file_name, zinfo->zbd_vendor_id, @@ -335,6 +337,24 @@ static int libzbc_reset_wp(struct thread_data *td, struct fio_file *f, return -ret; } +static int libzbc_get_max_open_zones(struct thread_data *td, struct fio_file *f, + unsigned int *max_open_zones) +{ + struct libzbc_data *ld; + int ret; + + ret = libzbc_open_dev(td, f, &ld); + if (ret) + return ret; + + if (ld->max_open_seq_req == ZBC_NO_LIMIT) + *max_open_zones = 0; + else + *max_open_zones = ld->max_open_seq_req; + + return 0; +} + ssize_t libzbc_rw(struct thread_data *td, struct io_u *io_u) { struct libzbc_data *ld = td->io_ops_data; @@ -414,6 +434,7 @@ FIO_STATIC struct ioengine_ops ioengine = { .get_zoned_model = libzbc_get_zoned_model, .report_zones = libzbc_report_zones, .reset_wp = libzbc_reset_wp, + .get_max_open_zones = libzbc_get_max_open_zones, .queue = libzbc_queue, .flags = FIO_SYNCIO | FIO_NOEXTEND | FIO_RAWIO, }; From d7e3adb683f85e49e078599a08aec7cd7c32d977 Mon Sep 17 00:00:00 2001 From: DevriesL Date: 
Tue, 25 May 2021 23:45:11 +0800 Subject: [PATCH 27/42] android: add support for NDK sharedmem Android add support for NDK sharedmem since API level 26 and prohibit the directly use of ashmem since API level 29, so we can use sharedmem if targeting API level is higher than 26. Signed-off-by: DevriesL --- os/os-android.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/os/os-android.h b/os/os-android.h index 3f1aa9d30a..a81cd815e1 100644 --- a/os/os-android.h +++ b/os/os-android.h @@ -71,11 +71,15 @@ #include #include #include +#include +#if __ANDROID_API__ >= __ANDROID_API_O__ +#include +#else +#define ASHMEM_DEVICE "/dev/ashmem" +#endif #define shmid_ds shmid64_ds #define SHM_HUGETLB 04000 -#define ASHMEM_DEVICE "/dev/ashmem" - static inline int shmctl(int __shmid, int __cmd, struct shmid_ds *__buf) { int ret=0; @@ -89,6 +93,16 @@ static inline int shmctl(int __shmid, int __cmd, struct shmid_ds *__buf) return ret; } +#if __ANDROID_API__ >= __ANDROID_API_O__ +static inline int shmget(key_t __key, size_t __size, int __shmflg) +{ + char keybuf[11]; + + sprintf(keybuf, "%d", __key); + + return ASharedMemory_create(keybuf, __size + sizeof(uint64_t)); +} +#else static inline int shmget(key_t __key, size_t __size, int __shmflg) { int fd,ret; @@ -114,6 +128,7 @@ static inline int shmget(key_t __key, size_t __size, int __shmflg) close(fd); return ret; } +#endif static inline void *shmat(int __shmid, const void *__shmaddr, int __shmflg) { From 0313e938c9c8bb37d71dade239f1f5326677b079 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 26 May 2021 10:10:32 -0600 Subject: [PATCH 28/42] Fio 3.27 Signed-off-by: Jens Axboe --- FIO-VERSION-GEN | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FIO-VERSION-GEN b/FIO-VERSION-GEN index 294860716c..47af94e9de 100755 --- a/FIO-VERSION-GEN +++ b/FIO-VERSION-GEN @@ -1,7 +1,7 @@ #!/bin/sh GVF=FIO-VERSION-FILE -DEF_VER=fio-3.26 +DEF_VER=fio-3.27 LF=' ' From 6df25f781e07e373833ec1629e005d36474c3b67 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 27 May 2021 11:12:31 +0000 Subject: [PATCH 29/42] zbd: add missing client/server support for option max_open_zones Ensure that we convert the max_open_zones option for client/server. Use __cpu_to_le32()/__le32_to_cpu() rather than cpu_to_le32()/le32_to_cpu(), since max_open_zones is defined as int rather than unsigned int in thread_options.h. 
Signed-off-by: Niklas Cassel Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- cconv.c | 2 ++ server.h | 2 +- thread_options.h | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/cconv.c b/cconv.c index aa06e3ea6e..d4dfb81b3f 100644 --- a/cconv.c +++ b/cconv.c @@ -231,6 +231,7 @@ void convert_thread_options_to_cpu(struct thread_options *o, o->zone_capacity = le64_to_cpu(top->zone_capacity); o->zone_skip = le64_to_cpu(top->zone_skip); o->zone_mode = le32_to_cpu(top->zone_mode); + o->max_open_zones = __le32_to_cpu(top->max_open_zones); o->lockmem = le64_to_cpu(top->lockmem); o->offset_increment_percent = le32_to_cpu(top->offset_increment_percent); o->offset_increment = le64_to_cpu(top->offset_increment); @@ -573,6 +574,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top, top->zone_capacity = __cpu_to_le64(o->zone_capacity); top->zone_skip = __cpu_to_le64(o->zone_skip); top->zone_mode = __cpu_to_le32(o->zone_mode); + top->max_open_zones = __cpu_to_le32(o->max_open_zones); top->lockmem = __cpu_to_le64(o->lockmem); top->ddir_seq_add = __cpu_to_le64(o->ddir_seq_add); top->file_size_low = __cpu_to_le64(o->file_size_low); diff --git a/server.h b/server.h index b45b319ba2..8cf3a60b4b 100644 --- a/server.h +++ b/server.h @@ -48,7 +48,7 @@ struct fio_net_cmd_reply { }; enum { - FIO_SERVER_VER = 89, + FIO_SERVER_VER = 90, FIO_SERVER_MAX_FRAGMENT_PDU = 1024, FIO_SERVER_MAX_CMD_MB = 2048, diff --git a/thread_options.h b/thread_options.h index 5ecc72d7b5..4d48e46299 100644 --- a/thread_options.h +++ b/thread_options.h @@ -656,6 +656,7 @@ struct thread_options_pack { uint32_t allow_mounted_write; uint32_t zone_mode; + int32_t max_open_zones; } __attribute__((packed)); extern void convert_thread_options_to_cpu(struct thread_options *o, struct thread_options_pack *top); From 575686bb85fa36f326524c505e83c54abc0d2f2b Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 27 May 2021 11:12:32 +0000 Subject: [PATCH 30/42] zbd: add a new --ignore_zone_limits option In commit d2f442bc0bd5 ("ioengines: add get_max_open_zones zoned block device operation") we added a check that verifies that the specified --max_open_zones value is lower than the max value reported by the device. For ZNS devices there is a max open zones and a max active zones limit. For ZAC/ZBC devices there is only a max open zones limit. On ZAC/ZBC, there is thus no limit on the amount of zones that can be in zone state closed. When doing a write to an empty or closed zone, a ZAC/ZBC drive will close an arbitrary implicit open zone in order to handle the write. The ZNS specification has no requirement on closing a zone in order to handle a write to an empty or closed zone. The drive is free to return an error. Even on ZAC/ZBC, you do not want to exceed the max open zones limit, since it will lead to additional implicit close zone and implicit open zone operations, which may degrade performance. However, it seems that this is sometimes done on purpose, in order to measure the overhead of these additional operations. Therefore, add an option that allows the user to ignore the reported device limits. 
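As a usage sketch (the device path and values are purely illustrative), a run that deliberately exceeds the device limit in order to measure that overhead could look like: fio --name=zbd-overcommit --filename=/dev/sdX --direct=1 --zonemode=zbd --rw=write --bs=128k --max_open_zones=64 --ignore_zone_limits=1. Without --ignore_zone_limits=1, the same job would exit with an error whenever the drive reports a smaller max open zones limit.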
Signed-off-by: Niklas Cassel Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- cconv.c | 2 ++ fio.1 | 5 +++++ options.c | 10 ++++++++++ server.h | 2 +- thread_options.h | 2 ++ zbd.c | 2 +- 6 files changed, 21 insertions(+), 2 deletions(-) diff --git a/cconv.c b/cconv.c index d4dfb81b3f..74c241063a 100644 --- a/cconv.c +++ b/cconv.c @@ -232,6 +232,7 @@ void convert_thread_options_to_cpu(struct thread_options *o, o->zone_skip = le64_to_cpu(top->zone_skip); o->zone_mode = le32_to_cpu(top->zone_mode); o->max_open_zones = __le32_to_cpu(top->max_open_zones); + o->ignore_zone_limits = le32_to_cpu(top->ignore_zone_limits); o->lockmem = le64_to_cpu(top->lockmem); o->offset_increment_percent = le32_to_cpu(top->offset_increment_percent); o->offset_increment = le64_to_cpu(top->offset_increment); @@ -575,6 +576,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top, top->zone_skip = __cpu_to_le64(o->zone_skip); top->zone_mode = __cpu_to_le32(o->zone_mode); top->max_open_zones = __cpu_to_le32(o->max_open_zones); + top->ignore_zone_limits = cpu_to_le32(o->ignore_zone_limits); top->lockmem = __cpu_to_le64(o->lockmem); top->ddir_seq_add = __cpu_to_le64(o->ddir_seq_add); top->file_size_low = __cpu_to_le64(o->file_size_low); diff --git a/fio.1 b/fio.1 index ab08cb0120..5aa54a4d04 100644 --- a/fio.1 +++ b/fio.1 @@ -835,6 +835,11 @@ threads/processes. .BI job_max_open_zones \fR=\fPint Limit on the number of simultaneously opened zones per single thread/process. .TP +.BI ignore_zone_limits \fR=\fPbool +If this isn't set, fio will query the max open zones limit from the zoned block +device, and exit if the specified \fBmax_open_zones\fR value is larger than the +limit reported by the device. Default: false. +.TP .BI zone_reset_threshold \fR=\fPfloat A number between zero and one that indicates the ratio of logical blocks with data to the total number of logical blocks in the test above which zones diff --git a/options.c b/options.c index b82a10aa44..a8986d1167 100644 --- a/options.c +++ b/options.c @@ -3492,6 +3492,16 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_INVALID, }, + { + .name = "ignore_zone_limits", + .lname = "Ignore zone resource limits", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, ignore_zone_limits), + .def = "0", + .help = "Ignore the zone resource limits (max open/active zones) reported by the device", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, { .name = "zone_reset_threshold", .lname = "Zone reset threshold", diff --git a/server.h b/server.h index 8cf3a60b4b..c128df28ad 100644 --- a/server.h +++ b/server.h @@ -48,7 +48,7 @@ struct fio_net_cmd_reply { }; enum { - FIO_SERVER_VER = 90, + FIO_SERVER_VER = 91, FIO_SERVER_MAX_FRAGMENT_PDU = 1024, FIO_SERVER_MAX_CMD_MB = 2048, diff --git a/thread_options.h b/thread_options.h index 4d48e46299..05c2d1383e 100644 --- a/thread_options.h +++ b/thread_options.h @@ -355,6 +355,7 @@ struct thread_options { unsigned int read_beyond_wp; int max_open_zones; unsigned int job_max_open_zones; + unsigned int ignore_zone_limits; fio_fp64_t zrt; fio_fp64_t zrf; }; @@ -657,6 +658,7 @@ struct thread_options_pack { uint32_t zone_mode; int32_t max_open_zones; + uint32_t ignore_zone_limits; } __attribute__((packed)); extern void convert_thread_options_to_cpu(struct thread_options *o, struct thread_options_pack *top); diff --git a/zbd.c b/zbd.c index 68cd58e1b9..5d9e331ac9 100644 --- a/zbd.c +++ b/zbd.c @@ -588,7 +588,7 @@ static int 
zbd_set_max_open_zones(struct thread_data *td, struct fio_file *f) unsigned int max_open_zones; int ret; - if (zbd->model != ZBD_HOST_MANAGED) { + if (zbd->model != ZBD_HOST_MANAGED || td->o.ignore_zone_limits) { /* Only host-managed devices have a max open limit */ zbd->max_open_zones = td->o.max_open_zones; goto out; From f34b0a0320e0511c5de7f41c1496f11708ff64c1 Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Wed, 2 Jun 2021 15:05:17 +0200 Subject: [PATCH 31/42] ci: Installing missing toolchain When trying to rebuild a failed build on a real windows system, the toolchain is missing. Let's add the toolchain here so we can reuse the script locally too. Signed-off-by: Erwan Velu --- ci/appveyor-install.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/appveyor-install.sh b/ci/appveyor-install.sh index c73e4cb53b..5f873a20e0 100755 --- a/ci/appveyor-install.sh +++ b/ci/appveyor-install.sh @@ -31,6 +31,7 @@ case "${DISTRO}" in pacman.exe --noconfirm -S \ mingw-w64-${PACKAGE_ARCH}-clang \ mingw-w64-${PACKAGE_ARCH}-cunit \ + mingw-w64-${PACKAGE_ARCH}-toolchain \ mingw-w64-${PACKAGE_ARCH}-lld ;; esac From 1a1e8144846b175a5858a92a68bc8e6279a549e4 Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Wed, 2 Jun 2021 15:52:06 +0200 Subject: [PATCH 32/42] ci: Reporting installed msys2 packages When reproducing a build locally, it's important to be on the same release as the CI. So let's listi the installed packages so we can compare the two builds more easily. Signed-off-by: Erwan Velu --- ci/appveyor-install.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/appveyor-install.sh b/ci/appveyor-install.sh index 5f873a20e0..3137f39ebe 100755 --- a/ci/appveyor-install.sh +++ b/ci/appveyor-install.sh @@ -33,6 +33,7 @@ case "${DISTRO}" in mingw-w64-${PACKAGE_ARCH}-cunit \ mingw-w64-${PACKAGE_ARCH}-toolchain \ mingw-w64-${PACKAGE_ARCH}-lld + pacman.exe -Q # List installed packages ;; esac From 4b0e335a05f3a082a4f051304ba9bb6f36af4432 Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Wed, 2 Jun 2021 16:15:59 +0200 Subject: [PATCH 33/42] Makefile: Avoid using built-in stpcpy during clang build Since clang 12, during the clang build, noticed by the CI, the linking fails as clang optimize some string functions to stpcpy. LINK fio lld-link: error: undefined symbol: stpcpy >>> referenced by C:\projects\fio\options.c:5305 >>> options.o:(fio_options_parse) Two possible implementations : - Adding stpcpy in fio as the kernel did : https://lore.kernel.org/lkml/20200815002417.1512973-1-ndesaulniers@google.com/T/ - Disable the implicit stpcpy To avoid adding code into fio, the latter option was used. Signed-off-by: Erwan Velu --- Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile b/Makefile index ef31737371..f57569d5f6 100644 --- a/Makefile +++ b/Makefile @@ -40,6 +40,11 @@ ifdef CONFIG_PDB LDFLAGS += -fuse-ld=lld $(LINK_PDBFILE) endif +# If clang, do not use builtin stpcpy as it breaks the build +ifeq ($(CC),clang) + FIO_CFLAGS += -fno-builtin-stpcpy +endif + ifdef CONFIG_GFIO PROGS += gfio endif From e1315822835ceaa976a2b8ac6a74ce7bb46b079f Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Fri, 4 Jun 2021 20:32:50 +0900 Subject: [PATCH 34/42] t/zbd: Use max_open_zones that fio fetched from device Recent commit d2f442bc0bd5 ("ioengines: add get_max_open_zones zoned block device operation") modified fio to compare --max_open_zones option value and max_open_zones reported by the device. The device limit is fetched through sysfs or through an ioengine specific implementation. 
The test script currently try to fetch the max open zones limit using libzbc tools or sg_inq. If either of these fail, default value 128 is supplied. This default value can be too high when the test script is run for certain zoned block devices, and can therefore result in fio error and test case failure. To avoid the failure, modify the default value used in the test script from 128 to 0. With this, --max_open_zones=0 is passed to fio, and it makes fio use the max_open_zones reported by the device. Also add comments to describe why the test script gets max_open_zones with tools. Reviewed-by: Niklas Cassel Signed-off-by: Shin'ichiro Kawasaki Signed-off-by: Jens Axboe --- t/zbd/functions | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/t/zbd/functions b/t/zbd/functions index 40ffe1deeb..08a2c629e8 100644 --- a/t/zbd/functions +++ b/t/zbd/functions @@ -173,15 +173,23 @@ last_online_zone() { fi } +# Get max_open_zones of SMR drives using sg_inq or libzbc tools. Two test cases +# 31 and 32 use this max_open_zones value. The test case 31 uses max_open_zones +# to decide number of write target zones. The test case 32 passes max_open_zones +# value to fio with --max_open_zones option. Of note is that fio itself has the +# feature to get max_open_zones from the device through sysfs or ioengine +# specific implementation. This max_open_zones fetch by test script is required +# in case fio is running on an old Linux kernel version which lacks +# max_open_zones in sysfs, or which lacks zoned block device support completely. max_open_zones() { local dev=$1 if [ -n "${sg_inq}" ] && [ ! -n "${use_libzbc}" ]; then if ! ${sg_inq} -e --page=0xB6 --len=20 --hex "$dev" \ > /dev/null 2>&1; then - # Non scsi device such as null_blk can not return max open zones. - # Use default value. - echo 128 + # When sg_inq can not get max open zones, specify 0 which indicates + # fio to get max open zones limit from the device. + echo 0 else ${sg_inq} -e --page=0xB6 --len=20 --hex "$dev" | tail -1 | { From 351fe91089c3babb06ae421a1abce3632f42b672 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Fri, 4 Jun 2021 20:32:51 +0900 Subject: [PATCH 35/42] t/zbd: Add ignore_zone_limit option to test with special max_open_zones Recent commit d2f442bc0bd5 ("ioengines: add get_max_open_zones zoned block device operation") modified fio to compare --max_open_zones option value and max_open_zones reported by the device. When the option --max_open_zones is larger than the device limit, fio exits with an error. However, sometimes it is useful to run fio with --max_open_zones larger than the device limit to check performance impact of implicit zone open and close by the zoned block devices. The test script t/zbd/test-zbd-support has an option -o so that users can specify such larger max_open_zones value. After the commit, such test runs fail with the fio error. To avoid the failure, modify the test script to specify another option --ignore_zone_limits to fio command, which was added by the commit 575686bb85fa (zbd: add a new --ignore_zone_limits option). This option is added to fio command only when users specify -o option and special max_open_zones value to the test script. This change does not affect default test conditions. 
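For illustration (the device path is made up and the exact invocation is assumed from the -o option described above): a run such as t/zbd/test-zbd-support -o 128 /dev/sdX, where 128 exceeds the drive's reported limit, would otherwise make fio exit with an error; with the script adding --ignore_zone_limits=1 in that case the run can proceed, while the default invocation without -o keeps honouring the device limit.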
Signed-off-by: Shin'ichiro Kawasaki Reviewed-by: Damien Le Moal Reviewed-by: Niklas Cassel Signed-off-by: Jens Axboe --- t/zbd/test-zbd-support | 1 + 1 file changed, 1 insertion(+) diff --git a/t/zbd/test-zbd-support b/t/zbd/test-zbd-support index 26aff3731b..015fa1dc35 100755 --- a/t/zbd/test-zbd-support +++ b/t/zbd/test-zbd-support @@ -1348,6 +1348,7 @@ fi if [[ -n ${max_open_zones_opt} ]]; then # Override max_open_zones with the script option value max_open_zones="${max_open_zones_opt}" + global_var_opts+=("--ignore_zone_limits=1") job_var_opts+=("--max_open_zones=${max_open_zones_opt}") fi From 40d0b84220f7c0ff9c3874656db7f0f8cb6a85e6 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Fri, 4 Jun 2021 20:32:52 +0900 Subject: [PATCH 36/42] t/zbd: Fix write target zones counting in test case #31 The test case #31 in t/zbd/test-zbd-support writes 128KB data to sequential write required zones as the preparation for the following random read test. The data write leaves the target zones in open status. The test case refers the variable 'nz', which has max_open_zones value, to decide how many zones to write the data. However, the end condition of the write target zone loop has a bug. The disk end offset is used as the loop end condition, which does not match the last target zone when number of sequential write required zones divided by nz has remainder. This results in write to more zones than nz=max_open_zones limit and the test case failure. To fix the bug and to simplify the script, avoid the loop and utilize zonemode strided to achieve the same data write pattern. Also specify size and io_size using nz to reliably count the write target zones. Even with the fix above, still the number of open zones may exceed max_open_zones since other test cases executed before the test case 31 may leave open zones on the test target device. To avoid this failure, reset all zones before the data write. The failures were observed with libzbc I/O engine after the commit e8267436fd7a ("engines/libzbc: add support for the get_max_open_zones io op"), which changed the max_open_zones value fio refers. Signed-off-by: Shin'ichiro Kawasaki Reviewed-by: Damien Le Moal Reviewed-by: Niklas Cassel Signed-off-by: Jens Axboe --- t/zbd/test-zbd-support | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/t/zbd/test-zbd-support b/t/zbd/test-zbd-support index 015fa1dc35..a684f98807 100755 --- a/t/zbd/test-zbd-support +++ b/t/zbd/test-zbd-support @@ -731,32 +731,28 @@ test30() { test31() { local bs inc nz off opts size - prep_write - # Start with writing 128 KB to max_open_zones sequential zones. - bs=128K + [ -n "$is_zbd" ] && reset_zone "$dev" -1 + + # As preparation, write 128 KB to sequential write required zones. Limit + # write target zones up to max_open_zones to keep test time reasonable. + # To distribute the write target zones evenly, skip certain zones for every + # write. Utilize zonemode strided for such write patterns. 
+ bs=$((128 * 1024)) nz=$((max_open_zones)) if [[ $nz -eq 0 ]]; then nz=128 fi - # shellcheck disable=SC2017 - inc=$(((disk_size - (first_sequential_zone_sector * 512)) / (nz * zone_size) - * zone_size)) - if [ "$inc" -eq 0 ]; then - require_seq_zones $nz || return $SKIP_TESTCASE - fi - opts=() - for ((off = first_sequential_zone_sector * 512; off < disk_size; - off += inc)); do - opts+=("--name=$dev" "--filename=$dev" "--offset=$off" "--io_size=$bs") - opts+=("--bs=$bs" "--size=$zone_size" "$(ioengine "libaio")") - opts+=("--rw=write" "--direct=1" "--thread=1" "--stats=0") - opts+=("--zonemode=zbd" "--zonesize=${zone_size}") - opts+=(${job_var_opts[@]}) - done - "$(dirname "$0")/../../fio" "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 - # Next, run the test. off=$((first_sequential_zone_sector * 512)) size=$((disk_size - off)) + inc=$(((size / nz / zone_size) * zone_size)) + opts=("--name=$dev" "--filename=$dev" "--rw=write" "--bs=${bs}") + opts+=("--offset=$off" "--size=$((inc * nz))" "--io_size=$((bs * nz))") + opts+=("--zonemode=strided" "--zonesize=${bs}" "--zonerange=${inc}") + opts+=("--direct=1") + echo "fio ${opts[@]}" >> "${logfile}.${test_number}" + "$(dirname "$0")/../../fio" "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 + + # Next, run the test. opts=("--name=$dev" "--filename=$dev" "--offset=$off" "--size=$size") opts+=("--bs=$bs" "$(ioengine "psync")" "--rw=randread" "--direct=1") opts+=("--thread=1" "--time_based" "--runtime=30" "--zonemode=zbd") From dd4620b7f9171edaa10955c4826454a05af27c85 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 10 Jun 2021 16:55:39 +0100 Subject: [PATCH 37/42] io_uring: drop redundant IO_MODE_OFFLOAD check check_engine_ops() already returns an error if io_submit_mode is IO_MODE_OFFLOAD and the engine is marked FIO_NO_OFFLOAD. Signed-off-by: Stefan Hajnoczi Signed-off-by: Jens Axboe --- engines/io_uring.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/engines/io_uring.c b/engines/io_uring.c index b962e8041b..9c091e37e6 100644 --- a/engines/io_uring.c +++ b/engines/io_uring.c @@ -728,12 +728,6 @@ static int fio_ioring_init(struct thread_data *td) struct ioring_data *ld; struct thread_options *to = &td->o; - if (to->io_submit_mode == IO_MODE_OFFLOAD) { - log_err("fio: io_submit_mode=offload is not compatible (or " - "useful) with io_uring\n"); - return 1; - } - /* sqthread submission requires registered files */ if (o->sqpoll_thread) o->registerfiles = 1; From 50cc48d52fec6c74a46e377b23f19ebed532125a Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Mon, 14 Jun 2021 13:49:03 +0000 Subject: [PATCH 38/42] zbd: disallow pipes for zonemode=zbd zoned block device support in fio cannot handle pipes, so simply reject them and give a clear error message. 
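For concreteness (an assumed scenario, not taken from the patch itself): this covers cases such as pointing --filename at a named FIFO created with mkfifo, which fio classifies as a pipe-type file; with zonemode=zbd such a setup is now rejected up front with the error message added below.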
Signed-off-by: Niklas Cassel Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- zbd.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/zbd.c b/zbd.c index 5d9e331ac9..60325d28fa 100644 --- a/zbd.c +++ b/zbd.c @@ -32,6 +32,11 @@ int zbd_get_zoned_model(struct thread_data *td, struct fio_file *f, { int ret; + if (f->filetype == FIO_TYPE_PIPE) { + log_err("zonemode=zbd does not support pipes\n"); + return -EINVAL; + } + if (td->io_ops && td->io_ops->get_zoned_model) ret = td->io_ops->get_zoned_model(td, f, model); else From 9db0cde87d1c928b9d629c6f1b0f8f2ed729d908 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Mon, 14 Jun 2021 13:49:04 +0000 Subject: [PATCH 39/42] zbd: allow zonemode=zbd with regular files by emulating zones Currently when using zonemode=zbd and running against a regular file, fio will fail with: fio: file hash not empty on exit Treat regular files just like how we treat regular (non-zoned) block devices: return ZBD_NONE and let zbd.c emulate zones inside the regular file/block device. Signed-off-by: Niklas Cassel Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- zbd.c | 14 +++++++++++++- zbd_types.h | 2 +- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/zbd.c b/zbd.c index 60325d28fa..d1db9adc29 100644 --- a/zbd.c +++ b/zbd.c @@ -37,6 +37,12 @@ int zbd_get_zoned_model(struct thread_data *td, struct fio_file *f, return -EINVAL; } + /* If regular file, always emulate zones inside the file. */ + if (f->filetype == FIO_TYPE_FILE) { + *model = ZBD_NONE; + return 0; + } + if (td->io_ops && td->io_ops->get_zoned_model) ret = td->io_ops->get_zoned_model(td, f, model); else @@ -414,7 +420,7 @@ static int init_zone_info(struct thread_data *td, struct fio_file *f) int i; if (zone_size == 0) { - log_err("%s: Specifying the zone size is mandatory for regular block devices with --zonemode=zbd\n\n", + log_err("%s: Specifying the zone size is mandatory for regular file/block device with --zonemode=zbd\n\n", f->file_name); return 1; } @@ -435,6 +441,12 @@ static int init_zone_info(struct thread_data *td, struct fio_file *f) return 1; } + if (f->real_file_size < zone_size) { + log_err("%s: file/device size %"PRIu64" is smaller than zone size %"PRIu64"\n", + f->file_name, f->real_file_size, zone_size); + return -EINVAL; + } + nr_zones = (f->real_file_size + zone_size - 1) / zone_size; zbd_info = scalloc(1, sizeof(*zbd_info) + (nr_zones + 1) * sizeof(zbd_info->zone_info[0])); diff --git a/zbd_types.h b/zbd_types.h index 5ed41aa06c..d0f4c44e23 100644 --- a/zbd_types.h +++ b/zbd_types.h @@ -15,7 +15,7 @@ */ enum zbd_zoned_model { ZBD_IGNORE, /* Ignore file */ - ZBD_NONE, /* Regular block device */ + ZBD_NONE, /* No zone support. Emulate zones. */ ZBD_HOST_AWARE, /* Host-aware zoned block device */ ZBD_HOST_MANAGED, /* Host-managed zoned block device */ }; From 2c7dd23e5142e421723ede2557fe868ac32c8265 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Mon, 14 Jun 2021 13:49:04 +0000 Subject: [PATCH 40/42] zbd: remove zbd_zoned_model ZBD_IGNORE For a job with zonemode=zbd, we do not want any file to be ignored. Each file's file type in that job should be supported by either zbd.c or the ioengine. If not, we should return an error. This way, ZBD_IGNORE becomes redundant and can be removed. By removing ZBD_IGNORE, we know that all files belonging to a job that has zonemode=zbd set, will either be a zoned block device, or emulate a zoned block device. This means that for jobs that have zonemode=zbd, f->zbd_info will always be non-NULL. 
From 2c7dd23e5142e421723ede2557fe868ac32c8265 Mon Sep 17 00:00:00 2001
From: Niklas Cassel
Date: Mon, 14 Jun 2021 13:49:04 +0000
Subject: [PATCH 40/42] zbd: remove zbd_zoned_model ZBD_IGNORE

For a job with zonemode=zbd, we do not want any file to be ignored.
Each file's file type in that job should be supported by either zbd.c
or the ioengine. If not, we should return an error.

This way, ZBD_IGNORE becomes redundant and can be removed.

By removing ZBD_IGNORE, we know that all files belonging to a job that
has zonemode=zbd set, will either be a zoned block device, or emulate a
zoned block device. This means that for jobs that have zonemode=zbd,
f->zbd_info will always be non-NULL.

This will make the zbd code slightly easier to reason about and to
maintain.

When removing zbd_zoned_model ZBD_IGNORE, define the new first enum
value as 0x1, so that we avoid potential ABI problems with existing
binaries.

Signed-off-by: Niklas Cassel
Reviewed-by: Damien Le Moal
Signed-off-by: Jens Axboe
---
 engines/libzbc.c            | 6 ++----
 engines/skeleton_external.c | 1 -
 oslib/linux-blkzoned.c      | 6 ++----
 zbd.c                       | 3 +--
 zbd_types.h                 | 7 +++----
 5 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/engines/libzbc.c b/engines/libzbc.c
index 3dde93db54..7f2bc431b4 100644
--- a/engines/libzbc.c
+++ b/engines/libzbc.c
@@ -180,10 +180,8 @@ static int libzbc_get_zoned_model(struct thread_data *td, struct fio_file *f,
 	struct libzbc_data *ld;
 	int ret;
 
-	if (f->filetype != FIO_TYPE_BLOCK && f->filetype != FIO_TYPE_CHAR) {
-		*model = ZBD_IGNORE;
-		return 0;
-	}
+	if (f->filetype != FIO_TYPE_BLOCK && f->filetype != FIO_TYPE_CHAR)
+		return -EINVAL;
 
 	ret = libzbc_open_dev(td, f, &ld);
 	if (ret)
diff --git a/engines/skeleton_external.c b/engines/skeleton_external.c
index c79b6f1114..cff83a10ef 100644
--- a/engines/skeleton_external.c
+++ b/engines/skeleton_external.c
@@ -156,7 +156,6 @@ static int fio_skeleton_close(struct thread_data *td, struct fio_file *f)
 /*
  * Hook for getting the zoned model of a zoned block device for zonemode=zbd.
  * The zoned model can be one of (see zbd_types.h):
- * - ZBD_IGNORE: skip regular files
  * - ZBD_NONE: regular block device (zone emulation will be used)
  * - ZBD_HOST_AWARE: host aware zoned block device
  * - ZBD_HOST_MANAGED: host managed zoned block device
diff --git a/oslib/linux-blkzoned.c b/oslib/linux-blkzoned.c
index 6f89ec6f41..4e441d29b8 100644
--- a/oslib/linux-blkzoned.c
+++ b/oslib/linux-blkzoned.c
@@ -140,10 +140,8 @@ int blkzoned_get_zoned_model(struct thread_data *td, struct fio_file *f,
 {
 	char *model_str = NULL;
 
-	if (f->filetype != FIO_TYPE_BLOCK) {
-		*model = ZBD_IGNORE;
-		return 0;
-	}
+	if (f->filetype != FIO_TYPE_BLOCK)
+		return -EINVAL;
 
 	*model = ZBD_NONE;
 
diff --git a/zbd.c b/zbd.c
index d1db9adc29..aab4d74136 100644
--- a/zbd.c
+++ b/zbd.c
@@ -661,8 +661,6 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f)
 		return ret;
 
 	switch (zbd_model) {
-	case ZBD_IGNORE:
-		return 0;
 	case ZBD_HOST_AWARE:
 	case ZBD_HOST_MANAGED:
 		ret = parse_zone_info(td, f);
@@ -680,6 +678,7 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f)
 		return -EINVAL;
 	}
 
+	assert(f->zbd_info);
 	f->zbd_info->model = zbd_model;
 
 	ret = zbd_set_max_open_zones(td, f);
diff --git a/zbd_types.h b/zbd_types.h
index d0f4c44e23..0a8630cb71 100644
--- a/zbd_types.h
+++ b/zbd_types.h
@@ -14,10 +14,9 @@
  * Zoned block device models.
  */
 enum zbd_zoned_model {
-	ZBD_IGNORE,		/* Ignore file */
-	ZBD_NONE,		/* No zone support. Emulate zones. */
-	ZBD_HOST_AWARE,		/* Host-aware zoned block device */
-	ZBD_HOST_MANAGED,	/* Host-managed zoned block device */
+	ZBD_NONE = 0x1,		/* No zone support. Emulate zones. */
+	ZBD_HOST_AWARE = 0x2,	/* Host-aware zoned block device */
+	ZBD_HOST_MANAGED = 0x3,	/* Host-managed zoned block device */
 };
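
Note: one visible consequence of dropping ZBD_IGNORE is that a zonemode=zbd job
whose file type is supported neither by zbd.c nor by the ioengine now fails at
setup instead of silently skipping zone handling for that file. A rough
illustration (the character-device target is an assumption, not taken from the
patch):

  # Sketch: /dev/zero is a character device, so with the default ioengine the
  # blkzoned backend should now reject it (-EINVAL) instead of returning
  # ZBD_IGNORE and letting the job run without zone accounting.
  fio --name=reject --filename=/dev/zero --rw=read --bs=4k --size=1M \
      --zonemode=zbd --zonesize=1M
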
From 5ddf46d0b2dfe10b9a518db1f936c81e099b2646 Mon Sep 17 00:00:00 2001
From: Niklas Cassel
Date: Mon, 14 Jun 2021 13:49:05 +0000
Subject: [PATCH 41/42] zbd: change some f->zbd_info conditionals to asserts

Unfortunately, generic fio code calls some zbd_* functions
unconditionally. These functions will be called regardless if
zonemode == ZONE_MODE_NONE, ZONE_MODE_STRIDED or ZONE_MODE_ZBD,
and cannot be optimized.

However, some functions are only called when zonemode == ZONE_MODE_ZBD.
Since f->zbd_info will always be non-NULL for a job with zonemode=zbd,
these functions can be optimized to not check if f->zbd_info is set.

Signed-off-by: Niklas Cassel
Reviewed-by: Damien Le Moal
Signed-off-by: Jens Axboe
---
 zbd.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/zbd.c b/zbd.c
index aab4d74136..8e99eb95dc 100644
--- a/zbd.c
+++ b/zbd.c
@@ -808,8 +808,7 @@ int zbd_setup_files(struct thread_data *td)
 		struct fio_zone_info *z;
 		int zi;
 
-		if (!zbd)
-			continue;
+		assert(zbd);
 
 		f->min_zone = zbd_zone_idx(f, f->file_offset);
 		f->max_zone = zbd_zone_idx(f, f->file_offset + f->io_size);
@@ -1470,8 +1469,7 @@ static void zbd_queue_io(struct thread_data *td, struct io_u *io_u, int q,
 	uint32_t zone_idx;
 	uint64_t zone_end;
 
-	if (!zbd_info)
-		return;
+	assert(zbd_info);
 
 	zone_idx = zbd_zone_idx(f, io_u->offset);
 	assert(zone_idx < zbd_info->nr_zones);
@@ -1531,8 +1529,7 @@ static void zbd_put_io(struct thread_data *td, const struct io_u *io_u)
 	struct fio_zone_info *z;
 	uint32_t zone_idx;
 
-	if (!zbd_info)
-		return;
+	assert(zbd_info);
 
 	zone_idx = zbd_zone_idx(f, io_u->offset);
 	assert(zone_idx < zbd_info->nr_zones);
@@ -1588,6 +1585,7 @@ void setup_zbd_zone_mode(struct thread_data *td, struct io_u *io_u)
 
 	assert(td->o.zone_mode == ZONE_MODE_ZBD);
 	assert(td->o.zone_size);
+	assert(f->zbd_info);
 
 	zone_idx = zbd_zone_idx(f, f->last_pos[ddir]);
 	z = get_zone(f, zone_idx);
@@ -1662,6 +1660,7 @@ enum fio_ddir zbd_adjust_ddir(struct thread_data *td, struct io_u *io_u,
 	 * devices with all empty zones. Overwrite the first I/O direction as
 	 * write to make sure data to read exists.
 	 */
+	assert(io_u->file->zbd_info);
 	if (ddir != DDIR_READ || !td_rw(td))
 		return ddir;
 
@@ -1691,9 +1690,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 	uint64_t new_len;
 	int64_t range;
 
-	if (!f->zbd_info)
-		return io_u_accept;
-
+	assert(f->zbd_info);
 	assert(min_bs);
 	assert(is_valid_offset(f, io_u->offset));
 	assert(io_u->buflen);

From a59b12d2a5eb92c1128a5d8ebcd03b1831962ce5 Mon Sep 17 00:00:00 2001
From: Niklas Cassel
Date: Mon, 14 Jun 2021 13:49:05 +0000
Subject: [PATCH 42/42] t/zbd: update test case 42

Update test case 42 to grep for the new string printed by fio when
--zonesize=0 is supplied.

Signed-off-by: Niklas Cassel
Signed-off-by: Jens Axboe
---
 t/zbd/test-zbd-support | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t/zbd/test-zbd-support b/t/zbd/test-zbd-support
index a684f98807..57e6d05ea7 100755
--- a/t/zbd/test-zbd-support
+++ b/t/zbd/test-zbd-support
@@ -922,7 +922,7 @@ test41() {
 test42() {
 	require_regular_block_dev || return $SKIP_TESTCASE
 	read_one_block --zonemode=zbd --zonesize=0 |
-		grep -q 'Specifying the zone size is mandatory for regular block devices with --zonemode=zbd'
+		grep -q 'Specifying the zone size is mandatory for regular file/block device with --zonemode=zbd'
 }
 
 # Check whether fio handles --zonesize=1 correctly for regular block devices.
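
Note: the message that test case 42 now greps for can be reproduced by hand with
an invocation along these lines (the scratch file path is an assumption; the
test itself runs against a regular, non-zoned block device):

  # Sketch: --zonesize=0 provides no usable zone size, so zone emulation cannot
  # be set up and fio should print the updated
  # "Specifying the zone size is mandatory ..." error.
  fio --name=t42 --filename=/var/tmp/zbd-t42.img --size=8M --rw=read --bs=4k \
      --zonemode=zbd --zonesize=0 2>&1 |
      grep 'Specifying the zone size is mandatory'
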