From 5c27ab3805a6af3ca23a169ebedf93b376f7496c Mon Sep 17 00:00:00 2001
From: Brian Atkinson
Date: Fri, 10 Mar 2023 09:00:54 -0700
Subject: [PATCH] Fixing race condition between mmap/direct IO

In commit ba30ec9 I got a little overzealous in code cleanup. While I
was trying to remove all the stable page code for Linux, I
misinterpreted why Brian Behlendorf originally had the try rangelock,
drop page lock, and acquire rangelock sequence in zfs_fillpage(). This
is still necessary even without stable pages.

This has to occur to avoid a race condition between direct IO writes
and pages being faulted in for mmap files. If the rangelock is not
held, then a direct IO write can set db->db_data = NULL either in:

1. dmu_write_direct() -> dmu_buf_will_not_fill() ->
   dmu_buf_will_fill() -> dbuf_noread() -> dbuf_clear_data()
2. dmu_write_direct_done()

Without the rangelock, dmu_read_impl() can then be handed a NULL
db->db_data when it tries to do the memcpy, causing the panic. So the
rangelock must be held in zfs_fillpage() no matter what.

There are further semantics on when the rangelock should be held in
zfs_fillpage(): it must only be acquired there in the zfs_getpage() ->
zfs_fillpage() path. The reason is that mappedread() can call
zfs_fillpage() if the page is not up to date. This can occur because
filemap_fault() will first add the page to the inode's address_space
mapping and then drop the page lock, which leaves open a window in
which mappedread() can be called. Since this can occur, mappedread()
holds both the page lock and the rangelock. This is perfectly valid and
correct. However, in that case zfs_fillpage() must never grab the
rangelock itself, or a deadlock will occur.

Finally, it is important to note that the rangelock is first attempted
with zfs_rangelock_tryenter(). The reason for this is that the page
lock must be dropped before blocking on the rangelock; otherwise there
is a race between zfs_fillpage() and zfs_write() -> update_pages().
update_pages() already holds the rangelock and then grabs the page
lock. So if the page lock is not dropped before acquiring the
rangelock in zfs_fillpage(), the two can deadlock.
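The lock ordering can be modeled in userspace with two mutexes. This is
purely an illustrative sketch and is not part of the patch: the mutexes
are simplified stand-ins for the ZFS rangelock and the kernel page
lock, and writer()/faulter() are hypothetical names, not ZFS functions.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rangelock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;

/* Models zfs_write() -> update_pages(): rangelock first, then page lock. */
static void *
writer(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&rangelock);
	pthread_mutex_lock(&page_lock);
	puts("writer: holds rangelock + page lock");
	pthread_mutex_unlock(&page_lock);
	pthread_mutex_unlock(&rangelock);
	return (NULL);
}

/*
 * Models zfs_getpage() -> zfs_fillpage(): enters with the page lock
 * already held (taken by filemap_fault()), so it must not block on the
 * rangelock while still holding the page lock.
 */
static void *
faulter(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&page_lock);
	if (pthread_mutex_trylock(&rangelock) != 0) {
		/*
		 * Contended: drop the page lock before blocking, as the
		 * patch does with get_page()/unlock_page() around
		 * zfs_rangelock_enter(), then retake the page lock.
		 */
		pthread_mutex_unlock(&page_lock);
		pthread_mutex_lock(&rangelock);
		pthread_mutex_lock(&page_lock);
	}
	puts("faulter: holds page lock + rangelock");
	pthread_mutex_unlock(&rangelock);
	pthread_mutex_unlock(&page_lock);
	return (NULL);
}

int
main(void)
{
	pthread_t a, b;
	pthread_create(&a, NULL, writer, NULL);
	pthread_create(&b, NULL, faulter, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return (0);
}

If faulter() instead blocked on the rangelock while still holding the
page lock, each thread could end up holding one lock and waiting
forever on the other.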
Below is a stack trace showing the NULL pointer dereference that was
occurring with the dio_mmap ZTS test case before this commit.

[ 7737.430658] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
[ 7737.438486] PGD 0 P4D 0
[ 7737.441024] Oops: 0000 [#1] SMP NOPTI
[ 7737.444692] CPU: 33 PID: 599346 Comm: fio Kdump: loaded Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1
[ 7737.455721] Hardware name: GIGABYTE R272-Z32-00/MZ32-AR0-00, BIOS R21 10/08/2020
[ 7737.463106] RIP: 0010:__memcpy+0x12/0x20
[ 7737.467032] Code: ff 0f 31 48 c1 e2 20 48 09 c2 48 31 d3 e9 79 ff ff ff 90 90 90 90 90 90 0f 1f 44 00 00 48 89 f8 48 89 d1 48 c1 e9 03 83 e2 07 48 a5 89 d1 f3 a4 c3 66 0f 1f 44 00 00 48 89 f8 48 89 d1 f3 a4
[ 7737.485770] RSP: 0000:ffffc1db829e3b60 EFLAGS: 00010246
[ 7737.490987] RAX: ffff9ef195b6f000 RBX: 0000000000001000 RCX: 0000000000000200
[ 7737.498111] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff9ef195b6f000
[ 7737.505235] RBP: ffff9ef195b70000 R08: ffff9eef1d1d0000 R09: ffff9eef1d1d0000
[ 7737.512358] R10: ffff9eef27968218 R11: 0000000000000000 R12: 0000000000000000
[ 7737.519481] R13: ffff9ef041adb6d8 R14: 00000000005e1000 R15: 0000000000000001
[ 7737.526607] FS:  00007f77fe2bae80(0000) GS:ffff9f0cae840000(0000) knlGS:0000000000000000
[ 7737.534683] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 7737.540423] CR2: 0000000000000000 CR3: 00000003387a6000 CR4: 0000000000350ee0
[ 7737.547553] Call Trace:
[ 7737.550003]  dmu_read_impl+0x11a/0x210 [zfs]
[ 7737.554464]  dmu_read+0x56/0x90 [zfs]
[ 7737.558292]  zfs_fillpage+0x76/0x190 [zfs]
[ 7737.562584]  zfs_getpage+0x4c/0x80 [zfs]
[ 7737.566691]  zpl_readpage_common+0x3b/0x80 [zfs]
[ 7737.571485]  filemap_fault+0x5d6/0xa10
[ 7737.575236]  ? obj_cgroup_charge_pages+0xba/0xd0
[ 7737.579856]  ? xas_load+0x8/0x80
[ 7737.583088]  ? xas_find+0x173/0x1b0
[ 7737.586579]  ? filemap_map_pages+0x84/0x410
[ 7737.590759]  __do_fault+0x38/0xb0
[ 7737.594077]  handle_pte_fault+0x559/0x870
[ 7737.598082]  __handle_mm_fault+0x44f/0x6c0
[ 7737.602181]  handle_mm_fault+0xc1/0x1e0
[ 7737.606019]  do_user_addr_fault+0x1b5/0x440
[ 7737.610207]  do_page_fault+0x37/0x130
[ 7737.613873]  ? page_fault+0x8/0x30
[ 7737.617277]  page_fault+0x1e/0x30
[ 7737.620589] RIP: 0033:0x7f77fbce9140

Signed-off-by: Brian Atkinson
---
 module/os/linux/zfs/zfs_vnops_os.c | 63 +++++++++++++++++++++++++++---
 1 file changed, 58 insertions(+), 5 deletions(-)

diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index 7fe3868fe875..a1c55b81ddc2 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -228,7 +228,8 @@ zfs_close(struct inode *ip, int flag, cred_t *cr)
 
 #if defined(_KERNEL)
 
-static int zfs_fillpage(struct inode *ip, struct page *pp);
+static int zfs_fillpage(struct inode *ip, struct page *pp,
+    boolean_t rangelock_held);
 
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
@@ -303,7 +304,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
 	 * In this case we must try and fill the page.
 	 */
 	if (unlikely(!PageUptodate(pp))) {
-		error = zfs_fillpage(ip, pp);
+		error = zfs_fillpage(ip, pp, B_TRUE);
 		if (error) {
 			unlock_page(pp);
 			put_page(pp);
@@ -4008,20 +4009,68 @@ zfs_inactive(struct inode *ip)
  * Fill pages with data from the disk.
  */
 static int
-zfs_fillpage(struct inode *ip, struct page *pp)
+zfs_fillpage(struct inode *ip, struct page *pp, boolean_t rangelock_held)
 {
+	znode_t *zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	loff_t i_size = i_size_read(ip);
 	u_offset_t io_off = page_offset(pp);
 	size_t io_len = PAGE_SIZE;
+	zfs_locked_range_t *lr = NULL;
 
 	ASSERT3U(io_off, <, i_size);
 
 	if (io_off + io_len > i_size)
 		io_len = i_size - io_off;
 
+	/*
+	 * It is important to hold the rangelock here because it is possible
+	 * a Direct I/O write might be taking place at the same time that a
+	 * page is being faulted in through filemap_fault(). With a Direct I/O
+	 * write, db->db_data will be set to NULL either in:
+	 * 1. dmu_write_direct() -> dmu_buf_will_not_fill() ->
+	 *    dmu_buf_will_fill() -> dbuf_noread() -> dbuf_clear_data()
+	 * 2. dmu_write_direct_done()
+	 * If the rangelock is not held, then there is a race between faulting
+	 * in a page and writing out a Direct I/O write. Without the rangelock
+	 * a NULL pointer dereference can occur in dmu_read_impl() for
+	 * db->db_data during the memcpy operation.
+	 *
+	 * Another important note here is we have to check to make sure the
+	 * rangelock is not already held from mappedread() -> zfs_fillpage().
+	 * filemap_fault() will first add the page to the inode address_space
+	 * mapping and then will drop the page lock. This leaves open a window
+	 * for mappedread() to begin. In that case the page lock and rangelock
+	 * are both already held, and mappedread() may call here if the page
+	 * is not up to date. The rangelock cannot be held twice or a deadlock
+	 * can happen. So the rangelock only needs to be acquired if
+	 * zfs_fillpage() is being called by zfs_getpage().
+	 *
+	 * Finally it is also important to drop the page lock before grabbing
+	 * the rangelock to avoid another deadlock between here and
+	 * zfs_write() -> update_pages(). update_pages() holds both the
+	 * rangelock and the page lock.
+	 */
+	if (rangelock_held == B_FALSE) {
+		/*
+		 * First try grabbing the rangelock. If that cannot be done
+		 * the page lock must be dropped before grabbing the rangelock
+		 * to avoid a deadlock with update_pages(). See comment above.
+		 */
+		lr = zfs_rangelock_tryenter(&zp->z_rangelock, io_off, io_len,
+		    RL_READER);
+		if (lr == NULL) {
+			get_page(pp);
+			unlock_page(pp);
+			lr = zfs_rangelock_enter(&zp->z_rangelock, io_off,
+			    io_len, RL_READER);
+			lock_page(pp);
+			put_page(pp);
+		}
+	}
+
 	void *va = kmap(pp);
-	int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off,
+	int error = dmu_read(zfsvfs->z_os, zp->z_id, io_off,
 	    io_len, va, DMU_READ_PREFETCH);
 	if (io_len != PAGE_SIZE)
 		memset((char *)va + io_len, 0, PAGE_SIZE - io_len);
@@ -4039,6 +4088,10 @@ zfs_fillpage(struct inode *ip, struct page *pp)
 		SetPageUptodate(pp);
 	}
 
+	if (rangelock_held == B_FALSE)
+		zfs_rangelock_exit(lr);
+
 	return (error);
 }
 
@@ -4063,7 +4116,7 @@ zfs_getpage(struct inode *ip, struct page *pp)
 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (error);
 
-	error = zfs_fillpage(ip, pp);
+	error = zfs_fillpage(ip, pp, B_FALSE);
 	if (error == 0)
 		dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);