From 1d585f85f359068d7e714d9fe70e7071dbed3ce2 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Fri, 27 Jan 2023 15:30:59 -0700 Subject: [PATCH] Removing generic linux calls and mmap cleanup We were using the generic Linux calls to make sure that the page cache was cleaned out before issuing any Direct I/O reads or writes. However, this only matters in the event the file region being written/read from using O_DIRECT was mmap'ed. One of the stipulations with O_DIRECT is that it is redirected through the ARC in the event the file range is mmap'ed. Because of this, it did not make sense to try and invalidate the page cache if we were never intending to have O_DIRECT work with mmap'ed regions. Also, calls into the generic Linux calls in writes would often lead to lockups as the page lock is dropped in zfs_putpage(). See the stack dump below. In order to just prevent this, we no longer will use the generic linux direct IO wrappers or try and flush out the page cache. Instead if we find the file range has been mmap'ed in since the initial check in zfs_setup_direct() we will just now directly handle that in zfs_read() and zfs_write(). In most cases zfs_setup_direct() will prevent O_DIRECT to mmap'ed regions of the file that have been page faulted in, but if that happens when we are issuing the direct I/O request the normal parts of the ZFS paths will be taken to account for this. It is highly suggested not to mmap a region of a file and then write or read directly to the file. In general, that is kind of an insane thing to do... However, we try our best to still have consistency with the ARC. Also, before making this decision I did explore if we could just add a rangelock in zfs_fillpage(), but we can not do that. The reason is when the page is in zfs_readpage_common() it has already been locked by the kernel. So, if we try and grab the rangelock anywhere in that path we can get stuck if another thread is issuing writes to the file region that was mmap'ed in. 
The reason is update_pages() holds the rangelock and then tries to lock the page. In this case zfs_fillpage() holds the page lock but is stuck in the rangelock waiting and holding the page lock. Deadlock is unavoidable in this case. [260136.244332] INFO: task fio:3791107 blocked for more than 120 seconds. [260136.250867] Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [260136.258693] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [260136.266607] task:fio state:D stack: 0 pid:3791107 ppid:3790841 flags:0x00004080 [260136.275306] Call Trace: [260136.277845] __schedule+0x2d1/0x830 [260136.281432] schedule+0x35/0xa0 [260136.284665] io_schedule+0x12/0x40 [260136.288157] wait_on_page_bit+0x123/0x220 [260136.292258] ? xas_load+0x8/0x80 [260136.295577] ? file_fdatawait_range+0x20/0x20 [260136.300024] filemap_page_mkwrite+0x9b/0xb0 [260136.304295] do_page_mkwrite+0x53/0x90 [260136.308135] ? vm_normal_page+0x1a/0xc0 [260136.312062] do_wp_page+0x298/0x350 [260136.315640] __handle_mm_fault+0x44f/0x6c0 [260136.319826] ? __switch_to_asm+0x41/0x70 [260136.323839] handle_mm_fault+0xc1/0x1e0 [260136.327766] do_user_addr_fault+0x1b5/0x440 [260136.332038] do_page_fault+0x37/0x130 [260136.335792] ? page_fault+0x8/0x30 [260136.339284] page_fault+0x1e/0x30 [260136.342689] RIP: 0033:0x7f6deee7f1b4 [260136.346361] Code: Unable to access opcode bytes at RIP 0x7f6deee7f18a. [260136.352977] RSP: 002b:00007fffe41b6538 EFLAGS: 00010202 [260136.358288] RAX: 00007f6d83049000 RBX: 0000556b63614ec0 RCX: 00007f6d83148fe0 [260136.365508] RDX: 00000000000acfe0 RSI: 00007f6d84e9c030 RDI: 00007f6d8309bfa0 [260136.372730] RBP: 00007f6d84f4a000 R08: ffffffffffffffe0 R09: 0000000000000000 [260136.379946] R10: 00007f6d84f8e810 R11: 00007f6d83049000 R12: 0000000000000001 [260136.387167] R13: 0000556b63614ec0 R14: 0000000000100000 R15: 0000556b63614ee8 [260136.394387] INFO: task fio:3791108 blocked for more than 120 seconds. 
[260136.400911] Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [260136.408739] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [260136.416651] task:fio state:D stack: 0 pid:3791108 ppid:3790835 flags:0x00004080 [260136.425343] Call Trace: [260136.427883] __schedule+0x2d1/0x830 [260136.431463] ? cv_wait_common+0x12d/0x240 [spl] [260136.436091] schedule+0x35/0xa0 [260136.439321] io_schedule+0x12/0x40 [260136.442814] __lock_page+0x12d/0x230 [260136.446483] ? file_fdatawait_range+0x20/0x20 [260136.450929] zfs_putpage+0x148/0x590 [zfs] [260136.455322] ? rmap_walk_file+0x116/0x290 [260136.459421] ? __mod_memcg_lruvec_state+0x5d/0x160 [260136.464300] zpl_putpage+0x67/0xd0 [zfs] [260136.468495] write_cache_pages+0x197/0x420 [260136.472679] ? zpl_readpage_filler+0x10/0x10 [zfs] [260136.477732] zpl_writepages+0x119/0x130 [zfs] [260136.482352] do_writepages+0xc2/0x1c0 [260136.486103] ? flush_tlb_func_common.constprop.9+0x125/0x220 [260136.491850] __filemap_fdatawrite_range+0xc7/0x100 [260136.496732] filemap_write_and_wait_range+0x30/0x80 [260136.501695] generic_file_direct_write+0x120/0x160 [260136.506575] ? rrw_exit+0xb0/0x1c0 [zfs] [260136.510779] zpl_iter_write+0xdd/0x160 [zfs] [260136.515323] new_sync_write+0x112/0x160 [260136.519255] vfs_write+0xa5/0x1a0 [260136.522662] ksys_write+0x4f/0xb0 [260136.526067] do_syscall_64+0x5b/0x1a0 [260136.529818] entry_SYSCALL_64_after_hwframe+0x65/0xca [260136.534959] RIP: 0033:0x7f9d192c7a17 [260136.538625] Code: Unable to access opcode bytes at RIP 0x7f9d192c79ed. 
[260136.545236] RSP: 002b:00007ffc8e4ba270 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 [260136.552889] RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007f9d192c7a17 [260136.560108] RDX: 0000000000100000 RSI: 00007f9caea03000 RDI: 0000000000000005 [260136.567329] RBP: 00007f9caea03000 R08: 0000000000000000 R09: 0000000000000000 [260136.574548] R10: 00005558e8975680 R11: 0000000000000293 R12: 0000000000100000 [260136.581767] R13: 00005558e8985ec0 R14: 0000000000100000 R15: 00005558e8985ee8 [260136.588989] INFO: task fio:3791109 blocked for more than 120 seconds. [260136.595513] Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [260136.603337] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [260136.611250] task:fio state:D stack: 0 pid:3791109 ppid:3790838 flags:0x00004080 [260136.619943] Call Trace: [260136.622483] __schedule+0x2d1/0x830 [260136.626064] ? zfs_znode_held+0xe6/0x140 [zfs] [260136.630777] schedule+0x35/0xa0 [260136.634009] cv_wait_common+0x153/0x240 [spl] [260136.638466] ? finish_wait+0x80/0x80 [260136.642129] zfs_rangelock_enter_reader+0xa1/0x1f0 [zfs] [260136.647712] zfs_rangelock_enter_impl+0xbf/0x170 [zfs] [260136.653121] zfs_get_data+0x113/0x770 [zfs] [260136.657567] zil_lwb_commit+0x537/0x780 [zfs] [260136.662187] zil_process_commit_list+0x14c/0x460 [zfs] [260136.667585] zil_commit_writer+0xeb/0x160 [zfs] [260136.672376] zil_commit_impl+0x5d/0xa0 [zfs] [260136.676910] zfs_putpage+0x516/0x590 [zfs] [260136.681279] zpl_putpage+0x67/0xd0 [zfs] [260136.685467] write_cache_pages+0x197/0x420 [260136.689649] ? zpl_readpage_filler+0x10/0x10 [zfs] [260136.694705] zpl_writepages+0x119/0x130 [zfs] [260136.699322] do_writepages+0xc2/0x1c0 [260136.703076] __filemap_fdatawrite_range+0xc7/0x100 [260136.707952] filemap_write_and_wait_range+0x30/0x80 [260136.712920] zpl_iter_read_direct+0x86/0x1b0 [zfs] [260136.717972] ? 
rrw_exit+0xb0/0x1c0 [zfs] [260136.722174] zpl_iter_read+0x90/0xb0 [zfs] [260136.726536] new_sync_read+0x10f/0x150 [260136.730376] vfs_read+0x91/0x140 [260136.733693] ksys_read+0x4f/0xb0 [260136.737012] do_syscall_64+0x5b/0x1a0 [260136.740764] entry_SYSCALL_64_after_hwframe+0x65/0xca [260136.745906] RIP: 0033:0x7f1bd4687ab4 [260136.749574] Code: Unable to access opcode bytes at RIP 0x7f1bd4687a8a. [260136.756181] RSP: 002b:00007fff63f65170 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [260136.763834] RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007f1bd4687ab4 [260136.771056] RDX: 0000000000100000 RSI: 00007f1b69dc3000 RDI: 0000000000000005 [260136.778274] RBP: 00007f1b69dc3000 R08: 0000000000000000 R09: 0000000000000000 [260136.785494] R10: 000000008fd0ea42 R11: 0000000000000246 R12: 0000000000100000 [260136.792714] R13: 000055ca4b405ec0 R14: 0000000000100000 R15: 000055ca4b405ee8 [260259.123003] INFO: task kworker/u128:0:3589938 blocked for more than 120 seconds. [260259.130487] Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [260259.138313] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [260259.146224] task:kworker/u128:0 state:D stack: 0 pid:3589938 ppid: 2 flags:0x80004080 [260259.154832] Workqueue: writeback wb_workfn (flush-zfs-540) [260259.160411] Call Trace: [260259.162950] __schedule+0x2d1/0x830 [260259.166531] schedule+0x35/0xa0 [260259.169765] io_schedule+0x12/0x40 [260259.173257] __lock_page+0x12d/0x230 [260259.176921] ? file_fdatawait_range+0x20/0x20 [260259.181368] write_cache_pages+0x1f2/0x420 [260259.185554] ? zpl_readpage_filler+0x10/0x10 [zfs] [260259.190633] zpl_writepages+0x98/0x130 [zfs] [260259.195183] do_writepages+0xc2/0x1c0 [260259.198935] __writeback_single_inode+0x39/0x2f0 [260259.203640] writeback_sb_inodes+0x1e6/0x450 [260259.208002] __writeback_inodes_wb+0x5f/0xc0 [260259.212359] wb_writeback+0x247/0x2e0 [260259.216114] ? get_nr_inodes+0x35/0x50 [260259.219953] wb_workfn+0x37c/0x4d0 [260259.223443] ? 
__switch_to_asm+0x35/0x70 [260259.227456] ? __switch_to_asm+0x41/0x70 [260259.231469] ? __switch_to_asm+0x35/0x70 [260259.235481] ? __switch_to_asm+0x41/0x70 [260259.239495] ? __switch_to_asm+0x35/0x70 [260259.243505] ? __switch_to_asm+0x41/0x70 [260259.247518] ? __switch_to_asm+0x35/0x70 [260259.251533] ? __switch_to_asm+0x41/0x70 [260259.255545] process_one_work+0x1a7/0x360 [260259.259645] worker_thread+0x30/0x390 [260259.263396] ? create_worker+0x1a0/0x1a0 [260259.267409] kthread+0x10a/0x120 [260259.270730] ? set_kthread_struct+0x40/0x40 [260259.275003] ret_from_fork+0x35/0x40 [260259.278712] INFO: task fio:3791107 blocked for more than 120 seconds. [260259.285240] Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [260259.293064] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [260259.300976] task:fio state:D stack: 0 pid:3791107 ppid:3790841 flags:0x00004080 [260259.309668] Call Trace: [260259.312210] __schedule+0x2d1/0x830 [260259.315787] schedule+0x35/0xa0 [260259.319020] io_schedule+0x12/0x40 [260259.322511] wait_on_page_bit+0x123/0x220 [260259.326611] ? xas_load+0x8/0x80 [260259.329930] ? file_fdatawait_range+0x20/0x20 [260259.334376] filemap_page_mkwrite+0x9b/0xb0 [260259.338650] do_page_mkwrite+0x53/0x90 [260259.342489] ? vm_normal_page+0x1a/0xc0 [260259.346415] do_wp_page+0x298/0x350 [260259.349994] __handle_mm_fault+0x44f/0x6c0 [260259.354181] ? __switch_to_asm+0x41/0x70 [260259.358193] handle_mm_fault+0xc1/0x1e0 [260259.362117] do_user_addr_fault+0x1b5/0x440 [260259.366391] do_page_fault+0x37/0x130 [260259.370145] ? page_fault+0x8/0x30 [260259.373639] page_fault+0x1e/0x30 [260259.377043] RIP: 0033:0x7f6deee7f1b4 [260259.380714] Code: Unable to access opcode bytes at RIP 0x7f6deee7f18a. 
[260259.387323] RSP: 002b:00007fffe41b6538 EFLAGS: 00010202 [260259.392633] RAX: 00007f6d83049000 RBX: 0000556b63614ec0 RCX: 00007f6d83148fe0 [260259.399853] RDX: 00000000000acfe0 RSI: 00007f6d84e9c030 RDI: 00007f6d8309bfa0 [260259.407074] RBP: 00007f6d84f4a000 R08: ffffffffffffffe0 R09: 0000000000000000 [260259.414291] R10: 00007f6d84f8e810 R11: 00007f6d83049000 R12: 0000000000000001 [260259.421512] R13: 0000556b63614ec0 R14: 0000000000100000 R15: 0000556b63614ee8 [260259.428733] INFO: task fio:3791108 blocked for more than 120 seconds. [260259.435258] Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [260259.443085] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [260259.450997] task:fio state:D stack: 0 pid:3791108 ppid:3790835 flags:0x00004080 [260259.459689] Call Trace: [260259.462228] __schedule+0x2d1/0x830 [260259.465808] ? cv_wait_common+0x12d/0x240 [spl] [260259.470435] schedule+0x35/0xa0 [260259.473669] io_schedule+0x12/0x40 [260259.477161] __lock_page+0x12d/0x230 [260259.480828] ? file_fdatawait_range+0x20/0x20 [260259.485274] zfs_putpage+0x148/0x590 [zfs] [260259.489640] ? rmap_walk_file+0x116/0x290 [260259.493742] ? __mod_memcg_lruvec_state+0x5d/0x160 [260259.498619] zpl_putpage+0x67/0xd0 [zfs] [260259.502813] write_cache_pages+0x197/0x420 [260259.506998] ? zpl_readpage_filler+0x10/0x10 [zfs] [260259.512054] zpl_writepages+0x119/0x130 [zfs] [260259.516672] do_writepages+0xc2/0x1c0 [260259.520423] ? flush_tlb_func_common.constprop.9+0x125/0x220 [260259.526170] __filemap_fdatawrite_range+0xc7/0x100 [260259.531050] filemap_write_and_wait_range+0x30/0x80 [260259.536016] generic_file_direct_write+0x120/0x160 [260259.540896] ? 
rrw_exit+0xb0/0x1c0 [zfs] [260259.545099] zpl_iter_write+0xdd/0x160 [zfs] [260259.549639] new_sync_write+0x112/0x160 [260259.553566] vfs_write+0xa5/0x1a0 [260259.556971] ksys_write+0x4f/0xb0 [260259.560379] do_syscall_64+0x5b/0x1a0 [260259.564131] entry_SYSCALL_64_after_hwframe+0x65/0xca [260259.569269] RIP: 0033:0x7f9d192c7a17 [260259.572935] Code: Unable to access opcode bytes at RIP 0x7f9d192c79ed. [260259.579549] RSP: 002b:00007ffc8e4ba270 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 [260259.587200] RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007f9d192c7a17 [260259.594419] RDX: 0000000000100000 RSI: 00007f9caea03000 RDI: 0000000000000005 [260259.601639] RBP: 00007f9caea03000 R08: 0000000000000000 R09: 0000000000000000 [260259.608859] R10: 00005558e8975680 R11: 0000000000000293 R12: 0000000000100000 [260259.616078] R13: 00005558e8985ec0 R14: 0000000000100000 R15: 00005558e8985ee8 [260259.623298] INFO: task fio:3791109 blocked for more than 120 seconds. [260259.629827] Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [260259.637650] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [260259.645564] task:fio state:D stack: 0 pid:3791109 ppid:3790838 flags:0x00004080 [260259.654254] Call Trace: [260259.656794] __schedule+0x2d1/0x830 [260259.660373] ? zfs_znode_held+0xe6/0x140 [zfs] [260259.665081] schedule+0x35/0xa0 [260259.668313] cv_wait_common+0x153/0x240 [spl] [260259.672768] ? finish_wait+0x80/0x80 [260259.676441] zfs_rangelock_enter_reader+0xa1/0x1f0 [zfs] [260259.682026] zfs_rangelock_enter_impl+0xbf/0x170 [zfs] [260259.687432] zfs_get_data+0x113/0x770 [zfs] [260259.691876] zil_lwb_commit+0x537/0x780 [zfs] [260259.696497] zil_process_commit_list+0x14c/0x460 [zfs] [260259.701895] zil_commit_writer+0xeb/0x160 [zfs] [260259.706689] zil_commit_impl+0x5d/0xa0 [zfs] [260259.711228] zfs_putpage+0x516/0x590 [zfs] [260259.715589] zpl_putpage+0x67/0xd0 [zfs] [260259.719775] write_cache_pages+0x197/0x420 [260259.723959] ? 
zpl_readpage_filler+0x10/0x10 [zfs] [260259.729013] zpl_writepages+0x119/0x130 [zfs] [260259.733632] do_writepages+0xc2/0x1c0 [260259.737384] __filemap_fdatawrite_range+0xc7/0x100 [260259.742264] filemap_write_and_wait_range+0x30/0x80 [260259.747229] zpl_iter_read_direct+0x86/0x1b0 [zfs] [260259.752286] ? rrw_exit+0xb0/0x1c0 [zfs] [260259.756487] zpl_iter_read+0x90/0xb0 [zfs] [260259.760855] new_sync_read+0x10f/0x150 [260259.764696] vfs_read+0x91/0x140 [260259.768013] ksys_read+0x4f/0xb0 [260259.771332] do_syscall_64+0x5b/0x1a0 [260259.775087] entry_SYSCALL_64_after_hwframe+0x65/0xca [260259.780225] RIP: 0033:0x7f1bd4687ab4 [260259.783893] Code: Unable to access opcode bytes at RIP 0x7f1bd4687a8a. [260259.790503] RSP: 002b:00007fff63f65170 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [260259.798157] RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007f1bd4687ab4 [260259.805377] RDX: 0000000000100000 RSI: 00007f1b69dc3000 RDI: 0000000000000005 [260259.812592] RBP: 00007f1b69dc3000 R08: 0000000000000000 R09: 0000000000000000 [260259.819814] R10: 000000008fd0ea42 R11: 0000000000000246 R12: 0000000000100000 [260259.827032] R13: 000055ca4b405ec0 R14: 0000000000100000 R15: 000055ca4b405ee8 [260382.001731] INFO: task kworker/u128:0:3589938 blocked for more than 120 seconds. [260382.009227] Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [260382.017053] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [260382.024963] task:kworker/u128:0 state:D stack: 0 pid:3589938 ppid: 2 flags:0x80004080 [260382.033568] Workqueue: writeback wb_workfn (flush-zfs-540) [260382.039141] Call Trace: [260382.041683] __schedule+0x2d1/0x830 [260382.045271] schedule+0x35/0xa0 [260382.048503] io_schedule+0x12/0x40 [260382.051994] __lock_page+0x12d/0x230 [260382.055662] ? file_fdatawait_range+0x20/0x20 [260382.060107] write_cache_pages+0x1f2/0x420 [260382.064293] ? 
zpl_readpage_filler+0x10/0x10 [zfs] [260382.069379] zpl_writepages+0x98/0x130 [zfs] [260382.073919] do_writepages+0xc2/0x1c0 [260382.077672] __writeback_single_inode+0x39/0x2f0 [260382.082379] writeback_sb_inodes+0x1e6/0x450 [260382.086738] __writeback_inodes_wb+0x5f/0xc0 [260382.091097] wb_writeback+0x247/0x2e0 [260382.094850] ? get_nr_inodes+0x35/0x50 [260382.098689] wb_workfn+0x37c/0x4d0 [260382.102181] ? __switch_to_asm+0x35/0x70 [260382.106194] ? __switch_to_asm+0x41/0x70 [260382.110207] ? __switch_to_asm+0x35/0x70 [260382.114221] ? __switch_to_asm+0x41/0x70 [260382.118231] ? __switch_to_asm+0x35/0x70 [260382.122244] ? __switch_to_asm+0x41/0x70 [260382.126256] ? __switch_to_asm+0x35/0x70 [260382.130273] ? __switch_to_asm+0x41/0x70 [260382.134284] process_one_work+0x1a7/0x360 [260382.138384] worker_thread+0x30/0x390 [260382.142136] ? create_worker+0x1a0/0x1a0 [260382.146150] kthread+0x10a/0x120 [260382.149469] ? set_kthread_struct+0x40/0x40 [260382.153741] ret_from_fork+0x35/0x40 [260382.157448] INFO: task fio:3791107 blocked for more than 120 seconds. [260382.163977] Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [260382.171802] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [260382.179715] task:fio state:D stack: 0 pid:3791107 ppid:3790841 flags:0x00004080 [260382.188409] Call Trace: [260382.190945] __schedule+0x2d1/0x830 [260382.194527] schedule+0x35/0xa0 [260382.197757] io_schedule+0x12/0x40 [260382.201249] wait_on_page_bit+0x123/0x220 [260382.205350] ? xas_load+0x8/0x80 [260382.208668] ? file_fdatawait_range+0x20/0x20 [260382.213114] filemap_page_mkwrite+0x9b/0xb0 [260382.217386] do_page_mkwrite+0x53/0x90 [260382.221227] ? vm_normal_page+0x1a/0xc0 [260382.225152] do_wp_page+0x298/0x350 [260382.228733] __handle_mm_fault+0x44f/0x6c0 [260382.232919] ? __switch_to_asm+0x41/0x70 [260382.236930] handle_mm_fault+0xc1/0x1e0 [260382.240856] do_user_addr_fault+0x1b5/0x440 [260382.245132] do_page_fault+0x37/0x130 [260382.248883] ? 
page_fault+0x8/0x30 [260382.252375] page_fault+0x1e/0x30 [260382.255781] RIP: 0033:0x7f6deee7f1b4 [260382.259451] Code: Unable to access opcode bytes at RIP 0x7f6deee7f18a. [260382.266059] RSP: 002b:00007fffe41b6538 EFLAGS: 00010202 [260382.271373] RAX: 00007f6d83049000 RBX: 0000556b63614ec0 RCX: 00007f6d83148fe0 [260382.278591] RDX: 00000000000acfe0 RSI: 00007f6d84e9c030 RDI: 00007f6d8309bfa0 [260382.285813] RBP: 00007f6d84f4a000 R08: ffffffffffffffe0 R09: 0000000000000000 [260382.293030] R10: 00007f6d84f8e810 R11: 00007f6d83049000 R12: 0000000000000001 [260382.300249] R13: 0000556b63614ec0 R14: 0000000000100000 R15: 0000556b63614ee8 [260382.307472] INFO: task fio:3791108 blocked for more than 120 seconds. [260382.313997] Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [260382.321823] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [260382.329734] task:fio state:D stack: 0 pid:3791108 ppid:3790835 flags:0x00004080 [260382.338427] Call Trace: [260382.340967] __schedule+0x2d1/0x830 [260382.344547] ? cv_wait_common+0x12d/0x240 [spl] [260382.349173] schedule+0x35/0xa0 [260382.352406] io_schedule+0x12/0x40 [260382.355899] __lock_page+0x12d/0x230 [260382.359563] ? file_fdatawait_range+0x20/0x20 [260382.364010] zfs_putpage+0x148/0x590 [zfs] [260382.368379] ? rmap_walk_file+0x116/0x290 [260382.372479] ? __mod_memcg_lruvec_state+0x5d/0x160 [260382.377358] zpl_putpage+0x67/0xd0 [zfs] [260382.381552] write_cache_pages+0x197/0x420 [260382.385739] ? zpl_readpage_filler+0x10/0x10 [zfs] [260382.390791] zpl_writepages+0x119/0x130 [zfs] [260382.395410] do_writepages+0xc2/0x1c0 [260382.399161] ? flush_tlb_func_common.constprop.9+0x125/0x220 [260382.404907] __filemap_fdatawrite_range+0xc7/0x100 [260382.409790] filemap_write_and_wait_range+0x30/0x80 [260382.414752] generic_file_direct_write+0x120/0x160 [260382.419632] ? 
rrw_exit+0xb0/0x1c0 [zfs] [260382.423838] zpl_iter_write+0xdd/0x160 [zfs] [260382.428379] new_sync_write+0x112/0x160 [260382.432304] vfs_write+0xa5/0x1a0 [260382.435711] ksys_write+0x4f/0xb0 [260382.439115] do_syscall_64+0x5b/0x1a0 [260382.442866] entry_SYSCALL_64_after_hwframe+0x65/0xca [260382.448007] RIP: 0033:0x7f9d192c7a17 [260382.451675] Code: Unable to access opcode bytes at RIP 0x7f9d192c79ed. [260382.458286] RSP: 002b:00007ffc8e4ba270 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 [260382.465938] RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007f9d192c7a17 [260382.473158] RDX: 0000000000100000 RSI: 00007f9caea03000 RDI: 0000000000000005 [260382.480379] RBP: 00007f9caea03000 R08: 0000000000000000 R09: 0000000000000000 [260382.487597] R10: 00005558e8975680 R11: 0000000000000293 R12: 0000000000100000 [260382.494814] R13: 00005558e8985ec0 R14: 0000000000100000 R15: 00005558e8985ee8 Signed-off-by: Brian Atkinson --- config/kernel-generic_file_direct_write.m4 | 120 --------------------- config/kernel.m4 | 2 - include/os/linux/zfs/sys/zpl.h | 27 ----- module/os/linux/zfs/zpl_file.c | 95 ++++------------ module/zfs/zfs_vnops.c | 63 +++++++---- 5 files changed, 62 insertions(+), 245 deletions(-) delete mode 100644 config/kernel-generic_file_direct_write.m4 diff --git a/config/kernel-generic_file_direct_write.m4 b/config/kernel-generic_file_direct_write.m4 deleted file mode 100644 index d665b62d6eb0..000000000000 --- a/config/kernel-generic_file_direct_write.m4 +++ /dev/null @@ -1,120 +0,0 @@ -dnl # -dnl # Check generic_file_direct_write() interface -dnl # -dnl # Both HAVE_GENERIC_FILE_DIRECT_WRITE_IOV_ITER and -dnl # HAVE_GENERIC_FILE_DIRECT_WRITE_IOV_ITER_WITH_LOFF will align with -dnl @ HAVE_VFS_RW_ITERATE as they are valid in kernels >= 3.16. 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_FILE_DIRECT_WRITE], [ - ZFS_LINUX_TEST_SRC([generic_file_direct_write_iov_iter], [ - #include - ], [ - struct kiocb *kiocb = NULL; - struct iov_iter *iter = NULL; - ssize_t ret __attribute__ ((unused)); - - ret = generic_file_direct_write(kiocb, iter); - ]) - - ZFS_LINUX_TEST_SRC([generic_file_direct_write_iov_iter_loff], [ - #include - ], [ - struct kiocb *kiocb = NULL; - struct iov_iter *iter = NULL; - loff_t off = 0; - ssize_t ret __attribute__ ((unused)); - - ret = generic_file_direct_write(kiocb, iter, off); - ]) - - ZFS_LINUX_TEST_SRC([generic_file_direct_write_iovec_loff], [ - #include - ], [ - struct kiocb *kiocb = NULL; - const struct iovec *iovec = NULL; - unsigned long nr_segs = 0; - loff_t pos = 0; - size_t count = 0; - size_t ocount = 0; - ssize_t ret __attribute__ ((unused)); - - ret = generic_file_direct_write(kiocb, iovec, &nr_segs, pos, - count, ocount); - ]) - - ZFS_LINUX_TEST_SRC([generic_file_direct_write_iovec_loff_ptr], [ - #include - ], [ - struct kiocb *kiocb = NULL; - const struct iovec *iovec = NULL; - unsigned long nr_segs; - loff_t pos = 0; - loff_t *ppos = NULL; - size_t count = 0; - size_t ocount = 0; - ssize_t ret __attribute__ ((unused)); - - ret = generic_file_direct_write(kiocb, iovec, &nr_segs, pos, - ppos, count, ocount); - ]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_GENERIC_FILE_DIRECT_WRITE], [ - dnl # - dnl # Linux 4.7 change - dnl # - AC_MSG_CHECKING([whether generic_file_direct_write() passes iov_iter]) - ZFS_LINUX_TEST_RESULT([generic_file_direct_write_iov_iter], [ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_GENERIC_FILE_DIRECT_WRITE_IOV_ITER, 1, - [generic_file_direct_write() passes iov_iter]) - ], [ - AC_MSG_RESULT([no]) - - dnl # - dnl # Linux 3.16 change - dnl # - AC_MSG_CHECKING( - [whether generic_file_direct_write() passes iov_iter with loff]) - ZFS_LINUX_TEST_RESULT( - [generic_file_direct_write_iov_iter_loff], [ - AC_MSG_RESULT([yes]) - AC_DEFINE( - 
HAVE_GENERIC_FILE_DIRECT_WRITE_IOV_ITER_WITH_LOFF, 1, - [generic_file_direct_write() passes iov_iter with loff]) - ], [ - AC_MSG_RESULT([no]) - - dnl # - dnl # Linux 3.15 change - dnl # - AC_MSG_CHECKING( - [whether generic_file_direct_write() passes struct iovec]) - ZFS_LINUX_TEST_RESULT([generic_file_direct_write_iovec_loff], [ - AC_MSG_RESULT([yes]) - AC_DEFINE( - HAVE_GENERIC_FILE_DIRECT_WRITE_IOVEC, 1, - [generic_file_direct_write() passes struct iovec]) - ], [ - AC_MSG_RESULT([no]) - - dnl # - dnl # Covers Linux 3.10 - dnl # - AC_MSG_CHECKING( - [whether generic_file_direct_write() passes struct iovec with loff ptr]) - ZFS_LINUX_TEST_RESULT( - [generic_file_direct_write_iovec_loff_ptr], [ - AC_MSG_RESULT([yes]) - AC_DEFINE( - HAVE_GENERIC_FILE_DIRECT_WRITE_IOVEC_LOFF_PTR, 1, - [generic_file_direct_write() passes struct iovec with loff ptr]) - ], [ - ZFS_LINUX_TEST_ERROR( - [generic_file_direct_write]) - AC_MSG_RESULT([no]) - ]) - ]) - ]) - ]) -]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 9533a56bb2c0..83da863fce2c 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -67,7 +67,6 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_GENHD_FLAGS ZFS_AC_KERNEL_SRC_REVALIDATE_DISK ZFS_AC_KERNEL_SRC_GET_DISK_RO - ZFS_AC_KERNEL_SRC_GENERIC_FILE_DIRECT_WRITE ZFS_AC_KERNEL_SRC_GENERIC_READLINK_GLOBAL ZFS_AC_KERNEL_SRC_DISCARD_GRANULARITY ZFS_AC_KERNEL_SRC_INODE_OWNER_OR_CAPABLE @@ -225,7 +224,6 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_GENHD_FLAGS ZFS_AC_KERNEL_REVALIDATE_DISK ZFS_AC_KERNEL_GET_DISK_RO - ZFS_AC_KERNEL_GENERIC_FILE_DIRECT_WRITE ZFS_AC_KERNEL_GENERIC_READLINK_GLOBAL ZFS_AC_KERNEL_DISCARD_GRANULARITY ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index a7d34fd6ca09..c8eefe4fe5da 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -293,31 +293,4 @@ extern long zpl_ioctl_fideduperange(struct file *filp, void *arg); #define 
zpl_inode_set_mtime_to_ts(ip, ts) (ip->i_mtime = ts) #endif -/* - * HAVE_GENERIC_FILE_DIRECT_WRITE_IOV_ITER* align with HAVE_VFS_RW_ITERATE - */ -#if defined(HAVE_GENERIC_FILE_DIRECT_WRITE_IOV_ITER) -/* 4.7 API */ -#define zpl_generic_file_direct_write(iocb, iter, off) \ - generic_file_direct_write(iocb, iter) - -#elif defined(HAVE_GENERIC_FILE_DIRECT_WRITE_IOV_ITER_WITH_LOFF) -/* 3.16 API */ -#define zpl_generic_file_direct_write(iocb, iter, off) \ - generic_file_direct_write(iocb, iter, off) - -#elif defined(HAVE_GENERIC_FILE_DIRECT_WRITE_IOVEC) -/* 3.15 API */ -#define zpl_generic_file_direct_write(iocb, vec, segs, pos, ppos, cnt, ocnt) \ - generic_file_direct_write(iocb, vec, segs, pos, cnt, ocnt) - -#elif defined(HAVE_GENERIC_FILE_DIRECT_WRITE_IOVEC_LOFF_PTR) -/* 3.10 API */ -#define zpl_generic_file_direct_write(iocb, vec, segs, pos, ppos, cnt, ocnt) \ - generic_file_direct_write(iocb, vec, segs, pos, ppos, cnt, ocnt) - -#else -#error "Unsupported kernel" -#endif - #endif /* _SYS_ZPL_H */ diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 7c599feca0e3..5b7c4e4beaf2 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -351,16 +351,6 @@ zpl_iter_read_direct(struct kiocb *kiocb, struct iov_iter *to) zfs_uio_t uio; ssize_t ret; - /* - * Attempt to flush out any pages from the page cache. On error - * fallback to the buffered path. - */ - ret = filemap_write_and_wait_range(filp->f_mapping, kiocb->ki_pos, - kiocb->ki_pos + count - 1); - - if (ret < 0) - return (ret); - zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0); /* On error, return to fallback to the buffered path. 
*/ @@ -508,6 +498,7 @@ zpl_iter_write_direct(struct kiocb *kiocb, struct iov_iter *from) return (error); wrote = count - uio.uio_resid; + kiocb->ki_pos += wrote; return (wrote); } @@ -533,22 +524,9 @@ zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) if (direct == ZFS_DIRECT_IO_ERR) { return (-error); } else if (direct == ZFS_DIRECT_IO_ENABLED) { - /* - * zpl_generic_file_direct_write() will attempt to flush out any - * pages in the page cache and invalidate them. If this is - * successful it will cal the direct_IO - * address_space_operation (zpl_iter_write_direct()). - */ - ssize_t wrote = zpl_generic_file_direct_write(kiocb, from, - kiocb->ki_pos); + ssize_t wrote = zpl_iter_write_direct(kiocb, from); if (wrote >= 0 || wrote != -EAGAIN) { - /* - * generic_file_direct_write() will update - * kiocb->ki_pos on a successful Direct IO write. - */ - IMPLY(wrote >= 0, - (offset + wrote) == kiocb->ki_pos); return (wrote); } @@ -619,16 +597,6 @@ zpl_aio_read_direct(struct kiocb *kiocb, const struct iovec *iov, if (ret) return (ret); - /* - * Attempt to flush out any pages from the page cache. On error - * fallback to the buffered path. - */ - ret = filemap_write_and_wait_range(filp->f_mapping, kiocb->ki_pos, - kiocb->ki_pos + iov_length(iov, nr_segs) - 1); - - if (ret < 0) - return (ret); - zfs_uio_t uio; zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, count, 0); @@ -771,6 +739,7 @@ zpl_aio_write_direct(struct kiocb *kiocb, const struct iovec *iov, return (error); ssize_t wrote = count - uio.uio_resid; + kiocb->ki_pos += wrote; return (wrote); } @@ -805,21 +774,9 @@ zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, if (direct == ZFS_DIRECT_IO_ERR) { return (-error); } else if (direct == ZFS_DIRECT_IO_ENABLED) { - /* - * zpl_generic_file_direct_write() will attempt to flush out any - * pages in the page cahce and invalidate them. 
If this is - * successful it will call the direct_IO - * address_space_operation (zpl_aio_write_direct()). - */ - ssize_t wrote = zpl_generic_file_direct_write(kiocb, iov, - &nr_segs, pos, &kiocb->ki_pos, count, ocount); + ssize_t wrote = zpl_aio_write_direct(kiocb, iov, nr_segs, pos); if (wrote >= 0 || wrote != -EAGAIN) { - /* - * generic_file_direct_write() will update - * kiocb->ki_pos on a successful Direct IO write. - */ - IMPLY(wrote >= 0, (pos + wrote) == kiocb->ki_pos); return (wrote); } @@ -835,35 +792,37 @@ zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, #endif /* HAVE_VFS_RW_ITERATE */ +static ssize_t +zpl_direct_IO_impl(void) +{ + /* + * All O_DIRECT requests should be handled by + * zpl_{iter/aio}_{write/read}(). There is no way kernel generic code + * should call the direct_IO address_space_operations function. We set + * this code path to be fatal if it is executed. + */ + VERIFY(0); + return (0); +} + #if defined(HAVE_VFS_RW_ITERATE) #if defined(HAVE_VFS_DIRECT_IO_ITER) static ssize_t zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter) { - if (iov_iter_rw(iter) == WRITE) - return (zpl_iter_write_direct(kiocb, iter)); - else - return (zpl_iter_read(kiocb, iter)); + return (zpl_direct_IO_impl()); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET) static ssize_t zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) { - ASSERT3S(pos, ==, kiocb->ki_pos); - if (iov_iter_rw(iter) == WRITE) - return (zpl_iter_write_direct(kiocb, iter)); - else - return (zpl_iter_read(kiocb, iter)); + return (zpl_direct_IO_impl()); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET) static ssize_t zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) { - ASSERT3S(pos, ==, kiocb->ki_pos); - if (rw == WRITE) - return (zpl_iter_write_direct(kiocb, iter)); - else - return (zpl_iter_read(kiocb, iter)); + return (zpl_direct_IO_impl()); } #else #error "Unknown direct IO interface" @@ -876,23 +835,13 @@ static ssize_t 
zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) { - if (rw == WRITE) - return (zpl_aio_write_direct(kiocb, iov, nr_segs, pos)); - else - return (zpl_aio_read(kiocb, iov, nr_segs, pos)); + return (zpl_direct_IO_impl()); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET) static ssize_t zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) { - const struct iovec *iovp = iov_iter_iovec(iter); - unsigned long nr_segs = iter->nr_segs; - - ASSERT3S(pos, ==, kiocb->ki_pos); - if (rw == WRITE) - return (zpl_aio_write_direct(kiocb, iovp, nr_segs, pos)); - else - return (zpl_aio_read(kiocb, iovp, nr_segs, pos)); + return (zpl_direct_IO_impl()); } #else #error "Unknown direct IO interface" diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 415341270519..76fa7e25ba52 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -230,8 +230,15 @@ zfs_check_direct_enabled(znode_t *zp, int ioflags, int *error) * is it allows the property to be safely set on a dataset without forcing * all of the applications to be aware of the alignment restrictions. When * O_DIRECT is explicitly requested by an application return EINVAL if the - * request is unaligned. In all cases, if the file has been accessed via - * mmap(2) then perform buffered IO to keep the mapped region synchronized. + * request is unaligned. In all cases, if the range for this request has + * been mmap'ed then we will perform buffered I/O to keep the mapped region + * synchronized with the ARC. + * + * It is possible that a file's pages could be mmap'ed after it is checked + * here. If so, that is handled accordingly in zfs_read() and zfs_write(). 
See + * comments in the following two areas for how this is handled: + * zfs_read() -> mappedread() + * zfs_write() -> update_pages() */ int zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw, @@ -415,9 +422,23 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) else #endif if (zn_has_cached_data(zp, zfs_uio_offset(uio), - zfs_uio_offset(uio) + nbytes - 1) && - !(uio->uio_extflg & UIO_DIRECT)) { + zfs_uio_offset(uio) + nbytes - 1)) { + /* + * It is possible that a file's pages have been mmap'ed + * since our check for Direct I/O reads and the read + * being issued. In this case, we will use the ARC to + * keep it synchronized with the page cache. In order + * to do this we temporarily remove the UIO_DIRECT + * flag. + */ + boolean_t uio_direct_mmap = B_FALSE; + if (uio->uio_extflg & UIO_DIRECT) { + uio->uio_extflg &= ~UIO_DIRECT; + uio_direct_mmap = B_TRUE; + } error = mappedread(zp, nbytes, uio); + if (uio_direct_mmap) + uio->uio_extflg |= UIO_DIRECT; } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); @@ -453,8 +474,14 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * remainder of the file can be read using the ARC. */ uio->uio_extflg &= ~UIO_DIRECT; - error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, - dio_remaining_resid); + + if (zn_has_cached_data(zp, zfs_uio_offset(uio), + zfs_uio_offset(uio) + dio_remaining_resid - 1)) { + error = mappedread(zp, dio_remaining_resid, uio); + } else { + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, + dio_remaining_resid); + } uio->uio_extflg |= UIO_DIRECT; if (error != 0) @@ -871,25 +898,15 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) /* * There is a a window where a file's pages can be mmap'ed after - * the write has started. We may have temporarily removed the - * UIO_DIRECT flag as we are still growing the blocksize but - * the O_DIRECT flag is still present. 
This check for the - * O_DIRECT flag has always been present before calling - * update_pages(). If we remove the check for O_DIRECT we can - * wind up in a deadlock between update_pages() and - * zpl_writepages() -> write_cache_pages(). - * - * XXX - In reality, this can probably be fixed by adding a - * rangelock to zfs_fillpage(). There has always been a window - * where we can start a write, but the pages are not mmapp'ed - * till later. Ideally we would want to only check using - * zn_has_cached_data() while holding the rangelock to remove - * this window and the addition of also grabbing a rangelock - * in zfs_fillpage(). + * the Direct I/O write has started. In this case we will still + * call update_pages() to make sure there is consistency + * between the ARC and the page cache. This is an unfortunate + * situation as the data will be read back into the ARC after + * the Direct I/O write has completed, but this is the penalty + * for writing to a mmap'ed region of the file using O_DIRECT. */ if (tx_bytes && - zn_has_cached_data(zp, woff, woff + tx_bytes - 1) && - !(ioflag & O_DIRECT)) { + zn_has_cached_data(zp, woff, woff + tx_bytes - 1)) { update_pages(zp, woff, tx_bytes, zfsvfs->z_os); }