From f44236a1b05b583862eb8b06dd8412660fdc7fad Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 16 Mar 2017 16:54:24 -0400 Subject: [PATCH 0001/1590] cgroup, kthread: close race window where new kthreads can be migrated to non-root cgroups commit 77f88796cee819b9c4562b0b6b44691b3b7755b1 upstream. Creation of a kthread goes through a couple interlocked stages between the kthread itself and its creator. Once the new kthread starts running, it initializes itself and wakes up the creator. The creator then can further configure the kthread and then let it start doing its job by waking it up. In this configuration-by-creator stage, the creator is the only one that can wake it up but the kthread is visible to userland. When altering the kthread's attributes from userland is allowed, this is fine; however, for cases where CPU affinity is critical, kthread_bind() is used to first disable affinity changes from userland and then set the affinity. This also prevents the kthread from being migrated into non-root cgroups as that can affect the CPU affinity and many other things. Unfortunately, the cgroup side of protection is racy. While the PF_NO_SETAFFINITY flag prevents further migrations, userland can win the race before the creator sets the flag with kthread_bind() and put the kthread in a non-root cgroup, which can lead to all sorts of problems including incorrect CPU affinity and starvation. This bug got triggered by userland which periodically tries to migrate all processes in the root cpuset cgroup to a non-root one. Per-cpu workqueue workers got caught while being created and ended up with incorrected CPU affinity breaking concurrency management and sometimes stalling workqueue execution. This patch adds task->no_cgroup_migration which disallows the task to be migrated by userland. kthreadd starts with the flag set making every child kthread start in the root cgroup with migration disallowed. The flag is cleared after the kthread finishes initialization by which time PF_NO_SETAFFINITY is set if the kthread should stay in the root cgroup. It'd be better to wait for the initialization instead of failing but I couldn't think of a way of implementing that without adding either a new PF flag, or sleeping and retrying from waiting side. Even if userland depends on changing cgroup membership of a kthread, it either has to be synchronized with kthread_create() or periodically repeat, so it's unlikely that this would break anything. v2: Switch to a simpler implementation using a new task_struct bit field suggested by Oleg. Signed-off-by: Tejun Heo Suggested-by: Oleg Nesterov Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra (Intel) Cc: Thomas Gleixner Reported-and-debugged-by: Chris Mason Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- include/linux/cgroup.h | 21 +++++++++++++++++++++ include/linux/sched.h | 4 ++++ kernel/cgroup.c | 9 +++++---- kernel/kthread.c | 3 +++ 4 files changed, 33 insertions(+), 4 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c83c23f0577bd9..307ae63ef262ce 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -570,6 +570,25 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } +static inline void cgroup_init_kthreadd(void) +{ + /* + * kthreadd is inherited by all kthreads, keep it in the root so + * that the new kthreads are guaranteed to stay in the root until + * initialization is finished. + */ + current->no_cgroup_migration = 1; +} + +static inline void cgroup_kthread_ready(void) +{ + /* + * This kthread finished initialization. The creator should have + * set PF_NO_SETAFFINITY if this kthread should stay in the root. + */ + current->no_cgroup_migration = 0; +} + #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; @@ -590,6 +609,8 @@ static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } +static inline void cgroup_init_kthreadd(void) {} +static inline void cgroup_kthread_ready(void) {} static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) diff --git a/include/linux/sched.h b/include/linux/sched.h index 75d9a57e212e8b..f425eb3318abff 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1584,6 +1584,10 @@ struct task_struct { #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif +#ifdef CONFIG_CGROUPS + /* disallow userland-initiated cgroup migration */ + unsigned no_cgroup_migration:1; +#endif unsigned long atomic_flags; /* Flags needing atomic access. */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4e2f3de0e40bff..a3d2aad2443f33 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2920,11 +2920,12 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, tsk = tsk->group_leader; /* - * Workqueue threads may acquire PF_NO_SETAFFINITY and become - * trapped in a cpuset, or RT worker may be born in a cgroup - * with no rt_runtime allocated. Just say no. + * kthreads may acquire PF_NO_SETAFFINITY during initialization. + * If userland migrates such a kthread to a non-root cgroup, it can + * become trapped in a cpuset, or RT kthread may be born in a + * cgroup with no rt_runtime allocated. Just say no. */ - if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { + if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; goto out_unlock_rcu; } diff --git a/kernel/kthread.c b/kernel/kthread.c index be2cc1f9dd571b..c2c911a106cfeb 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -18,6 +18,7 @@ #include #include #include +#include #include static DEFINE_SPINLOCK(kthread_create_lock); @@ -205,6 +206,7 @@ static int kthread(void *_create) ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) { + cgroup_kthread_ready(); __kthread_parkme(&self); ret = threadfn(data); } @@ -530,6 +532,7 @@ int kthreadd(void *unused) set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; + cgroup_init_kthreadd(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); From ef599fa52429f7f8fea2d1bf83a49b0377c7dc24 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 27 Mar 2017 17:07:40 +0800 Subject: [PATCH 0002/1590] tcmu: Fix possible overwrite of t_data_sg's last iov[] commit ab22d2604c86ceb01bb2725c9860b88a7dd383bb upstream. If there has BIDI data, its first iov[] will overwrite the last iov[] for se_cmd->t_data_sg. To fix this, we can just increase the iov pointer, but this may introuduce a new memory leakage bug: If the se_cmd->data_length and se_cmd->t_bidi_data_sg->length are all not aligned up to the DATA_BLOCK_SIZE, the actual length needed maybe larger than just sum of them. So, this could be avoided by rounding all the data lengthes up to DATA_BLOCK_SIZE. Reviewed-by: Mike Christie Tested-by: Ilias Tsitsimpis Reviewed-by: Bryant G. Ly Signed-off-by: Xiubo Li Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/target_core_user.c | 34 +++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 70c143a5c38c3f..8d9f5f9c32a78f 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -389,6 +389,20 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t d return true; } +static inline size_t tcmu_cmd_get_data_length(struct tcmu_cmd *tcmu_cmd) +{ + struct se_cmd *se_cmd = tcmu_cmd->se_cmd; + size_t data_length = round_up(se_cmd->data_length, DATA_BLOCK_SIZE); + + if (se_cmd->se_cmd_flags & SCF_BIDI) { + BUG_ON(!(se_cmd->t_bidi_data_sg && se_cmd->t_bidi_data_nents)); + data_length += round_up(se_cmd->t_bidi_data_sg->length, + DATA_BLOCK_SIZE); + } + + return data_length; +} + static sense_reason_t tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) { @@ -402,7 +416,7 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) uint32_t cmd_head; uint64_t cdb_off; bool copy_to_data_area; - size_t data_length; + size_t data_length = tcmu_cmd_get_data_length(tcmu_cmd); DECLARE_BITMAP(old_bitmap, DATA_BLOCK_BITS); if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) @@ -428,11 +442,6 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) mb = udev->mb_addr; cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ - data_length = se_cmd->data_length; - if (se_cmd->se_cmd_flags & SCF_BIDI) { - BUG_ON(!(se_cmd->t_bidi_data_sg && se_cmd->t_bidi_data_nents)); - data_length += se_cmd->t_bidi_data_sg->length; - } if ((command_size > (udev->cmdr_size / 2)) || data_length > udev->data_size) { pr_warn("TCMU: Request of size %zu/%zu is too big for %u/%zu " @@ -502,11 +511,14 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) entry->req.iov_dif_cnt = 0; /* Handle BIDI commands */ - iov_cnt = 0; - alloc_and_scatter_data_area(udev, se_cmd->t_bidi_data_sg, - se_cmd->t_bidi_data_nents, &iov, &iov_cnt, false); - entry->req.iov_bidi_cnt = iov_cnt; - + if (se_cmd->se_cmd_flags & SCF_BIDI) { + iov_cnt = 0; + iov++; + alloc_and_scatter_data_area(udev, se_cmd->t_bidi_data_sg, + se_cmd->t_bidi_data_nents, &iov, &iov_cnt, + false); + entry->req.iov_bidi_cnt = iov_cnt; + } /* cmd's data_bitmap is what changed in process */ bitmap_xor(tcmu_cmd->data_bitmap, old_bitmap, udev->data_bitmap, DATA_BLOCK_BITS); From 890aec8eae096624763bc3de313d154a3ba9d9d6 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 27 Mar 2017 17:07:41 +0800 Subject: [PATCH 0003/1590] tcmu: Fix wrongly calculating of the base_command_size commit abe342a5b4b5aa579f6bf40ba73447c699e6b579 upstream. The t_data_nents and t_bidi_data_nents are the numbers of the segments, but it couldn't be sure the block size equals to size of the segment. For the worst case, all the blocks are discontiguous and there will need the same number of iovecs, that's to say: blocks == iovs. So here just set the number of iovs to block count needed by tcmu cmd. Tested-by: Ilias Tsitsimpis Reviewed-by: Mike Christie Signed-off-by: Xiubo Li Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/target_core_user.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 8d9f5f9c32a78f..86ef1400f27eb2 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -403,6 +403,13 @@ static inline size_t tcmu_cmd_get_data_length(struct tcmu_cmd *tcmu_cmd) return data_length; } +static inline uint32_t tcmu_cmd_get_block_cnt(struct tcmu_cmd *tcmu_cmd) +{ + size_t data_length = tcmu_cmd_get_data_length(tcmu_cmd); + + return data_length / DATA_BLOCK_SIZE; +} + static sense_reason_t tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) { @@ -430,8 +437,7 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) * expensive to tell how many regions are freed in the bitmap */ base_command_size = max(offsetof(struct tcmu_cmd_entry, - req.iov[se_cmd->t_bidi_data_nents + - se_cmd->t_data_nents]), + req.iov[tcmu_cmd_get_block_cnt(tcmu_cmd)]), sizeof(struct tcmu_cmd_entry)); command_size = base_command_size + round_up(scsi_command_size(se_cmd->t_task_cdb), TCMU_OP_ALIGN_SIZE); From 5ef6f4dec559007650c762b11ca30dd74866299a Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Fri, 31 Mar 2017 10:35:25 +0800 Subject: [PATCH 0004/1590] tcmu: Skip Data-Out blocks before gathering Data-In buffer for BIDI case commit a5d68ba85801a78c892a0eb8efb711e293ed314b upstream. For the bidirectional case, the Data-Out buffer blocks will always at the head of the tcmu_cmd's bitmap, and before gathering the Data-In buffer, first of all it should skip the Data-Out ones, or the device supporting BIDI commands won't work. Fixed: 26418649eead ("target/user: Introduce data_bitmap, replace data_length/data_head/data_tail") Reported-by: Ilias Tsitsimpis Tested-by: Ilias Tsitsimpis Signed-off-by: Xiubo Li Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/target_core_user.c | 48 +++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 86ef1400f27eb2..1a83456a65a005 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -306,24 +306,50 @@ static void free_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd) DATA_BLOCK_BITS); } -static void gather_data_area(struct tcmu_dev *udev, unsigned long *cmd_bitmap, - struct scatterlist *data_sg, unsigned int data_nents) +static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd, + bool bidi) { + struct se_cmd *se_cmd = cmd->se_cmd; int i, block; int block_remaining = 0; void *from, *to; size_t copy_bytes, from_offset; - struct scatterlist *sg; + struct scatterlist *sg, *data_sg; + unsigned int data_nents; + DECLARE_BITMAP(bitmap, DATA_BLOCK_BITS); + + bitmap_copy(bitmap, cmd->data_bitmap, DATA_BLOCK_BITS); + + if (!bidi) { + data_sg = se_cmd->t_data_sg; + data_nents = se_cmd->t_data_nents; + } else { + uint32_t count; + + /* + * For bidi case, the first count blocks are for Data-Out + * buffer blocks, and before gathering the Data-In buffer + * the Data-Out buffer blocks should be discarded. + */ + count = DIV_ROUND_UP(se_cmd->data_length, DATA_BLOCK_SIZE); + while (count--) { + block = find_first_bit(bitmap, DATA_BLOCK_BITS); + clear_bit(block, bitmap); + } + + data_sg = se_cmd->t_bidi_data_sg; + data_nents = se_cmd->t_bidi_data_nents; + } for_each_sg(data_sg, sg, data_nents, i) { int sg_remaining = sg->length; to = kmap_atomic(sg_page(sg)) + sg->offset; while (sg_remaining > 0) { if (block_remaining == 0) { - block = find_first_bit(cmd_bitmap, + block = find_first_bit(bitmap, DATA_BLOCK_BITS); block_remaining = DATA_BLOCK_SIZE; - clear_bit(block, cmd_bitmap); + clear_bit(block, bitmap); } copy_bytes = min_t(size_t, sg_remaining, block_remaining); @@ -600,19 +626,11 @@ static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry * se_cmd->scsi_sense_length); free_data_area(udev, cmd); } else if (se_cmd->se_cmd_flags & SCF_BIDI) { - DECLARE_BITMAP(bitmap, DATA_BLOCK_BITS); - /* Get Data-In buffer before clean up */ - bitmap_copy(bitmap, cmd->data_bitmap, DATA_BLOCK_BITS); - gather_data_area(udev, bitmap, - se_cmd->t_bidi_data_sg, se_cmd->t_bidi_data_nents); + gather_data_area(udev, cmd, true); free_data_area(udev, cmd); } else if (se_cmd->data_direction == DMA_FROM_DEVICE) { - DECLARE_BITMAP(bitmap, DATA_BLOCK_BITS); - - bitmap_copy(bitmap, cmd->data_bitmap, DATA_BLOCK_BITS); - gather_data_area(udev, bitmap, - se_cmd->t_data_sg, se_cmd->t_data_nents); + gather_data_area(udev, cmd, false); free_data_area(udev, cmd); } else if (se_cmd->data_direction == DMA_TO_DEVICE) { free_data_area(udev, cmd); From f584803c49427ba9623adf93d7078cbe9775b027 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 13 Apr 2017 14:56:26 -0700 Subject: [PATCH 0005/1590] thp: fix MADV_DONTNEED vs. MADV_FREE race commit 58ceeb6bec86d9140f9d91d71a710e963523d063 upstream. Both MADV_DONTNEED and MADV_FREE handled with down_read(mmap_sem). It's critical to not clear pmd intermittently while handling MADV_FREE to avoid race with MADV_DONTNEED: CPU0: CPU1: madvise_free_huge_pmd() pmdp_huge_get_and_clear_full() madvise_dontneed() zap_pmd_range() pmd_trans_huge(*pmd) == 0 (without ptl) // skip the pmd set_pmd_at(); // pmd is re-established It results in MADV_DONTNEED skipping the pmd, leaving it not cleared. It violates MADV_DONTNEED interface and can result is userspace misbehaviour. Basically it's the same race as with numa balancing in change_huge_pmd(), but a bit simpler to mitigate: we don't need to preserve dirty/young flags here due to MADV_FREE functionality. [kirill.shutemov@linux.intel.com: Urgh... Power is special again] Link: http://lkml.kernel.org/r/20170303102636.bhd2zhtpds4mt62a@black.fi.intel.com Link: http://lkml.kernel.org/r/20170302151034.27829-4-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Minchan Kim Cc: Minchan Kim Cc: Andrea Arcangeli Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/huge_memory.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 917555cf6be064..d5b2b759f76fff 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1380,8 +1380,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, deactivate_page(page); if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { - orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, - tlb->fullmm); + pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); orig_pmd = pmd_mkclean(orig_pmd); From 5c9d0832022959a8879e082c745281865f5336f0 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 13 Apr 2017 14:56:28 -0700 Subject: [PATCH 0006/1590] thp: fix MADV_DONTNEED vs clear soft dirty race commit 5b7abeae3af8c08c577e599dd0578b9e3ee6687b upstream. Yet another instance of the same race. Fix is identical to change_huge_pmd(). See "thp: fix MADV_DONTNEED vs. numa balancing race" for more details. Link: http://lkml.kernel.org/r/20170302151034.27829-5-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/proc/task_mmu.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 35b92d81692f09..b1517b6dcbdd17 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -899,7 +899,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp); + pmd_t pmd = *pmdp; + + /* See comment in change_huge_pmd() */ + pmdp_invalidate(vma, addr, pmdp); + if (pmd_dirty(*pmdp)) + pmd = pmd_mkdirty(pmd); + if (pmd_young(*pmdp)) + pmd = pmd_mkyoung(pmd); pmd = pmd_wrprotect(pmd); pmd = pmd_clear_soft_dirty(pmd); From d19f745ea3a989f9d037dfbc26abcccfe7710723 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 13 Apr 2017 14:56:40 -0700 Subject: [PATCH 0007/1590] zsmalloc: expand class bit commit 85d492f28d056c40629fc25d79f54da618a29dc4 upstream. Now 64K page system, zsamlloc has 257 classes so 8 class bit is not enough. With that, it corrupts the system when zsmalloc stores 65536byte data(ie, index number 256) so that this patch increases class bit for simple fix for stable backport. We should clean up this mess soon. index size 0 32 1 288 .. .. 204 52256 256 65536 Fixes: 3783689a1 ("zsmalloc: introduce zspage structure") Link: http://lkml.kernel.org/r/1492042622-12074-3-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/zsmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b0bc023d25c539..1689bb58e0d170 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -280,7 +280,7 @@ struct zs_pool { struct zspage { struct { unsigned int fullness:FULLNESS_BITS; - unsigned int class:CLASS_BITS; + unsigned int class:CLASS_BITS + 1; unsigned int isolated:ISOLATED_BITS; unsigned int magic:MAGIC_VAL_BITS; }; From 975a7ea950c6d9bcb3666505c6ca9df9b404105c Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Fri, 14 Apr 2017 14:22:41 -0400 Subject: [PATCH 0008/1590] orangefs: free superblock when mount fails commit 1ec1688c5360e14dde4094d6acbf7516bf6db37e upstream. Otherwise lockdep says: [ 1337.483798] ================================================ [ 1337.483999] [ BUG: lock held when returning to user space! ] [ 1337.484252] 4.11.0-rc6 #19 Not tainted [ 1337.484423] ------------------------------------------------ [ 1337.484626] mount/14766 is leaving the kernel with locks still held! [ 1337.484841] 1 lock held by mount/14766: [ 1337.485017] #0: (&type->s_umount_key#33/1){+.+.+.}, at: [] sget_userns+0x2af/0x520 Caught by xfstests generic/413 which tried to mount with the unsupported mount option dax. Then xfstests generic/422 ran sync which deadlocks. Signed-off-by: Martin Brandenburg Acked-by: Mike Marshall Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/orangefs/devorangefs-req.c | 9 +++++++-- fs/orangefs/orangefs-kernel.h | 1 + fs/orangefs/super.c | 23 ++++++++++++++++------- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index f419dd999581e9..fe2cbeb90772aa 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -208,14 +208,19 @@ static ssize_t orangefs_devreq_read(struct file *file, continue; /* * Skip ops whose filesystem we don't know about unless - * it is being mounted. + * it is being mounted or unmounted. It is possible for + * a filesystem we don't know about to be unmounted if + * it fails to mount in the kernel after userspace has + * been sent the mount request. */ /* XXX: is there a better way to detect this? */ } else if (ret == -1 && !(op->upcall.type == ORANGEFS_VFS_OP_FS_MOUNT || op->upcall.type == - ORANGEFS_VFS_OP_GETATTR)) { + ORANGEFS_VFS_OP_GETATTR || + op->upcall.type == + ORANGEFS_VFS_OP_FS_UMOUNT)) { gossip_debug(GOSSIP_DEV_DEBUG, "orangefs: skipping op tag %llu %s\n", llu(op->tag), get_opname_string(op)); diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 3bf803d732c5b3..45dd8f27b2ac34 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -249,6 +249,7 @@ struct orangefs_sb_info_s { char devname[ORANGEFS_MAX_SERVER_ADDR_LEN]; struct super_block *sb; int mount_pending; + int no_list; struct list_head list; }; diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index cd261c8de53a17..629d8c917fa679 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -493,7 +493,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst, if (ret) { d = ERR_PTR(ret); - goto free_op; + goto free_sb_and_op; } /* @@ -519,6 +519,9 @@ struct dentry *orangefs_mount(struct file_system_type *fst, spin_unlock(&orangefs_superblocks_lock); op_release(new_op); + /* Must be removed from the list now. */ + ORANGEFS_SB(sb)->no_list = 0; + if (orangefs_userspace_version >= 20906) { new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES); if (!new_op) @@ -533,6 +536,10 @@ struct dentry *orangefs_mount(struct file_system_type *fst, return dget(sb->s_root); +free_sb_and_op: + /* Will call orangefs_kill_sb with sb not in list. */ + ORANGEFS_SB(sb)->no_list = 1; + deactivate_locked_super(sb); free_op: gossip_err("orangefs_mount: mount request failed with %d\n", ret); if (ret == -EINVAL) { @@ -558,12 +565,14 @@ void orangefs_kill_sb(struct super_block *sb) */ orangefs_unmount_sb(sb); - /* remove the sb from our list of orangefs specific sb's */ - - spin_lock(&orangefs_superblocks_lock); - __list_del_entry(&ORANGEFS_SB(sb)->list); /* not list_del_init */ - ORANGEFS_SB(sb)->list.prev = NULL; - spin_unlock(&orangefs_superblocks_lock); + if (!ORANGEFS_SB(sb)->no_list) { + /* remove the sb from our list of orangefs specific sb's */ + spin_lock(&orangefs_superblocks_lock); + /* not list_del_init */ + __list_del_entry(&ORANGEFS_SB(sb)->list); + ORANGEFS_SB(sb)->list.prev = NULL; + spin_unlock(&orangefs_superblocks_lock); + } /* * make sure that ORANGEFS_DEV_REMOUNT_ALL loop that might've seen us From b29a17524bc11e4a8b0a688eae535491bac18001 Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Sat, 18 Mar 2017 21:53:05 -0400 Subject: [PATCH 0009/1590] drm/nouveau/mpeg: mthd returns true on success now commit 83bce9c2baa51e439480a713119a73d3c8b61083 upstream. Signed-off-by: Ilia Mirkin Fixes: 590801c1a3 ("drm/nouveau/mpeg: remove dependence on namedb/engctx lookup") Signed-off-by: Ben Skeggs Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/nouveau/nvkm/engine/mpeg/nv31.c | 2 +- drivers/gpu/drm/nouveau/nvkm/engine/mpeg/nv44.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/mpeg/nv31.c b/drivers/gpu/drm/nouveau/nvkm/engine/mpeg/nv31.c index 003ac915eaadad..8a8895246d26a2 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/mpeg/nv31.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/mpeg/nv31.c @@ -198,7 +198,7 @@ nv31_mpeg_intr(struct nvkm_engine *engine) } if (type == 0x00000010) { - if (!nv31_mpeg_mthd(mpeg, mthd, data)) + if (nv31_mpeg_mthd(mpeg, mthd, data)) show &= ~0x01000000; } } diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/mpeg/nv44.c b/drivers/gpu/drm/nouveau/nvkm/engine/mpeg/nv44.c index e536f37e24b0c7..c3cf02ed468ea1 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/mpeg/nv44.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/mpeg/nv44.c @@ -172,7 +172,7 @@ nv44_mpeg_intr(struct nvkm_engine *engine) } if (type == 0x00000010) { - if (!nv44_mpeg_mthd(subdev->device, mthd, data)) + if (nv44_mpeg_mthd(subdev->device, mthd, data)) show &= ~0x01000000; } } From e6bcbdc59356b8c6e01df435bacf0aec452122e6 Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Sat, 18 Mar 2017 16:23:10 -0400 Subject: [PATCH 0010/1590] drm/nouveau/mmu/nv4a: use nv04 mmu rather than the nv44 one commit f94773b9f5ecd1df7c88c2e921924dd41d2020cc upstream. The NV4A (aka NV44A) is an oddity in the family. It only comes in AGP and PCI varieties, rather than a core PCIE chip with a bridge for AGP/PCI as necessary. As a result, it appears that the MMU is also non-functional. For AGP cards, the vast majority of the NV4A lineup, this worked out since we force AGP cards to use the nv04 mmu. However for PCI variants, this did not work. Switching to the NV04 MMU makes it work like a charm. Thanks to mwk for the suggestion. This should be a no-op for NV4A AGP boards, as they were using it already. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=70388 Signed-off-by: Ilia Mirkin Signed-off-by: Ben Skeggs Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/nouveau/nvkm/engine/device/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index e0d7f8472ac652..d741ff88e4054d 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -714,7 +714,7 @@ nv4a_chipset = { .i2c = nv04_i2c_new, .imem = nv40_instmem_new, .mc = nv44_mc_new, - .mmu = nv44_mmu_new, + .mmu = nv04_mmu_new, .pci = nv40_pci_new, .therm = nv40_therm_new, .timer = nv41_timer_new, From fd3be7eaff1441cbc5c0a3abf50b519d721d6c0c Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 12 Apr 2017 00:31:16 +0000 Subject: [PATCH 0011/1590] drm/etnaviv: fix missing unlock on error in etnaviv_gpu_submit() commit 45abdf35cf82e4270328c7237e7812de960ac560 upstream. Add the missing unlock before return from function etnaviv_gpu_submit() in the error handling case. lst: fixed label name. Fixes: f3cd1b064f11 ("drm/etnaviv: (re-)protect fence allocation with GPU mutex") Signed-off-by: Wei Yongjun Signed-off-by: Lucas Stach Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c index b87d27859141fb..a336754698f876 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c @@ -1305,7 +1305,7 @@ int etnaviv_gpu_submit(struct etnaviv_gpu *gpu, if (!fence) { event_free(gpu, event); ret = -ENOMEM; - goto out_pm_put; + goto out_unlock; } gpu->event[event].fence = fence; @@ -1345,6 +1345,7 @@ int etnaviv_gpu_submit(struct etnaviv_gpu *gpu, hangcheck_timer_reset(gpu); ret = 0; +out_unlock: mutex_unlock(&gpu->lock); out_pm_put: From 3d8d2f2344767cbab597027e2cde21d1a8b32cca Mon Sep 17 00:00:00 2001 From: Germano Percossi Date: Fri, 7 Apr 2017 12:29:36 +0100 Subject: [PATCH 0012/1590] CIFS: reconnect thread reschedule itself commit 18ea43113f5b74a97dd4be9bddbac10d68b1a6ce upstream. In case of error, smb2_reconnect_server reschedule itself with a delay, to avoid being too aggressive. Signed-off-by: Germano Percossi Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index bdd32925a15e95..7080dac3592c85 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1987,6 +1987,9 @@ void smb2_reconnect_server(struct work_struct *work) struct cifs_tcon *tcon, *tcon2; struct list_head tmp_list; int tcon_exist = false; + int rc; + int resched = false; + /* Prevent simultaneous reconnects that can corrupt tcon->rlist list */ mutex_lock(&server->reconnect_mutex); @@ -2014,13 +2017,18 @@ void smb2_reconnect_server(struct work_struct *work) spin_unlock(&cifs_tcp_ses_lock); list_for_each_entry_safe(tcon, tcon2, &tmp_list, rlist) { - if (!smb2_reconnect(SMB2_INTERNAL_CMD, tcon)) + rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon); + if (!rc) cifs_reopen_persistent_handles(tcon); + else + resched = true; list_del_init(&tcon->rlist); cifs_put_tcon(tcon); } cifs_dbg(FYI, "Reconnecting tcons finished\n"); + if (resched) + queue_delayed_work(cifsiod_wq, &server->reconnect, 2 * HZ); mutex_unlock(&server->reconnect_mutex); /* now we can safely release srv struct */ From 730fecb3401f8a7cc84e8d7b454976efdfdc65c9 Mon Sep 17 00:00:00 2001 From: Germano Percossi Date: Fri, 7 Apr 2017 12:29:38 +0100 Subject: [PATCH 0013/1590] CIFS: store results of cifs_reopen_file to avoid infinite wait commit 1fa839b4986d648b907d117275869a0e46c324b9 upstream. This fixes Continuous Availability when errors during file reopen are encountered. cifs_user_readv and cifs_user_writev would wait for ever if results of cifs_reopen_file are not stored and for later inspection. In fact, results are checked and, in case of errors, a chain of function calls leading to reads and writes to be scheduled in a separate thread is skipped. These threads will wake up the corresponding waiters once reads and writes are done. However, given the return value is not stored, when rc is checked for errors a previous one (always zero) is inspected instead. This leads to pending reads/writes added to the list, making cifs_user_readv and cifs_user_writev wait for ever. Signed-off-by: Germano Percossi Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 1cd0e2eefc667a..3925758f6ddea4 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2597,7 +2597,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, wdata->credits = credits; if (!wdata->cfile->invalidHandle || - !cifs_reopen_file(wdata->cfile, false)) + !(rc = cifs_reopen_file(wdata->cfile, false))) rc = server->ops->async_writev(wdata, cifs_uncached_writedata_release); if (rc) { @@ -3002,7 +3002,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, rdata->credits = credits; if (!rdata->cfile->invalidHandle || - !cifs_reopen_file(rdata->cfile, true)) + !(rc = cifs_reopen_file(rdata->cfile, true))) rc = server->ops->async_readv(rdata); error: if (rc) { @@ -3577,7 +3577,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, } if (!rdata->cfile->invalidHandle || - !cifs_reopen_file(rdata->cfile, true)) + !(rc = cifs_reopen_file(rdata->cfile, true))) rc = server->ops->async_readv(rdata); if (rc) { add_credits_and_wake_if(server, rdata->credits, 0); From 0ea2dcf1f9e691645c22d9103d699e834a7fb8e6 Mon Sep 17 00:00:00 2001 From: Cameron Gutman Date: Mon, 10 Apr 2017 20:44:25 -0700 Subject: [PATCH 0014/1590] Input: xpad - add support for Razer Wildcat gamepad commit 5376366886251e2f8f248704adb620a4bc4c0937 upstream. Signed-off-by: Cameron Gutman Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/joystick/xpad.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index bbe15243b8e7ab..f397a5b6910fcf 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -201,6 +201,7 @@ static const struct xpad_device { { 0x1430, 0x8888, "TX6500+ Dance Pad (first generation)", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX }, { 0x146b, 0x0601, "BigBen Interactive XBOX 360 Controller", 0, XTYPE_XBOX360 }, { 0x1532, 0x0037, "Razer Sabertooth", 0, XTYPE_XBOX360 }, + { 0x1532, 0x0a03, "Razer Wildcat", 0, XTYPE_XBOXONE }, { 0x15e4, 0x3f00, "Power A Mini Pro Elite", 0, XTYPE_XBOX360 }, { 0x15e4, 0x3f0a, "Xbox Airflo wired controller", 0, XTYPE_XBOX360 }, { 0x15e4, 0x3f10, "Batarang Xbox 360 controller", 0, XTYPE_XBOX360 }, @@ -329,6 +330,7 @@ static struct usb_device_id xpad_table[] = { XPAD_XBOX360_VENDOR(0x24c6), /* PowerA Controllers */ XPAD_XBOXONE_VENDOR(0x24c6), /* PowerA Controllers */ XPAD_XBOX360_VENDOR(0x1532), /* Razer Sabertooth */ + XPAD_XBOXONE_VENDOR(0x1532), /* Razer Wildcat */ XPAD_XBOX360_VENDOR(0x15e4), /* Numark X-Box 360 controllers */ XPAD_XBOX360_VENDOR(0x162e), /* Joytech X-Box 360 controllers */ { } From 085656dad4b0aa530ecd8730c00381abbd01e2c6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 11 Apr 2017 10:10:28 +0200 Subject: [PATCH 0015/1590] perf/x86: Avoid exposing wrong/stale data in intel_pmu_lbr_read_32() commit f2200ac311302fcdca6556fd0c5127eab6c65a3e upstream. When the perf_branch_entry::{in_tx,abort,cycles} fields were added, intel_pmu_lbr_read_32() wasn't updated to initialize them. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Fixes: 135c5612c460 ("perf/x86/intel: Support Haswell/v4 LBR format") Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/events/intel/lbr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 81b321ace8e019..f924629836a8ec 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -507,6 +507,9 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].to = msr_lastbranch.to; cpuc->lbr_entries[i].mispred = 0; cpuc->lbr_entries[i].predicted = 0; + cpuc->lbr_entries[i].in_tx = 0; + cpuc->lbr_entries[i].abort = 0; + cpuc->lbr_entries[i].cycles = 0; cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; From 41d8b02f6448af73203720c5ecf52eab798b1a40 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 12 Apr 2017 16:27:19 +0100 Subject: [PATCH 0016/1590] x86/efi: Don't try to reserve runtime regions commit 6f6266a561306e206e0e31a5038f029b6a7b1d89 upstream. Reserving a runtime region results in splitting the EFI memory descriptors for the runtime region. This results in runtime region descriptors with bogus memory mappings, leading to interesting crashes like the following during a kexec: general protection fault: 0000 [#1] SMP Modules linked in: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.11.0-rc1 #53 Hardware name: Wiwynn Leopard-Orv2/Leopard-DDR BW, BIOS LBM05 09/30/2016 RIP: 0010:virt_efi_set_variable() ... Call Trace: efi_delete_dummy_variable() efi_enter_virtual_mode() start_kernel() ? set_init_arg() x86_64_start_reservations() x86_64_start_kernel() start_cpu() ... Kernel panic - not syncing: Fatal exception Runtime regions will not be freed and do not need to be reserved, so skip the memmap modification in this case. Signed-off-by: Omar Sandoval Signed-off-by: Matt Fleming Cc: Ard Biesheuvel Cc: Dave Young Cc: Linus Torvalds Cc: Peter Jones Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Fixes: 8e80632fb23f ("efi/esrt: Use efi_mem_reserve() and avoid a kmalloc()") Link: http://lkml.kernel.org/r/20170412152719.9779-2-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/platform/efi/quirks.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 30031d5293c483..cdfe8c62895981 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -201,6 +201,10 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size) return; } + /* No need to reserve regions that will never be freed. */ + if (md.attribute & EFI_MEMORY_RUNTIME) + return; + size += addr % EFI_PAGE_SIZE; size = round_up(size, EFI_PAGE_SIZE); addr = round_down(addr, EFI_PAGE_SIZE); From 59bf2308895337fdf70b3a183e7f3c8723030982 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 4 Apr 2017 18:15:01 +0200 Subject: [PATCH 0017/1590] x86/signals: Fix lower/upper bound reporting in compat siginfo commit cfac6dfa42bddfa9711b20d486e521d1a41ab09f upstream. Put the right values from the original siginfo into the userspace compat-siginfo. This fixes the 32-bit MPX "tabletest" testcase on 64-bit kernels. Signed-off-by: Joerg Roedel Acked-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: a4455082dc6f0 ('x86/signals: Add missing signal_compat code for x86 features') Link: http://lkml.kernel.org/r/1491322501-5054-1-git-send-email-joro@8bytes.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/signal_compat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index ec1f756f9dc9ac..71beb28600d453 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -151,8 +151,8 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, if (from->si_signo == SIGSEGV) { if (from->si_code == SEGV_BNDERR) { - compat_uptr_t lower = (unsigned long)&to->si_lower; - compat_uptr_t upper = (unsigned long)&to->si_upper; + compat_uptr_t lower = (unsigned long)from->si_lower; + compat_uptr_t upper = (unsigned long)from->si_upper; put_user_ex(lower, &to->si_lower); put_user_ex(upper, &to->si_upper); } From a9826aa4860a9ca1268a9ab5c10fc5e65a72b4ef Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 6 Apr 2017 09:04:31 -0700 Subject: [PATCH 0018/1590] x86, pmem: fix broken __copy_user_nocache cache-bypass assumptions commit 11e63f6d920d6f2dfd3cd421e939a4aec9a58dcd upstream. Before we rework the "pmem api" to stop abusing __copy_user_nocache() for memcpy_to_pmem() we need to fix cases where we may strand dirty data in the cpu cache. The problem occurs when copy_from_iter_pmem() is used for arbitrary data transfers from userspace. There is no guarantee that these transfers, performed by dax_iomap_actor(), will have aligned destinations or aligned transfer lengths. Backstop the usage __copy_user_nocache() with explicit cache management in these unaligned cases. Yes, copy_from_iter_pmem() is now too big for an inline, but addressing that is saved for a later patch that moves the entirety of the "pmem api" into the pmem driver directly. Fixes: 5de490daec8b ("pmem: add copy_from_iter_pmem() and clear_pmem()") Cc: Cc: Jan Kara Cc: Jeff Moyer Cc: Ingo Molnar Cc: Christoph Hellwig Cc: "H. Peter Anvin" Cc: Al Viro Cc: Thomas Gleixner Cc: Matthew Wilcox Reviewed-by: Ross Zwisler Signed-off-by: Toshi Kani Signed-off-by: Dan Williams Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pmem.h | 42 +++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index 2c1ebeb4d7376d..529bb4a6487a9e 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -55,7 +55,8 @@ static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n) * @size: number of bytes to write back * * Write back a cache range using the CLWB (cache line write back) - * instruction. + * instruction. Note that @size is internally rounded up to be cache + * line size aligned. */ static inline void arch_wb_cache_pmem(void *addr, size_t size) { @@ -69,15 +70,6 @@ static inline void arch_wb_cache_pmem(void *addr, size_t size) clwb(p); } -/* - * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec - * iterators, so for other types (bvec & kvec) we must do a cache write-back. - */ -static inline bool __iter_needs_pmem_wb(struct iov_iter *i) -{ - return iter_is_iovec(i) == false; -} - /** * arch_copy_from_iter_pmem - copy data from an iterator to PMEM * @addr: PMEM destination address @@ -94,7 +86,35 @@ static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes, /* TODO: skip the write-back by always using non-temporal stores */ len = copy_from_iter_nocache(addr, bytes, i); - if (__iter_needs_pmem_wb(i)) + /* + * In the iovec case on x86_64 copy_from_iter_nocache() uses + * non-temporal stores for the bulk of the transfer, but we need + * to manually flush if the transfer is unaligned. A cached + * memory copy is used when destination or size is not naturally + * aligned. That is: + * - Require 8-byte alignment when size is 8 bytes or larger. + * - Require 4-byte alignment when size is 4 bytes. + * + * In the non-iovec case the entire destination needs to be + * flushed. + */ + if (iter_is_iovec(i)) { + unsigned long flushed, dest = (unsigned long) addr; + + if (bytes < 8) { + if (!IS_ALIGNED(dest, 4) || (bytes != 4)) + arch_wb_cache_pmem(addr, 1); + } else { + if (!IS_ALIGNED(dest, 8)) { + dest = ALIGN(dest, boot_cpu_data.x86_clflush_size); + arch_wb_cache_pmem(addr, 1); + } + + flushed = dest - (unsigned long) addr; + if (bytes > flushed && !IS_ALIGNED(bytes - flushed, 8)) + arch_wb_cache_pmem(addr + bytes - 1, 1); + } + } else arch_wb_cache_pmem(addr, bytes); return len; From ec980b6f7dcc006f6859c0871c0d7298281f0394 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Mon, 10 Apr 2017 17:14:27 +0200 Subject: [PATCH 0019/1590] x86/vdso: Ensure vdso32_enabled gets set to valid values only commit c06989da39cdb10604d572c8c7ea8c8c97f3c483 upstream. vdso_enabled can be set to arbitrary integer values via the kernel command line 'vdso32=' parameter or via 'sysctl abi.vsyscall32'. load_vdso32() only maps VDSO if vdso_enabled == 1, but ARCH_DLINFO_IA32 merily checks for vdso_enabled != 0. As a consequence the AT_SYSINFO_EHDR auxiliary vector for the VDSO_ENTRY is emitted with a NULL pointer which causes a segfault when the application tries to use the VDSO. Restrict the valid arguments on the command line and the sysctl to 0 and 1. Fixes: b0b49f2673f0 ("x86, vdso: Remove compat vdso support") Signed-off-by: Mathias Krause Acked-by: Andy Lutomirski Cc: Peter Zijlstra Cc: Roland McGrath Link: http://lkml.kernel.org/r/1491424561-7187-1-git-send-email-minipli@googlemail.com Link: http://lkml.kernel.org/r/20170410151723.518412863@linutronix.de Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/vdso/vdso32-setup.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index 7853b53959cd35..3f9d1a83891adf 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -30,8 +30,10 @@ static int __init vdso32_setup(char *s) { vdso32_enabled = simple_strtoul(s, NULL, 0); - if (vdso32_enabled > 1) + if (vdso32_enabled > 1) { pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n"); + vdso32_enabled = 0; + } return 1; } @@ -62,13 +64,18 @@ subsys_initcall(sysenter_setup); /* Register vsyscall32 into the ABI table */ #include +static const int zero; +static const int one = 1; + static struct ctl_table abi_table2[] = { { .procname = "vsyscall32", .data = &vdso32_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = (int *)&zero, + .extra2 = (int *)&one, }, {} }; From 5e29a45f1ef042a87933c0a72eb84fbc490358fa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Apr 2017 17:14:28 +0200 Subject: [PATCH 0020/1590] x86/vdso: Plug race between mapping and ELF header setup commit 6fdc6dd90272ce7e75d744f71535cfbd8d77da81 upstream. The vsyscall32 sysctl can racy against a concurrent fork when it switches from disabled to enabled: arch_setup_additional_pages() if (vdso32_enabled) --> No mapping sysctl.vsysscall32() --> vdso32_enabled = true create_elf_tables() ARCH_DLINFO_IA32 if (vdso32_enabled) { --> Add VDSO entry with NULL pointer Make ARCH_DLINFO_IA32 check whether the VDSO mapping has been set up for the newly forked process or not. Signed-off-by: Thomas Gleixner Acked-by: Andy Lutomirski Cc: Peter Zijlstra Cc: Mathias Krause Link: http://lkml.kernel.org/r/20170410151723.602367196@linutronix.de Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/elf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index e7f155c3045e1b..94aad6364b4742 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -278,7 +278,7 @@ struct task_struct; #define ARCH_DLINFO_IA32 \ do { \ - if (vdso32_enabled) { \ + if (VDSO_CURRENT_BASE) { \ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \ } \ From 0b914aa8cdc68c4e97ee48e5143cecd514cf1e6d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 27 Mar 2017 21:53:38 -0700 Subject: [PATCH 0021/1590] acpi, nfit, libnvdimm: fix interleave set cookie calculation (64-bit comparison) commit b03b99a329a14b7302f37c3ea6da3848db41c8c5 upstream. While reviewing the -stable patch for commit 86ef58a4e35e "nfit, libnvdimm: fix interleave set cookie calculation" Ben noted: "This is returning an int, thus it's effectively doing a 32-bit comparison and not the 64-bit comparison you say is needed." Update the compare operation to be immune to this integer demotion problem. Cc: Nicholas Moulin Fixes: 86ef58a4e35e ("nfit, libnvdimm: fix interleave set cookie calculation") Reported-by: Ben Hutchings Signed-off-by: Dan Williams Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/nfit/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index d1664df001f83c..9ef3941eeff0f0 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -1617,7 +1617,11 @@ static int cmp_map(const void *m0, const void *m1) const struct nfit_set_info_map *map0 = m0; const struct nfit_set_info_map *map1 = m1; - return map0->region_offset - map1->region_offset; + if (map0->region_offset < map1->region_offset) + return -1; + else if (map0->region_offset > map1->region_offset) + return 1; + return 0; } /* Retrieve the nth entry referencing this spa */ From 6ef2f0178649b3d3c502217c22c2dfe365c74091 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 11 Apr 2017 00:23:42 +0200 Subject: [PATCH 0022/1590] ACPI / scan: Set the visited flag for all enumerated devices commit f406270bf73d71ea7b35ee3f7a08a44f6594c9b1 upstream. Commit 10c7e20b2ff3 (ACPI / scan: fix enumeration (visited) flags for bus rescans) attempted to fix a problem with ACPI-based enumerateion of I2C/SPI devices, but it forgot to ensure that the visited flag will be set for all of the other enumerated devices, so fix that. Fixes: 10c7e20b2ff3 (ACPI / scan: fix enumeration (visited) flags for bus rescans) Link: https://bugzilla.kernel.org/show_bug.cgi?id=194885 Reported-and-tested-by: Kevin Locke Signed-off-by: Rafael J. Wysocki Reviewed-by: Mika Westerberg Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/scan.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 5a2fdf156ec9e5..dd3786acba8925 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1827,15 +1827,20 @@ static void acpi_bus_attach(struct acpi_device *device) return; device->flags.match_driver = true; - if (!ret) { - ret = device_attach(&device->dev); - if (ret < 0) - return; - - if (!ret && device->pnp.type.platform_id) - acpi_default_enumeration(device); + if (ret > 0) { + acpi_device_set_enumerated(device); + goto ok; } + ret = device_attach(&device->dev); + if (ret < 0) + return; + + if (ret > 0 || !device->pnp.type.platform_id) + acpi_device_set_enumerated(device); + else + acpi_default_enumeration(device); + ok: list_for_each_entry(child, &device->children, node) acpi_bus_attach(child); From 1681bab7c450508b26f33368a8d1ec57d3fabfcc Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 14 Apr 2017 14:15:20 -0400 Subject: [PATCH 0023/1590] parisc: fix bugs in pa_memcpy commit 409c1b250e30ad0e48b4d15d7319b4e18c046c4f upstream. The patch 554bfeceb8a22d448cd986fc9efce25e833278a1 ("parisc: Fix access fault handling in pa_memcpy()") reimplements the pa_memcpy function. Unfortunatelly, it makes the kernel unbootable. The crash happens in the function ide_complete_cmd where memcpy is called with the same source and destination address. This patch fixes a few bugs in pa_memcpy: * When jumping to .Lcopy_loop_16 for the first time, don't skip the instruction "ldi 31,t0" (this bug made the kernel unbootable) * Use the COND macro when comparing length, so that the comparison is 64-bit (a theoretical issue, in case the length is greater than 0xffffffff) * Don't use the COND macro after the "extru" instruction (the PA-RISC specification says that the upper 32-bits of extru result are undefined, although they are set to zero in practice) * Fix exception addresses in .Lcopy16_fault and .Lcopy8_fault * Rename .Lcopy_loop_4 to .Lcopy_loop_8 (so that it is consistent with .Lcopy8_fault) Fixes: 554bfeceb8a2 ("parisc: Fix access fault handling in pa_memcpy()") Signed-off-by: Mikulas Patocka Signed-off-by: Helge Deller Signed-off-by: Greg Kroah-Hartman --- arch/parisc/lib/lusercopy.S | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/parisc/lib/lusercopy.S b/arch/parisc/lib/lusercopy.S index f01188c044ee83..85c28bb80fb743 100644 --- a/arch/parisc/lib/lusercopy.S +++ b/arch/parisc/lib/lusercopy.S @@ -201,7 +201,7 @@ ENTRY_CFI(pa_memcpy) add dst,len,end /* short copy with less than 16 bytes? */ - cmpib,>>=,n 15,len,.Lbyte_loop + cmpib,COND(>>=),n 15,len,.Lbyte_loop /* same alignment? */ xor src,dst,t0 @@ -216,7 +216,7 @@ ENTRY_CFI(pa_memcpy) /* loop until we are 64-bit aligned */ .Lalign_loop64: extru dst,31,3,t1 - cmpib,=,n 0,t1,.Lcopy_loop_16 + cmpib,=,n 0,t1,.Lcopy_loop_16_start 20: ldb,ma 1(srcspc,src),t1 21: stb,ma t1,1(dstspc,dst) b .Lalign_loop64 @@ -225,6 +225,7 @@ ENTRY_CFI(pa_memcpy) ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done) ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done) +.Lcopy_loop_16_start: ldi 31,t0 .Lcopy_loop_16: cmpb,COND(>>=),n t0,len,.Lword_loop @@ -267,7 +268,7 @@ ENTRY_CFI(pa_memcpy) /* loop until we are 32-bit aligned */ .Lalign_loop32: extru dst,31,2,t1 - cmpib,=,n 0,t1,.Lcopy_loop_4 + cmpib,=,n 0,t1,.Lcopy_loop_8 20: ldb,ma 1(srcspc,src),t1 21: stb,ma t1,1(dstspc,dst) b .Lalign_loop32 @@ -277,7 +278,7 @@ ENTRY_CFI(pa_memcpy) ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done) -.Lcopy_loop_4: +.Lcopy_loop_8: cmpib,COND(>>=),n 15,len,.Lbyte_loop 10: ldw 0(srcspc,src),t1 @@ -299,7 +300,7 @@ ENTRY_CFI(pa_memcpy) ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done) ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done) - b .Lcopy_loop_4 + b .Lcopy_loop_8 ldo -16(len),len .Lbyte_loop: @@ -324,7 +325,7 @@ ENTRY_CFI(pa_memcpy) .Lunaligned_copy: /* align until dst is 32bit-word-aligned */ extru dst,31,2,t1 - cmpib,COND(=),n 0,t1,.Lcopy_dstaligned + cmpib,=,n 0,t1,.Lcopy_dstaligned 20: ldb 0(srcspc,src),t1 ldo 1(src),src 21: stb,ma t1,1(dstspc,dst) @@ -362,7 +363,7 @@ ENTRY_CFI(pa_memcpy) cmpiclr,<> 1,t0,%r0 b,n .Lcase1 .Lcase0: - cmpb,= %r0,len,.Lcda_finish + cmpb,COND(=) %r0,len,.Lcda_finish nop 1: ldw,ma 4(srcspc,src), a3 @@ -376,7 +377,7 @@ ENTRY_CFI(pa_memcpy) 1: ldw,ma 4(srcspc,src), a3 ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault) ldo -1(len),len - cmpb,=,n %r0,len,.Ldo0 + cmpb,COND(=),n %r0,len,.Ldo0 .Ldo4: 1: ldw,ma 4(srcspc,src), a0 ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault) @@ -402,7 +403,7 @@ ENTRY_CFI(pa_memcpy) 1: stw,ma t0, 4(dstspc,dst) ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done) ldo -4(len),len - cmpb,<> %r0,len,.Ldo4 + cmpb,COND(<>) %r0,len,.Ldo4 nop .Ldo0: shrpw a2, a3, %sar, t0 @@ -436,14 +437,14 @@ ENTRY_CFI(pa_memcpy) /* fault exception fixup handlers: */ #ifdef CONFIG_64BIT .Lcopy16_fault: -10: b .Lcopy_done - std,ma t1,8(dstspc,dst) + b .Lcopy_done +10: std,ma t1,8(dstspc,dst) ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done) #endif .Lcopy8_fault: -10: b .Lcopy_done - stw,ma t1,4(dstspc,dst) + b .Lcopy_done +10: stw,ma t1,4(dstspc,dst) ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done) .exit From 60174fb3eaa6c84273f282125c4cccdc701549a8 Mon Sep 17 00:00:00 2001 From: "Cohen, Eugene" Date: Tue, 4 Apr 2017 16:27:43 +0100 Subject: [PATCH 0024/1590] efi/libstub: Skip GOP with PIXEL_BLT_ONLY format commit 540f4c0e894f7e46a66dfa424b16424cbdc12c38 upstream. The UEFI Specification permits Graphics Output Protocol (GOP) instances without direct framebuffer access. This is indicated in the Mode structure with a PixelFormat enumeration value of PIXEL_BLT_ONLY. Given that the kernel does not know how to drive a Blt() only framebuffer (which is only permitted before ExitBootServices() anyway), we should disregard such framebuffers when looking for a GOP instance that is suitable for use as the boot console. So modify the EFI GOP initialization to not use a PIXEL_BLT_ONLY instance, preventing attempts later in boot to use an invalid screen_info.lfb_base address. Signed-off-by: Eugene Cohen [ Moved the Blt() only check into the loop and clarified that Blt() only GOPs are unusable by the kernel. ] Signed-off-by: Ard Biesheuvel Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: leif.lindholm@linaro.org Cc: linux-efi@vger.kernel.org Cc: lorenzo.pieralisi@arm.com Fixes: 9822504c1fa5 ("efifb: Enable the efi-framebuffer platform driver ...") Link: http://lkml.kernel.org/r/20170404152744.26687-2-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efi/libstub/gop.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/efi/libstub/gop.c b/drivers/firmware/efi/libstub/gop.c index 932742e4cf2314..24c461dea7afb1 100644 --- a/drivers/firmware/efi/libstub/gop.c +++ b/drivers/firmware/efi/libstub/gop.c @@ -149,7 +149,8 @@ setup_gop32(efi_system_table_t *sys_table_arg, struct screen_info *si, status = __gop_query32(sys_table_arg, gop32, &info, &size, ¤t_fb_base); - if (status == EFI_SUCCESS && (!first_gop || conout_found)) { + if (status == EFI_SUCCESS && (!first_gop || conout_found) && + info->pixel_format != PIXEL_BLT_ONLY) { /* * Systems that use the UEFI Console Splitter may * provide multiple GOP devices, not all of which are @@ -266,7 +267,8 @@ setup_gop64(efi_system_table_t *sys_table_arg, struct screen_info *si, status = __gop_query64(sys_table_arg, gop64, &info, &size, ¤t_fb_base); - if (status == EFI_SUCCESS && (!first_gop || conout_found)) { + if (status == EFI_SUCCESS && (!first_gop || conout_found) && + info->pixel_format != PIXEL_BLT_ONLY) { /* * Systems that use the UEFI Console Splitter may * provide multiple GOP devices, not all of which are From eff58f9084a099fd7afe54b0bdcb68d4fbdbdd89 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 4 Apr 2017 16:27:44 +0100 Subject: [PATCH 0025/1590] efi/fb: Avoid reconfiguration of BAR that covers the framebuffer commit 55d728a40d368ba80443be85c02e641fc9082a3f upstream. On UEFI systems, the PCI subsystem is enumerated by the firmware, and if a graphical framebuffer is exposed via a PCI device, its base address and size are exposed to the OS via the Graphics Output Protocol (GOP). On arm64 PCI systems, the entire PCI hierarchy is reconfigured from scratch at boot. This may result in the GOP framebuffer address to become stale, if the BAR covering the framebuffer is modified. This will cause the framebuffer to become unresponsive, and may in some cases result in unpredictable behavior if the range is reassigned to another device. So add a non-x86 quirk to the EFI fb driver to find the BAR associated with the GOP base address, and claim the BAR resource so that the PCI core will not move it. Signed-off-by: Ard Biesheuvel Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Jones Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: leif.lindholm@linaro.org Cc: linux-efi@vger.kernel.org Cc: lorenzo.pieralisi@arm.com Fixes: 9822504c1fa5 ("efifb: Enable the efi-framebuffer platform driver ...") Link: http://lkml.kernel.org/r/20170404152744.26687-3-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- drivers/video/fbdev/efifb.c | 66 ++++++++++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/drivers/video/fbdev/efifb.c b/drivers/video/fbdev/efifb.c index 37a37c4d04cbeb..6f2e729a308f78 100644 --- a/drivers/video/fbdev/efifb.c +++ b/drivers/video/fbdev/efifb.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include