Skip to content

Commit

Permalink
mm: replace remap_file_pages() syscall with emulation
Browse files Browse the repository at this point in the history
remap_file_pages(2) was invented to efficiently map parts of a
huge file into a limited 32-bit virtual address space, such as in database
workloads.

Nonlinear mappings are a pain to support and it seems there are no
legitimate use-cases nowadays since 64-bit systems are widely available.

Let's drop it and get rid of all this special-cased code.

The patch replaces the syscall with emulation which creates a new VMA on
each remap_file_pages(), unless it can be merged with an adjacent
one.

I didn't find *any* real code that uses remap_file_pages(2) to test
emulation impact on.  I've checked Debian code search and source of all
packages in ALT Linux.  No real users: libc wrappers, mentions in
strace, gdb, valgrind and this kind of stuff.

There are a few basic tests in LTP for the syscall.  They work just fine
with emulation.

To test performance impact, I've written a small test case which
demonstrates pretty much the worst-case scenario: map a 4G shmfs file, write
the page's pgoff to the beginning of every page, remap pages in reverse order,
read every page.

The test creates 1 million VMAs if emulation is in use, so I had to
set vm.max_map_count to 1100000 to avoid -ENOMEM.

Before:		23.3 ( +-  4.31% ) seconds
After:		43.9 ( +-  0.85% ) seconds
Slowdown:	1.88x

I believe we can live with that.

Test case:

        #define _GNU_SOURCE
        #include <assert.h>
        #include <stdlib.h>
        #include <stdio.h>
        #include <sys/mman.h>

        #define MB	(1024UL * 1024)
        #define SIZE	(4096 * MB)

        /*
         * Worst-case exerciser for remap_file_pages(): ten times over, map a
         * SIZE-byte shared anonymous region, stamp every 4096-byte page with its
         * own index, remap the pages in reverse order one page at a time, then
         * check that every page now shows the mirrored index.
         *
         * Returns 0 on success and -1 if mmap() or remap_file_pages() fails.
         */
        int main(int argc, char **argv)
        {
                const long nr_pages = SIZE / 4096;
                unsigned long *p;
                long i, pass;

                for (pass = 0; pass < 10; pass++) {
                        p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
                                 MAP_SHARED | MAP_ANONYMOUS, -1, 0);
                        if (p == MAP_FAILED) {
                                perror("mmap");
                                return -1;
                        }

                        /* Stamp the first word of each page with the page's index. */
                        i = 0;
                        while (i < nr_pages) {
                                p[i * 4096 / sizeof(*p)] = i;
                                i++;
                        }

                        /* Point page i of the window at offset page nr_pages - 1 - i. */
                        for (i = 0; i < nr_pages; i++) {
                                unsigned long *page = p + i * 4096 / sizeof(*p);
                                unsigned long pgoff = (SIZE - 4096 * (i + 1)) >> 12;

                                if (remap_file_pages(page, 4096, 0, pgoff, 0) == 0)
                                        continue;

                                perror("remap_file_pages");
                                return -1;
                        }

                        /* Walk backwards; each page must hold its mirror's stamp. */
                        for (i = SIZE / 4096 - 1; i >= 0; i--)
                                assert(p[i * 4096 / sizeof(*p)] == SIZE / 4096 - i - 1);

                        munmap(p, SIZE);
                }

                return 0;
        }

[[email protected]: fix spello]
[[email protected]: initialize populate before usage]
[[email protected]: grab file ref to prevent race while mmaping]
Signed-off-by: "Kirill A. Shutemov" <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Dave Jones <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Armin Rigo <[email protected]>
Signed-off-by: Sasha Levin <[email protected]>
Cc: Hugh Dickins <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
kiryl authored and torvalds committed Feb 10, 2015
1 parent 3c48687 commit c8d78c1
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 298 deletions.
7 changes: 3 additions & 4 deletions Documentation/vm/remap_file_pages.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,9 @@ on 32-bit systems to map files bigger than can linearly fit into 32-bit
virtual address space. This use-case is not critical anymore since 64-bit
systems are widely available.

The plan is to deprecate the syscall and replace it with an emulation.
The emulation will create new VMAs instead of nonlinear mappings. It's
going to work slower for rare users of remap_file_pages() but ABI is
preserved.
The syscall is deprecated and has been replaced with an emulation. The
emulation creates new VMAs instead of nonlinear mappings. It's going to
work slower for rare users of remap_file_pages() but ABI is preserved.

One side effect of emulation (apart from performance) is that user can hit
vm.max_map_count limit more easily due to additional VMAs. See comment for
Expand Down
8 changes: 6 additions & 2 deletions include/linux/fs.h
Original file line number Diff line number Diff line change
Expand Up @@ -2481,8 +2481,12 @@ extern int sb_min_blocksize(struct super_block *, int);

extern int generic_file_mmap(struct file *, struct vm_area_struct *);
extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr,
unsigned long size, pgoff_t pgoff);
/*
 * Stub left behind for generic_file_remap_pages(): it must never actually
 * run. BUG() fires unconditionally; the trailing return only gives the
 * non-void function a value.
 * NOTE(review): presumably kept so remaining references still compile
 * until they are removed; confirm against the rest of the series.
 */
static inline int generic_file_remap_pages(struct vm_area_struct *vma,
					   unsigned long addr,
					   unsigned long size,
					   pgoff_t pgoff)
{
	BUG();
	return 0;
}
int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
Expand Down
2 changes: 1 addition & 1 deletion mm/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#

mmu-y := nommu.o
mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \
mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
vmalloc.o pagewalk.o pgtable-generic.o

Expand Down
283 changes: 0 additions & 283 deletions mm/fremap.c

This file was deleted.

69 changes: 69 additions & 0 deletions mm/mmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -2634,6 +2634,75 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
return vm_munmap(addr, len);
}


/*
 * Emulation of deprecated remap_file_pages() syscall.
 *
 * Rather than building a nonlinear mapping, the range is mapped again on top
 * of itself: do_mmap_pgoff() is called on the file backing the existing VMA
 * with MAP_SHARED | MAP_FIXED at the requested page offset.
 *
 * @start: start address of the range; masked with PAGE_MASK before use.
 * @size:  length of the range in bytes; masked with PAGE_MASK before use.
 * @prot:  must be 0, anything else fails with -EINVAL. The protection used
 *         for the new mapping is rebuilt from the existing VMA's flags.
 * @pgoff: offset (in pages) that @start should map to.
 * @flags: only MAP_NONBLOCK is kept; every other bit is discarded.
 *
 * Returns 0 on success (including the case where nothing had to change),
 * -EINVAL when one of the checks below fails, or the error value produced
 * by do_mmap_pgoff().
 */
SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
{

struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long populate = 0;
unsigned long ret = -EINVAL;
struct file *file;

/* One-time deprecation notice naming the calling task. */
pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
"See Documentation/vm/remap_file_pages.txt.\n",
current->comm, current->pid);

/* A non-zero prot is rejected outright (ret is still -EINVAL here). */
if (prot)
return ret;
start = start & PAGE_MASK;
size = size & PAGE_MASK;

/* Rejects a size that is zero after masking as well as address wrap. */
if (start + size <= start)
return ret;

/* Does pgoff wrap? */
if (pgoff + (size >> PAGE_SHIFT) < pgoff)
return ret;

/* Everything from the VMA lookup to the remap runs under mmap_sem (write). */
down_write(&mm->mmap_sem);
vma = find_vma(mm, start);

/* Only shared mappings can be remapped. */
if (!vma || !(vma->vm_flags & VM_SHARED))
goto out;

/* The whole range has to sit inside this single VMA. */
if (start < vma->vm_start || start + size > vma->vm_end)
goto out;

/*
 * If the requested offset is what linear_page_index() already reports
 * for start, there is nothing to remap: succeed without touching the VMA.
 */
if (pgoff == linear_page_index(vma, start)) {
ret = 0;
goto out;
}

/*
 * prot is known to be 0 at this point; rebuild the PROT_* bits from the
 * VMA's VM_READ/VM_WRITE/VM_EXEC flags. Note that '&' binds tighter than
 * '?:', so each line tests (vm_flags & VM_x).
 */
prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;

/*
 * Keep only the caller's MAP_NONBLOCK bit, then force a shared,
 * fixed-address, populating mapping.
 */
flags &= MAP_NONBLOCK;
flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
if (vma->vm_flags & VM_LOCKED) {
flags |= MAP_LOCKED;
/* drop PG_Mlocked flag for over-mapped range */
munlock_vma_pages_range(vma, start, start + size);
}

/*
 * Hold an extra reference on the backing file across do_mmap_pgoff();
 * the commit log describes this as preventing a race while mmaping.
 * NOTE(review): presumably the MAP_FIXED mapping can tear down the old
 * VMA and with it that VMA's own file reference; confirm.
 */
file = get_file(vma->vm_file);
ret = do_mmap_pgoff(vma->vm_file, start, size,
prot, flags, pgoff, &populate);
fput(file);
out:
up_write(&mm->mmap_sem);
/*
 * populate is non-zero only if do_mmap_pgoff() set it; the population
 * itself happens after mmap_sem has been released.
 */
if (populate)
mm_populate(ret, populate);
/* A non-error ret from do_mmap_pgoff() is collapsed to 0 for the caller. */
if (!IS_ERR_VALUE(ret))
ret = 0;
return ret;
}

static inline void verify_mm_writelocked(struct mm_struct *mm)
{
#ifdef CONFIG_DEBUG_VM
Expand Down
Loading

0 comments on commit c8d78c1

Please sign in to comment.