diff --git a/debian/changelog b/debian/changelog index d187062bd..78ab586b4 100644 --- a/debian/changelog +++ b/debian/changelog @@ -11,6 +11,13 @@ linux (3.16.2-4) UNRELEASED; urgency=medium * Bump ABI to 2 (Closes: #761874) * ata: Enable SATA_ZPODD * tracing: Enable TRACER_SNAPSHOT + * Add memfd_create() and shared memory sealing (Closes: #760702): + - mm: allow drivers to prevent new writable mappings + - shm: add sealing API + - shm: add memfd_create() syscall + - shm: wait for pins to be released when sealing + - mm: Add memfd_create() system call + - [arm*,m68k,mips*,powerpc*,s390*,sparc*] Wire up memfd_create() [ Ian Campbell ] * [armhf] Enable support for Exynos5 systems. (Closes: #759291) diff --git a/debian/patches/features/all/kdbus/ARM-wire-up-memfd_create-syscall.patch b/debian/patches/features/all/kdbus/ARM-wire-up-memfd_create-syscall.patch new file mode 100644 index 000000000..dee26d278 --- /dev/null +++ b/debian/patches/features/all/kdbus/ARM-wire-up-memfd_create-syscall.patch @@ -0,0 +1,35 @@ +From: Russell King +Date: Sat, 9 Aug 2014 08:43:11 +0100 +Subject: ARM: wire up memfd_create syscall +Origin: https://git.kernel.org/linus/e57e41931134e09fc6c03c8d4eb19d516cc6e59b +Bug-Debian: https://bugs.debian.org/760702 + +Add the memfd_create syscall to ARM. + +Signed-off-by: Russell King +[bwh: Backported to 3.16: + - Adjust context + - Insert unimplemented-syscall entries for seccomp and getrandom] +--- +--- a/arch/arm/include/uapi/asm/unistd.h ++++ b/arch/arm/include/uapi/asm/unistd.h +@@ -409,6 +409,7 @@ + #define __NR_sched_setattr (__NR_SYSCALL_BASE+380) + #define __NR_sched_getattr (__NR_SYSCALL_BASE+381) + #define __NR_renameat2 (__NR_SYSCALL_BASE+382) ++#define __NR_memfd_create (__NR_SYSCALL_BASE+385) + + /* + * The following SWIs are ARM private. 
+--- a/arch/arm/kernel/calls.S ++++ b/arch/arm/kernel/calls.S +@@ -392,6 +392,9 @@ + /* 380 */ CALL(sys_sched_setattr) + CALL(sys_sched_getattr) + CALL(sys_renameat2) ++ CALL(sys_ni_syscall) /* seccomp */ ++ CALL(sys_ni_syscall) /* getrandom */ ++/* 385 */ CALL(sys_memfd_create) + #ifndef syscalls_counted + .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls + #define syscalls_counted diff --git a/debian/patches/features/all/kdbus/MIPS-Wire-up-new-syscalls-getrandom-and-memfd_create.patch b/debian/patches/features/all/kdbus/MIPS-Wire-up-new-syscalls-getrandom-and-memfd_create.patch new file mode 100644 index 000000000..b94e5abd3 --- /dev/null +++ b/debian/patches/features/all/kdbus/MIPS-Wire-up-new-syscalls-getrandom-and-memfd_create.patch @@ -0,0 +1,112 @@ +From: Ralf Baechle +Date: Tue, 26 Aug 2014 03:03:40 +0200 +Subject: MIPS: Wire up new syscalls getrandom and memfd_create. +Origin: http://git.linux-mips.org/?p=ralf/upstream-sfr.git;a=commit;h=42944521af97a3b25516f15f3149aec3779656dc +Bug-Debian: https://bugs.debian.org/760702 + +Signed-off-by: Ralf Baechle +[bwh: Backported to 3.16: + - Adjust context + - Only wire up memfd_create + - Insert unimplemented-syscall entries for seccomp and getrandom] +--- +--- a/arch/mips/include/uapi/asm/unistd.h ++++ b/arch/mips/include/uapi/asm/unistd.h +@@ -372,16 +372,17 @@ + #define __NR_sched_setattr (__NR_Linux + 349) + #define __NR_sched_getattr (__NR_Linux + 350) + #define __NR_renameat2 (__NR_Linux + 351) ++#define __NR_memfd_create (__NR_Linux + 354) + + /* + * Offset of the last Linux o32 flavoured syscall + */ +-#define __NR_Linux_syscalls 351 ++#define __NR_Linux_syscalls 354 + + #endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */ + + #define __NR_O32_Linux 4000 +-#define __NR_O32_Linux_syscalls 351 ++#define __NR_O32_Linux_syscalls 354 + + #if _MIPS_SIM == _MIPS_SIM_ABI64 + +@@ -701,16 +702,17 @@ + #define __NR_sched_setattr (__NR_Linux + 309) + #define __NR_sched_getattr (__NR_Linux + 310) + #define 
__NR_renameat2 (__NR_Linux + 311) ++#define __NR_memfd_create (__NR_Linux + 314) + + /* + * Offset of the last Linux 64-bit flavoured syscall + */ +-#define __NR_Linux_syscalls 311 ++#define __NR_Linux_syscalls 314 + + #endif /* _MIPS_SIM == _MIPS_SIM_ABI64 */ + + #define __NR_64_Linux 5000 +-#define __NR_64_Linux_syscalls 311 ++#define __NR_64_Linux_syscalls 314 + + #if _MIPS_SIM == _MIPS_SIM_NABI32 + +@@ -1034,15 +1036,16 @@ + #define __NR_sched_setattr (__NR_Linux + 313) + #define __NR_sched_getattr (__NR_Linux + 314) + #define __NR_renameat2 (__NR_Linux + 315) ++#define __NR_memfd_create (__NR_Linux + 318) + + /* + * Offset of the last N32 flavoured syscall + */ +-#define __NR_Linux_syscalls 315 ++#define __NR_Linux_syscalls 318 + + #endif /* _MIPS_SIM == _MIPS_SIM_NABI32 */ + + #define __NR_N32_Linux 6000 +-#define __NR_N32_Linux_syscalls 315 ++#define __NR_N32_Linux_syscalls 318 + + #endif /* _UAPI_ASM_UNISTD_H */ +--- a/arch/mips/kernel/scall32-o32.S ++++ b/arch/mips/kernel/scall32-o32.S +@@ -578,3 +578,6 @@ EXPORT(sys_call_table) + PTR sys_sched_setattr + PTR sys_sched_getattr /* 4350 */ + PTR sys_renameat2 ++ PTR sys_ni_syscall /* seccomp */ ++ PTR sys_ni_syscall /* getrandom */ ++ PTR sys_memfd_create +--- a/arch/mips/kernel/scall64-64.S ++++ b/arch/mips/kernel/scall64-64.S +@@ -431,4 +431,7 @@ EXPORT(sys_call_table) + PTR sys_sched_setattr + PTR sys_sched_getattr /* 5310 */ + PTR sys_renameat2 ++ PTR sys_ni_syscall /* seccomp */ ++ PTR sys_ni_syscall /* getrandom */ ++ PTR sys_memfd_create + .size sys_call_table,.-sys_call_table +--- a/arch/mips/kernel/scall64-n32.S ++++ b/arch/mips/kernel/scall64-n32.S +@@ -424,4 +424,7 @@ EXPORT(sysn32_call_table) + PTR sys_sched_setattr + PTR sys_sched_getattr + PTR sys_renameat2 /* 6315 */ ++ PTR sys_ni_syscall /* seccomp */ ++ PTR sys_ni_syscall /* getrandom */ ++ PTR sys_memfd_create + .size sysn32_call_table,.-sysn32_call_table +--- a/arch/mips/kernel/scall64-o32.S ++++ b/arch/mips/kernel/scall64-o32.S +@@ -557,4 
+557,7 @@ EXPORT(sys32_call_table) + PTR sys_sched_setattr + PTR sys_sched_getattr /* 4350 */ + PTR sys_renameat2 ++ PTR sys_ni_syscall /* seccomp */ ++ PTR sys_ni_syscall /* getrandom */ ++ PTR sys_memfd_create + .size sys32_call_table,.-sys32_call_table diff --git a/debian/patches/features/all/kdbus/arm64-compat-wire-up-memfd_create-syscall.patch b/debian/patches/features/all/kdbus/arm64-compat-wire-up-memfd_create-syscall.patch new file mode 100644 index 000000000..4da43a5d0 --- /dev/null +++ b/debian/patches/features/all/kdbus/arm64-compat-wire-up-memfd_create-syscall.patch @@ -0,0 +1,24 @@ +From: Ben Hutchings +Date: Sun, 14 Sep 2014 20:08:27 +0100 +Subject: arm64: compat: wire up memfd_create syscall for aarch32 +Forwarded: not-needed +Bug-Debian: https://bugs.debian.org/760702 + +Implemented upstream by a97a42c47608d0bb6f2dfc2e162cc84a27beb43a, +but the arm64 compat layer looks rather different in 3.16. +--- +--- a/arch/arm64/include/asm/unistd32.h ++++ b/arch/arm64/include/asm/unistd32.h +@@ -404,8 +404,11 @@ __SYSCALL(379, sys_finit_module) + __SYSCALL(380, sys_sched_setattr) + __SYSCALL(381, sys_sched_getattr) + __SYSCALL(382, sys_renameat2) ++__SYSCALL(383, sys_ni_syscall) /* 383 for seccomp */ ++__SYSCALL(384, sys_ni_syscall) /* 384 for getrandom */ ++__SYSCALL(385, sys_memfd_create) + +-#define __NR_compat_syscalls 383 ++#define __NR_compat_syscalls 386 + + /* + * Compat syscall numbers used by the AArch64 kernel. 
diff --git a/debian/patches/features/all/kdbus/asm-generic-add-memfd_create-system-call-to-unistd.h.patch b/debian/patches/features/all/kdbus/asm-generic-add-memfd_create-system-call-to-unistd.h.patch new file mode 100644 index 000000000..d081993b3 --- /dev/null +++ b/debian/patches/features/all/kdbus/asm-generic-add-memfd_create-system-call-to-unistd.h.patch @@ -0,0 +1,37 @@ +From: Will Deacon +Date: Mon, 11 Aug 2014 14:24:47 +0100 +Subject: asm-generic: add memfd_create system call to unistd.h +Origin: https://git.kernel.org/linus/503e6636b6f96056210062be703356f4253b6db9 +Bug-Debian: https://bugs.debian.org/760702 + +Commit 9183df25fe7b ("shm: add memfd_create() syscall") added a new +system call (memfd_create) but didn't update the asm-generic unistd +header. + +This patch adds the new system call to the asm-generic version of +unistd.h so that it can be used by architectures such as arm64. + +Cc: Arnd Bergmann +Reviewed-by: David Herrmann +Signed-off-by: Will Deacon +[bwh: Backported to 3.16: + - Adjust context + - Insert unimplemented-syscall entries for seccomp and getrandom] +--- +--- a/include/uapi/asm-generic/unistd.h ++++ b/include/uapi/asm-generic/unistd.h +@@ -699,9 +699,13 @@ __SYSCALL(__NR_sched_setattr, sys_sched_ + __SYSCALL(__NR_sched_getattr, sys_sched_getattr) + #define __NR_renameat2 276 + __SYSCALL(__NR_renameat2, sys_renameat2) ++__SYSCALL(277, sys_ni_syscall) ++__SYSCALL(278, sys_ni_syscall) ++#define __NR_memfd_create 279 ++__SYSCALL(__NR_memfd_create, sys_memfd_create) + + #undef __NR_syscalls +-#define __NR_syscalls 277 ++#define __NR_syscalls 280 + + /* + * All syscalls below here should go away really, diff --git a/debian/patches/features/all/kdbus/m68k-Wire-up-memfd_create.patch b/debian/patches/features/all/kdbus/m68k-Wire-up-memfd_create.patch new file mode 100644 index 000000000..b6e1f3949 --- /dev/null +++ b/debian/patches/features/all/kdbus/m68k-Wire-up-memfd_create.patch @@ -0,0 +1,40 @@ +From: Geert Uytterhoeven +Date: Mon, 11 
Aug 2014 21:42:49 +0200 +Subject: m68k: Wire up memfd_create +Origin: https://git.kernel.org/linus/4ed7800987b1b082f8fc98c5cb7eb20cf74280a8 +Bug-Debian: https://bugs.debian.org/760702 + +Signed-off-by: Geert Uytterhoeven +[bwh: Backported to 3.16: + - Adjust context + - Insert unimplemented-syscall entry for getrandom] +--- +--- a/arch/m68k/include/asm/unistd.h ++++ b/arch/m68k/include/asm/unistd.h +@@ -4,7 +4,7 @@ + #include + + +-#define NR_syscalls 352 ++#define NR_syscalls 354 + + #define __ARCH_WANT_OLD_READDIR + #define __ARCH_WANT_OLD_STAT +--- a/arch/m68k/include/uapi/asm/unistd.h ++++ b/arch/m68k/include/uapi/asm/unistd.h +@@ -357,5 +357,6 @@ + #define __NR_sched_setattr 349 + #define __NR_sched_getattr 350 + #define __NR_renameat2 351 ++#define __NR_memfd_create 353 + + #endif /* _UAPI_ASM_M68K_UNISTD_H_ */ +--- a/arch/m68k/kernel/syscalltable.S ++++ b/arch/m68k/kernel/syscalltable.S +@@ -372,4 +372,6 @@ ENTRY(sys_call_table) + .long sys_sched_setattr + .long sys_sched_getattr /* 350 */ + .long sys_renameat2 ++ .long sys_ni_syscall /* getrandom */ ++ .long sys_memfd_create + diff --git a/debian/patches/features/all/kdbus/mm-allow-drivers-to-prevent-new-writable-mappings.patch b/debian/patches/features/all/kdbus/mm-allow-drivers-to-prevent-new-writable-mappings.patch new file mode 100644 index 000000000..838442d8d --- /dev/null +++ b/debian/patches/features/all/kdbus/mm-allow-drivers-to-prevent-new-writable-mappings.patch @@ -0,0 +1,191 @@ +From: David Herrmann +Date: Fri, 8 Aug 2014 14:25:25 -0700 +Subject: mm: allow drivers to prevent new writable mappings +Origin: https://git.kernel.org/linus/4bb5f5d9395bc112d93a134d8f5b05611eddc9c0 +Bug-Debian: https://bugs.debian.org/760702 + +This patch (of 6): + +The i_mmap_writable field counts existing writable mappings of an +address_space. To allow drivers to prevent new writable mappings, make +this counter signed and prevent new writable mappings if it is negative. 
+This is modelled after i_writecount and DENYWRITE. + +This will be required by the shmem-sealing infrastructure to prevent any +new writable mappings after the WRITE seal has been set. In case there +exists a writable mapping, this operation will fail with EBUSY. + +Note that we rely on the fact that iff you already own a writable mapping, +you can increase the counter without using the helpers. This is the same +that we do for i_writecount. + +Signed-off-by: David Herrmann +Acked-by: Hugh Dickins +Cc: Michael Kerrisk +Cc: Ryan Lortie +Cc: Lennart Poettering +Cc: Daniel Mack +Cc: Andy Lutomirski +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +[bwh: Adjust context to apply after aufs3-mmap.patch] +--- + fs/inode.c | 1 + + include/linux/fs.h | 29 +++++++++++++++++++++++++++-- + kernel/fork.c | 2 +- + mm/mmap.c | 30 ++++++++++++++++++++++++------ + mm/swap_state.c | 1 + + 5 files changed, 54 insertions(+), 9 deletions(-) + +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -166,6 +166,7 @@ int inode_init_always(struct super_block + mapping->a_ops = &empty_aops; + mapping->host = inode; + mapping->flags = 0; ++ atomic_set(&mapping->i_mmap_writable, 0); + mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); + mapping->private_data = NULL; + mapping->backing_dev_info = &default_backing_dev_info; +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -387,7 +387,7 @@ struct address_space { + struct inode *host; /* owner: inode, block_device */ + struct radix_tree_root page_tree; /* radix tree of all pages */ + spinlock_t tree_lock; /* and lock protecting it */ +- unsigned int i_mmap_writable;/* count VM_SHARED mappings */ ++ atomic_t i_mmap_writable;/* count VM_SHARED mappings */ + struct rb_root i_mmap; /* tree of private and shared mappings */ + struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ + struct mutex i_mmap_mutex; /* protect tree, count, list */ +@@ -470,10 +470,35 @@ static inline int mapping_mapped(struct + * Note that i_mmap_writable 
counts all VM_SHARED vmas: do_mmap_pgoff + * marks vma as VM_SHARED if it is shared, and the file was opened for + * writing i.e. vma may be mprotected writable even if now readonly. ++ * ++ * If i_mmap_writable is negative, no new writable mappings are allowed. You ++ * can only deny writable mappings, if none exists right now. + */ + static inline int mapping_writably_mapped(struct address_space *mapping) + { +- return mapping->i_mmap_writable != 0; ++ return atomic_read(&mapping->i_mmap_writable) > 0; ++} ++ ++static inline int mapping_map_writable(struct address_space *mapping) ++{ ++ return atomic_inc_unless_negative(&mapping->i_mmap_writable) ? ++ 0 : -EPERM; ++} ++ ++static inline void mapping_unmap_writable(struct address_space *mapping) ++{ ++ atomic_dec(&mapping->i_mmap_writable); ++} ++ ++static inline int mapping_deny_writable(struct address_space *mapping) ++{ ++ return atomic_dec_unless_positive(&mapping->i_mmap_writable) ? ++ 0 : -EBUSY; ++} ++ ++static inline void mapping_allow_writable(struct address_space *mapping) ++{ ++ atomic_inc(&mapping->i_mmap_writable); + } + + /* +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -426,7 +426,7 @@ static int dup_mmap(struct mm_struct *mm + atomic_dec(&inode->i_writecount); + mutex_lock(&mapping->i_mmap_mutex); + if (tmp->vm_flags & VM_SHARED) +- mapping->i_mmap_writable++; ++ atomic_inc(&mapping->i_mmap_writable); + flush_dcache_mmap_lock(mapping); + /* insert tmp into the share list, just after mpnt */ + if (unlikely(tmp->vm_flags & VM_NONLINEAR)) +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -216,7 +216,7 @@ static void __remove_shared_vm_struct(st + if (vma->vm_flags & VM_DENYWRITE) + atomic_inc(&file_inode(file)->i_writecount); + if (vma->vm_flags & VM_SHARED) +- mapping->i_mmap_writable--; ++ mapping_unmap_writable(mapping); + + flush_dcache_mmap_lock(mapping); + if (unlikely(vma->vm_flags & VM_NONLINEAR)) +@@ -617,7 +617,7 @@ static void __vma_link_file(struct vm_ar + if (vma->vm_flags & VM_DENYWRITE) + 
atomic_dec(&file_inode(file)->i_writecount); + if (vma->vm_flags & VM_SHARED) +- mapping->i_mmap_writable++; ++ atomic_inc(&mapping->i_mmap_writable); + + flush_dcache_mmap_lock(mapping); + if (unlikely(vma->vm_flags & VM_NONLINEAR)) +@@ -1572,6 +1572,17 @@ munmap_back: + if (error) + goto free_vma; + } ++ if (vm_flags & VM_SHARED) { ++ error = mapping_map_writable(file->f_mapping); ++ if (error) ++ goto allow_write_and_free_vma; ++ } ++ ++ /* ->mmap() can change vma->vm_file, but must guarantee that ++ * vma_link() below can deny write-access if VM_DENYWRITE is set ++ * and map writably if VM_SHARED is set. This usually means the ++ * new file must not have been exposed to user-space, yet. ++ */ + vma->vm_file = get_file(file); + error = file->f_op->mmap(file, vma); + if (error) +@@ -1611,8 +1622,12 @@ munmap_back: + + vma_link(mm, vma, prev, rb_link, rb_parent); + /* Once vma denies write, undo our temporary denial count */ +- if (vm_flags & VM_DENYWRITE) +- allow_write_access(file); ++ if (file) { ++ if (vm_flags & VM_SHARED) ++ mapping_unmap_writable(file->f_mapping); ++ if (vm_flags & VM_DENYWRITE) ++ allow_write_access(file); ++ } + file = vma->vm_file; + out: + perf_event_mmap(vma); +@@ -1641,14 +1656,17 @@ out: + return addr; + + unmap_and_free_vma: +- if (vm_flags & VM_DENYWRITE) +- allow_write_access(file); + vma_fput(vma); + vma->vm_file = NULL; + + /* Undo any partial mapping done by a device driver. */ + unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); + charged = 0; ++ if (vm_flags & VM_SHARED) ++ mapping_unmap_writable(file->f_mapping); ++allow_write_and_free_vma: ++ if (vm_flags & VM_DENYWRITE) ++ allow_write_access(file); + free_vma: + kmem_cache_free(vm_area_cachep, vma); + unacct_error: +--- a/mm/swap_state.c ++++ b/mm/swap_state.c +@@ -39,6 +39,7 @@ static struct backing_dev_info swap_back + struct address_space swapper_spaces[MAX_SWAPFILES] = { + [0 ... 
MAX_SWAPFILES - 1] = { + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), ++ .i_mmap_writable = ATOMIC_INIT(0), + .a_ops = &swap_aops, + .backing_dev_info = &swap_backing_dev_info, + } diff --git a/debian/patches/features/all/kdbus/powerpc-Wire-up-sys_seccomp-sys_getrandom-and-sys_me.patch b/debian/patches/features/all/kdbus/powerpc-Wire-up-sys_seccomp-sys_getrandom-and-sys_me.patch new file mode 100644 index 000000000..f56a8a5db --- /dev/null +++ b/debian/patches/features/all/kdbus/powerpc-Wire-up-sys_seccomp-sys_getrandom-and-sys_me.patch @@ -0,0 +1,52 @@ +From: Pranith Kumar +Date: Mon, 1 Sep 2014 14:23:07 -0400 +Subject: powerpc: Wire up sys_seccomp(), sys_getrandom() and + sys_memfd_create() +Origin: https://git.kernel.org/linus/7d59deb50aafbdc01b52aed209d202d827261cb0 +Bug-Debian: https://bugs.debian.org/760702 + +This patch wires up three new syscalls for powerpc. The three +new syscalls are seccomp, getrandom and memfd_create. + +Signed-off-by: Pranith Kumar +Reviewed-by: David Herrmann +[bwh: Backported to 3.16: + - Adjust context + - Only wire up memfd_create + - Insert unimplemented-syscall entries for seccomp and getrandom] +--- +diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h +index 542bc0f..7d8a600 100644 +--- a/arch/powerpc/include/asm/systbl.h ++++ b/arch/powerpc/include/asm/systbl.h +@@ -362,3 +362,6 @@ SYSCALL(ni_syscall) /* sys_kcmp */ + SYSCALL_SPU(sched_setattr) + SYSCALL_SPU(sched_getattr) + SYSCALL_SPU(renameat2) ++SYSCALL_SPU(ni_syscall) /* sys_seccomp */ ++SYSCALL_SPU(ni_syscall) /* sys_getrandom */ ++SYSCALL_SPU(memfd_create) +diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h +index 5ce5552..4e9af3f 100644 +--- a/arch/powerpc/include/asm/unistd.h ++++ b/arch/powerpc/include/asm/unistd.h +@@ -12,7 +12,7 @@ + #include + + +-#define __NR_syscalls 358 ++#define __NR_syscalls 361 + + #define __NR__exit __NR_exit + #define NR_syscalls __NR_syscalls +diff --git 
a/arch/powerpc/include/uapi/asm/unistd.h b/arch/powerpc/include/uapi/asm/unistd.h +index 2d526f7..0688fc0 100644 +--- a/arch/powerpc/include/uapi/asm/unistd.h ++++ b/arch/powerpc/include/uapi/asm/unistd.h +@@ -380,5 +380,6 @@ + #define __NR_sched_setattr 355 + #define __NR_sched_getattr 356 + #define __NR_renameat2 357 ++#define __NR_memfd_create 360 + + #endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */ diff --git a/debian/patches/features/all/kdbus/s390-wire-up-memfd_create-syscall.patch b/debian/patches/features/all/kdbus/s390-wire-up-memfd_create-syscall.patch new file mode 100644 index 000000000..e583fd492 --- /dev/null +++ b/debian/patches/features/all/kdbus/s390-wire-up-memfd_create-syscall.patch @@ -0,0 +1,40 @@ +From: Heiko Carstens +Date: Mon, 11 Aug 2014 14:50:37 +0200 +Subject: s390: wire up memfd_create syscall +Origin: https://git.kernel.org/linus/7bb1cdbfe2b07d9272b4b132511c82527314b00f +Bug-Debian: https://bugs.debian.org/760702 + +Signed-off-by: Heiko Carstens +Signed-off-by: Martin Schwidefsky +[bwh: Backported to 3.16: + - Adjust context + - Insert unimplemented-syscall entries for seccomp and getrandom] +--- +--- a/arch/s390/include/uapi/asm/unistd.h ++++ b/arch/s390/include/uapi/asm/unistd.h +@@ -283,7 +283,8 @@ + #define __NR_sched_setattr 345 + #define __NR_sched_getattr 346 + #define __NR_renameat2 347 +-#define NR_syscalls 348 ++#define __NR_memfd_create 350 ++#define NR_syscalls 351 + + /* + * There are some system calls that are not present on 64 bit, some +--- a/arch/s390/kernel/compat_wrapper.c ++++ b/arch/s390/kernel/compat_wrapper.c +@@ -214,3 +214,4 @@ COMPAT_SYSCALL_WRAP3(finit_module, int, + COMPAT_SYSCALL_WRAP3(sched_setattr, pid_t, pid, struct sched_attr __user *, attr, unsigned int, flags); + COMPAT_SYSCALL_WRAP4(sched_getattr, pid_t, pid, struct sched_attr __user *, attr, unsigned int, size, unsigned int, flags); + COMPAT_SYSCALL_WRAP5(renameat2, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, 
unsigned int, flags); ++COMPAT_SYSCALL_WRAP2(memfd_create, const char __user *, uname, unsigned int, flags) +--- a/arch/s390/kernel/syscalls.S ++++ b/arch/s390/kernel/syscalls.S +@@ -356,3 +356,6 @@ SYSCALL(sys_finit_module,sys_finit_modul + SYSCALL(sys_sched_setattr,sys_sched_setattr,compat_sys_sched_setattr) /* 345 */ + SYSCALL(sys_sched_getattr,sys_sched_getattr,compat_sys_sched_getattr) + SYSCALL(sys_renameat2,sys_renameat2,compat_sys_renameat2) ++SYSCALL(sys_ni_syscall,sys_ni_syscall,compat_sys_ni_syscall) /* seccomp */ ++SYSCALL(sys_ni_syscall,sys_ni_syscall,compat_sys_ni_syscall) /* getrandom */ ++SYSCALL(sys_memfd_create,sys_memfd_create,compat_sys_memfd_create) /* 350 */ diff --git a/debian/patches/features/all/kdbus/selftests-add-memfd-sealing-page-pinning-tests.patch b/debian/patches/features/all/kdbus/selftests-add-memfd-sealing-page-pinning-tests.patch new file mode 100644 index 000000000..bf3ccd3b1 --- /dev/null +++ b/debian/patches/features/all/kdbus/selftests-add-memfd-sealing-page-pinning-tests.patch @@ -0,0 +1,524 @@ +From: David Herrmann +Date: Fri, 8 Aug 2014 14:25:34 -0700 +Subject: selftests: add memfd/sealing page-pinning tests +Origin: https://git.kernel.org/linus/87b2d44026e0e315a7401551e95b189ac4b28217 +Bug-Debian: https://bugs.debian.org/760702 + +Setting SEAL_WRITE is not possible if there're pending GUP users. This +commit adds selftests for memfd+sealing that use FUSE to create pending +page-references. FUSE is very helpful here in that it allows us to delay +direct-IO operations for an arbitrary amount of time. This way, we can +force the kernel to pin pages and then run our normal selftests. 
+ +Signed-off-by: David Herrmann +Acked-by: Hugh Dickins +Cc: Michael Kerrisk +Cc: Ryan Lortie +Cc: Lennart Poettering +Cc: Daniel Mack +Cc: Andy Lutomirski +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +--- + tools/testing/selftests/memfd/.gitignore | 2 + + tools/testing/selftests/memfd/Makefile | 14 +- + tools/testing/selftests/memfd/fuse_mnt.c | 110 +++++++++ + tools/testing/selftests/memfd/fuse_test.c | 311 +++++++++++++++++++++++++ + tools/testing/selftests/memfd/run_fuse_test.sh | 14 ++ + 5 files changed, 450 insertions(+), 1 deletion(-) + create mode 100644 tools/testing/selftests/memfd/fuse_mnt.c + create mode 100644 tools/testing/selftests/memfd/fuse_test.c + create mode 100644 tools/testing/selftests/memfd/run_fuse_test.sh + +diff --git a/tools/testing/selftests/memfd/.gitignore b/tools/testing/selftests/memfd/.gitignore +index bcc8ee2..afe87c4 100644 +--- a/tools/testing/selftests/memfd/.gitignore ++++ b/tools/testing/selftests/memfd/.gitignore +@@ -1,2 +1,4 @@ ++fuse_mnt ++fuse_test + memfd_test + memfd-test-file +diff --git a/tools/testing/selftests/memfd/Makefile b/tools/testing/selftests/memfd/Makefile +index 36653b9..6816c49 100644 +--- a/tools/testing/selftests/memfd/Makefile ++++ b/tools/testing/selftests/memfd/Makefile +@@ -7,6 +7,7 @@ ifeq ($(ARCH),x86_64) + ARCH := X86 + endif + ++CFLAGS += -D_FILE_OFFSET_BITS=64 + CFLAGS += -I../../../../arch/x86/include/generated/uapi/ + CFLAGS += -I../../../../arch/x86/include/uapi/ + CFLAGS += -I../../../../include/uapi/ +@@ -25,5 +26,16 @@ ifeq ($(ARCH),X86) + endif + @./memfd_test || echo "memfd_test: [FAIL]" + ++build_fuse: ++ifeq ($(ARCH),X86) ++ gcc $(CFLAGS) fuse_mnt.c `pkg-config fuse --cflags --libs` -o fuse_mnt ++ gcc $(CFLAGS) fuse_test.c -o fuse_test ++else ++ echo "Not an x86 target, can't build memfd selftest" ++endif ++ ++run_fuse: build_fuse ++ @./run_fuse_test.sh || echo "fuse_test: [FAIL]" ++ + clean: +- $(RM) memfd_test ++ $(RM) memfd_test fuse_test +diff --git 
a/tools/testing/selftests/memfd/fuse_mnt.c b/tools/testing/selftests/memfd/fuse_mnt.c +new file mode 100644 +index 0000000..feacf12 +--- /dev/null ++++ b/tools/testing/selftests/memfd/fuse_mnt.c +@@ -0,0 +1,110 @@ ++/* ++ * memfd test file-system ++ * This file uses FUSE to create a dummy file-system with only one file /memfd. ++ * This file is read-only and takes 1s per read. ++ * ++ * This file-system is used by the memfd test-cases to force the kernel to pin ++ * pages during reads(). Due to the 1s delay of this file-system, this is a ++ * nice way to test race-conditions against get_user_pages() in the kernel. ++ * ++ * We use direct_io==1 to force the kernel to use direct-IO for this ++ * file-system. ++ */ ++ ++#define FUSE_USE_VERSION 26 ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static const char memfd_content[] = "memfd-example-content"; ++static const char memfd_path[] = "/memfd"; ++ ++static int memfd_getattr(const char *path, struct stat *st) ++{ ++ memset(st, 0, sizeof(*st)); ++ ++ if (!strcmp(path, "/")) { ++ st->st_mode = S_IFDIR | 0755; ++ st->st_nlink = 2; ++ } else if (!strcmp(path, memfd_path)) { ++ st->st_mode = S_IFREG | 0444; ++ st->st_nlink = 1; ++ st->st_size = strlen(memfd_content); ++ } else { ++ return -ENOENT; ++ } ++ ++ return 0; ++} ++ ++static int memfd_readdir(const char *path, ++ void *buf, ++ fuse_fill_dir_t filler, ++ off_t offset, ++ struct fuse_file_info *fi) ++{ ++ if (strcmp(path, "/")) ++ return -ENOENT; ++ ++ filler(buf, ".", NULL, 0); ++ filler(buf, "..", NULL, 0); ++ filler(buf, memfd_path + 1, NULL, 0); ++ ++ return 0; ++} ++ ++static int memfd_open(const char *path, struct fuse_file_info *fi) ++{ ++ if (strcmp(path, memfd_path)) ++ return -ENOENT; ++ ++ if ((fi->flags & 3) != O_RDONLY) ++ return -EACCES; ++ ++ /* force direct-IO */ ++ fi->direct_io = 1; ++ ++ return 0; ++} ++ ++static int memfd_read(const char *path, ++ char *buf, ++ size_t size, ++ off_t offset, ++ struct fuse_file_info 
*fi) ++{ ++ size_t len; ++ ++ if (strcmp(path, memfd_path) != 0) ++ return -ENOENT; ++ ++ sleep(1); ++ ++ len = strlen(memfd_content); ++ if (offset < len) { ++ if (offset + size > len) ++ size = len - offset; ++ ++ memcpy(buf, memfd_content + offset, size); ++ } else { ++ size = 0; ++ } ++ ++ return size; ++} ++ ++static struct fuse_operations memfd_ops = { ++ .getattr = memfd_getattr, ++ .readdir = memfd_readdir, ++ .open = memfd_open, ++ .read = memfd_read, ++}; ++ ++int main(int argc, char *argv[]) ++{ ++ return fuse_main(argc, argv, &memfd_ops, NULL); ++} +diff --git a/tools/testing/selftests/memfd/fuse_test.c b/tools/testing/selftests/memfd/fuse_test.c +new file mode 100644 +index 0000000..67908b1 +--- /dev/null ++++ b/tools/testing/selftests/memfd/fuse_test.c +@@ -0,0 +1,311 @@ ++/* ++ * memfd GUP test-case ++ * This tests memfd interactions with get_user_pages(). We require the ++ * fuse_mnt.c program to provide a fake direct-IO FUSE mount-point for us. This ++ * file-system delays _all_ reads by 1s and forces direct-IO. This means, any ++ * read() on files in that file-system will pin the receive-buffer pages for at ++ * least 1s via get_user_pages(). ++ * ++ * We use this trick to race ADD_SEALS against a write on a memfd object. The ++ * ADD_SEALS must fail if the memfd pages are still pinned. Note that we use ++ * the read() syscall with our memory-mapped memfd object as receive buffer to ++ * force the kernel to write into our memfd object. 
++ */ ++ ++#define _GNU_SOURCE ++#define __EXPORTED_HEADERS__ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define MFD_DEF_SIZE 8192 ++#define STACK_SIZE 65535 ++ ++static int sys_memfd_create(const char *name, ++ unsigned int flags) ++{ ++ return syscall(__NR_memfd_create, name, flags); ++} ++ ++static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) ++{ ++ int r, fd; ++ ++ fd = sys_memfd_create(name, flags); ++ if (fd < 0) { ++ printf("memfd_create(\"%s\", %u) failed: %m\n", ++ name, flags); ++ abort(); ++ } ++ ++ r = ftruncate(fd, sz); ++ if (r < 0) { ++ printf("ftruncate(%llu) failed: %m\n", (unsigned long long)sz); ++ abort(); ++ } ++ ++ return fd; ++} ++ ++static __u64 mfd_assert_get_seals(int fd) ++{ ++ long r; ++ ++ r = fcntl(fd, F_GET_SEALS); ++ if (r < 0) { ++ printf("GET_SEALS(%d) failed: %m\n", fd); ++ abort(); ++ } ++ ++ return r; ++} ++ ++static void mfd_assert_has_seals(int fd, __u64 seals) ++{ ++ __u64 s; ++ ++ s = mfd_assert_get_seals(fd); ++ if (s != seals) { ++ printf("%llu != %llu = GET_SEALS(%d)\n", ++ (unsigned long long)seals, (unsigned long long)s, fd); ++ abort(); ++ } ++} ++ ++static void mfd_assert_add_seals(int fd, __u64 seals) ++{ ++ long r; ++ __u64 s; ++ ++ s = mfd_assert_get_seals(fd); ++ r = fcntl(fd, F_ADD_SEALS, seals); ++ if (r < 0) { ++ printf("ADD_SEALS(%d, %llu -> %llu) failed: %m\n", ++ fd, (unsigned long long)s, (unsigned long long)seals); ++ abort(); ++ } ++} ++ ++static int mfd_busy_add_seals(int fd, __u64 seals) ++{ ++ long r; ++ __u64 s; ++ ++ r = fcntl(fd, F_GET_SEALS); ++ if (r < 0) ++ s = 0; ++ else ++ s = r; ++ ++ r = fcntl(fd, F_ADD_SEALS, seals); ++ if (r < 0 && errno != EBUSY) { ++ printf("ADD_SEALS(%d, %llu -> %llu) didn't fail as expected with EBUSY: %m\n", ++ fd, (unsigned long long)s, (unsigned long long)seals); ++ abort(); ++ } ++ ++ return r; ++} ++ 
++static void *mfd_assert_mmap_shared(int fd) ++{ ++ void *p; ++ ++ p = mmap(NULL, ++ MFD_DEF_SIZE, ++ PROT_READ | PROT_WRITE, ++ MAP_SHARED, ++ fd, ++ 0); ++ if (p == MAP_FAILED) { ++ printf("mmap() failed: %m\n"); ++ abort(); ++ } ++ ++ return p; ++} ++ ++static void *mfd_assert_mmap_private(int fd) ++{ ++ void *p; ++ ++ p = mmap(NULL, ++ MFD_DEF_SIZE, ++ PROT_READ | PROT_WRITE, ++ MAP_PRIVATE, ++ fd, ++ 0); ++ if (p == MAP_FAILED) { ++ printf("mmap() failed: %m\n"); ++ abort(); ++ } ++ ++ return p; ++} ++ ++static int global_mfd = -1; ++static void *global_p = NULL; ++ ++static int sealing_thread_fn(void *arg) ++{ ++ int sig, r; ++ ++ /* ++ * This thread first waits 200ms so any pending operation in the parent ++ * is correctly started. After that, it tries to seal @global_mfd as ++ * SEAL_WRITE. This _must_ fail as the parent thread has a read() into ++ * that memory mapped object still ongoing. ++ * We then wait one more second and try sealing again. This time it ++ * must succeed as there shouldn't be anyone else pinning the pages. ++ */ ++ ++ /* wait 200ms for FUSE-request to be active */ ++ usleep(200000); ++ ++ /* unmount mapping before sealing to avoid i_mmap_writable failures */ ++ munmap(global_p, MFD_DEF_SIZE); ++ ++ /* Try sealing the global file; expect EBUSY or success. Current ++ * kernels will never succeed, but in the future, kernels might ++ * implement page-replacements or other fancy ways to avoid racing ++ * writes. */ ++ r = mfd_busy_add_seals(global_mfd, F_SEAL_WRITE); ++ if (r >= 0) { ++ printf("HURRAY! 
This kernel fixed GUP races!\n"); ++ } else { ++ /* wait 1s more so the FUSE-request is done */ ++ sleep(1); ++ ++ /* try sealing the global file again */ ++ mfd_assert_add_seals(global_mfd, F_SEAL_WRITE); ++ } ++ ++ return 0; ++} ++ ++static pid_t spawn_sealing_thread(void) ++{ ++ uint8_t *stack; ++ pid_t pid; ++ ++ stack = malloc(STACK_SIZE); ++ if (!stack) { ++ printf("malloc(STACK_SIZE) failed: %m\n"); ++ abort(); ++ } ++ ++ pid = clone(sealing_thread_fn, ++ stack + STACK_SIZE, ++ SIGCHLD | CLONE_FILES | CLONE_FS | CLONE_VM, ++ NULL); ++ if (pid < 0) { ++ printf("clone() failed: %m\n"); ++ abort(); ++ } ++ ++ return pid; ++} ++ ++static void join_sealing_thread(pid_t pid) ++{ ++ waitpid(pid, NULL, 0); ++} ++ ++int main(int argc, char **argv) ++{ ++ static const char zero[MFD_DEF_SIZE]; ++ int fd, mfd, r; ++ void *p; ++ int was_sealed; ++ pid_t pid; ++ ++ if (argc < 2) { ++ printf("error: please pass path to file in fuse_mnt mount-point\n"); ++ abort(); ++ } ++ ++ /* open FUSE memfd file for GUP testing */ ++ printf("opening: %s\n", argv[1]); ++ fd = open(argv[1], O_RDONLY | O_CLOEXEC); ++ if (fd < 0) { ++ printf("cannot open(\"%s\"): %m\n", argv[1]); ++ abort(); ++ } ++ ++ /* create new memfd-object */ ++ mfd = mfd_assert_new("kern_memfd_fuse", ++ MFD_DEF_SIZE, ++ MFD_CLOEXEC | MFD_ALLOW_SEALING); ++ ++ /* mmap memfd-object for writing */ ++ p = mfd_assert_mmap_shared(mfd); ++ ++ /* pass mfd+mapping to a separate sealing-thread which tries to seal ++ * the memfd objects with SEAL_WRITE while we write into it */ ++ global_mfd = mfd; ++ global_p = p; ++ pid = spawn_sealing_thread(); ++ ++ /* Use read() on the FUSE file to read into our memory-mapped memfd ++ * object. This races the other thread which tries to seal the ++ * memfd-object. ++ * If @fd is on the memfd-fake-FUSE-FS, the read() is delayed by 1s. ++ * This guarantees that the receive-buffer is pinned for 1s until the ++ * data is written into it. 
The racing ADD_SEALS should thus fail as ++ * the pages are still pinned. */ ++ r = read(fd, p, MFD_DEF_SIZE); ++ if (r < 0) { ++ printf("read() failed: %m\n"); ++ abort(); ++ } else if (!r) { ++ printf("unexpected EOF on read()\n"); ++ abort(); ++ } ++ ++ was_sealed = mfd_assert_get_seals(mfd) & F_SEAL_WRITE; ++ ++ /* Wait for sealing-thread to finish and verify that it ++ * successfully sealed the file after the second try. */ ++ join_sealing_thread(pid); ++ mfd_assert_has_seals(mfd, F_SEAL_WRITE); ++ ++ /* *IF* the memfd-object was sealed at the time our read() returned, ++ * then the kernel did a page-replacement or canceled the read() (or ++ * whatever magic it did..). In that case, the memfd object is still ++ * all zero. ++ * In case the memfd-object was *not* sealed, the read() was successfull ++ * and the memfd object must *not* be all zero. ++ * Note that in real scenarios, there might be a mixture of both, but ++ * in this test-cases, we have explicit 200ms delays which should be ++ * enough to avoid any in-flight writes. 
*/ ++ ++ p = mfd_assert_mmap_private(mfd); ++ if (was_sealed && memcmp(p, zero, MFD_DEF_SIZE)) { ++ printf("memfd sealed during read() but data not discarded\n"); ++ abort(); ++ } else if (!was_sealed && !memcmp(p, zero, MFD_DEF_SIZE)) { ++ printf("memfd sealed after read() but data discarded\n"); ++ abort(); ++ } ++ ++ close(mfd); ++ close(fd); ++ ++ printf("fuse: DONE\n"); ++ ++ return 0; ++} +diff --git a/tools/testing/selftests/memfd/run_fuse_test.sh b/tools/testing/selftests/memfd/run_fuse_test.sh +new file mode 100644 +index 0000000..69b930e +--- /dev/null ++++ b/tools/testing/selftests/memfd/run_fuse_test.sh +@@ -0,0 +1,14 @@ ++#!/bin/sh ++ ++if test -d "./mnt" ; then ++ fusermount -u ./mnt ++ rmdir ./mnt ++fi ++ ++set -e ++ ++mkdir mnt ++./fuse_mnt ./mnt ++./fuse_test ./mnt/memfd ++fusermount -u ./mnt ++rmdir ./mnt diff --git a/debian/patches/features/all/kdbus/selftests-add-memfd_create-sealing-tests.patch b/debian/patches/features/all/kdbus/selftests-add-memfd_create-sealing-tests.patch new file mode 100644 index 000000000..3ba6b8c19 --- /dev/null +++ b/debian/patches/features/all/kdbus/selftests-add-memfd_create-sealing-tests.patch @@ -0,0 +1,991 @@ +From: David Herrmann +Date: Fri, 8 Aug 2014 14:25:32 -0700 +Subject: selftests: add memfd_create() + sealing tests +Origin: https://git.kernel.org/linus/4f5ce5e8d7e2da3c714df8a7fa42edb9f992fc52 +Bug-Debian: https://bugs.debian.org/760702 + +Some basic tests to verify sealing on memfds works as expected and +guarantees the advertised semantics. 
+ +Signed-off-by: David Herrmann +Acked-by: Hugh Dickins +Cc: Michael Kerrisk +Cc: Ryan Lortie +Cc: Lennart Poettering +Cc: Daniel Mack +Cc: Andy Lutomirski +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +--- + tools/testing/selftests/Makefile | 1 + + tools/testing/selftests/memfd/.gitignore | 2 + + tools/testing/selftests/memfd/Makefile | 29 + + tools/testing/selftests/memfd/memfd_test.c | 913 +++++++++++++++++++++++++++++ + 4 files changed, 945 insertions(+) + create mode 100644 tools/testing/selftests/memfd/.gitignore + create mode 100644 tools/testing/selftests/memfd/Makefile + create mode 100644 tools/testing/selftests/memfd/memfd_test.c + +--- a/tools/testing/selftests/Makefile ++++ b/tools/testing/selftests/Makefile +@@ -2,6 +2,7 @@ TARGETS = breakpoints + TARGETS += cpu-hotplug + TARGETS += efivarfs + TARGETS += kcmp ++TARGETS += memfd + TARGETS += memory-hotplug + TARGETS += mqueue + TARGETS += mount +--- /dev/null ++++ b/tools/testing/selftests/memfd/.gitignore +@@ -0,0 +1,2 @@ ++memfd_test ++memfd-test-file +--- /dev/null ++++ b/tools/testing/selftests/memfd/Makefile +@@ -0,0 +1,29 @@ ++uname_M := $(shell uname -m 2>/dev/null || echo not) ++ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/) ++ifeq ($(ARCH),i386) ++ ARCH := X86 ++endif ++ifeq ($(ARCH),x86_64) ++ ARCH := X86 ++endif ++ ++CFLAGS += -I../../../../arch/x86/include/generated/uapi/ ++CFLAGS += -I../../../../arch/x86/include/uapi/ ++CFLAGS += -I../../../../include/uapi/ ++CFLAGS += -I../../../../include/ ++ ++all: ++ifeq ($(ARCH),X86) ++ gcc $(CFLAGS) memfd_test.c -o memfd_test ++else ++ echo "Not an x86 target, can't build memfd selftest" ++endif ++ ++run_tests: all ++ifeq ($(ARCH),X86) ++ gcc $(CFLAGS) memfd_test.c -o memfd_test ++endif ++ @./memfd_test || echo "memfd_test: [FAIL]" ++ ++clean: ++ $(RM) memfd_test +--- /dev/null ++++ b/tools/testing/selftests/memfd/memfd_test.c +@@ -0,0 +1,913 @@ ++#define _GNU_SOURCE ++#define __EXPORTED_HEADERS__ ++ ++#include ++#include 
++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define MFD_DEF_SIZE 8192 ++#define STACK_SIZE 65535 ++ ++static int sys_memfd_create(const char *name, ++ unsigned int flags) ++{ ++ return syscall(__NR_memfd_create, name, flags); ++} ++ ++static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) ++{ ++ int r, fd; ++ ++ fd = sys_memfd_create(name, flags); ++ if (fd < 0) { ++ printf("memfd_create(\"%s\", %u) failed: %m\n", ++ name, flags); ++ abort(); ++ } ++ ++ r = ftruncate(fd, sz); ++ if (r < 0) { ++ printf("ftruncate(%llu) failed: %m\n", (unsigned long long)sz); ++ abort(); ++ } ++ ++ return fd; ++} ++ ++static void mfd_fail_new(const char *name, unsigned int flags) ++{ ++ int r; ++ ++ r = sys_memfd_create(name, flags); ++ if (r >= 0) { ++ printf("memfd_create(\"%s\", %u) succeeded, but failure expected\n", ++ name, flags); ++ close(r); ++ abort(); ++ } ++} ++ ++static __u64 mfd_assert_get_seals(int fd) ++{ ++ long r; ++ ++ r = fcntl(fd, F_GET_SEALS); ++ if (r < 0) { ++ printf("GET_SEALS(%d) failed: %m\n", fd); ++ abort(); ++ } ++ ++ return r; ++} ++ ++static void mfd_assert_has_seals(int fd, __u64 seals) ++{ ++ __u64 s; ++ ++ s = mfd_assert_get_seals(fd); ++ if (s != seals) { ++ printf("%llu != %llu = GET_SEALS(%d)\n", ++ (unsigned long long)seals, (unsigned long long)s, fd); ++ abort(); ++ } ++} ++ ++static void mfd_assert_add_seals(int fd, __u64 seals) ++{ ++ long r; ++ __u64 s; ++ ++ s = mfd_assert_get_seals(fd); ++ r = fcntl(fd, F_ADD_SEALS, seals); ++ if (r < 0) { ++ printf("ADD_SEALS(%d, %llu -> %llu) failed: %m\n", ++ fd, (unsigned long long)s, (unsigned long long)seals); ++ abort(); ++ } ++} ++ ++static void mfd_fail_add_seals(int fd, __u64 seals) ++{ ++ long r; ++ __u64 s; ++ ++ r = fcntl(fd, F_GET_SEALS); ++ if (r < 0) ++ s = 0; ++ else ++ s = r; ++ ++ r = fcntl(fd, F_ADD_SEALS, seals); ++ if (r >= 0) { ++ printf("ADD_SEALS(%d, %llu -> 
%llu) didn't fail as expected\n", ++ fd, (unsigned long long)s, (unsigned long long)seals); ++ abort(); ++ } ++} ++ ++static void mfd_assert_size(int fd, size_t size) ++{ ++ struct stat st; ++ int r; ++ ++ r = fstat(fd, &st); ++ if (r < 0) { ++ printf("fstat(%d) failed: %m\n", fd); ++ abort(); ++ } else if (st.st_size != size) { ++ printf("wrong file size %lld, but expected %lld\n", ++ (long long)st.st_size, (long long)size); ++ abort(); ++ } ++} ++ ++static int mfd_assert_dup(int fd) ++{ ++ int r; ++ ++ r = dup(fd); ++ if (r < 0) { ++ printf("dup(%d) failed: %m\n", fd); ++ abort(); ++ } ++ ++ return r; ++} ++ ++static void *mfd_assert_mmap_shared(int fd) ++{ ++ void *p; ++ ++ p = mmap(NULL, ++ MFD_DEF_SIZE, ++ PROT_READ | PROT_WRITE, ++ MAP_SHARED, ++ fd, ++ 0); ++ if (p == MAP_FAILED) { ++ printf("mmap() failed: %m\n"); ++ abort(); ++ } ++ ++ return p; ++} ++ ++static void *mfd_assert_mmap_private(int fd) ++{ ++ void *p; ++ ++ p = mmap(NULL, ++ MFD_DEF_SIZE, ++ PROT_READ, ++ MAP_PRIVATE, ++ fd, ++ 0); ++ if (p == MAP_FAILED) { ++ printf("mmap() failed: %m\n"); ++ abort(); ++ } ++ ++ return p; ++} ++ ++static int mfd_assert_open(int fd, int flags, mode_t mode) ++{ ++ char buf[512]; ++ int r; ++ ++ sprintf(buf, "/proc/self/fd/%d", fd); ++ r = open(buf, flags, mode); ++ if (r < 0) { ++ printf("open(%s) failed: %m\n", buf); ++ abort(); ++ } ++ ++ return r; ++} ++ ++static void mfd_fail_open(int fd, int flags, mode_t mode) ++{ ++ char buf[512]; ++ int r; ++ ++ sprintf(buf, "/proc/self/fd/%d", fd); ++ r = open(buf, flags, mode); ++ if (r >= 0) { ++ printf("open(%s) didn't fail as expected\n"); ++ abort(); ++ } ++} ++ ++static void mfd_assert_read(int fd) ++{ ++ char buf[16]; ++ void *p; ++ ssize_t l; ++ ++ l = read(fd, buf, sizeof(buf)); ++ if (l != sizeof(buf)) { ++ printf("read() failed: %m\n"); ++ abort(); ++ } ++ ++ /* verify PROT_READ *is* allowed */ ++ p = mmap(NULL, ++ MFD_DEF_SIZE, ++ PROT_READ, ++ MAP_PRIVATE, ++ fd, ++ 0); ++ if (p == MAP_FAILED) { ++ 
printf("mmap() failed: %m\n"); ++ abort(); ++ } ++ munmap(p, MFD_DEF_SIZE); ++ ++ /* verify MAP_PRIVATE is *always* allowed (even writable) */ ++ p = mmap(NULL, ++ MFD_DEF_SIZE, ++ PROT_READ | PROT_WRITE, ++ MAP_PRIVATE, ++ fd, ++ 0); ++ if (p == MAP_FAILED) { ++ printf("mmap() failed: %m\n"); ++ abort(); ++ } ++ munmap(p, MFD_DEF_SIZE); ++} ++ ++static void mfd_assert_write(int fd) ++{ ++ ssize_t l; ++ void *p; ++ int r; ++ ++ /* verify write() succeeds */ ++ l = write(fd, "\0\0\0\0", 4); ++ if (l != 4) { ++ printf("write() failed: %m\n"); ++ abort(); ++ } ++ ++ /* verify PROT_READ | PROT_WRITE is allowed */ ++ p = mmap(NULL, ++ MFD_DEF_SIZE, ++ PROT_READ | PROT_WRITE, ++ MAP_SHARED, ++ fd, ++ 0); ++ if (p == MAP_FAILED) { ++ printf("mmap() failed: %m\n"); ++ abort(); ++ } ++ *(char *)p = 0; ++ munmap(p, MFD_DEF_SIZE); ++ ++ /* verify PROT_WRITE is allowed */ ++ p = mmap(NULL, ++ MFD_DEF_SIZE, ++ PROT_WRITE, ++ MAP_SHARED, ++ fd, ++ 0); ++ if (p == MAP_FAILED) { ++ printf("mmap() failed: %m\n"); ++ abort(); ++ } ++ *(char *)p = 0; ++ munmap(p, MFD_DEF_SIZE); ++ ++ /* verify PROT_READ with MAP_SHARED is allowed and a following ++ * mprotect(PROT_WRITE) allows writing */ ++ p = mmap(NULL, ++ MFD_DEF_SIZE, ++ PROT_READ, ++ MAP_SHARED, ++ fd, ++ 0); ++ if (p == MAP_FAILED) { ++ printf("mmap() failed: %m\n"); ++ abort(); ++ } ++ ++ r = mprotect(p, MFD_DEF_SIZE, PROT_READ | PROT_WRITE); ++ if (r < 0) { ++ printf("mprotect() failed: %m\n"); ++ abort(); ++ } ++ ++ *(char *)p = 0; ++ munmap(p, MFD_DEF_SIZE); ++ ++ /* verify PUNCH_HOLE works */ ++ r = fallocate(fd, ++ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, ++ 0, ++ MFD_DEF_SIZE); ++ if (r < 0) { ++ printf("fallocate(PUNCH_HOLE) failed: %m\n"); ++ abort(); ++ } ++} ++ ++static void mfd_fail_write(int fd) ++{ ++ ssize_t l; ++ void *p; ++ int r; ++ ++ /* verify write() fails */ ++ l = write(fd, "data", 4); ++ if (l != -EPERM) { ++ printf("expected EPERM on write(), but got %d: %m\n", (int)l); ++ abort(); ++ } ++ ++ /* 
verify PROT_READ | PROT_WRITE is not allowed */ ++ p = mmap(NULL, ++ MFD_DEF_SIZE, ++ PROT_READ | PROT_WRITE, ++ MAP_SHARED, ++ fd, ++ 0); ++ if (p != MAP_FAILED) { ++ printf("mmap() didn't fail as expected\n"); ++ abort(); ++ } ++ ++ /* verify PROT_WRITE is not allowed */ ++ p = mmap(NULL, ++ MFD_DEF_SIZE, ++ PROT_WRITE, ++ MAP_SHARED, ++ fd, ++ 0); ++ if (p != MAP_FAILED) { ++ printf("mmap() didn't fail as expected\n"); ++ abort(); ++ } ++ ++ /* Verify PROT_READ with MAP_SHARED with a following mprotect is not ++ * allowed. Note that for r/w the kernel already prevents the mmap. */ ++ p = mmap(NULL, ++ MFD_DEF_SIZE, ++ PROT_READ, ++ MAP_SHARED, ++ fd, ++ 0); ++ if (p != MAP_FAILED) { ++ r = mprotect(p, MFD_DEF_SIZE, PROT_READ | PROT_WRITE); ++ if (r >= 0) { ++ printf("mmap()+mprotect() didn't fail as expected\n"); ++ abort(); ++ } ++ } ++ ++ /* verify PUNCH_HOLE fails */ ++ r = fallocate(fd, ++ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, ++ 0, ++ MFD_DEF_SIZE); ++ if (r >= 0) { ++ printf("fallocate(PUNCH_HOLE) didn't fail as expected\n"); ++ abort(); ++ } ++} ++ ++static void mfd_assert_shrink(int fd) ++{ ++ int r, fd2; ++ ++ r = ftruncate(fd, MFD_DEF_SIZE / 2); ++ if (r < 0) { ++ printf("ftruncate(SHRINK) failed: %m\n"); ++ abort(); ++ } ++ ++ mfd_assert_size(fd, MFD_DEF_SIZE / 2); ++ ++ fd2 = mfd_assert_open(fd, ++ O_RDWR | O_CREAT | O_TRUNC, ++ S_IRUSR | S_IWUSR); ++ close(fd2); ++ ++ mfd_assert_size(fd, 0); ++} ++ ++static void mfd_fail_shrink(int fd) ++{ ++ int r; ++ ++ r = ftruncate(fd, MFD_DEF_SIZE / 2); ++ if (r >= 0) { ++ printf("ftruncate(SHRINK) didn't fail as expected\n"); ++ abort(); ++ } ++ ++ mfd_fail_open(fd, ++ O_RDWR | O_CREAT | O_TRUNC, ++ S_IRUSR | S_IWUSR); ++} ++ ++static void mfd_assert_grow(int fd) ++{ ++ int r; ++ ++ r = ftruncate(fd, MFD_DEF_SIZE * 2); ++ if (r < 0) { ++ printf("ftruncate(GROW) failed: %m\n"); ++ abort(); ++ } ++ ++ mfd_assert_size(fd, MFD_DEF_SIZE * 2); ++ ++ r = fallocate(fd, ++ 0, ++ 0, ++ MFD_DEF_SIZE * 4); ++ if (r 
< 0) { ++ printf("fallocate(ALLOC) failed: %m\n"); ++ abort(); ++ } ++ ++ mfd_assert_size(fd, MFD_DEF_SIZE * 4); ++} ++ ++static void mfd_fail_grow(int fd) ++{ ++ int r; ++ ++ r = ftruncate(fd, MFD_DEF_SIZE * 2); ++ if (r >= 0) { ++ printf("ftruncate(GROW) didn't fail as expected\n"); ++ abort(); ++ } ++ ++ r = fallocate(fd, ++ 0, ++ 0, ++ MFD_DEF_SIZE * 4); ++ if (r >= 0) { ++ printf("fallocate(ALLOC) didn't fail as expected\n"); ++ abort(); ++ } ++} ++ ++static void mfd_assert_grow_write(int fd) ++{ ++ static char buf[MFD_DEF_SIZE * 8]; ++ ssize_t l; ++ ++ l = pwrite(fd, buf, sizeof(buf), 0); ++ if (l != sizeof(buf)) { ++ printf("pwrite() failed: %m\n"); ++ abort(); ++ } ++ ++ mfd_assert_size(fd, MFD_DEF_SIZE * 8); ++} ++ ++static void mfd_fail_grow_write(int fd) ++{ ++ static char buf[MFD_DEF_SIZE * 8]; ++ ssize_t l; ++ ++ l = pwrite(fd, buf, sizeof(buf), 0); ++ if (l == sizeof(buf)) { ++ printf("pwrite() didn't fail as expected\n"); ++ abort(); ++ } ++} ++ ++static int idle_thread_fn(void *arg) ++{ ++ sigset_t set; ++ int sig; ++ ++ /* dummy waiter; SIGTERM terminates us anyway */ ++ sigemptyset(&set); ++ sigaddset(&set, SIGTERM); ++ sigwait(&set, &sig); ++ ++ return 0; ++} ++ ++static pid_t spawn_idle_thread(unsigned int flags) ++{ ++ uint8_t *stack; ++ pid_t pid; ++ ++ stack = malloc(STACK_SIZE); ++ if (!stack) { ++ printf("malloc(STACK_SIZE) failed: %m\n"); ++ abort(); ++ } ++ ++ pid = clone(idle_thread_fn, ++ stack + STACK_SIZE, ++ SIGCHLD | flags, ++ NULL); ++ if (pid < 0) { ++ printf("clone() failed: %m\n"); ++ abort(); ++ } ++ ++ return pid; ++} ++ ++static void join_idle_thread(pid_t pid) ++{ ++ kill(pid, SIGTERM); ++ waitpid(pid, NULL, 0); ++} ++ ++/* ++ * Test memfd_create() syscall ++ * Verify syscall-argument validation, including name checks, flag validation ++ * and more. 
++ */ ++static void test_create(void) ++{ ++ char buf[2048]; ++ int fd; ++ ++ /* test NULL name */ ++ mfd_fail_new(NULL, 0); ++ ++ /* test over-long name (not zero-terminated) */ ++ memset(buf, 0xff, sizeof(buf)); ++ mfd_fail_new(buf, 0); ++ ++ /* test over-long zero-terminated name */ ++ memset(buf, 0xff, sizeof(buf)); ++ buf[sizeof(buf) - 1] = 0; ++ mfd_fail_new(buf, 0); ++ ++ /* verify "" is a valid name */ ++ fd = mfd_assert_new("", 0, 0); ++ close(fd); ++ ++ /* verify invalid O_* open flags */ ++ mfd_fail_new("", 0x0100); ++ mfd_fail_new("", ~MFD_CLOEXEC); ++ mfd_fail_new("", ~MFD_ALLOW_SEALING); ++ mfd_fail_new("", ~0); ++ mfd_fail_new("", 0x80000000U); ++ ++ /* verify MFD_CLOEXEC is allowed */ ++ fd = mfd_assert_new("", 0, MFD_CLOEXEC); ++ close(fd); ++ ++ /* verify MFD_ALLOW_SEALING is allowed */ ++ fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING); ++ close(fd); ++ ++ /* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */ ++ fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC); ++ close(fd); ++} ++ ++/* ++ * Test basic sealing ++ * A very basic sealing test to see whether setting/retrieving seals works. 
++ */ ++static void test_basic(void) ++{ ++ int fd; ++ ++ fd = mfd_assert_new("kern_memfd_basic", ++ MFD_DEF_SIZE, ++ MFD_CLOEXEC | MFD_ALLOW_SEALING); ++ ++ /* add basic seals */ ++ mfd_assert_has_seals(fd, 0); ++ mfd_assert_add_seals(fd, F_SEAL_SHRINK | ++ F_SEAL_WRITE); ++ mfd_assert_has_seals(fd, F_SEAL_SHRINK | ++ F_SEAL_WRITE); ++ ++ /* add them again */ ++ mfd_assert_add_seals(fd, F_SEAL_SHRINK | ++ F_SEAL_WRITE); ++ mfd_assert_has_seals(fd, F_SEAL_SHRINK | ++ F_SEAL_WRITE); ++ ++ /* add more seals and seal against sealing */ ++ mfd_assert_add_seals(fd, F_SEAL_GROW | F_SEAL_SEAL); ++ mfd_assert_has_seals(fd, F_SEAL_SHRINK | ++ F_SEAL_GROW | ++ F_SEAL_WRITE | ++ F_SEAL_SEAL); ++ ++ /* verify that sealing no longer works */ ++ mfd_fail_add_seals(fd, F_SEAL_GROW); ++ mfd_fail_add_seals(fd, 0); ++ ++ close(fd); ++ ++ /* verify sealing does not work without MFD_ALLOW_SEALING */ ++ fd = mfd_assert_new("kern_memfd_basic", ++ MFD_DEF_SIZE, ++ MFD_CLOEXEC); ++ mfd_assert_has_seals(fd, F_SEAL_SEAL); ++ mfd_fail_add_seals(fd, F_SEAL_SHRINK | ++ F_SEAL_GROW | ++ F_SEAL_WRITE); ++ mfd_assert_has_seals(fd, F_SEAL_SEAL); ++ close(fd); ++} ++ ++/* ++ * Test SEAL_WRITE ++ * Test whether SEAL_WRITE actually prevents modifications. 
++ */ ++static void test_seal_write(void) ++{ ++ int fd; ++ ++ fd = mfd_assert_new("kern_memfd_seal_write", ++ MFD_DEF_SIZE, ++ MFD_CLOEXEC | MFD_ALLOW_SEALING); ++ mfd_assert_has_seals(fd, 0); ++ mfd_assert_add_seals(fd, F_SEAL_WRITE); ++ mfd_assert_has_seals(fd, F_SEAL_WRITE); ++ ++ mfd_assert_read(fd); ++ mfd_fail_write(fd); ++ mfd_assert_shrink(fd); ++ mfd_assert_grow(fd); ++ mfd_fail_grow_write(fd); ++ ++ close(fd); ++} ++ ++/* ++ * Test SEAL_SHRINK ++ * Test whether SEAL_SHRINK actually prevents shrinking ++ */ ++static void test_seal_shrink(void) ++{ ++ int fd; ++ ++ fd = mfd_assert_new("kern_memfd_seal_shrink", ++ MFD_DEF_SIZE, ++ MFD_CLOEXEC | MFD_ALLOW_SEALING); ++ mfd_assert_has_seals(fd, 0); ++ mfd_assert_add_seals(fd, F_SEAL_SHRINK); ++ mfd_assert_has_seals(fd, F_SEAL_SHRINK); ++ ++ mfd_assert_read(fd); ++ mfd_assert_write(fd); ++ mfd_fail_shrink(fd); ++ mfd_assert_grow(fd); ++ mfd_assert_grow_write(fd); ++ ++ close(fd); ++} ++ ++/* ++ * Test SEAL_GROW ++ * Test whether SEAL_GROW actually prevents growing ++ */ ++static void test_seal_grow(void) ++{ ++ int fd; ++ ++ fd = mfd_assert_new("kern_memfd_seal_grow", ++ MFD_DEF_SIZE, ++ MFD_CLOEXEC | MFD_ALLOW_SEALING); ++ mfd_assert_has_seals(fd, 0); ++ mfd_assert_add_seals(fd, F_SEAL_GROW); ++ mfd_assert_has_seals(fd, F_SEAL_GROW); ++ ++ mfd_assert_read(fd); ++ mfd_assert_write(fd); ++ mfd_assert_shrink(fd); ++ mfd_fail_grow(fd); ++ mfd_fail_grow_write(fd); ++ ++ close(fd); ++} ++ ++/* ++ * Test SEAL_SHRINK | SEAL_GROW ++ * Test whether SEAL_SHRINK | SEAL_GROW actually prevents resizing ++ */ ++static void test_seal_resize(void) ++{ ++ int fd; ++ ++ fd = mfd_assert_new("kern_memfd_seal_resize", ++ MFD_DEF_SIZE, ++ MFD_CLOEXEC | MFD_ALLOW_SEALING); ++ mfd_assert_has_seals(fd, 0); ++ mfd_assert_add_seals(fd, F_SEAL_SHRINK | F_SEAL_GROW); ++ mfd_assert_has_seals(fd, F_SEAL_SHRINK | F_SEAL_GROW); ++ ++ mfd_assert_read(fd); ++ mfd_assert_write(fd); ++ mfd_fail_shrink(fd); ++ mfd_fail_grow(fd); ++ 
mfd_fail_grow_write(fd); ++ ++ close(fd); ++} ++ ++/* ++ * Test sharing via dup() ++ * Test that seals are shared between dupped FDs and they're all equal. ++ */ ++static void test_share_dup(void) ++{ ++ int fd, fd2; ++ ++ fd = mfd_assert_new("kern_memfd_share_dup", ++ MFD_DEF_SIZE, ++ MFD_CLOEXEC | MFD_ALLOW_SEALING); ++ mfd_assert_has_seals(fd, 0); ++ ++ fd2 = mfd_assert_dup(fd); ++ mfd_assert_has_seals(fd2, 0); ++ ++ mfd_assert_add_seals(fd, F_SEAL_WRITE); ++ mfd_assert_has_seals(fd, F_SEAL_WRITE); ++ mfd_assert_has_seals(fd2, F_SEAL_WRITE); ++ ++ mfd_assert_add_seals(fd2, F_SEAL_SHRINK); ++ mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK); ++ mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK); ++ ++ mfd_assert_add_seals(fd, F_SEAL_SEAL); ++ mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_SEAL); ++ mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_SEAL); ++ ++ mfd_fail_add_seals(fd, F_SEAL_GROW); ++ mfd_fail_add_seals(fd2, F_SEAL_GROW); ++ mfd_fail_add_seals(fd, F_SEAL_SEAL); ++ mfd_fail_add_seals(fd2, F_SEAL_SEAL); ++ ++ close(fd2); ++ ++ mfd_fail_add_seals(fd, F_SEAL_GROW); ++ close(fd); ++} ++ ++/* ++ * Test sealing with active mmap()s ++ * Modifying seals is only allowed if no other mmap() refs exist. 
++ */ ++static void test_share_mmap(void) ++{ ++ int fd; ++ void *p; ++ ++ fd = mfd_assert_new("kern_memfd_share_mmap", ++ MFD_DEF_SIZE, ++ MFD_CLOEXEC | MFD_ALLOW_SEALING); ++ mfd_assert_has_seals(fd, 0); ++ ++ /* shared/writable ref prevents sealing WRITE, but allows others */ ++ p = mfd_assert_mmap_shared(fd); ++ mfd_fail_add_seals(fd, F_SEAL_WRITE); ++ mfd_assert_has_seals(fd, 0); ++ mfd_assert_add_seals(fd, F_SEAL_SHRINK); ++ mfd_assert_has_seals(fd, F_SEAL_SHRINK); ++ munmap(p, MFD_DEF_SIZE); ++ ++ /* readable ref allows sealing */ ++ p = mfd_assert_mmap_private(fd); ++ mfd_assert_add_seals(fd, F_SEAL_WRITE); ++ mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK); ++ munmap(p, MFD_DEF_SIZE); ++ ++ close(fd); ++} ++ ++/* ++ * Test sealing with open(/proc/self/fd/%d) ++ * Via /proc we can get access to a separate file-context for the same memfd. ++ * This is *not* like dup(), but like a real separate open(). Make sure the ++ * semantics are as expected and we correctly check for RDONLY / WRONLY / RDWR. 
++ */ ++static void test_share_open(void) ++{ ++ int fd, fd2; ++ ++ fd = mfd_assert_new("kern_memfd_share_open", ++ MFD_DEF_SIZE, ++ MFD_CLOEXEC | MFD_ALLOW_SEALING); ++ mfd_assert_has_seals(fd, 0); ++ ++ fd2 = mfd_assert_open(fd, O_RDWR, 0); ++ mfd_assert_add_seals(fd, F_SEAL_WRITE); ++ mfd_assert_has_seals(fd, F_SEAL_WRITE); ++ mfd_assert_has_seals(fd2, F_SEAL_WRITE); ++ ++ mfd_assert_add_seals(fd2, F_SEAL_SHRINK); ++ mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK); ++ mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK); ++ ++ close(fd); ++ fd = mfd_assert_open(fd2, O_RDONLY, 0); ++ ++ mfd_fail_add_seals(fd, F_SEAL_SEAL); ++ mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK); ++ mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK); ++ ++ close(fd2); ++ fd2 = mfd_assert_open(fd, O_RDWR, 0); ++ ++ mfd_assert_add_seals(fd2, F_SEAL_SEAL); ++ mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_SEAL); ++ mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_SEAL); ++ ++ close(fd2); ++ close(fd); ++} ++ ++/* ++ * Test sharing via fork() ++ * Test whether seal-modifications work as expected with forked childs. 
++ */ ++static void test_share_fork(void) ++{ ++ int fd; ++ pid_t pid; ++ ++ fd = mfd_assert_new("kern_memfd_share_fork", ++ MFD_DEF_SIZE, ++ MFD_CLOEXEC | MFD_ALLOW_SEALING); ++ mfd_assert_has_seals(fd, 0); ++ ++ pid = spawn_idle_thread(0); ++ mfd_assert_add_seals(fd, F_SEAL_SEAL); ++ mfd_assert_has_seals(fd, F_SEAL_SEAL); ++ ++ mfd_fail_add_seals(fd, F_SEAL_WRITE); ++ mfd_assert_has_seals(fd, F_SEAL_SEAL); ++ ++ join_idle_thread(pid); ++ ++ mfd_fail_add_seals(fd, F_SEAL_WRITE); ++ mfd_assert_has_seals(fd, F_SEAL_SEAL); ++ ++ close(fd); ++} ++ ++int main(int argc, char **argv) ++{ ++ pid_t pid; ++ ++ printf("memfd: CREATE\n"); ++ test_create(); ++ printf("memfd: BASIC\n"); ++ test_basic(); ++ ++ printf("memfd: SEAL-WRITE\n"); ++ test_seal_write(); ++ printf("memfd: SEAL-SHRINK\n"); ++ test_seal_shrink(); ++ printf("memfd: SEAL-GROW\n"); ++ test_seal_grow(); ++ printf("memfd: SEAL-RESIZE\n"); ++ test_seal_resize(); ++ ++ printf("memfd: SHARE-DUP\n"); ++ test_share_dup(); ++ printf("memfd: SHARE-MMAP\n"); ++ test_share_mmap(); ++ printf("memfd: SHARE-OPEN\n"); ++ test_share_open(); ++ printf("memfd: SHARE-FORK\n"); ++ test_share_fork(); ++ ++ /* Run test-suite in a multi-threaded environment with a shared ++ * file-table. 
*/ ++ pid = spawn_idle_thread(CLONE_FILES | CLONE_FS | CLONE_VM); ++ printf("memfd: SHARE-DUP (shared file-table)\n"); ++ test_share_dup(); ++ printf("memfd: SHARE-MMAP (shared file-table)\n"); ++ test_share_mmap(); ++ printf("memfd: SHARE-OPEN (shared file-table)\n"); ++ test_share_open(); ++ printf("memfd: SHARE-FORK (shared file-table)\n"); ++ test_share_fork(); ++ join_idle_thread(pid); ++ ++ printf("memfd: DONE\n"); ++ ++ return 0; ++} diff --git a/debian/patches/features/all/kdbus/shm-add-memfd_create-syscall.patch b/debian/patches/features/all/kdbus/shm-add-memfd_create-syscall.patch new file mode 100644 index 000000000..7ead2dc64 --- /dev/null +++ b/debian/patches/features/all/kdbus/shm-add-memfd_create-syscall.patch @@ -0,0 +1,181 @@ +From: David Herrmann +Date: Fri, 8 Aug 2014 14:25:29 -0700 +Subject: shm: add memfd_create() syscall +Origin: https://git.kernel.org/linus/9183df25fe7b194563db3fec6dc3202a5855839c +Bug-Debian: https://bugs.debian.org/760702 + +memfd_create() is similar to mmap(MAP_ANON), but returns a file-descriptor +that you can pass to mmap(). It can support sealing and avoids any +connection to user-visible mount-points. Thus, it's not subject to quotas +on mounted file-systems, but can be used like malloc()'ed memory, but with +a file-descriptor to it. + +memfd_create() returns the raw shmem file, so calls like ftruncate() can +be used to modify the underlying inode. Also calls like fstat() will +return proper information and mark the file as regular file. If you want +sealing, you can specify MFD_ALLOW_SEALING. Otherwise, sealing is not +supported (like on all other regular files). + +Compared to O_TMPFILE, it does not require a tmpfs mount-point and is not +subject to a filesystem size limit. It is still properly accounted to +memcg limits, though, and to the same overcommit or no-overcommit +accounting as all user memory. 
+ +Signed-off-by: David Herrmann +Acked-by: Hugh Dickins +Cc: Michael Kerrisk +Cc: Ryan Lortie +Cc: Lennart Poettering +Cc: Daniel Mack +Cc: Andy Lutomirski +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +[bwh: Backported to 3.16: adjust context] +--- + arch/x86/syscalls/syscall_32.tbl | 1 + + arch/x86/syscalls/syscall_64.tbl | 1 + + include/linux/syscalls.h | 1 + + include/uapi/linux/memfd.h | 8 +++++ + kernel/sys_ni.c | 1 + + mm/shmem.c | 73 ++++++++++++++++++++++++++++++++++++++++ + 6 files changed, 85 insertions(+) + create mode 100644 include/uapi/linux/memfd.h + +--- a/arch/x86/syscalls/syscall_32.tbl ++++ b/arch/x86/syscalls/syscall_32.tbl +@@ -360,3 +360,4 @@ + 351 i386 sched_setattr sys_sched_setattr + 352 i386 sched_getattr sys_sched_getattr + 353 i386 renameat2 sys_renameat2 ++356 i386 memfd_create sys_memfd_create +--- a/arch/x86/syscalls/syscall_64.tbl ++++ b/arch/x86/syscalls/syscall_64.tbl +@@ -323,6 +323,7 @@ + 314 common sched_setattr sys_sched_setattr + 315 common sched_getattr sys_sched_getattr + 316 common renameat2 sys_renameat2 ++319 common memfd_create sys_memfd_create + + # + # x32-specific system call numbers start at 512 to avoid cache impact +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -802,6 +802,7 @@ asmlinkage long sys_timerfd_settime(int + asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr); + asmlinkage long sys_eventfd(unsigned int count); + asmlinkage long sys_eventfd2(unsigned int count, int flags); ++asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags); + asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); + asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); + asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, +--- /dev/null ++++ b/include/uapi/linux/memfd.h +@@ -0,0 +1,8 @@ ++#ifndef _UAPI_LINUX_MEMFD_H ++#define _UAPI_LINUX_MEMFD_H ++ ++/* flags for 
memfd_create(2) (unsigned int) */ ++#define MFD_CLOEXEC 0x0001U ++#define MFD_ALLOW_SEALING 0x0002U ++ ++#endif /* _UAPI_LINUX_MEMFD_H */ +--- a/kernel/sys_ni.c ++++ b/kernel/sys_ni.c +@@ -197,6 +197,7 @@ cond_syscall(compat_sys_timerfd_settime) + cond_syscall(compat_sys_timerfd_gettime); + cond_syscall(sys_eventfd); + cond_syscall(sys_eventfd2); ++cond_syscall(sys_memfd_create); + + /* performance counters: */ + cond_syscall(sys_perf_event_open); +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -66,7 +66,9 @@ static struct vfsmount *shm_mnt; + #include + #include + #include ++#include + #include ++#include + + #include + #include +@@ -2710,6 +2712,77 @@ static int shmem_show_options(struct seq + shmem_show_mpol(seq, sbinfo->mpol); + return 0; + } ++ ++#define MFD_NAME_PREFIX "memfd:" ++#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) ++#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) ++ ++#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING) ++ ++SYSCALL_DEFINE2(memfd_create, ++ const char __user *, uname, ++ unsigned int, flags) ++{ ++ struct shmem_inode_info *info; ++ struct file *file; ++ int fd, error; ++ char *name; ++ long len; ++ ++ if (flags & ~(unsigned int)MFD_ALL_FLAGS) ++ return -EINVAL; ++ ++ /* length includes terminating zero */ ++ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); ++ if (len <= 0) ++ return -EFAULT; ++ if (len > MFD_NAME_MAX_LEN + 1) ++ return -EINVAL; ++ ++ name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY); ++ if (!name) ++ return -ENOMEM; ++ ++ strcpy(name, MFD_NAME_PREFIX); ++ if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { ++ error = -EFAULT; ++ goto err_name; ++ } ++ ++ /* terminating-zero may have changed after strnlen_user() returned */ ++ if (name[len + MFD_NAME_PREFIX_LEN - 1]) { ++ error = -EFAULT; ++ goto err_name; ++ } ++ ++ fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? 
O_CLOEXEC : 0); ++ if (fd < 0) { ++ error = fd; ++ goto err_name; ++ } ++ ++ file = shmem_file_setup(name, 0, VM_NORESERVE); ++ if (IS_ERR(file)) { ++ error = PTR_ERR(file); ++ goto err_fd; ++ } ++ info = SHMEM_I(file_inode(file)); ++ file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; ++ file->f_flags |= O_RDWR | O_LARGEFILE; ++ if (flags & MFD_ALLOW_SEALING) ++ info->seals &= ~F_SEAL_SEAL; ++ ++ fd_install(fd, file); ++ kfree(name); ++ return fd; ++ ++err_fd: ++ put_unused_fd(fd); ++err_name: ++ kfree(name); ++ return error; ++} ++ + #endif /* CONFIG_TMPFS */ + + static void shmem_put_super(struct super_block *sb) diff --git a/debian/patches/features/all/kdbus/shm-add-sealing-API.patch b/debian/patches/features/all/kdbus/shm-add-sealing-API.patch new file mode 100644 index 000000000..c09f65525 --- /dev/null +++ b/debian/patches/features/all/kdbus/shm-add-sealing-API.patch @@ -0,0 +1,395 @@ +From: David Herrmann +Date: Fri, 8 Aug 2014 14:25:27 -0700 +Subject: shm: add sealing API +Origin: https://git.kernel.org/linus/40e041a2c858b3caefc757e26cb85bfceae5062b +Bug-Debian: https://bugs.debian.org/760702 + +If two processes share a common memory region, they usually want some +guarantees to allow safe access. This often includes: + - one side cannot overwrite data while the other reads it + - one side cannot shrink the buffer while the other accesses it + - one side cannot grow the buffer beyond previously set boundaries + +If there is a trust-relationship between both parties, there is no need +for policy enforcement. However, if there's no trust relationship (eg., +for general-purpose IPC) sharing memory-regions is highly fragile and +often not possible without local copies. Look at the following two +use-cases: + + 1) A graphics client wants to share its rendering-buffer with a + graphics-server. The memory-region is allocated by the client for + read/write access and a second FD is passed to the server. 
While + scanning out from the memory region, the server has no guarantee that + the client doesn't shrink the buffer at any time, requiring rather + cumbersome SIGBUS handling. + 2) A process wants to perform an RPC on another process. To avoid huge + bandwidth consumption, zero-copy is preferred. After a message is + assembled in-memory and a FD is passed to the remote side, both sides + want to be sure that neither modifies this shared copy, anymore. The + source may have put sensitive data into the message without a separate + copy and the target may want to parse the message inline, to avoid a + local copy. + +While SIGBUS handling, POSIX mandatory locking and MAP_DENYWRITE provide +ways to achieve most of this, the first one is disproportionately ugly to +use in libraries and the latter two are broken/racy or even disabled due +to denial of service attacks. + +This patch introduces the concept of SEALING. If you seal a file, a +specific set of operations is blocked on that file forever. Unlike locks, +seals can only be set, never removed. Hence, once you verified a specific +set of seals is set, you're guaranteed that no-one can perform the blocked +operations on this file, anymore. + +An initial set of SEALS is introduced by this patch: + - SHRINK: If SEAL_SHRINK is set, the file in question cannot be reduced + in size. This affects ftruncate() and open(O_TRUNC). + - GROW: If SEAL_GROW is set, the file in question cannot be increased + in size. This affects ftruncate(), fallocate() and write(). + - WRITE: If SEAL_WRITE is set, no write operations (besides resizing) + are possible. This affects fallocate(PUNCH_HOLE), mmap() and + write(). + - SEAL: If SEAL_SEAL is set, no further seals can be added to a file. + This basically prevents the F_ADD_SEALS operation on a file and + can be set to prevent others from adding further seals that you + don't want. 
+ +The described use-cases can easily use these seals to provide safe use +without any trust-relationship: + + 1) The graphics server can verify that a passed file-descriptor has + SEAL_SHRINK set. This allows safe scanout, while the client is + allowed to increase buffer size for window-resizing on-the-fly. + Concurrent writes are explicitly allowed. + 2) For general-purpose IPC, both processes can verify that SEAL_SHRINK, + SEAL_GROW and SEAL_WRITE are set. This guarantees that neither + process can modify the data while the other side parses it. + Furthermore, it guarantees that even with writable FDs passed to the + peer, it cannot increase the size to hit memory-limits of the source + process (in case the file-storage is accounted to the source). + +The new API is an extension to fcntl(), adding two new commands: + F_GET_SEALS: Return a bitset describing the seals on the file. This + can be called on any FD if the underlying file supports + sealing. + F_ADD_SEALS: Change the seals of a given file. This requires WRITE + access to the file and F_SEAL_SEAL may not already be set. + Furthermore, the underlying file must support sealing and + there may not be any existing shared mapping of that file. + Otherwise, EBADF/EPERM is returned. + The given seals are _added_ to the existing set of seals + on the file. You cannot remove seals again. + +The fcntl() handler is currently specific to shmem and disabled on all +files. A file needs to explicitly support sealing for this interface to +work. A separate syscall is added in a follow-up, which creates files that +support sealing. There is no intention to support this on other +file-systems. Semantics are unclear for non-volatile files and we lack any +use-case right now. Therefore, the implementation is specific to shmem. 
+ +Signed-off-by: David Herrmann +Acked-by: Hugh Dickins +Cc: Michael Kerrisk +Cc: Ryan Lortie +Cc: Lennart Poettering +Cc: Daniel Mack +Cc: Andy Lutomirski +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +--- + fs/fcntl.c | 5 ++ + include/linux/shmem_fs.h | 17 ++++++ + include/uapi/linux/fcntl.h | 15 +++++ + mm/shmem.c | 143 +++++++++++++++++++++++++++++++++++++++++++++ + 4 files changed, 180 insertions(+) + +--- a/fs/fcntl.c ++++ b/fs/fcntl.c +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -336,6 +337,10 @@ static long do_fcntl(int fd, unsigned in + case F_GETPIPE_SZ: + err = pipe_fcntl(filp, cmd, arg); + break; ++ case F_ADD_SEALS: ++ case F_GET_SEALS: ++ err = shmem_fcntl(filp, cmd, arg); ++ break; + default: + break; + } +--- a/include/linux/shmem_fs.h ++++ b/include/linux/shmem_fs.h +@@ -1,6 +1,7 @@ + #ifndef __SHMEM_FS_H + #define __SHMEM_FS_H + ++#include + #include + #include + #include +@@ -11,6 +12,7 @@ + + struct shmem_inode_info { + spinlock_t lock; ++ unsigned int seals; /* shmem seals */ + unsigned long flags; + unsigned long alloced; /* data pages alloced to file */ + union { +@@ -65,4 +67,19 @@ static inline struct page *shmem_read_ma + mapping_gfp_mask(mapping)); + } + ++#ifdef CONFIG_TMPFS ++ ++extern int shmem_add_seals(struct file *file, unsigned int seals); ++extern int shmem_get_seals(struct file *file); ++extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg); ++ ++#else ++ ++static inline long shmem_fcntl(struct file *f, unsigned int c, unsigned long a) ++{ ++ return -EINVAL; ++} ++ ++#endif ++ + #endif +--- a/include/uapi/linux/fcntl.h ++++ b/include/uapi/linux/fcntl.h +@@ -28,6 +28,21 @@ + #define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) + + /* ++ * Set/Get seals ++ */ ++#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) ++#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) ++ ++/* ++ * Types of seals ++ */ ++#define F_SEAL_SEAL 0x0001 /* prevent further seals from 
being set */ ++#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ ++#define F_SEAL_GROW 0x0004 /* prevent file from growing */ ++#define F_SEAL_WRITE 0x0008 /* prevent writes */ ++/* (1U << 31) is reserved for signed error codes */ ++ ++/* + * Types of directory notifications that may be requested. + */ + #define DN_ACCESS 0x00000001 /* File accessed */ +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -66,6 +66,7 @@ static struct vfsmount *shm_mnt; + #include + #include + #include ++#include + + #include + #include +@@ -538,6 +539,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); + static int shmem_setattr(struct dentry *dentry, struct iattr *attr) + { + struct inode *inode = dentry->d_inode; ++ struct shmem_inode_info *info = SHMEM_I(inode); + int error; + + error = inode_change_ok(inode, attr); +@@ -548,6 +550,11 @@ static int shmem_setattr(struct dentry * + loff_t oldsize = inode->i_size; + loff_t newsize = attr->ia_size; + ++ /* protected by i_mutex */ ++ if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || ++ (newsize > oldsize && (info->seals & F_SEAL_GROW))) ++ return -EPERM; ++ + if (newsize != oldsize) { + i_size_write(inode, newsize); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; +@@ -1390,6 +1397,7 @@ static struct inode *shmem_get_inode(str + info = SHMEM_I(inode); + memset(info, 0, (char *)inode - (char *)info); + spin_lock_init(&info->lock); ++ info->seals = F_SEAL_SEAL; + info->flags = flags & VM_NORESERVE; + INIT_LIST_HEAD(&info->swaplist); + simple_xattrs_init(&info->xattrs); +@@ -1448,7 +1456,17 @@ shmem_write_begin(struct file *file, str + struct page **pagep, void **fsdata) + { + struct inode *inode = mapping->host; ++ struct shmem_inode_info *info = SHMEM_I(inode); + pgoff_t index = pos >> PAGE_CACHE_SHIFT; ++ ++ /* i_mutex is held by caller */ ++ if (unlikely(info->seals)) { ++ if (info->seals & F_SEAL_WRITE) ++ return -EPERM; ++ if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) ++ return -EPERM; ++ } ++ + return 
shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); + } + +@@ -1786,11 +1804,125 @@ static loff_t shmem_file_llseek(struct f + return offset; + } + ++static int shmem_wait_for_pins(struct address_space *mapping) ++{ ++ return 0; ++} ++ ++#define F_ALL_SEALS (F_SEAL_SEAL | \ ++ F_SEAL_SHRINK | \ ++ F_SEAL_GROW | \ ++ F_SEAL_WRITE) ++ ++int shmem_add_seals(struct file *file, unsigned int seals) ++{ ++ struct inode *inode = file_inode(file); ++ struct shmem_inode_info *info = SHMEM_I(inode); ++ int error; ++ ++ /* ++ * SEALING ++ * Sealing allows multiple parties to share a shmem-file but restrict ++ * access to a specific subset of file operations. Seals can only be ++ * added, but never removed. This way, mutually untrusted parties can ++ * share common memory regions with a well-defined policy. A malicious ++ * peer can thus never perform unwanted operations on a shared object. ++ * ++ * Seals are only supported on special shmem-files and always affect ++ * the whole underlying inode. Once a seal is set, it may prevent some ++ * kinds of access to the file. Currently, the following seals are ++ * defined: ++ * SEAL_SEAL: Prevent further seals from being set on this file ++ * SEAL_SHRINK: Prevent the file from shrinking ++ * SEAL_GROW: Prevent the file from growing ++ * SEAL_WRITE: Prevent write access to the file ++ * ++ * As we don't require any trust relationship between two parties, we ++ * must prevent seals from being removed. Therefore, sealing a file ++ * only adds a given set of seals to the file, it never touches ++ * existing seals. Furthermore, the "setting seals"-operation can be ++ * sealed itself, which basically prevents any further seal from being ++ * added. ++ * ++ * Semantics of sealing are only defined on volatile files. Only ++ * anonymous shmem files support sealing. More importantly, seals are ++ * never written to disk. Therefore, there's no plan to support it on ++ * other file types. 
++ */ ++ ++ if (file->f_op != &shmem_file_operations) ++ return -EINVAL; ++ if (!(file->f_mode & FMODE_WRITE)) ++ return -EPERM; ++ if (seals & ~(unsigned int)F_ALL_SEALS) ++ return -EINVAL; ++ ++ mutex_lock(&inode->i_mutex); ++ ++ if (info->seals & F_SEAL_SEAL) { ++ error = -EPERM; ++ goto unlock; ++ } ++ ++ if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) { ++ error = mapping_deny_writable(file->f_mapping); ++ if (error) ++ goto unlock; ++ ++ error = shmem_wait_for_pins(file->f_mapping); ++ if (error) { ++ mapping_allow_writable(file->f_mapping); ++ goto unlock; ++ } ++ } ++ ++ info->seals |= seals; ++ error = 0; ++ ++unlock: ++ mutex_unlock(&inode->i_mutex); ++ return error; ++} ++EXPORT_SYMBOL_GPL(shmem_add_seals); ++ ++int shmem_get_seals(struct file *file) ++{ ++ if (file->f_op != &shmem_file_operations) ++ return -EINVAL; ++ ++ return SHMEM_I(file_inode(file))->seals; ++} ++EXPORT_SYMBOL_GPL(shmem_get_seals); ++ ++long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ long error; ++ ++ switch (cmd) { ++ case F_ADD_SEALS: ++ /* disallow upper 32bit */ ++ if (arg > UINT_MAX) ++ return -EINVAL; ++ ++ error = shmem_add_seals(file, arg); ++ break; ++ case F_GET_SEALS: ++ error = shmem_get_seals(file); ++ break; ++ default: ++ error = -EINVAL; ++ break; ++ } ++ ++ return error; ++} ++ + static long shmem_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) + { + struct inode *inode = file_inode(file); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); ++ struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_falloc shmem_falloc; + pgoff_t start, index, end; + int error; +@@ -1806,6 +1938,12 @@ static long shmem_fallocate(struct file + loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); + ++ /* protected by i_mutex */ ++ if (info->seals & F_SEAL_WRITE) { ++ error = -EPERM; ++ goto out; ++ } ++ + shmem_falloc.waitq = &shmem_falloc_waitq; + 
shmem_falloc.start = unmap_start >> PAGE_SHIFT; + shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; +@@ -1832,6 +1970,11 @@ static long shmem_fallocate(struct file + if (error) + goto out; + ++ if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { ++ error = -EPERM; ++ goto out; ++ } ++ + start = offset >> PAGE_CACHE_SHIFT; + end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + /* Try to avoid a swapstorm if len is impossible to satisfy */ diff --git a/debian/patches/features/all/kdbus/shm-wait-for-pins-to-be-released-when-sealing.patch b/debian/patches/features/all/kdbus/shm-wait-for-pins-to-be-released-when-sealing.patch new file mode 100644 index 000000000..da1ca58d8 --- /dev/null +++ b/debian/patches/features/all/kdbus/shm-wait-for-pins-to-be-released-when-sealing.patch @@ -0,0 +1,154 @@ +From: David Herrmann +Date: Fri, 8 Aug 2014 14:25:36 -0700 +Subject: shm: wait for pins to be released when sealing +Origin: https://git.kernel.org/linus/05f65b5c70909ef686f865f0a85406d74d75f70f +Bug-Debian: https://bugs.debian.org/760702 + +If we set SEAL_WRITE on a file, we must make sure there cannot be any +ongoing write-operations on the file. For write() calls, we simply lock +the inode mutex, for mmap() we simply verify there're no writable +mappings. However, there might be pages pinned by AIO, Direct-IO and +similar operations via GUP. We must make sure those do not write to the +memfd file after we set SEAL_WRITE. + +As there is no way to notify GUP users to drop pages or to wait for them +to be done, we implement the wait ourself: When setting SEAL_WRITE, we +check all pages for their ref-count. If it's bigger than 1, we know +there's some user of the page. We then mark the page and wait for up to +150ms for those ref-counts to be dropped. If the ref-counts are not +dropped in time, we refuse the seal operation. 
+ +Signed-off-by: David Herrmann +Acked-by: Hugh Dickins +Cc: Michael Kerrisk +Cc: Ryan Lortie +Cc: Lennart Poettering +Cc: Daniel Mack +Cc: Andy Lutomirski +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +--- + mm/shmem.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 109 insertions(+), 1 deletion(-) + +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -1806,9 +1806,117 @@ static loff_t shmem_file_llseek(struct f + return offset; + } + ++/* ++ * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, ++ * so reuse a tag which we firmly believe is never set or cleared on shmem. ++ */ ++#define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE ++#define LAST_SCAN 4 /* about 150ms max */ ++ ++static void shmem_tag_pins(struct address_space *mapping) ++{ ++ struct radix_tree_iter iter; ++ void **slot; ++ pgoff_t start; ++ struct page *page; ++ ++ lru_add_drain(); ++ start = 0; ++ rcu_read_lock(); ++ ++restart: ++ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { ++ page = radix_tree_deref_slot(slot); ++ if (!page || radix_tree_exception(page)) { ++ if (radix_tree_deref_retry(page)) ++ goto restart; ++ } else if (page_count(page) - page_mapcount(page) > 1) { ++ spin_lock_irq(&mapping->tree_lock); ++ radix_tree_tag_set(&mapping->page_tree, iter.index, ++ SHMEM_TAG_PINNED); ++ spin_unlock_irq(&mapping->tree_lock); ++ } ++ ++ if (need_resched()) { ++ cond_resched_rcu(); ++ start = iter.index + 1; ++ goto restart; ++ } ++ } ++ rcu_read_unlock(); ++} ++ ++/* ++ * Setting SEAL_WRITE requires us to verify there's no pending writer. However, ++ * via get_user_pages(), drivers might have some pending I/O without any active ++ * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages ++ * and see whether it has an elevated ref-count. If so, we tag them and wait for ++ * them to be dropped. 
++ * The caller must guarantee that no new user will acquire writable references ++ * to those pages to avoid races. ++ */ + static int shmem_wait_for_pins(struct address_space *mapping) + { +- return 0; ++ struct radix_tree_iter iter; ++ void **slot; ++ pgoff_t start; ++ struct page *page; ++ int error, scan; ++ ++ shmem_tag_pins(mapping); ++ ++ error = 0; ++ for (scan = 0; scan <= LAST_SCAN; scan++) { ++ if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED)) ++ break; ++ ++ if (!scan) ++ lru_add_drain_all(); ++ else if (schedule_timeout_killable((HZ << scan) / 200)) ++ scan = LAST_SCAN; ++ ++ start = 0; ++ rcu_read_lock(); ++restart: ++ radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, ++ start, SHMEM_TAG_PINNED) { ++ ++ page = radix_tree_deref_slot(slot); ++ if (radix_tree_exception(page)) { ++ if (radix_tree_deref_retry(page)) ++ goto restart; ++ ++ page = NULL; ++ } ++ ++ if (page && ++ page_count(page) - page_mapcount(page) != 1) { ++ if (scan < LAST_SCAN) ++ goto continue_resched; ++ ++ /* ++ * On the last scan, we clean up all those tags ++ * we inserted; but make a note that we still ++ * found pages pinned. ++ */ ++ error = -EBUSY; ++ } ++ ++ spin_lock_irq(&mapping->tree_lock); ++ radix_tree_tag_clear(&mapping->page_tree, ++ iter.index, SHMEM_TAG_PINNED); ++ spin_unlock_irq(&mapping->tree_lock); ++continue_resched: ++ if (need_resched()) { ++ cond_resched_rcu(); ++ start = iter.index + 1; ++ goto restart; ++ } ++ } ++ rcu_read_unlock(); ++ } ++ ++ return error; + } + + #define F_ALL_SEALS (F_SEAL_SEAL | \ diff --git a/debian/patches/features/all/kdbus/sparc-Hook-up-memfd_create-system-call.patch b/debian/patches/features/all/kdbus/sparc-Hook-up-memfd_create-system-call.patch new file mode 100644 index 000000000..b6e556a0c --- /dev/null +++ b/debian/patches/features/all/kdbus/sparc-Hook-up-memfd_create-system-call.patch @@ -0,0 +1,49 @@ +From: "David S. 
Miller" +Date: Wed, 13 Aug 2014 22:00:09 -0700 +Subject: sparc: Hook up memfd_create system call. +Origin: https://git.kernel.org/linus/10cf15e1d1289aa0bf1d26e9f55176b4c7c5c512 +Bug-Debian: https://bugs.debian.org/760702 + +Signed-off-by: David S. Miller +[bwh: Backported to 3.16: + - Adjust context + - Insert unimplemented-syscall entries for seccomp and getrandom] +--- +--- a/arch/sparc/include/uapi/asm/unistd.h ++++ b/arch/sparc/include/uapi/asm/unistd.h +@@ -411,8 +411,9 @@ + #define __NR_sched_setattr 343 + #define __NR_sched_getattr 344 + #define __NR_renameat2 345 ++#define __NR_memfd_create 348 + +-#define NR_syscalls 346 ++#define NR_syscalls 349 + + /* Bitmask values returned from kern_features system call. */ + #define KERN_FEATURE_MIXED_MODE_STACK 0x00000001 +--- a/arch/sparc/kernel/systbls_32.S ++++ b/arch/sparc/kernel/systbls_32.S +@@ -86,4 +86,4 @@ sys_call_table: + /*330*/ .long sys_fanotify_mark, sys_prlimit64, sys_name_to_handle_at, sys_open_by_handle_at, sys_clock_adjtime + /*335*/ .long sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev + /*340*/ .long sys_ni_syscall, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr +-/*345*/ .long sys_renameat2 ++/*345*/ .long sys_renameat2, sys_ni_syscall, sys_ni_syscall, sys_memfd_create +--- a/arch/sparc/kernel/systbls_64.S ++++ b/arch/sparc/kernel/systbls_64.S +@@ -87,7 +87,7 @@ sys_call_table32: + /*330*/ .word compat_sys_fanotify_mark, sys_prlimit64, sys_name_to_handle_at, compat_sys_open_by_handle_at, compat_sys_clock_adjtime + .word sys_syncfs, compat_sys_sendmmsg, sys_setns, compat_sys_process_vm_readv, compat_sys_process_vm_writev + /*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr +- .word sys32_renameat2 ++ .word sys32_renameat2, sys_ni_syscall, sys_ni_syscall, sys_memfd_create + + #endif /* CONFIG_COMPAT */ + +@@ -166,4 +166,4 @@ sys_call_table: + /*330*/ .word sys_fanotify_mark, sys_prlimit64, 
sys_name_to_handle_at, sys_open_by_handle_at, sys_clock_adjtime + .word sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev + /*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr +- .word sys_renameat2 ++ .word sys_renameat2, sys_ni_syscall, sys_ni_syscall, sys_memfd_create diff --git a/debian/patches/features/all/kdbus/tools-selftests-fix-build-issue-with-make-kselftests.patch b/debian/patches/features/all/kdbus/tools-selftests-fix-build-issue-with-make-kselftests.patch new file mode 100644 index 000000000..a2703b007 --- /dev/null +++ b/debian/patches/features/all/kdbus/tools-selftests-fix-build-issue-with-make-kselftests.patch @@ -0,0 +1,118 @@ +From: Phong Tran +Date: Fri, 29 Aug 2014 15:19:06 -0700 +Subject: tools: selftests: fix build issue with make kselftests target +Origin: https://git.kernel.org/linus/498b473af9c20a4cb533297dc43b063f35f86349 +Bug-Debian: https://bugs.debian.org/760702 + +Fix the typo of ARCH when running 'make kselftests'. Change the 'X86' +to 'x86'. Test by compilation. 
+ +Signed-off-by: Phong Tran +Cc: David Herrmann +Cc: Hugh Dickins +Cc: Shuah Khan +Cc: Sam Ravnborg +Cc: Michal Marek +Cc: Shuah Khan +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +--- + tools/testing/selftests/ipc/Makefile | 6 +++--- + tools/testing/selftests/kcmp/Makefile | 6 +++--- + tools/testing/selftests/memfd/Makefile | 10 +++++----- + 3 files changed, 11 insertions(+), 11 deletions(-) + +diff --git a/tools/testing/selftests/ipc/Makefile b/tools/testing/selftests/ipc/Makefile +index 5386fd7..74bbefd 100644 +--- a/tools/testing/selftests/ipc/Makefile ++++ b/tools/testing/selftests/ipc/Makefile +@@ -1,18 +1,18 @@ + uname_M := $(shell uname -m 2>/dev/null || echo not) + ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/) + ifeq ($(ARCH),i386) +- ARCH := X86 ++ ARCH := x86 + CFLAGS := -DCONFIG_X86_32 -D__i386__ + endif + ifeq ($(ARCH),x86_64) +- ARCH := X86 ++ ARCH := x86 + CFLAGS := -DCONFIG_X86_64 -D__x86_64__ + endif + + CFLAGS += -I../../../../usr/include/ + + all: +-ifeq ($(ARCH),X86) ++ifeq ($(ARCH),x86) + gcc $(CFLAGS) msgque.c -o msgque_test + else + echo "Not an x86 target, can't build msgque selftest" +diff --git a/tools/testing/selftests/kcmp/Makefile b/tools/testing/selftests/kcmp/Makefile +index d7d6bbe..8aabd82 100644 +--- a/tools/testing/selftests/kcmp/Makefile ++++ b/tools/testing/selftests/kcmp/Makefile +@@ -1,11 +1,11 @@ + uname_M := $(shell uname -m 2>/dev/null || echo not) + ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/) + ifeq ($(ARCH),i386) +- ARCH := X86 ++ ARCH := x86 + CFLAGS := -DCONFIG_X86_32 -D__i386__ + endif + ifeq ($(ARCH),x86_64) +- ARCH := X86 ++ ARCH := x86 + CFLAGS := -DCONFIG_X86_64 -D__x86_64__ + endif + +@@ -15,7 +15,7 @@ CFLAGS += -I../../../../usr/include/ + CFLAGS += -I../../../../arch/x86/include/ + + all: +-ifeq ($(ARCH),X86) ++ifeq ($(ARCH),x86) + gcc $(CFLAGS) kcmp_test.c -o kcmp_test + else + echo "Not an x86 target, can't build kcmp selftest" +diff --git 
a/tools/testing/selftests/memfd/Makefile b/tools/testing/selftests/memfd/Makefile +index 6816c49..ad4ab01 100644 +--- a/tools/testing/selftests/memfd/Makefile ++++ b/tools/testing/selftests/memfd/Makefile +@@ -1,10 +1,10 @@ + uname_M := $(shell uname -m 2>/dev/null || echo not) + ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/) + ifeq ($(ARCH),i386) +- ARCH := X86 ++ ARCH := x86 + endif + ifeq ($(ARCH),x86_64) +- ARCH := X86 ++ ARCH := x86 + endif + + CFLAGS += -D_FILE_OFFSET_BITS=64 +@@ -14,20 +14,20 @@ CFLAGS += -I../../../../include/uapi/ + CFLAGS += -I../../../../include/ + + all: +-ifeq ($(ARCH),X86) ++ifeq ($(ARCH),x86) + gcc $(CFLAGS) memfd_test.c -o memfd_test + else + echo "Not an x86 target, can't build memfd selftest" + endif + + run_tests: all +-ifeq ($(ARCH),X86) ++ifeq ($(ARCH),x86) + gcc $(CFLAGS) memfd_test.c -o memfd_test + endif + @./memfd_test || echo "memfd_test: [FAIL]" + + build_fuse: +-ifeq ($(ARCH),X86) ++ifeq ($(ARCH),x86) + gcc $(CFLAGS) fuse_mnt.c `pkg-config fuse --cflags --libs` -o fuse_mnt + gcc $(CFLAGS) fuse_test.c -o fuse_test + else diff --git a/debian/patches/series b/debian/patches/series index a3db25bc6..9b8d9c47c 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -125,6 +125,23 @@ bugfix/all/drivers-mfd-rtsx_usb.c-export-device-table.patch bugfix/all/reiserfs-fix-corruption-introduced-by-balance_leaf-r.patch bugfix/all/reiserfs-Fix-use-after-free-in-journal-teardown.patch +# memfd_create() & kdbus backport +features/all/kdbus/mm-allow-drivers-to-prevent-new-writable-mappings.patch +features/all/kdbus/shm-add-sealing-API.patch +features/all/kdbus/shm-add-memfd_create-syscall.patch +features/all/kdbus/selftests-add-memfd_create-sealing-tests.patch +features/all/kdbus/selftests-add-memfd-sealing-page-pinning-tests.patch +features/all/kdbus/shm-wait-for-pins-to-be-released-when-sealing.patch +features/all/kdbus/tools-selftests-fix-build-issue-with-make-kselftests.patch 
+features/all/kdbus/ARM-wire-up-memfd_create-syscall.patch +features/all/kdbus/arm64-compat-wire-up-memfd_create-syscall.patch +features/all/kdbus/s390-wire-up-memfd_create-syscall.patch +features/all/kdbus/sparc-Hook-up-memfd_create-system-call.patch +features/all/kdbus/asm-generic-add-memfd_create-system-call-to-unistd.h.patch +features/all/kdbus/m68k-Wire-up-memfd_create.patch +features/all/kdbus/MIPS-Wire-up-new-syscalls-getrandom-and-memfd_create.patch +features/all/kdbus/powerpc-Wire-up-sys_seccomp-sys_getrandom-and-sys_me.patch + # Miscellaneous features features/all/efi-autoload-efivars.patch features/all/virtio-scsi-Implement-change_queue_depth-for-virtscs.patch