// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/userfaultfd.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *  Copyright (C) 2008-2009 Red Hat, Inc.
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 *  Some part derived from fs/eventfd.c (anon inode setup) and
 *  mm/ksm.c (mm hashing).
 */

#include <linux/list.h>
#include <linux/hashtable.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/file.h>
#include <linux/bug.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>
#include <linux/ioctl.h>
#include <linux/security.h>
#include <linux/hugetlb.h>

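/*
 * For orientation: a minimal sketch of the userspace lifecycle this
 * file serves (editorial illustration, not part of the kernel source;
 * it assumes the standard uapi in <linux/userfaultfd.h>, and addr,
 * len and page_buf are placeholder variables):
 *
 *	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
 *	struct uffdio_api api = { .api = UFFD_API };
 *	ioctl(uffd, UFFDIO_API, &api);
 *	struct uffdio_register reg = {
 *		.range = { .start = (unsigned long)addr, .len = len },
 *		.mode = UFFDIO_REGISTER_MODE_MISSING,
 *	};
 *	ioctl(uffd, UFFDIO_REGISTER, &reg);
 *
 *	struct uffd_msg msg;
 *	read(uffd, &msg, sizeof(msg));	-- sleeps on fd_wqh below
 *	long page_size = sysconf(_SC_PAGESIZE);
 *	struct uffdio_copy copy = {
 *		.dst = msg.arg.pagefault.address & ~(page_size - 1),
 *		.src = (unsigned long)page_buf,
 *		.len = page_size,
 *	};
 *	ioctl(uffd, UFFDIO_COPY, &copy);	-- resolves and wakes the fault
 */
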
int sysctl_unprivileged_userfaultfd __read_mostly;

static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;

enum userfaultfd_state {
	UFFD_STATE_WAIT_API,
	UFFD_STATE_RUNNING,
};

/*
 * Start with fault_pending_wqh and fault_wqh so they're more likely
 * to be in the same cacheline.
 *
 * Locking order:
 *	fd_wqh.lock
 *		fault_pending_wqh.lock
 *			fault_wqh.lock
 *		event_wqh.lock
 *
 * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
 * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
 * also taken in IRQ context.
 */
struct userfaultfd_ctx {
	/* waitqueue head for the pending (i.e. not read) userfaults */
	wait_queue_head_t fault_pending_wqh;
	/* waitqueue head for the userfaults */
	wait_queue_head_t fault_wqh;
	/* waitqueue head for the pseudo fd to wakeup poll/read */
	wait_queue_head_t fd_wqh;
	/* waitqueue head for events */
	wait_queue_head_t event_wqh;
	/* a refile sequence protected by fault_pending_wqh lock */
	seqcount_spinlock_t refile_seq;
	/* pseudo fd refcounting */
	refcount_t refcount;
	/* userfaultfd syscall flags */
	unsigned int flags;
	/* features requested from the userspace */
	unsigned int features;
	/* state machine */
	enum userfaultfd_state state;
	/* released */
	bool released;
	/* memory mappings are changing because of non-cooperative event */
	bool mmap_changing;
	/* mm with one or more vmas attached to this userfaultfd_ctx */
	struct mm_struct *mm;
};
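
/*
 * A hedged sketch of how a seqcount_spinlock_t such as refile_seq is
 * meant to be paired with its spinlock (the refile itself happens in
 * the read path, outside this excerpt): the writer refiles waiters
 * from fault_pending_wqh to fault_wqh under fault_pending_wqh.lock,
 * while a lockless reader retries if it raced with a refile:
 *
 *	writer (holding fault_pending_wqh.lock):
 *		write_seqcount_begin(&ctx->refile_seq);
 *		list_del(&uwq->wq.entry);
 *		__add_wait_queue(&ctx->fault_wqh, &uwq->wq);
 *		write_seqcount_end(&ctx->refile_seq);
 *
 *	reader:
 *		do {
 *			seq = read_seqcount_begin(&ctx->refile_seq);
 *			need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
 *				      waitqueue_active(&ctx->fault_wqh);
 *		} while (read_seqcount_retry(&ctx->refile_seq, seq));
 */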

struct userfaultfd_fork_ctx {
	struct userfaultfd_ctx *orig;
	struct userfaultfd_ctx *new;
	struct list_head list;
};

struct userfaultfd_unmap_ctx {
	struct userfaultfd_ctx *ctx;
	unsigned long start;
	unsigned long end;
	struct list_head list;
};

struct userfaultfd_wait_queue {
	struct uffd_msg msg;
	wait_queue_entry_t wq;
	struct userfaultfd_ctx *ctx;
	bool waken;
};

struct userfaultfd_wake_range {
	unsigned long start;
	unsigned long len;
};

static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
				     int wake_flags, void *key)
{
	struct userfaultfd_wake_range *range = key;
	int ret;
	struct userfaultfd_wait_queue *uwq;
	unsigned long start, len;

	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
	ret = 0;
	/* len == 0 means wake all */
	start = range->start;
	len = range->len;
	if (len && (start > uwq->msg.arg.pagefault.address ||
		    start + len <= uwq->msg.arg.pagefault.address))
		goto out;
	WRITE_ONCE(uwq->waken, true);
	/*
	 * The Program-Order guarantees provided by the scheduler
	 * ensure uwq->waken is visible before the task is woken.
	 */
	ret = wake_up_state(wq->private, mode);
	if (ret) {
		/*
		 * Wake only once, autoremove behavior.
		 *
		 * After the effect of list_del_init is visible to the other
		 * CPUs, the waitqueue may disappear from under us, see the
		 * !list_empty_careful() in handle_userfault().
		 *
		 * try_to_wake_up() has an implicit smp_mb(), and the
		 * wq->private is read before calling the extern function
		 * "wake_up_state" (which in turn calls try_to_wake_up).
		 */
		list_del_init(&wq->entry);
	}
out:
	return ret;
}
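
/*
 * Worked example of the range check above: with
 * range = { .start = 0x1000, .len = 0x2000 }, waiters whose fault
 * address falls in [0x1000, 0x3000) are woken, i.e. 0x1000..0x2fff;
 * len == 0 wakes every waiter regardless of address.
 */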

/**
 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to the userfaultfd context.
 */
static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
{
	refcount_inc(&ctx->refcount);
}

/**
 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to userfaultfd context.
 *
 * The userfaultfd context reference must have been previously acquired either
 * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
 */
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
	if (refcount_dec_and_test(&ctx->refcount)) {
		VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
		mmdrop(ctx->mm);
		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
	}
}

static inline void msg_init(struct uffd_msg *msg)
{
	BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
	/*
	 * Must use memset to zero out the padding, or kernel data is
	 * leaked to userland.
	 */
	memset(msg, 0, sizeof(struct uffd_msg));
}
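
/*
 * Why memset() rather than an initializer (editorial note): a
 * designated initializer such as
 *
 *	struct uffd_msg msg = { .event = UFFD_EVENT_PAGEFAULT };
 *
 * only guarantees zeroing of the remaining members; the padding bytes
 * between members are left unspecified by the C standard and could
 * carry stale kernel stack contents to userland via read(). memset()
 * zeroes the whole 32-byte object, padding included.
 */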

static inline struct uffd_msg userfault_msg(unsigned long address,
					    unsigned int flags,
					    unsigned long reason,
					    unsigned int features)
{
	struct uffd_msg msg;
	msg_init(&msg);
	msg.event = UFFD_EVENT_PAGEFAULT;
	msg.arg.pagefault.address = address;
	/*
	 * These flags indicate why the userfault occurred:
	 * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
	 * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
	 * - Neither of these flags being set indicates a MISSING fault.
	 *
	 * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
	 * fault. Otherwise, it was a read fault.
	 */
	if (flags & FAULT_FLAG_WRITE)
		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
	if (reason & VM_UFFD_WP)
		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
	if (reason & VM_UFFD_MINOR)
		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
	if (features & UFFD_FEATURE_THREAD_ID)
		msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
	return msg;
}
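
/*
 * A minimal userspace sketch (not part of this file) of decoding the
 * flags set above, after read()ing the message:
 *
 *	if (msg.event == UFFD_EVENT_PAGEFAULT) {
 *		int wp = msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP;
 *		int minor = msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR;
 *		int missing = !wp && !minor;
 *		int write = msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE;
 *	}
 */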

#ifdef CONFIG_HUGETLB_PAGE
/*
 * Same functionality as userfaultfd_must_wait below with modifications for
 * hugepmd ranges.
 */
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					 struct vm_area_struct *vma,
					 unsigned long address,
					 unsigned long flags,
					 unsigned long reason)
{
	struct mm_struct *mm = ctx->mm;
	pte_t *ptep, pte;
	bool ret = true;

	mmap_assert_locked(mm);

	ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));

	if (!ptep)
		goto out;

	ret = false;
	pte = huge_ptep_get(ptep);

	/*
	 * Lockless access: we're in a wait_event so it's ok if it
	 * changes under us.
	 */
	if (huge_pte_none(pte))
		ret = true;
	if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
		ret = true;
out:
	return ret;
}
#else
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					 struct vm_area_struct *vma,
					 unsigned long address,
					 unsigned long flags,
					 unsigned long reason)
{
	return false;	/* should never get here */
}
#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Verify the pagetables are still not ok after having registered into
 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
 * userfault that has already been resolved, if userfaultfd_read and
 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
 * threads.
 */
static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
					 unsigned long address,
					 unsigned long flags,
					 unsigned long reason)
{
	struct mm_struct *mm = ctx->mm;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd, _pmd;
	pte_t *pte;
	bool ret = true;

	mmap_assert_locked(mm);

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out;
	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		goto out;
	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		goto out;
	pmd = pmd_offset(pud, address);
	/*
	 * READ_ONCE must function as a barrier with narrower scope
	 * and it must be equivalent to:
	 *	_pmd = *pmd; barrier();
	 *
	 * This is to deal with the instability (as in
	 * pmd_trans_unstable) of the pmd.
	 */
	_pmd = READ_ONCE(*pmd);
	if (pmd_none(_pmd))
		goto out;

	ret = false;
	if (!pmd_present(_pmd))
		goto out;

	if (pmd_trans_huge(_pmd)) {
		if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
			ret = true;
		goto out;
	}

	/*
	 * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
	 * and use the standard pte_offset_map() instead of parsing _pmd.
	 */
	pte = pte_offset_map(pmd, address);
	/*
	 * Lockless access: we're in a wait_event so it's ok if it
	 * changes under us.
	 */
	if (pte_none(*pte))
		ret = true;
	if (!pte_write(*pte) && (reason & VM_UFFD_WP))
		ret = true;
	pte_unmap(pte);

out:
	return ret;
}

static inline long userfaultfd_get_blocking_state(unsigned int flags)
{
	if (flags & FAULT_FLAG_INTERRUPTIBLE)
		return TASK_INTERRUPTIBLE;

	if (flags & FAULT_FLAG_KILLABLE)
		return TASK_KILLABLE;

	return TASK_UNINTERRUPTIBLE;
}

/*
 * The locking rules involved in returning VM_FAULT_RETRY depending on
 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
 * recommendation in __lock_page_or_retry is not an understatement.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
 * not set.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
 * set, VM_FAULT_RETRY can still be returned if and only if there are
 * fatal_signal_pending()s, and the mmap_lock must be released before
 * returning it.
 */
vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
{
	struct mm_struct *mm = vmf->vma->vm_mm;
	struct userfaultfd_ctx *ctx;
	struct userfaultfd_wait_queue uwq;
	vm_fault_t ret = VM_FAULT_SIGBUS;
	bool must_wait;
	long blocking_state;

	/*
	 * We don't do userfault handling for the final child pid update.
	 *
	 * We also don't do userfault handling during
	 * coredumping. hugetlbfs has the special
	 * follow_hugetlb_page() to skip missing pages in the
	 * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
	 * the no_page_table() helper in follow_page_mask(), but the
	 * shmem_vm_ops->fault method is invoked even during
	 * coredumping without mmap_lock and it ends up here.
	 */
	if (current->flags & (PF_EXITING|PF_DUMPCORE))
		goto out;

	/*
	 * Coredumping runs without mmap_lock so we can only check that
	 * the mmap_lock is held, if PF_DUMPCORE was not set.
	 */
	mmap_assert_locked(mm);

	ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
	if (!ctx)
		goto out;

	BUG_ON(ctx->mm != mm);

	/* Any unrecognized flag is a bug. */
	VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
	/* 0 or > 1 flags set is a bug; we expect exactly 1. */
	VM_BUG_ON(!reason || (reason & (reason - 1)));

	if (ctx->features & UFFD_FEATURE_SIGBUS)
		goto out;
	if ((vmf->flags & FAULT_FLAG_USER) == 0 &&
	    ctx->flags & UFFD_USER_MODE_ONLY) {
		printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
			"sysctl knob to 1 if kernel faults must be handled "
			"without obtaining CAP_SYS_PTRACE capability\n");
		goto out;
	}

	/*
	 * If it's already released don't get it. This avoids looping
	 * in __get_user_pages if userfaultfd_release waits on the
	 * caller of handle_userfault to release the mmap_lock.
	 */
	if (unlikely(READ_ONCE(ctx->released))) {
		/*
		 * Don't return VM_FAULT_SIGBUS in this case, so a
		 * non-cooperative manager can close the uffd after the
		 * last UFFDIO_COPY, without risking an involuntary
		 * SIGBUS if the process was starting the userfaultfd
		 * while the userfaultfd was still armed (but after the
		 * last UFFDIO_COPY). If the uffd wasn't already closed
		 * when the userfault reached this point, that would
		 * normally be solved by userfaultfd_must_wait
		 * returning 'false'.
		 *
		 * If we were to return VM_FAULT_SIGBUS here, the
		 * non-cooperative manager would instead be forced to
		 * always call UFFDIO_UNREGISTER before it can safely
		 * close the uffd.
		 */
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	/*
	 * Check that we can return VM_FAULT_RETRY.
	 *
	 * NOTE: it should become possible to return VM_FAULT_RETRY
	 * even if FAULT_FLAG_TRIED is set without leading to gup()
	 * -EBUSY failures, if the userfaultfd is to be extended for
	 * VM_UFFD_WP tracking and we intend to arm the userfault
	 * without first stopping userland access to the memory. For
	 * VM_UFFD_MISSING userfaults this is enough for now.
	 */
	if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
		/*
		 * Validate the invariant that nowait must allow retry
		 * to be sure not to return SIGBUS erroneously on
		 * nowait invocations.
		 */
		BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
		if (printk_ratelimit()) {
			printk(KERN_WARNING
			       "FAULT_FLAG_ALLOW_RETRY missing %x\n",
			       vmf->flags);
			dump_stack();
		}
#endif
		goto out;
	}

	/*
	 * Handle nowait, not much to do other than tell it to retry
	 * and wait.
	 */
	ret = VM_FAULT_RETRY;
	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
		goto out;

	/* take the reference before dropping the mmap_lock */
	userfaultfd_ctx_get(ctx);

	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
	uwq.wq.private = current;
	uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
			ctx->features);
	uwq.ctx = ctx;
	uwq.waken = false;

	blocking_state = userfaultfd_get_blocking_state(vmf->flags);

	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	/*
	 * After the __add_wait_queue the uwq is visible to userland
	 * through poll/read().
	 */
	__add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
	/*
	 * The smp_mb() after __set_current_state prevents the reads
	 * following the spin_unlock from happening before the list_add
	 * in __add_wait_queue.
	 */
	set_current_state(blocking_state);
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	if (!is_vm_hugetlb_page(vmf->vma))
		must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
						  reason);
	else
		must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
						       vmf->address,
						       vmf->flags, reason);
	mmap_read_unlock(mm);

	if (likely(must_wait && !READ_ONCE(ctx->released))) {
		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
		schedule();
	}

	__set_current_state(TASK_RUNNING);

	/*
	 * Here we race with the list_del; list_add in
	 * userfaultfd_ctx_read(). However, because we never run
	 * list_del_init() to refile across the two lists, the prev
	 * and next pointers will never point to self, and list_add
	 * never lets either of the two pointers point to self. So
	 * list_empty_careful won't risk seeing both pointers pointing
	 * to self at any time during the list refile. The only case
	 * where list_del_init() is called is the full removal in the
	 * wake function, and there we don't re-list_add, so it's fine
	 * not to block on the spinlock. The uwq on this kernel stack
	 * can be released after the list_del_init.
	 */
	if (!list_empty_careful(&uwq.wq.entry)) {
		spin_lock_irq(&ctx->fault_pending_wqh.lock);
		/*
		 * No need of list_del_init(), the uwq on the stack
		 * will be freed shortly anyway.
		 */
		list_del(&uwq.wq.entry);
		spin_unlock_irq(&ctx->fault_pending_wqh.lock);
	}

	/*
	 * ctx may go away after this if the userfault pseudo fd is
	 * already released.
	 */
	userfaultfd_ctx_put(ctx);

out:
	return ret;
}
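
/*
 * Overview of the blocking protocol in handle_userfault() above: the
 * faulting task (1) queues its uwq on fault_pending_wqh and sets its
 * task state under the wqh lock, making the fault visible to
 * poll/read before it can sleep, (2) only then re-walks the page
 * tables via userfaultfd_must_wait(), so a UFFDIO_COPY racing in
 * between either resolves the fault before the re-check or finds the
 * uwq already queued and wakes it, and (3) drops the mmap_lock before
 * schedule(), which is why the VM_FAULT_RETRY invariants above are
 * validated so carefully.
 */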

static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
					      struct userfaultfd_wait_queue *ewq)
{
	struct userfaultfd_ctx *release_new_ctx;

	if (WARN_ON_ONCE(current->flags & PF_EXITING))
		goto out;

	ewq->ctx = ctx;
	init_waitqueue_entry(&ewq->wq, current);
	release_new_ctx = NULL;

	spin_lock_irq(&ctx->event_wqh.lock);
	/*
	 * After the __add_wait_queue the uwq is visible to userland
	 * through poll/read().
	 */
	__add_wait_queue(&ctx->event_wqh, &ewq->wq);
	for (;;) {
		set_current_state(TASK_KILLABLE);
		if (ewq->msg.event == 0)
			break;
		if (READ_ONCE(ctx->released) ||
		    fatal_signal_pending(current)) {
			/*
			 * &ewq->wq may be queued in fork_event, but
			 * __remove_wait_queue ignores the head
			 * parameter. It would be a problem if it
			 * didn't.
			 */
			__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
			if (ewq->msg.event == UFFD_EVENT_FORK) {
				struct userfaultfd_ctx *new;

				new = (struct userfaultfd_ctx *)
					(unsigned long)
					ewq->msg.arg.reserved.reserved1;
				release_new_ctx = new;
			}
			break;
		}

		spin_unlock_irq(&ctx->event_wqh.lock);

		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
		schedule();

		spin_lock_irq(&ctx->event_wqh.lock);
	}
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&ctx->event_wqh.lock);

	if (release_new_ctx) {
		struct vm_area_struct *vma;
		struct mm_struct *mm = release_new_ctx->mm;

0cbb4b4f4c44f (Andrea Arcangeli    2018-01-04 16:18:09 -0800  609) 		/* the various vma->vm_userfaultfd_ctx still point to it */
d8ed45c5dcd45 (Michel Lespinasse   2020-06-08 21:33:25 -0700  610) 		mmap_write_lock(mm);
0cbb4b4f4c44f (Andrea Arcangeli    2018-01-04 16:18:09 -0800  611) 		for (vma = mm->mmap; vma; vma = vma->vm_next)
31e810aa1033a (Mike Rapoport       2018-08-02 15:36:09 -0700  612) 			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
0cbb4b4f4c44f (Andrea Arcangeli    2018-01-04 16:18:09 -0800  613) 				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700  614) 				vma->vm_flags &= ~__VM_UFFD_FLAGS;
31e810aa1033a (Mike Rapoport       2018-08-02 15:36:09 -0700  615) 			}
d8ed45c5dcd45 (Michel Lespinasse   2020-06-08 21:33:25 -0700  616) 		mmap_write_unlock(mm);
0cbb4b4f4c44f (Andrea Arcangeli    2018-01-04 16:18:09 -0800  617) 
0cbb4b4f4c44f (Andrea Arcangeli    2018-01-04 16:18:09 -0800  618) 		userfaultfd_ctx_put(release_new_ctx);
0cbb4b4f4c44f (Andrea Arcangeli    2018-01-04 16:18:09 -0800  619) 	}
0cbb4b4f4c44f (Andrea Arcangeli    2018-01-04 16:18:09 -0800  620) 
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  621) 	/*
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  622) 	 * ctx may go away after this if the userfault pseudo fd is
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  623) 	 * already released.
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  624) 	 */
9a69a829f9b65 (Andrea Arcangeli    2017-03-09 16:16:52 -0800  625) out:
df2cc96e77011 (Mike Rapoport       2018-06-07 17:09:25 -0700  626) 	WRITE_ONCE(ctx->mmap_changing, false);
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  627) 	userfaultfd_ctx_put(ctx);
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  628) }
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  629) 
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  630) static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  631) 				       struct userfaultfd_wait_queue *ewq)
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  632) {
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  633) 	ewq->msg.event = 0;
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  634) 	wake_up_locked(&ctx->event_wqh);
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  635) 	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  636) }
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  637) 
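The two functions above form a blocking handshake: the thread that raised a non-page-fault event sleeps in userfaultfd_event_wait_completion() until a reader consumes the message and userfaultfd_event_complete() clears ewq->msg.event. A userspace monitor therefore has to keep draining the fd, or forking/remapping threads in the monitored process stall. A minimal drain loop, as a sketch only (it assumes "uffd" was created with O_NONBLOCK and that event features were negotiated via UFFDIO_API):

#include <linux/userfaultfd.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

static void drain_uffd_messages(int uffd)
{
	struct pollfd pfd = { .fd = uffd, .events = POLLIN };
	struct uffd_msg msg;

	while (poll(&pfd, 1, -1) > 0 && !(pfd.revents & POLLHUP)) {
		/* Each read() of one struct uffd_msg consumes one event;
		 * for non-fault events this is what releases the waiter
		 * blocked in userfaultfd_event_wait_completion(). */
		if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
			continue; /* e.g. EAGAIN on a nonblocking fd */
		fprintf(stderr, "uffd event %u\n", msg.event);
	}
}
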
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  638) int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  639) {
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  640) 	struct userfaultfd_ctx *ctx = NULL, *octx;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  641) 	struct userfaultfd_fork_ctx *fctx;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  642) 
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  643) 	octx = vma->vm_userfaultfd_ctx.ctx;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  644) 	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  645) 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700  646) 		vma->vm_flags &= ~__VM_UFFD_FLAGS;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  647) 		return 0;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  648) 	}
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  649) 
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  650) 	list_for_each_entry(fctx, fcs, list)
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  651) 		if (fctx->orig == octx) {
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  652) 			ctx = fctx->new;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  653) 			break;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  654) 		}
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  655) 
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  656) 	if (!ctx) {
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  657) 		fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  658) 		if (!fctx)
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  659) 			return -ENOMEM;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  660) 
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  661) 		ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  662) 		if (!ctx) {
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  663) 			kfree(fctx);
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  664) 			return -ENOMEM;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  665) 		}
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  666) 
ca880420665db (Eric Biggers        2018-12-28 00:34:43 -0800  667) 		refcount_set(&ctx->refcount, 1);
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  668) 		ctx->flags = octx->flags;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  669) 		ctx->state = UFFD_STATE_RUNNING;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  670) 		ctx->features = octx->features;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  671) 		ctx->released = false;
df2cc96e77011 (Mike Rapoport       2018-06-07 17:09:25 -0700  672) 		ctx->mmap_changing = false;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  673) 		ctx->mm = vma->vm_mm;
00bb31fa44acf (Mike Rapoport       2017-11-15 17:36:56 -0800  674) 		mmgrab(ctx->mm);
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  675) 
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  676) 		userfaultfd_ctx_get(octx);
df2cc96e77011 (Mike Rapoport       2018-06-07 17:09:25 -0700  677) 		WRITE_ONCE(octx->mmap_changing, true);
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  678) 		fctx->orig = octx;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  679) 		fctx->new = ctx;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  680) 		list_add_tail(&fctx->list, fcs);
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  681) 	}
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  682) 
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  683) 	vma->vm_userfaultfd_ctx.ctx = ctx;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  684) 	return 0;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  685) }
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  686) 
8c9e7bb7a41f2 (Andrea Arcangeli    2017-03-09 16:16:54 -0800  687) static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  688) {
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  689) 	struct userfaultfd_ctx *ctx = fctx->orig;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  690) 	struct userfaultfd_wait_queue ewq;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  691) 
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  692) 	msg_init(&ewq.msg);
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  693) 
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  694) 	ewq.msg.event = UFFD_EVENT_FORK;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  695) 	ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  696) 
8c9e7bb7a41f2 (Andrea Arcangeli    2017-03-09 16:16:54 -0800  697) 	userfaultfd_event_wait_completion(ctx, &ewq);
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  698) }
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  699) 
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  700) void dup_userfaultfd_complete(struct list_head *fcs)
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  701) {
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  702) 	struct userfaultfd_fork_ctx *fctx, *n;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  703) 
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  704) 	list_for_each_entry_safe(fctx, n, fcs, list) {
8c9e7bb7a41f2 (Andrea Arcangeli    2017-03-09 16:16:54 -0800  705) 		dup_fctx(fctx);
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  706) 		list_del(&fctx->list);
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  707) 		kfree(fctx);
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  708) 	}
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  709) }
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  710) 
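On the userspace side, the UFFD_EVENT_FORK message produced by the machinery above carries a brand-new file descriptor for the child's context in msg.arg.fork.ufd (installed by resolve_userfault_fork() further down). A hedged sketch of the monitor's handler; watch_new_uffd() is a hypothetical helper, not part of the API:

#include <linux/userfaultfd.h>

/* hypothetical helper: begin serving the child's userfaultfd */
void watch_new_uffd(int child_uffd);

static void handle_fork_event(const struct uffd_msg *msg)
{
	if (msg->event != UFFD_EVENT_FORK)
		return;
	/* msg->arg.fork.ufd is already a usable fd in the reader's
	 * file table; the child inherits the registered ranges. */
	watch_new_uffd((int)msg->arg.fork.ufd);
}
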
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  711) void mremap_userfaultfd_prep(struct vm_area_struct *vma,
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  712) 			     struct vm_userfaultfd_ctx *vm_ctx)
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  713) {
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  714) 	struct userfaultfd_ctx *ctx;
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  715) 
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  716) 	ctx = vma->vm_userfaultfd_ctx.ctx;
3cfd22be0ad66 (Peter Xu            2018-12-28 00:38:47 -0800  717) 
3cfd22be0ad66 (Peter Xu            2018-12-28 00:38:47 -0800  718) 	if (!ctx)
3cfd22be0ad66 (Peter Xu            2018-12-28 00:38:47 -0800  719) 		return;
3cfd22be0ad66 (Peter Xu            2018-12-28 00:38:47 -0800  720) 
3cfd22be0ad66 (Peter Xu            2018-12-28 00:38:47 -0800  721) 	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  722) 		vm_ctx->ctx = ctx;
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  723) 		userfaultfd_ctx_get(ctx);
df2cc96e77011 (Mike Rapoport       2018-06-07 17:09:25 -0700  724) 		WRITE_ONCE(ctx->mmap_changing, true);
3cfd22be0ad66 (Peter Xu            2018-12-28 00:38:47 -0800  725) 	} else {
3cfd22be0ad66 (Peter Xu            2018-12-28 00:38:47 -0800  726) 		/* Drop uffd context if remap feature not enabled */
3cfd22be0ad66 (Peter Xu            2018-12-28 00:38:47 -0800  727) 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700  728) 		vma->vm_flags &= ~__VM_UFFD_FLAGS;
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  729) 	}
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  730) }
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  731) 
90794bf19dc19 (Andrea Arcangeli    2017-02-22 15:42:37 -0800  732) void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  733) 				 unsigned long from, unsigned long to,
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  734) 				 unsigned long len)
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  735) {
90794bf19dc19 (Andrea Arcangeli    2017-02-22 15:42:37 -0800  736) 	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  737) 	struct userfaultfd_wait_queue ewq;
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  738) 
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  739) 	if (!ctx)
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  740) 		return;
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  741) 
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  742) 	if (to & ~PAGE_MASK) {
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  743) 		userfaultfd_ctx_put(ctx);
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  744) 		return;
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  745) 	}
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  746) 
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  747) 	msg_init(&ewq.msg);
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  748) 
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  749) 	ewq.msg.event = UFFD_EVENT_REMAP;
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  750) 	ewq.msg.arg.remap.from = from;
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  751) 	ewq.msg.arg.remap.to = to;
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  752) 	ewq.msg.arg.remap.len = len;
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  753) 
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  754) 	userfaultfd_event_wait_completion(ctx, &ewq);
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  755) }
72f87654c6969 (Pavel Emelyanov     2017-02-22 15:42:34 -0800  756) 
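For completeness, the consumer side: a monitor that tracks pages by address has to rebase its bookkeeping when it sees the UFFD_EVENT_REMAP message filled in above. A minimal sketch; struct region is a hypothetical bookkeeping type, not part of the uAPI:

#include <linux/userfaultfd.h>
#include <stdint.h>

struct region {			/* hypothetical monitor bookkeeping */
	uint64_t start;
	uint64_t len;
};

static void handle_remap_event(struct region *r, const struct uffd_msg *msg)
{
	if (msg->event != UFFD_EVENT_REMAP)
		return;
	/* The kernel reports the old placement, the new placement and
	 * the length, exactly as filled into ewq.msg.arg.remap above. */
	if (r->start == msg->arg.remap.from && r->len == msg->arg.remap.len)
		r->start = msg->arg.remap.to;
}
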
70ccb92fdd90b (Andrea Arcangeli    2017-03-09 16:17:11 -0800  757) bool userfaultfd_remove(struct vm_area_struct *vma,
d811914d87576 (Mike Rapoport       2017-02-24 14:56:02 -0800  758) 			unsigned long start, unsigned long end)
05ce77249d506 (Pavel Emelyanov     2017-02-22 15:42:40 -0800  759) {
05ce77249d506 (Pavel Emelyanov     2017-02-22 15:42:40 -0800  760) 	struct mm_struct *mm = vma->vm_mm;
05ce77249d506 (Pavel Emelyanov     2017-02-22 15:42:40 -0800  761) 	struct userfaultfd_ctx *ctx;
05ce77249d506 (Pavel Emelyanov     2017-02-22 15:42:40 -0800  762) 	struct userfaultfd_wait_queue ewq;
05ce77249d506 (Pavel Emelyanov     2017-02-22 15:42:40 -0800  763) 
05ce77249d506 (Pavel Emelyanov     2017-02-22 15:42:40 -0800  764) 	ctx = vma->vm_userfaultfd_ctx.ctx;
d811914d87576 (Mike Rapoport       2017-02-24 14:56:02 -0800  765) 	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
70ccb92fdd90b (Andrea Arcangeli    2017-03-09 16:17:11 -0800  766) 		return true;
05ce77249d506 (Pavel Emelyanov     2017-02-22 15:42:40 -0800  767) 
05ce77249d506 (Pavel Emelyanov     2017-02-22 15:42:40 -0800  768) 	userfaultfd_ctx_get(ctx);
df2cc96e77011 (Mike Rapoport       2018-06-07 17:09:25 -0700  769) 	WRITE_ONCE(ctx->mmap_changing, true);
d8ed45c5dcd45 (Michel Lespinasse   2020-06-08 21:33:25 -0700  770) 	mmap_read_unlock(mm);
05ce77249d506 (Pavel Emelyanov     2017-02-22 15:42:40 -0800  771) 
05ce77249d506 (Pavel Emelyanov     2017-02-22 15:42:40 -0800  772) 	msg_init(&ewq.msg);
05ce77249d506 (Pavel Emelyanov     2017-02-22 15:42:40 -0800  773) 
d811914d87576 (Mike Rapoport       2017-02-24 14:56:02 -0800  774) 	ewq.msg.event = UFFD_EVENT_REMOVE;
d811914d87576 (Mike Rapoport       2017-02-24 14:56:02 -0800  775) 	ewq.msg.arg.remove.start = start;
d811914d87576 (Mike Rapoport       2017-02-24 14:56:02 -0800  776) 	ewq.msg.arg.remove.end = end;
05ce77249d506 (Pavel Emelyanov     2017-02-22 15:42:40 -0800  777) 
05ce77249d506 (Pavel Emelyanov     2017-02-22 15:42:40 -0800  778) 	userfaultfd_event_wait_completion(ctx, &ewq);
05ce77249d506 (Pavel Emelyanov     2017-02-22 15:42:40 -0800  779) 
70ccb92fdd90b (Andrea Arcangeli    2017-03-09 16:17:11 -0800  780) 	return false;
05ce77249d506 (Pavel Emelyanov     2017-02-22 15:42:40 -0800  781) }
05ce77249d506 (Pavel Emelyanov     2017-02-22 15:42:40 -0800  782) 
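userfaultfd_remove() fires for operations such as madvise(MADV_DONTNEED) or MADV_REMOVE on a registered range: the pages are dropped but the mapping stays. A sketch of the matching userspace reaction, assuming the monitor keeps shadow copies of resolved pages; forget_copies() is a hypothetical helper:

#include <linux/userfaultfd.h>

/* hypothetical helper: discard shadow copies for [start, end) */
void forget_copies(unsigned long long start, unsigned long long end);

static void handle_remove_event(const struct uffd_msg *msg)
{
	if (msg->event != UFFD_EVENT_REMOVE)
		return;
	/* The range must be treated as missing again: the next access
	 * will fault and be reported like a fresh page fault. */
	forget_copies(msg->arg.remove.start, msg->arg.remove.end);
}
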
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  783) static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  784) 			  unsigned long start, unsigned long end)
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  785) {
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  786) 	struct userfaultfd_unmap_ctx *unmap_ctx;
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  787) 
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  788) 	list_for_each_entry(unmap_ctx, unmaps, list)
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  789) 		if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  790) 		    unmap_ctx->end == end)
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  791) 			return true;
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  792) 
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  793) 	return false;
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  794) }
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  795) 
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  796) int userfaultfd_unmap_prep(struct vm_area_struct *vma,
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  797) 			   unsigned long start, unsigned long end,
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  798) 			   struct list_head *unmaps)
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  799) {
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  800) 	for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  801) 		struct userfaultfd_unmap_ctx *unmap_ctx;
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  802) 		struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  803) 
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  804) 		if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  805) 		    has_unmap_ctx(ctx, unmaps, start, end))
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  806) 			continue;
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  807) 
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  808) 		unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  809) 		if (!unmap_ctx)
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  810) 			return -ENOMEM;
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  811) 
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  812) 		userfaultfd_ctx_get(ctx);
df2cc96e77011 (Mike Rapoport       2018-06-07 17:09:25 -0700  813) 		WRITE_ONCE(ctx->mmap_changing, true);
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  814) 		unmap_ctx->ctx = ctx;
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  815) 		unmap_ctx->start = start;
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  816) 		unmap_ctx->end = end;
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  817) 		list_add_tail(&unmap_ctx->list, unmaps);
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  818) 	}
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  819) 
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  820) 	return 0;
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  821) }
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  822) 
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  823) void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  824) {
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  825) 	struct userfaultfd_unmap_ctx *ctx, *n;
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  826) 	struct userfaultfd_wait_queue ewq;
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  827) 
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  828) 	list_for_each_entry_safe(ctx, n, uf, list) {
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  829) 		msg_init(&ewq.msg);
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  830) 
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  831) 		ewq.msg.event = UFFD_EVENT_UNMAP;
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  832) 		ewq.msg.arg.remove.start = ctx->start;
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  833) 		ewq.msg.arg.remove.end = ctx->end;
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  834) 
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  835) 		userfaultfd_event_wait_completion(ctx->ctx, &ewq);
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  836) 
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  837) 		list_del(&ctx->list);
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  838) 		kfree(ctx);
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  839) 	}
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  840) }
897ab3e0c49e2 (Mike Rapoport       2017-02-24 14:58:22 -0800  841) 
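Note that UFFD_EVENT_UNMAP reuses the msg.arg.remove layout, exactly as the code above fills it. Unlike REMOVE, after UNMAP the range is gone from the address space entirely, so the monitor should drop all state for it. A hedged sketch (drop_range() is a hypothetical helper):

#include <linux/userfaultfd.h>

/* hypothetical helper: stop tracking [start, end) altogether */
void drop_range(unsigned long long start, unsigned long long end);

static void handle_unmap_event(const struct uffd_msg *msg)
{
	if (msg->event == UFFD_EVENT_UNMAP)
		drop_range(msg->arg.remove.start, msg->arg.remove.end);
}
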
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  842) static int userfaultfd_release(struct inode *inode, struct file *file)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  843) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  844) 	struct userfaultfd_ctx *ctx = file->private_data;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  845) 	struct mm_struct *mm = ctx->mm;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  846) 	struct vm_area_struct *vma, *prev;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  847) 	/* len == 0 means wake all */
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  848) 	struct userfaultfd_wake_range range = { .len = 0, };
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  849) 	unsigned long new_flags;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  850) 
6aa7de059173a (Mark Rutland        2017-10-23 14:07:29 -0700  851) 	WRITE_ONCE(ctx->released, true);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  852) 
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700  853) 	if (!mmget_not_zero(mm))
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700  854) 		goto wakeup;
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700  855) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  856) 	/*
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  857) 	 * Flush page faults out of all CPUs. NOTE: all page faults
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  858) 	 * must be retried without returning VM_FAULT_SIGBUS if
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  859) 	 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
c1e8d7c6a7a68 (Michel Lespinasse   2020-06-08 21:33:54 -0700  860) 	 * changes while handle_userfault released the mmap_lock. So
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  861) 	 * it's critical that released is set to true (above), before
c1e8d7c6a7a68 (Michel Lespinasse   2020-06-08 21:33:54 -0700  862) 	 * taking the mmap_lock for writing.
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  863) 	 */
d8ed45c5dcd45 (Michel Lespinasse   2020-06-08 21:33:25 -0700  864) 	mmap_write_lock(mm);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  865) 	prev = NULL;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  866) 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  867) 		cond_resched();
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  868) 		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700  869) 		       !!(vma->vm_flags & __VM_UFFD_FLAGS));
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  870) 		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  871) 			prev = vma;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  872) 			continue;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  873) 		}
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700  874) 		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
4d45e75a9955a (Jann Horn           2020-10-15 20:13:00 -0700  875) 		prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
4d45e75a9955a (Jann Horn           2020-10-15 20:13:00 -0700  876) 				 new_flags, vma->anon_vma,
4d45e75a9955a (Jann Horn           2020-10-15 20:13:00 -0700  877) 				 vma->vm_file, vma->vm_pgoff,
4d45e75a9955a (Jann Horn           2020-10-15 20:13:00 -0700  878) 				 vma_policy(vma),
4d45e75a9955a (Jann Horn           2020-10-15 20:13:00 -0700  879) 				 NULL_VM_UFFD_CTX);
4d45e75a9955a (Jann Horn           2020-10-15 20:13:00 -0700  880) 		if (prev)
4d45e75a9955a (Jann Horn           2020-10-15 20:13:00 -0700  881) 			vma = prev;
4d45e75a9955a (Jann Horn           2020-10-15 20:13:00 -0700  882) 		else
4d45e75a9955a (Jann Horn           2020-10-15 20:13:00 -0700  883) 			prev = vma;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  884) 		vma->vm_flags = new_flags;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  885) 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  886) 	}
d8ed45c5dcd45 (Michel Lespinasse   2020-06-08 21:33:25 -0700  887) 	mmap_write_unlock(mm);
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700  888) 	mmput(mm);
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700  889) wakeup:
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  890) 	/*
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  891) 	 * Now that no new page faults can wait on this fault_*wqh, flush
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  892) 	 * the last page faults that may already have been waiting on
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  893) 	 * the fault_*wqh.
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  894) 	 */
cbcfa130a911c (Eric Biggers        2019-07-04 15:14:39 -0700  895) 	spin_lock_irq(&ctx->fault_pending_wqh.lock);
ac5be6b47e8bd (Andrea Arcangeli    2015-09-22 14:58:49 -0700  896) 	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
c430d1e848ff1 (Matthew Wilcox      2018-08-21 21:56:30 -0700  897) 	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
cbcfa130a911c (Eric Biggers        2019-07-04 15:14:39 -0700  898) 	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  899) 
5a18b64e3f021 (Mike Rapoport       2017-08-02 13:32:24 -0700  900) 	/* Flush pending events that may still wait on event_wqh */
5a18b64e3f021 (Mike Rapoport       2017-08-02 13:32:24 -0700  901) 	wake_up_all(&ctx->event_wqh);
5a18b64e3f021 (Mike Rapoport       2017-08-02 13:32:24 -0700  902) 
a9a08845e9acb (Linus Torvalds      2018-02-11 14:34:03 -0800  903) 	wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  904) 	userfaultfd_ctx_put(ctx);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  905) 	return 0;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  906) }
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  907) 
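From userspace, the visible effect of userfaultfd_release() is the EPOLLHUP wakeup at the end: a monitor blocked in poll() learns that the last reference to the uffd was closed and can shut down cleanly. A minimal sketch:

#include <poll.h>

/* Sketch: returns 1 when messages are pending, 0 on POLLHUP (the
 * uffd was released), -1 on poll() failure. */
static int uffd_wait(int uffd)
{
	struct pollfd pfd = { .fd = uffd, .events = POLLIN };

	if (poll(&pfd, 1, -1) < 0)
		return -1;
	if (pfd.revents & POLLHUP)
		return 0;
	return 1;
}
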
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  908) /* fault_pending_wqh.lock must be held by the caller */
6dcc27fd39437 (Pavel Emelyanov     2017-02-22 15:42:18 -0800  909) static inline struct userfaultfd_wait_queue *find_userfault_in(
6dcc27fd39437 (Pavel Emelyanov     2017-02-22 15:42:18 -0800  910) 		wait_queue_head_t *wqh)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  911) {
ac6424b981bce (Ingo Molnar         2017-06-20 12:06:13 +0200  912) 	wait_queue_entry_t *wq;
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  913) 	struct userfaultfd_wait_queue *uwq;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  914) 
456a737896b25 (Lance Roy           2018-10-04 23:45:44 -0700  915) 	lockdep_assert_held(&wqh->lock);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  916) 
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  917) 	uwq = NULL;
6dcc27fd39437 (Pavel Emelyanov     2017-02-22 15:42:18 -0800  918) 	if (!waitqueue_active(wqh))
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  919) 		goto out;
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  920) 	/* walk in reverse to provide FIFO behavior to read userfaults */
2055da97389a6 (Ingo Molnar         2017-06-20 12:06:46 +0200  921) 	wq = list_last_entry(&wqh->head, typeof(*wq), entry);
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  922) 	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  923) out:
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  924) 	return uwq;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  925) }
6dcc27fd39437 (Pavel Emelyanov     2017-02-22 15:42:18 -0800  926) 
6dcc27fd39437 (Pavel Emelyanov     2017-02-22 15:42:18 -0800  927) static inline struct userfaultfd_wait_queue *find_userfault(
6dcc27fd39437 (Pavel Emelyanov     2017-02-22 15:42:18 -0800  928) 		struct userfaultfd_ctx *ctx)
6dcc27fd39437 (Pavel Emelyanov     2017-02-22 15:42:18 -0800  929) {
6dcc27fd39437 (Pavel Emelyanov     2017-02-22 15:42:18 -0800  930) 	return find_userfault_in(&ctx->fault_pending_wqh);
6dcc27fd39437 (Pavel Emelyanov     2017-02-22 15:42:18 -0800  931) }
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  932) 
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  933) static inline struct userfaultfd_wait_queue *find_userfault_evt(
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  934) 		struct userfaultfd_ctx *ctx)
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  935) {
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  936) 	return find_userfault_in(&ctx->event_wqh);
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  937) }
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  938) 
076ccb76e1a6c (Al Viro             2017-07-03 01:02:18 -0400  939) static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  940) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  941) 	struct userfaultfd_ctx *ctx = file->private_data;
076ccb76e1a6c (Al Viro             2017-07-03 01:02:18 -0400  942) 	__poll_t ret;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  943) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  944) 	poll_wait(file, &ctx->fd_wqh, wait);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  945) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  946) 	switch (ctx->state) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  947) 	case UFFD_STATE_WAIT_API:
a9a08845e9acb (Linus Torvalds      2018-02-11 14:34:03 -0800  948) 		return EPOLLERR;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  949) 	case UFFD_STATE_RUNNING:
ba85c702e4b24 (Andrea Arcangeli    2015-09-04 15:46:41 -0700  950) 		/*
ba85c702e4b24 (Andrea Arcangeli    2015-09-04 15:46:41 -0700  951) 		 * poll() never guarantees that read won't block.
ba85c702e4b24 (Andrea Arcangeli    2015-09-04 15:46:41 -0700  952) 		 * userfaults can be woken before they're read().
ba85c702e4b24 (Andrea Arcangeli    2015-09-04 15:46:41 -0700  953) 		 */
ba85c702e4b24 (Andrea Arcangeli    2015-09-04 15:46:41 -0700  954) 		if (unlikely(!(file->f_flags & O_NONBLOCK)))
a9a08845e9acb (Linus Torvalds      2018-02-11 14:34:03 -0800  955) 			return EPOLLERR;
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  956) 		/*
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  957) 		 * Lockless access to see if there are pending faults.
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  958) 		 * __pollwait's last action is add_wait_queue(), but
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  959) 		 * the spin_unlock would allow the waitqueue_active to
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  960) 		 * pass above the actual list_add inside the
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  961) 		 * add_wait_queue critical section. So use a full
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  962) 		 * memory barrier to serialize the list_add write of
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  963) 		 * add_wait_queue() with the waitqueue_active read
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  964) 		 * below.
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  965) 		 */
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  966) 		ret = 0;
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  967) 		smp_mb();
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700  968) 		if (waitqueue_active(&ctx->fault_pending_wqh))
a9a08845e9acb (Linus Torvalds      2018-02-11 14:34:03 -0800  969) 			ret = EPOLLIN;
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  970) 		else if (waitqueue_active(&ctx->event_wqh))
a9a08845e9acb (Linus Torvalds      2018-02-11 14:34:03 -0800  971) 			ret = EPOLLIN;
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800  972) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  973) 		return ret;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  974) 	default:
8474901a33d8a (Andrea Arcangeli    2017-02-22 15:42:12 -0800  975) 		WARN_ON_ONCE(1);
a9a08845e9acb (Linus Torvalds      2018-02-11 14:34:03 -0800  976) 		return EPOLLERR;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  977) 	}
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  978) }
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  979) 
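Two userspace consequences follow from this switch: polling before the UFFDIO_API handshake yields EPOLLERR (UFFD_STATE_WAIT_API), and so does polling a blocking fd, since poll() could otherwise claim readiness for a read() that still blocks. A sketch of creating a uffd that is actually pollable; error handling kept minimal:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int make_pollable_uffd(void)
{
	struct uffdio_api api = { .api = UFFD_API, .features = 0 };
	int uffd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	if (uffd < 0)
		return -1;
	/* Negotiate the API first: until this succeeds the fd stays
	 * in UFFD_STATE_WAIT_API and poll() reports EPOLLERR. */
	if (ioctl(uffd, UFFDIO_API, &api) < 0) {
		close(uffd);
		return -1;
	}
	return uffd;
}
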
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  980) static const struct file_operations userfaultfd_fops;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  981) 
b537900f1598b (Daniel Colascione   2021-01-08 14:22:23 -0800  982) static int resolve_userfault_fork(struct userfaultfd_ctx *new,
b537900f1598b (Daniel Colascione   2021-01-08 14:22:23 -0800  983) 				  struct inode *inode,
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  984) 				  struct uffd_msg *msg)
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  985) {
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  986) 	int fd;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  987) 
b537900f1598b (Daniel Colascione   2021-01-08 14:22:23 -0800  988) 	fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
b537900f1598b (Daniel Colascione   2021-01-08 14:22:23 -0800  989) 			O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  990) 	if (fd < 0)
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  991) 		return fd;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  992) 
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  993) 	msg->arg.reserved.reserved1 = 0;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  994) 	msg->arg.fork.ufd = fd;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  995) 	return 0;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  996) }
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800  997) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700  998) static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
b537900f1598b (Daniel Colascione   2021-01-08 14:22:23 -0800  999) 				    struct uffd_msg *msg, struct inode *inode)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1000) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1001) 	ssize_t ret;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1002) 	DECLARE_WAITQUEUE(wait, current);
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1003) 	struct userfaultfd_wait_queue *uwq;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1004) 	/*
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1005) 	 * Handling a fork event requires sleeping operations, so
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1006) 	 * we drop the event_wqh lock, do these ops, then
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1007) 	 * take it back and wake up the waiter. While the lock is
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1008) 	 * dropped the ewq may go away, so we keep track of it
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1009) 	 * carefully.
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1010) 	 */
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1011) 	LIST_HEAD(fork_event);
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1012) 	struct userfaultfd_ctx *fork_nctx = NULL;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1013) 
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1014) 	/* always take the fd_wqh lock before the fault_pending_wqh lock */
ae62c16e105a8 (Christoph Hellwig   2018-10-26 15:02:19 -0700 1015) 	spin_lock_irq(&ctx->fd_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1016) 	__add_wait_queue(&ctx->fd_wqh, &wait);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1017) 	for (;;) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1018) 		set_current_state(TASK_INTERRUPTIBLE);
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1019) 		spin_lock(&ctx->fault_pending_wqh.lock);
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1020) 		uwq = find_userfault(ctx);
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1021) 		if (uwq) {
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1022) 			/*
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1023) 			 * Use a seqcount to repeat the lockless check
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1024) 			 * in wake_userfault() to avoid missing
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1025) 			 * wakeups because during the refile both
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1026) 			 * waitqueue could become empty if this is the
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1027) 			 * only userfault.
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1028) 			 */
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1029) 			write_seqcount_begin(&ctx->refile_seq);
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1030) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1031) 			/*
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1032) 			 * The fault_pending_wqh.lock prevents the uwq
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1033) 			 * from disappearing from under us.
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1034) 			 *
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1035) 			 * Refile this userfault from
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1036) 			 * fault_pending_wqh to fault_wqh, it's not
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1037) 			 * pending anymore after we read it.
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1038) 			 *
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1039) 			 * Use list_del() by hand (as
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1040) 			 * userfaultfd_wake_function also uses
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1041) 			 * list_del_init() by hand) to be sure nobody
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1042) 			 * changes __remove_wait_queue() to use
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1043) 			 * list_del_init() in turn breaking the
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1044) 			 * !list_empty_careful() check in
2055da97389a6 (Ingo Molnar         2017-06-20 12:06:46 +0200 1045) 			 * handle_userfault(). The uwq->wq.head list
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1046) 			 * must never be empty at any time during the
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1047) 			 * refile, or the waitqueue could disappear
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1048) 			 * from under us. The "wait_queue_head_t"
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1049) 			 * parameter of __remove_wait_queue() is unused
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1050) 			 * anyway.
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1051) 			 */
2055da97389a6 (Ingo Molnar         2017-06-20 12:06:46 +0200 1052) 			list_del(&uwq->wq.entry);
c430d1e848ff1 (Matthew Wilcox      2018-08-21 21:56:30 -0700 1053) 			add_wait_queue(&ctx->fault_wqh, &uwq->wq);
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1054) 
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1055) 			write_seqcount_end(&ctx->refile_seq);
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1056) 
a9b85f9415fd9 (Andrea Arcangeli    2015-09-04 15:46:37 -0700 1057) 			/* careful to always initialize msg if ret == 0 */
a9b85f9415fd9 (Andrea Arcangeli    2015-09-04 15:46:37 -0700 1058) 			*msg = uwq->msg;
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1059) 			spin_unlock(&ctx->fault_pending_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1060) 			ret = 0;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1061) 			break;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1062) 		}
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1063) 		spin_unlock(&ctx->fault_pending_wqh.lock);
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1064) 
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1065) 		spin_lock(&ctx->event_wqh.lock);
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1066) 		uwq = find_userfault_evt(ctx);
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1067) 		if (uwq) {
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1068) 			*msg = uwq->msg;
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1069) 
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1070) 			if (uwq->msg.event == UFFD_EVENT_FORK) {
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1071) 				fork_nctx = (struct userfaultfd_ctx *)
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1072) 					(unsigned long)
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1073) 					uwq->msg.arg.reserved.reserved1;
2055da97389a6 (Ingo Molnar         2017-06-20 12:06:46 +0200 1074) 				list_move(&uwq->wq.entry, &fork_event);
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1075) 				/*
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1076) 				 * fork_nctx can be freed as soon as
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1077) 				 * we drop the lock, unless we take a
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1078) 				 * reference on it.
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1079) 				 */
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1080) 				userfaultfd_ctx_get(fork_nctx);
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1081) 				spin_unlock(&ctx->event_wqh.lock);
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1082) 				ret = 0;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1083) 				break;
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1084) 			}
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1085) 
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1086) 			userfaultfd_event_complete(ctx, uwq);
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1087) 			spin_unlock(&ctx->event_wqh.lock);
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1088) 			ret = 0;
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1089) 			break;
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1090) 		}
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1091) 		spin_unlock(&ctx->event_wqh.lock);
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1092) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1093) 		if (signal_pending(current)) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1094) 			ret = -ERESTARTSYS;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1095) 			break;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1096) 		}
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1097) 		if (no_wait) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1098) 			ret = -EAGAIN;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1099) 			break;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1100) 		}
ae62c16e105a8 (Christoph Hellwig   2018-10-26 15:02:19 -0700 1101) 		spin_unlock_irq(&ctx->fd_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1102) 		schedule();
ae62c16e105a8 (Christoph Hellwig   2018-10-26 15:02:19 -0700 1103) 		spin_lock_irq(&ctx->fd_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1104) 	}
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1105) 	__remove_wait_queue(&ctx->fd_wqh, &wait);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1106) 	__set_current_state(TASK_RUNNING);
ae62c16e105a8 (Christoph Hellwig   2018-10-26 15:02:19 -0700 1107) 	spin_unlock_irq(&ctx->fd_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1108) 
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1109) 	if (!ret && msg->event == UFFD_EVENT_FORK) {
b537900f1598b (Daniel Colascione   2021-01-08 14:22:23 -0800 1110) 		ret = resolve_userfault_fork(fork_nctx, inode, msg);
cbcfa130a911c (Eric Biggers        2019-07-04 15:14:39 -0700 1111) 		spin_lock_irq(&ctx->event_wqh.lock);
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1112) 		if (!list_empty(&fork_event)) {
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1113) 			/*
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1114) 			 * The fork thread didn't abort, so we can
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1115) 			 * drop the temporary refcount.
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1116) 			 */
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1117) 			userfaultfd_ctx_put(fork_nctx);
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1118) 
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1119) 			uwq = list_first_entry(&fork_event,
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1120) 					       typeof(*uwq),
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1121) 					       wq.entry);
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1122) 			/*
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1123) 			 * If the fork_event list wasn't empty and in
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1124) 			 * turn the event wasn't already released by fork
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1125) 			 * (the event is allocated on the fork kernel
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1126) 			 * stack), put the event back in its place in
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1127) 			 * the event_wqh. The fork_event head will be
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1128) 			 * freed as soon as we return, so the event
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1129) 			 * cannot stay queued there no matter the
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1130) 			 * current "ret" value.
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1131) 			 */
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1132) 			list_del(&uwq->wq.entry);
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1133) 			__add_wait_queue(&ctx->event_wqh, &uwq->wq);
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1134) 
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1135) 			/*
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1136) 			 * Leave the event in the waitqueue and report an
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1137) 			 * error to userland if we failed to resolve
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1138) 			 * the userfault fork.
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1139) 			 */
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1140) 			if (likely(!ret))
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1141) 				userfaultfd_event_complete(ctx, uwq);
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1142) 		} else {
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1143) 			/*
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1144) 			 * Here the fork thread aborted and the
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1145) 			 * refcount from the fork thread on fork_nctx
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1146) 			 * has already been released. We still hold
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1147) 			 * the reference we took before releasing the
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1148) 			 * lock above. If resolve_userfault_fork()
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1149) 			 * failed we have to drop it because the
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1150) 			 * fork_nctx has to be freed in that case. If
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1151) 			 * it succeeded we keep holding it because the new
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1152) 			 * uffd references it.
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1153) 			 */
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1154) 			if (ret)
384632e67e082 (Andrea Arcangeli    2017-10-03 16:15:38 -0700 1155) 				userfaultfd_ctx_put(fork_nctx);
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1156) 		}
cbcfa130a911c (Eric Biggers        2019-07-04 15:14:39 -0700 1157) 		spin_unlock_irq(&ctx->event_wqh.lock);
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1158) 	}
893e26e61d04e (Pavel Emelyanov     2017-02-22 15:42:27 -0800 1159) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1160) 	return ret;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1161) }
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1162) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1163) static ssize_t userfaultfd_read(struct file *file, char __user *buf,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1164) 				size_t count, loff_t *ppos)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1165) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1166) 	struct userfaultfd_ctx *ctx = file->private_data;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1167) 	ssize_t _ret, ret = 0;
a9b85f9415fd9 (Andrea Arcangeli    2015-09-04 15:46:37 -0700 1168) 	struct uffd_msg msg;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1169) 	int no_wait = file->f_flags & O_NONBLOCK;
b537900f1598b (Daniel Colascione   2021-01-08 14:22:23 -0800 1170) 	struct inode *inode = file_inode(file);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1171) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1172) 	if (ctx->state == UFFD_STATE_WAIT_API)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1173) 		return -EINVAL;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1174) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1175) 	for (;;) {
a9b85f9415fd9 (Andrea Arcangeli    2015-09-04 15:46:37 -0700 1176) 		if (count < sizeof(msg))
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1177) 			return ret ? ret : -EINVAL;
b537900f1598b (Daniel Colascione   2021-01-08 14:22:23 -0800 1178) 		_ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1179) 		if (_ret < 0)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1180) 			return ret ? ret : _ret;
a9b85f9415fd9 (Andrea Arcangeli    2015-09-04 15:46:37 -0700 1181) 		if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1182) 			return ret ? ret : -EFAULT;
a9b85f9415fd9 (Andrea Arcangeli    2015-09-04 15:46:37 -0700 1183) 		ret += sizeof(msg);
a9b85f9415fd9 (Andrea Arcangeli    2015-09-04 15:46:37 -0700 1184) 		buf += sizeof(msg);
a9b85f9415fd9 (Andrea Arcangeli    2015-09-04 15:46:37 -0700 1185) 		count -= sizeof(msg);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1186) 		/*
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1187) 		 * Allow reading more than one fault at a time but only
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1188) 		 * block while waiting for the very first one.
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1189) 		 */
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1190) 		no_wait = O_NONBLOCK;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1191) 	}
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1192) }
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1193) 
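Because userfaultfd_read() above loops until the buffer is exhausted, a single read() can return several messages, and only the first may block (and only on a blocking fd). A userspace sketch that drains a burst of faults in one syscall; sizing and error handling are minimal by design:

#include <linux/userfaultfd.h>
#include <unistd.h>

/* Sketch: returns the number of whole messages read, or -1 on error
 * (e.g. EAGAIN when the fd is nonblocking and the queue is empty). */
static int read_uffd_batch(int uffd, struct uffd_msg *msgs, int max)
{
	ssize_t n = read(uffd, msgs, max * sizeof(*msgs));

	if (n < 0)
		return -1;
	return n / (ssize_t)sizeof(*msgs);	/* whole messages only */
}
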
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1194) static void __wake_userfault(struct userfaultfd_ctx *ctx,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1195) 			     struct userfaultfd_wake_range *range)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1196) {
cbcfa130a911c (Eric Biggers        2019-07-04 15:14:39 -0700 1197) 	spin_lock_irq(&ctx->fault_pending_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1198) 	/* wake all in the range and autoremove */
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1199) 	if (waitqueue_active(&ctx->fault_pending_wqh))
ac5be6b47e8bd (Andrea Arcangeli    2015-09-22 14:58:49 -0700 1200) 		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1201) 				     range);
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 1202) 	if (waitqueue_active(&ctx->fault_wqh))
c430d1e848ff1 (Matthew Wilcox      2018-08-21 21:56:30 -0700 1203) 		__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
cbcfa130a911c (Eric Biggers        2019-07-04 15:14:39 -0700 1204) 	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1205) }
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1206) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1207) static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1208) 					   struct userfaultfd_wake_range *range)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1209) {
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1210) 	unsigned seq;
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1211) 	bool need_wakeup;
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1212) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1213) 	/*
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1214) 	 * To be sure waitqueue_active() is not reordered by the CPU
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1215) 	 * before the pagetable update, use an explicit SMP memory
3e4e28c5a8f01 (Michel Lespinasse   2020-06-08 21:33:51 -0700 1216) 	 * barrier here. PT lock release or mmap_read_unlock(mm) only
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1217) 	 * have release semantics, which can still allow
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1218) 	 * waitqueue_active() to be reordered before the pte update.
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1219) 	 */
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1220) 	smp_mb();
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1221) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1222) 	/*
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1223) 	 * Use waitqueue_active() because it is very common to change
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1224) 	 * the address space atomically even when there are no
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1225) 	 * userfaults pending yet. So take the spinlock only when
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1226) 	 * we are sure there are userfaults to wake.
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1227) 	 */
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1228) 	do {
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1229) 		seq = read_seqcount_begin(&ctx->refile_seq);
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1230) 		need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1231) 			waitqueue_active(&ctx->fault_wqh);
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1232) 		cond_resched();
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1233) 	} while (read_seqcount_retry(&ctx->refile_seq, seq));
2c5b7e1be74ff (Andrea Arcangeli    2015-09-04 15:47:23 -0700 1234) 	if (need_wakeup)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1235) 		__wake_userfault(ctx, range);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1236) }
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1237) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1238) static __always_inline int validate_range(struct mm_struct *mm,
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1239) 					  __u64 start, __u64 len)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1240) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1241) 	__u64 task_size = mm->task_size;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1242) 
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1243) 	if (start & ~PAGE_MASK)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1244) 		return -EINVAL;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1245) 	if (len & ~PAGE_MASK)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1246) 		return -EINVAL;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1247) 	if (!len)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1248) 		return -EINVAL;
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1249) 	if (start < mmap_min_addr)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1250) 		return -EINVAL;
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1251) 	if (start >= task_size)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1252) 		return -EINVAL;
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1253) 	if (len > task_size - start)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1254) 		return -EINVAL;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1255) 	return 0;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1256) }
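/*
 * For illustration, with 4KiB pages validate_range() accepts
 * start = 0x7f0000001000, len = 0x2000, while an unaligned start
 * (e.g. 0x7f0000001234), an unaligned or zero len, a start below
 * mmap_min_addr, or a range extending beyond mm->task_size all
 * fail with -EINVAL.
 */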
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1257) 
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1258) static inline bool vma_can_userfault(struct vm_area_struct *vma,
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1259) 				     unsigned long vm_flags)
ba6907db6de17 (Mike Rapoport       2017-02-22 15:43:22 -0800 1260) {
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1261) 	/* FIXME: add WP support to hugetlbfs and shmem */
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1262) 	if (vm_flags & VM_UFFD_WP) {
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1263) 		if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma))
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1264) 			return false;
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1265) 	}
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1266) 
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1267) 	if (vm_flags & VM_UFFD_MINOR) {
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1268) 		/* FIXME: Add minor fault interception for shmem. */
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1269) 		if (!is_vm_hugetlb_page(vma))
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1270) 			return false;
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1271) 	}
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1272) 
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1273) 	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1274) 	       vma_is_shmem(vma);
ba6907db6de17 (Mike Rapoport       2017-02-22 15:43:22 -0800 1275) }
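/*
 * Summary of the checks above as of this tree: MISSING mode works on
 * anonymous, hugetlbfs and shmem vmas; WP mode only on anonymous vmas
 * (see the FIXME); MINOR mode only on hugetlbfs vmas. All other vma
 * types are rejected.
 */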
ba6907db6de17 (Mike Rapoport       2017-02-22 15:43:22 -0800 1276) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1277) static int userfaultfd_register(struct userfaultfd_ctx *ctx,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1278) 				unsigned long arg)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1279) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1280) 	struct mm_struct *mm = ctx->mm;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1281) 	struct vm_area_struct *vma, *prev, *cur;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1282) 	int ret;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1283) 	struct uffdio_register uffdio_register;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1284) 	struct uffdio_register __user *user_uffdio_register;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1285) 	unsigned long vm_flags, new_flags;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1286) 	bool found;
ce53e8e6f2cb0 (Mike Rapoport       2017-09-06 16:23:12 -0700 1287) 	bool basic_ioctls;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1288) 	unsigned long start, end, vma_end;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1289) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1290) 	user_uffdio_register = (struct uffdio_register __user *) arg;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1291) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1292) 	ret = -EFAULT;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1293) 	if (copy_from_user(&uffdio_register, user_uffdio_register,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1294) 			   sizeof(uffdio_register)-sizeof(__u64)))
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1295) 		goto out;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1296) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1297) 	ret = -EINVAL;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1298) 	if (!uffdio_register.mode)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1299) 		goto out;
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1300) 	if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1301) 		goto out;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1302) 	vm_flags = 0;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1303) 	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1304) 		vm_flags |= VM_UFFD_MISSING;
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1305) 	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1306) 		vm_flags |= VM_UFFD_WP;
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1307) 	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1308) #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1309) 		goto out;
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1310) #endif
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1311) 		vm_flags |= VM_UFFD_MINOR;
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1312) 	}
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1313) 
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1314) 	ret = validate_range(mm, uffdio_register.range.start,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1315) 			     uffdio_register.range.len);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1316) 	if (ret)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1317) 		goto out;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1318) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1319) 	start = uffdio_register.range.start;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1320) 	end = start + uffdio_register.range.len;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1321) 
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700 1322) 	ret = -ENOMEM;
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700 1323) 	if (!mmget_not_zero(mm))
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700 1324) 		goto out;
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700 1325) 
d8ed45c5dcd45 (Michel Lespinasse   2020-06-08 21:33:25 -0700 1326) 	mmap_write_lock(mm);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1327) 	vma = find_vma_prev(mm, start, &prev);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1328) 	if (!vma)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1329) 		goto out_unlock;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1330) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1331) 	/* check that there's at least one vma in the range */
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1332) 	ret = -EINVAL;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1333) 	if (vma->vm_start >= end)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1334) 		goto out_unlock;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1335) 
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1336) 	/*
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1337) 	 * If the first vma contains huge pages, make sure the start
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1338) 	 * address is aligned to the huge page size.
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1339) 	 */
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1340) 	if (is_vm_hugetlb_page(vma)) {
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1341) 		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1342) 
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1343) 		if (start & (vma_hpagesize - 1))
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1344) 			goto out_unlock;
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1345) 	}
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1346) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1347) 	/*
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1348) 	 * Search for incompatible vmas.
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1349) 	 */
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1350) 	found = false;
ce53e8e6f2cb0 (Mike Rapoport       2017-09-06 16:23:12 -0700 1351) 	basic_ioctls = false;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1352) 	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1353) 		cond_resched();
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1354) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1355) 		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1356) 		       !!(cur->vm_flags & __VM_UFFD_FLAGS));
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1357) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1358) 		/* check for incompatible vmas */
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1359) 		ret = -EINVAL;
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1360) 		if (!vma_can_userfault(cur, vm_flags))
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1361) 			goto out_unlock;
29ec90660d68b (Andrea Arcangeli    2018-11-30 14:09:32 -0800 1362) 
29ec90660d68b (Andrea Arcangeli    2018-11-30 14:09:32 -0800 1363) 		/*
29ec90660d68b (Andrea Arcangeli    2018-11-30 14:09:32 -0800 1364) 		 * UFFDIO_COPY will fill file holes even without
29ec90660d68b (Andrea Arcangeli    2018-11-30 14:09:32 -0800 1365) 		 * PROT_WRITE. This check enforces that if this is a
29ec90660d68b (Andrea Arcangeli    2018-11-30 14:09:32 -0800 1366) 		 * MAP_SHARED mapping, the process has write permission to the
29ec90660d68b (Andrea Arcangeli    2018-11-30 14:09:32 -0800 1367) 		 * backing file. If VM_MAYWRITE is set on a MAP_SHARED vma, it
29ec90660d68b (Andrea Arcangeli    2018-11-30 14:09:32 -0800 1368) 		 * also guarantees that no F_SEAL_WRITE seal is set and that
29ec90660d68b (Andrea Arcangeli    2018-11-30 14:09:32 -0800 1369) 		 * none can be taken until the vma is destroyed.
29ec90660d68b (Andrea Arcangeli    2018-11-30 14:09:32 -0800 1370) 		 */
29ec90660d68b (Andrea Arcangeli    2018-11-30 14:09:32 -0800 1371) 		ret = -EPERM;
29ec90660d68b (Andrea Arcangeli    2018-11-30 14:09:32 -0800 1372) 		if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
29ec90660d68b (Andrea Arcangeli    2018-11-30 14:09:32 -0800 1373) 			goto out_unlock;
29ec90660d68b (Andrea Arcangeli    2018-11-30 14:09:32 -0800 1374) 
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1375) 		/*
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1376) 		 * If this vma contains the ending address and is a hugetlb
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1377) 		 * vma, check that the end address is huge page aligned.
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1378) 		 */
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1379) 		if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1380) 		    end > cur->vm_start) {
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1381) 			unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1382) 
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1383) 			ret = -EINVAL;
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1384) 
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1385) 			if (end & (vma_hpagesize - 1))
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1386) 				goto out_unlock;
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1387) 		}
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1388) 		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1389) 			goto out_unlock;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1390) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1391) 		/*
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1392) 		 * Check that this vma isn't already owned by a
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1393) 		 * different userfaultfd. We can't allow more than one
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1394) 		 * userfaultfd to own a single vma simultaneously or we
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1395) 		 * wouldn't know which one to deliver the userfaults to.
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1396) 		 */
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1397) 		ret = -EBUSY;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1398) 		if (cur->vm_userfaultfd_ctx.ctx &&
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1399) 		    cur->vm_userfaultfd_ctx.ctx != ctx)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1400) 			goto out_unlock;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1401) 
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1402) 		/*
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1403) 		 * Note whether any vma in the range contains huge pages
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1404) 		 */
ce53e8e6f2cb0 (Mike Rapoport       2017-09-06 16:23:12 -0700 1405) 		if (is_vm_hugetlb_page(cur))
ce53e8e6f2cb0 (Mike Rapoport       2017-09-06 16:23:12 -0700 1406) 			basic_ioctls = true;
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1407) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1408) 		found = true;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1409) 	}
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1410) 	BUG_ON(!found);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1411) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1412) 	if (vma->vm_start < start)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1413) 		prev = vma;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1414) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1415) 	ret = 0;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1416) 	do {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1417) 		cond_resched();
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1418) 
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1419) 		BUG_ON(!vma_can_userfault(vma, vm_flags));
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1420) 		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1421) 		       vma->vm_userfaultfd_ctx.ctx != ctx);
29ec90660d68b (Andrea Arcangeli    2018-11-30 14:09:32 -0800 1422) 		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1423) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1424) 		/*
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1425) 		 * Nothing to do: this vma is already registered with this
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1426) 		 * userfaultfd and with the right tracking mode too.
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1427) 		 */
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1428) 		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1429) 		    (vma->vm_flags & vm_flags) == vm_flags)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1430) 			goto skip;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1431) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1432) 		if (vma->vm_start > start)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1433) 			start = vma->vm_start;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1434) 		vma_end = min(end, vma->vm_end);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1435) 
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1436) 		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1437) 		prev = vma_merge(mm, prev, start, vma_end, new_flags,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1438) 				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1439) 				 vma_policy(vma),
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1440) 				 ((struct vm_userfaultfd_ctx){ ctx }));
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1441) 		if (prev) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1442) 			vma = prev;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1443) 			goto next;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1444) 		}
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1445) 		if (vma->vm_start < start) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1446) 			ret = split_vma(mm, vma, start, 1);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1447) 			if (ret)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1448) 				break;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1449) 		}
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1450) 		if (vma->vm_end > end) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1451) 			ret = split_vma(mm, vma, end, 0);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1452) 			if (ret)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1453) 				break;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1454) 		}
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1455) 	next:
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1456) 		/*
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1457) 		 * In the vma_merge() successful mprotect-like case 8:
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1458) 		 * the next vma was merged into the current one and
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1459) 		 * the current one has not been updated yet.
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1460) 		 */
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1461) 		vma->vm_flags = new_flags;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1462) 		vma->vm_userfaultfd_ctx.ctx = ctx;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1463) 
6dfeaff93be1a (Peter Xu            2021-05-04 18:33:13 -0700 1464) 		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
6dfeaff93be1a (Peter Xu            2021-05-04 18:33:13 -0700 1465) 			hugetlb_unshare_all_pmds(vma);
6dfeaff93be1a (Peter Xu            2021-05-04 18:33:13 -0700 1466) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1467) 	skip:
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1468) 		prev = vma;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1469) 		start = vma->vm_end;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1470) 		vma = vma->vm_next;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1471) 	} while (vma && vma->vm_start < end);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1472) out_unlock:
d8ed45c5dcd45 (Michel Lespinasse   2020-06-08 21:33:25 -0700 1473) 	mmap_write_unlock(mm);
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700 1474) 	mmput(mm);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1475) 	if (!ret) {
14819305e09fe (Peter Xu            2020-04-06 20:06:29 -0700 1476) 		__u64 ioctls_out;
14819305e09fe (Peter Xu            2020-04-06 20:06:29 -0700 1477) 
14819305e09fe (Peter Xu            2020-04-06 20:06:29 -0700 1478) 		ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
14819305e09fe (Peter Xu            2020-04-06 20:06:29 -0700 1479) 		    UFFD_API_RANGE_IOCTLS;
14819305e09fe (Peter Xu            2020-04-06 20:06:29 -0700 1480) 
14819305e09fe (Peter Xu            2020-04-06 20:06:29 -0700 1481) 		/*
14819305e09fe (Peter Xu            2020-04-06 20:06:29 -0700 1482) 		 * Declare the WP ioctl only if the WP mode is
14819305e09fe (Peter Xu            2020-04-06 20:06:29 -0700 1483) 		 * specified and all checks passed for the range.
14819305e09fe (Peter Xu            2020-04-06 20:06:29 -0700 1484) 		 */
14819305e09fe (Peter Xu            2020-04-06 20:06:29 -0700 1485) 		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
14819305e09fe (Peter Xu            2020-04-06 20:06:29 -0700 1486) 			ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
14819305e09fe (Peter Xu            2020-04-06 20:06:29 -0700 1487) 
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1488) 		/* CONTINUE ioctl is only supported for MINOR ranges. */
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1489) 		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1490) 			ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1491) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1492) 		/*
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1493) 		 * Now that we scanned all vmas we can already tell
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1494) 		 * userland which ioctl methods are guaranteed to
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1495) 		 * succeed on this range.
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1496) 		 */
14819305e09fe (Peter Xu            2020-04-06 20:06:29 -0700 1497) 		if (put_user(ioctls_out, &user_uffdio_register->ioctls))
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1498) 			ret = -EFAULT;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1499) 	}
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1500) out:
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1501) 	return ret;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1502) }
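/*
 * Illustrative userspace sketch of driving this ioctl; "addr" and "len"
 * are assumed to describe a page-aligned, already mmap'ed region. The
 * UFFDIO_API handshake has to come first, while the context is still in
 * UFFD_STATE_WAIT_API, or registration fails:
 *
 *	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
 *	struct uffdio_api api = { .api = UFFD_API };
 *	struct uffdio_register reg = {
 *		.range = { .start = (unsigned long)addr, .len = len },
 *		.mode  = UFFDIO_REGISTER_MODE_MISSING,
 *	};
 *
 *	if (ioctl(uffd, UFFDIO_API, &api) == 0 &&
 *	    ioctl(uffd, UFFDIO_REGISTER, &reg) == 0)
 *		printf("range ioctls: %llx\n",
 *		       (unsigned long long)reg.ioctls);
 */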
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1503) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1504) static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1505) 				  unsigned long arg)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1506) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1507) 	struct mm_struct *mm = ctx->mm;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1508) 	struct vm_area_struct *vma, *prev, *cur;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1509) 	int ret;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1510) 	struct uffdio_range uffdio_unregister;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1511) 	unsigned long new_flags;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1512) 	bool found;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1513) 	unsigned long start, end, vma_end;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1514) 	const void __user *buf = (void __user *)arg;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1515) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1516) 	ret = -EFAULT;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1517) 	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1518) 		goto out;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1519) 
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1520) 	ret = validate_range(mm, uffdio_unregister.start,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1521) 			     uffdio_unregister.len);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1522) 	if (ret)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1523) 		goto out;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1524) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1525) 	start = uffdio_unregister.start;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1526) 	end = start + uffdio_unregister.len;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1527) 
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700 1528) 	ret = -ENOMEM;
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700 1529) 	if (!mmget_not_zero(mm))
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700 1530) 		goto out;
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700 1531) 
d8ed45c5dcd45 (Michel Lespinasse   2020-06-08 21:33:25 -0700 1532) 	mmap_write_lock(mm);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1533) 	vma = find_vma_prev(mm, start, &prev);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1534) 	if (!vma)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1535) 		goto out_unlock;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1536) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1537) 	/* check that there's at least one vma in the range */
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1538) 	ret = -EINVAL;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1539) 	if (vma->vm_start >= end)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1540) 		goto out_unlock;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1541) 
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1542) 	/*
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1543) 	 * If the first vma contains huge pages, make sure the start
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1544) 	 * address is aligned to the huge page size.
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1545) 	 */
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1546) 	if (is_vm_hugetlb_page(vma)) {
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1547) 		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1548) 
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1549) 		if (start & (vma_hpagesize - 1))
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1550) 			goto out_unlock;
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1551) 	}
cab350afcbc9c (Mike Kravetz        2017-02-22 15:43:04 -0800 1552) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1553) 	/*
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1554) 	 * Search for incompatible vmas.
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1555) 	 */
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1556) 	found = false;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1557) 	ret = -EINVAL;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1558) 	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1559) 		cond_resched();
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1560) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1561) 		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1562) 		       !!(cur->vm_flags & __VM_UFFD_FLAGS));
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1563) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1564) 		/*
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1565) 		 * Check for incompatible vmas. Not strictly required
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1566) 		 * here, as incompatible vmas cannot have a
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1567) 		 * userfaultfd_ctx registered on them, but this
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1568) 		 * provides stricter behavior so that unregistration
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1569) 		 * errors are noticed.
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1570) 		 */
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1571) 		if (!vma_can_userfault(cur, cur->vm_flags))
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1572) 			goto out_unlock;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1573) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1574) 		found = true;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1575) 	}
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1576) 	BUG_ON(!found);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1577) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1578) 	if (vma->vm_start < start)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1579) 		prev = vma;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1580) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1581) 	ret = 0;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1582) 	do {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1583) 		cond_resched();
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1584) 
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1585) 		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1586) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1587) 		/*
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1588) 		 * Nothing to do: this vma is not registered with any
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1589) 		 * userfaultfd, so there is nothing to unregister here.
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1590) 		 */
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1591) 		if (!vma->vm_userfaultfd_ctx.ctx)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1592) 			goto skip;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1593) 
01e881f5a1fca (Andrea Arcangeli    2018-12-14 14:17:17 -0800 1594) 		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
01e881f5a1fca (Andrea Arcangeli    2018-12-14 14:17:17 -0800 1595) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1596) 		if (vma->vm_start > start)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1597) 			start = vma->vm_start;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1598) 		vma_end = min(end, vma->vm_end);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1599) 
09fa5296a40d0 (Andrea Arcangeli    2017-02-22 15:42:46 -0800 1600) 		if (userfaultfd_missing(vma)) {
09fa5296a40d0 (Andrea Arcangeli    2017-02-22 15:42:46 -0800 1601) 			/*
09fa5296a40d0 (Andrea Arcangeli    2017-02-22 15:42:46 -0800 1602) 			 * Wake any concurrent pending userfaults while
09fa5296a40d0 (Andrea Arcangeli    2017-02-22 15:42:46 -0800 1603) 			 * we unregister, so they will not hang
09fa5296a40d0 (Andrea Arcangeli    2017-02-22 15:42:46 -0800 1604) 			 * permanently and userland is spared from having
09fa5296a40d0 (Andrea Arcangeli    2017-02-22 15:42:46 -0800 1605) 			 * to call UFFDIO_WAKE explicitly.
09fa5296a40d0 (Andrea Arcangeli    2017-02-22 15:42:46 -0800 1606) 			 */
09fa5296a40d0 (Andrea Arcangeli    2017-02-22 15:42:46 -0800 1607) 			struct userfaultfd_wake_range range;
09fa5296a40d0 (Andrea Arcangeli    2017-02-22 15:42:46 -0800 1608) 			range.start = start;
09fa5296a40d0 (Andrea Arcangeli    2017-02-22 15:42:46 -0800 1609) 			range.len = vma_end - start;
09fa5296a40d0 (Andrea Arcangeli    2017-02-22 15:42:46 -0800 1610) 			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
09fa5296a40d0 (Andrea Arcangeli    2017-02-22 15:42:46 -0800 1611) 		}
09fa5296a40d0 (Andrea Arcangeli    2017-02-22 15:42:46 -0800 1612) 
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1613) 		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1614) 		prev = vma_merge(mm, prev, start, vma_end, new_flags,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1615) 				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1616) 				 vma_policy(vma),
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1617) 				 NULL_VM_UFFD_CTX);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1618) 		if (prev) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1619) 			vma = prev;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1620) 			goto next;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1621) 		}
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1622) 		if (vma->vm_start < start) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1623) 			ret = split_vma(mm, vma, start, 1);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1624) 			if (ret)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1625) 				break;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1626) 		}
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1627) 		if (vma->vm_end > end) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1628) 			ret = split_vma(mm, vma, end, 0);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1629) 			if (ret)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1630) 				break;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1631) 		}
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1632) 	next:
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1633) 		/*
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1634) 		 * In the vma_merge() successful mprotect-like case 8:
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1635) 		 * the next vma was merged into the current one and
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1636) 		 * the current one has not been updated yet.
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1637) 		 */
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1638) 		vma->vm_flags = new_flags;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1639) 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1640) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1641) 	skip:
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1642) 		prev = vma;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1643) 		start = vma->vm_end;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1644) 		vma = vma->vm_next;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1645) 	} while (vma && vma->vm_start < end);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1646) out_unlock:
d8ed45c5dcd45 (Michel Lespinasse   2020-06-08 21:33:25 -0700 1647) 	mmap_write_unlock(mm);
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700 1648) 	mmput(mm);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1649) out:
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1650) 	return ret;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1651) }
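/*
 * Illustrative userspace counterpart to the registration sketch above:
 * unregistering takes a bare uffdio_range, and any MISSING userfault
 * still pending on the range is woken here so faulting threads do not
 * hang:
 *
 *	struct uffdio_range unreg = {
 *		.start = (unsigned long)addr,
 *		.len   = len,
 *	};
 *
 *	if (ioctl(uffd, UFFDIO_UNREGISTER, &unreg))
 *		perror("UFFDIO_UNREGISTER");
 */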
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1652) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1653) /*
ba85c702e4b24 (Andrea Arcangeli    2015-09-04 15:46:41 -0700 1654)  * userfaultfd_wake may be used in combination with the
ba85c702e4b24 (Andrea Arcangeli    2015-09-04 15:46:41 -0700 1655)  * UFFDIO_*_MODE_DONTWAKE flags to wake up userfaults in batches.
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1656)  */
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1657) static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1658) 			    unsigned long arg)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1659) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1660) 	int ret;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1661) 	struct uffdio_range uffdio_wake;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1662) 	struct userfaultfd_wake_range range;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1663) 	const void __user *buf = (void __user *)arg;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1664) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1665) 	ret = -EFAULT;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1666) 	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1667) 		goto out;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1668) 
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1669) 	ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1670) 	if (ret)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1671) 		goto out;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1672) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1673) 	range.start = uffdio_wake.start;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1674) 	range.len = uffdio_wake.len;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1675) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1676) 	/*
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1677) 	 * len == 0 means wake all and we don't want to wake all here,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1678) 	 * so check it again to be sure.
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1679) 	 */
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1680) 	VM_BUG_ON(!range.len);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1681) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1682) 	wake_userfault(ctx, &range);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1683) 	ret = 0;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1684) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1685) out:
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1686) 	return ret;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1687) }
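/*
 * Illustrative batching sketch: after resolving a group of faults with
 * the DONTWAKE variants of UFFDIO_COPY/UFFDIO_ZEROPAGE, a single
 * explicit wake can cover the whole region ("addr"/"len" as above):
 *
 *	struct uffdio_range wake = {
 *		.start = (unsigned long)addr,
 *		.len   = len,
 *	};
 *
 *	ioctl(uffd, UFFDIO_WAKE, &wake);
 */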
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1688) 
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1689) static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1690) 			    unsigned long arg)
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1691) {
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1692) 	__s64 ret;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1693) 	struct uffdio_copy uffdio_copy;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1694) 	struct uffdio_copy __user *user_uffdio_copy;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1695) 	struct userfaultfd_wake_range range;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1696) 
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1697) 	user_uffdio_copy = (struct uffdio_copy __user *) arg;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1698) 
df2cc96e77011 (Mike Rapoport       2018-06-07 17:09:25 -0700 1699) 	ret = -EAGAIN;
df2cc96e77011 (Mike Rapoport       2018-06-07 17:09:25 -0700 1700) 	if (READ_ONCE(ctx->mmap_changing))
df2cc96e77011 (Mike Rapoport       2018-06-07 17:09:25 -0700 1701) 		goto out;
df2cc96e77011 (Mike Rapoport       2018-06-07 17:09:25 -0700 1702) 
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1703) 	ret = -EFAULT;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1704) 	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1705) 			   /* don't copy "copy" last field */
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1706) 			   sizeof(uffdio_copy)-sizeof(__s64)))
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1707) 		goto out;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1708) 
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1709) 	ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1710) 	if (ret)
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1711) 		goto out;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1712) 	/*
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1713) 	 * Double check for wraparound just in case. copy_from_user()
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1714) 	 * will later verify that uffdio_copy.src + uffdio_copy.len
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1715) 	 * fits in the userland range.
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1716) 	 */
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1717) 	ret = -EINVAL;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1718) 	if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1719) 		goto out;
72981e0e7b609 (Andrea Arcangeli    2020-04-06 20:05:41 -0700 1720) 	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1721) 		goto out;
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700 1722) 	if (mmget_not_zero(ctx->mm)) {
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700 1723) 		ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
72981e0e7b609 (Andrea Arcangeli    2020-04-06 20:05:41 -0700 1724) 				   uffdio_copy.len, &ctx->mmap_changing,
72981e0e7b609 (Andrea Arcangeli    2020-04-06 20:05:41 -0700 1725) 				   uffdio_copy.mode);
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700 1726) 		mmput(ctx->mm);
96333187ab162 (Mike Rapoport       2017-02-24 14:58:31 -0800 1727) 	} else {
e86b298bebf7e (Mike Rapoport       2017-08-10 15:24:32 -0700 1728) 		return -ESRCH;
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700 1729) 	}
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1730) 	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1731) 		return -EFAULT;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1732) 	if (ret < 0)
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1733) 		goto out;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1734) 	BUG_ON(!ret);
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1735) 	/* len == 0 would wake all */
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1736) 	range.len = ret;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1737) 	if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1738) 		range.start = uffdio_copy.dst;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1739) 		wake_userfault(ctx, &range);
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1740) 	}
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1741) 	ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1742) out:
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1743) 	return ret;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1744) }
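/*
 * Illustrative userspace sketch resolving a MISSING fault with
 * UFFDIO_COPY; "page_size" is assumed to be the (huge) page size of the
 * registered vma and "src_buf" a prefilled buffer of that size:
 *
 *	struct uffdio_copy copy = {
 *		.dst  = msg.arg.pagefault.address & ~(page_size - 1),
 *		.src  = (unsigned long)src_buf,
 *		.len  = page_size,
 *		.mode = 0,
 *	};
 *
 *	if (ioctl(uffd, UFFDIO_COPY, &copy) && errno == EAGAIN)
 *		handle_partial_copy(copy.copy);
 *
 * mode 0 also wakes the faulting thread. On failure, copy.copy reports
 * either a negative error or the bytes installed before the
 * interruption; handle_partial_copy() is hypothetical.
 */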
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1745) 
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1746) static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1747) 				unsigned long arg)
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1748) {
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1749) 	__s64 ret;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1750) 	struct uffdio_zeropage uffdio_zeropage;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1751) 	struct uffdio_zeropage __user *user_uffdio_zeropage;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1752) 	struct userfaultfd_wake_range range;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1753) 
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1754) 	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1755) 
df2cc96e77011 (Mike Rapoport       2018-06-07 17:09:25 -0700 1756) 	ret = -EAGAIN;
df2cc96e77011 (Mike Rapoport       2018-06-07 17:09:25 -0700 1757) 	if (READ_ONCE(ctx->mmap_changing))
df2cc96e77011 (Mike Rapoport       2018-06-07 17:09:25 -0700 1758) 		goto out;
df2cc96e77011 (Mike Rapoport       2018-06-07 17:09:25 -0700 1759) 
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1760) 	ret = -EFAULT;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1761) 	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1762) 			   /* don't copy "zeropage" last field */
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1763) 			   sizeof(uffdio_zeropage)-sizeof(__s64)))
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1764) 		goto out;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1765) 
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1766) 	ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1767) 			     uffdio_zeropage.range.len);
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1768) 	if (ret)
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1769) 		goto out;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1770) 	ret = -EINVAL;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1771) 	if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1772) 		goto out;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1773) 
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700 1774) 	if (mmget_not_zero(ctx->mm)) {
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700 1775) 		ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
df2cc96e77011 (Mike Rapoport       2018-06-07 17:09:25 -0700 1776) 				     uffdio_zeropage.range.len,
df2cc96e77011 (Mike Rapoport       2018-06-07 17:09:25 -0700 1777) 				     &ctx->mmap_changing);
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700 1778) 		mmput(ctx->mm);
9d95aa4bada24 (Mike Rapoport       2017-08-02 13:32:15 -0700 1779) 	} else {
e86b298bebf7e (Mike Rapoport       2017-08-10 15:24:32 -0700 1780) 		return -ESRCH;
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700 1781) 	}
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1782) 	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1783) 		return -EFAULT;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1784) 	if (ret < 0)
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1785) 		goto out;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1786) 	/* len == 0 would wake all */
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1787) 	BUG_ON(!ret);
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1788) 	range.len = ret;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1789) 	if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1790) 		range.start = uffdio_zeropage.range.start;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1791) 		wake_userfault(ctx, &range);
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1792) 	}
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1793) 	ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1794) out:
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1795) 	return ret;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1796) }
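/*
 * Illustrative sketch: when a fault only needs zero-filled memory,
 * UFFDIO_ZEROPAGE avoids preparing a source buffer ("fault_addr" and
 * "page_size" as in the copy sketch above):
 *
 *	struct uffdio_zeropage zp = {
 *		.range = {
 *			.start = fault_addr & ~(page_size - 1),
 *			.len   = page_size,
 *		},
 *		.mode = 0,
 *	};
 *
 *	ioctl(uffd, UFFDIO_ZEROPAGE, &zp);
 */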
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1797) 
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1798) static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1799) 				    unsigned long arg)
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1800) {
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1801) 	int ret;
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1802) 	struct uffdio_writeprotect uffdio_wp;
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1803) 	struct uffdio_writeprotect __user *user_uffdio_wp;
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1804) 	struct userfaultfd_wake_range range;
23080e2783ba4 (Peter Xu            2020-04-06 20:06:20 -0700 1805) 	bool mode_wp, mode_dontwake;
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1806) 
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1807) 	if (READ_ONCE(ctx->mmap_changing))
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1808) 		return -EAGAIN;
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1809) 
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1810) 	user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1811) 
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1812) 	if (copy_from_user(&uffdio_wp, user_uffdio_wp,
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1813) 			   sizeof(struct uffdio_writeprotect)))
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1814) 		return -EFAULT;
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1815) 
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1816) 	ret = validate_range(ctx->mm, uffdio_wp.range.start,
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1817) 			     uffdio_wp.range.len);
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1818) 	if (ret)
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1819) 		return ret;
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1820) 
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1821) 	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1822) 			       UFFDIO_WRITEPROTECT_MODE_WP))
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1823) 		return -EINVAL;
23080e2783ba4 (Peter Xu            2020-04-06 20:06:20 -0700 1824) 
23080e2783ba4 (Peter Xu            2020-04-06 20:06:20 -0700 1825) 	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
23080e2783ba4 (Peter Xu            2020-04-06 20:06:20 -0700 1826) 	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
23080e2783ba4 (Peter Xu            2020-04-06 20:06:20 -0700 1827) 
23080e2783ba4 (Peter Xu            2020-04-06 20:06:20 -0700 1828) 	if (mode_wp && mode_dontwake)
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1829) 		return -EINVAL;
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1830) 
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1831) 	ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
23080e2783ba4 (Peter Xu            2020-04-06 20:06:20 -0700 1832) 				  uffdio_wp.range.len, mode_wp,
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1833) 				  &ctx->mmap_changing);
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1834) 	if (ret)
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1835) 		return ret;
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1836) 
23080e2783ba4 (Peter Xu            2020-04-06 20:06:20 -0700 1837) 	if (!mode_wp && !mode_dontwake) {
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1838) 		range.start = uffdio_wp.range.start;
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1839) 		range.len = uffdio_wp.range.len;
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1840) 		wake_userfault(ctx, &range);
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1841) 	}
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1842) 	return ret;
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1843) }
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1844) 
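/*
 * Illustrative userspace sketch, not part of this file: toggling write
 * protection through the UFFDIO_WRITEPROTECT handler above. It assumes
 * the range was registered with UFFDIO_REGISTER_MODE_WP. Passing
 * mode = 0 (resolve) with DONTWAKE unset takes the !mode_wp &&
 * !mode_dontwake branch above and wakes the blocked faulters.
 */
#include <stdbool.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>

static int uffd_set_wp(int uffd, unsigned long addr, unsigned long len,
		       bool protect)
{
	struct uffdio_writeprotect wp = {
		.range = { .start = addr, .len = len },
		.mode  = protect ? UFFDIO_WRITEPROTECT_MODE_WP : 0,
	};

	return ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);	/* 0 or -1/errno */
}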
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1845) static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1846) {
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1847) 	__s64 ret;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1848) 	struct uffdio_continue uffdio_continue;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1849) 	struct uffdio_continue __user *user_uffdio_continue;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1850) 	struct userfaultfd_wake_range range;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1851) 
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1852) 	user_uffdio_continue = (struct uffdio_continue __user *)arg;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1853) 
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1854) 	ret = -EAGAIN;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1855) 	if (READ_ONCE(ctx->mmap_changing))
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1856) 		goto out;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1857) 
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1858) 	ret = -EFAULT;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1859) 	if (copy_from_user(&uffdio_continue, user_uffdio_continue,
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1860) 			   /* don't copy the output fields */
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1861) 			   sizeof(uffdio_continue) - (sizeof(__s64))))
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1862) 		goto out;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1863) 
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1864) 	ret = validate_range(ctx->mm, uffdio_continue.range.start,
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1865) 			     uffdio_continue.range.len);
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1866) 	if (ret)
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1867) 		goto out;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1868) 
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1869) 	ret = -EINVAL;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1870) 	/* double check for wraparound just in case. */
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1871) 	if (uffdio_continue.range.start + uffdio_continue.range.len <=
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1872) 	    uffdio_continue.range.start) {
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1873) 		goto out;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1874) 	}
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1875) 	if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1876) 		goto out;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1877) 
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1878) 	if (mmget_not_zero(ctx->mm)) {
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1879) 		ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1880) 				     uffdio_continue.range.len,
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1881) 				     &ctx->mmap_changing);
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1882) 		mmput(ctx->mm);
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1883) 	} else {
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1884) 		return -ESRCH;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1885) 	}
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1886) 
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1887) 	if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1888) 		return -EFAULT;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1889) 	if (ret < 0)
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1890) 		goto out;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1891) 
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1892) 	/* len == 0 would wake all */
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1893) 	BUG_ON(!ret);
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1894) 	range.len = ret;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1895) 	if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1896) 		range.start = uffdio_continue.range.start;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1897) 		wake_userfault(ctx, &range);
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1898) 	}
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1899) 	ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1900) 
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1901) out:
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1902) 	return ret;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1903) }
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1904) 
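/*
 * Illustrative userspace sketch, not part of this file: resolving a
 * minor fault through the UFFDIO_CONTINUE handler above. It assumes a
 * hugetlbfs mapping registered with UFFDIO_REGISTER_MODE_MINOR whose
 * page cache already holds the right contents, so mcopy_continue()
 * only has to install the page table entries.
 */
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>

static int uffd_continue(int uffd, unsigned long addr, unsigned long len)
{
	struct uffdio_continue cont = {
		.range = { .start = addr, .len = len },
		.mode  = 0,	/* or UFFDIO_CONTINUE_MODE_DONTWAKE */
	};

	if (ioctl(uffd, UFFDIO_CONTINUE, &cont) == -1)
		return -1;	/* cont.mapped holds bytes mapped or -errno */
	return 0;
}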
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1905) static inline unsigned int uffd_ctx_features(__u64 user_features)
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1906) {
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1907) 	/*
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1908) 	 * For the current set of features the bits just coincide
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1909) 	 */
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1910) 	return (unsigned int)user_features;
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1911) }
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 1912) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1913) /*
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1914)  * userland asks for a certain API version and we return which bits
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1915)  * and ioctl commands are implemented in this kernel for that API
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1916)  * version, or -EINVAL if it is unknown.
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1917)  */
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1918) static int userfaultfd_api(struct userfaultfd_ctx *ctx,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1919) 			   unsigned long arg)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1920) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1921) 	struct uffdio_api uffdio_api;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1922) 	void __user *buf = (void __user *)arg;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1923) 	int ret;
656031445d5a8 (Andrea Arcangeli    2017-02-22 15:42:24 -0800 1924) 	__u64 features;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1925) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1926) 	ret = -EINVAL;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1927) 	if (ctx->state != UFFD_STATE_WAIT_API)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1928) 		goto out;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1929) 	ret = -EFAULT;
a9b85f9415fd9 (Andrea Arcangeli    2015-09-04 15:46:37 -0700 1930) 	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1931) 		goto out;
656031445d5a8 (Andrea Arcangeli    2017-02-22 15:42:24 -0800 1932) 	features = uffdio_api.features;
3c1c24d91ffd5 (Mike Rapoport       2019-11-30 17:58:01 -0800 1933) 	ret = -EINVAL;
3c1c24d91ffd5 (Mike Rapoport       2019-11-30 17:58:01 -0800 1934) 	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
3c1c24d91ffd5 (Mike Rapoport       2019-11-30 17:58:01 -0800 1935) 		goto err_out;
3c1c24d91ffd5 (Mike Rapoport       2019-11-30 17:58:01 -0800 1936) 	ret = -EPERM;
3c1c24d91ffd5 (Mike Rapoport       2019-11-30 17:58:01 -0800 1937) 	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
3c1c24d91ffd5 (Mike Rapoport       2019-11-30 17:58:01 -0800 1938) 		goto err_out;
656031445d5a8 (Andrea Arcangeli    2017-02-22 15:42:24 -0800 1939) 	/* report all available features and ioctls to userland */
656031445d5a8 (Andrea Arcangeli    2017-02-22 15:42:24 -0800 1940) 	uffdio_api.features = UFFD_API_FEATURES;
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1941) #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1942) 	uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS;
7677f7fd8be76 (Axel Rasmussen      2021-05-04 18:35:36 -0700 1943) #endif
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1944) 	uffdio_api.ioctls = UFFD_API_IOCTLS;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1945) 	ret = -EFAULT;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1946) 	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1947) 		goto out;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1948) 	ctx->state = UFFD_STATE_RUNNING;
656031445d5a8 (Andrea Arcangeli    2017-02-22 15:42:24 -0800 1949) 	/* only enable the requested features for this uffd context */
656031445d5a8 (Andrea Arcangeli    2017-02-22 15:42:24 -0800 1950) 	ctx->features = uffd_ctx_features(features);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1951) 	ret = 0;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1952) out:
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1953) 	return ret;
3c1c24d91ffd5 (Mike Rapoport       2019-11-30 17:58:01 -0800 1954) err_out:
3c1c24d91ffd5 (Mike Rapoport       2019-11-30 17:58:01 -0800 1955) 	memset(&uffdio_api, 0, sizeof(uffdio_api));
3c1c24d91ffd5 (Mike Rapoport       2019-11-30 17:58:01 -0800 1956) 	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
3c1c24d91ffd5 (Mike Rapoport       2019-11-30 17:58:01 -0800 1957) 		ret = -EFAULT;
3c1c24d91ffd5 (Mike Rapoport       2019-11-30 17:58:01 -0800 1958) 	goto out;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1959) }
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1960) 
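/*
 * Illustrative userspace sketch, not part of this file: the UFFDIO_API
 * handshake that userfaultfd_api() above services. Userland sends
 * UFFD_API plus the features it wants enabled; the kernel replies with
 * the features and ioctls it supports. Until this succeeds the context
 * stays in UFFD_STATE_WAIT_API and every other ioctl fails (see
 * userfaultfd_ioctl() below).
 */
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>

static int uffd_api_handshake(int uffd, __u64 wanted_features)
{
	struct uffdio_api api = {
		.api      = UFFD_API,
		.features = wanted_features,	/* 0 = no optional features */
	};

	if (ioctl(uffd, UFFDIO_API, &api) == -1)
		return -1;
	/* api.features/api.ioctls now describe what this kernel offers */
	return 0;
}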
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1961) static long userfaultfd_ioctl(struct file *file, unsigned cmd,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1962) 			      unsigned long arg)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1963) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1964) 	int ret = -EINVAL;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1965) 	struct userfaultfd_ctx *ctx = file->private_data;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1966) 
e6485a47b758c (Andrea Arcangeli    2015-09-04 15:47:15 -0700 1967) 	if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
e6485a47b758c (Andrea Arcangeli    2015-09-04 15:47:15 -0700 1968) 		return -EINVAL;
e6485a47b758c (Andrea Arcangeli    2015-09-04 15:47:15 -0700 1969) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1970) 	switch(cmd) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1971) 	case UFFDIO_API:
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1972) 		ret = userfaultfd_api(ctx, arg);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1973) 		break;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1974) 	case UFFDIO_REGISTER:
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1975) 		ret = userfaultfd_register(ctx, arg);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1976) 		break;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1977) 	case UFFDIO_UNREGISTER:
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1978) 		ret = userfaultfd_unregister(ctx, arg);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1979) 		break;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1980) 	case UFFDIO_WAKE:
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1981) 		ret = userfaultfd_wake(ctx, arg);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1982) 		break;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1983) 	case UFFDIO_COPY:
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1984) 		ret = userfaultfd_copy(ctx, arg);
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1985) 		break;
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1986) 	case UFFDIO_ZEROPAGE:
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1987) 		ret = userfaultfd_zeropage(ctx, arg);
ad465cae96b45 (Andrea Arcangeli    2015-09-04 15:47:11 -0700 1988) 		break;
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1989) 	case UFFDIO_WRITEPROTECT:
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1990) 		ret = userfaultfd_writeprotect(ctx, arg);
63b2d4174c4ad (Andrea Arcangeli    2020-04-06 20:06:12 -0700 1991) 		break;
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1992) 	case UFFDIO_CONTINUE:
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1993) 		ret = userfaultfd_continue(ctx, arg);
f619147104c8e (Axel Rasmussen      2021-05-04 18:35:49 -0700 1994) 		break;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1995) 	}
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1996) 	return ret;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1997) }
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1998) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 1999) #ifdef CONFIG_PROC_FS
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2000) static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2001) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2002) 	struct userfaultfd_ctx *ctx = f->private_data;
ac6424b981bce (Ingo Molnar         2017-06-20 12:06:13 +0200 2003) 	wait_queue_entry_t *wq;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2004) 	unsigned long pending = 0, total = 0;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2005) 
cbcfa130a911c (Eric Biggers        2019-07-04 15:14:39 -0700 2006) 	spin_lock_irq(&ctx->fault_pending_wqh.lock);
2055da97389a6 (Ingo Molnar         2017-06-20 12:06:46 +0200 2007) 	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 2008) 		pending++;
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 2009) 		total++;
15b726ef048b3 (Andrea Arcangeli    2015-09-04 15:46:44 -0700 2010) 	}
2055da97389a6 (Ingo Molnar         2017-06-20 12:06:46 +0200 2011) 	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2012) 		total++;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2013) 	}
cbcfa130a911c (Eric Biggers        2019-07-04 15:14:39 -0700 2014) 	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2015) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2016) 	/*
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2017) 	 * If more protocols are added, they will all be shown
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2018) 	 * separated by a space, like this:
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2019) 	 *	protocols: aa:... bb:...
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2020) 	 */
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2021) 	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
045098e944959 (Mike Rapoport       2017-04-07 16:04:42 -0700 2022) 		   pending, total, UFFD_API, ctx->features,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2023) 		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2024) }
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2025) #endif
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2026) 
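/*
 * Given the seq_printf() format above, /proc/<pid>/fdinfo/<fd> for a
 * userfaultfd looks like this (values illustrative only):
 *
 *	pending:	0
 *	total:	0
 *	API:	aa:0:3f
 *
 * i.e. the pending/total wait-queue counts, then UFFD_API, the enabled
 * context features, and the supported ioctl bitmask in hex.
 */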
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2027) static const struct file_operations userfaultfd_fops = {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2028) #ifdef CONFIG_PROC_FS
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2029) 	.show_fdinfo	= userfaultfd_show_fdinfo,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2030) #endif
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2031) 	.release	= userfaultfd_release,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2032) 	.poll		= userfaultfd_poll,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2033) 	.read		= userfaultfd_read,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2034) 	.unlocked_ioctl = userfaultfd_ioctl,
1832f2d8ff691 (Arnd Bergmann       2018-09-11 21:59:08 +0200 2035) 	.compat_ioctl	= compat_ptr_ioctl,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2036) 	.llseek		= noop_llseek,
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2037) };
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2038) 
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2039) static void init_once_userfaultfd_ctx(void *mem)
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2040) {
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2041) 	struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2042) 
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2043) 	init_waitqueue_head(&ctx->fault_pending_wqh);
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2044) 	init_waitqueue_head(&ctx->fault_wqh);
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 2045) 	init_waitqueue_head(&ctx->event_wqh);
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2046) 	init_waitqueue_head(&ctx->fd_wqh);
2ca97ac8bdcc3 (Ahmed S. Darwish    2020-07-20 17:55:28 +0200 2047) 	seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2048) }
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2049) 
284cd241a18ee (Eric Biggers        2018-01-31 16:19:48 -0800 2050) SYSCALL_DEFINE1(userfaultfd, int, flags)
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2051) {
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2052) 	struct userfaultfd_ctx *ctx;
284cd241a18ee (Eric Biggers        2018-01-31 16:19:48 -0800 2053) 	int fd;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2054) 
d0d4730ac2e40 (Lokesh Gidra        2020-12-14 19:13:54 -0800 2055) 	if (!sysctl_unprivileged_userfaultfd &&
d0d4730ac2e40 (Lokesh Gidra        2020-12-14 19:13:54 -0800 2056) 	    (flags & UFFD_USER_MODE_ONLY) == 0 &&
d0d4730ac2e40 (Lokesh Gidra        2020-12-14 19:13:54 -0800 2057) 	    !capable(CAP_SYS_PTRACE)) {
d0d4730ac2e40 (Lokesh Gidra        2020-12-14 19:13:54 -0800 2058) 		printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
d0d4730ac2e40 (Lokesh Gidra        2020-12-14 19:13:54 -0800 2059) 			"sysctl knob to 1 if kernel faults must be handled "
d0d4730ac2e40 (Lokesh Gidra        2020-12-14 19:13:54 -0800 2060) 			"without obtaining CAP_SYS_PTRACE capability\n");
cefdca0a86be5 (Peter Xu            2019-05-13 17:16:41 -0700 2061) 		return -EPERM;
d0d4730ac2e40 (Lokesh Gidra        2020-12-14 19:13:54 -0800 2062) 	}
cefdca0a86be5 (Peter Xu            2019-05-13 17:16:41 -0700 2063) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2064) 	BUG_ON(!current->mm);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2065) 
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2066) 	/* Check the UFFD_* constants for consistency.  */
37cd0575b8510 (Lokesh Gidra        2020-12-14 19:13:49 -0800 2067) 	BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2068) 	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2069) 	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2070) 
37cd0575b8510 (Lokesh Gidra        2020-12-14 19:13:49 -0800 2071) 	if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
284cd241a18ee (Eric Biggers        2018-01-31 16:19:48 -0800 2072) 		return -EINVAL;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2073) 
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2074) 	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2075) 	if (!ctx)
284cd241a18ee (Eric Biggers        2018-01-31 16:19:48 -0800 2076) 		return -ENOMEM;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2077) 
ca880420665db (Eric Biggers        2018-12-28 00:34:43 -0800 2078) 	refcount_set(&ctx->refcount, 1);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2079) 	ctx->flags = flags;
9cd75c3cd4c3d (Pavel Emelyanov     2017-02-22 15:42:21 -0800 2080) 	ctx->features = 0;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2081) 	ctx->state = UFFD_STATE_WAIT_API;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2082) 	ctx->released = false;
df2cc96e77011 (Mike Rapoport       2018-06-07 17:09:25 -0700 2083) 	ctx->mmap_changing = false;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2084) 	ctx->mm = current->mm;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2085) 	/* prevent the mm struct from being freed */
f1f1007644ffc (Vegard Nossum       2017-02-27 14:30:07 -0800 2086) 	mmgrab(ctx->mm);
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2087) 
b537900f1598b (Daniel Colascione   2021-01-08 14:22:23 -0800 2088) 	fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
b537900f1598b (Daniel Colascione   2021-01-08 14:22:23 -0800 2089) 			O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
284cd241a18ee (Eric Biggers        2018-01-31 16:19:48 -0800 2090) 	if (fd < 0) {
d2005e3f41d4f (Oleg Nesterov       2016-05-20 16:58:36 -0700 2091) 		mmdrop(ctx->mm);
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2092) 		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
c03e946fdd653 (Eric Biggers        2015-09-17 16:01:54 -0700 2093) 	}
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2094) 	return fd;
86039bd3b4e6a (Andrea Arcangeli    2015-09-04 15:46:31 -0700 2095) }
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2096) 
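/*
 * Illustrative userspace sketch, not part of this file: creating the
 * fd this syscall returns. glibc ships no wrapper, so syscall(2) is
 * used directly. UFFD_USER_MODE_ONLY limits handling to userspace
 * faults, which keeps the call permitted even when the
 * unprivileged_userfaultfd sysctl is 0 and the caller lacks
 * CAP_SYS_PTRACE (see the check above).
 */
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/userfaultfd.h>

static int uffd_open(void)
{
	/* UFFD_CLOEXEC/UFFD_NONBLOCK equal O_CLOEXEC/O_NONBLOCK, per the
	 * BUILD_BUG_ON()s above. */
	return (int)syscall(__NR_userfaultfd,
			    O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
}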
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2097) static int __init userfaultfd_init(void)
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2098) {
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2099) 	userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2100) 						sizeof(struct userfaultfd_ctx),
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2101) 						0,
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2102) 						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2103) 						init_once_userfaultfd_ctx);
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2104) 	return 0;
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2105) }
3004ec9cabf49 (Andrea Arcangeli    2015-09-04 15:46:48 -0700 2106) __initcall(userfaultfd_init);