20c8ccb1975b8 (Thomas Gleixner 2019-06-04 10:11:32 +0200 1) // SPDX-License-Identifier: GPL-2.0-only
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 3) * fs/userfaultfd.c
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 4) *
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 5) * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 6) * Copyright (C) 2008-2009 Red Hat, Inc.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 7) * Copyright (C) 2015 Red Hat, Inc.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 8) *
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 9) * Some part derived from fs/eventfd.c (anon inode setup) and
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 10) * mm/ksm.c (mm hashing).
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 11) */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 12)
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 13) #include <linux/list.h>
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 14) #include <linux/hashtable.h>
174cd4b1e5fbd (Ingo Molnar 2017-02-02 19:15:33 +0100 15) #include <linux/sched/signal.h>
6e84f31522f93 (Ingo Molnar 2017-02-08 18:51:29 +0100 16) #include <linux/sched/mm.h>
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 17) #include <linux/mm.h>
6dfeaff93be1a (Peter Xu 2021-05-04 18:33:13 -0700 18) #include <linux/mmu_notifier.h>
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 19) #include <linux/poll.h>
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 20) #include <linux/slab.h>
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 21) #include <linux/seq_file.h>
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 22) #include <linux/file.h>
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 23) #include <linux/bug.h>
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 24) #include <linux/anon_inodes.h>
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 25) #include <linux/syscalls.h>
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 26) #include <linux/userfaultfd_k.h>
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 27) #include <linux/mempolicy.h>
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 28) #include <linux/ioctl.h>
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 29) #include <linux/security.h>
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 30) #include <linux/hugetlb.h>
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 31)
d0d4730ac2e40 (Lokesh Gidra 2020-12-14 19:13:54 -0800 32) int sysctl_unprivileged_userfaultfd __read_mostly;
cefdca0a86be5 (Peter Xu 2019-05-13 17:16:41 -0700 33)
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 34) static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 35)
/*
 * Values stored in userfaultfd_ctx.state (the context "state machine").
 * NOTE(review): presumably a context sits in WAIT_API until the API
 * handshake has completed and is RUNNING afterwards -- confirm with the
 * ioctl code.
 */
enum userfaultfd_state {
	UFFD_STATE_WAIT_API,
	UFFD_STATE_RUNNING,
};
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 40)
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 41) /*
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 42) * Start with fault_pending_wqh and fault_wqh so they're more likely
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 43) * to be in the same cacheline.
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 44) *
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 45) * Locking order:
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 46) * fd_wqh.lock
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 47) * fault_pending_wqh.lock
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 48) * fault_wqh.lock
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 49) * event_wqh.lock
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 50) *
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 51) * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 52) * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 53) * also taken in IRQ context.
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 54) */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 55) struct userfaultfd_ctx {
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 56) /* waitqueue head for the pending (i.e. not read) userfaults */
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 57) wait_queue_head_t fault_pending_wqh;
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 58) /* waitqueue head for the userfaults */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 59) wait_queue_head_t fault_wqh;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 60) /* waitqueue head for the pseudo fd to wakeup poll/read */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 61) wait_queue_head_t fd_wqh;
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 62) /* waitqueue head for events */
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 63) wait_queue_head_t event_wqh;
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 64) /* a refile sequence protected by fault_pending_wqh lock */
2ca97ac8bdcc3 (Ahmed S. Darwish 2020-07-20 17:55:28 +0200 65) seqcount_spinlock_t refile_seq;
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 66) /* pseudo fd refcounting */
ca880420665db (Eric Biggers 2018-12-28 00:34:43 -0800 67) refcount_t refcount;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 68) /* userfaultfd syscall flags */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 69) unsigned int flags;
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 70) /* features requested from the userspace */
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 71) unsigned int features;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 72) /* state machine */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 73) enum userfaultfd_state state;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 74) /* released */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 75) bool released;
df2cc96e77011 (Mike Rapoport 2018-06-07 17:09:25 -0700 76) /* memory mappings are changing because of non-cooperative event */
df2cc96e77011 (Mike Rapoport 2018-06-07 17:09:25 -0700 77) bool mmap_changing;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 78) /* mm with one ore more vmas attached to this userfaultfd_ctx */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 79) struct mm_struct *mm;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 80) };
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 81)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 82) struct userfaultfd_fork_ctx {
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 83) struct userfaultfd_ctx *orig;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 84) struct userfaultfd_ctx *new;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 85) struct list_head list;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 86) };
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 87)
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 88) struct userfaultfd_unmap_ctx {
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 89) struct userfaultfd_ctx *ctx;
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 90) unsigned long start;
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 91) unsigned long end;
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 92) struct list_head list;
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 93) };
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 94)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 95) struct userfaultfd_wait_queue {
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 96) struct uffd_msg msg;
ac6424b981bce (Ingo Molnar 2017-06-20 12:06:13 +0200 97) wait_queue_entry_t wq;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 98) struct userfaultfd_ctx *ctx;
15a77c6fe494f (Andrea Arcangeli 2017-01-24 15:17:59 -0800 99) bool waken;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 100) };
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 101)
/*
 * Wake-up key handed to userfaultfd_wake_function(): waiters whose fault
 * address lies in [start, start + len) are woken.  A @len of zero means
 * "wake everybody".
 */
struct userfaultfd_wake_range {
	unsigned long start;
	unsigned long len;
};
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 106)
ac6424b981bce (Ingo Molnar 2017-06-20 12:06:13 +0200 107) static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 108) int wake_flags, void *key)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 109) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 110) struct userfaultfd_wake_range *range = key;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 111) int ret;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 112) struct userfaultfd_wait_queue *uwq;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 113) unsigned long start, len;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 114)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 115) uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 116) ret = 0;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 117) /* len == 0 means wake all */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 118) start = range->start;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 119) len = range->len;
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 120) if (len && (start > uwq->msg.arg.pagefault.address ||
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 121) start + len <= uwq->msg.arg.pagefault.address))
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 122) goto out;
15a77c6fe494f (Andrea Arcangeli 2017-01-24 15:17:59 -0800 123) WRITE_ONCE(uwq->waken, true);
15a77c6fe494f (Andrea Arcangeli 2017-01-24 15:17:59 -0800 124) /*
a9668cd6ee288 (Peter Zijlstra 2017-06-07 17:51:27 +0200 125) * The Program-Order guarantees provided by the scheduler
a9668cd6ee288 (Peter Zijlstra 2017-06-07 17:51:27 +0200 126) * ensure uwq->waken is visible before the task is woken.
15a77c6fe494f (Andrea Arcangeli 2017-01-24 15:17:59 -0800 127) */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 128) ret = wake_up_state(wq->private, mode);
a9668cd6ee288 (Peter Zijlstra 2017-06-07 17:51:27 +0200 129) if (ret) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 130) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 131) * Wake only once, autoremove behavior.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 132) *
a9668cd6ee288 (Peter Zijlstra 2017-06-07 17:51:27 +0200 133) * After the effect of list_del_init is visible to the other
a9668cd6ee288 (Peter Zijlstra 2017-06-07 17:51:27 +0200 134) * CPUs, the waitqueue may disappear from under us, see the
a9668cd6ee288 (Peter Zijlstra 2017-06-07 17:51:27 +0200 135) * !list_empty_careful() in handle_userfault().
a9668cd6ee288 (Peter Zijlstra 2017-06-07 17:51:27 +0200 136) *
a9668cd6ee288 (Peter Zijlstra 2017-06-07 17:51:27 +0200 137) * try_to_wake_up() has an implicit smp_mb(), and the
a9668cd6ee288 (Peter Zijlstra 2017-06-07 17:51:27 +0200 138) * wq->private is read before calling the extern function
a9668cd6ee288 (Peter Zijlstra 2017-06-07 17:51:27 +0200 139) * "wake_up_state" (which in turns calls try_to_wake_up).
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 140) */
2055da97389a6 (Ingo Molnar 2017-06-20 12:06:46 +0200 141) list_del_init(&wq->entry);
a9668cd6ee288 (Peter Zijlstra 2017-06-07 17:51:27 +0200 142) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 143) out:
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 144) return ret;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 145) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 146)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 147) /**
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 148) * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 149) * context.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 150) * @ctx: [in] Pointer to the userfaultfd context.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 151) */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 152) static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 153) {
ca880420665db (Eric Biggers 2018-12-28 00:34:43 -0800 154) refcount_inc(&ctx->refcount);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 155) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 156)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 157) /**
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 158) * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 159) * context.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 160) * @ctx: [in] Pointer to userfaultfd context.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 161) *
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 162) * The userfaultfd context reference must have been previously acquired either
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 163) * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 164) */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 165) static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 166) {
ca880420665db (Eric Biggers 2018-12-28 00:34:43 -0800 167) if (refcount_dec_and_test(&ctx->refcount)) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 168) VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 169) VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 170) VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 171) VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 172) VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 173) VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 174) VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 175) VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 176) mmdrop(ctx->mm);
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 177) kmem_cache_free(userfaultfd_ctx_cachep, ctx);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 178) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 179) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 180)
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 181) static inline void msg_init(struct uffd_msg *msg)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 182) {
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 183) BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 184) /*
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 185) * Must use memset to zero out the paddings or kernel data is
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 186) * leaked to userland.
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 187) */
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 188) memset(msg, 0, sizeof(struct uffd_msg));
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 189) }
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 190)
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 191) static inline struct uffd_msg userfault_msg(unsigned long address,
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 192) unsigned int flags,
9d4ac934829ac (Alexey Perevalov 2017-09-06 16:23:56 -0700 193) unsigned long reason,
9d4ac934829ac (Alexey Perevalov 2017-09-06 16:23:56 -0700 194) unsigned int features)
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 195) {
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 196) struct uffd_msg msg;
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 197) msg_init(&msg);
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 198) msg.event = UFFD_EVENT_PAGEFAULT;
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 199) msg.arg.pagefault.address = address;
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 200) /*
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 201) * These flags indicate why the userfault occurred:
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 202) * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 203) * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 204) * - Neither of these flags being set indicates a MISSING fault.
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 205) *
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 206) * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 207) * fault. Otherwise, it was a read fault.
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 208) */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 209) if (flags & FAULT_FLAG_WRITE)
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 210) msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 211) if (reason & VM_UFFD_WP)
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 212) msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 213) if (reason & VM_UFFD_MINOR)
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 214) msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
9d4ac934829ac (Alexey Perevalov 2017-09-06 16:23:56 -0700 215) if (features & UFFD_FEATURE_THREAD_ID)
a36985d31a65d (Andrea Arcangeli 2017-09-06 16:23:59 -0700 216) msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 217) return msg;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 218) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 219)
#ifdef CONFIG_HUGETLB_PAGE
/*
 * hugetlb counterpart of userfaultfd_must_wait(), adapted to hugepmd
 * ranges.
 *
 * Must be called with the mmap lock of ctx->mm held.  Returns true when
 * the faulting task still has to wait: no huge pte could be looked up,
 * the huge pte is none, or the fault is a VM_UFFD_WP one and the huge pte
 * is not writable.  @flags is not used.
 */
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					      struct vm_area_struct *vma,
					      unsigned long address,
					      unsigned long flags,
					      unsigned long reason)
{
	struct mm_struct *mm = ctx->mm;
	pte_t *ptep;
	pte_t entry;

	mmap_assert_locked(mm);

	ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
	if (!ptep)
		return true;

	/*
	 * Lockless access: we're in a wait_event so it's ok if the entry
	 * changes under us.
	 */
	entry = huge_ptep_get(ptep);

	return huge_pte_none(entry) ||
	       (!huge_pte_write(entry) && (reason & VM_UFFD_WP));
}
#else
/* Stub for kernels built without CONFIG_HUGETLB_PAGE. */
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					      struct vm_area_struct *vma,
					      unsigned long address,
					      unsigned long flags,
					      unsigned long reason)
{
	return false;	/* should never get here */
}
#endif /* CONFIG_HUGETLB_PAGE */
369cd2121be44 (Mike Kravetz 2017-02-22 15:43:10 -0800 266)
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 267) /*
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 268) * Verify the pagetables are still not ok after having reigstered into
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 269) * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 270) * userfault that has already been resolved, if userfaultfd_read and
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 271) * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 272) * threads.
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 273) */
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 274) static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 275) unsigned long address,
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 276) unsigned long flags,
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 277) unsigned long reason)
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 278) {
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 279) struct mm_struct *mm = ctx->mm;
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 280) pgd_t *pgd;
c2febafc67734 (Kirill A. Shutemov 2017-03-09 17:24:07 +0300 281) p4d_t *p4d;
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 282) pud_t *pud;
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 283) pmd_t *pmd, _pmd;
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 284) pte_t *pte;
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 285) bool ret = true;
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 286)
42fc541404f24 (Michel Lespinasse 2020-06-08 21:33:44 -0700 287) mmap_assert_locked(mm);
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 288)
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 289) pgd = pgd_offset(mm, address);
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 290) if (!pgd_present(*pgd))
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 291) goto out;
c2febafc67734 (Kirill A. Shutemov 2017-03-09 17:24:07 +0300 292) p4d = p4d_offset(pgd, address);
c2febafc67734 (Kirill A. Shutemov 2017-03-09 17:24:07 +0300 293) if (!p4d_present(*p4d))
c2febafc67734 (Kirill A. Shutemov 2017-03-09 17:24:07 +0300 294) goto out;
c2febafc67734 (Kirill A. Shutemov 2017-03-09 17:24:07 +0300 295) pud = pud_offset(p4d, address);
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 296) if (!pud_present(*pud))
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 297) goto out;
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 298) pmd = pmd_offset(pud, address);
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 299) /*
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 300) * READ_ONCE must function as a barrier with narrower scope
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 301) * and it must be equivalent to:
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 302) * _pmd = *pmd; barrier();
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 303) *
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 304) * This is to deal with the instability (as in
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 305) * pmd_trans_unstable) of the pmd.
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 306) */
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 307) _pmd = READ_ONCE(*pmd);
a365ac09d3343 (Huang Ying 2018-01-31 16:17:32 -0800 308) if (pmd_none(_pmd))
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 309) goto out;
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 310)
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 311) ret = false;
a365ac09d3343 (Huang Ying 2018-01-31 16:17:32 -0800 312) if (!pmd_present(_pmd))
a365ac09d3343 (Huang Ying 2018-01-31 16:17:32 -0800 313) goto out;
a365ac09d3343 (Huang Ying 2018-01-31 16:17:32 -0800 314)
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 315) if (pmd_trans_huge(_pmd)) {
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 316) if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 317) ret = true;
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 318) goto out;
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 319) }
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 320)
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 321) /*
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 322) * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 323) * and use the standard pte_offset_map() instead of parsing _pmd.
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 324) */
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 325) pte = pte_offset_map(pmd, address);
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 326) /*
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 327) * Lockless access: we're in a wait_event so it's ok if it
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 328) * changes under us.
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 329) */
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 330) if (pte_none(*pte))
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 331) ret = true;
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 332) if (!pte_write(*pte) && (reason & VM_UFFD_WP))
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 333) ret = true;
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 334) pte_unmap(pte);
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 335)
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 336) out:
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 337) return ret;
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 338) }
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 339)
3e69ad081c18d (Peter Xu 2020-04-01 21:09:00 -0700 340) static inline long userfaultfd_get_blocking_state(unsigned int flags)
3e69ad081c18d (Peter Xu 2020-04-01 21:09:00 -0700 341) {
3e69ad081c18d (Peter Xu 2020-04-01 21:09:00 -0700 342) if (flags & FAULT_FLAG_INTERRUPTIBLE)
3e69ad081c18d (Peter Xu 2020-04-01 21:09:00 -0700 343) return TASK_INTERRUPTIBLE;
3e69ad081c18d (Peter Xu 2020-04-01 21:09:00 -0700 344)
3e69ad081c18d (Peter Xu 2020-04-01 21:09:00 -0700 345) if (flags & FAULT_FLAG_KILLABLE)
3e69ad081c18d (Peter Xu 2020-04-01 21:09:00 -0700 346) return TASK_KILLABLE;
3e69ad081c18d (Peter Xu 2020-04-01 21:09:00 -0700 347)
3e69ad081c18d (Peter Xu 2020-04-01 21:09:00 -0700 348) return TASK_UNINTERRUPTIBLE;
3e69ad081c18d (Peter Xu 2020-04-01 21:09:00 -0700 349) }
3e69ad081c18d (Peter Xu 2020-04-01 21:09:00 -0700 350)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 351) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 352) * The locking rules involved in returning VM_FAULT_RETRY depending on
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 353) * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 354) * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 355) * recommendation in __lock_page_or_retry is not an understatement.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 356) *
c1e8d7c6a7a68 (Michel Lespinasse 2020-06-08 21:33:54 -0700 357) * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 358) * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 359) * not set.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 360) *
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 361) * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 362) * set, VM_FAULT_RETRY can still be returned if and only if there are
c1e8d7c6a7a68 (Michel Lespinasse 2020-06-08 21:33:54 -0700 363) * fatal_signal_pending()s, and the mmap_lock must be released before
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 364) * returning it.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 365) */
2b7403035459c (Souptick Joarder 2018-08-23 17:01:36 -0700 366) vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 367) {
82b0f8c39a386 (Jan Kara 2016-12-14 15:06:58 -0800 368) struct mm_struct *mm = vmf->vma->vm_mm;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 369) struct userfaultfd_ctx *ctx;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 370) struct userfaultfd_wait_queue uwq;
2b7403035459c (Souptick Joarder 2018-08-23 17:01:36 -0700 371) vm_fault_t ret = VM_FAULT_SIGBUS;
3e69ad081c18d (Peter Xu 2020-04-01 21:09:00 -0700 372) bool must_wait;
15a77c6fe494f (Andrea Arcangeli 2017-01-24 15:17:59 -0800 373) long blocking_state;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 374)
64c2b20301f62 (Andrea Arcangeli 2017-06-16 14:02:37 -0700 375) /*
64c2b20301f62 (Andrea Arcangeli 2017-06-16 14:02:37 -0700 376) * We don't do userfault handling for the final child pid update.
64c2b20301f62 (Andrea Arcangeli 2017-06-16 14:02:37 -0700 377) *
64c2b20301f62 (Andrea Arcangeli 2017-06-16 14:02:37 -0700 378) * We also don't do userfault handling during
64c2b20301f62 (Andrea Arcangeli 2017-06-16 14:02:37 -0700 379) * coredumping. hugetlbfs has the special
64c2b20301f62 (Andrea Arcangeli 2017-06-16 14:02:37 -0700 380) * follow_hugetlb_page() to skip missing pages in the
64c2b20301f62 (Andrea Arcangeli 2017-06-16 14:02:37 -0700 381) * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
64c2b20301f62 (Andrea Arcangeli 2017-06-16 14:02:37 -0700 382) * the no_page_table() helper in follow_page_mask(), but the
64c2b20301f62 (Andrea Arcangeli 2017-06-16 14:02:37 -0700 383) * shmem_vm_ops->fault method is invoked even during
c1e8d7c6a7a68 (Michel Lespinasse 2020-06-08 21:33:54 -0700 384) * coredumping without mmap_lock and it ends up here.
64c2b20301f62 (Andrea Arcangeli 2017-06-16 14:02:37 -0700 385) */
64c2b20301f62 (Andrea Arcangeli 2017-06-16 14:02:37 -0700 386) if (current->flags & (PF_EXITING|PF_DUMPCORE))
64c2b20301f62 (Andrea Arcangeli 2017-06-16 14:02:37 -0700 387) goto out;
64c2b20301f62 (Andrea Arcangeli 2017-06-16 14:02:37 -0700 388)
64c2b20301f62 (Andrea Arcangeli 2017-06-16 14:02:37 -0700 389) /*
c1e8d7c6a7a68 (Michel Lespinasse 2020-06-08 21:33:54 -0700 390) * Coredumping runs without mmap_lock so we can only check that
c1e8d7c6a7a68 (Michel Lespinasse 2020-06-08 21:33:54 -0700 391) * the mmap_lock is held, if PF_DUMPCORE was not set.
64c2b20301f62 (Andrea Arcangeli 2017-06-16 14:02:37 -0700 392) */
42fc541404f24 (Michel Lespinasse 2020-06-08 21:33:44 -0700 393) mmap_assert_locked(mm);
64c2b20301f62 (Andrea Arcangeli 2017-06-16 14:02:37 -0700 394)
82b0f8c39a386 (Jan Kara 2016-12-14 15:06:58 -0800 395) ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 396) if (!ctx)
ba85c702e4b24 (Andrea Arcangeli 2015-09-04 15:46:41 -0700 397) goto out;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 398)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 399) BUG_ON(ctx->mm != mm);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 400)
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 401) /* Any unrecognized flag is a bug. */
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 402) VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 403) /* 0 or > 1 flags set is a bug; we expect exactly 1. */
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 404) VM_BUG_ON(!reason || (reason & (reason - 1)));
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 405)
2d6d6f5a09a96 (Prakash Sangappa 2017-09-06 16:23:39 -0700 406) if (ctx->features & UFFD_FEATURE_SIGBUS)
2d6d6f5a09a96 (Prakash Sangappa 2017-09-06 16:23:39 -0700 407) goto out;
37cd0575b8510 (Lokesh Gidra 2020-12-14 19:13:49 -0800 408) if ((vmf->flags & FAULT_FLAG_USER) == 0 &&
37cd0575b8510 (Lokesh Gidra 2020-12-14 19:13:49 -0800 409) ctx->flags & UFFD_USER_MODE_ONLY) {
37cd0575b8510 (Lokesh Gidra 2020-12-14 19:13:49 -0800 410) printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
37cd0575b8510 (Lokesh Gidra 2020-12-14 19:13:49 -0800 411) "sysctl knob to 1 if kernel faults must be handled "
37cd0575b8510 (Lokesh Gidra 2020-12-14 19:13:49 -0800 412) "without obtaining CAP_SYS_PTRACE capability\n");
37cd0575b8510 (Lokesh Gidra 2020-12-14 19:13:49 -0800 413) goto out;
37cd0575b8510 (Lokesh Gidra 2020-12-14 19:13:49 -0800 414) }
2d6d6f5a09a96 (Prakash Sangappa 2017-09-06 16:23:39 -0700 415)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 416) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 417) * If it's already released don't get it. This avoids to loop
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 418) * in __get_user_pages if userfaultfd_release waits on the
c1e8d7c6a7a68 (Michel Lespinasse 2020-06-08 21:33:54 -0700 419) * caller of handle_userfault to release the mmap_lock.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 420) */
6aa7de059173a (Mark Rutland 2017-10-23 14:07:29 -0700 421) if (unlikely(READ_ONCE(ctx->released))) {
656710a60e369 (Andrea Arcangeli 2017-09-08 16:12:42 -0700 422) /*
656710a60e369 (Andrea Arcangeli 2017-09-08 16:12:42 -0700 423) * Don't return VM_FAULT_SIGBUS in this case, so a non
656710a60e369 (Andrea Arcangeli 2017-09-08 16:12:42 -0700 424) * cooperative manager can close the uffd after the
656710a60e369 (Andrea Arcangeli 2017-09-08 16:12:42 -0700 425) * last UFFDIO_COPY, without risking to trigger an
656710a60e369 (Andrea Arcangeli 2017-09-08 16:12:42 -0700 426) * involuntary SIGBUS if the process was starting the
656710a60e369 (Andrea Arcangeli 2017-09-08 16:12:42 -0700 427) * userfaultfd while the userfaultfd was still armed
656710a60e369 (Andrea Arcangeli 2017-09-08 16:12:42 -0700 428) * (but after the last UFFDIO_COPY). If the uffd
656710a60e369 (Andrea Arcangeli 2017-09-08 16:12:42 -0700 429) * wasn't already closed when the userfault reached
656710a60e369 (Andrea Arcangeli 2017-09-08 16:12:42 -0700 430) * this point, that would normally be solved by
656710a60e369 (Andrea Arcangeli 2017-09-08 16:12:42 -0700 431) * userfaultfd_must_wait returning 'false'.
656710a60e369 (Andrea Arcangeli 2017-09-08 16:12:42 -0700 432) *
656710a60e369 (Andrea Arcangeli 2017-09-08 16:12:42 -0700 433) * If we were to return VM_FAULT_SIGBUS here, the non
656710a60e369 (Andrea Arcangeli 2017-09-08 16:12:42 -0700 434) * cooperative manager would be instead forced to
656710a60e369 (Andrea Arcangeli 2017-09-08 16:12:42 -0700 435) * always call UFFDIO_UNREGISTER before it can safely
656710a60e369 (Andrea Arcangeli 2017-09-08 16:12:42 -0700 436) * close the uffd.
656710a60e369 (Andrea Arcangeli 2017-09-08 16:12:42 -0700 437) */
656710a60e369 (Andrea Arcangeli 2017-09-08 16:12:42 -0700 438) ret = VM_FAULT_NOPAGE;
ba85c702e4b24 (Andrea Arcangeli 2015-09-04 15:46:41 -0700 439) goto out;
656710a60e369 (Andrea Arcangeli 2017-09-08 16:12:42 -0700 440) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 441)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 442) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 443) * Check that we can return VM_FAULT_RETRY.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 444) *
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 445) * NOTE: it should become possible to return VM_FAULT_RETRY
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 446) * even if FAULT_FLAG_TRIED is set without leading to gup()
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 447) * -EBUSY failures, if the userfaultfd is to be extended for
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 448) * VM_UFFD_WP tracking and we intend to arm the userfault
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 449) * without first stopping userland access to the memory. For
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 450) * VM_UFFD_MISSING userfaults this is enough for now.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 451) */
82b0f8c39a386 (Jan Kara 2016-12-14 15:06:58 -0800 452) if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 453) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 454) * Validate the invariant that nowait must allow retry
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 455) * to be sure not to return SIGBUS erroneously on
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 456) * nowait invocations.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 457) */
82b0f8c39a386 (Jan Kara 2016-12-14 15:06:58 -0800 458) BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 459) #ifdef CONFIG_DEBUG_VM
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 460) if (printk_ratelimit()) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 461) printk(KERN_WARNING
82b0f8c39a386 (Jan Kara 2016-12-14 15:06:58 -0800 462) "FAULT_FLAG_ALLOW_RETRY missing %x\n",
82b0f8c39a386 (Jan Kara 2016-12-14 15:06:58 -0800 463) vmf->flags);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 464) dump_stack();
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 465) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 466) #endif
ba85c702e4b24 (Andrea Arcangeli 2015-09-04 15:46:41 -0700 467) goto out;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 468) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 469)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 470) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 471) * Handle nowait, not much to do other than tell it to retry
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 472) * and wait.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 473) */
ba85c702e4b24 (Andrea Arcangeli 2015-09-04 15:46:41 -0700 474) ret = VM_FAULT_RETRY;
82b0f8c39a386 (Jan Kara 2016-12-14 15:06:58 -0800 475) if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
ba85c702e4b24 (Andrea Arcangeli 2015-09-04 15:46:41 -0700 476) goto out;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 477)
c1e8d7c6a7a68 (Michel Lespinasse 2020-06-08 21:33:54 -0700 478) /* take the reference before dropping the mmap_lock */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 479) userfaultfd_ctx_get(ctx);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 480)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 481) init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 482) uwq.wq.private = current;
9d4ac934829ac (Alexey Perevalov 2017-09-06 16:23:56 -0700 483) uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
9d4ac934829ac (Alexey Perevalov 2017-09-06 16:23:56 -0700 484) ctx->features);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 485) uwq.ctx = ctx;
15a77c6fe494f (Andrea Arcangeli 2017-01-24 15:17:59 -0800 486) uwq.waken = false;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 487)
3e69ad081c18d (Peter Xu 2020-04-01 21:09:00 -0700 488) blocking_state = userfaultfd_get_blocking_state(vmf->flags);
dfa37dc3fc1f6 (Andrea Arcangeli 2015-09-04 15:47:18 -0700 489)
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 490) spin_lock_irq(&ctx->fault_pending_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 491) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 492) * After the __add_wait_queue the uwq is visible to userland
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 493) * through poll/read().
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 494) */
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 495) __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 496) /*
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 497) * The smp_mb() after __set_current_state prevents the reads
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 498) * following the spin_unlock to happen before the list_add in
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 499) * __add_wait_queue.
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 500) */
15a77c6fe494f (Andrea Arcangeli 2017-01-24 15:17:59 -0800 501) set_current_state(blocking_state);
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 502) spin_unlock_irq(&ctx->fault_pending_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 503)
369cd2121be44 (Mike Kravetz 2017-02-22 15:43:10 -0800 504) if (!is_vm_hugetlb_page(vmf->vma))
369cd2121be44 (Mike Kravetz 2017-02-22 15:43:10 -0800 505) must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
369cd2121be44 (Mike Kravetz 2017-02-22 15:43:10 -0800 506) reason);
369cd2121be44 (Mike Kravetz 2017-02-22 15:43:10 -0800 507) else
7868a2087ec13 (Punit Agrawal 2017-07-06 15:39:42 -0700 508) must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
7868a2087ec13 (Punit Agrawal 2017-07-06 15:39:42 -0700 509) vmf->address,
369cd2121be44 (Mike Kravetz 2017-02-22 15:43:10 -0800 510) vmf->flags, reason);
d8ed45c5dcd45 (Michel Lespinasse 2020-06-08 21:33:25 -0700 511) mmap_read_unlock(mm);
8d2afd96c2031 (Andrea Arcangeli 2015-09-04 15:46:51 -0700 512)
f9bf352224d7d (Linus Torvalds 2020-08-02 10:42:31 -0700 513) if (likely(must_wait && !READ_ONCE(ctx->released))) {
a9a08845e9acb (Linus Torvalds 2018-02-11 14:34:03 -0800 514) wake_up_poll(&ctx->fd_wqh, EPOLLIN);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 515) schedule();
ba85c702e4b24 (Andrea Arcangeli 2015-09-04 15:46:41 -0700 516) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 517)
ba85c702e4b24 (Andrea Arcangeli 2015-09-04 15:46:41 -0700 518) __set_current_state(TASK_RUNNING);
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 519)
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 520) /*
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 521) * Here we race with the list_del; list_add in
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 522) * userfaultfd_ctx_read(), however because we don't ever run
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 523) * list_del_init() to refile across the two lists, the prev
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 524) * and next pointers will never point to self. list_add also
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 525) * would never let any of the two pointers to point to
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 526) * self. So list_empty_careful won't risk to see both pointers
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 527) * pointing to self at any time during the list refile. The
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 528) * only case where list_del_init() is called is the full
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 529) * removal in the wake function and there we don't re-list_add
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 530) * and it's fine not to block on the spinlock. The uwq on this
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 531) * kernel stack can be released after the list_del_init.
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 532) */
2055da97389a6 (Ingo Molnar 2017-06-20 12:06:46 +0200 533) if (!list_empty_careful(&uwq.wq.entry)) {
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 534) spin_lock_irq(&ctx->fault_pending_wqh.lock);
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 535) /*
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 536) * No need of list_del_init(), the uwq on the stack
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 537) * will be freed shortly anyway.
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 538) */
2055da97389a6 (Ingo Molnar 2017-06-20 12:06:46 +0200 539) list_del(&uwq.wq.entry);
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 540) spin_unlock_irq(&ctx->fault_pending_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 541) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 542)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 543) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 544) * ctx may go away after this if the userfault pseudo fd is
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 545) * already released.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 546) */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 547) userfaultfd_ctx_put(ctx);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 548)
ba85c702e4b24 (Andrea Arcangeli 2015-09-04 15:46:41 -0700 549) out:
ba85c702e4b24 (Andrea Arcangeli 2015-09-04 15:46:41 -0700 550) return ret;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 551) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 552)
8c9e7bb7a41f2 (Andrea Arcangeli 2017-03-09 16:16:54 -0800 553) static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
8c9e7bb7a41f2 (Andrea Arcangeli 2017-03-09 16:16:54 -0800 554) struct userfaultfd_wait_queue *ewq)
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 555) {
0cbb4b4f4c44f (Andrea Arcangeli 2018-01-04 16:18:09 -0800 556) struct userfaultfd_ctx *release_new_ctx;
0cbb4b4f4c44f (Andrea Arcangeli 2018-01-04 16:18:09 -0800 557)
9a69a829f9b65 (Andrea Arcangeli 2017-03-09 16:16:52 -0800 558) if (WARN_ON_ONCE(current->flags & PF_EXITING))
9a69a829f9b65 (Andrea Arcangeli 2017-03-09 16:16:52 -0800 559) goto out;
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 560)
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 561) ewq->ctx = ctx;
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 562) init_waitqueue_entry(&ewq->wq, current);
0cbb4b4f4c44f (Andrea Arcangeli 2018-01-04 16:18:09 -0800 563) release_new_ctx = NULL;
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 564)
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 565) spin_lock_irq(&ctx->event_wqh.lock);
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 566) /*
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 567) * After the __add_wait_queue the uwq is visible to userland
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 568) * through poll/read().
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 569) */
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 570) __add_wait_queue(&ctx->event_wqh, &ewq->wq);
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 571) for (;;) {
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 572) set_current_state(TASK_KILLABLE);
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 573) if (ewq->msg.event == 0)
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 574) break;
6aa7de059173a (Mark Rutland 2017-10-23 14:07:29 -0700 575) if (READ_ONCE(ctx->released) ||
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 576) fatal_signal_pending(current)) {
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 577) /*
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 578) * &ewq->wq may be queued in fork_event, but
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 579) * __remove_wait_queue ignores the head
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 580) * parameter. It would be a problem if it
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 581) * didn't.
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 582) */
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 583) __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
7eb76d457fd75 (Mike Rapoport 2017-03-09 16:17:09 -0800 584) if (ewq->msg.event == UFFD_EVENT_FORK) {
7eb76d457fd75 (Mike Rapoport 2017-03-09 16:17:09 -0800 585) struct userfaultfd_ctx *new;
7eb76d457fd75 (Mike Rapoport 2017-03-09 16:17:09 -0800 586)
7eb76d457fd75 (Mike Rapoport 2017-03-09 16:17:09 -0800 587) new = (struct userfaultfd_ctx *)
7eb76d457fd75 (Mike Rapoport 2017-03-09 16:17:09 -0800 588) (unsigned long)
7eb76d457fd75 (Mike Rapoport 2017-03-09 16:17:09 -0800 589) ewq->msg.arg.reserved.reserved1;
0cbb4b4f4c44f (Andrea Arcangeli 2018-01-04 16:18:09 -0800 590) release_new_ctx = new;
7eb76d457fd75 (Mike Rapoport 2017-03-09 16:17:09 -0800 591) }
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 592) break;
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 593) }
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 594)
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 595) spin_unlock_irq(&ctx->event_wqh.lock);
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 596)
a9a08845e9acb (Linus Torvalds 2018-02-11 14:34:03 -0800 597) wake_up_poll(&ctx->fd_wqh, EPOLLIN);
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 598) schedule();
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 599)
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 600) spin_lock_irq(&ctx->event_wqh.lock);
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 601) }
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 602) __set_current_state(TASK_RUNNING);
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 603) spin_unlock_irq(&ctx->event_wqh.lock);
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 604)
0cbb4b4f4c44f (Andrea Arcangeli 2018-01-04 16:18:09 -0800 605) if (release_new_ctx) {
0cbb4b4f4c44f (Andrea Arcangeli 2018-01-04 16:18:09 -0800 606) struct vm_area_struct *vma;
0cbb4b4f4c44f (Andrea Arcangeli 2018-01-04 16:18:09 -0800 607) struct mm_struct *mm = release_new_ctx->mm;
0cbb4b4f4c44f (Andrea Arcangeli 2018-01-04 16:18:09 -0800 608)
0cbb4b4f4c44f (Andrea Arcangeli 2018-01-04 16:18:09 -0800 609) /* the various vma->vm_userfaultfd_ctx still points to it */
d8ed45c5dcd45 (Michel Lespinasse 2020-06-08 21:33:25 -0700 610) mmap_write_lock(mm);
0cbb4b4f4c44f (Andrea Arcangeli 2018-01-04 16:18:09 -0800 611) for (vma = mm->mmap; vma; vma = vma->vm_next)
31e810aa1033a (Mike Rapoport 2018-08-02 15:36:09 -0700 612) if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
0cbb4b4f4c44f (Andrea Arcangeli 2018-01-04 16:18:09 -0800 613) vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 614) vma->vm_flags &= ~__VM_UFFD_FLAGS;
31e810aa1033a (Mike Rapoport 2018-08-02 15:36:09 -0700 615) }
d8ed45c5dcd45 (Michel Lespinasse 2020-06-08 21:33:25 -0700 616) mmap_write_unlock(mm);
0cbb4b4f4c44f (Andrea Arcangeli 2018-01-04 16:18:09 -0800 617)
0cbb4b4f4c44f (Andrea Arcangeli 2018-01-04 16:18:09 -0800 618) userfaultfd_ctx_put(release_new_ctx);
0cbb4b4f4c44f (Andrea Arcangeli 2018-01-04 16:18:09 -0800 619) }
0cbb4b4f4c44f (Andrea Arcangeli 2018-01-04 16:18:09 -0800 620)
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 621) /*
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 622) * ctx may go away after this if the userfault pseudo fd is
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 623) * already released.
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 624) */
9a69a829f9b65 (Andrea Arcangeli 2017-03-09 16:16:52 -0800 625) out:
df2cc96e77011 (Mike Rapoport 2018-06-07 17:09:25 -0700 626) WRITE_ONCE(ctx->mmap_changing, false);
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 627) userfaultfd_ctx_put(ctx);
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 628) }
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 629)
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 630) static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 631) struct userfaultfd_wait_queue *ewq)
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 632) {
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 633) ewq->msg.event = 0;
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 634) wake_up_locked(&ctx->event_wqh);
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 635) __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 636) }
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 637)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 638) int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 639) {
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 640) struct userfaultfd_ctx *ctx = NULL, *octx;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 641) struct userfaultfd_fork_ctx *fctx;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 642)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 643) octx = vma->vm_userfaultfd_ctx.ctx;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 644) if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 645) vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 646) vma->vm_flags &= ~__VM_UFFD_FLAGS;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 647) return 0;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 648) }
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 649)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 650) list_for_each_entry(fctx, fcs, list)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 651) if (fctx->orig == octx) {
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 652) ctx = fctx->new;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 653) break;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 654) }
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 655)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 656) if (!ctx) {
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 657) fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 658) if (!fctx)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 659) return -ENOMEM;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 660)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 661) ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 662) if (!ctx) {
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 663) kfree(fctx);
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 664) return -ENOMEM;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 665) }
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 666)
ca880420665db (Eric Biggers 2018-12-28 00:34:43 -0800 667) refcount_set(&ctx->refcount, 1);
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 668) ctx->flags = octx->flags;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 669) ctx->state = UFFD_STATE_RUNNING;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 670) ctx->features = octx->features;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 671) ctx->released = false;
df2cc96e77011 (Mike Rapoport 2018-06-07 17:09:25 -0700 672) ctx->mmap_changing = false;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 673) ctx->mm = vma->vm_mm;
00bb31fa44acf (Mike Rapoport 2017-11-15 17:36:56 -0800 674) mmgrab(ctx->mm);
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 675)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 676) userfaultfd_ctx_get(octx);
df2cc96e77011 (Mike Rapoport 2018-06-07 17:09:25 -0700 677) WRITE_ONCE(octx->mmap_changing, true);
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 678) fctx->orig = octx;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 679) fctx->new = ctx;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 680) list_add_tail(&fctx->list, fcs);
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 681) }
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 682)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 683) vma->vm_userfaultfd_ctx.ctx = ctx;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 684) return 0;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 685) }
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 686)
8c9e7bb7a41f2 (Andrea Arcangeli 2017-03-09 16:16:54 -0800 687) static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 688) {
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 689) struct userfaultfd_ctx *ctx = fctx->orig;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 690) struct userfaultfd_wait_queue ewq;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 691)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 692) msg_init(&ewq.msg);
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 693)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 694) ewq.msg.event = UFFD_EVENT_FORK;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 695) ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 696)
8c9e7bb7a41f2 (Andrea Arcangeli 2017-03-09 16:16:54 -0800 697) userfaultfd_event_wait_completion(ctx, &ewq);
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 698) }
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 699)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 700) void dup_userfaultfd_complete(struct list_head *fcs)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 701) {
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 702) struct userfaultfd_fork_ctx *fctx, *n;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 703)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 704) list_for_each_entry_safe(fctx, n, fcs, list) {
8c9e7bb7a41f2 (Andrea Arcangeli 2017-03-09 16:16:54 -0800 705) dup_fctx(fctx);
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 706) list_del(&fctx->list);
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 707) kfree(fctx);
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 708) }
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 709) }
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 710)
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 711) void mremap_userfaultfd_prep(struct vm_area_struct *vma,
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 712) struct vm_userfaultfd_ctx *vm_ctx)
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 713) {
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 714) struct userfaultfd_ctx *ctx;
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 715)
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 716) ctx = vma->vm_userfaultfd_ctx.ctx;
3cfd22be0ad66 (Peter Xu 2018-12-28 00:38:47 -0800 717)
3cfd22be0ad66 (Peter Xu 2018-12-28 00:38:47 -0800 718) if (!ctx)
3cfd22be0ad66 (Peter Xu 2018-12-28 00:38:47 -0800 719) return;
3cfd22be0ad66 (Peter Xu 2018-12-28 00:38:47 -0800 720)
3cfd22be0ad66 (Peter Xu 2018-12-28 00:38:47 -0800 721) if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 722) vm_ctx->ctx = ctx;
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 723) userfaultfd_ctx_get(ctx);
df2cc96e77011 (Mike Rapoport 2018-06-07 17:09:25 -0700 724) WRITE_ONCE(ctx->mmap_changing, true);
3cfd22be0ad66 (Peter Xu 2018-12-28 00:38:47 -0800 725) } else {
3cfd22be0ad66 (Peter Xu 2018-12-28 00:38:47 -0800 726) /* Drop uffd context if remap feature not enabled */
3cfd22be0ad66 (Peter Xu 2018-12-28 00:38:47 -0800 727) vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 728) vma->vm_flags &= ~__VM_UFFD_FLAGS;
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 729) }
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 730) }
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 731)
90794bf19dc19 (Andrea Arcangeli 2017-02-22 15:42:37 -0800 732) void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 733) unsigned long from, unsigned long to,
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 734) unsigned long len)
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 735) {
90794bf19dc19 (Andrea Arcangeli 2017-02-22 15:42:37 -0800 736) struct userfaultfd_ctx *ctx = vm_ctx->ctx;
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 737) struct userfaultfd_wait_queue ewq;
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 738)
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 739) if (!ctx)
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 740) return;
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 741)
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 742) if (to & ~PAGE_MASK) {
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 743) userfaultfd_ctx_put(ctx);
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 744) return;
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 745) }
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 746)
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 747) msg_init(&ewq.msg);
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 748)
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 749) ewq.msg.event = UFFD_EVENT_REMAP;
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 750) ewq.msg.arg.remap.from = from;
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 751) ewq.msg.arg.remap.to = to;
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 752) ewq.msg.arg.remap.len = len;
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 753)
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 754) userfaultfd_event_wait_completion(ctx, &ewq);
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 755) }
72f87654c6969 (Pavel Emelyanov 2017-02-22 15:42:34 -0800 756)
70ccb92fdd90b (Andrea Arcangeli 2017-03-09 16:17:11 -0800 757) bool userfaultfd_remove(struct vm_area_struct *vma,
d811914d87576 (Mike Rapoport 2017-02-24 14:56:02 -0800 758) unsigned long start, unsigned long end)
05ce77249d506 (Pavel Emelyanov 2017-02-22 15:42:40 -0800 759) {
05ce77249d506 (Pavel Emelyanov 2017-02-22 15:42:40 -0800 760) struct mm_struct *mm = vma->vm_mm;
05ce77249d506 (Pavel Emelyanov 2017-02-22 15:42:40 -0800 761) struct userfaultfd_ctx *ctx;
05ce77249d506 (Pavel Emelyanov 2017-02-22 15:42:40 -0800 762) struct userfaultfd_wait_queue ewq;
05ce77249d506 (Pavel Emelyanov 2017-02-22 15:42:40 -0800 763)
05ce77249d506 (Pavel Emelyanov 2017-02-22 15:42:40 -0800 764) ctx = vma->vm_userfaultfd_ctx.ctx;
d811914d87576 (Mike Rapoport 2017-02-24 14:56:02 -0800 765) if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
70ccb92fdd90b (Andrea Arcangeli 2017-03-09 16:17:11 -0800 766) return true;
05ce77249d506 (Pavel Emelyanov 2017-02-22 15:42:40 -0800 767)
05ce77249d506 (Pavel Emelyanov 2017-02-22 15:42:40 -0800 768) userfaultfd_ctx_get(ctx);
df2cc96e77011 (Mike Rapoport 2018-06-07 17:09:25 -0700 769) WRITE_ONCE(ctx->mmap_changing, true);
d8ed45c5dcd45 (Michel Lespinasse 2020-06-08 21:33:25 -0700 770) mmap_read_unlock(mm);
05ce77249d506 (Pavel Emelyanov 2017-02-22 15:42:40 -0800 771)
05ce77249d506 (Pavel Emelyanov 2017-02-22 15:42:40 -0800 772) msg_init(&ewq.msg);
05ce77249d506 (Pavel Emelyanov 2017-02-22 15:42:40 -0800 773)
d811914d87576 (Mike Rapoport 2017-02-24 14:56:02 -0800 774) ewq.msg.event = UFFD_EVENT_REMOVE;
d811914d87576 (Mike Rapoport 2017-02-24 14:56:02 -0800 775) ewq.msg.arg.remove.start = start;
d811914d87576 (Mike Rapoport 2017-02-24 14:56:02 -0800 776) ewq.msg.arg.remove.end = end;
05ce77249d506 (Pavel Emelyanov 2017-02-22 15:42:40 -0800 777)
05ce77249d506 (Pavel Emelyanov 2017-02-22 15:42:40 -0800 778) userfaultfd_event_wait_completion(ctx, &ewq);
05ce77249d506 (Pavel Emelyanov 2017-02-22 15:42:40 -0800 779)
70ccb92fdd90b (Andrea Arcangeli 2017-03-09 16:17:11 -0800 780) return false;
05ce77249d506 (Pavel Emelyanov 2017-02-22 15:42:40 -0800 781) }
05ce77249d506 (Pavel Emelyanov 2017-02-22 15:42:40 -0800 782)
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 783) static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 784) unsigned long start, unsigned long end)
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 785) {
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 786) struct userfaultfd_unmap_ctx *unmap_ctx;
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 787)
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 788) list_for_each_entry(unmap_ctx, unmaps, list)
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 789) if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 790) unmap_ctx->end == end)
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 791) return true;
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 792)
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 793) return false;
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 794) }
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 795)
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 796) int userfaultfd_unmap_prep(struct vm_area_struct *vma,
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 797) unsigned long start, unsigned long end,
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 798) struct list_head *unmaps)
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 799) {
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 800) for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 801) struct userfaultfd_unmap_ctx *unmap_ctx;
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 802) struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 803)
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 804) if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 805) has_unmap_ctx(ctx, unmaps, start, end))
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 806) continue;
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 807)
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 808) unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 809) if (!unmap_ctx)
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 810) return -ENOMEM;
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 811)
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 812) userfaultfd_ctx_get(ctx);
df2cc96e77011 (Mike Rapoport 2018-06-07 17:09:25 -0700 813) WRITE_ONCE(ctx->mmap_changing, true);
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 814) unmap_ctx->ctx = ctx;
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 815) unmap_ctx->start = start;
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 816) unmap_ctx->end = end;
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 817) list_add_tail(&unmap_ctx->list, unmaps);
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 818) }
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 819)
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 820) return 0;
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 821) }
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 822)
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 823) void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 824) {
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 825) struct userfaultfd_unmap_ctx *ctx, *n;
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 826) struct userfaultfd_wait_queue ewq;
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 827)
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 828) list_for_each_entry_safe(ctx, n, uf, list) {
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 829) msg_init(&ewq.msg);
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 830)
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 831) ewq.msg.event = UFFD_EVENT_UNMAP;
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 832) ewq.msg.arg.remove.start = ctx->start;
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 833) ewq.msg.arg.remove.end = ctx->end;
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 834)
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 835) userfaultfd_event_wait_completion(ctx->ctx, &ewq);
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 836)
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 837) list_del(&ctx->list);
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 838) kfree(ctx);
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 839) }
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 840) }
897ab3e0c49e2 (Mike Rapoport 2017-02-24 14:58:22 -0800 841)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 842) static int userfaultfd_release(struct inode *inode, struct file *file)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 843) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 844) struct userfaultfd_ctx *ctx = file->private_data;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 845) struct mm_struct *mm = ctx->mm;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 846) struct vm_area_struct *vma, *prev;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 847) /* len == 0 means wake all */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 848) struct userfaultfd_wake_range range = { .len = 0, };
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 849) unsigned long new_flags;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 850)
6aa7de059173a (Mark Rutland 2017-10-23 14:07:29 -0700 851) WRITE_ONCE(ctx->released, true);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 852)
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 853) if (!mmget_not_zero(mm))
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 854) goto wakeup;
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 855)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 856) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 857) * Flush page faults out of all CPUs. NOTE: all page faults
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 858) * must be retried without returning VM_FAULT_SIGBUS if
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 859) * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
c1e8d7c6a7a68 (Michel Lespinasse 2020-06-08 21:33:54 -0700 860) * changes while handle_userfault released the mmap_lock. So
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 861) * it's critical that released is set to true (above), before
c1e8d7c6a7a68 (Michel Lespinasse 2020-06-08 21:33:54 -0700 862) * taking the mmap_lock for writing.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 863) */
d8ed45c5dcd45 (Michel Lespinasse 2020-06-08 21:33:25 -0700 864) mmap_write_lock(mm);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 865) prev = NULL;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 866) for (vma = mm->mmap; vma; vma = vma->vm_next) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 867) cond_resched();
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 868) BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 869) !!(vma->vm_flags & __VM_UFFD_FLAGS));
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 870) if (vma->vm_userfaultfd_ctx.ctx != ctx) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 871) prev = vma;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 872) continue;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 873) }
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 874) new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
4d45e75a9955a (Jann Horn 2020-10-15 20:13:00 -0700 875) prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
4d45e75a9955a (Jann Horn 2020-10-15 20:13:00 -0700 876) new_flags, vma->anon_vma,
4d45e75a9955a (Jann Horn 2020-10-15 20:13:00 -0700 877) vma->vm_file, vma->vm_pgoff,
4d45e75a9955a (Jann Horn 2020-10-15 20:13:00 -0700 878) vma_policy(vma),
4d45e75a9955a (Jann Horn 2020-10-15 20:13:00 -0700 879) NULL_VM_UFFD_CTX);
4d45e75a9955a (Jann Horn 2020-10-15 20:13:00 -0700 880) if (prev)
4d45e75a9955a (Jann Horn 2020-10-15 20:13:00 -0700 881) vma = prev;
4d45e75a9955a (Jann Horn 2020-10-15 20:13:00 -0700 882) else
4d45e75a9955a (Jann Horn 2020-10-15 20:13:00 -0700 883) prev = vma;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 884) vma->vm_flags = new_flags;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 885) vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 886) }
d8ed45c5dcd45 (Michel Lespinasse 2020-06-08 21:33:25 -0700 887) mmap_write_unlock(mm);
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 888) mmput(mm);
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 889) wakeup:
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 890) /*
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 891) * After no new page faults can wait on this fault_*wqh, flush
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 892) * the last page faults that may have been already waiting on
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 893) * the fault_*wqh.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 894) */
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 895) spin_lock_irq(&ctx->fault_pending_wqh.lock);
ac5be6b47e8bd (Andrea Arcangeli 2015-09-22 14:58:49 -0700 896) __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
c430d1e848ff1 (Matthew Wilcox 2018-08-21 21:56:30 -0700 897) __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 898) spin_unlock_irq(&ctx->fault_pending_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 899)
5a18b64e3f021 (Mike Rapoport 2017-08-02 13:32:24 -0700 900) /* Flush pending events that may still wait on event_wqh */
5a18b64e3f021 (Mike Rapoport 2017-08-02 13:32:24 -0700 901) wake_up_all(&ctx->event_wqh);
5a18b64e3f021 (Mike Rapoport 2017-08-02 13:32:24 -0700 902)
a9a08845e9acb (Linus Torvalds 2018-02-11 14:34:03 -0800 903) wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 904) userfaultfd_ctx_put(ctx);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 905) return 0;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 906) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 907)
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 908) /* fault_pending_wqh.lock must be hold by the caller */
6dcc27fd39437 (Pavel Emelyanov 2017-02-22 15:42:18 -0800 909) static inline struct userfaultfd_wait_queue *find_userfault_in(
6dcc27fd39437 (Pavel Emelyanov 2017-02-22 15:42:18 -0800 910) wait_queue_head_t *wqh)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 911) {
ac6424b981bce (Ingo Molnar 2017-06-20 12:06:13 +0200 912) wait_queue_entry_t *wq;
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 913) struct userfaultfd_wait_queue *uwq;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 914)
456a737896b25 (Lance Roy 2018-10-04 23:45:44 -0700 915) lockdep_assert_held(&wqh->lock);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 916)
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 917) uwq = NULL;
6dcc27fd39437 (Pavel Emelyanov 2017-02-22 15:42:18 -0800 918) if (!waitqueue_active(wqh))
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 919) goto out;
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 920) /* walk in reverse to provide FIFO behavior to read userfaults */
2055da97389a6 (Ingo Molnar 2017-06-20 12:06:46 +0200 921) wq = list_last_entry(&wqh->head, typeof(*wq), entry);
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 922) uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 923) out:
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 924) return uwq;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 925) }
6dcc27fd39437 (Pavel Emelyanov 2017-02-22 15:42:18 -0800 926)
6dcc27fd39437 (Pavel Emelyanov 2017-02-22 15:42:18 -0800 927) static inline struct userfaultfd_wait_queue *find_userfault(
6dcc27fd39437 (Pavel Emelyanov 2017-02-22 15:42:18 -0800 928) struct userfaultfd_ctx *ctx)
6dcc27fd39437 (Pavel Emelyanov 2017-02-22 15:42:18 -0800 929) {
6dcc27fd39437 (Pavel Emelyanov 2017-02-22 15:42:18 -0800 930) return find_userfault_in(&ctx->fault_pending_wqh);
6dcc27fd39437 (Pavel Emelyanov 2017-02-22 15:42:18 -0800 931) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 932)
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 933) static inline struct userfaultfd_wait_queue *find_userfault_evt(
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 934) struct userfaultfd_ctx *ctx)
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 935) {
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 936) return find_userfault_in(&ctx->event_wqh);
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 937) }
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 938)
076ccb76e1a6c (Al Viro 2017-07-03 01:02:18 -0400 939) static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 940) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 941) struct userfaultfd_ctx *ctx = file->private_data;
076ccb76e1a6c (Al Viro 2017-07-03 01:02:18 -0400 942) __poll_t ret;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 943)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 944) poll_wait(file, &ctx->fd_wqh, wait);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 945)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 946) switch (ctx->state) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 947) case UFFD_STATE_WAIT_API:
a9a08845e9acb (Linus Torvalds 2018-02-11 14:34:03 -0800 948) return EPOLLERR;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 949) case UFFD_STATE_RUNNING:
ba85c702e4b24 (Andrea Arcangeli 2015-09-04 15:46:41 -0700 950) /*
ba85c702e4b24 (Andrea Arcangeli 2015-09-04 15:46:41 -0700 951) * poll() never guarantees that read won't block.
ba85c702e4b24 (Andrea Arcangeli 2015-09-04 15:46:41 -0700 952) * userfaults can be waken before they're read().
ba85c702e4b24 (Andrea Arcangeli 2015-09-04 15:46:41 -0700 953) */
ba85c702e4b24 (Andrea Arcangeli 2015-09-04 15:46:41 -0700 954) if (unlikely(!(file->f_flags & O_NONBLOCK)))
a9a08845e9acb (Linus Torvalds 2018-02-11 14:34:03 -0800 955) return EPOLLERR;
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 956) /*
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 957) * lockless access to see if there are pending faults
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 958) * __pollwait last action is the add_wait_queue but
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 959) * the spin_unlock would allow the waitqueue_active to
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 960) * pass above the actual list_add inside
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 961) * add_wait_queue critical section. So use a full
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 962) * memory barrier to serialize the list_add write of
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 963) * add_wait_queue() with the waitqueue_active read
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 964) * below.
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 965) */
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 966) ret = 0;
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 967) smp_mb();
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 968) if (waitqueue_active(&ctx->fault_pending_wqh))
a9a08845e9acb (Linus Torvalds 2018-02-11 14:34:03 -0800 969) ret = EPOLLIN;
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 970) else if (waitqueue_active(&ctx->event_wqh))
a9a08845e9acb (Linus Torvalds 2018-02-11 14:34:03 -0800 971) ret = EPOLLIN;
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 972)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 973) return ret;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 974) default:
8474901a33d8a (Andrea Arcangeli 2017-02-22 15:42:12 -0800 975) WARN_ON_ONCE(1);
a9a08845e9acb (Linus Torvalds 2018-02-11 14:34:03 -0800 976) return EPOLLERR;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 977) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 978) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 979)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 980) static const struct file_operations userfaultfd_fops;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 981)
b537900f1598b (Daniel Colascione 2021-01-08 14:22:23 -0800 982) static int resolve_userfault_fork(struct userfaultfd_ctx *new,
b537900f1598b (Daniel Colascione 2021-01-08 14:22:23 -0800 983) struct inode *inode,
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 984) struct uffd_msg *msg)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 985) {
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 986) int fd;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 987)
b537900f1598b (Daniel Colascione 2021-01-08 14:22:23 -0800 988) fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
b537900f1598b (Daniel Colascione 2021-01-08 14:22:23 -0800 989) O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 990) if (fd < 0)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 991) return fd;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 992)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 993) msg->arg.reserved.reserved1 = 0;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 994) msg->arg.fork.ufd = fd;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 995) return 0;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 996) }
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 997)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 998) static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
b537900f1598b (Daniel Colascione 2021-01-08 14:22:23 -0800 999) struct uffd_msg *msg, struct inode *inode)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1000) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1001) ssize_t ret;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1002) DECLARE_WAITQUEUE(wait, current);
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1003) struct userfaultfd_wait_queue *uwq;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1004) /*
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1005) * Handling fork event requires sleeping operations, so
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1006) * we drop the event_wqh lock, then do these ops, then
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1007) * lock it back and wake up the waiter. While the lock is
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1008) * dropped the ewq may go away so we keep track of it
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1009) * carefully.
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1010) */
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1011) LIST_HEAD(fork_event);
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1012) struct userfaultfd_ctx *fork_nctx = NULL;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1013)
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1014) /* always take the fd_wqh lock before the fault_pending_wqh lock */
ae62c16e105a8 (Christoph Hellwig 2018-10-26 15:02:19 -0700 1015) spin_lock_irq(&ctx->fd_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1016) __add_wait_queue(&ctx->fd_wqh, &wait);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1017) for (;;) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1018) set_current_state(TASK_INTERRUPTIBLE);
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1019) spin_lock(&ctx->fault_pending_wqh.lock);
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1020) uwq = find_userfault(ctx);
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1021) if (uwq) {
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1022) /*
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1023) * Use a seqcount to repeat the lockless check
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1024) * in wake_userfault() to avoid missing
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1025) * wakeups because during the refile both
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1026) * waitqueue could become empty if this is the
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1027) * only userfault.
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1028) */
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1029) write_seqcount_begin(&ctx->refile_seq);
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1030)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1031) /*
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1032) * The fault_pending_wqh.lock prevents the uwq
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1033) * to disappear from under us.
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1034) *
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1035) * Refile this userfault from
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1036) * fault_pending_wqh to fault_wqh, it's not
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1037) * pending anymore after we read it.
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1038) *
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1039) * Use list_del() by hand (as
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1040) * userfaultfd_wake_function also uses
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1041) * list_del_init() by hand) to be sure nobody
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1042) * changes __remove_wait_queue() to use
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1043) * list_del_init() in turn breaking the
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1044) * !list_empty_careful() check in
2055da97389a6 (Ingo Molnar 2017-06-20 12:06:46 +0200 1045) * handle_userfault(). The uwq->wq.head list
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1046) * must never be empty at any time during the
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1047) * refile, or the waitqueue could disappear
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1048) * from under us. The "wait_queue_head_t"
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1049) * parameter of __remove_wait_queue() is unused
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1050) * anyway.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1051) */
2055da97389a6 (Ingo Molnar 2017-06-20 12:06:46 +0200 1052) list_del(&uwq->wq.entry);
c430d1e848ff1 (Matthew Wilcox 2018-08-21 21:56:30 -0700 1053) add_wait_queue(&ctx->fault_wqh, &uwq->wq);
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1054)
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1055) write_seqcount_end(&ctx->refile_seq);
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1056)
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 1057) /* careful to always initialize msg if ret == 0 */
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 1058) *msg = uwq->msg;
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1059) spin_unlock(&ctx->fault_pending_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1060) ret = 0;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1061) break;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1062) }
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1063) spin_unlock(&ctx->fault_pending_wqh.lock);
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 1064)
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 1065) spin_lock(&ctx->event_wqh.lock);
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 1066) uwq = find_userfault_evt(ctx);
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 1067) if (uwq) {
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 1068) *msg = uwq->msg;
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 1069)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1070) if (uwq->msg.event == UFFD_EVENT_FORK) {
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1071) fork_nctx = (struct userfaultfd_ctx *)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1072) (unsigned long)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1073) uwq->msg.arg.reserved.reserved1;
2055da97389a6 (Ingo Molnar 2017-06-20 12:06:46 +0200 1074) list_move(&uwq->wq.entry, &fork_event);
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1075) /*
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1076) * fork_nctx can be freed as soon as
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1077) * we drop the lock, unless we take a
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1078) * reference on it.
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1079) */
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1080) userfaultfd_ctx_get(fork_nctx);
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1081) spin_unlock(&ctx->event_wqh.lock);
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1082) ret = 0;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1083) break;
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1084) }
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1085)
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 1086) userfaultfd_event_complete(ctx, uwq);
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 1087) spin_unlock(&ctx->event_wqh.lock);
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 1088) ret = 0;
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 1089) break;
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 1090) }
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 1091) spin_unlock(&ctx->event_wqh.lock);
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 1092)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1093) if (signal_pending(current)) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1094) ret = -ERESTARTSYS;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1095) break;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1096) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1097) if (no_wait) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1098) ret = -EAGAIN;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1099) break;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1100) }
ae62c16e105a8 (Christoph Hellwig 2018-10-26 15:02:19 -0700 1101) spin_unlock_irq(&ctx->fd_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1102) schedule();
ae62c16e105a8 (Christoph Hellwig 2018-10-26 15:02:19 -0700 1103) spin_lock_irq(&ctx->fd_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1104) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1105) __remove_wait_queue(&ctx->fd_wqh, &wait);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1106) __set_current_state(TASK_RUNNING);
ae62c16e105a8 (Christoph Hellwig 2018-10-26 15:02:19 -0700 1107) spin_unlock_irq(&ctx->fd_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1108)
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1109) if (!ret && msg->event == UFFD_EVENT_FORK) {
b537900f1598b (Daniel Colascione 2021-01-08 14:22:23 -0800 1110) ret = resolve_userfault_fork(fork_nctx, inode, msg);
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 1111) spin_lock_irq(&ctx->event_wqh.lock);
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1112) if (!list_empty(&fork_event)) {
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1113) /*
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1114) * The fork thread didn't abort, so we can
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1115) * drop the temporary refcount.
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1116) */
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1117) userfaultfd_ctx_put(fork_nctx);
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1118)
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1119) uwq = list_first_entry(&fork_event,
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1120) typeof(*uwq),
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1121) wq.entry);
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1122) /*
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1123) * If fork_event list wasn't empty and in turn
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1124) * the event wasn't already released by fork
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1125) * (the event is allocated on fork kernel
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1126) * stack), put the event back to its place in
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1127) * the event_wq. fork_event head will be freed
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1128) * as soon as we return so the event cannot
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1129) * stay queued there no matter the current
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1130) * "ret" value.
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1131) */
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1132) list_del(&uwq->wq.entry);
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1133) __add_wait_queue(&ctx->event_wqh, &uwq->wq);
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1134)
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1135) /*
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1136) * Leave the event in the waitqueue and report
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1137) * error to userland if we failed to resolve
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1138) * the userfault fork.
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1139) */
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1140) if (likely(!ret))
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1141) userfaultfd_event_complete(ctx, uwq);
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1142) } else {
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1143) /*
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1144) * Here the fork thread aborted and the
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1145) * refcount from the fork thread on fork_nctx
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1146) * has already been released. We still hold
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1147) * the reference we took before releasing the
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1148) * lock above. If resolve_userfault_fork
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1149) * failed we've to drop it because the
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1150) * fork_nctx has to be freed in such case. If
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1151) * it succeeded we'll hold it because the new
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1152) * uffd references it.
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1153) */
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1154) if (ret)
384632e67e082 (Andrea Arcangeli 2017-10-03 16:15:38 -0700 1155) userfaultfd_ctx_put(fork_nctx);
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1156) }
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 1157) spin_unlock_irq(&ctx->event_wqh.lock);
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1158) }
893e26e61d04e (Pavel Emelyanov 2017-02-22 15:42:27 -0800 1159)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1160) return ret;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1161) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1162)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1163) static ssize_t userfaultfd_read(struct file *file, char __user *buf,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1164) size_t count, loff_t *ppos)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1165) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1166) struct userfaultfd_ctx *ctx = file->private_data;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1167) ssize_t _ret, ret = 0;
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 1168) struct uffd_msg msg;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1169) int no_wait = file->f_flags & O_NONBLOCK;
b537900f1598b (Daniel Colascione 2021-01-08 14:22:23 -0800 1170) struct inode *inode = file_inode(file);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1171)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1172) if (ctx->state == UFFD_STATE_WAIT_API)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1173) return -EINVAL;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1174)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1175) for (;;) {
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 1176) if (count < sizeof(msg))
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1177) return ret ? ret : -EINVAL;
b537900f1598b (Daniel Colascione 2021-01-08 14:22:23 -0800 1178) _ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1179) if (_ret < 0)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1180) return ret ? ret : _ret;
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 1181) if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1182) return ret ? ret : -EFAULT;
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 1183) ret += sizeof(msg);
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 1184) buf += sizeof(msg);
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 1185) count -= sizeof(msg);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1186) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1187) * Allow to read more than one fault at time but only
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1188) * block if waiting for the very first one.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1189) */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1190) no_wait = O_NONBLOCK;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1191) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1192) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1193)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1194) static void __wake_userfault(struct userfaultfd_ctx *ctx,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1195) struct userfaultfd_wake_range *range)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1196) {
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 1197) spin_lock_irq(&ctx->fault_pending_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1198) /* wake all in the range and autoremove */
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1199) if (waitqueue_active(&ctx->fault_pending_wqh))
ac5be6b47e8bd (Andrea Arcangeli 2015-09-22 14:58:49 -0700 1200) __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1201) range);
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 1202) if (waitqueue_active(&ctx->fault_wqh))
c430d1e848ff1 (Matthew Wilcox 2018-08-21 21:56:30 -0700 1203) __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 1204) spin_unlock_irq(&ctx->fault_pending_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1205) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1206)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1207) static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1208) struct userfaultfd_wake_range *range)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1209) {
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1210) unsigned seq;
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1211) bool need_wakeup;
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1212)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1213) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1214) * To be sure waitqueue_active() is not reordered by the CPU
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1215) * before the pagetable update, use an explicit SMP memory
3e4e28c5a8f01 (Michel Lespinasse 2020-06-08 21:33:51 -0700 1216) * barrier here. PT lock release or mmap_read_unlock(mm) still
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1217) * have release semantics that can allow the
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1218) * waitqueue_active() to be reordered before the pte update.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1219) */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1220) smp_mb();
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1221)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1222) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1223) * Use waitqueue_active because it's very frequent to
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1224) * change the address space atomically even if there are no
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1225) * userfaults yet. So we take the spinlock only when we're
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1226) * sure we've userfaults to wake.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1227) */
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1228) do {
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1229) seq = read_seqcount_begin(&ctx->refile_seq);
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1230) need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1231) waitqueue_active(&ctx->fault_wqh);
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1232) cond_resched();
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1233) } while (read_seqcount_retry(&ctx->refile_seq, seq));
2c5b7e1be74ff (Andrea Arcangeli 2015-09-04 15:47:23 -0700 1234) if (need_wakeup)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1235) __wake_userfault(ctx, range);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1236) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1237)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1238) static __always_inline int validate_range(struct mm_struct *mm,
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1239) __u64 start, __u64 len)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1240) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1241) __u64 task_size = mm->task_size;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1242)
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1243) if (start & ~PAGE_MASK)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1244) return -EINVAL;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1245) if (len & ~PAGE_MASK)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1246) return -EINVAL;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1247) if (!len)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1248) return -EINVAL;
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1249) if (start < mmap_min_addr)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1250) return -EINVAL;
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1251) if (start >= task_size)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1252) return -EINVAL;
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1253) if (len > task_size - start)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1254) return -EINVAL;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1255) return 0;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1256) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1257)
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1258) static inline bool vma_can_userfault(struct vm_area_struct *vma,
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1259) unsigned long vm_flags)
ba6907db6de17 (Mike Rapoport 2017-02-22 15:43:22 -0800 1260) {
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1261) /* FIXME: add WP support to hugetlbfs and shmem */
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1262) if (vm_flags & VM_UFFD_WP) {
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1263) if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma))
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1264) return false;
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1265) }
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1266)
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1267) if (vm_flags & VM_UFFD_MINOR) {
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1268) /* FIXME: Add minor fault interception for shmem. */
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1269) if (!is_vm_hugetlb_page(vma))
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1270) return false;
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1271) }
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1272)
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1273) return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1274) vma_is_shmem(vma);
ba6907db6de17 (Mike Rapoport 2017-02-22 15:43:22 -0800 1275) }
ba6907db6de17 (Mike Rapoport 2017-02-22 15:43:22 -0800 1276)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1277) static int userfaultfd_register(struct userfaultfd_ctx *ctx,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1278) unsigned long arg)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1279) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1280) struct mm_struct *mm = ctx->mm;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1281) struct vm_area_struct *vma, *prev, *cur;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1282) int ret;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1283) struct uffdio_register uffdio_register;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1284) struct uffdio_register __user *user_uffdio_register;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1285) unsigned long vm_flags, new_flags;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1286) bool found;
ce53e8e6f2cb0 (Mike Rapoport 2017-09-06 16:23:12 -0700 1287) bool basic_ioctls;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1288) unsigned long start, end, vma_end;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1289)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1290) user_uffdio_register = (struct uffdio_register __user *) arg;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1291)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1292) ret = -EFAULT;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1293) if (copy_from_user(&uffdio_register, user_uffdio_register,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1294) sizeof(uffdio_register)-sizeof(__u64)))
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1295) goto out;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1296)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1297) ret = -EINVAL;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1298) if (!uffdio_register.mode)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1299) goto out;
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1300) if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1301) goto out;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1302) vm_flags = 0;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1303) if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1304) vm_flags |= VM_UFFD_MISSING;
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1305) if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1306) vm_flags |= VM_UFFD_WP;
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1307) if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1308) #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1309) goto out;
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1310) #endif
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1311) vm_flags |= VM_UFFD_MINOR;
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1312) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1313)
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1314) ret = validate_range(mm, uffdio_register.range.start,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1315) uffdio_register.range.len);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1316) if (ret)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1317) goto out;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1318)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1319) start = uffdio_register.range.start;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1320) end = start + uffdio_register.range.len;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1321)
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 1322) ret = -ENOMEM;
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 1323) if (!mmget_not_zero(mm))
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 1324) goto out;
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 1325)
d8ed45c5dcd45 (Michel Lespinasse 2020-06-08 21:33:25 -0700 1326) mmap_write_lock(mm);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1327) vma = find_vma_prev(mm, start, &prev);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1328) if (!vma)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1329) goto out_unlock;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1330)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1331) /* check that there's at least one vma in the range */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1332) ret = -EINVAL;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1333) if (vma->vm_start >= end)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1334) goto out_unlock;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1335)
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1336) /*
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1337) * If the first vma contains huge pages, make sure start address
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1338) * is aligned to huge page size.
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1339) */
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1340) if (is_vm_hugetlb_page(vma)) {
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1341) unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1342)
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1343) if (start & (vma_hpagesize - 1))
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1344) goto out_unlock;
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1345) }
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1346)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1347) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1348) * Search for not compatible vmas.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1349) */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1350) found = false;
ce53e8e6f2cb0 (Mike Rapoport 2017-09-06 16:23:12 -0700 1351) basic_ioctls = false;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1352) for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1353) cond_resched();
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1354)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1355) BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1356) !!(cur->vm_flags & __VM_UFFD_FLAGS));
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1357)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1358) /* check not compatible vmas */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1359) ret = -EINVAL;
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1360) if (!vma_can_userfault(cur, vm_flags))
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1361) goto out_unlock;
29ec90660d68b (Andrea Arcangeli 2018-11-30 14:09:32 -0800 1362)
29ec90660d68b (Andrea Arcangeli 2018-11-30 14:09:32 -0800 1363) /*
29ec90660d68b (Andrea Arcangeli 2018-11-30 14:09:32 -0800 1364) * UFFDIO_COPY will fill file holes even without
29ec90660d68b (Andrea Arcangeli 2018-11-30 14:09:32 -0800 1365) * PROT_WRITE. This check enforces that if this is a
29ec90660d68b (Andrea Arcangeli 2018-11-30 14:09:32 -0800 1366) * MAP_SHARED, the process has write permission to the backing
29ec90660d68b (Andrea Arcangeli 2018-11-30 14:09:32 -0800 1367) * file. If VM_MAYWRITE is set it also enforces that on a
29ec90660d68b (Andrea Arcangeli 2018-11-30 14:09:32 -0800 1368) * MAP_SHARED vma: there is no F_WRITE_SEAL and no further
29ec90660d68b (Andrea Arcangeli 2018-11-30 14:09:32 -0800 1369) * F_WRITE_SEAL can be taken until the vma is destroyed.
29ec90660d68b (Andrea Arcangeli 2018-11-30 14:09:32 -0800 1370) */
29ec90660d68b (Andrea Arcangeli 2018-11-30 14:09:32 -0800 1371) ret = -EPERM;
29ec90660d68b (Andrea Arcangeli 2018-11-30 14:09:32 -0800 1372) if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
29ec90660d68b (Andrea Arcangeli 2018-11-30 14:09:32 -0800 1373) goto out_unlock;
29ec90660d68b (Andrea Arcangeli 2018-11-30 14:09:32 -0800 1374)
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1375) /*
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1376) * If this vma contains ending address, and huge pages
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1377) * check alignment.
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1378) */
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1379) if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1380) end > cur->vm_start) {
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1381) unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1382)
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1383) ret = -EINVAL;
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1384)
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1385) if (end & (vma_hpagesize - 1))
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1386) goto out_unlock;
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1387) }
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1388) if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1389) goto out_unlock;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1390)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1391) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1392) * Check that this vma isn't already owned by a
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1393) * different userfaultfd. We can't allow more than one
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1394) * userfaultfd to own a single vma simultaneously or we
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1395) * wouldn't know which one to deliver the userfaults to.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1396) */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1397) ret = -EBUSY;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1398) if (cur->vm_userfaultfd_ctx.ctx &&
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1399) cur->vm_userfaultfd_ctx.ctx != ctx)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1400) goto out_unlock;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1401)
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1402) /*
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1403) * Note vmas containing huge pages
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1404) */
ce53e8e6f2cb0 (Mike Rapoport 2017-09-06 16:23:12 -0700 1405) if (is_vm_hugetlb_page(cur))
ce53e8e6f2cb0 (Mike Rapoport 2017-09-06 16:23:12 -0700 1406) basic_ioctls = true;
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1407)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1408) found = true;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1409) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1410) BUG_ON(!found);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1411)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1412) if (vma->vm_start < start)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1413) prev = vma;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1414)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1415) ret = 0;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1416) do {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1417) cond_resched();
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1418)
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1419) BUG_ON(!vma_can_userfault(vma, vm_flags));
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1420) BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1421) vma->vm_userfaultfd_ctx.ctx != ctx);
29ec90660d68b (Andrea Arcangeli 2018-11-30 14:09:32 -0800 1422) WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1423)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1424) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1425) * Nothing to do: this vma is already registered into this
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1426) * userfaultfd and with the right tracking mode too.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1427) */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1428) if (vma->vm_userfaultfd_ctx.ctx == ctx &&
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1429) (vma->vm_flags & vm_flags) == vm_flags)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1430) goto skip;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1431)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1432) if (vma->vm_start > start)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1433) start = vma->vm_start;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1434) vma_end = min(end, vma->vm_end);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1435)
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1436) new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1437) prev = vma_merge(mm, prev, start, vma_end, new_flags,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1438) vma->anon_vma, vma->vm_file, vma->vm_pgoff,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1439) vma_policy(vma),
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1440) ((struct vm_userfaultfd_ctx){ ctx }));
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1441) if (prev) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1442) vma = prev;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1443) goto next;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1444) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1445) if (vma->vm_start < start) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1446) ret = split_vma(mm, vma, start, 1);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1447) if (ret)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1448) break;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1449) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1450) if (vma->vm_end > end) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1451) ret = split_vma(mm, vma, end, 0);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1452) if (ret)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1453) break;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1454) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1455) next:
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1456) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1457) * In the vma_merge() successful mprotect-like case 8:
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1458) * the next vma was merged into the current one and
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1459) * the current one has not been updated yet.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1460) */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1461) vma->vm_flags = new_flags;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1462) vma->vm_userfaultfd_ctx.ctx = ctx;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1463)
6dfeaff93be1a (Peter Xu 2021-05-04 18:33:13 -0700 1464) if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
6dfeaff93be1a (Peter Xu 2021-05-04 18:33:13 -0700 1465) hugetlb_unshare_all_pmds(vma);
6dfeaff93be1a (Peter Xu 2021-05-04 18:33:13 -0700 1466)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1467) skip:
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1468) prev = vma;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1469) start = vma->vm_end;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1470) vma = vma->vm_next;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1471) } while (vma && vma->vm_start < end);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1472) out_unlock:
d8ed45c5dcd45 (Michel Lespinasse 2020-06-08 21:33:25 -0700 1473) mmap_write_unlock(mm);
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 1474) mmput(mm);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1475) if (!ret) {
14819305e09fe (Peter Xu 2020-04-06 20:06:29 -0700 1476) __u64 ioctls_out;
14819305e09fe (Peter Xu 2020-04-06 20:06:29 -0700 1477)
14819305e09fe (Peter Xu 2020-04-06 20:06:29 -0700 1478) ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
14819305e09fe (Peter Xu 2020-04-06 20:06:29 -0700 1479) UFFD_API_RANGE_IOCTLS;
14819305e09fe (Peter Xu 2020-04-06 20:06:29 -0700 1480)
14819305e09fe (Peter Xu 2020-04-06 20:06:29 -0700 1481) /*
14819305e09fe (Peter Xu 2020-04-06 20:06:29 -0700 1482) * Declare the WP ioctl only if the WP mode is
14819305e09fe (Peter Xu 2020-04-06 20:06:29 -0700 1483) * specified and all checks passed with the range
14819305e09fe (Peter Xu 2020-04-06 20:06:29 -0700 1484) */
14819305e09fe (Peter Xu 2020-04-06 20:06:29 -0700 1485) if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
14819305e09fe (Peter Xu 2020-04-06 20:06:29 -0700 1486) ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
14819305e09fe (Peter Xu 2020-04-06 20:06:29 -0700 1487)
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1488) /* CONTINUE ioctl is only supported for MINOR ranges. */
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1489) if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1490) ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1491)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1492) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1493) * Now that we scanned all vmas we can already tell
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1494) * userland which ioctls methods are guaranteed to
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1495) * succeed on this range.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1496) */
14819305e09fe (Peter Xu 2020-04-06 20:06:29 -0700 1497) if (put_user(ioctls_out, &user_uffdio_register->ioctls))
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1498) ret = -EFAULT;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1499) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1500) out:
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1501) return ret;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1502) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1503)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1504) static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1505) unsigned long arg)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1506) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1507) struct mm_struct *mm = ctx->mm;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1508) struct vm_area_struct *vma, *prev, *cur;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1509) int ret;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1510) struct uffdio_range uffdio_unregister;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1511) unsigned long new_flags;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1512) bool found;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1513) unsigned long start, end, vma_end;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1514) const void __user *buf = (void __user *)arg;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1515)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1516) ret = -EFAULT;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1517) if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1518) goto out;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1519)
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1520) ret = validate_range(mm, uffdio_unregister.start,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1521) uffdio_unregister.len);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1522) if (ret)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1523) goto out;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1524)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1525) start = uffdio_unregister.start;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1526) end = start + uffdio_unregister.len;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1527)
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 1528) ret = -ENOMEM;
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 1529) if (!mmget_not_zero(mm))
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 1530) goto out;
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 1531)
d8ed45c5dcd45 (Michel Lespinasse 2020-06-08 21:33:25 -0700 1532) mmap_write_lock(mm);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1533) vma = find_vma_prev(mm, start, &prev);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1534) if (!vma)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1535) goto out_unlock;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1536)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1537) /* check that there's at least one vma in the range */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1538) ret = -EINVAL;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1539) if (vma->vm_start >= end)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1540) goto out_unlock;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1541)
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1542) /*
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1543) * If the first vma contains huge pages, make sure start address
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1544) * is aligned to huge page size.
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1545) */
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1546) if (is_vm_hugetlb_page(vma)) {
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1547) unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1548)
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1549) if (start & (vma_hpagesize - 1))
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1550) goto out_unlock;
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1551) }
cab350afcbc9c (Mike Kravetz 2017-02-22 15:43:04 -0800 1552)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1553) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1554) * Search for not compatible vmas.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1555) */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1556) found = false;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1557) ret = -EINVAL;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1558) for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1559) cond_resched();
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1560)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1561) BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1562) !!(cur->vm_flags & __VM_UFFD_FLAGS));
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1563)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1564) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1565) * Check not compatible vmas, not strictly required
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1566) * here as not compatible vmas cannot have an
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1567) * userfaultfd_ctx registered on them, but this
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1568) * provides for more strict behavior to notice
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1569) * unregistration errors.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1570) */
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1571) if (!vma_can_userfault(cur, cur->vm_flags))
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1572) goto out_unlock;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1573)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1574) found = true;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1575) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1576) BUG_ON(!found);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1577)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1578) if (vma->vm_start < start)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1579) prev = vma;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1580)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1581) ret = 0;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1582) do {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1583) cond_resched();
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1584)
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1585) BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1586)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1587) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1588) * Nothing to do: this vma is already registered into this
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1589) * userfaultfd and with the right tracking mode too.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1590) */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1591) if (!vma->vm_userfaultfd_ctx.ctx)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1592) goto skip;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1593)
01e881f5a1fca (Andrea Arcangeli 2018-12-14 14:17:17 -0800 1594) WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
01e881f5a1fca (Andrea Arcangeli 2018-12-14 14:17:17 -0800 1595)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1596) if (vma->vm_start > start)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1597) start = vma->vm_start;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1598) vma_end = min(end, vma->vm_end);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1599)
09fa5296a40d0 (Andrea Arcangeli 2017-02-22 15:42:46 -0800 1600) if (userfaultfd_missing(vma)) {
09fa5296a40d0 (Andrea Arcangeli 2017-02-22 15:42:46 -0800 1601) /*
09fa5296a40d0 (Andrea Arcangeli 2017-02-22 15:42:46 -0800 1602) * Wake any concurrent pending userfault while
09fa5296a40d0 (Andrea Arcangeli 2017-02-22 15:42:46 -0800 1603) * we unregister, so they will not hang
09fa5296a40d0 (Andrea Arcangeli 2017-02-22 15:42:46 -0800 1604) * permanently and it avoids userland to call
09fa5296a40d0 (Andrea Arcangeli 2017-02-22 15:42:46 -0800 1605) * UFFDIO_WAKE explicitly.
09fa5296a40d0 (Andrea Arcangeli 2017-02-22 15:42:46 -0800 1606) */
09fa5296a40d0 (Andrea Arcangeli 2017-02-22 15:42:46 -0800 1607) struct userfaultfd_wake_range range;
09fa5296a40d0 (Andrea Arcangeli 2017-02-22 15:42:46 -0800 1608) range.start = start;
09fa5296a40d0 (Andrea Arcangeli 2017-02-22 15:42:46 -0800 1609) range.len = vma_end - start;
09fa5296a40d0 (Andrea Arcangeli 2017-02-22 15:42:46 -0800 1610) wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
09fa5296a40d0 (Andrea Arcangeli 2017-02-22 15:42:46 -0800 1611) }
09fa5296a40d0 (Andrea Arcangeli 2017-02-22 15:42:46 -0800 1612)
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1613) new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1614) prev = vma_merge(mm, prev, start, vma_end, new_flags,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1615) vma->anon_vma, vma->vm_file, vma->vm_pgoff,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1616) vma_policy(vma),
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1617) NULL_VM_UFFD_CTX);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1618) if (prev) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1619) vma = prev;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1620) goto next;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1621) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1622) if (vma->vm_start < start) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1623) ret = split_vma(mm, vma, start, 1);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1624) if (ret)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1625) break;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1626) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1627) if (vma->vm_end > end) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1628) ret = split_vma(mm, vma, end, 0);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1629) if (ret)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1630) break;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1631) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1632) next:
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1633) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1634) * In the vma_merge() successful mprotect-like case 8:
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1635) * the next vma was merged into the current one and
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1636) * the current one has not been updated yet.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1637) */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1638) vma->vm_flags = new_flags;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1639) vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1640)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1641) skip:
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1642) prev = vma;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1643) start = vma->vm_end;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1644) vma = vma->vm_next;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1645) } while (vma && vma->vm_start < end);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1646) out_unlock:
d8ed45c5dcd45 (Michel Lespinasse 2020-06-08 21:33:25 -0700 1647) mmap_write_unlock(mm);
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 1648) mmput(mm);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1649) out:
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1650) return ret;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1651) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1652)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1653) /*
ba85c702e4b24 (Andrea Arcangeli 2015-09-04 15:46:41 -0700 1654) * userfaultfd_wake may be used in combination with the
ba85c702e4b24 (Andrea Arcangeli 2015-09-04 15:46:41 -0700 1655) * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1656) */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1657) static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1658) unsigned long arg)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1659) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1660) int ret;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1661) struct uffdio_range uffdio_wake;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1662) struct userfaultfd_wake_range range;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1663) const void __user *buf = (void __user *)arg;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1664)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1665) ret = -EFAULT;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1666) if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1667) goto out;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1668)
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1669) ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1670) if (ret)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1671) goto out;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1672)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1673) range.start = uffdio_wake.start;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1674) range.len = uffdio_wake.len;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1675)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1676) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1677) * len == 0 means wake all and we don't want to wake all here,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1678) * so check it again to be sure.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1679) */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1680) VM_BUG_ON(!range.len);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1681)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1682) wake_userfault(ctx, &range);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1683) ret = 0;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1684)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1685) out:
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1686) return ret;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1687) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1688)
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1689) static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1690) unsigned long arg)
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1691) {
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1692) __s64 ret;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1693) struct uffdio_copy uffdio_copy;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1694) struct uffdio_copy __user *user_uffdio_copy;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1695) struct userfaultfd_wake_range range;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1696)
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1697) user_uffdio_copy = (struct uffdio_copy __user *) arg;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1698)
df2cc96e77011 (Mike Rapoport 2018-06-07 17:09:25 -0700 1699) ret = -EAGAIN;
df2cc96e77011 (Mike Rapoport 2018-06-07 17:09:25 -0700 1700) if (READ_ONCE(ctx->mmap_changing))
df2cc96e77011 (Mike Rapoport 2018-06-07 17:09:25 -0700 1701) goto out;
df2cc96e77011 (Mike Rapoport 2018-06-07 17:09:25 -0700 1702)
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1703) ret = -EFAULT;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1704) if (copy_from_user(&uffdio_copy, user_uffdio_copy,
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1705) /* don't copy "copy" last field */
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1706) sizeof(uffdio_copy)-sizeof(__s64)))
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1707) goto out;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1708)
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1709) ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1710) if (ret)
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1711) goto out;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1712) /*
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1713) * double check for wraparound just in case. copy_from_user()
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1714) * will later check uffdio_copy.src + uffdio_copy.len to fit
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1715) * in the userland range.
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1716) */
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1717) ret = -EINVAL;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1718) if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1719) goto out;
72981e0e7b609 (Andrea Arcangeli 2020-04-06 20:05:41 -0700 1720) if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1721) goto out;
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 1722) if (mmget_not_zero(ctx->mm)) {
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 1723) ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
72981e0e7b609 (Andrea Arcangeli 2020-04-06 20:05:41 -0700 1724) uffdio_copy.len, &ctx->mmap_changing,
72981e0e7b609 (Andrea Arcangeli 2020-04-06 20:05:41 -0700 1725) uffdio_copy.mode);
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 1726) mmput(ctx->mm);
96333187ab162 (Mike Rapoport 2017-02-24 14:58:31 -0800 1727) } else {
e86b298bebf7e (Mike Rapoport 2017-08-10 15:24:32 -0700 1728) return -ESRCH;
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 1729) }
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1730) if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1731) return -EFAULT;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1732) if (ret < 0)
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1733) goto out;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1734) BUG_ON(!ret);
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1735) /* len == 0 would wake all */
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1736) range.len = ret;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1737) if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1738) range.start = uffdio_copy.dst;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1739) wake_userfault(ctx, &range);
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1740) }
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1741) ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1742) out:
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1743) return ret;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1744) }
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1745)
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1746) static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1747) unsigned long arg)
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1748) {
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1749) __s64 ret;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1750) struct uffdio_zeropage uffdio_zeropage;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1751) struct uffdio_zeropage __user *user_uffdio_zeropage;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1752) struct userfaultfd_wake_range range;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1753)
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1754) user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1755)
df2cc96e77011 (Mike Rapoport 2018-06-07 17:09:25 -0700 1756) ret = -EAGAIN;
df2cc96e77011 (Mike Rapoport 2018-06-07 17:09:25 -0700 1757) if (READ_ONCE(ctx->mmap_changing))
df2cc96e77011 (Mike Rapoport 2018-06-07 17:09:25 -0700 1758) goto out;
df2cc96e77011 (Mike Rapoport 2018-06-07 17:09:25 -0700 1759)
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1760) ret = -EFAULT;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1761) if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1762) /* don't copy "zeropage" last field */
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1763) sizeof(uffdio_zeropage)-sizeof(__s64)))
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1764) goto out;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1765)
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1766) ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1767) uffdio_zeropage.range.len);
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1768) if (ret)
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1769) goto out;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1770) ret = -EINVAL;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1771) if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1772) goto out;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1773)
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 1774) if (mmget_not_zero(ctx->mm)) {
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 1775) ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
df2cc96e77011 (Mike Rapoport 2018-06-07 17:09:25 -0700 1776) uffdio_zeropage.range.len,
df2cc96e77011 (Mike Rapoport 2018-06-07 17:09:25 -0700 1777) &ctx->mmap_changing);
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 1778) mmput(ctx->mm);
9d95aa4bada24 (Mike Rapoport 2017-08-02 13:32:15 -0700 1779) } else {
e86b298bebf7e (Mike Rapoport 2017-08-10 15:24:32 -0700 1780) return -ESRCH;
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 1781) }
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1782) if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1783) return -EFAULT;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1784) if (ret < 0)
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1785) goto out;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1786) /* len == 0 would wake all */
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1787) BUG_ON(!ret);
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1788) range.len = ret;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1789) if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1790) range.start = uffdio_zeropage.range.start;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1791) wake_userfault(ctx, &range);
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1792) }
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1793) ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1794) out:
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1795) return ret;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1796) }
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1797)
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1798) static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1799) unsigned long arg)
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1800) {
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1801) int ret;
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1802) struct uffdio_writeprotect uffdio_wp;
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1803) struct uffdio_writeprotect __user *user_uffdio_wp;
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1804) struct userfaultfd_wake_range range;
23080e2783ba4 (Peter Xu 2020-04-06 20:06:20 -0700 1805) bool mode_wp, mode_dontwake;
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1806)
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1807) if (READ_ONCE(ctx->mmap_changing))
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1808) return -EAGAIN;
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1809)
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1810) user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1811)
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1812) if (copy_from_user(&uffdio_wp, user_uffdio_wp,
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1813) sizeof(struct uffdio_writeprotect)))
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1814) return -EFAULT;
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1815)
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1816) ret = validate_range(ctx->mm, uffdio_wp.range.start,
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1817) uffdio_wp.range.len);
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1818) if (ret)
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1819) return ret;
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1820)
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1821) if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1822) UFFDIO_WRITEPROTECT_MODE_WP))
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1823) return -EINVAL;
23080e2783ba4 (Peter Xu 2020-04-06 20:06:20 -0700 1824)
23080e2783ba4 (Peter Xu 2020-04-06 20:06:20 -0700 1825) mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
23080e2783ba4 (Peter Xu 2020-04-06 20:06:20 -0700 1826) mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
23080e2783ba4 (Peter Xu 2020-04-06 20:06:20 -0700 1827)
23080e2783ba4 (Peter Xu 2020-04-06 20:06:20 -0700 1828) if (mode_wp && mode_dontwake)
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1829) return -EINVAL;
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1830)
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1831) ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
23080e2783ba4 (Peter Xu 2020-04-06 20:06:20 -0700 1832) uffdio_wp.range.len, mode_wp,
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1833) &ctx->mmap_changing);
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1834) if (ret)
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1835) return ret;
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1836)
23080e2783ba4 (Peter Xu 2020-04-06 20:06:20 -0700 1837) if (!mode_wp && !mode_dontwake) {
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1838) range.start = uffdio_wp.range.start;
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1839) range.len = uffdio_wp.range.len;
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1840) wake_userfault(ctx, &range);
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1841) }
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1842) return ret;
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1843) }
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1844)
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1845) static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1846) {
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1847) __s64 ret;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1848) struct uffdio_continue uffdio_continue;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1849) struct uffdio_continue __user *user_uffdio_continue;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1850) struct userfaultfd_wake_range range;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1851)
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1852) user_uffdio_continue = (struct uffdio_continue __user *)arg;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1853)
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1854) ret = -EAGAIN;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1855) if (READ_ONCE(ctx->mmap_changing))
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1856) goto out;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1857)
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1858) ret = -EFAULT;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1859) if (copy_from_user(&uffdio_continue, user_uffdio_continue,
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1860) /* don't copy the output fields */
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1861) sizeof(uffdio_continue) - (sizeof(__s64))))
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1862) goto out;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1863)
60e7f63de3372 (Peter Collingbourne 2021-07-23 15:50:01 -0700 1864) ret = validate_range(ctx->mm, uffdio_continue.range.start,
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1865) uffdio_continue.range.len);
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1866) if (ret)
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1867) goto out;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1868)
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1869) ret = -EINVAL;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1870) /* double check for wraparound just in case. */
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1871) if (uffdio_continue.range.start + uffdio_continue.range.len <=
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1872) uffdio_continue.range.start) {
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1873) goto out;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1874) }
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1875) if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1876) goto out;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1877)
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1878) if (mmget_not_zero(ctx->mm)) {
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1879) ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1880) uffdio_continue.range.len,
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1881) &ctx->mmap_changing);
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1882) mmput(ctx->mm);
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1883) } else {
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1884) return -ESRCH;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1885) }
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1886)
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1887) if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1888) return -EFAULT;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1889) if (ret < 0)
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1890) goto out;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1891)
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1892) /* len == 0 would wake all */
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1893) BUG_ON(!ret);
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1894) range.len = ret;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1895) if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1896) range.start = uffdio_continue.range.start;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1897) wake_userfault(ctx, &range);
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1898) }
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1899) ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1900)
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1901) out:
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1902) return ret;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1903) }
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1904)
/*
 * Convert the 64-bit feature mask supplied by userland into the value kept
 * in the userfaultfd context.
 */
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
	/*
	 * The user-visible bits and the context bits currently coincide, so
	 * the conversion is a plain narrowing to unsigned int.
	 */
	unsigned int ctx_features = (unsigned int)user_features;

	return ctx_features;
}
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 1912)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1913) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1914) * userland asks for a certain API version and we return which bits
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1915) * and ioctl commands are implemented in this kernel for such API
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1916) * version or -EINVAL if unknown.
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1917) */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1918) static int userfaultfd_api(struct userfaultfd_ctx *ctx,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1919) unsigned long arg)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1920) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1921) struct uffdio_api uffdio_api;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1922) void __user *buf = (void __user *)arg;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1923) int ret;
656031445d5a8 (Andrea Arcangeli 2017-02-22 15:42:24 -0800 1924) __u64 features;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1925)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1926) ret = -EINVAL;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1927) if (ctx->state != UFFD_STATE_WAIT_API)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1928) goto out;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1929) ret = -EFAULT;
a9b85f9415fd9 (Andrea Arcangeli 2015-09-04 15:46:37 -0700 1930) if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1931) goto out;
656031445d5a8 (Andrea Arcangeli 2017-02-22 15:42:24 -0800 1932) features = uffdio_api.features;
3c1c24d91ffd5 (Mike Rapoport 2019-11-30 17:58:01 -0800 1933) ret = -EINVAL;
3c1c24d91ffd5 (Mike Rapoport 2019-11-30 17:58:01 -0800 1934) if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
3c1c24d91ffd5 (Mike Rapoport 2019-11-30 17:58:01 -0800 1935) goto err_out;
3c1c24d91ffd5 (Mike Rapoport 2019-11-30 17:58:01 -0800 1936) ret = -EPERM;
3c1c24d91ffd5 (Mike Rapoport 2019-11-30 17:58:01 -0800 1937) if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
3c1c24d91ffd5 (Mike Rapoport 2019-11-30 17:58:01 -0800 1938) goto err_out;
656031445d5a8 (Andrea Arcangeli 2017-02-22 15:42:24 -0800 1939) /* report all available features and ioctls to userland */
656031445d5a8 (Andrea Arcangeli 2017-02-22 15:42:24 -0800 1940) uffdio_api.features = UFFD_API_FEATURES;
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1941) #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1942) uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS;
7677f7fd8be76 (Axel Rasmussen 2021-05-04 18:35:36 -0700 1943) #endif
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1944) uffdio_api.ioctls = UFFD_API_IOCTLS;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1945) ret = -EFAULT;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1946) if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1947) goto out;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1948) ctx->state = UFFD_STATE_RUNNING;
656031445d5a8 (Andrea Arcangeli 2017-02-22 15:42:24 -0800 1949) /* only enable the requested features for this uffd context */
656031445d5a8 (Andrea Arcangeli 2017-02-22 15:42:24 -0800 1950) ctx->features = uffd_ctx_features(features);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1951) ret = 0;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1952) out:
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1953) return ret;
3c1c24d91ffd5 (Mike Rapoport 2019-11-30 17:58:01 -0800 1954) err_out:
3c1c24d91ffd5 (Mike Rapoport 2019-11-30 17:58:01 -0800 1955) memset(&uffdio_api, 0, sizeof(uffdio_api));
3c1c24d91ffd5 (Mike Rapoport 2019-11-30 17:58:01 -0800 1956) if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
3c1c24d91ffd5 (Mike Rapoport 2019-11-30 17:58:01 -0800 1957) ret = -EFAULT;
3c1c24d91ffd5 (Mike Rapoport 2019-11-30 17:58:01 -0800 1958) goto out;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1959) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1960)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1961) static long userfaultfd_ioctl(struct file *file, unsigned cmd,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1962) unsigned long arg)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1963) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1964) int ret = -EINVAL;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1965) struct userfaultfd_ctx *ctx = file->private_data;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1966)
e6485a47b758c (Andrea Arcangeli 2015-09-04 15:47:15 -0700 1967) if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
e6485a47b758c (Andrea Arcangeli 2015-09-04 15:47:15 -0700 1968) return -EINVAL;
e6485a47b758c (Andrea Arcangeli 2015-09-04 15:47:15 -0700 1969)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1970) switch(cmd) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1971) case UFFDIO_API:
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1972) ret = userfaultfd_api(ctx, arg);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1973) break;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1974) case UFFDIO_REGISTER:
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1975) ret = userfaultfd_register(ctx, arg);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1976) break;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1977) case UFFDIO_UNREGISTER:
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1978) ret = userfaultfd_unregister(ctx, arg);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1979) break;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1980) case UFFDIO_WAKE:
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1981) ret = userfaultfd_wake(ctx, arg);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1982) break;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1983) case UFFDIO_COPY:
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1984) ret = userfaultfd_copy(ctx, arg);
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1985) break;
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1986) case UFFDIO_ZEROPAGE:
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1987) ret = userfaultfd_zeropage(ctx, arg);
ad465cae96b45 (Andrea Arcangeli 2015-09-04 15:47:11 -0700 1988) break;
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1989) case UFFDIO_WRITEPROTECT:
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1990) ret = userfaultfd_writeprotect(ctx, arg);
63b2d4174c4ad (Andrea Arcangeli 2020-04-06 20:06:12 -0700 1991) break;
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1992) case UFFDIO_CONTINUE:
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1993) ret = userfaultfd_continue(ctx, arg);
f619147104c8e (Axel Rasmussen 2021-05-04 18:35:49 -0700 1994) break;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1995) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1996) return ret;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1997) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1998)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 1999) #ifdef CONFIG_PROC_FS
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2000) static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2001) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2002) struct userfaultfd_ctx *ctx = f->private_data;
ac6424b981bce (Ingo Molnar 2017-06-20 12:06:13 +0200 2003) wait_queue_entry_t *wq;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2004) unsigned long pending = 0, total = 0;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2005)
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 2006) spin_lock_irq(&ctx->fault_pending_wqh.lock);
2055da97389a6 (Ingo Molnar 2017-06-20 12:06:46 +0200 2007) list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 2008) pending++;
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 2009) total++;
15b726ef048b3 (Andrea Arcangeli 2015-09-04 15:46:44 -0700 2010) }
2055da97389a6 (Ingo Molnar 2017-06-20 12:06:46 +0200 2011) list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2012) total++;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2013) }
cbcfa130a911c (Eric Biggers 2019-07-04 15:14:39 -0700 2014) spin_unlock_irq(&ctx->fault_pending_wqh.lock);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2015)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2016) /*
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2017) * If more protocols will be added, there will be all shown
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2018) * separated by a space. Like this:
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2019) * protocols: aa:... bb:...
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2020) */
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2021) seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
045098e944959 (Mike Rapoport 2017-04-07 16:04:42 -0700 2022) pending, total, UFFD_API, ctx->features,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2023) UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2024) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2025) #endif
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2026)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2027) static const struct file_operations userfaultfd_fops = {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2028) #ifdef CONFIG_PROC_FS
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2029) .show_fdinfo = userfaultfd_show_fdinfo,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2030) #endif
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2031) .release = userfaultfd_release,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2032) .poll = userfaultfd_poll,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2033) .read = userfaultfd_read,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2034) .unlocked_ioctl = userfaultfd_ioctl,
1832f2d8ff691 (Arnd Bergmann 2018-09-11 21:59:08 +0200 2035) .compat_ioctl = compat_ptr_ioctl,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2036) .llseek = noop_llseek,
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2037) };
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2038)
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2039) static void init_once_userfaultfd_ctx(void *mem)
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2040) {
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2041) struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2042)
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2043) init_waitqueue_head(&ctx->fault_pending_wqh);
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2044) init_waitqueue_head(&ctx->fault_wqh);
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 2045) init_waitqueue_head(&ctx->event_wqh);
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2046) init_waitqueue_head(&ctx->fd_wqh);
2ca97ac8bdcc3 (Ahmed S. Darwish 2020-07-20 17:55:28 +0200 2047) seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2048) }
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2049)
284cd241a18ee (Eric Biggers 2018-01-31 16:19:48 -0800 2050) SYSCALL_DEFINE1(userfaultfd, int, flags)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2051) {
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2052) struct userfaultfd_ctx *ctx;
284cd241a18ee (Eric Biggers 2018-01-31 16:19:48 -0800 2053) int fd;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2054)
d0d4730ac2e40 (Lokesh Gidra 2020-12-14 19:13:54 -0800 2055) if (!sysctl_unprivileged_userfaultfd &&
d0d4730ac2e40 (Lokesh Gidra 2020-12-14 19:13:54 -0800 2056) (flags & UFFD_USER_MODE_ONLY) == 0 &&
d0d4730ac2e40 (Lokesh Gidra 2020-12-14 19:13:54 -0800 2057) !capable(CAP_SYS_PTRACE)) {
d0d4730ac2e40 (Lokesh Gidra 2020-12-14 19:13:54 -0800 2058) printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
d0d4730ac2e40 (Lokesh Gidra 2020-12-14 19:13:54 -0800 2059) "sysctl knob to 1 if kernel faults must be handled "
d0d4730ac2e40 (Lokesh Gidra 2020-12-14 19:13:54 -0800 2060) "without obtaining CAP_SYS_PTRACE capability\n");
cefdca0a86be5 (Peter Xu 2019-05-13 17:16:41 -0700 2061) return -EPERM;
d0d4730ac2e40 (Lokesh Gidra 2020-12-14 19:13:54 -0800 2062) }
cefdca0a86be5 (Peter Xu 2019-05-13 17:16:41 -0700 2063)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2064) BUG_ON(!current->mm);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2065)
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2066) /* Check the UFFD_* constants for consistency. */
37cd0575b8510 (Lokesh Gidra 2020-12-14 19:13:49 -0800 2067) BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2068) BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2069) BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2070)
37cd0575b8510 (Lokesh Gidra 2020-12-14 19:13:49 -0800 2071) if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
284cd241a18ee (Eric Biggers 2018-01-31 16:19:48 -0800 2072) return -EINVAL;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2073)
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2074) ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2075) if (!ctx)
284cd241a18ee (Eric Biggers 2018-01-31 16:19:48 -0800 2076) return -ENOMEM;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2077)
ca880420665db (Eric Biggers 2018-12-28 00:34:43 -0800 2078) refcount_set(&ctx->refcount, 1);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2079) ctx->flags = flags;
9cd75c3cd4c3d (Pavel Emelyanov 2017-02-22 15:42:21 -0800 2080) ctx->features = 0;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2081) ctx->state = UFFD_STATE_WAIT_API;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2082) ctx->released = false;
df2cc96e77011 (Mike Rapoport 2018-06-07 17:09:25 -0700 2083) ctx->mmap_changing = false;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2084) ctx->mm = current->mm;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2085) /* prevent the mm struct to be freed */
f1f1007644ffc (Vegard Nossum 2017-02-27 14:30:07 -0800 2086) mmgrab(ctx->mm);
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2087)
b537900f1598b (Daniel Colascione 2021-01-08 14:22:23 -0800 2088) fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
b537900f1598b (Daniel Colascione 2021-01-08 14:22:23 -0800 2089) O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
284cd241a18ee (Eric Biggers 2018-01-31 16:19:48 -0800 2090) if (fd < 0) {
d2005e3f41d4f (Oleg Nesterov 2016-05-20 16:58:36 -0700 2091) mmdrop(ctx->mm);
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2092) kmem_cache_free(userfaultfd_ctx_cachep, ctx);
c03e946fdd653 (Eric Biggers 2015-09-17 16:01:54 -0700 2093) }
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2094) return fd;
86039bd3b4e6a (Andrea Arcangeli 2015-09-04 15:46:31 -0700 2095) }
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2096)
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2097) static int __init userfaultfd_init(void)
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2098) {
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2099) userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2100) sizeof(struct userfaultfd_ctx),
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2101) 0,
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2102) SLAB_HWCACHE_ALIGN|SLAB_PANIC,
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2103) init_once_userfaultfd_ctx);
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2104) return 0;
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2105) }
3004ec9cabf49 (Andrea Arcangeli 2015-09-04 15:46:48 -0700 2106) __initcall(userfaultfd_init);