2874c5fd28426 (Thomas Gleixner 2019-05-27 08:55:01 +0200 1) // SPDX-License-Identifier: GPL-2.0-or-later
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 2) /*
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 3) * fs/eventpoll.c (Efficient event retrieval implementation)
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 4) * Copyright (C) 2001,...,2009 Davide Libenzi
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 5) *
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 6) * Davide Libenzi <davidel@xmailserver.org>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 7) */
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 8)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 9) #include <linux/init.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 10) #include <linux/kernel.h>
174cd4b1e5fbd (Ingo Molnar 2017-02-02 19:15:33 +0100 11) #include <linux/sched/signal.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 12) #include <linux/fs.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 13) #include <linux/file.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 14) #include <linux/signal.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 15) #include <linux/errno.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 16) #include <linux/mm.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 17) #include <linux/slab.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 18) #include <linux/poll.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 19) #include <linux/string.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 20) #include <linux/list.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 21) #include <linux/hash.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 22) #include <linux/spinlock.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 23) #include <linux/syscalls.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 24) #include <linux/rbtree.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 25) #include <linux/wait.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 26) #include <linux/eventpoll.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 27) #include <linux/mount.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 28) #include <linux/bitops.h>
144efe3e3e5ad (Arjan van de Ven 2006-03-23 03:00:32 -0800 29) #include <linux/mutex.h>
da66f7cb0f69a (Davide Libenzi 2007-05-10 22:23:21 -0700 30) #include <linux/anon_inodes.h>
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 31) #include <linux/device.h>
7c0f6ba682b9c (Linus Torvalds 2016-12-24 11:46:01 -0800 32) #include <linux/uaccess.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 33) #include <asm/io.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 34) #include <asm/mman.h>
60063497a95e7 (Arun Sharma 2011-07-26 16:09:06 -0700 35) #include <linux/atomic.h>
138d22b58696c (Cyrill Gorcunov 2012-12-17 16:05:02 -0800 36) #include <linux/proc_fs.h>
138d22b58696c (Cyrill Gorcunov 2012-12-17 16:05:02 -0800 37) #include <linux/seq_file.h>
35280bd4a3fa8 (Al Viro 2013-02-24 14:52:17 -0500 38) #include <linux/compat.h>
ae10b2b4eb01b (Jason Baron 2013-11-12 15:10:16 -0800 39) #include <linux/rculist.h>
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 40) #include <net/busy_poll.h>
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 41)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 42) /*
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 43) * LOCKING:
 * There are three levels of locking required by epoll:
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 45) *
144efe3e3e5ad (Arjan van de Ven 2006-03-23 03:00:32 -0800 46) * 1) epmutex (mutex)
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 47) * 2) ep->mtx (mutex)
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 48) * 3) ep->lock (rwlock)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 49) *
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 50) * The acquire order is the one listed above, from 1 to 3.
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 51) * We need a rwlock (ep->lock) because we manipulate objects
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 52) * from inside the poll callback, that might be triggered from
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 53) * a wake_up() that in turn might be called from IRQ context.
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 54) * So we can't sleep inside the poll callback and hence we need
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 55) * a spinlock. During the event transfer loop (from kernel to
 * user space) we could end up sleeping due to a copy_to_user(), so
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 57) * we need a lock that will allow us to sleep. This lock is a
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 58) * mutex (ep->mtx). It is acquired during the event transfer loop,
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 59) * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 60) * Then we also need a global mutex to serialize eventpoll_release_file()
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 61) * and ep_free().
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 62) * This mutex is acquired by ep_free() during the epoll file
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 63) * cleanup path and it is also acquired by eventpoll_release_file()
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 64) * if a file has been pushed inside an epoll set and it is then
bf6a41db7726e (Daniel Baluta 2011-01-30 23:42:29 +0200 65) * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 66) * It is also acquired when inserting an epoll fd onto another epoll
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 67) * fd. We do this so that we walk the epoll tree and ensure that this
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 68) * insertion does not create a cycle of epoll file descriptors, which
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 69) * could lead to deadlock. We need a global mutex to prevent two
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 70) * simultaneous inserts (A into B and B into A) from racing and
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 71) * constructing a cycle without either insert observing that it is
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 72) * going to.
d8805e633e054 (Nelson Elhage 2011-10-31 17:13:14 -0700 73) * It is necessary to acquire multiple "ep->mtx"es at once in the
d8805e633e054 (Nelson Elhage 2011-10-31 17:13:14 -0700 74) * case when one epoll fd is added to another. In this case, we
d8805e633e054 (Nelson Elhage 2011-10-31 17:13:14 -0700 75) * always acquire the locks in the order of nesting (i.e. after
d8805e633e054 (Nelson Elhage 2011-10-31 17:13:14 -0700 76) * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
d8805e633e054 (Nelson Elhage 2011-10-31 17:13:14 -0700 77) * before e2->mtx). Since we disallow cycles of epoll file
d8805e633e054 (Nelson Elhage 2011-10-31 17:13:14 -0700 78) * descriptors, this ensures that the mutexes are well-ordered. In
d8805e633e054 (Nelson Elhage 2011-10-31 17:13:14 -0700 79) * order to communicate this nesting to lockdep, when walking a tree
d8805e633e054 (Nelson Elhage 2011-10-31 17:13:14 -0700 80) * of epoll file descriptors, we use the current recursion depth as
d8805e633e054 (Nelson Elhage 2011-10-31 17:13:14 -0700 81) * the lockdep subkey.
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 82) * It is possible to drop the "ep->mtx" and to use the global
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 83) * mutex "epmutex" (together with "ep->lock") to have it working,
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 84) * but having "ep->mtx" will make the interface more scalable.
144efe3e3e5ad (Arjan van de Ven 2006-03-23 03:00:32 -0800 85) * Events that require holding "epmutex" are very rare, while for
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 86) * normal operations the epoll private "ep->mtx" will guarantee
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 87) * a better scalability.
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 88) */
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 89)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 90) /* Epoll private bits inside the event mask */
df0108c5da561 (Jason Baron 2016-01-20 14:59:24 -0800 91) #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 92)
a9a08845e9acb (Linus Torvalds 2018-02-11 14:34:03 -0800 93) #define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)
b6a515c8a0f6c (Jason Baron 2016-02-05 15:37:04 -0800 94)
a9a08845e9acb (Linus Torvalds 2018-02-11 14:34:03 -0800 95) #define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
b6a515c8a0f6c (Jason Baron 2016-02-05 15:37:04 -0800 96) EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
b6a515c8a0f6c (Jason Baron 2016-02-05 15:37:04 -0800 97)
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 98) /* Maximum number of nesting allowed inside epoll sets */
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 99) #define EP_MAX_NESTS 4
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 100)
b611967de4dc5 (Davide Libenzi 2006-10-11 01:21:44 -0700 101) #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
b611967de4dc5 (Davide Libenzi 2006-10-11 01:21:44 -0700 102)
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 103) #define EP_UNACTIVE_PTR ((void *) -1L)
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 104)
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 105) #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 106)
/*
 * RB-tree lookup key: a monitored target is identified by its
 * (file, fd) pair.  __packed keeps the embedding epitem small
 * (see the size note above struct epitem).
 */
struct epoll_filefd {
	struct file *file;	/* target file being watched */
	int fd;			/* user-visible descriptor number */
} __packed;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 111)
/*
 * Wait structure used by the poll hooks.  Allocated from "pwq_cache"
 * and chained off epitem->pwqlist via ->next.
 */
struct eppoll_entry {
	/* List header used to link this structure to the "struct epitem" */
	struct eppoll_entry *next;

	/* The "base" pointer is set to the container "struct epitem" */
	struct epitem *base;

	/*
	 * Wait queue item that will be linked to the target file wait
	 * queue head.
	 */
	wait_queue_entry_t wait;

	/* The wait queue head that linked the "wait" wait queue item */
	wait_queue_head_t *whead;
};
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 129)
/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
	union {
		/* RB tree node links this structure to the eventpoll RB tree */
		struct rb_node rbn;
		/* Used to free the struct epitem */
		struct rcu_head rcu;
	};

	/* List header used to link this structure to the eventpoll ready list */
	struct list_head rdllink;

	/*
	 * Works together with "struct eventpoll"->ovflist in keeping the
	 * single linked chain of items.
	 */
	struct epitem *next;

	/* The file descriptor information this item refers to */
	struct epoll_filefd ffd;

	/* List containing poll wait queues */
	struct eppoll_entry *pwqlist;

	/* The "container" of this item */
	struct eventpoll *ep;

	/* List header used to link this item to the "struct file" items list */
	struct hlist_node fllink;

	/* wakeup_source used when EPOLLWAKEUP is set */
	struct wakeup_source __rcu *ws;

	/* The structure that describes the interested events and the source fd */
	struct epoll_event event;
};
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 171)
/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
	/*
	 * This mutex is used to ensure that files are not removed
	 * while epoll is using them. This is held during the event
	 * collection loop, the file cleanup path, the epoll file exit
	 * code and the ctl operations.
	 */
	struct mutex mtx;

	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	struct list_head rdllist;

	/* Lock which protects rdllist and ovflist */
	rwlock_t lock;

	/* RB tree root used to store monitored fd structs */
	struct rb_root_cached rbr;

	/*
	 * This is a single linked list that chains all the "struct epitem" that
	 * happened while transferring ready events to userspace without
	 * holding ->lock.
	 */
	struct epitem *ovflist;

	/* wakeup_source used when ep_scan_ready_list is running */
	struct wakeup_source *ws;

	/* The user that created the eventpoll descriptor */
	struct user_struct *user;

	/* The epoll file backing this context (its private_data points here) */
	struct file *file;

	/* used to optimize loop detection check */
	u64 gen;
	/* NOTE(review): loop-check bookkeeping, paired with ->gen -- confirm */
	struct hlist_head refs;

#ifdef CONFIG_NET_RX_BUSY_POLL
	/* used to track busy poll napi_id */
	unsigned int napi_id;
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* tracks wakeup nests for lockdep validation */
	u8 nests;
#endif
};
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 230)
/* Wrapper struct used by poll queueing */
struct ep_pqueue {
	poll_table pt;
	/* The epitem on whose behalf the queueing callback runs */
	struct epitem *epi;
};
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 236)
/*
 * Configuration options available inside /proc/sys/fs/epoll/
 */
/* Maximum number of epoll watched descriptors, per user */
static long max_user_watches __read_mostly;

/*
 * This mutex is used to serialize ep_free() and eventpoll_release_file().
 * Also taken when inserting an epoll fd onto another one (loop check);
 * see the LOCKING comment at the top of this file.
 */
static DEFINE_MUTEX(epmutex);

/* Generation counter for loop-check passes; presumably compared against
 * ep->gen (see struct eventpoll) -- confirm against the check code. */
static u64 loop_check_gen = 0;

/* Used to check for epoll file descriptor inclusion loops */
static struct eventpoll *inserting_into;

/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;

/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 258)
/*
 * List of files with newly added links, where we may need to limit the number
 * of emanating paths. Protected by the epmutex.
 */
struct epitems_head {
	/* Every epitem watching the associated target file */
	struct hlist_head epitems;
	/* Link on tfile_check_list; NULL when not listed (see list_file()) */
	struct epitems_head *next;
};
/* Chain is terminated by EP_UNACTIVE_PTR rather than NULL */
static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;

/* Slab cache used to allocate "struct epitems_head" */
static struct kmem_cache *ephead_cache __read_mostly;
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 270)
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 271) static inline void free_ephead(struct epitems_head *head)
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 272) {
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 273) if (head)
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 274) kmem_cache_free(ephead_cache, head);
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 275) }
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 276)
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 277) static void list_file(struct file *file)
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 278) {
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 279) struct epitems_head *head;
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 280)
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 281) head = container_of(file->f_ep, struct epitems_head, epitems);
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 282) if (!head->next) {
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 283) head->next = tfile_check_list;
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 284) tfile_check_list = head;
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 285) }
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 286) }
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 287)
/*
 * Take @head off the tfile_check_list chain.  The head is freed only
 * when no epitems remain attached to it; presumably live epitems still
 * reach it through file->f_ep (see list_file()) -- confirm.
 */
static void unlist_file(struct epitems_head *head)
{
	/* Unless we find live epitems below, the head can be freed. */
	struct epitems_head *to_free = head;
	/* Lockless peek at the first epitem; re-checked under f_lock below. */
	struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems));
	if (p) {
		struct epitem *epi= container_of(p, struct epitem, fllink);
		/* NOTE(review): f_lock appears to guard ->epitems -- confirm */
		spin_lock(&epi->ffd.file->f_lock);
		if (!hlist_empty(&head->epitems))
			to_free = NULL;
		/* Off the check list either way. */
		head->next = NULL;
		spin_unlock(&epi->ffd.file->f_lock);
	}
	free_ephead(to_free);
}
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 302)
#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

/* Bounds for the "max_user_watches" sysctl entry below */
static long long_zero;
static long long_max = LONG_MAX;

/* Sysctl table exposed under /proc/sys/fs/epoll/ */
struct ctl_table epoll_table[] = {
	{
		.procname = "max_user_watches",
		.data = &max_user_watches,
		.maxlen = sizeof(max_user_watches),
		.mode = 0644,
		.proc_handler = proc_doulongvec_minmax,
		.extra1 = &long_zero,
		.extra2 = &long_max,
	},
	{ }	/* sentinel */
};
#endif /* CONFIG_SYSCTL */
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 323)
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 324) static const struct file_operations eventpoll_fops;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 325)
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 326) static inline int is_file_epoll(struct file *f)
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 327) {
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 328) return f->f_op == &eventpoll_fops;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 329) }
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 330)
67647d0fb8bc0 (Davide Libenzi 2007-05-15 01:40:52 -0700 331) /* Setup the structure that is used as key for the RB tree */
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 332) static inline void ep_set_ffd(struct epoll_filefd *ffd,
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 333) struct file *file, int fd)
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 334) {
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 335) ffd->file = file;
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 336) ffd->fd = fd;
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 337) }
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 338)
67647d0fb8bc0 (Davide Libenzi 2007-05-15 01:40:52 -0700 339) /* Compare RB tree keys */
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 340) static inline int ep_cmp_ffd(struct epoll_filefd *p1,
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 341) struct epoll_filefd *p2)
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 342) {
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 343) return (p1->file > p2->file ? +1:
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 344) (p1->file < p2->file ? -1 : p1->fd - p2->fd));
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 345) }
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 346)
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 347) /* Tells us if the item is currently linked */
992991c03ca03 (Davidlohr Bueso 2018-08-21 21:58:26 -0700 348) static inline int ep_is_linked(struct epitem *epi)
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 349) {
992991c03ca03 (Davidlohr Bueso 2018-08-21 21:58:26 -0700 350) return !list_empty(&epi->rdllink);
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 351) }
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 352)
/* Get the "struct eppoll_entry" in which a wait queue entry is embedded */
static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
{
	return container_of(p, struct eppoll_entry, wait);
}
971316f0503a5 (Oleg Nesterov 2012-02-24 20:07:29 +0100 357)
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 358) /* Get the "struct epitem" from a wait queue pointer */
ac6424b981bce (Ingo Molnar 2017-06-20 12:06:13 +0200 359) static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 360) {
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 361) return container_of(p, struct eppoll_entry, wait)->base;
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 362) }
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 363)
3fb0e584a68cd (Davide Libenzi 2011-03-22 16:34:46 -0700 364) /**
3fb0e584a68cd (Davide Libenzi 2011-03-22 16:34:46 -0700 365) * ep_events_available - Checks if ready events might be available.
3fb0e584a68cd (Davide Libenzi 2011-03-22 16:34:46 -0700 366) *
3fb0e584a68cd (Davide Libenzi 2011-03-22 16:34:46 -0700 367) * @ep: Pointer to the eventpoll context.
3fb0e584a68cd (Davide Libenzi 2011-03-22 16:34:46 -0700 368) *
a6c67fee9cf09 (Randy Dunlap 2021-03-01 15:25:51 -0700 369) * Return: a value different than %zero if ready events are available,
a6c67fee9cf09 (Randy Dunlap 2021-03-01 15:25:51 -0700 370) * or %zero otherwise.
3fb0e584a68cd (Davide Libenzi 2011-03-22 16:34:46 -0700 371) */
3fb0e584a68cd (Davide Libenzi 2011-03-22 16:34:46 -0700 372) static inline int ep_events_available(struct eventpoll *ep)
3fb0e584a68cd (Davide Libenzi 2011-03-22 16:34:46 -0700 373) {
c5a282e9635e9 (Davidlohr Bueso 2019-01-03 15:27:15 -0800 374) return !list_empty_careful(&ep->rdllist) ||
c5a282e9635e9 (Davidlohr Bueso 2019-01-03 15:27:15 -0800 375) READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
3fb0e584a68cd (Davide Libenzi 2011-03-22 16:34:46 -0700 376) }
3fb0e584a68cd (Davide Libenzi 2011-03-22 16:34:46 -0700 377)
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 378) #ifdef CONFIG_NET_RX_BUSY_POLL
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 379) static bool ep_busy_loop_end(void *p, unsigned long start_time)
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 380) {
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 381) struct eventpoll *ep = p;
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 382)
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 383) return ep_events_available(ep) || busy_loop_timeout(start_time);
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 384) }
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 385)
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 386) /*
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 387) * Busy poll if globally on and supporting sockets found && no events,
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 388) * busy loop will return if need_resched or ep_events_available.
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 389) *
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 390) * we must do our busy polling with irqs enabled
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 391) */
1493c47fb140d (Soheil Hassas Yeganeh 2020-12-18 14:01:57 -0800 392) static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 393) {
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 394) unsigned int napi_id = READ_ONCE(ep->napi_id);
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 395)
1493c47fb140d (Soheil Hassas Yeganeh 2020-12-18 14:01:57 -0800 396) if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
7c951cafc0cb2 (Björn Töpel 2020-11-30 19:51:57 +0100 397) napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
7c951cafc0cb2 (Björn Töpel 2020-11-30 19:51:57 +0100 398) BUSY_POLL_BUDGET);
1493c47fb140d (Soheil Hassas Yeganeh 2020-12-18 14:01:57 -0800 399) if (ep_events_available(ep))
1493c47fb140d (Soheil Hassas Yeganeh 2020-12-18 14:01:57 -0800 400) return true;
1493c47fb140d (Soheil Hassas Yeganeh 2020-12-18 14:01:57 -0800 401) /*
1493c47fb140d (Soheil Hassas Yeganeh 2020-12-18 14:01:57 -0800 402) * Busy poll timed out. Drop NAPI ID for now, we can add
1493c47fb140d (Soheil Hassas Yeganeh 2020-12-18 14:01:57 -0800 403) * it back in when we have moved a socket with a valid NAPI
1493c47fb140d (Soheil Hassas Yeganeh 2020-12-18 14:01:57 -0800 404) * ID onto the ready list.
1493c47fb140d (Soheil Hassas Yeganeh 2020-12-18 14:01:57 -0800 405) */
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 406) ep->napi_id = 0;
1493c47fb140d (Soheil Hassas Yeganeh 2020-12-18 14:01:57 -0800 407) return false;
1493c47fb140d (Soheil Hassas Yeganeh 2020-12-18 14:01:57 -0800 408) }
1493c47fb140d (Soheil Hassas Yeganeh 2020-12-18 14:01:57 -0800 409) return false;
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 410) }
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 411)
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 412) /*
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 413) * Set epoll busy poll NAPI ID from sk.
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 414) */
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 415) static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 416) {
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 417) struct eventpoll *ep;
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 418) unsigned int napi_id;
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 419) struct socket *sock;
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 420) struct sock *sk;
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 421)
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 422) if (!net_busy_loop_on())
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 423) return;
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 424)
dba4a9256bb4d (Florent Revest 2020-12-04 12:36:04 +0100 425) sock = sock_from_file(epi->ffd.file);
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 426) if (!sock)
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 427) return;
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 428)
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 429) sk = sock->sk;
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 430) if (!sk)
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 431) return;
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 432)
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 433) napi_id = READ_ONCE(sk->sk_napi_id);
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 434) ep = epi->ep;
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 435)
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 436) /* Non-NAPI IDs can be rejected
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 437) * or
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 438) * Nothing to do if we already have this ID
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 439) */
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 440) if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 441) return;
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 442)
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 443) /* record NAPI ID for use in next busy poll */
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 444) ep->napi_id = napi_id;
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 445) }
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 446)
514056d506e44 (Davidlohr Bueso 2018-08-21 21:58:19 -0700 447) #else
514056d506e44 (Davidlohr Bueso 2018-08-21 21:58:19 -0700 448)
1493c47fb140d (Soheil Hassas Yeganeh 2020-12-18 14:01:57 -0800 449) static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
514056d506e44 (Davidlohr Bueso 2018-08-21 21:58:19 -0700 450) {
1493c47fb140d (Soheil Hassas Yeganeh 2020-12-18 14:01:57 -0800 451) return false;
514056d506e44 (Davidlohr Bueso 2018-08-21 21:58:19 -0700 452) }
514056d506e44 (Davidlohr Bueso 2018-08-21 21:58:19 -0700 453)
514056d506e44 (Davidlohr Bueso 2018-08-21 21:58:19 -0700 454) static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
514056d506e44 (Davidlohr Bueso 2018-08-21 21:58:19 -0700 455) {
514056d506e44 (Davidlohr Bueso 2018-08-21 21:58:19 -0700 456) }
514056d506e44 (Davidlohr Bueso 2018-08-21 21:58:19 -0700 457)
514056d506e44 (Davidlohr Bueso 2018-08-21 21:58:19 -0700 458) #endif /* CONFIG_NET_RX_BUSY_POLL */
514056d506e44 (Davidlohr Bueso 2018-08-21 21:58:19 -0700 459)
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 460) /*
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 461) * As described in commit 0ccf831cb lockdep: annotate epoll
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 462) * the use of wait queues used by epoll is done in a very controlled
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 463) * manner. Wake ups can nest inside each other, but are never done
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 464) * with the same locking. For example:
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 465) *
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 466) * dfd = socket(...);
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 467) * efd1 = epoll_create();
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 468) * efd2 = epoll_create();
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 469) * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 470) * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 471) *
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 472) * When a packet arrives to the device underneath "dfd", the net code will
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 473) * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 474) * callback wakeup entry on that queue, and the wake_up() performed by the
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 475) * "dfd" net code will end up in ep_poll_callback(). At this point epoll
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 476) * (efd1) notices that it may have some event ready, so it needs to wake up
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 477) * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 478) * that ends up in another wake_up(), after having checked about the
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 479) * recursion constraints. That are, no more than EP_MAX_POLLWAKE_NESTS, to
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 480) * avoid stack blasting.
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 481) *
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 482) * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 483) * this special case of epoll.
02edc6fc4d5fe (Steven Rostedt 2012-03-23 15:02:27 -0700 484) */
2dfa4eeab0fc7 (Davide Libenzi 2009-03-31 15:24:22 -0700 485) #ifdef CONFIG_DEBUG_LOCK_ALLOC
57a173bdf5baa (Jason Baron 2017-11-17 15:29:02 -0800 486)
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 487) static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 488) {
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 489) struct eventpoll *ep_src;
f6520c520842c (Jason Baron 2019-12-04 16:52:12 -0800 490) unsigned long flags;
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 491) u8 nests = 0;
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 492)
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 493) /*
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 494) * To set the subclass or nesting level for spin_lock_irqsave_nested()
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 495) * it might be natural to create a per-cpu nest count. However, since
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 496) * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 497) * schedule() in the -rt kernel, the per-cpu variable are no longer
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 498) * protected. Thus, we are introducing a per eventpoll nest field.
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 499) * If we are not being call from ep_poll_callback(), epi is NULL and
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 500) * we are at the first level of nesting, 0. Otherwise, we are being
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 501) * called from ep_poll_callback() and if a previous wakeup source is
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 502) * not an epoll file itself, we are at depth 1 since the wakeup source
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 503) * is depth 0. If the wakeup source is a previous epoll file in the
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 504) * wakeup chain then we use its nests value and record ours as
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 505) * nests + 1. The previous epoll file nests value is stable since its
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 506) * already holding its own poll_wait.lock.
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 507) */
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 508) if (epi) {
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 509) if ((is_file_epoll(epi->ffd.file))) {
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 510) ep_src = epi->ffd.file->private_data;
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 511) nests = ep_src->nests;
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 512) } else {
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 513) nests = 1;
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 514) }
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 515) }
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 516) spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 517) ep->nests = nests + 1;
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 518) wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 519) ep->nests = 0;
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 520) spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 521) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 522)
57a173bdf5baa (Jason Baron 2017-11-17 15:29:02 -0800 523) #else
57a173bdf5baa (Jason Baron 2017-11-17 15:29:02 -0800 524)
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 525) static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
57a173bdf5baa (Jason Baron 2017-11-17 15:29:02 -0800 526) {
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 527) wake_up_poll(&ep->poll_wait, EPOLLIN);
57a173bdf5baa (Jason Baron 2017-11-17 15:29:02 -0800 528) }
57a173bdf5baa (Jason Baron 2017-11-17 15:29:02 -0800 529)
57a173bdf5baa (Jason Baron 2017-11-17 15:29:02 -0800 530) #endif
57a173bdf5baa (Jason Baron 2017-11-17 15:29:02 -0800 531)
971316f0503a5 (Oleg Nesterov 2012-02-24 20:07:29 +0100 532) static void ep_remove_wait_queue(struct eppoll_entry *pwq)
971316f0503a5 (Oleg Nesterov 2012-02-24 20:07:29 +0100 533) {
971316f0503a5 (Oleg Nesterov 2012-02-24 20:07:29 +0100 534) wait_queue_head_t *whead;
971316f0503a5 (Oleg Nesterov 2012-02-24 20:07:29 +0100 535)
971316f0503a5 (Oleg Nesterov 2012-02-24 20:07:29 +0100 536) rcu_read_lock();
138e4ad67afd5 (Oleg Nesterov 2017-09-01 18:55:33 +0200 537) /*
138e4ad67afd5 (Oleg Nesterov 2017-09-01 18:55:33 +0200 538) * If it is cleared by POLLFREE, it should be rcu-safe.
138e4ad67afd5 (Oleg Nesterov 2017-09-01 18:55:33 +0200 539) * If we read NULL we need a barrier paired with
138e4ad67afd5 (Oleg Nesterov 2017-09-01 18:55:33 +0200 540) * smp_store_release() in ep_poll_callback(), otherwise
138e4ad67afd5 (Oleg Nesterov 2017-09-01 18:55:33 +0200 541) * we rely on whead->lock.
138e4ad67afd5 (Oleg Nesterov 2017-09-01 18:55:33 +0200 542) */
138e4ad67afd5 (Oleg Nesterov 2017-09-01 18:55:33 +0200 543) whead = smp_load_acquire(&pwq->whead);
971316f0503a5 (Oleg Nesterov 2012-02-24 20:07:29 +0100 544) if (whead)
971316f0503a5 (Oleg Nesterov 2012-02-24 20:07:29 +0100 545) remove_wait_queue(whead, &pwq->wait);
971316f0503a5 (Oleg Nesterov 2012-02-24 20:07:29 +0100 546) rcu_read_unlock();
971316f0503a5 (Oleg Nesterov 2012-02-24 20:07:29 +0100 547) }
971316f0503a5 (Oleg Nesterov 2012-02-24 20:07:29 +0100 548)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 549) /*
d1bc90dd5d037 (Tony Battersby 2009-03-31 15:24:15 -0700 550) * This function unregisters poll callbacks from the associated file
d1bc90dd5d037 (Tony Battersby 2009-03-31 15:24:15 -0700 551) * descriptor. Must be called with "mtx" held (or "epmutex" if called from
d1bc90dd5d037 (Tony Battersby 2009-03-31 15:24:15 -0700 552) * ep_free).
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 553) */
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 554) static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 555) {
80285b75c6834 (Al Viro 2020-09-02 11:45:57 -0400 556) struct eppoll_entry **p = &epi->pwqlist;
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 557) struct eppoll_entry *pwq;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 558)
80285b75c6834 (Al Viro 2020-09-02 11:45:57 -0400 559) while ((pwq = *p) != NULL) {
80285b75c6834 (Al Viro 2020-09-02 11:45:57 -0400 560) *p = pwq->next;
971316f0503a5 (Oleg Nesterov 2012-02-24 20:07:29 +0100 561) ep_remove_wait_queue(pwq);
d1bc90dd5d037 (Tony Battersby 2009-03-31 15:24:15 -0700 562) kmem_cache_free(pwq_cache, pwq);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 563) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 564) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 565)
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 566) /* call only when ep->mtx is held */
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 567) static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 568) {
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 569) return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 570) }
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 571)
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 572) /* call only when ep->mtx is held */
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 573) static inline void ep_pm_stay_awake(struct epitem *epi)
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 574) {
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 575) struct wakeup_source *ws = ep_wakeup_source(epi);
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 576)
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 577) if (ws)
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 578) __pm_stay_awake(ws);
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 579) }
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 580)
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 581) static inline bool ep_has_wakeup_source(struct epitem *epi)
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 582) {
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 583) return rcu_access_pointer(epi->ws) ? true : false;
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 584) }
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 585)
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 586) /* call when ep->mtx cannot be held (ep_poll_callback) */
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 587) static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 588) {
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 589) struct wakeup_source *ws;
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 590)
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 591) rcu_read_lock();
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 592) ws = rcu_dereference(epi->ws);
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 593) if (ws)
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 594) __pm_stay_awake(ws);
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 595) rcu_read_unlock();
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 596) }
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 597)
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 598)
57804b1cc4616 (Al Viro 2020-08-31 13:41:30 -0400 599) /*
57804b1cc4616 (Al Viro 2020-08-31 13:41:30 -0400 600) * ep->mutex needs to be held because we could be hit by
57804b1cc4616 (Al Viro 2020-08-31 13:41:30 -0400 601) * eventpoll_release_file() and epoll_ctl().
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 602) */
57804b1cc4616 (Al Viro 2020-08-31 13:41:30 -0400 603) static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 604) {
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 605) /*
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 606) * Steal the ready list, and re-init the original one to the
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 607) * empty list. Also, set ep->ovflist to NULL so that events
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 608) * happening while looping w/out locks, are not lost. We cannot
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 609) * have the poll callback to queue directly on ep->rdllist,
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 610) * because we want the "sproc" callback to be able to do it
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 611) * in a lockless way.
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 612) */
57804b1cc4616 (Al Viro 2020-08-31 13:41:30 -0400 613) lockdep_assert_irqs_enabled();
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 614) write_lock_irq(&ep->lock);
db502f8a3b0bb (Al Viro 2020-08-31 13:06:51 -0400 615) list_splice_init(&ep->rdllist, txlist);
c5a282e9635e9 (Davidlohr Bueso 2019-01-03 15:27:15 -0800 616) WRITE_ONCE(ep->ovflist, NULL);
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 617) write_unlock_irq(&ep->lock);
db502f8a3b0bb (Al Viro 2020-08-31 13:06:51 -0400 618) }
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 619)
db502f8a3b0bb (Al Viro 2020-08-31 13:06:51 -0400 620) static void ep_done_scan(struct eventpoll *ep,
db502f8a3b0bb (Al Viro 2020-08-31 13:06:51 -0400 621) struct list_head *txlist)
db502f8a3b0bb (Al Viro 2020-08-31 13:06:51 -0400 622) {
db502f8a3b0bb (Al Viro 2020-08-31 13:06:51 -0400 623) struct epitem *epi, *nepi;
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 624)
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 625) write_lock_irq(&ep->lock);
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 626) /*
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 627) * During the time we spent inside the "sproc" callback, some
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 628) * other events might have been queued by the poll callback.
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 629) * We re-insert them inside the main ready-list here.
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 630) */
c5a282e9635e9 (Davidlohr Bueso 2019-01-03 15:27:15 -0800 631) for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 632) nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 633) /*
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 634) * We need to check if the item is already in the list.
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 635) * During the "sproc" callback execution time, items are
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 636) * queued into ->ovflist but the "txlist" might already
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 637) * contain them, and the list_splice() below takes care of them.
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 638) */
992991c03ca03 (Davidlohr Bueso 2018-08-21 21:58:26 -0700 639) if (!ep_is_linked(epi)) {
c141175d011f1 (Roman Penyaev 2019-03-07 16:28:46 -0800 640) /*
c141175d011f1 (Roman Penyaev 2019-03-07 16:28:46 -0800 641) * ->ovflist is LIFO, so we have to reverse it in order
c141175d011f1 (Roman Penyaev 2019-03-07 16:28:46 -0800 642) * to keep in FIFO.
c141175d011f1 (Roman Penyaev 2019-03-07 16:28:46 -0800 643) */
c141175d011f1 (Roman Penyaev 2019-03-07 16:28:46 -0800 644) list_add(&epi->rdllink, &ep->rdllist);
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 645) ep_pm_stay_awake(epi);
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 646) }
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 647) }
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 648) /*
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 649) * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 650) * releasing the lock, events will be queued in the normal way inside
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 651) * ep->rdllist.
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 652) */
c5a282e9635e9 (Davidlohr Bueso 2019-01-03 15:27:15 -0800 653) WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 654)
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 655) /*
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 656) * Quickly re-inject items left on "txlist".
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 657) */
db502f8a3b0bb (Al Viro 2020-08-31 13:06:51 -0400 658) list_splice(txlist, &ep->rdllist);
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 659) __pm_relax(ep->ws);
7fab29e356309 (Davidlohr Bueso 2021-05-06 18:04:07 -0700 660)
7fab29e356309 (Davidlohr Bueso 2021-05-06 18:04:07 -0700 661) if (!list_empty(&ep->rdllist)) {
7fab29e356309 (Davidlohr Bueso 2021-05-06 18:04:07 -0700 662) if (waitqueue_active(&ep->wq))
7fab29e356309 (Davidlohr Bueso 2021-05-06 18:04:07 -0700 663) wake_up(&ep->wq);
7fab29e356309 (Davidlohr Bueso 2021-05-06 18:04:07 -0700 664) }
7fab29e356309 (Davidlohr Bueso 2021-05-06 18:04:07 -0700 665)
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 666) write_unlock_irq(&ep->lock);
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 667) }
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 668)
ae10b2b4eb01b (Jason Baron 2013-11-12 15:10:16 -0800 669) static void epi_rcu_free(struct rcu_head *head)
ae10b2b4eb01b (Jason Baron 2013-11-12 15:10:16 -0800 670) {
ae10b2b4eb01b (Jason Baron 2013-11-12 15:10:16 -0800 671) struct epitem *epi = container_of(head, struct epitem, rcu);
ae10b2b4eb01b (Jason Baron 2013-11-12 15:10:16 -0800 672) kmem_cache_free(epi_cache, epi);
ae10b2b4eb01b (Jason Baron 2013-11-12 15:10:16 -0800 673) }
ae10b2b4eb01b (Jason Baron 2013-11-12 15:10:16 -0800 674)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 675) /*
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 676) * Removes a "struct epitem" from the eventpoll RB tree and deallocates
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 677) * all the associated resources. Must be called with "mtx" held.
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 678) */
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 679) static int ep_remove(struct eventpoll *ep, struct epitem *epi)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 680) {
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 681) struct file *file = epi->ffd.file;
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 682) struct epitems_head *to_free;
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 683) struct hlist_head *head;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 684)
92e6417840559 (Davidlohr Bueso 2018-08-21 21:56:45 -0700 685) lockdep_assert_irqs_enabled();
92e6417840559 (Davidlohr Bueso 2018-08-21 21:56:45 -0700 686)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 687) /*
ee8ef0a4b167c (Christoph Hellwig 2018-08-21 21:56:26 -0700 688) * Removes poll wait queue hooks.
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 689) */
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 690) ep_unregister_pollwait(ep, epi);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 691)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 692) /* Remove the current item from the list of epoll hooks */
684999149002d (Jonathan Corbet 2009-02-06 13:52:43 -0700 693) spin_lock(&file->f_lock);
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 694) to_free = NULL;
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 695) head = file->f_ep;
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 696) if (head->first == &epi->fllink && !epi->fllink.next) {
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 697) file->f_ep = NULL;
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 698) if (!is_file_epoll(file)) {
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 699) struct epitems_head *v;
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 700) v = container_of(head, struct epitems_head, epitems);
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 701) if (!smp_load_acquire(&v->next))
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 702) to_free = v;
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 703) }
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 704) }
44cdc1d952e3f (Al Viro 2020-09-27 11:18:30 -0400 705) hlist_del_rcu(&epi->fllink);
684999149002d (Jonathan Corbet 2009-02-06 13:52:43 -0700 706) spin_unlock(&file->f_lock);
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 707) free_ephead(to_free);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 708)
b2ac2ea6296e7 (Davidlohr Bueso 2017-09-08 16:15:18 -0700 709) rb_erase_cached(&epi->rbn, &ep->rbr);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 710)
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 711) write_lock_irq(&ep->lock);
992991c03ca03 (Davidlohr Bueso 2018-08-21 21:58:26 -0700 712) if (ep_is_linked(epi))
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 713) list_del_init(&epi->rdllink);
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 714) write_unlock_irq(&ep->lock);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 715)
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 716) wakeup_source_unregister(ep_wakeup_source(epi));
ae10b2b4eb01b (Jason Baron 2013-11-12 15:10:16 -0800 717) /*
ae10b2b4eb01b (Jason Baron 2013-11-12 15:10:16 -0800 718) * At this point it is safe to free the eventpoll item. Use the union
ae10b2b4eb01b (Jason Baron 2013-11-12 15:10:16 -0800 719) * field epi->rcu, since we are trying to minimize the size of
ae10b2b4eb01b (Jason Baron 2013-11-12 15:10:16 -0800 720) * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
ae10b2b4eb01b (Jason Baron 2013-11-12 15:10:16 -0800 721) * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
ae10b2b4eb01b (Jason Baron 2013-11-12 15:10:16 -0800 722) * use of the rbn field.
ae10b2b4eb01b (Jason Baron 2013-11-12 15:10:16 -0800 723) */
ae10b2b4eb01b (Jason Baron 2013-11-12 15:10:16 -0800 724) call_rcu(&epi->rcu, epi_rcu_free);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 725)
52bd19f7691b2 (Robin Holt 2011-01-12 17:00:01 -0800 726) atomic_long_dec(&ep->user->epoll_watches);
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 727)
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 728) return 0;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 729) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 730)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 731) static void ep_free(struct eventpoll *ep)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 732) {
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 733) struct rb_node *rbp;
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 734) struct epitem *epi;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 735)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 736) /* We need to release all tasks waiting for these file */
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 737) if (waitqueue_active(&ep->poll_wait))
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 738) ep_poll_safewake(ep, NULL);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 739)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 740) /*
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 741) * We need to lock this because we could be hit by
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 742) * eventpoll_release_file() while we're freeing the "struct eventpoll".
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 743) * We do not need to hold "ep->mtx" here because the epoll file
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 744) * is on the way to be removed and no one has references to it
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 745) * anymore. The only hit might come from eventpoll_release_file() but
25985edcedea6 (Lucas De Marchi 2011-03-30 22:57:33 -0300 746) * holding "epmutex" is sufficient here.
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 747) */
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 748) mutex_lock(&epmutex);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 749)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 750) /*
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 751) * Walks through the whole tree by unregistering poll callbacks.
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 752) */
b2ac2ea6296e7 (Davidlohr Bueso 2017-09-08 16:15:18 -0700 753) for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 754) epi = rb_entry(rbp, struct epitem, rbn);
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 755)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 756) ep_unregister_pollwait(ep, epi);
91cf5ab60ff82 (Eric Dumazet 2013-09-11 14:24:06 -0700 757) cond_resched();
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 758) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 759)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 760) /*
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 761) * Walks through the whole tree by freeing each "struct epitem". At this
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 762) * point we are sure no poll callbacks will be lingering around, and also by
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 763) * holding "epmutex" we can be sure that no file cleanup code will hit
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 764) * us during this operation. So we can avoid the lock on "ep->lock".
ddf676c38b56a (Eric Wong 2013-04-30 15:27:40 -0700 765) * We do not need to lock ep->mtx, either, we only do it to prevent
ddf676c38b56a (Eric Wong 2013-04-30 15:27:40 -0700 766) * a lockdep warning.
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 767) */
ddf676c38b56a (Eric Wong 2013-04-30 15:27:40 -0700 768) mutex_lock(&ep->mtx);
b2ac2ea6296e7 (Davidlohr Bueso 2017-09-08 16:15:18 -0700 769) while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 770) epi = rb_entry(rbp, struct epitem, rbn);
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 771) ep_remove(ep, epi);
91cf5ab60ff82 (Eric Dumazet 2013-09-11 14:24:06 -0700 772) cond_resched();
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 773) }
ddf676c38b56a (Eric Wong 2013-04-30 15:27:40 -0700 774) mutex_unlock(&ep->mtx);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 775)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 776) mutex_unlock(&epmutex);
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 777) mutex_destroy(&ep->mtx);
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 778) free_uid(ep->user);
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 779) wakeup_source_unregister(ep->ws);
f0ee9aabb0520 (Davide Libenzi 2007-05-15 01:40:57 -0700 780) kfree(ep);
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 781) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 782)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 783) static int ep_eventpoll_release(struct inode *inode, struct file *file)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 784) {
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 785) struct eventpoll *ep = file->private_data;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 786)
f0ee9aabb0520 (Davide Libenzi 2007-05-15 01:40:57 -0700 787) if (ep)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 788) ep_free(ep);
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 789)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 790) return 0;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 791) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 792)
2c0b71c1e9c93 (Al Viro 2020-09-26 18:48:57 -0400 793) static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth);
37b5e5212a448 (Jason Baron 2017-11-17 15:29:06 -0800 794)
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 795) static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth)
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 796) {
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 797) struct eventpoll *ep = file->private_data;
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 798) LIST_HEAD(txlist);
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 799) struct epitem *epi, *tmp;
626cf23660850 (Hans Verkuil 2012-03-23 15:02:27 -0700 800) poll_table pt;
2c0b71c1e9c93 (Al Viro 2020-09-26 18:48:57 -0400 801) __poll_t res = 0;
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 802)
626cf23660850 (Hans Verkuil 2012-03-23 15:02:27 -0700 803) init_poll_funcptr(&pt, NULL);
450d89ec0a91d (Eric Wong 2013-04-30 15:27:42 -0700 804)
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 805) /* Insert inside our poll wait queue */
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 806) poll_wait(file, &ep->poll_wait, wait);
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 807)
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 808) /*
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 809) * Proceed to find out if wanted events are really available inside
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 810) * the ready list.
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 811) */
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 812) mutex_lock_nested(&ep->mtx, depth);
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 813) ep_start_scan(ep, &txlist);
2c0b71c1e9c93 (Al Viro 2020-09-26 18:48:57 -0400 814) list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
2c0b71c1e9c93 (Al Viro 2020-09-26 18:48:57 -0400 815) if (ep_item_poll(epi, &pt, depth + 1)) {
2c0b71c1e9c93 (Al Viro 2020-09-26 18:48:57 -0400 816) res = EPOLLIN | EPOLLRDNORM;
2c0b71c1e9c93 (Al Viro 2020-09-26 18:48:57 -0400 817) break;
37b5e5212a448 (Jason Baron 2017-11-17 15:29:06 -0800 818) } else {
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 819) /*
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 820) * Item has been dropped into the ready list by the poll
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 821) * callback, but it's not actually ready, as far as
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 822) * caller requested events goes. We can remove it here.
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 823) */
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 824) __pm_relax(ep_wakeup_source(epi));
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 825) list_del_init(&epi->rdllink);
296e236e96ddd (Davide Libenzi 2009-03-31 15:24:11 -0700 826) }
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 827) }
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 828) ep_done_scan(ep, &txlist);
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 829) mutex_unlock(&ep->mtx);
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 830) return res;
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 831) }
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 832)
37b5e5212a448 (Jason Baron 2017-11-17 15:29:06 -0800 833) /*
37b5e5212a448 (Jason Baron 2017-11-17 15:29:06 -0800 834) * Differs from ep_eventpoll_poll() in that internal callers already have
37b5e5212a448 (Jason Baron 2017-11-17 15:29:06 -0800 835) * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
37b5e5212a448 (Jason Baron 2017-11-17 15:29:06 -0800 836) * is correctly annotated.
37b5e5212a448 (Jason Baron 2017-11-17 15:29:06 -0800 837) */
d85e2aa2e34da (Al Viro 2018-02-01 15:24:58 -0500 838) static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
bec1a502d34dc (Al Viro 2017-11-28 19:43:33 -0500 839) int depth)
11c5ad0ec4411 (Ben Noordhuis 2018-06-15 00:32:07 +0200 840) {
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 841) struct file *file = epi->ffd.file;
1ec09974d845b (Al Viro 2020-08-31 13:16:39 -0400 842) __poll_t res;
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 843)
450d89ec0a91d (Eric Wong 2013-04-30 15:27:42 -0700 844) pt->_key = epi->event.events;
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 845) if (!is_file_epoll(file))
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 846) res = vfs_poll(file, pt);
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 847) else
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 848) res = __ep_eventpoll_poll(file, pt, depth);
1ec09974d845b (Al Viro 2020-08-31 13:16:39 -0400 849) return res & epi->event.events;
450d89ec0a91d (Eric Wong 2013-04-30 15:27:42 -0700 850) }
a11e1d432b51f (Linus Torvalds 2018-06-28 09:43:44 -0700 851)
a11e1d432b51f (Linus Torvalds 2018-06-28 09:43:44 -0700 852) static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
11c5ad0ec4411 (Ben Noordhuis 2018-06-15 00:32:07 +0200 853) {
ad9366b1361fd (Al Viro 2020-09-26 18:32:48 -0400 854) return __ep_eventpoll_poll(file, wait, 0);
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 855) }
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 856)
138d22b58696c (Cyrill Gorcunov 2012-12-17 16:05:02 -0800 857) #ifdef CONFIG_PROC_FS
a3816ab0e8fe5 (Joe Perches 2014-09-29 16:08:25 -0700 858) static void ep_show_fdinfo(struct seq_file *m, struct file *f)
138d22b58696c (Cyrill Gorcunov 2012-12-17 16:05:02 -0800 859) {
138d22b58696c (Cyrill Gorcunov 2012-12-17 16:05:02 -0800 860) struct eventpoll *ep = f->private_data;
138d22b58696c (Cyrill Gorcunov 2012-12-17 16:05:02 -0800 861) struct rb_node *rbp;
138d22b58696c (Cyrill Gorcunov 2012-12-17 16:05:02 -0800 862)
138d22b58696c (Cyrill Gorcunov 2012-12-17 16:05:02 -0800 863) mutex_lock(&ep->mtx);
b2ac2ea6296e7 (Davidlohr Bueso 2017-09-08 16:15:18 -0700 864) for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
138d22b58696c (Cyrill Gorcunov 2012-12-17 16:05:02 -0800 865) struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
77493f04b74cd (Cyrill Gorcunov 2017-07-12 14:34:25 -0700 866) struct inode *inode = file_inode(epi->ffd.file);
138d22b58696c (Cyrill Gorcunov 2012-12-17 16:05:02 -0800 867)
77493f04b74cd (Cyrill Gorcunov 2017-07-12 14:34:25 -0700 868) seq_printf(m, "tfd: %8d events: %8x data: %16llx "
77493f04b74cd (Cyrill Gorcunov 2017-07-12 14:34:25 -0700 869) " pos:%lli ino:%lx sdev:%x\n",
a3816ab0e8fe5 (Joe Perches 2014-09-29 16:08:25 -0700 870) epi->ffd.fd, epi->event.events,
77493f04b74cd (Cyrill Gorcunov 2017-07-12 14:34:25 -0700 871) (long long)epi->event.data,
77493f04b74cd (Cyrill Gorcunov 2017-07-12 14:34:25 -0700 872) (long long)epi->ffd.file->f_pos,
77493f04b74cd (Cyrill Gorcunov 2017-07-12 14:34:25 -0700 873) inode->i_ino, inode->i_sb->s_dev);
a3816ab0e8fe5 (Joe Perches 2014-09-29 16:08:25 -0700 874) if (seq_has_overflowed(m))
138d22b58696c (Cyrill Gorcunov 2012-12-17 16:05:02 -0800 875) break;
138d22b58696c (Cyrill Gorcunov 2012-12-17 16:05:02 -0800 876) }
138d22b58696c (Cyrill Gorcunov 2012-12-17 16:05:02 -0800 877) mutex_unlock(&ep->mtx);
138d22b58696c (Cyrill Gorcunov 2012-12-17 16:05:02 -0800 878) }
138d22b58696c (Cyrill Gorcunov 2012-12-17 16:05:02 -0800 879) #endif
138d22b58696c (Cyrill Gorcunov 2012-12-17 16:05:02 -0800 880)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 881) /* File callbacks that implement the eventpoll file behaviour */
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 882) static const struct file_operations eventpoll_fops = {
138d22b58696c (Cyrill Gorcunov 2012-12-17 16:05:02 -0800 883) #ifdef CONFIG_PROC_FS
138d22b58696c (Cyrill Gorcunov 2012-12-17 16:05:02 -0800 884) .show_fdinfo = ep_show_fdinfo,
138d22b58696c (Cyrill Gorcunov 2012-12-17 16:05:02 -0800 885) #endif
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 886) .release = ep_eventpoll_release,
a11e1d432b51f (Linus Torvalds 2018-06-28 09:43:44 -0700 887) .poll = ep_eventpoll_poll,
6038f373a3dc1 (Arnd Bergmann 2010-08-15 18:52:59 +0200 888) .llseek = noop_llseek,
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 889) };
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 890)
b611967de4dc5 (Davide Libenzi 2006-10-11 01:21:44 -0700 891) /*
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 892) * This is called from eventpoll_release() to unlink files from the eventpoll
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 893) * interface. We need to have this facility to cleanup correctly files that are
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 894) * closed without being removed from the eventpoll interface.
b611967de4dc5 (Davide Libenzi 2006-10-11 01:21:44 -0700 895) */
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 896) void eventpoll_release_file(struct file *file)
b611967de4dc5 (Davide Libenzi 2006-10-11 01:21:44 -0700 897) {
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 898) struct eventpoll *ep;
44cdc1d952e3f (Al Viro 2020-09-27 11:18:30 -0400 899) struct epitem *epi;
44cdc1d952e3f (Al Viro 2020-09-27 11:18:30 -0400 900) struct hlist_node *next;
b611967de4dc5 (Davide Libenzi 2006-10-11 01:21:44 -0700 901)
b611967de4dc5 (Davide Libenzi 2006-10-11 01:21:44 -0700 902) /*
684999149002d (Jonathan Corbet 2009-02-06 13:52:43 -0700 903) * We don't want to get "file->f_lock" because it is not
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 904) * necessary. It is not necessary because we're in the "struct file"
25985edcedea6 (Lucas De Marchi 2011-03-30 22:57:33 -0300 905) * cleanup path, and this means that no one is using this file anymore.
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 906) * So, for example, epoll_ctl() cannot hit here since if we reach this
67647d0fb8bc0 (Davide Libenzi 2007-05-15 01:40:52 -0700 907) * point, the file counter already went to zero and fget() would fail.
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 908) * The only hit might come from ep_free() but by holding the mutex
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 909) * will correctly serialize the operation. We do need to acquire
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 910) * "ep->mtx" after "epmutex" because ep_remove() requires it when called
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 911) * from anywhere but ep_free().
684999149002d (Jonathan Corbet 2009-02-06 13:52:43 -0700 912) *
684999149002d (Jonathan Corbet 2009-02-06 13:52:43 -0700 913) * Besides, ep_remove() acquires the lock, so we can't hold it here.
b611967de4dc5 (Davide Libenzi 2006-10-11 01:21:44 -0700 914) */
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 915) mutex_lock(&epmutex);
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 916) if (unlikely(!file->f_ep)) {
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 917) mutex_unlock(&epmutex);
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 918) return;
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 919) }
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 920) hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) {
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 921) ep = epi->ep;
d8805e633e054 (Nelson Elhage 2011-10-31 17:13:14 -0700 922) mutex_lock_nested(&ep->mtx, 0);
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 923) ep_remove(ep, epi);
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 924) mutex_unlock(&ep->mtx);
b611967de4dc5 (Davide Libenzi 2006-10-11 01:21:44 -0700 925) }
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 926) mutex_unlock(&epmutex);
b611967de4dc5 (Davide Libenzi 2006-10-11 01:21:44 -0700 927) }
b611967de4dc5 (Davide Libenzi 2006-10-11 01:21:44 -0700 928)
53d2be79d5981 (Davide Libenzi 2005-09-16 19:28:06 -0700 929) static int ep_alloc(struct eventpoll **pep)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 930) {
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 931) int error;
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 932) struct user_struct *user;
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 933) struct eventpoll *ep;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 934)
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 935) user = get_current_user();
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 936) error = -ENOMEM;
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 937) ep = kzalloc(sizeof(*ep), GFP_KERNEL);
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 938) if (unlikely(!ep))
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 939) goto free_uid;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 940)
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 941) mutex_init(&ep->mtx);
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 942) rwlock_init(&ep->lock);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 943) init_waitqueue_head(&ep->wq);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 944) init_waitqueue_head(&ep->poll_wait);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 945) INIT_LIST_HEAD(&ep->rdllist);
b2ac2ea6296e7 (Davidlohr Bueso 2017-09-08 16:15:18 -0700 946) ep->rbr = RB_ROOT_CACHED;
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 947) ep->ovflist = EP_UNACTIVE_PTR;
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 948) ep->user = user;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 949)
53d2be79d5981 (Davide Libenzi 2005-09-16 19:28:06 -0700 950) *pep = ep;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 951)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 952) return 0;
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 953)
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 954) free_uid:
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 955) free_uid(user);
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 956) return error;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 957) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 958)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 959) /*
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 960) * Search the file inside the eventpoll tree. The RB tree operations
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 961) * are protected by the "mtx" mutex, and ep_find() must be called with
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 962) * "mtx" held.
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 963) */
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 964) static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 965) {
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 966) int kcmp;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 967) struct rb_node *rbp;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 968) struct epitem *epi, *epir = NULL;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 969) struct epoll_filefd ffd;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 970)
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 971) ep_set_ffd(&ffd, file, fd);
b2ac2ea6296e7 (Davidlohr Bueso 2017-09-08 16:15:18 -0700 972) for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 973) epi = rb_entry(rbp, struct epitem, rbn);
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 974) kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 975) if (kcmp > 0)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 976) rbp = rbp->rb_right;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 977) else if (kcmp < 0)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 978) rbp = rbp->rb_left;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 979) else {
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 980) epir = epi;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 981) break;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 982) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 983) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 984)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 985) return epir;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 986) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 987)
bfe3911a91047 (Chris Wilson 2021-02-05 22:00:12 +0000 988) #ifdef CONFIG_KCMP
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 989) static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 990) {
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 991) struct rb_node *rbp;
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 992) struct epitem *epi;
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 993)
b2ac2ea6296e7 (Davidlohr Bueso 2017-09-08 16:15:18 -0700 994) for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 995) epi = rb_entry(rbp, struct epitem, rbn);
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 996) if (epi->ffd.fd == tfd) {
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 997) if (toff == 0)
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 998) return epi;
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 999) else
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1000) toff--;
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1001) }
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1002) cond_resched();
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1003) }
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1004)
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1005) return NULL;
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1006) }
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1007)
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1008) struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1009) unsigned long toff)
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1010) {
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1011) struct file *file_raw;
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1012) struct eventpoll *ep;
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1013) struct epitem *epi;
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1014)
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1015) if (!is_file_epoll(file))
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1016) return ERR_PTR(-EINVAL);
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1017)
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1018) ep = file->private_data;
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1019)
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1020) mutex_lock(&ep->mtx);
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1021) epi = ep_find_tfd(ep, tfd, toff);
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1022) if (epi)
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1023) file_raw = epi->ffd.file;
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1024) else
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1025) file_raw = ERR_PTR(-ENOENT);
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1026) mutex_unlock(&ep->mtx);
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1027)
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1028) return file_raw;
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1029) }
bfe3911a91047 (Chris Wilson 2021-02-05 22:00:12 +0000 1030) #endif /* CONFIG_KCMP */
0791e3644e5ef (Cyrill Gorcunov 2017-07-12 14:34:28 -0700 1031)
a6c67fee9cf09 (Randy Dunlap 2021-03-01 15:25:51 -0700 1032) /*
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1033) * Adds a new entry to the tail of the list in a lockless way, i.e.
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1034) * multiple CPUs are allowed to call this function concurrently.
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1035) *
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1036) * Beware: it is necessary to prevent any other modifications of the
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1037) * existing list until all changes are completed, in other words
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1038) * concurrent list_add_tail_lockless() calls should be protected
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1039) * with a read lock, where write lock acts as a barrier which
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1040) * makes sure all list_add_tail_lockless() calls are fully
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1041) * completed.
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1042) *
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1043) * Also an element can be locklessly added to the list only in one
a6c67fee9cf09 (Randy Dunlap 2021-03-01 15:25:51 -0700 1044) * direction i.e. either to the tail or to the head, otherwise
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1045) * concurrent access will corrupt the list.
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1046) *
a6c67fee9cf09 (Randy Dunlap 2021-03-01 15:25:51 -0700 1047) * Return: %false if element has been already added to the list, %true
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1048) * otherwise.
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1049) */
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1050) static inline bool list_add_tail_lockless(struct list_head *new,
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1051) struct list_head *head)
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1052) {
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1053) struct list_head *prev;
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1054)
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1055) /*
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1056) * This is simple 'new->next = head' operation, but cmpxchg()
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1057) * is used in order to detect that same element has been just
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1058) * added to the list from another CPU: the winner observes
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1059) * new->next == new.
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1060) */
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1061) if (cmpxchg(&new->next, new, head) != new)
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1062) return false;
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1063)
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1064) /*
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1065) * Initially ->next of a new element must be updated with the head
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1066) * (we are inserting to the tail) and only then pointers are atomically
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1067) * exchanged. XCHG guarantees memory ordering, thus ->next should be
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1068) * updated before pointers are actually swapped and pointers are
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1069) * swapped before prev->next is updated.
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1070) */
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1071)
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1072) prev = xchg(&head->prev, new);
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1073)
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1074) /*
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1075) * It is safe to modify prev->next and new->prev, because a new element
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1076) * is added only to the tail and new->next is updated before XCHG.
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1077) */
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1078)
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1079) prev->next = new;
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1080) new->prev = prev;
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1081)
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1082) return true;
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1083) }
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1084)
a6c67fee9cf09 (Randy Dunlap 2021-03-01 15:25:51 -0700 1085) /*
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1086) * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1087) * i.e. multiple CPUs are allowed to call this function concurrently.
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1088) *
a6c67fee9cf09 (Randy Dunlap 2021-03-01 15:25:51 -0700 1089) * Return: %false if epi element has been already chained, %true otherwise.
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1090) */
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1091) static inline bool chain_epi_lockless(struct epitem *epi)
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1092) {
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1093) struct eventpoll *ep = epi->ep;
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1094)
0c54a6a44bf3d (Khazhismel Kumykov 2020-05-07 18:35:59 -0700 1095) /* Fast preliminary check */
0c54a6a44bf3d (Khazhismel Kumykov 2020-05-07 18:35:59 -0700 1096) if (epi->next != EP_UNACTIVE_PTR)
0c54a6a44bf3d (Khazhismel Kumykov 2020-05-07 18:35:59 -0700 1097) return false;
0c54a6a44bf3d (Khazhismel Kumykov 2020-05-07 18:35:59 -0700 1098)
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1099) /* Check that the same epi has not been just chained from another CPU */
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1100) if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1101) return false;
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1102)
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1103) /* Atomically exchange tail */
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1104) epi->next = xchg(&ep->ovflist, epi);
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1105)
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1106) return true;
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1107) }
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1108)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1109) /*
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 1110) * This is the callback that is passed to the wait queue wakeup
bf6a41db7726e (Daniel Baluta 2011-01-30 23:42:29 +0200 1111) * mechanism. It is called by the stored file descriptors when they
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 1112) * have events to report.
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1113) *
a6c67fee9cf09 (Randy Dunlap 2021-03-01 15:25:51 -0700 1114) * This callback takes a read lock in order not to contend with concurrent
a6c67fee9cf09 (Randy Dunlap 2021-03-01 15:25:51 -0700 1115) * events from another file descriptor, thus all modifications to ->rdllist
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1116) * or ->ovflist are lockless. Read lock is paired with the write lock from
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1117) * ep_scan_ready_list(), which stops all list modifications and guarantees
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1118) * that lists state is seen correctly.
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1119) *
 * Another thing worth mentioning is that ep_poll_callback() can be called
 * concurrently for the same @epi from different CPUs if the poll table was
 * initialized with several wait queue entries. Plural wakeups from different
 * CPUs of a single wait queue are serialized by wq.lock, but the case when
 * multiple wait queues are used must be detected separately. This is detected
 * using a cmpxchg() operation.
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1126) */
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;	/* set when the nested ep->poll_wait needs waking */
	struct epitem *epi = ep_item_from_wait(wait);
	struct eventpoll *ep = epi->ep;
	__poll_t pollflags = key_to_poll(key);
	unsigned long flags;
	int ewake = 0;	/* return value: did this count as a delivered wakeup? */

	/* Read lock only: callbacks for other epis may run concurrently. */
	read_lock_irqsave(&ep->lock, flags);

	ep_set_busy_poll_napi_id(epi);

	/*
	 * If the event mask does not contain any poll(2) event, we consider the
	 * descriptor to be disabled. This condition is likely the effect of the
	 * EPOLLONESHOT bit that disables the descriptor when an event is received,
	 * until the next EPOLL_CTL_MOD will be issued.
	 */
	if (!(epi->event.events & ~EP_PRIVATE_BITS))
		goto out_unlock;

	/*
	 * Check the events coming with the callback. At this stage, not
	 * every device reports the events in the "key" parameter of the
	 * callback. We need to be able to handle both cases here, hence the
	 * test for "key" != NULL before the event match test.
	 */
	if (pollflags && !(pollflags & epi->event.events))
		goto out_unlock;

	/*
	 * If we are transferring events to userspace, we can hold no locks
	 * (because we're accessing user memory, and because of linux f_op->poll()
	 * semantics). All the events that happen during that period of time are
	 * chained in ep->ovflist and requeued later on.
	 */
	if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
		if (chain_epi_lockless(epi))
			ep_pm_stay_awake_rcu(epi);
	} else if (!ep_is_linked(epi)) {
		/* In the usual case, add event to ready list. */
		if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
			ep_pm_stay_awake_rcu(epi);
	}

	/*
	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
	 * wait list.
	 */
	if (waitqueue_active(&ep->wq)) {
		if ((epi->event.events & EPOLLEXCLUSIVE) &&
					!(pollflags & POLLFREE)) {
			/*
			 * For exclusive wakeups, only count this wakeup
			 * (ewake) if the reported direction overlaps what
			 * this epi listens for; pollflags == 0 means the
			 * waker gave no event details, so count it
			 * unconditionally.
			 */
			switch (pollflags & EPOLLINOUT_BITS) {
			case EPOLLIN:
				if (epi->event.events & EPOLLIN)
					ewake = 1;
				break;
			case EPOLLOUT:
				if (epi->event.events & EPOLLOUT)
					ewake = 1;
				break;
			case 0:
				ewake = 1;
				break;
			}
		}
		wake_up(&ep->wq);
	}
	if (waitqueue_active(&ep->poll_wait))
		pwake++;

out_unlock:
	read_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(ep, epi);

	/* Non-exclusive waiters always report the wakeup as delivered. */
	if (!(epi->event.events & EPOLLEXCLUSIVE))
		ewake = 1;

	if (pollflags & POLLFREE) {
		/*
		 * If we race with ep_remove_wait_queue() it can miss
		 * ->whead = NULL and do another remove_wait_queue() after
		 * us, so we can't use __remove_wait_queue().
		 */
		list_del_init(&wait->entry);
		/*
		 * ->whead != NULL protects us from the race with ep_free()
		 * or ep_remove(), ep_remove_wait_queue() takes whead->lock
		 * held by the caller. Once we nullify it, nothing protects
		 * ep/epi or even wait.
		 */
		smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
	}

	return ewake;
}
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1227)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1228) /*
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1229) * This is the callback that is used to add our wait queue to the
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1230) * target file wakeup lists.
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1231) */
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1232) static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1233) poll_table *pt)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1234) {
364f374f22ba1 (Al Viro 2020-09-02 11:55:09 -0400 1235) struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt);
364f374f22ba1 (Al Viro 2020-09-02 11:55:09 -0400 1236) struct epitem *epi = epq->epi;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1237) struct eppoll_entry *pwq;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1238)
364f374f22ba1 (Al Viro 2020-09-02 11:55:09 -0400 1239) if (unlikely(!epi)) // an earlier allocation has failed
364f374f22ba1 (Al Viro 2020-09-02 11:55:09 -0400 1240) return;
364f374f22ba1 (Al Viro 2020-09-02 11:55:09 -0400 1241)
364f374f22ba1 (Al Viro 2020-09-02 11:55:09 -0400 1242) pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
364f374f22ba1 (Al Viro 2020-09-02 11:55:09 -0400 1243) if (unlikely(!pwq)) {
364f374f22ba1 (Al Viro 2020-09-02 11:55:09 -0400 1244) epq->epi = NULL;
364f374f22ba1 (Al Viro 2020-09-02 11:55:09 -0400 1245) return;
296e236e96ddd (Davide Libenzi 2009-03-31 15:24:11 -0700 1246) }
364f374f22ba1 (Al Viro 2020-09-02 11:55:09 -0400 1247)
364f374f22ba1 (Al Viro 2020-09-02 11:55:09 -0400 1248) init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
364f374f22ba1 (Al Viro 2020-09-02 11:55:09 -0400 1249) pwq->whead = whead;
364f374f22ba1 (Al Viro 2020-09-02 11:55:09 -0400 1250) pwq->base = epi;
364f374f22ba1 (Al Viro 2020-09-02 11:55:09 -0400 1251) if (epi->event.events & EPOLLEXCLUSIVE)
364f374f22ba1 (Al Viro 2020-09-02 11:55:09 -0400 1252) add_wait_queue_exclusive(whead, &pwq->wait);
364f374f22ba1 (Al Viro 2020-09-02 11:55:09 -0400 1253) else
364f374f22ba1 (Al Viro 2020-09-02 11:55:09 -0400 1254) add_wait_queue(whead, &pwq->wait);
364f374f22ba1 (Al Viro 2020-09-02 11:55:09 -0400 1255) pwq->next = epi->pwqlist;
364f374f22ba1 (Al Viro 2020-09-02 11:55:09 -0400 1256) epi->pwqlist = pwq;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1257) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1258)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1259) static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1260) {
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1261) int kcmp;
b2ac2ea6296e7 (Davidlohr Bueso 2017-09-08 16:15:18 -0700 1262) struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1263) struct epitem *epic;
b2ac2ea6296e7 (Davidlohr Bueso 2017-09-08 16:15:18 -0700 1264) bool leftmost = true;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1265)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1266) while (*p) {
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1267) parent = *p;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1268) epic = rb_entry(parent, struct epitem, rbn);
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 1269) kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
b2ac2ea6296e7 (Davidlohr Bueso 2017-09-08 16:15:18 -0700 1270) if (kcmp > 0) {
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1271) p = &parent->rb_right;
b2ac2ea6296e7 (Davidlohr Bueso 2017-09-08 16:15:18 -0700 1272) leftmost = false;
b2ac2ea6296e7 (Davidlohr Bueso 2017-09-08 16:15:18 -0700 1273) } else
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1274) p = &parent->rb_left;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1275) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1276) rb_link_node(&epi->rbn, parent, p);
b2ac2ea6296e7 (Davidlohr Bueso 2017-09-08 16:15:18 -0700 1277) rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1278) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1279)
a80a6b85b428e (Andrew Morton 2012-11-08 15:53:35 -0800 1280)
a80a6b85b428e (Andrew Morton 2012-11-08 15:53:35 -0800 1281)
#define PATH_ARR_SIZE 5
/*
 * These are the number paths of length 1 to 5, that we are allowing to emanate
 * from a single file of interest. For example, we allow 1000 paths of length
 * 1, to emanate from each file of interest. This essentially represents the
 * potential wakeup paths, which need to be limited in order to avoid massive
 * uncontrolled wakeup storms. The common use case should be a single ep which
 * is connected to n file sources. In this case each file source has 1 path
 * of length 1. Thus, the numbers below should be more than sufficient. These
 * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
 * and delete can't add additional paths. Protected by the epmutex.
 */
static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
/* Scratch counters for one reverse_path_check() pass; see path_count_init(). */
static int path_count[PATH_ARR_SIZE];
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1296)
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1297) static int path_count_inc(int nests)
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1298) {
93dc6107a76da (Jason Baron 2012-03-16 16:34:03 -0400 1299) /* Allow an arbitrary number of depth 1 paths */
93dc6107a76da (Jason Baron 2012-03-16 16:34:03 -0400 1300) if (nests == 0)
93dc6107a76da (Jason Baron 2012-03-16 16:34:03 -0400 1301) return 0;
93dc6107a76da (Jason Baron 2012-03-16 16:34:03 -0400 1302)
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1303) if (++path_count[nests] > path_limits[nests])
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1304) return -1;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1305) return 0;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1306) }
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1307)
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1308) static void path_count_init(void)
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1309) {
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1310) int i;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1311)
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1312) for (i = 0; i < PATH_ARR_SIZE; i++)
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1313) path_count[i] = 0;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1314) }
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1315)
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1316) static int reverse_path_check_proc(struct hlist_head *refs, int depth)
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1317) {
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1318) int error = 0;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1319) struct epitem *epi;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1320)
0c320f776ed83 (Al Viro 2020-09-25 19:48:56 -0400 1321) if (depth > EP_MAX_NESTS) /* too deep nesting */
99d84d4330e8a (Al Viro 2020-08-22 23:08:37 -0400 1322) return -1;
99d84d4330e8a (Al Viro 2020-08-22 23:08:37 -0400 1323)
ae10b2b4eb01b (Jason Baron 2013-11-12 15:10:16 -0800 1324) /* CTL_DEL can remove links here, but that can't increase our count */
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1325) hlist_for_each_entry_rcu(epi, refs, fllink) {
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1326) struct hlist_head *refs = &epi->ep->refs;
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1327) if (hlist_empty(refs))
d16312a46936b (Al Viro 2020-09-26 15:54:05 -0400 1328) error = path_count_inc(depth);
d16312a46936b (Al Viro 2020-09-26 15:54:05 -0400 1329) else
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1330) error = reverse_path_check_proc(refs, depth + 1);
d16312a46936b (Al Viro 2020-09-26 15:54:05 -0400 1331) if (error != 0)
d16312a46936b (Al Viro 2020-09-26 15:54:05 -0400 1332) break;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1333) }
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1334) return error;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1335) }
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1336)
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1337) /**
 * reverse_path_check - The tfile_check_list is a list of epitems_head, which
 *                      have links that are proposed to be newly added. We need
 *                      to make sure that those added links don't add too many
 *                      paths such that we will spend all our time waking up
 *                      eventpoll objects.
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1343) *
a6c67fee9cf09 (Randy Dunlap 2021-03-01 15:25:51 -0700 1344) * Return: %zero if the proposed links don't create too many paths,
a6c67fee9cf09 (Randy Dunlap 2021-03-01 15:25:51 -0700 1345) * %-1 otherwise.
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1346) */
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1347) static int reverse_path_check(void)
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1348) {
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1349) struct epitems_head *p;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1350)
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1351) for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1352) int error;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1353) path_count_init();
b62d2706a7548 (Al Viro 2020-10-01 14:11:00 -0400 1354) rcu_read_lock();
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1355) error = reverse_path_check_proc(&p->epitems, 0);
b62d2706a7548 (Al Viro 2020-10-01 14:11:00 -0400 1356) rcu_read_unlock();
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1357) if (error)
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1358) return error;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1359) }
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1360) return 0;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1361) }
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1362)
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1363) static int ep_create_wakeup_source(struct epitem *epi)
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1364) {
3701cb59d892b (Al Viro 2020-09-24 19:41:58 -0400 1365) struct name_snapshot n;
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1366) struct wakeup_source *ws;
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1367)
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1368) if (!epi->ep->ws) {
c8377adfa7810 (Tri Vo 2019-08-06 18:48:46 -0700 1369) epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1370) if (!epi->ep->ws)
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1371) return -ENOMEM;
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1372) }
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1373)
3701cb59d892b (Al Viro 2020-09-24 19:41:58 -0400 1374) take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
3701cb59d892b (Al Viro 2020-09-24 19:41:58 -0400 1375) ws = wakeup_source_register(NULL, n.name.name);
3701cb59d892b (Al Viro 2020-09-24 19:41:58 -0400 1376) release_dentry_name_snapshot(&n);
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1377)
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1378) if (!ws)
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1379) return -ENOMEM;
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1380) rcu_assign_pointer(epi->ws, ws);
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1381)
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1382) return 0;
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1383) }
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1384)
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1385) /* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1386) static noinline void ep_destroy_wakeup_source(struct epitem *epi)
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1387) {
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1388) struct wakeup_source *ws = ep_wakeup_source(epi);
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1389)
d6d67e7231c97 (Eric Wong 2013-04-30 15:27:43 -0700 1390) RCU_INIT_POINTER(epi->ws, NULL);
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1391)
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1392) /*
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1393) * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1394) * used internally by wakeup_source_remove, too (called by
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1395) * wakeup_source_unregister), so we cannot use call_rcu
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1396) */
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1397) synchronize_rcu();
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1398) wakeup_source_unregister(ws);
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1399) }
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1400)
/*
 * Hook @epi onto the list of epoll items that watch @file.
 *
 * For epoll files the items hang directly off the eventpoll's ->refs list;
 * for other files they hang off a lazily allocated epitems_head pointed to
 * by file->f_ep.  Returns 0 on success, -ENOMEM if that allocation fails.
 */
static int attach_epitem(struct file *file, struct epitem *epi)
{
	struct epitems_head *to_free = NULL;
	struct hlist_head *head = NULL;
	struct eventpoll *ep = NULL;

	if (is_file_epoll(file))
		ep = file->private_data;

	if (ep) {
		head = &ep->refs;
	} else if (!READ_ONCE(file->f_ep)) {
allocate:
		/* Allocate outside f_lock; may be discarded if we lose a race. */
		to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL);
		if (!to_free)
			return -ENOMEM;
		head = &to_free->epitems;
	}
	spin_lock(&file->f_lock);
	if (!file->f_ep) {
		if (unlikely(!head)) {
			/*
			 * f_ep was non-NULL when sampled above but has been
			 * cleared since; retry with a fresh allocation.
			 */
			spin_unlock(&file->f_lock);
			goto allocate;
		}
		/* Install our (possibly freshly allocated) list head. */
		file->f_ep = head;
		to_free = NULL;
	}
	hlist_add_head_rcu(&epi->fllink, file->f_ep);
	spin_unlock(&file->f_lock);
	free_ephead(to_free);	/* releases the allocation if it wasn't installed */
	return 0;
}
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1433)
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 1434) /*
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 1435) * Must be called with "mtx" held.
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 1436) */
bec1a502d34dc (Al Viro 2017-11-28 19:43:33 -0500 1437) static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
67347fe4e6326 (Jason Baron 2013-11-12 15:10:18 -0800 1438) struct file *tfile, int fd, int full_check)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1439) {
d85e2aa2e34da (Al Viro 2018-02-01 15:24:58 -0500 1440) int error, pwake = 0;
d85e2aa2e34da (Al Viro 2018-02-01 15:24:58 -0500 1441) __poll_t revents;
52bd19f7691b2 (Robin Holt 2011-01-12 17:00:01 -0800 1442) long user_watches;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1443) struct epitem *epi;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1444) struct ep_pqueue epq;
85353e919f6eb (Al Viro 2020-09-26 18:15:26 -0400 1445) struct eventpoll *tep = NULL;
85353e919f6eb (Al Viro 2020-09-26 18:15:26 -0400 1446)
85353e919f6eb (Al Viro 2020-09-26 18:15:26 -0400 1447) if (is_file_epoll(tfile))
85353e919f6eb (Al Viro 2020-09-26 18:15:26 -0400 1448) tep = tfile->private_data;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1449)
92e6417840559 (Davidlohr Bueso 2018-08-21 21:56:45 -0700 1450) lockdep_assert_irqs_enabled();
92e6417840559 (Davidlohr Bueso 2018-08-21 21:56:45 -0700 1451)
52bd19f7691b2 (Robin Holt 2011-01-12 17:00:01 -0800 1452) user_watches = atomic_long_read(&ep->user->epoll_watches);
52bd19f7691b2 (Robin Holt 2011-01-12 17:00:01 -0800 1453) if (unlikely(user_watches >= max_user_watches))
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 1454) return -ENOSPC;
d1ec50adb5609 (Al Viro 2020-09-27 11:03:32 -0400 1455) if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL)))
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 1456) return -ENOMEM;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1457)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1458) /* Item initialization follow here ... */
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1459) INIT_LIST_HEAD(&epi->rdllink);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1460) epi->ep = ep;
b030a4dd609e1 (Pekka Enberg 2005-06-23 00:10:03 -0700 1461) ep_set_ffd(&epi->ffd, tfile, fd);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1462) epi->event = *event;
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 1463) epi->next = EP_UNACTIVE_PTR;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1464)
85353e919f6eb (Al Viro 2020-09-26 18:15:26 -0400 1465) if (tep)
85353e919f6eb (Al Viro 2020-09-26 18:15:26 -0400 1466) mutex_lock_nested(&tep->mtx, 1);
f8d4f44df056c (Al Viro 2020-09-09 22:25:06 -0400 1467) /* Add the current item to the list of active epoll hook for this file */
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1468) if (unlikely(attach_epitem(tfile, epi) < 0)) {
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1469) kmem_cache_free(epi_cache, epi);
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1470) if (tep)
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1471) mutex_unlock(&tep->mtx);
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1472) return -ENOMEM;
d9f41e3c95a17 (Al Viro 2020-10-01 16:10:11 -0400 1473) }
f8d4f44df056c (Al Viro 2020-09-09 22:25:06 -0400 1474)
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1475) if (full_check && !tep)
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1476) list_file(tfile);
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1477)
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1478) atomic_long_inc(&ep->user->epoll_watches);
f8d4f44df056c (Al Viro 2020-09-09 22:25:06 -0400 1479)
f8d4f44df056c (Al Viro 2020-09-09 22:25:06 -0400 1480) /*
f8d4f44df056c (Al Viro 2020-09-09 22:25:06 -0400 1481) * Add the current item to the RB tree. All RB tree operations are
f8d4f44df056c (Al Viro 2020-09-09 22:25:06 -0400 1482) * protected by "mtx", and ep_insert() is called with "mtx" held.
f8d4f44df056c (Al Viro 2020-09-09 22:25:06 -0400 1483) */
f8d4f44df056c (Al Viro 2020-09-09 22:25:06 -0400 1484) ep_rbtree_insert(ep, epi);
85353e919f6eb (Al Viro 2020-09-26 18:15:26 -0400 1485) if (tep)
85353e919f6eb (Al Viro 2020-09-26 18:15:26 -0400 1486) mutex_unlock(&tep->mtx);
f8d4f44df056c (Al Viro 2020-09-09 22:25:06 -0400 1487)
f8d4f44df056c (Al Viro 2020-09-09 22:25:06 -0400 1488) /* now check if we've created too many backpaths */
e3e096e7fc30c (Al Viro 2020-09-26 18:09:29 -0400 1489) if (unlikely(full_check && reverse_path_check())) {
e3e096e7fc30c (Al Viro 2020-09-26 18:09:29 -0400 1490) ep_remove(ep, epi);
e3e096e7fc30c (Al Viro 2020-09-26 18:09:29 -0400 1491) return -EINVAL;
e3e096e7fc30c (Al Viro 2020-09-26 18:09:29 -0400 1492) }
f8d4f44df056c (Al Viro 2020-09-09 22:25:06 -0400 1493)
d1ec50adb5609 (Al Viro 2020-09-27 11:03:32 -0400 1494) if (epi->event.events & EPOLLWAKEUP) {
d1ec50adb5609 (Al Viro 2020-09-27 11:03:32 -0400 1495) error = ep_create_wakeup_source(epi);
d1ec50adb5609 (Al Viro 2020-09-27 11:03:32 -0400 1496) if (error) {
d1ec50adb5609 (Al Viro 2020-09-27 11:03:32 -0400 1497) ep_remove(ep, epi);
d1ec50adb5609 (Al Viro 2020-09-27 11:03:32 -0400 1498) return error;
d1ec50adb5609 (Al Viro 2020-09-27 11:03:32 -0400 1499) }
d1ec50adb5609 (Al Viro 2020-09-27 11:03:32 -0400 1500) }
f8d4f44df056c (Al Viro 2020-09-09 22:25:06 -0400 1501)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1502) /* Initialize the poll table using the queue callback */
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1503) epq.epi = epi;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1504) init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1505)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1506) /*
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1507) * Attach the item to the poll hooks and get current event bits.
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1508) * We can safely use the file* here because its usage count has
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 1509) * been increased by the caller of this function. Note that after
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 1510) * this operation completes, the poll callback can start hitting
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 1511) * the new item.
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1512) */
37b5e5212a448 (Jason Baron 2017-11-17 15:29:06 -0800 1513) revents = ep_item_poll(epi, &epq.pt, 1);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1514)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1515) /*
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1516) * We have to check if something went wrong during the poll wait queue
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1517) * install process. Namely an allocation for a wait queue failed due
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1518) * high memory pressure.
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1519) */
e3e096e7fc30c (Al Viro 2020-09-26 18:09:29 -0400 1520) if (unlikely(!epq.epi)) {
e3e096e7fc30c (Al Viro 2020-09-26 18:09:29 -0400 1521) ep_remove(ep, epi);
e3e096e7fc30c (Al Viro 2020-09-26 18:09:29 -0400 1522) return -ENOMEM;
e3e096e7fc30c (Al Viro 2020-09-26 18:09:29 -0400 1523) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1524)
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 1525) /* We have to drop the new item inside our item list to keep track of it */
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1526) write_lock_irq(&ep->lock);
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 1527)
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 1528) /* record NAPI ID of new item if present */
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 1529) ep_set_busy_poll_napi_id(epi);
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 1530)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1531) /* If the file is already "ready" we drop it inside the ready list */
992991c03ca03 (Davidlohr Bueso 2018-08-21 21:58:26 -0700 1532) if (revents && !ep_is_linked(epi)) {
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1533) list_add_tail(&epi->rdllink, &ep->rdllist);
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1534) ep_pm_stay_awake(epi);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1535)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1536) /* Notify waiting tasks that events are available */
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1537) if (waitqueue_active(&ep->wq))
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1538) wake_up(&ep->wq);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1539) if (waitqueue_active(&ep->poll_wait))
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1540) pwake++;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1541) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1542)
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1543) write_unlock_irq(&ep->lock);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1544)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1545) /* We have to call this outside the lock */
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1546) if (pwake)
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 1547) ep_poll_safewake(ep, NULL);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1548)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1549) return 0;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1550) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1551)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1552) /*
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1553) * Modify the interest event mask by dropping an event if the new mask
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 1554) * has a match in the current file status. Must be called with "mtx" held.
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1555) */
bec1a502d34dc (Al Viro 2017-11-28 19:43:33 -0500 1556) static int ep_modify(struct eventpoll *ep, struct epitem *epi,
bec1a502d34dc (Al Viro 2017-11-28 19:43:33 -0500 1557) const struct epoll_event *event)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1558) {
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1559) int pwake = 0;
626cf23660850 (Hans Verkuil 2012-03-23 15:02:27 -0700 1560) poll_table pt;
626cf23660850 (Hans Verkuil 2012-03-23 15:02:27 -0700 1561)
92e6417840559 (Davidlohr Bueso 2018-08-21 21:56:45 -0700 1562) lockdep_assert_irqs_enabled();
92e6417840559 (Davidlohr Bueso 2018-08-21 21:56:45 -0700 1563)
626cf23660850 (Hans Verkuil 2012-03-23 15:02:27 -0700 1564) init_poll_funcptr(&pt, NULL);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1565)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1566) /*
e057e15ff66a6 (Tony Battersby 2009-03-31 15:24:15 -0700 1567) * Set the new event interest mask before calling f_op->poll();
e057e15ff66a6 (Tony Battersby 2009-03-31 15:24:15 -0700 1568) * otherwise we might miss an event that happens between the
e057e15ff66a6 (Tony Battersby 2009-03-31 15:24:15 -0700 1569) * f_op->poll() call and the new event set registering.
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1570) */
128dd1759d96a (Eric Wong 2013-01-01 21:20:27 +0000 1571) epi->event.events = event->events; /* need barrier below */
e057e15ff66a6 (Tony Battersby 2009-03-31 15:24:15 -0700 1572) epi->event.data = event->data; /* protected by mtx */
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1573) if (epi->event.events & EPOLLWAKEUP) {
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1574) if (!ep_has_wakeup_source(epi))
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1575) ep_create_wakeup_source(epi);
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1576) } else if (ep_has_wakeup_source(epi)) {
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1577) ep_destroy_wakeup_source(epi);
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1578) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1579)
128dd1759d96a (Eric Wong 2013-01-01 21:20:27 +0000 1580) /*
128dd1759d96a (Eric Wong 2013-01-01 21:20:27 +0000 1581) * The following barrier has two effects:
128dd1759d96a (Eric Wong 2013-01-01 21:20:27 +0000 1582) *
128dd1759d96a (Eric Wong 2013-01-01 21:20:27 +0000 1583) * 1) Flush epi changes above to other CPUs. This ensures
128dd1759d96a (Eric Wong 2013-01-01 21:20:27 +0000 1584) * we do not miss events from ep_poll_callback if an
128dd1759d96a (Eric Wong 2013-01-01 21:20:27 +0000 1585) * event occurs immediately after we call f_op->poll().
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1586) * We need this because we did not take ep->lock while
128dd1759d96a (Eric Wong 2013-01-01 21:20:27 +0000 1587) * changing epi above (but ep_poll_callback does take
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1588) * ep->lock).
128dd1759d96a (Eric Wong 2013-01-01 21:20:27 +0000 1589) *
128dd1759d96a (Eric Wong 2013-01-01 21:20:27 +0000 1590) * 2) We also need to ensure we do not miss _past_ events
128dd1759d96a (Eric Wong 2013-01-01 21:20:27 +0000 1591) * when calling f_op->poll(). This barrier also
128dd1759d96a (Eric Wong 2013-01-01 21:20:27 +0000 1592) * pairs with the barrier in wq_has_sleeper (see
128dd1759d96a (Eric Wong 2013-01-01 21:20:27 +0000 1593) * comments for wq_has_sleeper).
128dd1759d96a (Eric Wong 2013-01-01 21:20:27 +0000 1594) *
128dd1759d96a (Eric Wong 2013-01-01 21:20:27 +0000 1595) * This barrier will now guarantee ep_poll_callback or f_op->poll
128dd1759d96a (Eric Wong 2013-01-01 21:20:27 +0000 1596) * (or both) will notice the readiness of an item.
128dd1759d96a (Eric Wong 2013-01-01 21:20:27 +0000 1597) */
128dd1759d96a (Eric Wong 2013-01-01 21:20:27 +0000 1598) smp_mb();
128dd1759d96a (Eric Wong 2013-01-01 21:20:27 +0000 1599)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1600) /*
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1601) * Get current event bits. We can safely use the file* here because
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1602) * its usage count has been increased by the caller of this function.
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 1603) * If the item is "hot" and it is not registered inside the ready
67647d0fb8bc0 (Davide Libenzi 2007-05-15 01:40:52 -0700 1604) * list, push it inside.
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1605) */
69112736e2f02 (Al Viro 2017-11-28 19:56:15 -0500 1606) if (ep_item_poll(epi, &pt, 1)) {
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1607) write_lock_irq(&ep->lock);
992991c03ca03 (Davidlohr Bueso 2018-08-21 21:58:26 -0700 1608) if (!ep_is_linked(epi)) {
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 1609) list_add_tail(&epi->rdllink, &ep->rdllist);
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1610) ep_pm_stay_awake(epi);
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 1611)
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 1612) /* Notify waiting tasks that events are available */
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 1613) if (waitqueue_active(&ep->wq))
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1614) wake_up(&ep->wq);
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 1615) if (waitqueue_active(&ep->poll_wait))
c7ea76302547f (Davide Libenzi 2007-05-15 01:40:47 -0700 1616) pwake++;
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 1617) }
a218cc4914209 (Roman Penyaev 2019-03-07 16:28:53 -0800 1618) write_unlock_irq(&ep->lock);
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 1619) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1620)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 1621) /* We have to call this outside the lock */
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 1622) if (pwake)
efcdd350d1f8a (Jason Baron 2020-04-06 20:11:23 -0700 1623) ep_poll_safewake(ep, NULL);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1624)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 1625) return 0;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1626) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1627)
ff07952aeda85 (Al Viro 2020-08-31 13:39:52 -0400 1628) static int ep_send_events(struct eventpoll *ep,
ff07952aeda85 (Al Viro 2020-08-31 13:39:52 -0400 1629) struct epoll_event __user *events, int maxevents)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1630) {
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1631) struct epitem *epi, *tmp;
ff07952aeda85 (Al Viro 2020-08-31 13:39:52 -0400 1632) LIST_HEAD(txlist);
626cf23660850 (Hans Verkuil 2012-03-23 15:02:27 -0700 1633) poll_table pt;
ff07952aeda85 (Al Viro 2020-08-31 13:39:52 -0400 1634) int res = 0;
626cf23660850 (Hans Verkuil 2012-03-23 15:02:27 -0700 1635)
cccd29bf0823b (Soheil Hassas Yeganeh 2020-12-18 14:01:51 -0800 1636) /*
cccd29bf0823b (Soheil Hassas Yeganeh 2020-12-18 14:01:51 -0800 1637) * Always short-circuit for fatal signals to allow threads to make a
cccd29bf0823b (Soheil Hassas Yeganeh 2020-12-18 14:01:51 -0800 1638) * timely exit without the chance of finding more events available and
cccd29bf0823b (Soheil Hassas Yeganeh 2020-12-18 14:01:51 -0800 1639) * fetching repeatedly.
cccd29bf0823b (Soheil Hassas Yeganeh 2020-12-18 14:01:51 -0800 1640) */
cccd29bf0823b (Soheil Hassas Yeganeh 2020-12-18 14:01:51 -0800 1641) if (fatal_signal_pending(current))
cccd29bf0823b (Soheil Hassas Yeganeh 2020-12-18 14:01:51 -0800 1642) return -EINTR;
cccd29bf0823b (Soheil Hassas Yeganeh 2020-12-18 14:01:51 -0800 1643)
626cf23660850 (Hans Verkuil 2012-03-23 15:02:27 -0700 1644) init_poll_funcptr(&pt, NULL);
ff07952aeda85 (Al Viro 2020-08-31 13:39:52 -0400 1645)
57804b1cc4616 (Al Viro 2020-08-31 13:41:30 -0400 1646) mutex_lock(&ep->mtx);
57804b1cc4616 (Al Viro 2020-08-31 13:41:30 -0400 1647) ep_start_scan(ep, &txlist);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1648)
296e236e96ddd (Davide Libenzi 2009-03-31 15:24:11 -0700 1649) /*
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 1650) * We can loop without lock because we are passed a task private list.
57804b1cc4616 (Al Viro 2020-08-31 13:41:30 -0400 1651) * Items cannot vanish during the loop we are holding ep->mtx.
296e236e96ddd (Davide Libenzi 2009-03-31 15:24:11 -0700 1652) */
ff07952aeda85 (Al Viro 2020-08-31 13:39:52 -0400 1653) list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
ff07952aeda85 (Al Viro 2020-08-31 13:39:52 -0400 1654) struct wakeup_source *ws;
ff07952aeda85 (Al Viro 2020-08-31 13:39:52 -0400 1655) __poll_t revents;
21877e1a5b520 (Davidlohr Bueso 2019-01-03 15:27:12 -0800 1656)
ff07952aeda85 (Al Viro 2020-08-31 13:39:52 -0400 1657) if (res >= maxevents)
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1658) break;
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 1659)
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1660) /*
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1661) * Activate ep->ws before deactivating epi->ws to prevent
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1662) * triggering auto-suspend here (in case we reactive epi->ws
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1663) * below).
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1664) *
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1665) * This could be rearranged to delay the deactivation of epi->ws
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1666) * instead, but then epi->ws would temporarily be out of sync
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1667) * with ep_is_linked().
4d7e30d98939a (Arve Hjønnevåg 2012-05-01 21:33:34 +0200 1668) */
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1669) ws = ep_wakeup_source(epi);
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1670) if (ws) {
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1671) if (ws->active)
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1672) __pm_stay_awake(ep->ws);
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1673) __pm_relax(ws);
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1674) }
eea1d585917c5 (Eric Wong 2013-04-30 15:27:39 -0700 1675)
d47de16c72219 (Davide Libenzi 2007-05-15 01:40:41 -0700 1676) list_del_init(&epi->rdllink);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1677)
296e236e96ddd (Davide Libenzi 2009-03-31 15:24:11 -0700 1678) /*
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 1679) * If the event mask intersect the caller-requested one,
57804b1cc4616 (Al Viro 2020-08-31 13:41:30 -0400 1680) * deliver the event to userspace. Again, we are holding ep->mtx,
57804b1cc4616 (Al Viro 2020-08-31 13:41:30 -0400 1681) * so no operations coming from userspace can change the item.
296e236e96ddd (Davide Libenzi 2009-03-31 15:24:11 -0700 1682) */
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1683) revents = ep_item_poll(epi, &pt, 1);
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1684) if (!revents)
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1685) continue;
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1686)
ff07952aeda85 (Al Viro 2020-08-31 13:39:52 -0400 1687) if (__put_user(revents, &events->events) ||
ff07952aeda85 (Al Viro 2020-08-31 13:39:52 -0400 1688) __put_user(epi->event.data, &events->data)) {
ff07952aeda85 (Al Viro 2020-08-31 13:39:52 -0400 1689) list_add(&epi->rdllink, &txlist);
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1690) ep_pm_stay_awake(epi);
ff07952aeda85 (Al Viro 2020-08-31 13:39:52 -0400 1691) if (!res)
ff07952aeda85 (Al Viro 2020-08-31 13:39:52 -0400 1692) res = -EFAULT;
ff07952aeda85 (Al Viro 2020-08-31 13:39:52 -0400 1693) break;
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1694) }
ff07952aeda85 (Al Viro 2020-08-31 13:39:52 -0400 1695) res++;
ff07952aeda85 (Al Viro 2020-08-31 13:39:52 -0400 1696) events++;
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1697) if (epi->event.events & EPOLLONESHOT)
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1698) epi->event.events &= EP_PRIVATE_BITS;
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1699) else if (!(epi->event.events & EPOLLET)) {
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1700) /*
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1701) * If this file has been added with Level
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1702) * Trigger mode, we need to insert back inside
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1703) * the ready list, so that the next call to
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1704) * epoll_wait() will check again the events
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1705) * availability. At this point, no one can insert
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1706) * into ep->rdllist besides us. The epoll_ctl()
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1707) * callers are locked out by
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1708) * ep_scan_ready_list() holding "mtx" and the
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1709) * poll callback will queue them in ep->ovflist.
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1710) */
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1711) list_add_tail(&epi->rdllink, &ep->rdllist);
4e0982a00564c (Davidlohr Bueso 2019-01-03 15:27:05 -0800 1712) ep_pm_stay_awake(epi);
296e236e96ddd (Davide Libenzi 2009-03-31 15:24:11 -0700 1713) }
296e236e96ddd (Davide Libenzi 2009-03-31 15:24:11 -0700 1714) }
57804b1cc4616 (Al Viro 2020-08-31 13:41:30 -0400 1715) ep_done_scan(ep, &txlist);
57804b1cc4616 (Al Viro 2020-08-31 13:41:30 -0400 1716) mutex_unlock(&ep->mtx);
5071f97ec6d74 (Davide Libenzi 2009-03-31 15:24:10 -0700 1717)
ff07952aeda85 (Al Viro 2020-08-31 13:39:52 -0400 1718) return res;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1719) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1720)
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1721) static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
0781b909b5586 (Eric Dumazet 2011-02-01 15:52:35 -0800 1722) {
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1723) struct timespec64 now;
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1724)
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1725) if (ms < 0)
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1726) return NULL;
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1727)
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1728) if (!ms) {
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1729) to->tv_sec = 0;
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1730) to->tv_nsec = 0;
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1731) return to;
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1732) }
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1733)
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1734) to->tv_sec = ms / MSEC_PER_SEC;
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1735) to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);
0781b909b5586 (Eric Dumazet 2011-02-01 15:52:35 -0800 1736)
766b9f928bd5b (Deepa Dinamani 2016-05-19 17:09:05 -0700 1737) ktime_get_ts64(&now);
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1738) *to = timespec64_add_safe(now, *to);
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1739) return to;
0781b909b5586 (Eric Dumazet 2011-02-01 15:52:35 -0800 1740) }
0781b909b5586 (Eric Dumazet 2011-02-01 15:52:35 -0800 1741)
f4d93ad74c181 (Shawn Bohrer 2011-03-22 16:34:47 -0700 1742) /**
a6c67fee9cf09 (Randy Dunlap 2021-03-01 15:25:51 -0700 1743) * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
f4d93ad74c181 (Shawn Bohrer 2011-03-22 16:34:47 -0700 1744) * event buffer.
f4d93ad74c181 (Shawn Bohrer 2011-03-22 16:34:47 -0700 1745) *
f4d93ad74c181 (Shawn Bohrer 2011-03-22 16:34:47 -0700 1746) * @ep: Pointer to the eventpoll context.
f4d93ad74c181 (Shawn Bohrer 2011-03-22 16:34:47 -0700 1747) * @events: Pointer to the userspace buffer where the ready events should be
f4d93ad74c181 (Shawn Bohrer 2011-03-22 16:34:47 -0700 1748) * stored.
f4d93ad74c181 (Shawn Bohrer 2011-03-22 16:34:47 -0700 1749) * @maxevents: Size (in terms of number of events) of the caller event buffer.
f4d93ad74c181 (Shawn Bohrer 2011-03-22 16:34:47 -0700 1750) * @timeout: Maximum timeout for the ready events fetch operation, in
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1751) * timespec. If the timeout is zero, the function will not block,
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1752) * while if the @timeout ptr is NULL, the function will block
f4d93ad74c181 (Shawn Bohrer 2011-03-22 16:34:47 -0700 1753) * until at least one event has been retrieved (or an error
f4d93ad74c181 (Shawn Bohrer 2011-03-22 16:34:47 -0700 1754) * occurred).
f4d93ad74c181 (Shawn Bohrer 2011-03-22 16:34:47 -0700 1755) *
a6c67fee9cf09 (Randy Dunlap 2021-03-01 15:25:51 -0700 1756) * Return: the number of ready events which have been fetched, or an
f4d93ad74c181 (Shawn Bohrer 2011-03-22 16:34:47 -0700 1757) * error code, in case of error.
f4d93ad74c181 (Shawn Bohrer 2011-03-22 16:34:47 -0700 1758) */
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1759) static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1760) int maxevents, struct timespec64 *timeout)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1761) {
e59d3c64cba69 (Soheil Hassas Yeganeh 2020-12-18 14:02:06 -0800 1762) int res, eavail, timed_out = 0;
da8b44d5a9f8b (John Stultz 2016-03-17 14:20:51 -0700 1763) u64 slack = 0;
ac6424b981bce (Ingo Molnar 2017-06-20 12:06:13 +0200 1764) wait_queue_entry_t wait;
95aac7b1cd224 (Shawn Bohrer 2010-10-27 15:34:54 -0700 1765) ktime_t expires, *to = NULL;
95aac7b1cd224 (Shawn Bohrer 2010-10-27 15:34:54 -0700 1766)
679abf381a18e (Davidlohr Bueso 2018-08-21 21:58:23 -0700 1767) lockdep_assert_irqs_enabled();
679abf381a18e (Davidlohr Bueso 2018-08-21 21:58:23 -0700 1768)
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1769) if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1770) slack = select_estimate_accuracy(timeout);
95aac7b1cd224 (Shawn Bohrer 2010-10-27 15:34:54 -0700 1771) to = &expires;
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1772) *to = timespec64_to_ktime(*timeout);
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 1773) } else if (timeout) {
f4d93ad74c181 (Shawn Bohrer 2011-03-22 16:34:47 -0700 1774) /*
f4d93ad74c181 (Shawn Bohrer 2011-03-22 16:34:47 -0700 1775) * Avoid the unnecessary trip to the wait queue loop, if the
e59d3c64cba69 (Soheil Hassas Yeganeh 2020-12-18 14:02:06 -0800 1776) * caller specified a non blocking operation.
f4d93ad74c181 (Shawn Bohrer 2011-03-22 16:34:47 -0700 1777) */
95aac7b1cd224 (Shawn Bohrer 2010-10-27 15:34:54 -0700 1778) timed_out = 1;
95aac7b1cd224 (Shawn Bohrer 2010-10-27 15:34:54 -0700 1779) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1780)
e59d3c64cba69 (Soheil Hassas Yeganeh 2020-12-18 14:02:06 -0800 1781) /*
e59d3c64cba69 (Soheil Hassas Yeganeh 2020-12-18 14:02:06 -0800 1782) * This call is racy: We may or may not see events that are being added
a6c67fee9cf09 (Randy Dunlap 2021-03-01 15:25:51 -0700 1783) * to the ready list under the lock (e.g., in IRQ callbacks). For cases
e59d3c64cba69 (Soheil Hassas Yeganeh 2020-12-18 14:02:06 -0800 1784) * with a non-zero timeout, this thread will check the ready list under
a6c67fee9cf09 (Randy Dunlap 2021-03-01 15:25:51 -0700 1785) * lock and will add to the wait queue. For cases with a zero
e59d3c64cba69 (Soheil Hassas Yeganeh 2020-12-18 14:02:06 -0800 1786) * timeout, the user by definition should not care and will have to
e59d3c64cba69 (Soheil Hassas Yeganeh 2020-12-18 14:02:06 -0800 1787) * recheck again.
e59d3c64cba69 (Soheil Hassas Yeganeh 2020-12-18 14:02:06 -0800 1788) */
e59d3c64cba69 (Soheil Hassas Yeganeh 2020-12-18 14:02:06 -0800 1789) eavail = ep_events_available(ep);
e59d3c64cba69 (Soheil Hassas Yeganeh 2020-12-18 14:02:06 -0800 1790)
00b27634bc471 (Soheil Hassas Yeganeh 2020-12-18 14:02:03 -0800 1791) while (1) {
00b27634bc471 (Soheil Hassas Yeganeh 2020-12-18 14:02:03 -0800 1792) if (eavail) {
00b27634bc471 (Soheil Hassas Yeganeh 2020-12-18 14:02:03 -0800 1793) /*
00b27634bc471 (Soheil Hassas Yeganeh 2020-12-18 14:02:03 -0800 1794) * Try to transfer events to user space. In case we get
00b27634bc471 (Soheil Hassas Yeganeh 2020-12-18 14:02:03 -0800 1795) * 0 events and there's still timeout left over, we go
00b27634bc471 (Soheil Hassas Yeganeh 2020-12-18 14:02:03 -0800 1796) * trying again in search of more luck.
00b27634bc471 (Soheil Hassas Yeganeh 2020-12-18 14:02:03 -0800 1797) */
00b27634bc471 (Soheil Hassas Yeganeh 2020-12-18 14:02:03 -0800 1798) res = ep_send_events(ep, events, maxevents);
00b27634bc471 (Soheil Hassas Yeganeh 2020-12-18 14:02:03 -0800 1799) if (res)
00b27634bc471 (Soheil Hassas Yeganeh 2020-12-18 14:02:03 -0800 1800) return res;
00b27634bc471 (Soheil Hassas Yeganeh 2020-12-18 14:02:03 -0800 1801) }
00b27634bc471 (Soheil Hassas Yeganeh 2020-12-18 14:02:03 -0800 1802)
00b27634bc471 (Soheil Hassas Yeganeh 2020-12-18 14:02:03 -0800 1803) if (timed_out)
00b27634bc471 (Soheil Hassas Yeganeh 2020-12-18 14:02:03 -0800 1804) return 0;
00b27634bc471 (Soheil Hassas Yeganeh 2020-12-18 14:02:03 -0800 1805)
00b27634bc471 (Soheil Hassas Yeganeh 2020-12-18 14:02:03 -0800 1806) eavail = ep_busy_loop(ep, timed_out);
e8c85328b1e88 (Soheil Hassas Yeganeh 2020-12-18 14:02:00 -0800 1807) if (eavail)
00b27634bc471 (Soheil Hassas Yeganeh 2020-12-18 14:02:03 -0800 1808) continue;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1809)
2efdaf7660c40 (Soheil Hassas Yeganeh 2020-12-18 14:01:48 -0800 1810) if (signal_pending(current))
2efdaf7660c40 (Soheil Hassas Yeganeh 2020-12-18 14:01:48 -0800 1811) return -EINTR;
2efdaf7660c40 (Soheil Hassas Yeganeh 2020-12-18 14:01:48 -0800 1812)
412895f03cbf9 (Roman Penyaev 2020-05-07 18:36:16 -0700 1813) /*
412895f03cbf9 (Roman Penyaev 2020-05-07 18:36:16 -0700 1814) * Internally init_wait() uses autoremove_wake_function(),
412895f03cbf9 (Roman Penyaev 2020-05-07 18:36:16 -0700 1815) * thus wait entry is removed from the wait queue on each
412895f03cbf9 (Roman Penyaev 2020-05-07 18:36:16 -0700 1816) * wakeup. Why it is important? In case of several waiters
412895f03cbf9 (Roman Penyaev 2020-05-07 18:36:16 -0700 1817) * each new wakeup will hit the next waiter, giving it the
412895f03cbf9 (Roman Penyaev 2020-05-07 18:36:16 -0700 1818) * chance to harvest new event. Otherwise wakeup can be
412895f03cbf9 (Roman Penyaev 2020-05-07 18:36:16 -0700 1819) * lost. This is also good performance-wise, because on
412895f03cbf9 (Roman Penyaev 2020-05-07 18:36:16 -0700 1820) * normal wakeup path no need to call __remove_wait_queue()
412895f03cbf9 (Roman Penyaev 2020-05-07 18:36:16 -0700 1821) * explicitly, thus ep->lock is not taken, which halts the
412895f03cbf9 (Roman Penyaev 2020-05-07 18:36:16 -0700 1822) * event delivery.
412895f03cbf9 (Roman Penyaev 2020-05-07 18:36:16 -0700 1823) */
412895f03cbf9 (Roman Penyaev 2020-05-07 18:36:16 -0700 1824) init_wait(&wait);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1825)
65759097d804d (Roman Penyaev 2020-05-13 17:50:38 -0700 1826) write_lock_irq(&ep->lock);
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 1827) /*
65759097d804d (Roman Penyaev 2020-05-13 17:50:38 -0700 1828) * Barrierless variant, waitqueue_active() is called under
65759097d804d (Roman Penyaev 2020-05-13 17:50:38 -0700 1829) * the same lock on wakeup ep_poll_callback() side, so it
65759097d804d (Roman Penyaev 2020-05-13 17:50:38 -0700 1830) * is safe to avoid an explicit barrier.
bf3b9f6372c45 (Sridhar Samudrala 2017-03-24 10:08:30 -0700 1831) */
65759097d804d (Roman Penyaev 2020-05-13 17:50:38 -0700 1832) __set_current_state(TASK_INTERRUPTIBLE);
65759097d804d (Roman Penyaev 2020-05-13 17:50:38 -0700 1833)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1834) /*
65759097d804d (Roman Penyaev 2020-05-13 17:50:38 -0700 1835) * Do the final check under the lock. ep_scan_ready_list()
65759097d804d (Roman Penyaev 2020-05-13 17:50:38 -0700 1836) * plays with two lists (->rdllist and ->ovflist) and there
65759097d804d (Roman Penyaev 2020-05-13 17:50:38 -0700 1837) * is always a race when both lists are empty for short
65759097d804d (Roman Penyaev 2020-05-13 17:50:38 -0700 1838) * period of time although events are pending, so lock is
65759097d804d (Roman Penyaev 2020-05-13 17:50:38 -0700 1839) * important.
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1840) */
65759097d804d (Roman Penyaev 2020-05-13 17:50:38 -0700 1841) eavail = ep_events_available(ep);
2efdaf7660c40 (Soheil Hassas Yeganeh 2020-12-18 14:01:48 -0800 1842) if (!eavail)
2efdaf7660c40 (Soheil Hassas Yeganeh 2020-12-18 14:01:48 -0800 1843) __add_wait_queue_exclusive(&ep->wq, &wait);
2efdaf7660c40 (Soheil Hassas Yeganeh 2020-12-18 14:01:48 -0800 1844)
65759097d804d (Roman Penyaev 2020-05-13 17:50:38 -0700 1845) write_unlock_irq(&ep->lock);
95aac7b1cd224 (Shawn Bohrer 2010-10-27 15:34:54 -0700 1846)
2efdaf7660c40 (Soheil Hassas Yeganeh 2020-12-18 14:01:48 -0800 1847) if (!eavail)
289caf5d8f6c6 (Soheil Hassas Yeganeh 2020-12-18 14:01:44 -0800 1848) timed_out = !schedule_hrtimeout_range(to, slack,
289caf5d8f6c6 (Soheil Hassas Yeganeh 2020-12-18 14:01:44 -0800 1849) HRTIMER_MODE_ABS);
e411596d48b5b (Soheil Hassas Yeganeh 2020-12-18 14:01:54 -0800 1850) __set_current_state(TASK_RUNNING);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1851)
289caf5d8f6c6 (Soheil Hassas Yeganeh 2020-12-18 14:01:44 -0800 1852) /*
289caf5d8f6c6 (Soheil Hassas Yeganeh 2020-12-18 14:01:44 -0800 1853) * We were woken up, thus go and try to harvest some events.
289caf5d8f6c6 (Soheil Hassas Yeganeh 2020-12-18 14:01:44 -0800 1854) * If timed out and still on the wait queue, recheck eavail
289caf5d8f6c6 (Soheil Hassas Yeganeh 2020-12-18 14:01:44 -0800 1855) * carefully under lock, below.
289caf5d8f6c6 (Soheil Hassas Yeganeh 2020-12-18 14:01:44 -0800 1856) */
412895f03cbf9 (Roman Penyaev 2020-05-07 18:36:16 -0700 1857) eavail = 1;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1858)
e8c85328b1e88 (Soheil Hassas Yeganeh 2020-12-18 14:02:00 -0800 1859) if (!list_empty_careful(&wait.entry)) {
e8c85328b1e88 (Soheil Hassas Yeganeh 2020-12-18 14:02:00 -0800 1860) write_lock_irq(&ep->lock);
e8c85328b1e88 (Soheil Hassas Yeganeh 2020-12-18 14:02:00 -0800 1861) /*
e8c85328b1e88 (Soheil Hassas Yeganeh 2020-12-18 14:02:00 -0800 1862) * If the thread timed out and is not on the wait queue,
e8c85328b1e88 (Soheil Hassas Yeganeh 2020-12-18 14:02:00 -0800 1863) * it means that the thread was woken up after its
e8c85328b1e88 (Soheil Hassas Yeganeh 2020-12-18 14:02:00 -0800 1864) * timeout expired before it could reacquire the lock.
e8c85328b1e88 (Soheil Hassas Yeganeh 2020-12-18 14:02:00 -0800 1865) * Thus, when wait.entry is empty, it needs to harvest
e8c85328b1e88 (Soheil Hassas Yeganeh 2020-12-18 14:02:00 -0800 1866) * events.
e8c85328b1e88 (Soheil Hassas Yeganeh 2020-12-18 14:02:00 -0800 1867) */
e8c85328b1e88 (Soheil Hassas Yeganeh 2020-12-18 14:02:00 -0800 1868) if (timed_out)
e8c85328b1e88 (Soheil Hassas Yeganeh 2020-12-18 14:02:00 -0800 1869) eavail = list_empty(&wait.entry);
e8c85328b1e88 (Soheil Hassas Yeganeh 2020-12-18 14:02:00 -0800 1870) __remove_wait_queue(&ep->wq, &wait);
e8c85328b1e88 (Soheil Hassas Yeganeh 2020-12-18 14:02:00 -0800 1871) write_unlock_irq(&ep->lock);
e8c85328b1e88 (Soheil Hassas Yeganeh 2020-12-18 14:02:00 -0800 1872) }
00b27634bc471 (Soheil Hassas Yeganeh 2020-12-18 14:02:03 -0800 1873) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1874) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 1875)
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1876) /**
773318eddbacf (Al Viro 2020-08-22 23:13:27 -0400 1877) * ep_loop_check_proc - verify that adding an epoll file inside another
a6c67fee9cf09 (Randy Dunlap 2021-03-01 15:25:51 -0700 1878) * epoll structure does not violate the constraints, in
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1879) * terms of closed loops, or too deep chains (which can
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1880) * result in excessive stack usage).
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1881) *
a6c67fee9cf09 (Randy Dunlap 2021-03-01 15:25:51 -0700 1882) * @ep: the &struct eventpoll to be currently checked.
bde03c4c1a6b3 (Al Viro 2020-09-26 16:50:57 -0400 1883) * @depth: Current depth of the path being checked.
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1884) *
a6c67fee9cf09 (Randy Dunlap 2021-03-01 15:25:51 -0700 1885) * Return: %zero if adding the epoll @file inside current epoll
a6c67fee9cf09 (Randy Dunlap 2021-03-01 15:25:51 -0700 1886) * structure @ep does not violate the constraints, or %-1 otherwise.
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1887) */
bde03c4c1a6b3 (Al Viro 2020-09-26 16:50:57 -0400 1888) static int ep_loop_check_proc(struct eventpoll *ep, int depth)
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1889) {
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1890) int error = 0;
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1891) struct rb_node *rbp;
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1892) struct epitem *epi;
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1893)
773318eddbacf (Al Viro 2020-08-22 23:13:27 -0400 1894) mutex_lock_nested(&ep->mtx, depth + 1);
18306c404abe1 (Al Viro 2020-09-10 08:30:05 -0400 1895) ep->gen = loop_check_gen;
b2ac2ea6296e7 (Davidlohr Bueso 2017-09-08 16:15:18 -0700 1896) for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1897) epi = rb_entry(rbp, struct epitem, rbn);
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1898) if (unlikely(is_file_epoll(epi->ffd.file))) {
bde03c4c1a6b3 (Al Viro 2020-09-26 16:50:57 -0400 1899) struct eventpoll *ep_tovisit;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1900) ep_tovisit = epi->ffd.file->private_data;
18306c404abe1 (Al Viro 2020-09-10 08:30:05 -0400 1901) if (ep_tovisit->gen == loop_check_gen)
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1902) continue;
bde03c4c1a6b3 (Al Viro 2020-09-26 16:50:57 -0400 1903) if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
56c428cac5a2c (Al Viro 2020-09-26 16:38:44 -0400 1904) error = -1;
bde03c4c1a6b3 (Al Viro 2020-09-26 16:50:57 -0400 1905) else
bde03c4c1a6b3 (Al Viro 2020-09-26 16:50:57 -0400 1906) error = ep_loop_check_proc(ep_tovisit, depth + 1);
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1907) if (error != 0)
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1908) break;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1909) } else {
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1910) /*
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1911) * If we've reached a file that is not associated with
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1912) * an ep, then we need to check if the newly added
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1913) * links are going to add too many wakeup paths. We do
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1914) * this by adding it to the tfile_check_list, if it's
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1915) * not already there, and calling reverse_path_check()
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1916) * during ep_insert().
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1917) */
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1918) list_file(epi->ffd.file);
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1919) }
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1920) }
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1921) mutex_unlock(&ep->mtx);
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1922)
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1923) return error;
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1924) }
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1925)
/**
 * ep_loop_check - Performs a check to verify that adding an epoll file (@to)
 *                 into another epoll file (represented by @ep) does not create
 *                 closed loops or too deep chains.
 *
 * @ep: Pointer to the epoll we are inserting into.
 * @to: Pointer to the epoll to be inserted.
 *
 * Return: %zero if adding the epoll @to inside the epoll @ep
 *         does not violate the constraints, or %-1 otherwise.
 */
static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
{
	/*
	 * Record the destination ep in the global; ep_loop_check_proc()
	 * compares each nested ep against it to detect a closed loop.
	 * NOTE(review): relies on the caller serializing loop checks
	 * (do_epoll_ctl() holds epmutex around this path).
	 */
	inserting_into = ep;
	return ep_loop_check_proc(to, 0);
}
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1942)
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1943) static void clear_tfile_check_list(void)
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1944) {
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1945) rcu_read_lock();
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1946) while (tfile_check_list != EP_UNACTIVE_PTR) {
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1947) struct epitems_head *head = tfile_check_list;
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1948) tfile_check_list = head->next;
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1949) unlist_file(head);
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1950) }
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 1951) rcu_read_unlock();
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1952) }
22bacca48a175 (Davide Libenzi 2011-02-25 14:44:12 -0800 1953)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 1954) /*
523723bb5032f (Andrew Morton 2008-08-12 15:09:01 -0700 1955) * Open an eventpoll file descriptor.
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 1956) */
791eb22eef0d0 (Dominik Brodowski 2018-03-11 11:34:30 +0100 1957) static int do_epoll_create(int flags)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 1958) {
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1959) int error, fd;
bb57c3edcd2fc (Davide Libenzi 2009-03-31 15:24:12 -0700 1960) struct eventpoll *ep = NULL;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1961) struct file *file;
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 1962)
e38b36f325153 (Ulrich Drepper 2008-07-23 21:29:42 -0700 1963) /* Check the EPOLL_* constant for consistency. */
e38b36f325153 (Ulrich Drepper 2008-07-23 21:29:42 -0700 1964) BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
e38b36f325153 (Ulrich Drepper 2008-07-23 21:29:42 -0700 1965)
296e236e96ddd (Davide Libenzi 2009-03-31 15:24:11 -0700 1966) if (flags & ~EPOLL_CLOEXEC)
296e236e96ddd (Davide Libenzi 2009-03-31 15:24:11 -0700 1967) return -EINVAL;
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 1968) /*
bb57c3edcd2fc (Davide Libenzi 2009-03-31 15:24:12 -0700 1969) * Create the internal data structure ("struct eventpoll").
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 1970) */
9fe5ad9c8cef9 (Ulrich Drepper 2008-07-23 21:29:43 -0700 1971) error = ep_alloc(&ep);
bb57c3edcd2fc (Davide Libenzi 2009-03-31 15:24:12 -0700 1972) if (error < 0)
bb57c3edcd2fc (Davide Libenzi 2009-03-31 15:24:12 -0700 1973) return error;
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 1974) /*
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 1975) * Creates all the items needed to setup an eventpoll file. That is,
2030a42cecd4d (Al Viro 2008-02-23 06:46:49 -0500 1976) * a file structure and a free file descriptor.
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 1977) */
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1978) fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1979) if (fd < 0) {
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1980) error = fd;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1981) goto out_free_ep;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1982) }
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1983) file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
628ff7c1d8d84 (Roland Dreier 2009-12-18 09:41:24 -0800 1984) O_RDWR | (flags & O_CLOEXEC));
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1985) if (IS_ERR(file)) {
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1986) error = PTR_ERR(file);
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1987) goto out_free_fd;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1988) }
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1989) ep->file = file;
98022748f6c7b (Al Viro 2012-08-17 22:42:36 -0400 1990) fd_install(fd, file);
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1991) return fd;
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1992)
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1993) out_free_fd:
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1994) put_unused_fd(fd);
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1995) out_free_ep:
28d82dc1c4edb (Jason Baron 2012-01-12 17:17:43 -0800 1996) ep_free(ep);
bb57c3edcd2fc (Davide Libenzi 2009-03-31 15:24:12 -0700 1997) return error;
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 1998) }
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 1999)
/* epoll_create1(2): open an eventpoll file descriptor honouring @flags. */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	return do_epoll_create(flags);
}
791eb22eef0d0 (Dominik Brodowski 2018-03-11 11:34:30 +0100 2004)
SYSCALL_DEFINE1(epoll_create, int, size)
{
	/*
	 * The legacy size hint is only validated for sanity; it is not
	 * forwarded to do_epoll_create(), which sizes everything
	 * dynamically.
	 */
	if (size <= 0)
		return -EINVAL;

	return do_epoll_create(0);
}
a0998b50c3f0b (Ulrich Drepper 2008-07-23 21:29:27 -0700 2012)
39220e8d4a2aa (Jens Axboe 2020-01-08 15:05:37 -0700 2013) static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
39220e8d4a2aa (Jens Axboe 2020-01-08 15:05:37 -0700 2014) bool nonblock)
39220e8d4a2aa (Jens Axboe 2020-01-08 15:05:37 -0700 2015) {
39220e8d4a2aa (Jens Axboe 2020-01-08 15:05:37 -0700 2016) if (!nonblock) {
39220e8d4a2aa (Jens Axboe 2020-01-08 15:05:37 -0700 2017) mutex_lock_nested(mutex, depth);
39220e8d4a2aa (Jens Axboe 2020-01-08 15:05:37 -0700 2018) return 0;
39220e8d4a2aa (Jens Axboe 2020-01-08 15:05:37 -0700 2019) }
39220e8d4a2aa (Jens Axboe 2020-01-08 15:05:37 -0700 2020) if (mutex_trylock(mutex))
39220e8d4a2aa (Jens Axboe 2020-01-08 15:05:37 -0700 2021) return 0;
39220e8d4a2aa (Jens Axboe 2020-01-08 15:05:37 -0700 2022) return -EAGAIN;
39220e8d4a2aa (Jens Axboe 2020-01-08 15:05:37 -0700 2023) }
39220e8d4a2aa (Jens Axboe 2020-01-08 15:05:37 -0700 2024)
/*
 * Core of epoll_ctl(2): apply @op (EPOLL_CTL_ADD/DEL/MOD) for target @fd
 * to the eventpoll instance referred to by @epfd.  @epds is the kernel
 * copy of the user's event (unused for EPOLL_CTL_DEL).  When @nonblock
 * is true, mutex acquisition uses trylock and the call fails with
 * -EAGAIN on contention instead of sleeping.
 */
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
		 bool nonblock)
{
	int error;
	int full_check = 0;
	struct fd f, tf;
	struct eventpoll *ep;
	struct epitem *epi;
	struct eventpoll *tep = NULL;

	error = -EBADF;
	f = fdget(epfd);
	if (!f.file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	tf = fdget(fd);
	if (!tf.file)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
	if (!file_can_poll(tf.file))
		goto error_tgt_fput;

	/* Check if EPOLLWAKEUP is allowed */
	if (ep_op_has_event(op))
		ep_take_care_of_epollwakeup(epds);

	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file. And also we do not permit
	 * adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
	if (f.file == tf.file || !is_file_epoll(f.file))
		goto error_tgt_fput;

	/*
	 * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
	 * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
	 * Also, we do not currently supported nested exclusive wakeups.
	 */
	if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
		if (op == EPOLL_CTL_MOD)
			goto error_tgt_fput;
		if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
				(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
			goto error_tgt_fput;
	}

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = f.file->private_data;

	/*
	 * When we insert an epoll file descriptor inside another epoll file
	 * descriptor, there is the chance of creating closed loops, which are
	 * better be handled here, than in more critical paths. While we are
	 * checking for loops we also determine the list of files reachable
	 * and hang them on the tfile_check_list, so we can check that we
	 * haven't created too many possible wakeup paths.
	 *
	 * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
	 * the epoll file descriptor is attaching directly to a wakeup source,
	 * unless the epoll file descriptor is nested. The purpose of taking the
	 * 'epmutex' on add is to prevent complex toplogies such as loops and
	 * deep wakeup paths from forming in parallel through multiple
	 * EPOLL_CTL_ADD operations.
	 */
	error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
	if (error)
		goto error_tgt_fput;
	if (op == EPOLL_CTL_ADD) {
		if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
		    is_file_epoll(tf.file)) {
			/*
			 * A full (global) check is needed: drop ep->mtx and
			 * reacquire it after epmutex to keep the lock order
			 * epmutex -> ep->mtx.
			 */
			mutex_unlock(&ep->mtx);
			error = epoll_mutex_lock(&epmutex, 0, nonblock);
			if (error)
				goto error_tgt_fput;
			/* Open a new loop-check generation (see ep->gen). */
			loop_check_gen++;
			full_check = 1;
			if (is_file_epoll(tf.file)) {
				tep = tf.file->private_data;
				error = -ELOOP;
				if (ep_loop_check(ep, tep) != 0)
					goto error_tgt_fput;
			}
			error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
			if (error)
				goto error_tgt_fput;
		}
	}

	/*
	 * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	epi = ep_find(ep, tf.file, fd);

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			/* EPOLLERR/EPOLLHUP are always reported (POSIX). */
			epds->events |= EPOLLERR | EPOLLHUP;
			error = ep_insert(ep, epds, tf.file, fd, full_check);
		} else
			error = -EEXIST;
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			/*
			 * Modifying an EPOLLEXCLUSIVE entry is not
			 * supported; the -EINVAL preset above is kept.
			 */
			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
				epds->events |= EPOLLERR | EPOLLHUP;
				error = ep_modify(ep, epi, epds);
			}
		} else
			error = -ENOENT;
		break;
	}
	mutex_unlock(&ep->mtx);

error_tgt_fput:
	/*
	 * Undo the global loop-check state (list, generation, epmutex)
	 * taken above for full checks, on both success and error paths.
	 */
	if (full_check) {
		clear_tfile_check_list();
		loop_check_gen++;
		mutex_unlock(&epmutex);
	}

	fdput(tf);
error_fput:
	fdput(f);
error_return:

	return error;
}
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2169)
58e41a44c488f (Jens Axboe 2020-01-08 14:35:13 -0700 2170) /*
58e41a44c488f (Jens Axboe 2020-01-08 14:35:13 -0700 2171) * The following function implements the controller interface for
58e41a44c488f (Jens Axboe 2020-01-08 14:35:13 -0700 2172) * the eventpoll file that enables the insertion/removal/change of
58e41a44c488f (Jens Axboe 2020-01-08 14:35:13 -0700 2173) * file descriptors inside the interest set.
58e41a44c488f (Jens Axboe 2020-01-08 14:35:13 -0700 2174) */
58e41a44c488f (Jens Axboe 2020-01-08 14:35:13 -0700 2175) SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
58e41a44c488f (Jens Axboe 2020-01-08 14:35:13 -0700 2176) struct epoll_event __user *, event)
58e41a44c488f (Jens Axboe 2020-01-08 14:35:13 -0700 2177) {
58e41a44c488f (Jens Axboe 2020-01-08 14:35:13 -0700 2178) struct epoll_event epds;
58e41a44c488f (Jens Axboe 2020-01-08 14:35:13 -0700 2179)
58e41a44c488f (Jens Axboe 2020-01-08 14:35:13 -0700 2180) if (ep_op_has_event(op) &&
58e41a44c488f (Jens Axboe 2020-01-08 14:35:13 -0700 2181) copy_from_user(&epds, event, sizeof(struct epoll_event)))
58e41a44c488f (Jens Axboe 2020-01-08 14:35:13 -0700 2182) return -EFAULT;
58e41a44c488f (Jens Axboe 2020-01-08 14:35:13 -0700 2183)
39220e8d4a2aa (Jens Axboe 2020-01-08 15:05:37 -0700 2184) return do_epoll_ctl(epfd, op, fd, &epds, false);
58e41a44c488f (Jens Axboe 2020-01-08 14:35:13 -0700 2185) }
58e41a44c488f (Jens Axboe 2020-01-08 14:35:13 -0700 2186)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2187) /*
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2188) * Implement the event wait interface for the eventpoll file. It is the kernel
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2189) * part of the user space epoll_wait(2).
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2190) */
791eb22eef0d0 (Dominik Brodowski 2018-03-11 11:34:30 +0100 2191) static int do_epoll_wait(int epfd, struct epoll_event __user *events,
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 2192) int maxevents, struct timespec64 *to)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2193) {
2903ff019b346 (Al Viro 2012-08-28 12:52:22 -0400 2194) int error;
2903ff019b346 (Al Viro 2012-08-28 12:52:22 -0400 2195) struct fd f;
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2196) struct eventpoll *ep;
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2197)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2198) /* The maximum number of event must be greater than zero */
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2199) if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2200) return -EINVAL;
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2201)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2202) /* Verify that the area passed by the user is writeable */
96d4f267e40f9 (Linus Torvalds 2019-01-03 18:57:57 -0800 2203) if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
2903ff019b346 (Al Viro 2012-08-28 12:52:22 -0400 2204) return -EFAULT;
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2205)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2206) /* Get the "struct file *" for the eventpoll file */
2903ff019b346 (Al Viro 2012-08-28 12:52:22 -0400 2207) f = fdget(epfd);
2903ff019b346 (Al Viro 2012-08-28 12:52:22 -0400 2208) if (!f.file)
2903ff019b346 (Al Viro 2012-08-28 12:52:22 -0400 2209) return -EBADF;
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2210)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2211) /*
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2212) * We have to check that the file structure underneath the fd
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2213) * the user passed to us _is_ an eventpoll file.
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2214) */
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2215) error = -EINVAL;
2903ff019b346 (Al Viro 2012-08-28 12:52:22 -0400 2216) if (!is_file_epoll(f.file))
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2217) goto error_fput;
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2218)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2219) /*
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2220) * At this point it is safe to assume that the "private_data" contains
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2221) * our own data structure.
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2222) */
2903ff019b346 (Al Viro 2012-08-28 12:52:22 -0400 2223) ep = f.file->private_data;
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2224)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2225) /* Time to fish for events ... */
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 2226) error = ep_poll(ep, events, maxevents, to);
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2227)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2228) error_fput:
2903ff019b346 (Al Viro 2012-08-28 12:52:22 -0400 2229) fdput(f);
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2230) return error;
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2231) }
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2232)
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	struct timespec64 to;

	/*
	 * Convert the millisecond timeout into a timespec64 on the stack;
	 * do_epoll_wait() takes whatever pointer ep_timeout_to_timespec()
	 * returns (presumably NULL for "wait forever" — see its definition).
	 */
	return do_epoll_wait(epfd, events, maxevents,
			     ep_timeout_to_timespec(&to, timeout));
}
791eb22eef0d0 (Dominik Brodowski 2018-03-11 11:34:30 +0100 2241)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2242) /*
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2243) * Implement the event wait interface for the eventpoll file. It is the kernel
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2244) * part of the user space epoll_pwait(2).
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2245) */
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2246) static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2247) int maxevents, struct timespec64 *to,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2248) const sigset_t __user *sigmask, size_t sigsetsize)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2249) {
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2250) int error;
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2251)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2252) /*
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2253) * If the caller wants a certain signal mask to be set during the wait,
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2254) * we apply it here.
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2255) */
b772434be0891 (Oleg Nesterov 2019-07-16 16:29:53 -0700 2256) error = set_user_sigmask(sigmask, sigsetsize);
ded653ccbec03 (Deepa Dinamani 2018-09-19 21:41:04 -0700 2257) if (error)
ded653ccbec03 (Deepa Dinamani 2018-09-19 21:41:04 -0700 2258) return error;
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2259)
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2260) error = do_epoll_wait(epfd, events, maxevents, to);
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 2261)
b772434be0891 (Oleg Nesterov 2019-07-16 16:29:53 -0700 2262) restore_saved_sigmask_unless(error == -EINTR);
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2263)
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2264) return error;
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2265) }
7699acd1341c6 (Davide Libenzi 2007-05-10 22:23:23 -0700 2266)
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2267) SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2268) int, maxevents, int, timeout, const sigset_t __user *, sigmask,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2269) size_t, sigsetsize)
35280bd4a3fa8 (Al Viro 2013-02-24 14:52:17 -0500 2270) {
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 2271) struct timespec64 to;
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2272)
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2273) return do_epoll_pwait(epfd, events, maxevents,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2274) ep_timeout_to_timespec(&to, timeout),
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2275) sigmask, sigsetsize);
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2276) }
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2277)
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2278) SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2279) int, maxevents, const struct __kernel_timespec __user *, timeout,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2280) const sigset_t __user *, sigmask, size_t, sigsetsize)
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2281) {
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2282) struct timespec64 ts, *to = NULL;
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2283)
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2284) if (timeout) {
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2285) if (get_timespec64(&ts, timeout))
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2286) return -EFAULT;
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2287) to = &ts;
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2288) if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2289) return -EINVAL;
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2290) }
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2291)
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2292) return do_epoll_pwait(epfd, events, maxevents, to,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2293) sigmask, sigsetsize);
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2294) }
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2295)
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2296) #ifdef CONFIG_COMPAT
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2297) static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2298) int maxevents, struct timespec64 *timeout,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2299) const compat_sigset_t __user *sigmask,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2300) compat_size_t sigsetsize)
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2301) {
35280bd4a3fa8 (Al Viro 2013-02-24 14:52:17 -0500 2302) long err;
35280bd4a3fa8 (Al Viro 2013-02-24 14:52:17 -0500 2303)
35280bd4a3fa8 (Al Viro 2013-02-24 14:52:17 -0500 2304) /*
35280bd4a3fa8 (Al Viro 2013-02-24 14:52:17 -0500 2305) * If the caller wants a certain signal mask to be set during the wait,
35280bd4a3fa8 (Al Viro 2013-02-24 14:52:17 -0500 2306) * we apply it here.
35280bd4a3fa8 (Al Viro 2013-02-24 14:52:17 -0500 2307) */
b772434be0891 (Oleg Nesterov 2019-07-16 16:29:53 -0700 2308) err = set_compat_user_sigmask(sigmask, sigsetsize);
ded653ccbec03 (Deepa Dinamani 2018-09-19 21:41:04 -0700 2309) if (err)
ded653ccbec03 (Deepa Dinamani 2018-09-19 21:41:04 -0700 2310) return err;
35280bd4a3fa8 (Al Viro 2013-02-24 14:52:17 -0500 2311)
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2312) err = do_epoll_wait(epfd, events, maxevents, timeout);
7cdf7c20e9714 (Willem de Bruijn 2020-12-18 14:05:35 -0800 2313)
b772434be0891 (Oleg Nesterov 2019-07-16 16:29:53 -0700 2314) restore_saved_sigmask_unless(err == -EINTR);
35280bd4a3fa8 (Al Viro 2013-02-24 14:52:17 -0500 2315)
35280bd4a3fa8 (Al Viro 2013-02-24 14:52:17 -0500 2316) return err;
35280bd4a3fa8 (Al Viro 2013-02-24 14:52:17 -0500 2317) }
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2318)
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2319) COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2320) struct epoll_event __user *, events,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2321) int, maxevents, int, timeout,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2322) const compat_sigset_t __user *, sigmask,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2323) compat_size_t, sigsetsize)
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2324) {
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2325) struct timespec64 to;
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2326)
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2327) return do_compat_epoll_pwait(epfd, events, maxevents,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2328) ep_timeout_to_timespec(&to, timeout),
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2329) sigmask, sigsetsize);
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2330) }
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2331)
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2332) COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2333) struct epoll_event __user *, events,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2334) int, maxevents,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2335) const struct __kernel_timespec __user *, timeout,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2336) const compat_sigset_t __user *, sigmask,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2337) compat_size_t, sigsetsize)
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2338) {
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2339) struct timespec64 ts, *to = NULL;
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2340)
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2341) if (timeout) {
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2342) if (get_timespec64(&ts, timeout))
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2343) return -EFAULT;
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2344) to = &ts;
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2345) if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2346) return -EINVAL;
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2347) }
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2348)
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2349) return do_compat_epoll_pwait(epfd, events, maxevents, to,
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2350) sigmask, sigsetsize);
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2351) }
58169a52ebc9a (Willem de Bruijn 2020-12-18 14:05:38 -0800 2352)
35280bd4a3fa8 (Al Viro 2013-02-24 14:52:17 -0500 2353) #endif
35280bd4a3fa8 (Al Viro 2013-02-24 14:52:17 -0500 2354)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 2355) static int __init eventpoll_init(void)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 2356) {
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 2357) struct sysinfo si;
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 2358)
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 2359) si_meminfo(&si);
9df04e1f25eff (Davide Libenzi 2009-01-29 14:25:26 -0800 2360) /*
9df04e1f25eff (Davide Libenzi 2009-01-29 14:25:26 -0800 2361) * Allows top 4% of lomem to be allocated for epoll watches (per user).
9df04e1f25eff (Davide Libenzi 2009-01-29 14:25:26 -0800 2362) */
9df04e1f25eff (Davide Libenzi 2009-01-29 14:25:26 -0800 2363) max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
7ef9964e6d1b9 (Davide Libenzi 2008-12-01 13:13:55 -0800 2364) EP_ITEM_COST;
52bd19f7691b2 (Robin Holt 2011-01-12 17:00:01 -0800 2365) BUG_ON(max_user_watches < 0);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 2366)
39732ca5af4b0 (Eric Wong 2013-04-30 15:27:38 -0700 2367) /*
39732ca5af4b0 (Eric Wong 2013-04-30 15:27:38 -0700 2368) * We can have many thousands of epitems, so prevent this from
39732ca5af4b0 (Eric Wong 2013-04-30 15:27:38 -0700 2369) * using an extra cache line on 64-bit (and smaller) CPUs
39732ca5af4b0 (Eric Wong 2013-04-30 15:27:38 -0700 2370) */
39732ca5af4b0 (Eric Wong 2013-04-30 15:27:38 -0700 2371) BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
39732ca5af4b0 (Eric Wong 2013-04-30 15:27:38 -0700 2372)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 2373) /* Allocates slab cache used to allocate "struct epitem" items */
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 2374) epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
2ae928a9441a3 (Shakeel Butt 2017-11-17 15:28:59 -0800 2375) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 2376)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 2377) /* Allocates slab cache used to allocate "struct eppoll_entry" */
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 2378) pwq_cache = kmem_cache_create("eventpoll_pwq",
2ae928a9441a3 (Shakeel Butt 2017-11-17 15:28:59 -0800 2379) sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 2380)
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 2381) ephead_cache = kmem_cache_create("ep_head",
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 2382) sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
319c15174757a (Al Viro 2020-10-01 20:45:51 -0400 2383)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 2384) return 0;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 2385) }
cea69241870e5 (Davide Libenzi 2007-05-10 22:23:22 -0700 2386) fs_initcall(eventpoll_init);