From 80285b75c683484db4daf02c41009e4424738dd3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 2 Sep 2020 11:45:57 -0400 Subject: [PATCH 01/27] epoll: switch epitem->pwqlist to single-linked list We only traverse it once to destroy all associated eppoll_entry at epitem destruction time. The order of traversal is irrelevant there. Signed-off-by: Al Viro --- fs/eventpoll.c | 51 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4df61129566d..ae41868d9b35 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -128,6 +128,24 @@ struct nested_calls { spinlock_t lock; }; +/* Wait structure used by the poll hooks */ +struct eppoll_entry { + /* List header used to link this structure to the "struct epitem" */ + struct eppoll_entry *next; + + /* The "base" pointer is set to the container "struct epitem" */ + struct epitem *base; + + /* + * Wait queue item that will be linked to the target file wait + * queue head. + */ + wait_queue_entry_t wait; + + /* The wait queue head that linked the "wait" wait queue item */ + wait_queue_head_t *whead; +}; + /* * Each file descriptor added to the eventpoll interface will * have an entry of this type linked to the "rbr" RB tree. @@ -158,7 +176,7 @@ struct epitem { int nwait; /* List containing poll wait queues */ - struct list_head pwqlist; + struct eppoll_entry *pwqlist; /* The "container" of this item */ struct eventpoll *ep; @@ -231,24 +249,6 @@ struct eventpoll { #endif }; -/* Wait structure used by the poll hooks */ -struct eppoll_entry { - /* List header used to link this structure to the "struct epitem" */ - struct list_head llink; - - /* The "base" pointer is set to the container "struct epitem" */ - struct epitem *base; - - /* - * Wait queue item that will be linked to the target file wait - * queue head. - */ - wait_queue_entry_t wait; - - /* The wait queue head that linked the "wait" wait queue item */ - wait_queue_head_t *whead; -}; - /* Wrapper struct used by poll queueing */ struct ep_pqueue { poll_table pt; @@ -617,13 +617,11 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq) */ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) { - struct list_head *lsthead = &epi->pwqlist; + struct eppoll_entry **p = &epi->pwqlist; struct eppoll_entry *pwq; - while (!list_empty(lsthead)) { - pwq = list_first_entry(lsthead, struct eppoll_entry, llink); - - list_del(&pwq->llink); + while ((pwq = *p) != NULL) { + *p = pwq->next; ep_remove_wait_queue(pwq); kmem_cache_free(pwq_cache, pwq); } @@ -1320,7 +1318,8 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, add_wait_queue_exclusive(whead, &pwq->wait); else add_wait_queue(whead, &pwq->wait); - list_add_tail(&pwq->llink, &epi->pwqlist); + pwq->next = epi->pwqlist; + epi->pwqlist = pwq; epi->nwait++; } else { /* We have to signal that an error occurred */ @@ -1507,7 +1506,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); INIT_LIST_HEAD(&epi->fllink); - INIT_LIST_HEAD(&epi->pwqlist); + epi->pwqlist = NULL; epi->ep = ep; ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; From 364f374f22ba1b049b17de602a8e5eceb1df4cef Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 2 Sep 2020 11:55:09 -0400 Subject: [PATCH 02/27] epoll: get rid of epitem->nwait we use it only to indicate allocation failures within queueing callback back to ep_insert(). 
Might as well use epq.epi for that reporting... Signed-off-by: Al Viro --- fs/eventpoll.c | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index ae41868d9b35..44aca681d897 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -172,9 +172,6 @@ struct epitem { /* The file descriptor information this item refers to */ struct epoll_filefd ffd; - /* Number of active wait queue attached to poll operations */ - int nwait; - /* List containing poll wait queues */ struct eppoll_entry *pwqlist; @@ -351,12 +348,6 @@ static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) return container_of(p, struct eppoll_entry, wait)->base; } -/* Get the "struct epitem" from an epoll queue wrapper */ -static inline struct epitem *ep_item_from_epqueue(poll_table *p) -{ - return container_of(p, struct ep_pqueue, pt)->epi; -} - /* Initialize the poll safe wake up structure */ static void ep_nested_calls_init(struct nested_calls *ncalls) { @@ -1307,24 +1298,28 @@ out_unlock: static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt) { - struct epitem *epi = ep_item_from_epqueue(pt); + struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt); + struct epitem *epi = epq->epi; struct eppoll_entry *pwq; - if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) { - init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); - pwq->whead = whead; - pwq->base = epi; - if (epi->event.events & EPOLLEXCLUSIVE) - add_wait_queue_exclusive(whead, &pwq->wait); - else - add_wait_queue(whead, &pwq->wait); - pwq->next = epi->pwqlist; - epi->pwqlist = pwq; - epi->nwait++; - } else { - /* We have to signal that an error occurred */ - epi->nwait = -1; + if (unlikely(!epi)) // an earlier allocation has failed + return; + + pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL); + if (unlikely(!pwq)) { + epq->epi = NULL; + return; } + + init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); + pwq->whead = whead; + pwq->base = epi; + if (epi->event.events & EPOLLEXCLUSIVE) + add_wait_queue_exclusive(whead, &pwq->wait); + else + add_wait_queue(whead, &pwq->wait); + pwq->next = epi->pwqlist; + epi->pwqlist = pwq; } static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) @@ -1510,7 +1505,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, epi->ep = ep; ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; - epi->nwait = 0; epi->next = EP_UNACTIVE_PTR; if (epi->event.events & EPOLLWAKEUP) { error = ep_create_wakeup_source(epi); @@ -1555,7 +1549,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, * high memory pressure. */ error = -ENOMEM; - if (epi->nwait < 0) + if (!epq.epi) goto error_unregister; /* We have to drop the new item inside our item list to keep track of it */ From 8677600d796697f04adcdec57bd73a9f54c79697 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 22 Aug 2020 22:05:06 -0400 Subject: [PATCH 03/27] untangling ep_call_nested(): get rid of useless arguments ctx is always equal to current, ncalls - to &poll_loop_ncalls. 
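For illustration, a minimal userspace C sketch of the same transformation (hypothetical names, not the kernel code): parameters that every caller passes the same value for carry no information, so the callee can simply reach for those values itself.

	/* illustrative toy, not fs/eventpoll.c */
	struct nested_calls { int depth; };
	static struct nested_calls poll_loop_ncalls;	/* the only instance ever passed */

	/* before: ncalls and ctx threaded through every call site */
	static int call_nested_old(struct nested_calls *ncalls, int (*nproc)(void *),
				   void *priv, void *ctx)
	{
		int error;

		(void)ctx;			/* every caller passed the current task */
		ncalls->depth++;
		error = nproc(priv);
		ncalls->depth--;
		return error;
	}

	/* after: same behaviour, two arguments gone */
	static int call_nested_new(int (*nproc)(void *), void *priv)
	{
		int error;

		poll_loop_ncalls.depth++;
		error = nproc(priv);
		poll_loop_ncalls.depth--;
		return error;
	}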
Signed-off-by: Al Viro --- fs/eventpoll.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 44aca681d897..ef73d71a5dc8 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -455,21 +455,19 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) * the same nested call (by the meaning of same cookie) is * no re-entered. * - * @ncalls: Pointer to the nested_calls structure to be used for this call. * @nproc: Nested call core function pointer. * @priv: Opaque data to be passed to the @nproc callback. * @cookie: Cookie to be used to identify this nested call. - * @ctx: This instance context. * * Returns: Returns the code returned by the @nproc callback, or -1 if * the maximum recursion limit has been exceeded. */ -static int ep_call_nested(struct nested_calls *ncalls, - int (*nproc)(void *, void *, int), void *priv, - void *cookie, void *ctx) +static int ep_call_nested(int (*nproc)(void *, void *, int), void *priv, + void *cookie) { int error, call_nests = 0; unsigned long flags; + struct nested_calls *ncalls = &poll_loop_ncalls; struct list_head *lsthead = &ncalls->tasks_call_list; struct nested_call_node *tncur; struct nested_call_node tnode; @@ -482,7 +480,7 @@ static int ep_call_nested(struct nested_calls *ncalls, * very much limited. */ list_for_each_entry(tncur, lsthead, llink) { - if (tncur->ctx == ctx && + if (tncur->ctx == current && (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) { /* * Ops ... loop detected or maximum nest level reached. @@ -494,7 +492,7 @@ static int ep_call_nested(struct nested_calls *ncalls, } /* Add the current task and cookie to the list */ - tnode.ctx = ctx; + tnode.ctx = current; tnode.cookie = cookie; list_add(&tnode.llink, lsthead); @@ -1397,10 +1395,8 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) break; } } else { - error = ep_call_nested(&poll_loop_ncalls, - reverse_path_check_proc, - child_file, child_file, - current); + error = ep_call_nested(reverse_path_check_proc, + child_file, child_file); } if (error != 0) break; @@ -1431,9 +1427,8 @@ static int reverse_path_check(void) /* let's call this for all tfiles */ list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { path_count_init(); - error = ep_call_nested(&poll_loop_ncalls, - reverse_path_check_proc, current_file, - current_file, current); + error = ep_call_nested(reverse_path_check_proc, current_file, + current_file); if (error) break; } @@ -1970,9 +1965,8 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) ep_tovisit = epi->ffd.file->private_data; if (ep_tovisit->gen == loop_check_gen) continue; - error = ep_call_nested(&poll_loop_ncalls, - ep_loop_check_proc, epi->ffd.file, - ep_tovisit, current); + error = ep_call_nested(ep_loop_check_proc, epi->ffd.file, + ep_tovisit); if (error != 0) break; } else { @@ -2009,8 +2003,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) */ static int ep_loop_check(struct eventpoll *ep, struct file *file) { - return ep_call_nested(&poll_loop_ncalls, - ep_loop_check_proc, file, ep, current); + return ep_call_nested(ep_loop_check_proc, file, ep); } static void clear_tfile_check_list(void) From d01f0594d727d2346e4b6ac26ae63e416b60c3f0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 22 Aug 2020 22:19:12 -0400 Subject: [PATCH 04/27] untangling ep_call_nested(): it's all serialized on epmutex. 
IOW,
 * no locking is needed to protect the list
 * the list is actually a stack
 * no need to check ->ctx
 * it can bloody well be a static 5-element array - nobody is going to be accessing it in parallel.
Signed-off-by: Al Viro --- fs/eventpoll.c | 80 +++++++------------------------------------------- 1 file changed, 11 insertions(+), 69 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index ef73d71a5dc8..43aecae0935c 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -109,25 +109,6 @@ struct epoll_filefd { int fd; } __packed; -/* - * Structure used to track possible nested calls, for too deep recursions - * and loop cycles. - */ -struct nested_call_node { - struct list_head llink; - void *cookie; - void *ctx; -}; - -/* - * This structure is used as collector for nested calls, to check for - * maximum recursion dept and loop cycles. - */ -struct nested_calls { - struct list_head tasks_call_list; - spinlock_t lock; -}; - /* Wait structure used by the poll hooks */ struct eppoll_entry { /* List header used to link this structure to the "struct epitem" */ @@ -273,7 +254,8 @@ static DEFINE_MUTEX(epmutex); static u64 loop_check_gen = 0; /* Used to check for epoll file descriptor inclusion loops */ -static struct nested_calls poll_loop_ncalls; +static void *cookies[EP_MAX_NESTS + 1]; +static int nesting; /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __read_mostly; @@ -348,13 +330,6 @@ static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) return container_of(p, struct eppoll_entry, wait)->base; } -/* Initialize the poll safe wake up structure */ -static void ep_nested_calls_init(struct nested_calls *ncalls) -{ - INIT_LIST_HEAD(&ncalls->tasks_call_list); - spin_lock_init(&ncalls->lock); -} - /** * ep_events_available - Checks if ready events might be available. * @@ -465,47 +440,20 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) static int ep_call_nested(int (*nproc)(void *, void *, int), void *priv, void *cookie) { - int error, call_nests = 0; - unsigned long flags; - struct nested_calls *ncalls = &poll_loop_ncalls; - struct list_head *lsthead = &ncalls->tasks_call_list; - struct nested_call_node *tncur; - struct nested_call_node tnode; + int error, i; - spin_lock_irqsave(&ncalls->lock, flags); + if (nesting > EP_MAX_NESTS) /* too deep nesting */ + return -1; - /* - * Try to see if the current task is already inside this wakeup call. - * We use a list here, since the population inside this set is always - * very much limited. - */ - list_for_each_entry(tncur, lsthead, llink) { - if (tncur->ctx == current && - (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) { - /* - * Ops ... loop detected or maximum nest level reached. - * We abort this wake by breaking the cycle itself.
- */ - error = -1; - goto out_unlock; - } + for (i = 0; i < nesting; i++) { + if (cookies[i] == cookie) /* loop detected */ + return -1; } - - /* Add the current task and cookie to the list */ - tnode.ctx = current; - tnode.cookie = cookie; - list_add(&tnode.llink, lsthead); - - spin_unlock_irqrestore(&ncalls->lock, flags); + cookies[nesting++] = cookie; /* Call the nested function */ - error = (*nproc)(priv, cookie, call_nests); - - /* Remove the current task from the list */ - spin_lock_irqsave(&ncalls->lock, flags); - list_del(&tnode.llink); -out_unlock: - spin_unlock_irqrestore(&ncalls->lock, flags); + error = (*nproc)(priv, cookie, nesting - 1); + nesting--; return error; } @@ -2379,12 +2327,6 @@ static int __init eventpoll_init(void) EP_ITEM_COST; BUG_ON(max_user_watches < 0); - /* - * Initialize the structure used to perform epoll file descriptor - * inclusion loops checks. - */ - ep_nested_calls_init(&poll_loop_ncalls); - /* * We can have many thousands of epitems, so prevent this from * using an extra cache line on 64-bit (and smaller) CPUs From 3b1688efa01cd5b5dc99d624e9fac68b6112f35d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 22 Aug 2020 23:05:44 -0400 Subject: [PATCH 05/27] untangling ep_call_nested(): take pushing cookie into a helper Signed-off-by: Al Viro --- fs/eventpoll.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 43aecae0935c..bd2cc78c47c8 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -424,6 +424,21 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) #endif /* CONFIG_NET_RX_BUSY_POLL */ +static bool ep_push_nested(void *cookie) +{ + int i; + + if (nesting > EP_MAX_NESTS) /* too deep nesting */ + return false; + + for (i = 0; i < nesting; i++) { + if (cookies[i] == cookie) /* loop detected */ + return false; + } + cookies[nesting++] = cookie; + return true; +} + /** * ep_call_nested - Perform a bound (possibly) nested call, by checking * that the recursion limit is not exceeded, and that @@ -440,17 +455,10 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) static int ep_call_nested(int (*nproc)(void *, void *, int), void *priv, void *cookie) { - int error, i; + int error; - if (nesting > EP_MAX_NESTS) /* too deep nesting */ + if (!ep_push_nested(cookie)) return -1; - - for (i = 0; i < nesting; i++) { - if (cookies[i] == cookie) /* loop detected */ - return -1; - } - cookies[nesting++] = cookie; - /* Call the nested function */ error = (*nproc)(priv, cookie, nesting - 1); nesting--; From 99d84d4330e8a67abfab77edb985db18aa603921 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 22 Aug 2020 23:08:37 -0400 Subject: [PATCH 06/27] untangling ep_call_nested(): move push/pop of cookie into the callbacks Signed-off-by: Al Viro --- fs/eventpoll.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index bd2cc78c47c8..9a6ee5991f3d 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -455,15 +455,7 @@ static bool ep_push_nested(void *cookie) static int ep_call_nested(int (*nproc)(void *, void *, int), void *priv, void *cookie) { - int error; - - if (!ep_push_nested(cookie)) - return -1; - /* Call the nested function */ - error = (*nproc)(priv, cookie, nesting - 1); - nesting--; - - return error; + return (*nproc)(priv, cookie, nesting); } /* @@ -1340,6 +1332,9 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) struct file *child_file; struct epitem *epi; + if 
(!ep_push_nested(cookie)) /* limits recursion */ + return -1; + /* CTL_DEL can remove links here, but that can't increase our count */ rcu_read_lock(); list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) { @@ -1362,6 +1357,7 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) } } rcu_read_unlock(); + nesting--; /* pop */ return error; } @@ -1913,6 +1909,9 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) struct rb_node *rbp; struct epitem *epi; + if (!ep_push_nested(cookie)) /* limits recursion */ + return -1; + mutex_lock_nested(&ep->mtx, call_nests + 1); ep->gen = loop_check_gen; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { @@ -1942,6 +1941,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) } } mutex_unlock(&ep->mtx); + nesting--; /* pop */ return error; } From 773318eddbacf77d6c98bf6073db38b1cccc7ac4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 22 Aug 2020 23:13:27 -0400 Subject: [PATCH 07/27] untangling ep_call_nested(): and there was much rejoicing Signed-off-by: Al Viro --- fs/eventpoll.c | 43 +++++++++++-------------------------------- 1 file changed, 11 insertions(+), 32 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 9a6ee5991f3d..8c3b02755a50 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -439,25 +439,6 @@ static bool ep_push_nested(void *cookie) return true; } -/** - * ep_call_nested - Perform a bound (possibly) nested call, by checking - * that the recursion limit is not exceeded, and that - * the same nested call (by the meaning of same cookie) is - * no re-entered. - * - * @nproc: Nested call core function pointer. - * @priv: Opaque data to be passed to the @nproc callback. - * @cookie: Cookie to be used to identify this nested call. - * - * Returns: Returns the code returned by the @nproc callback, or -1 if - * the maximum recursion limit has been exceeded. 
- */ -static int ep_call_nested(int (*nproc)(void *, void *, int), void *priv, - void *cookie) -{ - return (*nproc)(priv, cookie, nesting); -} - /* * As described in commit 0ccf831cb lockdep: annotate epoll * the use of wait queues used by epoll is done in a very controlled @@ -1325,7 +1306,7 @@ static void path_count_init(void) path_count[i] = 0; } -static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) +static int reverse_path_check_proc(void *priv, void *cookie, int depth) { int error = 0; struct file *file = priv; @@ -1341,13 +1322,13 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) child_file = epi->ep->file; if (is_file_epoll(child_file)) { if (list_empty(&child_file->f_ep_links)) { - if (path_count_inc(call_nests)) { + if (path_count_inc(depth)) { error = -1; break; } } else { - error = ep_call_nested(reverse_path_check_proc, - child_file, child_file); + error = reverse_path_check_proc(child_file, child_file, + depth + 1); } if (error != 0) break; @@ -1379,8 +1360,7 @@ static int reverse_path_check(void) /* let's call this for all tfiles */ list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { path_count_init(); - error = ep_call_nested(reverse_path_check_proc, current_file, - current_file); + error = reverse_path_check_proc(current_file, current_file, 0); if (error) break; } @@ -1886,8 +1866,7 @@ send_events: } /** - * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested() - * API, to verify that adding an epoll file inside another + * ep_loop_check_proc - verify that adding an epoll file inside another * epoll structure, does not violate the constraints, in * terms of closed loops, or too deep chains (which can * result in excessive stack usage). @@ -1900,7 +1879,7 @@ send_events: * Returns: Returns zero if adding the epoll @file inside current epoll * structure @ep does not violate the constraints, or -1 otherwise. */ -static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) +static int ep_loop_check_proc(void *priv, void *cookie, int depth) { int error = 0; struct file *file = priv; @@ -1912,7 +1891,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) if (!ep_push_nested(cookie)) /* limits recursion */ return -1; - mutex_lock_nested(&ep->mtx, call_nests + 1); + mutex_lock_nested(&ep->mtx, depth + 1); ep->gen = loop_check_gen; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); @@ -1920,8 +1899,8 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) ep_tovisit = epi->ffd.file->private_data; if (ep_tovisit->gen == loop_check_gen) continue; - error = ep_call_nested(ep_loop_check_proc, epi->ffd.file, - ep_tovisit); + error = ep_loop_check_proc(epi->ffd.file, ep_tovisit, + depth + 1); if (error != 0) break; } else { @@ -1959,7 +1938,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) */ static int ep_loop_check(struct eventpoll *ep, struct file *file) { - return ep_call_nested(ep_loop_check_proc, file, ep); + return ep_loop_check_proc(file, ep, 0); } static void clear_tfile_check_list(void) From aebf15f0fbd54e8deebc56642c08da15b905027c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 22 Aug 2020 23:29:02 -0400 Subject: [PATCH 08/27] reverse_path_check_proc(): sane arguments no need to force its calling conventions to match the callback for late unlamented ep_call_nested()... 
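As a hedged before/after sketch (hypothetical names, plain userspace C rather than the kernel code): once the function recurses directly, nothing forces the untyped (void *, void *, int) callback signature on it, and the arguments get honest types.

	/* illustrative toy, not fs/eventpoll.c */
	#include <stdio.h>

	struct file { const char *name; };

	/* before: the generic callback convention forces void * arguments */
	static int check_old(void *priv, void *cookie, int call_nests)
	{
		struct file *file = priv;	/* priv and cookie were always the same file */

		printf("%s at depth %d\n", file->name, call_nests);
		return 0;
	}

	/* after: direct recursion takes typed arguments */
	static int check_new(struct file *file, int depth)
	{
		printf("%s at depth %d\n", file->name, depth);
		return 0;
	}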
Signed-off-by: Al Viro --- fs/eventpoll.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8c3b02755a50..3e6f1f97f246 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1306,20 +1306,18 @@ static void path_count_init(void) path_count[i] = 0; } -static int reverse_path_check_proc(void *priv, void *cookie, int depth) +static int reverse_path_check_proc(struct file *file, int depth) { int error = 0; - struct file *file = priv; - struct file *child_file; struct epitem *epi; - if (!ep_push_nested(cookie)) /* limits recursion */ + if (!ep_push_nested(file)) /* limits recursion */ return -1; /* CTL_DEL can remove links here, but that can't increase our count */ rcu_read_lock(); list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) { - child_file = epi->ep->file; + struct file *child_file = epi->ep->file; if (is_file_epoll(child_file)) { if (list_empty(&child_file->f_ep_links)) { if (path_count_inc(depth)) { @@ -1327,7 +1325,7 @@ static int reverse_path_check_proc(void *priv, void *cookie, int depth) break; } } else { - error = reverse_path_check_proc(child_file, child_file, + error = reverse_path_check_proc(child_file, depth + 1); } if (error != 0) @@ -1360,7 +1358,7 @@ static int reverse_path_check(void) /* let's call this for all tfiles */ list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { path_count_init(); - error = reverse_path_check_proc(current_file, current_file, 0); + error = reverse_path_check_proc(current_file, 0); if (error) break; } From 0c320f776ed83e1ebde4d49bc316b23e868b4737 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 25 Sep 2020 19:48:56 -0400 Subject: [PATCH 09/27] reverse_path_check_proc(): don't bother with cookies We know there's no loops by the time we call it; the only thing we care about is too deep reverse paths. 
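A hedged illustration (toy code with hypothetical names, not the kernel code): with loops already ruled out, the cookie bookkeeping collapses into a bare depth bound on the recursion.

	/* illustrative toy, not fs/eventpoll.c */
	#define MAX_NESTS 4

	struct node { struct node *parent; };

	/* no cookie stack needed: the depth argument alone bounds the walk */
	static int path_ok(const struct node *n, int depth)
	{
		if (depth > MAX_NESTS)		/* reverse path too deep */
			return -1;
		if (!n->parent)
			return 0;
		return path_ok(n->parent, depth + 1);
	}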
Signed-off-by: Al Viro --- fs/eventpoll.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 3e6f1f97f246..0f540e91aa92 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1311,7 +1311,7 @@ static int reverse_path_check_proc(struct file *file, int depth) int error = 0; struct epitem *epi; - if (!ep_push_nested(file)) /* limits recursion */ + if (depth > EP_MAX_NESTS) /* too deep nesting */ return -1; /* CTL_DEL can remove links here, but that can't increase our count */ @@ -1336,7 +1336,6 @@ static int reverse_path_check_proc(struct file *file, int depth) } } rcu_read_unlock(); - nesting--; /* pop */ return error; } From d16312a46936baa1a44b5f78394f4b0d1d01762a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Sep 2020 15:54:05 -0400 Subject: [PATCH 10/27] clean reverse_path_check_proc() a bit Signed-off-by: Al Viro --- fs/eventpoll.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 0f540e91aa92..33af838046ea 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1317,23 +1317,15 @@ static int reverse_path_check_proc(struct file *file, int depth) /* CTL_DEL can remove links here, but that can't increase our count */ rcu_read_lock(); list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) { - struct file *child_file = epi->ep->file; - if (is_file_epoll(child_file)) { - if (list_empty(&child_file->f_ep_links)) { - if (path_count_inc(depth)) { - error = -1; - break; - } - } else { - error = reverse_path_check_proc(child_file, - depth + 1); - } - if (error != 0) - break; - } else { - printk(KERN_ERR "reverse_path_check_proc: " - "file is not an ep!\n"); - } + struct file *recipient = epi->ep->file; + if (WARN_ON(!is_file_epoll(recipient))) + continue; + if (list_empty(&recipient->f_ep_links)) + error = path_count_inc(depth); + else + error = reverse_path_check_proc(recipient, depth + 1); + if (error != 0) + break; } rcu_read_unlock(); return error; From 56c428cac5a2c361271370dde3a22cb640bc9934 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Sep 2020 16:38:44 -0400 Subject: [PATCH 11/27] ep_loop_check_proc(): lift pushing the cookie into callers Signed-off-by: Al Viro --- fs/eventpoll.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 33af838046ea..9edea3933790 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1877,9 +1877,6 @@ static int ep_loop_check_proc(void *priv, void *cookie, int depth) struct rb_node *rbp; struct epitem *epi; - if (!ep_push_nested(cookie)) /* limits recursion */ - return -1; - mutex_lock_nested(&ep->mtx, depth + 1); ep->gen = loop_check_gen; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); @@ -1888,8 +1885,13 @@ static int ep_loop_check_proc(void *priv, void *cookie, int depth) ep_tovisit = epi->ffd.file->private_data; if (ep_tovisit->gen == loop_check_gen) continue; - error = ep_loop_check_proc(epi->ffd.file, ep_tovisit, + if (!ep_push_nested(ep_tovisit)) { + error = -1; + } else { + error = ep_loop_check_proc(epi->ffd.file, ep_tovisit, depth + 1); + nesting--; + } if (error != 0) break; } else { @@ -1909,7 +1911,6 @@ static int ep_loop_check_proc(void *priv, void *cookie, int depth) } } mutex_unlock(&ep->mtx); - nesting--; /* pop */ return error; } @@ -1927,7 +1928,12 @@ static int ep_loop_check_proc(void *priv, void *cookie, int depth) */ static int ep_loop_check(struct eventpoll *ep, struct file *file) { - return
ep_loop_check_proc(file, ep, 0); + int err; + + ep_push_nested(ep); // can't fail + err = ep_loop_check_proc(file, ep, 0); + nesting--; + return err; } static void clear_tfile_check_list(void) From 6a3890c474795a4a3536e0a0c39f526e415eb212 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Sep 2020 16:29:02 -0400 Subject: [PATCH 12/27] get rid of ep_push_nested() The only remaining user is loop checking. But there we only need to check that we have not walked into the epoll we are inserting into - we are adding an edge to an acyclic graph, so any loop being created will have to pass through the source of that edge. So we don't need that array of cookies - we have only one eventpoll to watch out for. RIP ep_push_nested(), along with the cookies array. Signed-off-by: Al Viro --- fs/eventpoll.c | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 9edea3933790..6b1990b8b9a0 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -254,8 +254,7 @@ static DEFINE_MUTEX(epmutex); static u64 loop_check_gen = 0; /* Used to check for epoll file descriptor inclusion loops */ -static void *cookies[EP_MAX_NESTS + 1]; -static int nesting; +static struct eventpoll *inserting_into; /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __read_mostly; @@ -424,21 +423,6 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) #endif /* CONFIG_NET_RX_BUSY_POLL */ -static bool ep_push_nested(void *cookie) -{ - int i; - - if (nesting > EP_MAX_NESTS) /* too deep nesting */ - return false; - - for (i = 0; i < nesting; i++) { - if (cookies[i] == cookie) /* loop detected */ - return false; - } - cookies[nesting++] = cookie; - return true; -} - /* * As described in commit 0ccf831cb lockdep: annotate epoll * the use of wait queues used by epoll is done in a very controlled @@ -1885,12 +1869,11 @@ static int ep_loop_check_proc(void *priv, void *cookie, int depth) ep_tovisit = epi->ffd.file->private_data; if (ep_tovisit->gen == loop_check_gen) continue; - if (!ep_push_nested(ep_tovisit)) { + if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) { error = -1; - } else { + } else { error = ep_loop_check_proc(epi->ffd.file, ep_tovisit, depth + 1); - } if (error != 0) break; } else { @@ -1928,12 +1911,8 @@ */ static int ep_loop_check(struct eventpoll *ep, struct file *file) { - int err; - - ep_push_nested(ep); // can't fail - err = ep_loop_check_proc(file, ep, 0); - nesting--; - return err; + inserting_into = ep; + return ep_loop_check_proc(file, ep, 0); } static void clear_tfile_check_list(void) From bde03c4c1a6b3b679a63aa8f275ac12ffdd58c65 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Sep 2020 16:50:57 -0400 Subject: [PATCH 13/27] ep_loop_check_proc(): saner calling conventions 1) 'cookie' argument is unused; kill it. 2) 'priv' one is always an epoll struct file, and we only care about its associated struct eventpoll; pass that instead. Signed-off-by: Al Viro --- fs/eventpoll.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 6b1990b8b9a0..e971e3ace557 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1845,19 +1845,14 @@ send_events: * result in excessive stack usage). * * @priv: Pointer to the epoll file to be currently checked. - * @cookie: Original cookie for this call. This is the top-of-the-chain epoll - * data structure pointer.
- * @call_nests: Current dept of the @ep_call_nested() call stack. + * @depth: Current depth of the path being checked. * * Returns: Returns zero if adding the epoll @file inside current epoll * structure @ep does not violate the constraints, or -1 otherwise. */ -static int ep_loop_check_proc(void *priv, void *cookie, int depth) +static int ep_loop_check_proc(struct eventpoll *ep, int depth) { int error = 0; - struct file *file = priv; - struct eventpoll *ep = file->private_data; - struct eventpoll *ep_tovisit; struct rb_node *rbp; struct epitem *epi; @@ -1866,15 +1861,14 @@ static int ep_loop_check_proc(void *priv, void *cookie, int depth) for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { + struct eventpoll *ep_tovisit; ep_tovisit = epi->ffd.file->private_data; if (ep_tovisit->gen == loop_check_gen) continue; - if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) { + if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) error = -1; - } else { - error = ep_loop_check_proc(epi->ffd.file, ep_tovisit, - depth + 1); - } + else + error = ep_loop_check_proc(ep_tovisit, depth + 1); if (error != 0) break; } else { @@ -1899,20 +1893,20 @@ static int ep_loop_check_proc(void *priv, void *cookie, int depth) } /** - * ep_loop_check - Performs a check to verify that adding an epoll file (@file) - * another epoll file (represented by @ep) does not create + * ep_loop_check - Performs a check to verify that adding an epoll file (@to) + * into another epoll file (represented by @from) does not create * closed loops or too deep chains. * - * @ep: Pointer to the epoll private data structure. - * @file: Pointer to the epoll file to be checked. + * @from: Pointer to the epoll we are inserting into. + * @to: Pointer to the epoll to be inserted. * - * Returns: Returns zero if adding the epoll @file inside current epoll - * structure @ep does not violate the constraints, or -1 otherwise. + * Returns: Returns zero if adding the epoll @to inside the epoll @from + * does not violate the constraints, or -1 otherwise. 
*/ -static int ep_loop_check(struct eventpoll *ep, struct file *file) +static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) { inserting_into = ep; - return ep_loop_check_proc(file, ep, 0); + return ep_loop_check_proc(to, 0); } static void clear_tfile_check_list(void) @@ -2086,8 +2080,9 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, loop_check_gen++; full_check = 1; if (is_file_epoll(tf.file)) { + tep = tf.file->private_data; error = -ELOOP; - if (ep_loop_check(ep, tf.file) != 0) + if (ep_loop_check(ep, tep) != 0) goto error_tgt_fput; } else { get_file(tf.file); @@ -2098,7 +2093,6 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, if (error) goto error_tgt_fput; if (is_file_epoll(tf.file)) { - tep = tf.file->private_data; error = epoll_mutex_lock(&tep->mtx, 1, nonblock); if (error) { mutex_unlock(&ep->mtx); From db502f8a3b0bb5188f92d9d6a68aed223892689b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 31 Aug 2020 13:06:51 -0400 Subject: [PATCH 14/27] ep_scan_ready_list(): prepare to splitup take the stuff done before and after the callback into separate helpers Signed-off-by: Al Viro --- fs/eventpoll.c | 63 ++++++++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index e971e3ace557..eb012fdc152e 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -561,28 +561,10 @@ static inline void ep_pm_stay_awake_rcu(struct epitem *epi) rcu_read_unlock(); } -/** - * ep_scan_ready_list - Scans the ready list in a way that makes possible for - * the scan code, to call f_op->poll(). Also allows for - * O(NumReady) performance. - * - * @ep: Pointer to the epoll private data structure. - * @sproc: Pointer to the scan callback. - * @priv: Private opaque data passed to the @sproc callback. - * @depth: The current depth of recursive f_op->poll calls. - * @ep_locked: caller already holds ep->mtx - * - * Returns: The same integer error code returned by the @sproc callback. - */ -static __poll_t ep_scan_ready_list(struct eventpoll *ep, - __poll_t (*sproc)(struct eventpoll *, - struct list_head *, void *), - void *priv, int depth, bool ep_locked) +static void ep_start_scan(struct eventpoll *ep, + int depth, bool ep_locked, + struct list_head *txlist) { - __poll_t res; - struct epitem *epi, *nepi; - LIST_HEAD(txlist); - lockdep_assert_irqs_enabled(); /* @@ -602,14 +584,16 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, * in a lockless way. */ write_lock_irq(&ep->lock); - list_splice_init(&ep->rdllist, &txlist); + list_splice_init(&ep->rdllist, txlist); WRITE_ONCE(ep->ovflist, NULL); write_unlock_irq(&ep->lock); +} - /* - * Now call the callback function. - */ - res = (*sproc)(ep, &txlist, priv); +static void ep_done_scan(struct eventpoll *ep, + int depth, bool ep_locked, + struct list_head *txlist) +{ + struct epitem *epi, *nepi; write_lock_irq(&ep->lock); /* @@ -644,13 +628,38 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, /* * Quickly re-inject items left on "txlist". */ - list_splice(&txlist, &ep->rdllist); + list_splice(txlist, &ep->rdllist); __pm_relax(ep->ws); write_unlock_irq(&ep->lock); if (!ep_locked) mutex_unlock(&ep->mtx); +} +/** + * ep_scan_ready_list - Scans the ready list in a way that makes possible for + * the scan code, to call f_op->poll(). Also allows for + * O(NumReady) performance. + * + * @ep: Pointer to the epoll private data structure. + * @sproc: Pointer to the scan callback. 
+ * @priv: Private opaque data passed to the @sproc callback. + * @depth: The current depth of recursive f_op->poll calls. + * @ep_locked: caller already holds ep->mtx + * + * Returns: The same integer error code returned by the @sproc callback. + */ +static __poll_t ep_scan_ready_list(struct eventpoll *ep, + __poll_t (*sproc)(struct eventpoll *, + struct list_head *, void *), + void *priv, int depth, bool ep_locked) +{ + __poll_t res; + LIST_HEAD(txlist); + + ep_start_scan(ep, depth, ep_locked, &txlist); + res = (*sproc)(ep, &txlist, priv); + ep_done_scan(ep, depth, ep_locked, &txlist); return res; } From 1ec09974d845bdf827028aa7deb96378f54bcd06 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 31 Aug 2020 13:16:39 -0400 Subject: [PATCH 15/27] lift the calls of ep_read_events_proc() into the callers Expand the calls of ep_scan_ready_list() that get ep_read_events_proc(). As a side benefit we can pass depth to ep_read_events_proc() by value and not by address - the latter used to be forced by the signature expected from ep_scan_ready_list() callback. Signed-off-by: Al Viro --- fs/eventpoll.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index eb012fdc152e..9b9e29e0c85f 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -774,7 +774,7 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) } static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head, - void *priv); + int depth); static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt); @@ -787,6 +787,8 @@ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth) { struct eventpoll *ep; + LIST_HEAD(txlist); + __poll_t res; bool locked; pt->_key = epi->event.events; @@ -797,20 +799,19 @@ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, poll_wait(epi->ffd.file, &ep->poll_wait, pt); locked = pt && (pt->_qproc == ep_ptable_queue_proc); - return ep_scan_ready_list(epi->ffd.file->private_data, - ep_read_events_proc, &depth, depth, - locked) & epi->event.events; + ep_start_scan(ep, depth, locked, &txlist); + res = ep_read_events_proc(ep, &txlist, depth + 1); + ep_done_scan(ep, depth, locked, &txlist); + return res & epi->event.events; } static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head, - void *priv) + int depth) { struct epitem *epi, *tmp; poll_table pt; - int depth = *(int *)priv; init_poll_funcptr(&pt, NULL); - depth++; list_for_each_entry_safe(epi, tmp, head, rdllink) { if (ep_item_poll(epi, &pt, depth)) { @@ -832,7 +833,8 @@ static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait) { struct eventpoll *ep = file->private_data; - int depth = 0; + LIST_HEAD(txlist); + __poll_t res; /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); @@ -841,8 +843,10 @@ static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait) * Proceed to find out if wanted events are really available inside * the ready list. 
*/ - return ep_scan_ready_list(ep, ep_read_events_proc, - &depth, depth, false); + ep_start_scan(ep, 0, false, &txlist); + res = ep_read_events_proc(ep, &txlist, 1); + ep_done_scan(ep, 0, false, &txlist); + return res; } #ifdef CONFIG_PROC_FS From 443f1a0422338a2c7a2e4f63dbe3977f1be566ad Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 31 Aug 2020 13:19:53 -0400 Subject: [PATCH 16/27] lift the calls of ep_send_events_proc() into the callers ... and kill ep_scan_ready_list() Signed-off-by: Al Viro --- fs/eventpoll.c | 33 +++++---------------------------- 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 9b9e29e0c85f..3b3a862f8014 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -636,33 +636,6 @@ static void ep_done_scan(struct eventpoll *ep, mutex_unlock(&ep->mtx); } -/** - * ep_scan_ready_list - Scans the ready list in a way that makes possible for - * the scan code, to call f_op->poll(). Also allows for - * O(NumReady) performance. - * - * @ep: Pointer to the epoll private data structure. - * @sproc: Pointer to the scan callback. - * @priv: Private opaque data passed to the @sproc callback. - * @depth: The current depth of recursive f_op->poll calls. - * @ep_locked: caller already holds ep->mtx - * - * Returns: The same integer error code returned by the @sproc callback. - */ -static __poll_t ep_scan_ready_list(struct eventpoll *ep, - __poll_t (*sproc)(struct eventpoll *, - struct list_head *, void *), - void *priv, int depth, bool ep_locked) -{ - __poll_t res; - LIST_HEAD(txlist); - - ep_start_scan(ep, depth, ep_locked, &txlist); - res = (*sproc)(ep, &txlist, priv); - ep_done_scan(ep, depth, ep_locked, &txlist); - return res; -} - static void epi_rcu_free(struct rcu_head *head) { struct epitem *epi = container_of(head, struct epitem, rcu); @@ -1685,11 +1658,15 @@ static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, int maxevents) { struct ep_send_events_data esed; + LIST_HEAD(txlist); esed.maxevents = maxevents; esed.events = events; - ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false); + ep_start_scan(ep, 0, false, &txlist); + ep_send_events_proc(ep, &txlist, &esed); + ep_done_scan(ep, 0, false, &txlist); + return esed.res; } From ff07952aeda8563d5080da3a0754db83ed0650f6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 31 Aug 2020 13:39:52 -0400 Subject: [PATCH 17/27] ep_send_events_proc(): fold into the caller ... and get rid of struct ep_send_events_data - not needed anymore. The weird way of passing the arguments in (and real return value out - nominal return value of ep_send_events_proc() is ignored) was due to the signature forced on ep_scan_ready_list() callbacks. 
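The shape of the cleanup, as a hedged sketch (hypothetical names, not the kernel code): once the callback is folded into its only caller, arguments in and the real result out no longer have to travel through a private struct.

	/* illustrative toy, not fs/eventpoll.c */
	struct send_data {			/* before: smuggles argument and result */
		int maxevents;
		int res;
	};

	static int send_proc_old(void *priv)
	{
		struct send_data *sd = priv;

		sd->res = sd->maxevents;	/* the real result leaves via the struct */
		return 0;			/* nominal return value, ignored by the caller */
	}

	/* after: a plain parameter and a real return value */
	static int send_new(int maxevents)
	{
		return maxevents;
	}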
Signed-off-by: Al Viro --- fs/eventpoll.c | 60 +++++++++++++++++--------------------------------- 1 file changed, 20 insertions(+), 40 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 3b3a862f8014..ac996b959e94 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -233,13 +233,6 @@ struct ep_pqueue { struct epitem *epi; }; -/* Used by the ep_send_events() function as callback private data */ -struct ep_send_events_data { - int maxevents; - struct epoll_event __user *events; - int res; -}; - /* * Configuration options available inside /proc/sys/fs/epoll/ */ @@ -1570,18 +1563,17 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, return 0; } -static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head, - void *priv) +static int ep_send_events(struct eventpoll *ep, + struct epoll_event __user *events, int maxevents) { - struct ep_send_events_data *esed = priv; - __poll_t revents; struct epitem *epi, *tmp; - struct epoll_event __user *uevent = esed->events; - struct wakeup_source *ws; + LIST_HEAD(txlist); poll_table pt; + int res = 0; init_poll_funcptr(&pt, NULL); - esed->res = 0; + + ep_start_scan(ep, 0, false, &txlist); /* * We can loop without lock because we are passed a task private list. @@ -1590,8 +1582,11 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head */ lockdep_assert_held(&ep->mtx); - list_for_each_entry_safe(epi, tmp, head, rdllink) { - if (esed->res >= esed->maxevents) + list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { + struct wakeup_source *ws; + __poll_t revents; + + if (res >= maxevents) break; /* @@ -1622,16 +1617,16 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head if (!revents) continue; - if (__put_user(revents, &uevent->events) || - __put_user(epi->event.data, &uevent->data)) { - list_add(&epi->rdllink, head); + if (__put_user(revents, &events->events) || + __put_user(epi->event.data, &events->data)) { + list_add(&epi->rdllink, &txlist); ep_pm_stay_awake(epi); - if (!esed->res) - esed->res = -EFAULT; - return 0; + if (!res) + res = -EFAULT; + break; } - esed->res++; - uevent++; + res++; + events++; if (epi->event.events & EPOLLONESHOT) epi->event.events &= EP_PRIVATE_BITS; else if (!(epi->event.events & EPOLLET)) { @@ -1650,24 +1645,9 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head ep_pm_stay_awake(epi); } } - - return 0; -} - -static int ep_send_events(struct eventpoll *ep, - struct epoll_event __user *events, int maxevents) -{ - struct ep_send_events_data esed; - LIST_HEAD(txlist); - - esed.maxevents = maxevents; - esed.events = events; - - ep_start_scan(ep, 0, false, &txlist); - ep_send_events_proc(ep, &txlist, &esed); - ep_done_scan(ep, 0, false, &txlist); - - return esed.res; + ep_done_scan(ep, 0, false, &txlist); + return res; } static inline struct timespec64 ep_set_mstimeout(long ms) From 57804b1cc4616780c72a2d0930d1bd0d5bd3ed4c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 31 Aug 2020 13:41:30 -0400 Subject: [PATCH 18/27] lift locking/unlocking ep->mtx out of ep_{start,done}_scan() get rid of depth/ep_locked arguments there and document the kludge in ep_item_poll() that has led to ep_locked existence in the first place Signed-off-by: Al Viro --- fs/eventpoll.c | 57 +++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index ac996b959e94..f9c567af1f5f 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -554,20 +554,13 @@ static inline void
ep_pm_stay_awake_rcu(struct epitem *epi) rcu_read_unlock(); } -static void ep_start_scan(struct eventpoll *ep, - int depth, bool ep_locked, - struct list_head *txlist) + +/* + * ep->mtx needs to be held because we could be hit by + * eventpoll_release_file() and epoll_ctl(). + */ +static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist) { - lockdep_assert_irqs_enabled(); - - /* - * We need to lock this because we could be hit by - * eventpoll_release_file() and epoll_ctl(). - */ - - if (!ep_locked) - mutex_lock_nested(&ep->mtx, depth); - /* * Steal the ready list, and re-init the original one to the * empty list. Also, set ep->ovflist to NULL so that events @@ -576,6 +569,7 @@ static void ep_start_scan(struct eventpoll *ep, * because we want the "sproc" callback to be able to do it * in a lockless way. */ + lockdep_assert_irqs_enabled(); write_lock_irq(&ep->lock); list_splice_init(&ep->rdllist, txlist); WRITE_ONCE(ep->ovflist, NULL); @@ -583,7 +577,6 @@ static void ep_start_scan(struct eventpoll *ep, } static void ep_done_scan(struct eventpoll *ep, - int depth, bool ep_locked, struct list_head *txlist) { struct epitem *epi, *nepi; @@ -624,9 +617,6 @@ static void ep_done_scan(struct eventpoll *ep, list_splice(txlist, &ep->rdllist); __pm_relax(ep->ws); write_unlock_irq(&ep->lock); - - if (!ep_locked) - mutex_unlock(&ep->mtx); } static void epi_rcu_free(struct rcu_head *head) @@ -763,11 +753,16 @@ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, ep = epi->ffd.file->private_data; poll_wait(epi->ffd.file, &ep->poll_wait, pt); - locked = pt && (pt->_qproc == ep_ptable_queue_proc); - ep_start_scan(ep, depth, locked, &txlist); + // kludge: ep_insert() calls us with ep->mtx already locked + locked = pt && (pt->_qproc == ep_ptable_queue_proc); + if (!locked) + mutex_lock_nested(&ep->mtx, depth); + ep_start_scan(ep, &txlist); res = ep_read_events_proc(ep, &txlist, depth + 1); - ep_done_scan(ep, depth, locked, &txlist); + ep_done_scan(ep, &txlist); + if (!locked) + mutex_unlock(&ep->mtx); return res & epi->event.events; } @@ -809,9 +804,11 @@ static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait) * Proceed to find out if wanted events are really available inside * the ready list. */ - ep_start_scan(ep, 0, false, &txlist); + mutex_lock(&ep->mtx); + ep_start_scan(ep, &txlist); res = ep_read_events_proc(ep, &txlist, 1); - ep_done_scan(ep, 0, false, &txlist); + ep_done_scan(ep, &txlist); + mutex_unlock(&ep->mtx); return res; } @@ -1573,15 +1570,13 @@ static int ep_send_events(struct eventpoll *ep, init_poll_funcptr(&pt, NULL); - ep_start_scan(ep, 0, false, &txlist); + mutex_lock(&ep->mtx); + ep_start_scan(ep, &txlist); /* * We can loop without lock because we are passed a task private list. - * Items cannot vanish during the loop because ep_scan_ready_list() is - * holding "mtx" during this call. + * Items cannot vanish during the loop while we are holding ep->mtx. */ - lockdep_assert_held(&ep->mtx); - list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { struct wakeup_source *ws; __poll_t revents; @@ -1609,9 +1604,8 @@ static int ep_send_events(struct eventpoll *ep, /* * If the event mask intersect the caller-requested one, - * deliver the event to userspace. Again, ep_scan_ready_list() - * is holding ep->mtx, so no operations coming from userspace - * can change the item. + * deliver the event to userspace. Again, we are holding ep->mtx, + * so no operations coming from userspace can change the item.
*/ revents = ep_item_poll(epi, &pt, 1); if (!revents) @@ -1645,7 +1639,8 @@ static int ep_send_events(struct eventpoll *ep, ep_pm_stay_awake(epi); } } - ep_done_scan(ep, 0, false, &txlist); + ep_done_scan(ep, &txlist); + mutex_unlock(&ep->mtx); return res; } From e3e096e7fc30c28cfc53fe8a1e265d65b85f60bb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Sep 2020 18:09:29 -0400 Subject: [PATCH 19/27] ep_insert(): don't open-code ep_remove() on failure exits Signed-off-by: Al Viro --- fs/eventpoll.c | 51 ++++++++++++++------------------------------------ 1 file changed, 14 insertions(+), 37 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index f9c567af1f5f..c987b61701e4 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1384,12 +1384,16 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, epi->next = EP_UNACTIVE_PTR; if (epi->event.events & EPOLLWAKEUP) { error = ep_create_wakeup_source(epi); - if (error) - goto error_create_wakeup_source; + if (error) { + kmem_cache_free(epi_cache, epi); + return error; + } } else { RCU_INIT_POINTER(epi->ws, NULL); } + atomic_long_inc(&ep->user->epoll_watches); + /* Add the current item to the list of active epoll hook for this file */ spin_lock(&tfile->f_lock); list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links); @@ -1402,9 +1406,10 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, ep_rbtree_insert(ep, epi); /* now check if we've created too many backpaths */ - error = -EINVAL; - if (full_check && reverse_path_check()) - goto error_remove_epi; + if (unlikely(full_check && reverse_path_check())) { + ep_remove(ep, epi); + return -EINVAL; + } /* Initialize the poll table using the queue callback */ epq.epi = epi; @@ -1424,9 +1429,10 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, * install process. Namely an allocation for a wait queue failed due * high memory pressure. */ - error = -ENOMEM; - if (!epq.epi) - goto error_unregister; + if (unlikely(!epq.epi)) { + ep_remove(ep, epi); + return -ENOMEM; + } /* We have to drop the new item inside our item list to keep track of it */ write_lock_irq(&ep->lock); @@ -1448,40 +1454,11 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, write_unlock_irq(&ep->lock); - atomic_long_inc(&ep->user->epoll_watches); - /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, NULL); return 0; - -error_unregister: - ep_unregister_pollwait(ep, epi); -error_remove_epi: - spin_lock(&tfile->f_lock); - list_del_rcu(&epi->fllink); - spin_unlock(&tfile->f_lock); - - rb_erase_cached(&epi->rbn, &ep->rbr); - - /* - * We need to do this because an event could have been arrived on some - * allocated wait queue. Note that we don't care about the ep->ovflist - * list, since that is used/cleaned only inside a section bound by "mtx". - * And ep_insert() is called with "mtx" held. 
- */ - write_lock_irq(&ep->lock); - if (ep_is_linked(epi)) - list_del_init(&epi->rdllink); - write_unlock_irq(&ep->lock); - - wakeup_source_unregister(ep_wakeup_source(epi)); - -error_create_wakeup_source: - kmem_cache_free(epi_cache, epi); - - return error; } /* From 85353e919f6eb28ee4a797b06de8cc4c48dec2d7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Sep 2020 18:15:26 -0400 Subject: [PATCH 20/27] ep_insert(): we only need tep->mtx around the insertion itself We do need ep->mtx (and we are holding it all along), but that's the lock on the epoll we are inserting into; locking of the epoll being inserted is not needed for most of that work - as the matter of fact, we only need it to provide barriers for the fastpath check (for now). Move taking and releasing it into ep_insert(). The caller (do_epoll_ctl()) doesn't need to bother with that at all. Moreover, that way we kill the kludge in ep_item_poll() - now it's always called with tep unlocked. Signed-off-by: Al Viro --- fs/eventpoll.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index c987b61701e4..3fbce9c91481 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -731,8 +731,6 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head, int depth); -static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, - poll_table *pt); /* * Differs from ep_eventpoll_poll() in that internal callers already have @@ -745,7 +743,6 @@ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, struct eventpoll *ep; LIST_HEAD(txlist); __poll_t res; - bool locked; pt->_key = epi->event.events; if (!is_file_epoll(epi->ffd.file)) @@ -754,15 +751,11 @@ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, ep = epi->ffd.file->private_data; poll_wait(epi->ffd.file, &ep->poll_wait, pt); - // kludge: ep_insert() calls us with ep->mtx already locked - locked = pt && (pt->_qproc == ep_ptable_queue_proc); - if (!locked) - mutex_lock_nested(&ep->mtx, depth); + mutex_lock_nested(&ep->mtx, depth); ep_start_scan(ep, &txlist); res = ep_read_events_proc(ep, &txlist, depth + 1); ep_done_scan(ep, &txlist); - if (!locked) - mutex_unlock(&ep->mtx); + mutex_unlock(&ep->mtx); return res & epi->event.events; } @@ -1365,6 +1358,10 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, long user_watches; struct epitem *epi; struct ep_pqueue epq; + struct eventpoll *tep = NULL; + + if (is_file_epoll(tfile)) + tep = tfile->private_data; lockdep_assert_irqs_enabled(); @@ -1394,6 +1391,8 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, atomic_long_inc(&ep->user->epoll_watches); + if (tep) + mutex_lock_nested(&tep->mtx, 1); /* Add the current item to the list of active epoll hook for this file */ spin_lock(&tfile->f_lock); list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links); @@ -1404,6 +1403,8 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, * protected by "mtx", and ep_insert() is called with "mtx" held. 
*/ ep_rbtree_insert(ep, epi); + if (tep) + mutex_unlock(&tep->mtx); /* now check if we've created too many backpaths */ if (unlikely(full_check && reverse_path_check())) { @@ -2034,13 +2035,6 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, error = epoll_mutex_lock(&ep->mtx, 0, nonblock); if (error) goto error_tgt_fput; - if (is_file_epoll(tf.file)) { - error = epoll_mutex_lock(&tep->mtx, 1, nonblock); - if (error) { - mutex_unlock(&ep->mtx); - goto error_tgt_fput; - } - } } } @@ -2076,8 +2070,6 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, error = -ENOENT; break; } - if (tep != NULL) - mutex_unlock(&tep->mtx); mutex_unlock(&ep->mtx); error_tgt_fput: From ad9366b1361fd6ed3f85f670bdb4e8af039e450c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Sep 2020 18:32:48 -0400 Subject: [PATCH 21/27] take the common part of ep_eventpoll_poll() and ep_item_poll() into helper The only reason why ep_item_poll() can't simply call ep_eventpoll_poll() (or, better yet, call vfs_poll() in all cases) is that we need to tell lockdep how deep into the hierarchy of ->mtx we are. So let's add a variant of ep_eventpoll_poll() that would take depth explicitly and turn ep_eventpoll_poll() into wrapper for that. Signed-off-by: Al Viro --- fs/eventpoll.c | 57 ++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 3fbce9c91481..ee31ebe33150 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -732,6 +732,27 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head, int depth); +static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth) +{ + struct eventpoll *ep = file->private_data; + LIST_HEAD(txlist); + __poll_t res; + + /* Insert inside our poll wait queue */ + poll_wait(file, &ep->poll_wait, wait); + + /* + * Proceed to find out if wanted events are really available inside + * the ready list. 
+ */ + mutex_lock_nested(&ep->mtx, depth); + ep_start_scan(ep, &txlist); + res = ep_read_events_proc(ep, &txlist, depth + 1); + ep_done_scan(ep, &txlist); + mutex_unlock(&ep->mtx); + return res; +} + /* * Differs from ep_eventpoll_poll() in that internal callers already have * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() @@ -740,22 +761,14 @@ static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth) { - struct eventpoll *ep; - LIST_HEAD(txlist); + struct file *file = epi->ffd.file; __poll_t res; pt->_key = epi->event.events; - if (!is_file_epoll(epi->ffd.file)) - return vfs_poll(epi->ffd.file, pt) & epi->event.events; - - ep = epi->ffd.file->private_data; - poll_wait(epi->ffd.file, &ep->poll_wait, pt); - - mutex_lock_nested(&ep->mtx, depth); - ep_start_scan(ep, &txlist); - res = ep_read_events_proc(ep, &txlist, depth + 1); - ep_done_scan(ep, &txlist); - mutex_unlock(&ep->mtx); + if (!is_file_epoll(file)) + res = vfs_poll(file, pt); + else + res = __ep_eventpoll_poll(file, pt, depth); return res & epi->event.events; } @@ -786,23 +799,7 @@ static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait) { - struct eventpoll *ep = file->private_data; - LIST_HEAD(txlist); - __poll_t res; - - /* Insert inside our poll wait queue */ - poll_wait(file, &ep->poll_wait, wait); - - /* - * Proceed to find out if wanted events are really available inside - * the ready list. - */ - mutex_lock(&ep->mtx); - ep_start_scan(ep, &txlist); - res = ep_read_events_proc(ep, &txlist, 1); - ep_done_scan(ep, &txlist); - mutex_unlock(&ep->mtx); - return res; + return __ep_eventpoll_poll(file, wait, 0); } #ifdef CONFIG_PROC_FS From 2c0b71c1e9c9362c9503f218fed62aeb66a2ef97 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Sep 2020 18:48:57 -0400 Subject: [PATCH 22/27] fold ep_read_events_proc() into the only caller Signed-off-by: Al Viro --- fs/eventpoll.c | 49 ++++++++++++++++++++----------------------------- 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index ee31ebe33150..4903104137c6 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -729,14 +729,17 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) return 0; } -static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head, - int depth); +static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth); static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth) { struct eventpoll *ep = file->private_data; LIST_HEAD(txlist); - __poll_t res; + struct epitem *epi, *tmp; + poll_table pt; + __poll_t res = 0; + + init_poll_funcptr(&pt, NULL); /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); @@ -747,7 +750,20 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep */ mutex_lock_nested(&ep->mtx, depth); ep_start_scan(ep, &txlist); - res = ep_read_events_proc(ep, &txlist, depth + 1); + list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { + if (ep_item_poll(epi, &pt, depth + 1)) { + res = EPOLLIN | EPOLLRDNORM; + break; + } else { + /* + * Item has been dropped into the ready list by the poll + * callback, but it's not actually ready, as far as + * caller requested events goes. We can remove it here. 
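+ * __pm_relax() pairs with the ep_pm_stay_awake() done when the item
+ * was queued, so an item that turns out not to be ready does not keep
+ * its wakeup source active.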
+ */ + __pm_relax(ep_wakeup_source(epi)); + list_del_init(&epi->rdllink); + } + } ep_done_scan(ep, &txlist); mutex_unlock(&ep->mtx); return res; @@ -772,31 +788,6 @@ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, return res & epi->event.events; } -static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head, - int depth) -{ - struct epitem *epi, *tmp; - poll_table pt; - - init_poll_funcptr(&pt, NULL); - - list_for_each_entry_safe(epi, tmp, head, rdllink) { - if (ep_item_poll(epi, &pt, depth)) { - return EPOLLIN | EPOLLRDNORM; - } else { - /* - * Item has been dropped into the ready list by the poll - * callback, but it's not actually ready, as far as - * caller requested events goes. We can remove it here. - */ - __pm_relax(ep_wakeup_source(epi)); - list_del_init(&epi->rdllink); - } - } - - return 0; -} - static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait) { return __ep_eventpoll_poll(file, wait, 0); From d1ec50adb560983635bd31263012e688cc167f31 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 27 Sep 2020 11:03:32 -0400 Subject: [PATCH 23/27] ep_insert(): move creation of wakeup source past the fl_ep_links insertion That's the beginning of preparations for taking f_ep_links out of struct file. If insertion might fail, we will need a new failure exit. Having wakeup source creation done after that point will simplify life there; ep_remove() can (and commonly does) live with NULL epi->ws, so it can be used for cleanup after ep_create_wakeup_source() failure. It can't be used before the rbtree insertion, though, so if we are to unify all old failure exits, we need to move that thing down. Then we would be free to do simple kmem_cache_free() on the failure to insert into f_ep_links - no wakeup source to leak on that failure exit. Signed-off-by: Al Viro --- fs/eventpoll.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4903104137c6..680f4ada53d4 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1356,26 +1356,16 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, user_watches = atomic_long_read(&ep->user->epoll_watches); if (unlikely(user_watches >= max_user_watches)) return -ENOSPC; - if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) + if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) return -ENOMEM; /* Item initialization follow here ... 
*/ INIT_LIST_HEAD(&epi->rdllink); INIT_LIST_HEAD(&epi->fllink); - epi->pwqlist = NULL; epi->ep = ep; ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; epi->next = EP_UNACTIVE_PTR; - if (epi->event.events & EPOLLWAKEUP) { - error = ep_create_wakeup_source(epi); - if (error) { - kmem_cache_free(epi_cache, epi); - return error; - } - } else { - RCU_INIT_POINTER(epi->ws, NULL); - } atomic_long_inc(&ep->user->epoll_watches); @@ -1400,6 +1390,14 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, return -EINVAL; } + if (epi->event.events & EPOLLWAKEUP) { + error = ep_create_wakeup_source(epi); + if (error) { + ep_remove(ep, epi); + return error; + } + } + /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); From 44cdc1d952e3f7aa9944c1bbf38fc23f49885017 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 27 Sep 2020 11:18:30 -0400 Subject: [PATCH 24/27] convert ->f_ep_links/->fllink to hlist we don't care about the order of elements there Signed-off-by: Al Viro --- fs/eventpoll.c | 18 +++++++++--------- include/linux/eventpoll.h | 4 ++-- include/linux/fs.h | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 680f4ada53d4..c40576e62728 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -160,7 +160,7 @@ struct epitem { struct eventpoll *ep; /* List header used to link this item to the "struct file" items list */ - struct list_head fllink; + struct hlist_node fllink; /* wakeup_source used when EPOLLWAKEUP is set */ struct wakeup_source __rcu *ws; @@ -642,7 +642,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_lock); - list_del_rcu(&epi->fllink); + hlist_del_rcu(&epi->fllink); spin_unlock(&file->f_lock); rb_erase_cached(&epi->rbn, &ep->rbr); @@ -835,7 +835,8 @@ static const struct file_operations eventpoll_fops = { void eventpoll_release_file(struct file *file) { struct eventpoll *ep; - struct epitem *epi, *next; + struct epitem *epi; + struct hlist_node *next; /* * We don't want to get "file->f_lock" because it is not @@ -851,7 +852,7 @@ void eventpoll_release_file(struct file *file) * Besides, ep_remove() acquires the lock, so we can't hold it here. */ mutex_lock(&epmutex); - list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) { + hlist_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) { ep = epi->ep; mutex_lock_nested(&ep->mtx, 0); ep_remove(ep, epi); @@ -1257,11 +1258,11 @@ static int reverse_path_check_proc(struct file *file, int depth) /* CTL_DEL can remove links here, but that can't increase our count */ rcu_read_lock(); - list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) { + hlist_for_each_entry_rcu(epi, &file->f_ep_links, fllink) { struct file *recepient = epi->ep->file; if (WARN_ON(!is_file_epoll(recepient))) continue; - if (list_empty(&recepient->f_ep_links)) + if (hlist_empty(&recepient->f_ep_links)) error = path_count_inc(depth); else error = reverse_path_check_proc(recepient, depth + 1); @@ -1361,7 +1362,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, /* Item initialization follow here ... 
*/ INIT_LIST_HEAD(&epi->rdllink); - INIT_LIST_HEAD(&epi->fllink); epi->ep = ep; ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; @@ -1373,7 +1373,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, mutex_lock_nested(&tep->mtx, 1); /* Add the current item to the list of active epoll hook for this file */ spin_lock(&tfile->f_lock); - list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links); + hlist_add_head_rcu(&epi->fllink, &tfile->f_ep_links); spin_unlock(&tfile->f_lock); /* @@ -1999,7 +1999,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, if (error) goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { - if (!list_empty(&f.file->f_ep_links) || + if (!hlist_empty(&f.file->f_ep_links) || ep->gen == loop_check_gen || is_file_epoll(tf.file)) { mutex_unlock(&ep->mtx); diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 8f000fada5a4..4e215ccfa792 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -25,7 +25,7 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long t /* Used to initialize the epoll bits inside the "struct file" */ static inline void eventpoll_init_file(struct file *file) { - INIT_LIST_HEAD(&file->f_ep_links); + INIT_HLIST_HEAD(&file->f_ep_links); INIT_LIST_HEAD(&file->f_tfile_llink); } @@ -50,7 +50,7 @@ static inline void eventpoll_release(struct file *file) * because the file in on the way to be removed and nobody ( but * eventpoll ) has still a reference to this file. */ - if (likely(list_empty(&file->f_ep_links))) + if (likely(hlist_empty(&file->f_ep_links))) return; /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 0bd126418bb6..b484ba0e474a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -946,7 +946,7 @@ struct file { #ifdef CONFIG_EPOLL /* Used by fs/eventpoll.c to link all the hooks to this file */ - struct list_head f_ep_links; + struct hlist_head f_ep_links; struct list_head f_tfile_llink; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; From b62d2706a754887800a7cec4eb0592a9263a38fc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 1 Oct 2020 14:11:00 -0400 Subject: [PATCH 25/27] lift rcu_read_lock() into reverse_path_check() Signed-off-by: Al Viro --- fs/eventpoll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index c40576e62728..06674515132c 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1257,7 +1257,6 @@ static int reverse_path_check_proc(struct file *file, int depth) return -1; /* CTL_DEL can remove links here, but that can't increase our count */ - rcu_read_lock(); hlist_for_each_entry_rcu(epi, &file->f_ep_links, fllink) { struct file *recepient = epi->ep->file; if (WARN_ON(!is_file_epoll(recepient))) @@ -1269,7 +1268,6 @@ static int reverse_path_check_proc(struct file *file, int depth) if (error != 0) break; } - rcu_read_unlock(); return error; } @@ -1291,7 +1289,9 @@ static int reverse_path_check(void) /* let's call this for all tfiles */ list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { path_count_init(); + rcu_read_lock(); error = reverse_path_check_proc(current_file, 0); + rcu_read_unlock(); if (error) break; } From d9f41e3c95a17c263bd72799ea8c33ea6138dc22 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 1 Oct 2020 16:10:11 -0400 Subject: [PATCH 26/27] epoll: massage the check list insertion in the "non-epoll target" cases do it in ep_insert() rather than in do_epoll_ctl(), so that we do it only with some epitem is 
already guaranteed to exist. Signed-off-by: Al Viro --- fs/eventpoll.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 06674515132c..8f68095c5ce9 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1375,6 +1375,10 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, spin_lock(&tfile->f_lock); hlist_add_head_rcu(&epi->fllink, &tfile->f_ep_links); spin_unlock(&tfile->f_lock); + if (full_check && !tep) { + get_file(tfile); + list_add(&tfile->f_tfile_llink, &tfile_check_list); + } /* * Add the current item to the RB tree. All RB tree operations are * protected by "mtx", and ep_insert() is called with "mtx" held. @@ -2013,10 +2017,6 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, error = -ELOOP; if (ep_loop_check(ep, tep) != 0) goto error_tgt_fput; - } else { - get_file(tf.file); - list_add(&tf.file->f_tfile_llink, - &tfile_check_list); - } error = epoll_mutex_lock(&ep->mtx, 0, nonblock); if (error)

From 319c15174757aaedacc89a6e55c965416f130e64 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 1 Oct 2020 20:45:51 -0400 Subject: [PATCH 27/27] epoll: take epitem list out of struct file

Move the head of the epitem list out of struct file; for epoll files it's moved into struct eventpoll (->refs there), for non-epoll ones - into the new object (struct epitems_head). In place of ->f_ep_links we leave a pointer to the list head (->f_ep). ->f_ep is protected by ->f_lock and it's zeroed as soon as the list of epitems becomes empty (that can happen only in ep_remove() by now). The list of files for the reverse path check is *not* going through struct file now - it's a single-linked list going through epitems_head instances. It's terminated by ERR_PTR(-1) (== EP_UNACTIVE_PTR), so the elements of the list can be distinguished by head->next != NULL. epitems_head instances are allocated at ep_insert() time (by attach_epitem()) and freed either by ep_remove() (if it empties the set of epitems *and* the epitems_head does not belong to the reverse path check list) or by clear_tfile_check_list() when the list is emptied (if the set of epitems is empty by that point). Allocations are done from a separate slab - the minimal kmalloc() size is too large on some architectures. As a result, we trim struct file _and_ get rid of the games with temporary file references. Locking and barriers are interesting (aren't they always); see unlist_file() and ep_remove() for details. The non-obvious part is that ep_remove() needs to decide if it will be the one to free the damn thing *before* actually storing NULL to head->epitems.first - that's what smp_load_acquire() is for in there. The unlist_file() lockless path is safe, since we hit it only if we observe NULL in head->epitems.first and whoever had done that store is guaranteed to have observed non-NULL in head->next. IOW, their last access had been the store of NULL into ->epitems.first and we can safely free the sucker. OTOH, we are under rcu_read_lock() and both the epitem and epitem->file have their freeing RCU-delayed. So if we see non-NULL ->epitems.first, we can grab ->f_lock (all epitems in there share the same struct file) and safely recheck the emptiness of ->epitems; again, ->next is still non-NULL, so ep_remove() couldn't have freed head yet. ->f_lock serializes us wrt ep_remove(); the rest is trivial. Note that once head->epitems becomes NULL, nothing can get inserted into it - the only remaining reference to head after that point is from the reverse path check list.
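To make the ordering above concrete, the handshake can be modelled in userspace with C11 atomics standing in for smp_load_acquire()/smp_store_release(). This is only a sketch of the barrier pairing - the window it leaves open (both sides concluding "not mine to free") is closed in the kernel by ->f_lock, which the model omits - and every name below is invented for illustration:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* toy model: 'first' stands for head->epitems.first, 'next' for
 * head->next (non-NULL while the head sits on the check list) */
struct toy_head {
	_Atomic(void *) first;
	_Atomic(void *) next;
};

/* ep_remove() side: decide whether we own the freeing *before*
 * publishing emptiness through 'first' */
static bool emptier_frees(struct toy_head *h)
{
	bool ours = !atomic_load_explicit(&h->next, memory_order_acquire);
	atomic_store_explicit(&h->first, NULL, memory_order_release);
	return ours;
}

/* unlist_file() side: seeing first == NULL means the emptier is done
 * with 'h' and, having seen our non-NULL 'next', left the freeing to us */
static bool unlister_frees(struct toy_head *h)
{
	if (!atomic_load_explicit(&h->first, memory_order_acquire))
		return true;
	/* epitems still present: just drop the check-list reference and
	 * let a later emptier find next == NULL and free the head itself */
	atomic_store_explicit(&h->next, NULL, memory_order_release);
	return false;
}

int main(void)
{
	struct toy_head h;
	atomic_init(&h.first, (void *)1);	/* one epitem attached */
	atomic_init(&h.next, (void *)1);	/* on the check list */

	printf("unlister frees: %d\n", unlister_frees(&h));	/* 0 */
	printf("emptier frees: %d\n", emptier_frees(&h));	/* 1 */
	return 0;
}

The property being modelled is exactly the one argued in the message: whoever stores their "done" marker second must, by the acquire load just before it, have seen the other side's claim, so the head is freed by exactly the party that can still reach it.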
Signed-off-by: Al Viro --- fs/eventpoll.c | 168 ++++++++++++++++++++++++++++---------- fs/file_table.c | 1 - include/linux/eventpoll.h | 11 +-- include/linux/fs.h | 5 +- 4 files changed, 129 insertions(+), 56 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8f68095c5ce9..d89aead356df 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -215,6 +215,7 @@ struct eventpoll { /* used to optimize loop detection check */ u64 gen; + struct hlist_head refs; #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ @@ -259,7 +260,45 @@ static struct kmem_cache *pwq_cache __read_mostly; * List of files with newly added links, where we may need to limit the number * of emanating paths. Protected by the epmutex. */ -static LIST_HEAD(tfile_check_list); +struct epitems_head { + struct hlist_head epitems; + struct epitems_head *next; +}; +static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR; + +static struct kmem_cache *ephead_cache __read_mostly; + +static inline void free_ephead(struct epitems_head *head) +{ + if (head) + kmem_cache_free(ephead_cache, head); +} + +static void list_file(struct file *file) +{ + struct epitems_head *head; + + head = container_of(file->f_ep, struct epitems_head, epitems); + if (!head->next) { + head->next = tfile_check_list; + tfile_check_list = head; + } +} + +static void unlist_file(struct epitems_head *head) +{ + struct epitems_head *to_free = head; + struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems)); + if (p) { + struct epitem *epi= container_of(p, struct epitem, fllink); + spin_lock(&epi->ffd.file->f_lock); + if (!hlist_empty(&head->epitems)) + to_free = NULL; + head->next = NULL; + spin_unlock(&epi->ffd.file->f_lock); + } + free_ephead(to_free); +} #ifdef CONFIG_SYSCTL @@ -632,6 +671,8 @@ static void epi_rcu_free(struct rcu_head *head) static int ep_remove(struct eventpoll *ep, struct epitem *epi) { struct file *file = epi->ffd.file; + struct epitems_head *to_free; + struct hlist_head *head; lockdep_assert_irqs_enabled(); @@ -642,8 +683,20 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_lock); + to_free = NULL; + head = file->f_ep; + if (head->first == &epi->fllink && !epi->fllink.next) { + file->f_ep = NULL; + if (!is_file_epoll(file)) { + struct epitems_head *v; + v = container_of(head, struct epitems_head, epitems); + if (!smp_load_acquire(&v->next)) + to_free = v; + } + } hlist_del_rcu(&epi->fllink); spin_unlock(&file->f_lock); + free_ephead(to_free); rb_erase_cached(&epi->rbn, &ep->rbr); @@ -852,7 +905,11 @@ void eventpoll_release_file(struct file *file) * Besides, ep_remove() acquires the lock, so we can't hold it here. 
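 * Also note that ->f_ep may have become NULL by the time we get here,
 * which is why the check below is redone under epmutex.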
*/ mutex_lock(&epmutex); - hlist_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) { + if (unlikely(!file->f_ep)) { + mutex_unlock(&epmutex); + return; + } + hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) { ep = epi->ep; mutex_lock_nested(&ep->mtx, 0); ep_remove(ep, epi); @@ -1248,7 +1305,7 @@ static void path_count_init(void) path_count[i] = 0; } -static int reverse_path_check_proc(struct file *file, int depth) +static int reverse_path_check_proc(struct hlist_head *refs, int depth) { int error = 0; struct epitem *epi; @@ -1257,14 +1314,12 @@ static int reverse_path_check_proc(struct file *file, int depth) return -1; /* CTL_DEL can remove links here, but that can't increase our count */ - hlist_for_each_entry_rcu(epi, &file->f_ep_links, fllink) { - struct file *recepient = epi->ep->file; - if (WARN_ON(!is_file_epoll(recepient))) - continue; - if (hlist_empty(&recepient->f_ep_links)) + hlist_for_each_entry_rcu(epi, refs, fllink) { + struct hlist_head *refs = &epi->ep->refs; + if (hlist_empty(refs)) error = path_count_inc(depth); else - error = reverse_path_check_proc(recepient, depth + 1); + error = reverse_path_check_proc(refs, depth + 1); if (error != 0) break; } @@ -1272,7 +1327,7 @@ static int reverse_path_check_proc(struct file *file, int depth) } /** - * reverse_path_check - The tfile_check_list is list of file *, which have + * reverse_path_check - The tfile_check_list is list of epitem_head, which have * links that are proposed to be newly added. We need to * make sure that those added links don't add too many * paths such that we will spend all our time waking up @@ -1283,19 +1338,18 @@ static int reverse_path_check_proc(struct file *file, int depth) */ static int reverse_path_check(void) { - int error = 0; - struct file *current_file; + struct epitems_head *p; - /* let's call this for all tfiles */ - list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { + for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) { + int error; path_count_init(); rcu_read_lock(); - error = reverse_path_check_proc(current_file, 0); + error = reverse_path_check_proc(&p->epitems, 0); rcu_read_unlock(); if (error) - break; + return error; } - return error; + return 0; } static int ep_create_wakeup_source(struct epitem *epi) @@ -1336,6 +1390,39 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi) wakeup_source_unregister(ws); } +static int attach_epitem(struct file *file, struct epitem *epi) +{ + struct epitems_head *to_free = NULL; + struct hlist_head *head = NULL; + struct eventpoll *ep = NULL; + + if (is_file_epoll(file)) + ep = file->private_data; + + if (ep) { + head = &ep->refs; + } else if (!READ_ONCE(file->f_ep)) { +allocate: + to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL); + if (!to_free) + return -ENOMEM; + head = &to_free->epitems; + } + spin_lock(&file->f_lock); + if (!file->f_ep) { + if (unlikely(!head)) { + spin_unlock(&file->f_lock); + goto allocate; + } + file->f_ep = head; + to_free = NULL; + } + hlist_add_head_rcu(&epi->fllink, file->f_ep); + spin_unlock(&file->f_lock); + free_ephead(to_free); + return 0; +} + /* * Must be called with "mtx" held. 
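 * A failure of attach_epitem() below needs no unwinding beyond freeing
 * the epitem itself: the wakeup source, if requested, is only created
 * later.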
*/ @@ -1367,19 +1454,21 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, epi->event = *event; epi->next = EP_UNACTIVE_PTR; - atomic_long_inc(&ep->user->epoll_watches); - if (tep) mutex_lock_nested(&tep->mtx, 1); /* Add the current item to the list of active epoll hook for this file */ - spin_lock(&tfile->f_lock); - hlist_add_head_rcu(&epi->fllink, &tfile->f_ep_links); - spin_unlock(&tfile->f_lock); - if (full_check && !tep) { - get_file(tfile); - list_add(&tfile->f_tfile_llink, &tfile_check_list); + if (unlikely(attach_epitem(tfile, epi) < 0)) { + kmem_cache_free(epi_cache, epi); + if (tep) + mutex_unlock(&tep->mtx); + return -ENOMEM; } + if (full_check && !tep) + list_file(tfile); + + atomic_long_inc(&ep->user->epoll_watches); + /* * Add the current item to the RB tree. All RB tree operations are * protected by "mtx", and ep_insert() is called with "mtx" held. @@ -1813,11 +1902,7 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth) * not already there, and calling reverse_path_check() * during ep_insert(). */ - if (list_empty(&epi->ffd.file->f_tfile_llink)) { - if (get_file_rcu(epi->ffd.file)) - list_add(&epi->ffd.file->f_tfile_llink, - &tfile_check_list); - } + list_file(epi->ffd.file); } } mutex_unlock(&ep->mtx); @@ -1844,16 +1929,13 @@ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) static void clear_tfile_check_list(void) { - struct file *file; - - /* first clear the tfile_check_list */ - while (!list_empty(&tfile_check_list)) { - file = list_first_entry(&tfile_check_list, struct file, - f_tfile_llink); - list_del_init(&file->f_tfile_llink); - fput(file); + rcu_read_lock(); + while (tfile_check_list != EP_UNACTIVE_PTR) { + struct epitems_head *head = tfile_check_list; + tfile_check_list = head->next; + unlist_file(head); } - INIT_LIST_HEAD(&tfile_check_list); + rcu_read_unlock(); } /* @@ -2003,9 +2085,8 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, if (error) goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { - if (!hlist_empty(&f.file->f_ep_links) || - ep->gen == loop_check_gen || - is_file_epoll(tf.file)) { + if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen || + is_file_epoll(tf.file)) { mutex_unlock(&ep->mtx); error = epoll_mutex_lock(&epmutex, 0, nonblock); if (error) @@ -2216,6 +2297,9 @@ static int __init eventpoll_init(void) pwq_cache = kmem_cache_create("eventpoll_pwq", sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); + ephead_cache = kmem_cache_create("ep_head", + sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); + return 0; } fs_initcall(eventpoll_init); diff --git a/fs/file_table.c b/fs/file_table.c index 709ada3151da..45437f8e1003 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -113,7 +113,6 @@ static struct file *__alloc_file(int flags, const struct cred *cred) rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); mutex_init(&f->f_pos_lock); - eventpoll_init_file(f); f->f_flags = flags; f->f_mode = OPEN_FMODE(flags); /* f->f_version: 0 */ diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 4e215ccfa792..0350393465d4 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -22,14 +22,6 @@ struct file; struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff); #endif -/* Used to initialize the epoll bits inside the "struct file" */ -static inline void eventpoll_init_file(struct file *file) -{ - INIT_HLIST_HEAD(&file->f_ep_links); - INIT_LIST_HEAD(&file->f_tfile_llink); -} - - 
/* Used to release the epoll bits inside the "struct file" */ void eventpoll_release_file(struct file *file); @@ -50,7 +42,7 @@ static inline void eventpoll_release(struct file *file) * because the file in on the way to be removed and nobody ( but * eventpoll ) has still a reference to this file. */ - if (likely(hlist_empty(&file->f_ep_links))) + if (likely(!file->f_ep)) return; /* @@ -72,7 +64,6 @@ static inline int ep_op_has_event(int op) #else -static inline void eventpoll_init_file(struct file *file) {} static inline void eventpoll_release(struct file *file) {} #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index b484ba0e474a..993c540e3b77 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -923,7 +923,7 @@ struct file { const struct file_operations *f_op; /* - * Protects f_ep_links, f_flags. + * Protects f_ep, f_flags. * Must not be taken from IRQ context. */ spinlock_t f_lock; @@ -946,8 +946,7 @@ struct file { #ifdef CONFIG_EPOLL /* Used by fs/eventpoll.c to link all the hooks to this file */ - struct hlist_head f_ep_links; - struct list_head f_tfile_llink; + struct hlist_head *f_ep; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; errseq_t f_wb_err;
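A closing note on attach_epitem() above: it is the classic allocate-unlocked, recheck-under-the-lock shape - the slab allocation may sleep, while ->f_lock is a spinlock, so the allocation has to happen outside it and the loser of a race discards its copy. A minimal userspace rendition of the same shape, with a pthread mutex standing in for ->f_lock and all names invented:

#include <pthread.h>
#include <stdlib.h>

struct lazy_head { int unused; };

static pthread_mutex_t file_lock = PTHREAD_MUTEX_INITIALIZER;
static struct lazy_head *file_head;	/* models file->f_ep */

static int attach(void)
{
	struct lazy_head *fresh = NULL;

	for (;;) {
		pthread_mutex_lock(&file_lock);
		if (file_head) {
			/* somebody else installed a head meanwhile; our
			 * copy (if any) lost the race, freed below */
			break;
		}
		if (fresh) {
			file_head = fresh;	/* publish ours */
			fresh = NULL;
			break;
		}
		/* need an allocation, which may block: drop the lock,
		 * allocate, come back and recheck */
		pthread_mutex_unlock(&file_lock);
		fresh = calloc(1, sizeof(*fresh));
		if (!fresh)
			return -1;
	}
	/* ... the new epitem would be hooked onto file_head here ... */
	pthread_mutex_unlock(&file_lock);
	free(fresh);	/* NULL, or the loser of the race */
	return 0;
}

int main(void)
{
	return attach();
}

This mirrors the goto allocate retry in the patch, including the detail that the losing allocation is only freed after the lock is dropped (free_ephead() after spin_unlock() in attach_epitem()).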