Josh Boyer 8e3ae98
epoll can acquire the "ep->mtx" mutexes of multiple "struct eventpoll"s
Josh Boyer 8e3ae98
at once in the case where one epoll fd is monitoring another epoll
Josh Boyer 8e3ae98
fd. This is perfectly OK, since we're careful about the lock ordering,
Josh Boyer 8e3ae98
but causes spurious lockdep warnings. Annotate the recursion using
Josh Boyer 8e3ae98
mutex_lock_nested, and add a comment explaining the nesting rules for
Josh Boyer 8e3ae98
good measure.
Josh Boyer 8e3ae98
Josh Boyer 8e3ae98
Reported-by: Paul Bolle <pebolle@tiscali.nl>
Josh Boyer 8e3ae98
Signed-off-by: Nelson Elhage <nelhage@nelhage.com>
Josh Boyer 8e3ae98
---
Josh Boyer 8e3ae98
 I've tested this with a synthetic epoll test case that just adds e1 to
Josh Boyer 8e3ae98
 e2 and then does an epoll_wait(). I verified that it caused lockdep
Josh Boyer 8e3ae98
 problems on 3.0 and that this patch fixed it, but I haven't done more
Josh Boyer 8e3ae98
 extensive testing. Paul, are you able to test systemd against this?
Josh Boyer 8e3ae98
Josh Boyer 8e3ae98
 fs/eventpoll.c |   25 ++++++++++++++++++-------
Josh Boyer 8e3ae98
 1 files changed, 18 insertions(+), 7 deletions(-)
Josh Boyer 8e3ae98
Josh Boyer 8e3ae98
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
Josh Boyer 8e3ae98
index f9cfd16..0cb7bc6 100644
Josh Boyer 8e3ae98
--- a/fs/eventpoll.c
Josh Boyer 8e3ae98
+++ b/fs/eventpoll.c
Josh Boyer 8e3ae98
@@ -76,6 +76,15 @@
Josh Boyer 8e3ae98
  * Events that require holding "epmutex" are very rare, while for
Josh Boyer 8e3ae98
  * normal operations the epoll private "ep->mtx" will guarantee
Josh Boyer 8e3ae98
  * a better scalability.
Josh Boyer 8e3ae98
+ * It is possible to acquire multiple "ep->mtx"es at once in the case
Josh Boyer 8e3ae98
+ * when one epoll fd is added to another. In this case, we always
Josh Boyer 8e3ae98
+ * acquire the locks in the order of nesting (i.e. after epoll_ctl(e1,
Josh Boyer 8e3ae98
+ * EPOLL_CTL_ADD, e2), e1->mtx will always be acquired before
Josh Boyer 8e3ae98
+ * e2->mtx). Since we disallow cycles of epoll file descriptors, this
Josh Boyer 8e3ae98
+ * ensures that the mutexes are well-ordered. In order to communicate
Josh Boyer 8e3ae98
+ * this nesting to lockdep, when walking a tree of epoll file
Josh Boyer 8e3ae98
+ * descriptors, we use the current recursion depth as the lockdep
Josh Boyer 8e3ae98
+ * subkey.
Josh Boyer 8e3ae98
  */
Josh Boyer 8e3ae98
 
Josh Boyer 8e3ae98
 /* Epoll private bits inside the event mask */
Josh Boyer 8e3ae98
@@ -464,13 +473,15 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
Josh Boyer 8e3ae98
  * @ep: Pointer to the epoll private data structure.
Josh Boyer 8e3ae98
  * @sproc: Pointer to the scan callback.
Josh Boyer 8e3ae98
  * @priv: Private opaque data passed to the @sproc callback.
Josh Boyer 8e3ae98
+ * @depth: The current depth of recursive f_op->poll calls.
Josh Boyer 8e3ae98
  *
Josh Boyer 8e3ae98
  * Returns: The same integer error code returned by the @sproc callback.
Josh Boyer 8e3ae98
  */
Josh Boyer 8e3ae98
 static int ep_scan_ready_list(struct eventpoll *ep,
Josh Boyer 8e3ae98
 			      int (*sproc)(struct eventpoll *,
Josh Boyer 8e3ae98
 					   struct list_head *, void *),
Josh Boyer 8e3ae98
-			      void *priv)
Josh Boyer 8e3ae98
+			      void *priv,
Josh Boyer 8e3ae98
+			      int depth)
Josh Boyer 8e3ae98
 {
Josh Boyer 8e3ae98
 	int error, pwake = 0;
Josh Boyer 8e3ae98
 	unsigned long flags;
Josh Boyer 8e3ae98
@@ -481,7 +492,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
Josh Boyer 8e3ae98
 	 * We need to lock this because we could be hit by
Josh Boyer 8e3ae98
 	 * eventpoll_release_file() and epoll_ctl().
Josh Boyer 8e3ae98
 	 */
Josh Boyer 8e3ae98
-	mutex_lock(&ep->mtx);
Josh Boyer 8e3ae98
+	mutex_lock_nested(&ep->mtx, depth);
Josh Boyer 8e3ae98
 
Josh Boyer 8e3ae98
 	/*
Josh Boyer 8e3ae98
 	 * Steal the ready list, and re-init the original one to the
Josh Boyer 8e3ae98
@@ -670,7 +681,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
Josh Boyer 8e3ae98
 
Josh Boyer 8e3ae98
 static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
Josh Boyer 8e3ae98
 {
Josh Boyer 8e3ae98
-	return ep_scan_ready_list(priv, ep_read_events_proc, NULL);
Josh Boyer 8e3ae98
+	return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1);
Josh Boyer 8e3ae98
 }
Josh Boyer 8e3ae98
 
Josh Boyer 8e3ae98
 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
Josh Boyer 8e3ae98
@@ -737,7 +748,7 @@ void eventpoll_release_file(struct file *file)
Josh Boyer 8e3ae98
 
Josh Boyer 8e3ae98
 		ep = epi->ep;
Josh Boyer 8e3ae98
 		list_del_init(&epi->fllink);
Josh Boyer 8e3ae98
-		mutex_lock(&ep->mtx);
Josh Boyer 8e3ae98
+		mutex_lock_nested(&ep->mtx, 0);
Josh Boyer 8e3ae98
 		ep_remove(ep, epi);
Josh Boyer 8e3ae98
 		mutex_unlock(&ep->mtx);
Josh Boyer 8e3ae98
 	}
Josh Boyer 8e3ae98
@@ -1134,7 +1145,7 @@ static int ep_send_events(struct eventpoll *ep,
Josh Boyer 8e3ae98
 	esed.maxevents = maxevents;
Josh Boyer 8e3ae98
 	esed.events = events;
Josh Boyer 8e3ae98
 
Josh Boyer 8e3ae98
-	return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
Josh Boyer 8e3ae98
+	return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
Josh Boyer 8e3ae98
 }
Josh Boyer 8e3ae98
 
Josh Boyer 8e3ae98
 static inline struct timespec ep_set_mstimeout(long ms)
Josh Boyer 8e3ae98
@@ -1267,7 +1278,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
Josh Boyer 8e3ae98
 	struct rb_node *rbp;
Josh Boyer 8e3ae98
 	struct epitem *epi;
Josh Boyer 8e3ae98
 
Josh Boyer 8e3ae98
-	mutex_lock(&ep->mtx);
Josh Boyer 8e3ae98
+	mutex_lock_nested(&ep->mtx, call_nests + 1);
Josh Boyer 8e3ae98
 	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
Josh Boyer 8e3ae98
 		epi = rb_entry(rbp, struct epitem, rbn);
Josh Boyer 8e3ae98
 		if (unlikely(is_file_epoll(epi->ffd.file))) {
Josh Boyer 8e3ae98
@@ -1409,7 +1420,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
Josh Boyer 8e3ae98
 	}
Josh Boyer 8e3ae98
 
Josh Boyer 8e3ae98
 
Josh Boyer 8e3ae98
-	mutex_lock(&ep->mtx);
Josh Boyer 8e3ae98
+	mutex_lock_nested(&ep->mtx, 0);
Josh Boyer 8e3ae98
 
Josh Boyer 8e3ae98
 	/*
Josh Boyer 8e3ae98
 	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
Josh Boyer 8e3ae98
-- 
Josh Boyer 8e3ae98
1.7.4.1
Josh Boyer 8e3ae98
Josh Boyer 8e3ae98
--
Josh Boyer 8e3ae98
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
Josh Boyer 8e3ae98
the body of a message to majordomo@vger.kernel.org
Josh Boyer 8e3ae98
More majordomo info at  http://vger.kernel.org/majordomo-info.html