8e3ae98
epoll can acquire the ep->mtx of multiple "struct eventpoll"s
8e3ae98
at once in the case where one epoll fd is monitoring another epoll
8e3ae98
fd. This is perfectly OK, since we're careful about the lock ordering,
8e3ae98
but causes spurious lockdep warnings. Annotate the recursion using
8e3ae98
mutex_lock_nested, and add a comment explaining the nesting rules for
8e3ae98
good measure.
8e3ae98
8e3ae98
Reported-by: Paul Bolle <pebolle@tiscali.nl>
8e3ae98
Signed-off-by: Nelson Elhage <nelhage@nelhage.com>
8e3ae98
---
8e3ae98
 I've tested this on a synthetic epoll test case that just adds e1 to
8e3ae98
 e2 and then does an epoll_wait(). I verified that it caused lockdep
8e3ae98
 problems on 3.0 and that this patch fixed it, but I haven't done more
8e3ae98
 extensive testing. Paul, are you able to test systemd against this?
8e3ae98
8e3ae98
 fs/eventpoll.c |   25 ++++++++++++++++++-------
8e3ae98
 1 files changed, 18 insertions(+), 7 deletions(-)
8e3ae98
8e3ae98
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
8e3ae98
index f9cfd16..0cb7bc6 100644
8e3ae98
--- a/fs/eventpoll.c
8e3ae98
+++ b/fs/eventpoll.c
8e3ae98
@@ -76,6 +76,15 @@
8e3ae98
  * Events that require holding "epmutex" are very rare, while for
8e3ae98
  * normal operations the epoll private "ep->mtx" will guarantee
8e3ae98
  * a better scalability.
8e3ae98
+ * It is possible to acquire multiple "ep->mtx"es at once in the case
8e3ae98
+ * when one epoll fd is added to another. In this case, we always
8e3ae98
+ * acquire the locks in the order of nesting (i.e. after epoll_ctl(e1,
8e3ae98
+ * EPOLL_CTL_ADD, e2), e1->mtx will always be acquired before
8e3ae98
+ * e2->mtx). Since we disallow cycles of epoll file descriptors, this
8e3ae98
+ * ensures that the mutexes are well-ordered. In order to communicate
8e3ae98
+ * this nesting to lockdep, when walking a tree of epoll file
8e3ae98
+ * descriptors, we use the current recursion depth as the lockdep
8e3ae98
+ * subkey.
8e3ae98
  */
8e3ae98
 
8e3ae98
 /* Epoll private bits inside the event mask */
8e3ae98
@@ -464,13 +473,15 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
8e3ae98
  * @ep: Pointer to the epoll private data structure.
8e3ae98
  * @sproc: Pointer to the scan callback.
8e3ae98
  * @priv: Private opaque data passed to the @sproc callback.
8e3ae98
+ * @depth: The current depth of recursive f_op->poll calls.
8e3ae98
  *
8e3ae98
  * Returns: The same integer error code returned by the @sproc callback.
8e3ae98
  */
8e3ae98
 static int ep_scan_ready_list(struct eventpoll *ep,
8e3ae98
 			      int (*sproc)(struct eventpoll *,
8e3ae98
 					   struct list_head *, void *),
8e3ae98
-			      void *priv)
8e3ae98
+			      void *priv,
8e3ae98
+			      int depth)
8e3ae98
 {
8e3ae98
 	int error, pwake = 0;
8e3ae98
 	unsigned long flags;
8e3ae98
@@ -481,7 +492,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
8e3ae98
 	 * We need to lock this because we could be hit by
8e3ae98
 	 * eventpoll_release_file() and epoll_ctl().
8e3ae98
 	 */
8e3ae98
-	mutex_lock(&ep->mtx);
8e3ae98
+	mutex_lock_nested(&ep->mtx, depth);
8e3ae98
 
8e3ae98
 	/*
8e3ae98
 	 * Steal the ready list, and re-init the original one to the
8e3ae98
@@ -670,7 +681,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
8e3ae98
 
8e3ae98
 static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
8e3ae98
 {
8e3ae98
-	return ep_scan_ready_list(priv, ep_read_events_proc, NULL);
8e3ae98
+	return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1);
8e3ae98
 }
8e3ae98
 
8e3ae98
 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
8e3ae98
@@ -737,7 +748,7 @@ void eventpoll_release_file(struct file *file)
8e3ae98
 
8e3ae98
 		ep = epi->ep;
8e3ae98
 		list_del_init(&epi->fllink);
8e3ae98
-		mutex_lock(&ep->mtx);
8e3ae98
+		mutex_lock_nested(&ep->mtx, 0);
8e3ae98
 		ep_remove(ep, epi);
8e3ae98
 		mutex_unlock(&ep->mtx);
8e3ae98
 	}
8e3ae98
@@ -1134,7 +1145,7 @@ static int ep_send_events(struct eventpoll *ep,
8e3ae98
 	esed.maxevents = maxevents;
8e3ae98
 	esed.events = events;
8e3ae98
 
8e3ae98
-	return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
8e3ae98
+	return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
8e3ae98
 }
8e3ae98
 
8e3ae98
 static inline struct timespec ep_set_mstimeout(long ms)
8e3ae98
@@ -1267,7 +1278,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
8e3ae98
 	struct rb_node *rbp;
8e3ae98
 	struct epitem *epi;
8e3ae98
 
8e3ae98
-	mutex_lock(&ep->mtx);
8e3ae98
+	mutex_lock_nested(&ep->mtx, call_nests + 1);
8e3ae98
 	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
8e3ae98
 		epi = rb_entry(rbp, struct epitem, rbn);
8e3ae98
 		if (unlikely(is_file_epoll(epi->ffd.file))) {
8e3ae98
@@ -1409,7 +1420,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
8e3ae98
 	}
8e3ae98
 
8e3ae98
 
8e3ae98
-	mutex_lock(&ep->mtx);
8e3ae98
+	mutex_lock_nested(&ep->mtx, 0);
8e3ae98
 
8e3ae98
 	/*
8e3ae98
 	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
8e3ae98
-- 
8e3ae98
1.7.4.1
8e3ae98
8e3ae98
--
8e3ae98
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
8e3ae98
the body of a message to majordomo@vger.kernel.org
8e3ae98
More majordomo info at  http://vger.kernel.org/majordomo-info.html