Kyle McMartin 9d066c5
From 5883ea2555b2ae8dd84a256532f7abb2d4837fc1 Mon Sep 17 00:00:00 2001
Kyle McMartin 9d066c5
From: Dave Chinner <dchinner@redhat.com>
Kyle McMartin 9d066c5
Date: Tue, 20 Jul 2010 09:43:39 +1000
Kyle McMartin 9d066c5
Subject: xfs: track AGs with reclaimable inodes in per-ag radix tree
Kyle McMartin 9d066c5
Kyle McMartin 9d066c5
https://bugzilla.kernel.org/show_bug.cgi?id=16348
Kyle McMartin 9d066c5
Kyle McMartin 9d066c5
When the filesystem grows to a large number of allocation groups,
Kyle McMartin 9d066c5
the summing of reclaimable inodes gets expensive. In many cases,
Kyle McMartin 9d066c5
most AGs won't have any reclaimable inodes and so we are wasting CPU
Kyle McMartin 9d066c5
time aggregating over these AGs. This is particularly important for
Kyle McMartin 9d066c5
the inode shrinker that gets called frequently under memory
Kyle McMartin 9d066c5
pressure.
Kyle McMartin 9d066c5
Kyle McMartin 9d066c5
To avoid the overhead, track AGs with reclaimable inodes in the
Kyle McMartin 9d066c5
per-ag radix tree so that we can find all the AGs with reclaimable
Kyle McMartin 9d066c5
inodes via a simple gang tag lookup. This involves setting the tag
Kyle McMartin 9d066c5
when the first reclaimable inode is tracked in the AG, and removing
Kyle McMartin 9d066c5
the tag when the last reclaimable inode is removed from the tree.
Kyle McMartin 9d066c5
Then the summation process becomes a loop walking the radix tree
Kyle McMartin 9d066c5
summing AGs with the reclaim tag set.
Kyle McMartin 9d066c5
Kyle McMartin 9d066c5
This significantly reduces the overhead of scanning - a 6400 AG
Kyle McMartin 9d066c5
filesystem now only uses about 25% of a cpu in kswapd while slab
Kyle McMartin 9d066c5
reclaim progresses instead of being permanently stuck at 100% CPU
Kyle McMartin 9d066c5
and making little progress. Clean filesystems will see
Kyle McMartin 9d066c5
no overhead and the overhead only increases linearly with the number
Kyle McMartin 9d066c5
of dirty AGs.
Kyle McMartin 9d066c5
Kyle McMartin 9d066c5
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Kyle McMartin 9d066c5
Reviewed-by: Christoph Hellwig <hch@lst.de>
Kyle McMartin 9d066c5
---
Kyle McMartin 9d066c5
 fs/xfs/linux-2.6/xfs_sync.c  |   68 +++++++++++++++++++++++++++++++++++++----
Kyle McMartin 9d066c5
 fs/xfs/linux-2.6/xfs_trace.h |   61 +++++++++++++++++++++----------------
Kyle McMartin 9d066c5
 2 files changed, 95 insertions(+), 34 deletions(-)
Kyle McMartin 9d066c5
Kyle McMartin 9d066c5
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
Kyle McMartin 9d066c5
index a427c63..b927a54 100644
Kyle McMartin 9d066c5
--- a/fs/xfs/linux-2.6/xfs_sync.c
Kyle McMartin 9d066c5
+++ b/fs/xfs/linux-2.6/xfs_sync.c
Kyle McMartin 9d066c5
@@ -144,6 +144,41 @@ restart:
Kyle McMartin 9d066c5
 	return last_error;
Kyle McMartin 9d066c5
 }
Kyle McMartin 9d066c5
 
Kyle McMartin 9d066c5
+/*
Kyle McMartin 9d066c5
+ * Select the next per-ag structure to iterate during the walk. The reclaim
Kyle McMartin 9d066c5
+ * walk is optimised only to walk AGs with reclaimable inodes in them.
Kyle McMartin 9d066c5
+ */
Kyle McMartin 9d066c5
+static struct xfs_perag *
Kyle McMartin 9d066c5
+xfs_inode_ag_iter_next_pag(
Kyle McMartin 9d066c5
+	struct xfs_mount	*mp,
Kyle McMartin 9d066c5
+	xfs_agnumber_t		*first,
Kyle McMartin 9d066c5
+	int			tag)
Kyle McMartin 9d066c5
+{
Kyle McMartin 9d066c5
+	struct xfs_perag	*pag = NULL;
Kyle McMartin 9d066c5
+
Kyle McMartin 9d066c5
+	if (tag == XFS_ICI_RECLAIM_TAG) {
Kyle McMartin 9d066c5
+		int found;
Kyle McMartin 9d066c5
+		int ref;
Kyle McMartin 9d066c5
+
Kyle McMartin 9d066c5
+		spin_lock(&mp->m_perag_lock);
Kyle McMartin 9d066c5
+		found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
Kyle McMartin 9d066c5
+				(void **)&pag, *first, 1, tag);
Kyle McMartin 9d066c5
+		if (found <= 0) {
Kyle McMartin 9d066c5
+			spin_unlock(&mp->m_perag_lock);
Kyle McMartin 9d066c5
+			return NULL;
Kyle McMartin 9d066c5
+		}
Kyle McMartin 9d066c5
+		*first = pag->pag_agno + 1;
Kyle McMartin 9d066c5
+		/* open coded pag reference increment */
Kyle McMartin 9d066c5
+		ref = atomic_inc_return(&pag->pag_ref);
Kyle McMartin 9d066c5
+		spin_unlock(&mp->m_perag_lock);
Kyle McMartin 9d066c5
+		trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
Kyle McMartin 9d066c5
+	} else {
Kyle McMartin 9d066c5
+		pag = xfs_perag_get(mp, *first);
Kyle McMartin 9d066c5
+		(*first)++;
Kyle McMartin 9d066c5
+	}
Kyle McMartin 9d066c5
+	return pag;
Kyle McMartin 9d066c5
+}
Kyle McMartin 9d066c5
+
Kyle McMartin 9d066c5
 int
Kyle McMartin 9d066c5
 xfs_inode_ag_iterator(
Kyle McMartin 9d066c5
 	struct xfs_mount	*mp,
Kyle McMartin 9d066c5
@@ -154,16 +189,15 @@ xfs_inode_ag_iterator(
Kyle McMartin 9d066c5
 	int			exclusive,
Kyle McMartin 9d066c5
 	int			*nr_to_scan)
Kyle McMartin 9d066c5
 {
Kyle McMartin 9d066c5
+	struct xfs_perag	*pag;
Kyle McMartin 9d066c5
 	int			error = 0;
Kyle McMartin 9d066c5
 	int			last_error = 0;
Kyle McMartin 9d066c5
 	xfs_agnumber_t		ag;
Kyle McMartin 9d066c5
 	int			nr;
Kyle McMartin 9d066c5
 
Kyle McMartin 9d066c5
 	nr = nr_to_scan ? *nr_to_scan : INT_MAX;
Kyle McMartin 9d066c5
-	for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
Kyle McMartin 9d066c5
-		struct xfs_perag	*pag;
Kyle McMartin 9d066c5
-
Kyle McMartin 9d066c5
-		pag = xfs_perag_get(mp, ag);
Kyle McMartin 9d066c5
+	ag = 0;
Kyle McMartin 9d066c5
+	while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) {
Kyle McMartin 9d066c5
 		if (!pag->pag_ici_init) {
Kyle McMartin 9d066c5
 			xfs_perag_put(pag);
Kyle McMartin 9d066c5
 			continue;
Kyle McMartin 9d066c5
@@ -681,6 +715,17 @@ __xfs_inode_set_reclaim_tag(
Kyle McMartin 9d066c5
 	radix_tree_tag_set(&pag->pag_ici_root,
Kyle McMartin 9d066c5
 			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
Kyle McMartin 9d066c5
 			   XFS_ICI_RECLAIM_TAG);
Kyle McMartin 9d066c5
+
Kyle McMartin 9d066c5
+	if (!pag->pag_ici_reclaimable) {
Kyle McMartin 9d066c5
+		/* propagate the reclaim tag up into the perag radix tree */
Kyle McMartin 9d066c5
+		spin_lock(&ip->i_mount->m_perag_lock);
Kyle McMartin 9d066c5
+		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
Kyle McMartin 9d066c5
+				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
Kyle McMartin 9d066c5
+				XFS_ICI_RECLAIM_TAG);
Kyle McMartin 9d066c5
+		spin_unlock(&ip->i_mount->m_perag_lock);
Kyle McMartin 9d066c5
+		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
Kyle McMartin 9d066c5
+							-1, _RET_IP_);
Kyle McMartin 9d066c5
+	}
Kyle McMartin 9d066c5
 	pag->pag_ici_reclaimable++;
Kyle McMartin 9d066c5
 }
Kyle McMartin 9d066c5
 
Kyle McMartin 9d066c5
@@ -715,6 +760,16 @@ __xfs_inode_clear_reclaim_tag(
Kyle McMartin 9d066c5
 	radix_tree_tag_clear(&pag->pag_ici_root,
Kyle McMartin 9d066c5
 			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
Kyle McMartin 9d066c5
 	pag->pag_ici_reclaimable--;
Kyle McMartin 9d066c5
+	if (!pag->pag_ici_reclaimable) {
Kyle McMartin 9d066c5
+		/* clear the reclaim tag from the perag radix tree */
Kyle McMartin 9d066c5
+		spin_lock(&ip->i_mount->m_perag_lock);
Kyle McMartin 9d066c5
+		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
Kyle McMartin 9d066c5
+				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
Kyle McMartin 9d066c5
+				XFS_ICI_RECLAIM_TAG);
Kyle McMartin 9d066c5
+		spin_unlock(&ip->i_mount->m_perag_lock);
Kyle McMartin 9d066c5
+		trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
Kyle McMartin 9d066c5
+							-1, _RET_IP_);
Kyle McMartin 9d066c5
+	}
Kyle McMartin 9d066c5
 }
Kyle McMartin 9d066c5
 
Kyle McMartin 9d066c5
 /*
Kyle McMartin 9d066c5
@@ -903,9 +958,8 @@ xfs_reclaim_inode_shrink(
Kyle McMartin 9d066c5
 
Kyle McMartin 9d066c5
 	down_read(&xfs_mount_list_lock);
Kyle McMartin 9d066c5
 	list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
Kyle McMartin 9d066c5
-		for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
Kyle McMartin 9d066c5
-
Kyle McMartin 9d066c5
-			pag = xfs_perag_get(mp, ag);
Kyle McMartin 9d066c5
+		for (ag = 0; (pag = xfs_inode_ag_iter_next_pag(mp, &ag,
Kyle McMartin 9d066c5
+					XFS_ICI_RECLAIM_TAG)); ) {
Kyle McMartin 9d066c5
 			if (!pag->pag_ici_init) {
Kyle McMartin 9d066c5
 				xfs_perag_put(pag);
Kyle McMartin 9d066c5
 				continue;
Kyle McMartin 9d066c5
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
Kyle McMartin 9d066c5
index fcaa62f..072d581 100644
Kyle McMartin 9d066c5
--- a/fs/xfs/linux-2.6/xfs_trace.h
Kyle McMartin 9d066c5
+++ b/fs/xfs/linux-2.6/xfs_trace.h
Kyle McMartin 9d066c5
@@ -78,33 +78,6 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class,
Kyle McMartin 9d066c5
 	)
Kyle McMartin 9d066c5
 )
Kyle McMartin 9d066c5
 
Kyle McMartin 9d066c5
-#define DEFINE_PERAG_REF_EVENT(name) \
Kyle McMartin 9d066c5
-TRACE_EVENT(name, \
Kyle McMartin 9d066c5
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
Kyle McMartin 9d066c5
-		 unsigned long caller_ip), \
Kyle McMartin 9d066c5
-	TP_ARGS(mp, agno, refcount, caller_ip), \
Kyle McMartin 9d066c5
-	TP_STRUCT__entry( \
Kyle McMartin 9d066c5
-		__field(dev_t, dev) \
Kyle McMartin 9d066c5
-		__field(xfs_agnumber_t, agno) \
Kyle McMartin 9d066c5
-		__field(int, refcount) \
Kyle McMartin 9d066c5
-		__field(unsigned long, caller_ip) \
Kyle McMartin 9d066c5
-	), \
Kyle McMartin 9d066c5
-	TP_fast_assign( \
Kyle McMartin 9d066c5
-		__entry->dev = mp->m_super->s_dev; \
Kyle McMartin 9d066c5
-		__entry->agno = agno; \
Kyle McMartin 9d066c5
-		__entry->refcount = refcount; \
Kyle McMartin 9d066c5
-		__entry->caller_ip = caller_ip; \
Kyle McMartin 9d066c5
-	), \
Kyle McMartin 9d066c5
-	TP_printk("dev %d:%d agno %u refcount %d caller %pf", \
Kyle McMartin 9d066c5
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
Kyle McMartin 9d066c5
-		  __entry->agno, \
Kyle McMartin 9d066c5
-		  __entry->refcount, \
Kyle McMartin 9d066c5
-		  (char *)__entry->caller_ip) \
Kyle McMartin 9d066c5
-);
Kyle McMartin 9d066c5
-
Kyle McMartin 9d066c5
-DEFINE_PERAG_REF_EVENT(xfs_perag_get)
Kyle McMartin 9d066c5
-DEFINE_PERAG_REF_EVENT(xfs_perag_put)
Kyle McMartin 9d066c5
-
Kyle McMartin 9d066c5
 #define DEFINE_ATTR_LIST_EVENT(name) \
Kyle McMartin 9d066c5
 DEFINE_EVENT(xfs_attr_list_class, name, \
Kyle McMartin 9d066c5
 	TP_PROTO(struct xfs_attr_list_context *ctx), \
Kyle McMartin 9d066c5
@@ -118,6 +91,40 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
Kyle McMartin 9d066c5
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
Kyle McMartin 9d066c5
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
Kyle McMartin 9d066c5
 
Kyle McMartin 9d066c5
+DECLARE_EVENT_CLASS(xfs_perag_class,
Kyle McMartin 9d066c5
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
Kyle McMartin 9d066c5
+		 unsigned long caller_ip),
Kyle McMartin 9d066c5
+	TP_ARGS(mp, agno, refcount, caller_ip),
Kyle McMartin 9d066c5
+	TP_STRUCT__entry(
Kyle McMartin 9d066c5
+		__field(dev_t, dev)
Kyle McMartin 9d066c5
+		__field(xfs_agnumber_t, agno)
Kyle McMartin 9d066c5
+		__field(int, refcount)
Kyle McMartin 9d066c5
+		__field(unsigned long, caller_ip)
Kyle McMartin 9d066c5
+	),
Kyle McMartin 9d066c5
+	TP_fast_assign(
Kyle McMartin 9d066c5
+		__entry->dev = mp->m_super->s_dev;
Kyle McMartin 9d066c5
+		__entry->agno = agno;
Kyle McMartin 9d066c5
+		__entry->refcount = refcount;
Kyle McMartin 9d066c5
+		__entry->caller_ip = caller_ip;
Kyle McMartin 9d066c5
+	),
Kyle McMartin 9d066c5
+	TP_printk("dev %d:%d agno %u refcount %d caller %pf",
Kyle McMartin 9d066c5
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
Kyle McMartin 9d066c5
+		  __entry->agno,
Kyle McMartin 9d066c5
+		  __entry->refcount,
Kyle McMartin 9d066c5
+		  (char *)__entry->caller_ip)
Kyle McMartin 9d066c5
+);
Kyle McMartin 9d066c5
+
Kyle McMartin 9d066c5
+#define DEFINE_PERAG_REF_EVENT(name)	\
Kyle McMartin 9d066c5
+DEFINE_EVENT(xfs_perag_class, name,	\
Kyle McMartin 9d066c5
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,	\
Kyle McMartin 9d066c5
+		 unsigned long caller_ip),					\
Kyle McMartin 9d066c5
+	TP_ARGS(mp, agno, refcount, caller_ip))
Kyle McMartin 9d066c5
+DEFINE_PERAG_REF_EVENT(xfs_perag_get);
Kyle McMartin 9d066c5
+DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim);
Kyle McMartin 9d066c5
+DEFINE_PERAG_REF_EVENT(xfs_perag_put);
Kyle McMartin 9d066c5
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
Kyle McMartin 9d066c5
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
Kyle McMartin 9d066c5
+
Kyle McMartin 9d066c5
 TRACE_EVENT(xfs_attr_list_node_descend,
Kyle McMartin 9d066c5
 	TP_PROTO(struct xfs_attr_list_context *ctx,
Kyle McMartin 9d066c5
 		 struct xfs_da_node_entry *btree),
Kyle McMartin 9d066c5
-- 
Kyle McMartin 9d066c5
1.7.3.2
Kyle McMartin 9d066c5