b71532b
From 8b368e8e961944105945fbe36f3f264252bfd19a Mon Sep 17 00:00:00 2001
b71532b
From: Dan Williams <dan.j.williams@intel.com>
b71532b
Date: Thu, 25 Feb 2016 01:02:30 +0000
b71532b
Subject: [PATCH] mm: CONFIG_NR_ZONES_EXTENDED
b71532b
b71532b
ZONE_DEVICE (merged in 4.3) and ZONE_CMA (proposed) are examples of new mm
b71532b
zones that are bumping up against the current maximum limit of 4 zones,
b71532b
i.e.  2 bits in page->flags.  When adding a zone this equation still needs
b71532b
to be satisfied:
b71532b
b71532b
    SECTIONS_WIDTH + ZONES_WIDTH + NODES_SHIFT + LAST_CPUPID_SHIFT
b71532b
	  <= BITS_PER_LONG - NR_PAGEFLAGS
b71532b
b71532b
ZONE_DEVICE currently tries to satisfy this equation by requiring that
b71532b
ZONE_DMA be disabled, but this is untenable given generic kernels want to
b71532b
support ZONE_DEVICE and ZONE_DMA simultaneously.  ZONE_CMA would like to
b71532b
increase the amount of memory covered per section, but that limits the
b71532b
minimum granularity at which consecutive memory ranges can be added via
b71532b
devm_memremap_pages().
b71532b
b71532b
The trade-off of what is acceptable to sacrifice depends heavily on the
b71532b
platform.  For example, ZONE_CMA is targeted for 32-bit platforms where
b71532b
page->flags is constrained, but those platforms likely do not care about
b71532b
the minimum granularity of memory hotplug.  A big iron machine with 1024
b71532b
NUMA nodes can likely sacrifice ZONE_DMA where a general-purpose
b71532b
distribution kernel cannot.
b71532b
b71532b
CONFIG_NR_ZONES_EXTENDED is a configuration symbol that gets selected when
b71532b
the number of configured zones exceeds 4.  It documents the configuration
b71532b
symbols and definitions that get modified when ZONES_WIDTH is greater than
b71532b
2.
b71532b
b71532b
For now, it steals a bit from NODES_SHIFT.  Later on it can be used to
b71532b
document the definitions that get modified when a 32-bit configuration
b71532b
wants more zone bits.
b71532b
b71532b
Note that GFP_ZONE_TABLE poses an interesting constraint since
b71532b
include/linux/gfp.h gets included by the 32-bit portion of a 64-bit build.
b71532b
We need to be careful to only build the table for zones that have a
b71532b
corresponding gfp_t flag.  GFP_ZONES_SHIFT is introduced for this purpose.
b71532b
This patch does not attempt to solve the problem of adding a new zone
b71532b
that also has a corresponding GFP_ flag.
b71532b
b71532b
Link: https://bugzilla.kernel.org/show_bug.cgi?id=110931
b71532b
Fixes: 033fbae988fc ("mm: ZONE_DEVICE for "device memory"")
b71532b
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
b71532b
Reported-by: Mark <markk@clara.co.uk>
b71532b
Cc: Mel Gorman <mgorman@suse.de>
b71532b
Cc: Rik van Riel <riel@redhat.com>
b71532b
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
b71532b
Cc: Dave Hansen <dave.hansen@linux.intel.com>
b71532b
Cc: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
b71532b
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
b71532b
---
b71532b
 arch/x86/Kconfig                  |  6 ++++--
b71532b
 include/linux/gfp.h               | 33 ++++++++++++++++++++-------------
b71532b
 include/linux/page-flags-layout.h |  2 ++
b71532b
 mm/Kconfig                        |  7 +++++--
b71532b
 4 files changed, 31 insertions(+), 17 deletions(-)
b71532b
b71532b
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
b71532b
index 3fef519..b94704a 100644
b71532b
--- a/arch/x86/Kconfig
b71532b
+++ b/arch/x86/Kconfig
b71532b
@@ -1409,8 +1409,10 @@ config NUMA_EMU
b71532b
 
b71532b
 config NODES_SHIFT
b71532b
 	int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
b71532b
-	range 1 10
b71532b
-	default "10" if MAXSMP
b71532b
+	range 1 10 if !NR_ZONES_EXTENDED
b71532b
+	range 1 9 if NR_ZONES_EXTENDED
b71532b
+	default "10" if MAXSMP && !NR_ZONES_EXTENDED
b71532b
+	default "9" if MAXSMP && NR_ZONES_EXTENDED
b71532b
 	default "6" if X86_64
b71532b
 	default "3"
b71532b
 	depends on NEED_MULTIPLE_NODES
b71532b
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
b71532b
index af1f2b2..d201d8a 100644
b71532b
--- a/include/linux/gfp.h
b71532b
+++ b/include/linux/gfp.h
b71532b
@@ -329,22 +329,29 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
b71532b
  *       0xe    => BAD (MOVABLE+DMA32+HIGHMEM)
b71532b
  *       0xf    => BAD (MOVABLE+DMA32+HIGHMEM+DMA)
b71532b
  *
b71532b
- * ZONES_SHIFT must be <= 2 on 32 bit platforms.
b71532b
+ * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms.
b71532b
  */
b71532b
 
b71532b
-#if 16 * ZONES_SHIFT > BITS_PER_LONG
b71532b
-#error ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
b71532b
+#if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4
b71532b
+/* ZONE_DEVICE is not a valid GFP zone specifier */
b71532b
+#define GFP_ZONES_SHIFT 2
b71532b
+#else
b71532b
+#define GFP_ZONES_SHIFT ZONES_SHIFT
b71532b
+#endif
b71532b
+
b71532b
+#if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG
b71532b
+#error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
b71532b
 #endif
b71532b
 
b71532b
 #define GFP_ZONE_TABLE ( \
b71532b
-	(ZONE_NORMAL << 0 * ZONES_SHIFT)				      \
b71532b
-	| (OPT_ZONE_DMA << ___GFP_DMA * ZONES_SHIFT)			      \
b71532b
-	| (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * ZONES_SHIFT)		      \
b71532b
-	| (OPT_ZONE_DMA32 << ___GFP_DMA32 * ZONES_SHIFT)		      \
b71532b
-	| (ZONE_NORMAL << ___GFP_MOVABLE * ZONES_SHIFT)			      \
b71532b
-	| (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * ZONES_SHIFT)	      \
b71532b
-	| (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * ZONES_SHIFT)   \
b71532b
-	| (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * ZONES_SHIFT)   \
b71532b
+	(ZONE_NORMAL << 0 * GFP_ZONES_SHIFT)					\
b71532b
+	| (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT)			\
b71532b
+	| (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT)		\
b71532b
+	| (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT)		      	\
b71532b
+	| (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT)			\
b71532b
+	| (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT)	\
b71532b
+	| (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)	\
b71532b
+	| (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)	\
b71532b
 )
b71532b
 
b71532b
 /*
b71532b
@@ -369,8 +376,8 @@ static inline enum zone_type gfp_zone(gfp_t flags)
b71532b
 	enum zone_type z;
b71532b
 	int bit = (__force int) (flags & GFP_ZONEMASK);
b71532b
 
b71532b
-	z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) &
b71532b
-					 ((1 << ZONES_SHIFT) - 1);
b71532b
+	z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) &
b71532b
+					 ((1 << GFP_ZONES_SHIFT) - 1);
b71532b
 	VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
b71532b
 	return z;
b71532b
 }
b71532b
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
b71532b
index da52366..77b078c 100644
b71532b
--- a/include/linux/page-flags-layout.h
b71532b
+++ b/include/linux/page-flags-layout.h
b71532b
@@ -17,6 +17,8 @@
b71532b
 #define ZONES_SHIFT 1
b71532b
 #elif MAX_NR_ZONES <= 4
b71532b
 #define ZONES_SHIFT 2
b71532b
+#elif MAX_NR_ZONES <= 8
b71532b
+#define ZONES_SHIFT 3
b71532b
 #else
b71532b
 #error ZONES_SHIFT -- too many zones configured adjust calculation
b71532b
 #endif
b71532b
diff --git a/mm/Kconfig b/mm/Kconfig
b71532b
index 031a329..7826216 100644
b71532b
--- a/mm/Kconfig
b71532b
+++ b/mm/Kconfig
b71532b
@@ -652,8 +652,6 @@ config IDLE_PAGE_TRACKING
b71532b
 
b71532b
 config ZONE_DEVICE
b71532b
 	bool "Device memory (pmem, etc...) hotplug support"
b71532b
-	default !ZONE_DMA
b71532b
-	depends on !ZONE_DMA
b71532b
 	depends on MEMORY_HOTPLUG
b71532b
 	depends on MEMORY_HOTREMOVE
b71532b
 	depends on X86_64 #arch_add_memory() comprehends device memory
b71532b
@@ -667,5 +665,10 @@ config ZONE_DEVICE
b71532b
 
b71532b
 	  If FS_DAX is enabled, then say Y.
b71532b
 
b71532b
+config NR_ZONES_EXTENDED
b71532b
+	bool
b71532b
+	default n if !64BIT
b71532b
+	default y if ZONE_DEVICE && ZONE_DMA && ZONE_DMA32
b71532b
+
b71532b
 config FRAME_VECTOR
b71532b
 	bool
b71532b
-- 
b71532b
2.5.0
b71532b