Blob Blame History Raw
From 735f01326873349426f041a4fa2f5703a1ed43a4 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 5 Feb 2016 15:06:15 -0800
Subject: [PATCH 01/36] drm/vc4: Fix a framebuffer reference leak on async flip
 interrupt.

To hit this leak, X would need to queue up an async pageflip while
another is outstanding, and then take a SIGIO.  I think X actually
avoids sending out the next pageflip while one's already queued, but
I'm not sure.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_crtc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/vc4/vc4_crtc.c b/drivers/gpu/drm/vc4/vc4_crtc.c
index 018145e..989ee72 100644
--- a/drivers/gpu/drm/vc4/vc4_crtc.c
+++ b/drivers/gpu/drm/vc4/vc4_crtc.c
@@ -544,6 +544,7 @@ static int vc4_async_page_flip(struct drm_crtc *crtc,
 	/* Make sure all other async modesetes have landed. */
 	ret = down_interruptible(&vc4->async_modeset);
 	if (ret) {
+		drm_framebuffer_unreference(fb);
 		kfree(flip_state);
 		return ret;
 	}
-- 
2.7.3

From e1ceac2cefbda12d1d9d9ee547fc0cc8bfeebde6 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 12 Feb 2016 14:15:14 -0800
Subject: [PATCH 02/36] drm/vc4: Bring HDMI up from power off if necessary.

If the firmware hadn't brought up HDMI for us, we need to do its
power-on reset sequence (reset HD and clear its STANDBY bits,
reset HDMI, and leave the PHY disabled).

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_hdmi.c | 29 ++++++++++++++++++++++++++++-
 drivers/gpu/drm/vc4/vc4_regs.h |  2 ++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c
index c69c046..6e55760 100644
--- a/drivers/gpu/drm/vc4/vc4_hdmi.c
+++ b/drivers/gpu/drm/vc4/vc4_hdmi.c
@@ -495,6 +495,16 @@ static int vc4_hdmi_bind(struct device *dev, struct device *master, void *data)
 		goto err_put_i2c;
 	}
 
+	/* This is the rate that is set by the firmware.  The number
+	 * needs to be a bit higher than the pixel clock rate
+	 * (generally 148.5Mhz).
+	 */
+	ret = clk_set_rate(hdmi->hsm_clock, 163682864);
+	if (ret) {
+		DRM_ERROR("Failed to set HSM clock rate: %d\n", ret);
+		goto err_unprepare_pix;
+	}
+
 	ret = clk_prepare_enable(hdmi->hsm_clock);
 	if (ret) {
 		DRM_ERROR("Failed to turn on HDMI state machine clock: %d\n",
@@ -516,7 +526,24 @@ static int vc4_hdmi_bind(struct device *dev, struct device *master, void *data)
 	vc4->hdmi = hdmi;
 
 	/* HDMI core must be enabled. */
-	WARN_ON_ONCE((HD_READ(VC4_HD_M_CTL) & VC4_HD_M_ENABLE) == 0);
+	if (!(HD_READ(VC4_HD_M_CTL) & VC4_HD_M_ENABLE)) {
+		HD_WRITE(VC4_HD_M_CTL, VC4_HD_M_SW_RST);
+		udelay(1);
+		HD_WRITE(VC4_HD_M_CTL, 0);
+
+		HD_WRITE(VC4_HD_M_CTL, VC4_HD_M_ENABLE);
+
+		HDMI_WRITE(VC4_HDMI_SW_RESET_CONTROL,
+			   VC4_HDMI_SW_RESET_HDMI |
+			   VC4_HDMI_SW_RESET_FORMAT_DETECT);
+
+		HDMI_WRITE(VC4_HDMI_SW_RESET_CONTROL, 0);
+
+		/* PHY should be in reset, like
+		 * vc4_hdmi_encoder_disable() does.
+		 */
+		HDMI_WRITE(VC4_HDMI_TX_PHY_RESET_CTL, 0xf << 16);
+	}
 
 	drm_encoder_init(drm, hdmi->encoder, &vc4_hdmi_encoder_funcs,
 			 DRM_MODE_ENCODER_TMDS, NULL);
diff --git a/drivers/gpu/drm/vc4/vc4_regs.h b/drivers/gpu/drm/vc4/vc4_regs.h
index 4e52a0a..85c36d2 100644
--- a/drivers/gpu/drm/vc4/vc4_regs.h
+++ b/drivers/gpu/drm/vc4/vc4_regs.h
@@ -456,6 +456,8 @@
 #define VC4_HDMI_TX_PHY_RESET_CTL		0x2c0
 
 #define VC4_HD_M_CTL				0x00c
+# define VC4_HD_M_REGISTER_FILE_STANDBY		(3 << 6)
+# define VC4_HD_M_RAM_STANDBY			(3 << 4)
 # define VC4_HD_M_SW_RST			BIT(2)
 # define VC4_HD_M_ENABLE			BIT(0)
 
-- 
2.7.3

From 63d38d99739736480b24c9f9bd7880ce4e49eb0c Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 12 Feb 2016 15:16:56 -0800
Subject: [PATCH 03/36] drm/vc4: Add another reg to HDMI debug dumping.

This is also involved in the HDMI setup sequence so it's nice to see
it.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_hdmi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c
index 6e55760..56272ca 100644
--- a/drivers/gpu/drm/vc4/vc4_hdmi.c
+++ b/drivers/gpu/drm/vc4/vc4_hdmi.c
@@ -95,6 +95,7 @@ static const struct {
 	HDMI_REG(VC4_HDMI_SW_RESET_CONTROL),
 	HDMI_REG(VC4_HDMI_HOTPLUG_INT),
 	HDMI_REG(VC4_HDMI_HOTPLUG),
+	HDMI_REG(VC4_HDMI_RAM_PACKET_CONFIG),
 	HDMI_REG(VC4_HDMI_HORZA),
 	HDMI_REG(VC4_HDMI_HORZB),
 	HDMI_REG(VC4_HDMI_FIFO_CTL),
-- 
2.7.3

From 46e96facb9b67486285c26f88ee747b8d9f4abc9 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 15 Feb 2016 17:06:02 -0800
Subject: [PATCH 04/36] drm/vc4: Fix the name of the VSYNCD_EVEN register.

It's used for delaying vsync in interlaced mode.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_crtc.c | 2 +-
 drivers/gpu/drm/vc4/vc4_regs.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_crtc.c b/drivers/gpu/drm/vc4/vc4_crtc.c
index 989ee72..5e84be2 100644
--- a/drivers/gpu/drm/vc4/vc4_crtc.c
+++ b/drivers/gpu/drm/vc4/vc4_crtc.c
@@ -83,7 +83,7 @@ static const struct {
 } crtc_regs[] = {
 	CRTC_REG(PV_CONTROL),
 	CRTC_REG(PV_V_CONTROL),
-	CRTC_REG(PV_VSYNCD),
+	CRTC_REG(PV_VSYNCD_EVEN),
 	CRTC_REG(PV_HORZA),
 	CRTC_REG(PV_HORZB),
 	CRTC_REG(PV_VERTA),
diff --git a/drivers/gpu/drm/vc4/vc4_regs.h b/drivers/gpu/drm/vc4/vc4_regs.h
index 85c36d2..d529665 100644
--- a/drivers/gpu/drm/vc4/vc4_regs.h
+++ b/drivers/gpu/drm/vc4/vc4_regs.h
@@ -187,7 +187,7 @@
 # define PV_VCONTROL_CONTINUOUS			BIT(1)
 # define PV_VCONTROL_VIDEN			BIT(0)
 
-#define PV_VSYNCD				0x08
+#define PV_VSYNCD_EVEN				0x08
 
 #define PV_HORZA				0x0c
 # define PV_HORZA_HBP_MASK			VC4_MASK(31, 16)
-- 
2.7.3

From baff41935a7b4c1b6015a99a0ca222fd0a5552b9 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 15 Feb 2016 17:31:41 -0800
Subject: [PATCH 05/36] drm/vc4: Fix setting of vertical timings in the CRTC.

It looks like when I went to add the interlaced bits, I just took the
existing PV_VERT* block and indented it, instead of copy and pasting
it first.  Without this, changing resolution never worked.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_crtc.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/vc4/vc4_crtc.c b/drivers/gpu/drm/vc4/vc4_crtc.c
index 5e84be2..93d53c2 100644
--- a/drivers/gpu/drm/vc4/vc4_crtc.c
+++ b/drivers/gpu/drm/vc4/vc4_crtc.c
@@ -212,6 +212,16 @@ static void vc4_crtc_mode_set_nofb(struct drm_crtc *crtc)
 				 PV_HORZB_HFP) |
 		   VC4_SET_FIELD(mode->hdisplay, PV_HORZB_HACTIVE));
 
+	CRTC_WRITE(PV_VERTA,
+		   VC4_SET_FIELD(mode->vtotal - mode->vsync_end,
+				 PV_VERTA_VBP) |
+		   VC4_SET_FIELD(mode->vsync_end - mode->vsync_start,
+				 PV_VERTA_VSYNC));
+	CRTC_WRITE(PV_VERTB,
+		   VC4_SET_FIELD(mode->vsync_start - mode->vdisplay,
+				 PV_VERTB_VFP) |
+		   VC4_SET_FIELD(vactive, PV_VERTB_VACTIVE));
+
 	if (interlace) {
 		CRTC_WRITE(PV_VERTA_EVEN,
 			   VC4_SET_FIELD(mode->vtotal - mode->vsync_end - 1,
-- 
2.7.3

From 6f7cde6ad6e866660b8e5607a213872e5f34e8fd Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 16 Feb 2016 10:24:08 -0800
Subject: [PATCH 06/36] drm/vc4: Initialize scaler DISPBKGND on modeset.

We weren't updating the interlaced bit, so we'd scan out incorrectly
if the firmware had brought up the TV encoder and we were switching to
HDMI.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_crtc.c |  6 ++++++
 drivers/gpu/drm/vc4/vc4_regs.h | 14 ++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/drivers/gpu/drm/vc4/vc4_crtc.c b/drivers/gpu/drm/vc4/vc4_crtc.c
index 93d53c2..6ae5abc 100644
--- a/drivers/gpu/drm/vc4/vc4_crtc.c
+++ b/drivers/gpu/drm/vc4/vc4_crtc.c
@@ -183,6 +183,8 @@ static int vc4_get_clock_select(struct drm_crtc *crtc)
 
 static void vc4_crtc_mode_set_nofb(struct drm_crtc *crtc)
 {
+	struct drm_device *dev = crtc->dev;
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
 	struct vc4_crtc *vc4_crtc = to_vc4_crtc(crtc);
 	struct drm_crtc_state *state = crtc->state;
 	struct drm_display_mode *mode = &state->adjusted_mode;
@@ -251,6 +253,10 @@ static void vc4_crtc_mode_set_nofb(struct drm_crtc *crtc)
 		   PV_CONTROL_FIFO_CLR |
 		   PV_CONTROL_EN);
 
+	HVS_WRITE(SCALER_DISPBKGNDX(vc4_crtc->channel),
+		  SCALER_DISPBKGND_AUTOHS |
+		  (interlace ? SCALER_DISPBKGND_INTERLACE : 0));
+
 	if (debug_dump_regs) {
 		DRM_INFO("CRTC %d regs after:\n", drm_crtc_index(crtc));
 		vc4_crtc_dump_regs(vc4_crtc);
diff --git a/drivers/gpu/drm/vc4/vc4_regs.h b/drivers/gpu/drm/vc4/vc4_regs.h
index d529665..7c29993 100644
--- a/drivers/gpu/drm/vc4/vc4_regs.h
+++ b/drivers/gpu/drm/vc4/vc4_regs.h
@@ -350,6 +350,17 @@
 # define SCALER_DISPCTRLX_HEIGHT_SHIFT		0
 
 #define SCALER_DISPBKGND0                       0x00000044
+# define SCALER_DISPBKGND_AUTOHS		BIT(31)
+# define SCALER_DISPBKGND_INTERLACE		BIT(30)
+# define SCALER_DISPBKGND_GAMMA			BIT(29)
+# define SCALER_DISPBKGND_TESTMODE_MASK		VC4_MASK(28, 25)
+# define SCALER_DISPBKGND_TESTMODE_SHIFT	25
+/* Enables filling the scaler line with the RGB value in the low 24
+ * bits before compositing.  Costs cycles, so should be skipped if
+ * opaque display planes will cover everything.
+ */
+# define SCALER_DISPBKGND_FILL			BIT(24)
+
 #define SCALER_DISPSTAT0                        0x00000048
 #define SCALER_DISPBASE0                        0x0000004c
 # define SCALER_DISPSTATX_MODE_MASK		VC4_MASK(31, 30)
@@ -362,6 +373,9 @@
 # define SCALER_DISPSTATX_EMPTY			BIT(28)
 #define SCALER_DISPCTRL1                        0x00000050
 #define SCALER_DISPBKGND1                       0x00000054
+#define SCALER_DISPBKGNDX(x)			(SCALER_DISPBKGND0 +        \
+						 (x) * (SCALER_DISPBKGND1 - \
+							SCALER_DISPBKGND0))
 #define SCALER_DISPSTAT1                        0x00000058
 #define SCALER_DISPSTATX(x)			(SCALER_DISPSTAT0 +        \
 						 (x) * (SCALER_DISPSTAT1 - \
-- 
2.7.3

From 449c91f1f06a573ad4a3edd18d7b493bf44478f6 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 28 Dec 2015 14:14:09 -0800
Subject: [PATCH 07/36] drm/vc4: Improve comments on vc4_plane_state members.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_plane.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c
index 0addbad..45e353d 100644
--- a/drivers/gpu/drm/vc4/vc4_plane.c
+++ b/drivers/gpu/drm/vc4/vc4_plane.c
@@ -26,16 +26,19 @@
 
 struct vc4_plane_state {
 	struct drm_plane_state base;
+	/* System memory copy of the display list for this element, computed
+	 * at atomic_check time.
+	 */
 	u32 *dlist;
-	u32 dlist_size; /* Number of dwords in allocated for the display list */
+	u32 dlist_size; /* Number of dwords allocated for the display list */
 	u32 dlist_count; /* Number of used dwords in the display list. */
 
 	/* Offset in the dlist to pointer word 0. */
 	u32 pw0_offset;
 
 	/* Offset where the plane's dlist was last stored in the
-	   hardware at vc4_crtc_atomic_flush() time.
-	*/
+	 * hardware at vc4_crtc_atomic_flush() time.
+	 */
 	u32 *hw_dlist;
 };
 
-- 
2.7.3

From 4c8b2ce80659e1c7a75b7b54430dab320aeb440b Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 28 Dec 2015 14:14:57 -0800
Subject: [PATCH 08/36] drm/vc4: Add missing __iomem annotation to hw_dlist.

This is the pointer to the HVS device's memory where we stored the
contents of *dlist.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_plane.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c
index 45e353d..ed07ee5 100644
--- a/drivers/gpu/drm/vc4/vc4_plane.c
+++ b/drivers/gpu/drm/vc4/vc4_plane.c
@@ -39,7 +39,7 @@ struct vc4_plane_state {
 	/* Offset where the plane's dlist was last stored in the
 	 * hardware at vc4_crtc_atomic_flush() time.
 	 */
-	u32 *hw_dlist;
+	u32 __iomem *hw_dlist;
 };
 
 static inline struct vc4_plane_state *
-- 
2.7.3

From f792f380190638916b495f3051547a849fc97fd2 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 28 Dec 2015 14:34:44 -0800
Subject: [PATCH 09/36] drm/vc4: Move the plane clipping/scaling setup to a
 separate function.

As we add actual scaling, this is going to get way more complicated.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_plane.c | 78 +++++++++++++++++++++++++++--------------
 1 file changed, 52 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c
index ed07ee5..554ed54 100644
--- a/drivers/gpu/drm/vc4/vc4_plane.c
+++ b/drivers/gpu/drm/vc4/vc4_plane.c
@@ -40,6 +40,14 @@ struct vc4_plane_state {
 	 * hardware at vc4_crtc_atomic_flush() time.
 	 */
 	u32 __iomem *hw_dlist;
+
+	/* Clipped coordinates of the plane on the display. */
+	int crtc_x, crtc_y, crtc_w, crtc_h;
+
+	/* Offset to start scanning out from the start of the plane's
+	 * BO.
+	 */
+	u32 offset;
 };
 
 static inline struct vc4_plane_state *
@@ -151,22 +159,17 @@ static void vc4_dlist_write(struct vc4_plane_state *vc4_state, u32 val)
 	vc4_state->dlist[vc4_state->dlist_count++] = val;
 }
 
-/* Writes out a full display list for an active plane to the plane's
- * private dlist state.
- */
-static int vc4_plane_mode_set(struct drm_plane *plane,
-			      struct drm_plane_state *state)
+static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state)
 {
 	struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);
 	struct drm_framebuffer *fb = state->fb;
-	struct drm_gem_cma_object *bo = drm_fb_cma_get_gem_obj(fb, 0);
-	u32 ctl0_offset = vc4_state->dlist_count;
-	const struct hvs_format *format = vc4_get_hvs_format(fb->pixel_format);
-	uint32_t offset = fb->offsets[0];
-	int crtc_x = state->crtc_x;
-	int crtc_y = state->crtc_y;
-	int crtc_w = state->crtc_w;
-	int crtc_h = state->crtc_h;
+
+	vc4_state->offset = fb->offsets[0];
+
+	vc4_state->crtc_x = state->crtc_x;
+	vc4_state->crtc_y = state->crtc_y;
+	vc4_state->crtc_w = state->crtc_w;
+	vc4_state->crtc_h = state->crtc_h;
 
 	if (state->crtc_w << 16 != state->src_w ||
 	    state->crtc_h << 16 != state->src_h) {
@@ -178,18 +181,41 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
 		return -EINVAL;
 	}
 
-	if (crtc_x < 0) {
-		offset += drm_format_plane_cpp(fb->pixel_format, 0) * -crtc_x;
-		crtc_w += crtc_x;
-		crtc_x = 0;
+	if (vc4_state->crtc_x < 0) {
+		vc4_state->offset += (drm_format_plane_cpp(fb->pixel_format,
+							   0) *
+				      -vc4_state->crtc_x);
+		vc4_state->crtc_w += vc4_state->crtc_x;
+		vc4_state->crtc_x = 0;
 	}
 
-	if (crtc_y < 0) {
-		offset += fb->pitches[0] * -crtc_y;
-		crtc_h += crtc_y;
-		crtc_y = 0;
+	if (vc4_state->crtc_y < 0) {
+		vc4_state->offset += fb->pitches[0] * -vc4_state->crtc_y;
+		vc4_state->crtc_h += vc4_state->crtc_y;
+		vc4_state->crtc_y = 0;
 	}
 
+	return 0;
+}
+
+
+/* Writes out a full display list for an active plane to the plane's
+ * private dlist state.
+ */
+static int vc4_plane_mode_set(struct drm_plane *plane,
+			      struct drm_plane_state *state)
+{
+	struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);
+	struct drm_framebuffer *fb = state->fb;
+	struct drm_gem_cma_object *bo = drm_fb_cma_get_gem_obj(fb, 0);
+	u32 ctl0_offset = vc4_state->dlist_count;
+	const struct hvs_format *format = vc4_get_hvs_format(fb->pixel_format);
+	int ret;
+
+	ret = vc4_plane_setup_clipping_and_scaling(state);
+	if (ret)
+		return ret;
+
 	vc4_dlist_write(vc4_state,
 			SCALER_CTL0_VALID |
 			(format->pixel_order << SCALER_CTL0_ORDER_SHIFT) |
@@ -199,8 +225,8 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
 	/* Position Word 0: Image Positions and Alpha Value */
 	vc4_dlist_write(vc4_state,
 			VC4_SET_FIELD(0xff, SCALER_POS0_FIXED_ALPHA) |
-			VC4_SET_FIELD(crtc_x, SCALER_POS0_START_X) |
-			VC4_SET_FIELD(crtc_y, SCALER_POS0_START_Y));
+			VC4_SET_FIELD(vc4_state->crtc_x, SCALER_POS0_START_X) |
+			VC4_SET_FIELD(vc4_state->crtc_y, SCALER_POS0_START_Y));
 
 	/* Position Word 1: Scaled Image Dimensions.
 	 * Skipped due to SCALER_CTL0_UNITY scaling.
@@ -212,8 +238,8 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
 				      SCALER_POS2_ALPHA_MODE_PIPELINE :
 				      SCALER_POS2_ALPHA_MODE_FIXED,
 				      SCALER_POS2_ALPHA_MODE) |
-			VC4_SET_FIELD(crtc_w, SCALER_POS2_WIDTH) |
-			VC4_SET_FIELD(crtc_h, SCALER_POS2_HEIGHT));
+			VC4_SET_FIELD(vc4_state->crtc_w, SCALER_POS2_WIDTH) |
+			VC4_SET_FIELD(vc4_state->crtc_h, SCALER_POS2_HEIGHT));
 
 	/* Position Word 3: Context.  Written by the HVS. */
 	vc4_dlist_write(vc4_state, 0xc0c0c0c0);
@@ -221,7 +247,7 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
 	vc4_state->pw0_offset = vc4_state->dlist_count;
 
 	/* Pointer Word 0: RGB / Y Pointer */
-	vc4_dlist_write(vc4_state, bo->paddr + offset);
+	vc4_dlist_write(vc4_state, bo->paddr + vc4_state->offset);
 
 	/* Pointer Context Word 0: Written by the HVS */
 	vc4_dlist_write(vc4_state, 0xc0c0c0c0);
-- 
2.7.3

From 696f1db279f08e58bc94172818209d5914e0e2d8 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 30 Dec 2015 11:50:22 -0800
Subject: [PATCH 10/36] drm/vc4: Add a proper short-circuit path for legacy
 cursor updates.

Previously, on every modeset we would allocate new display list
memory, recompute changed planes, write all of them to the new memory,
and point scanout at the new list (which will latch approximately at
the next line of scanout).  We let
drm_atomic_helper_wait_for_vblanks() decide whether we needed to wait
for a vblank after a modeset before cleaning up the old state and
letting the next modeset proceed, and on legacy cursor updates we
wouldn't wait.  If you moved the cursor fast enough, we could
potentially wrap around the display list memory area and overwrite the
existing display list while it was still being scanned out, resulting
in the HVS scanning out garbage or just halting.

Instead of making cursor updates wait for scanout to move to the new
display list area (which introduces significant cursor lag in X), we
just rewrite our current display list.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_kms.c   |  9 ++++
 drivers/gpu/drm/vc4/vc4_plane.c | 94 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 96 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_kms.c b/drivers/gpu/drm/vc4/vc4_kms.c
index f95f2df..4718ae5 100644
--- a/drivers/gpu/drm/vc4/vc4_kms.c
+++ b/drivers/gpu/drm/vc4/vc4_kms.c
@@ -49,6 +49,15 @@ vc4_atomic_complete_commit(struct vc4_commit *c)
 
 	drm_atomic_helper_commit_modeset_enables(dev, state);
 
+	/* Make sure that drm_atomic_helper_wait_for_vblanks()
+	 * actually waits for vblank.  If we're doing a full atomic
+	 * modeset (as opposed to a vc4_update_plane() short circuit),
+	 * then we need to wait for scanout to be done with our
+	 * display lists before we free it and potentially reallocate
+	 * and overwrite the dlist memory with a new modeset.
+	 */
+	state->legacy_cursor_update = false;
+
 	drm_atomic_helper_wait_for_vblanks(dev, state);
 
 	drm_atomic_helper_cleanup_planes(dev, state);
diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c
index 554ed54..713ec00 100644
--- a/drivers/gpu/drm/vc4/vc4_plane.c
+++ b/drivers/gpu/drm/vc4/vc4_plane.c
@@ -33,8 +33,12 @@ struct vc4_plane_state {
 	u32 dlist_size; /* Number of dwords allocated for the display list */
 	u32 dlist_count; /* Number of used dwords in the display list. */
 
-	/* Offset in the dlist to pointer word 0. */
-	u32 pw0_offset;
+	/* Offset in the dlist to various words, for pageflip or
+	 * cursor updates.
+	 */
+	u32 pos0_offset;
+	u32 pos2_offset;
+	u32 ptr0_offset;
 
 	/* Offset where the plane's dlist was last stored in the
 	 * hardware at vc4_crtc_atomic_flush() time.
@@ -223,6 +227,7 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
 			SCALER_CTL0_UNITY);
 
 	/* Position Word 0: Image Positions and Alpha Value */
+	vc4_state->pos0_offset = vc4_state->dlist_count;
 	vc4_dlist_write(vc4_state,
 			VC4_SET_FIELD(0xff, SCALER_POS0_FIXED_ALPHA) |
 			VC4_SET_FIELD(vc4_state->crtc_x, SCALER_POS0_START_X) |
@@ -233,6 +238,7 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
 	 */
 
 	/* Position Word 2: Source Image Size, Alpha Mode */
+	vc4_state->pos2_offset = vc4_state->dlist_count;
 	vc4_dlist_write(vc4_state,
 			VC4_SET_FIELD(format->has_alpha ?
 				      SCALER_POS2_ALPHA_MODE_PIPELINE :
@@ -244,9 +250,8 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
 	/* Position Word 3: Context.  Written by the HVS. */
 	vc4_dlist_write(vc4_state, 0xc0c0c0c0);
 
-	vc4_state->pw0_offset = vc4_state->dlist_count;
-
 	/* Pointer Word 0: RGB / Y Pointer */
+	vc4_state->ptr0_offset = vc4_state->dlist_count;
 	vc4_dlist_write(vc4_state, bo->paddr + vc4_state->offset);
 
 	/* Pointer Context Word 0: Written by the HVS */
@@ -332,13 +337,13 @@ void vc4_plane_async_set_fb(struct drm_plane *plane, struct drm_framebuffer *fb)
 	 * scanout will start from this address as soon as the FIFO
 	 * needs to refill with pixels.
 	 */
-	writel(addr, &vc4_state->hw_dlist[vc4_state->pw0_offset]);
+	writel(addr, &vc4_state->hw_dlist[vc4_state->ptr0_offset]);
 
 	/* Also update the CPU-side dlist copy, so that any later
 	 * atomic updates that don't do a new modeset on our plane
 	 * also use our updated address.
 	 */
-	vc4_state->dlist[vc4_state->pw0_offset] = addr;
+	vc4_state->dlist[vc4_state->ptr0_offset] = addr;
 }
 
 static const struct drm_plane_helper_funcs vc4_plane_helper_funcs = {
@@ -354,8 +359,83 @@ static void vc4_plane_destroy(struct drm_plane *plane)
 	drm_plane_cleanup(plane);
 }
 
+/* Implements immediate (non-vblank-synced) updates of the cursor
+ * position, or falls back to the atomic helper otherwise.
+ */
+static int
+vc4_update_plane(struct drm_plane *plane,
+		 struct drm_crtc *crtc,
+		 struct drm_framebuffer *fb,
+		 int crtc_x, int crtc_y,
+		 unsigned int crtc_w, unsigned int crtc_h,
+		 uint32_t src_x, uint32_t src_y,
+		 uint32_t src_w, uint32_t src_h)
+{
+	struct drm_plane_state *plane_state;
+	struct vc4_plane_state *vc4_state;
+
+	if (plane != crtc->cursor)
+		goto out;
+
+	plane_state = plane->state;
+	vc4_state = to_vc4_plane_state(plane_state);
+
+	if (!plane_state)
+		goto out;
+
+	/* If we're changing the cursor contents, do that in the
+	 * normal vblank-synced atomic path.
+	 */
+	if (fb != plane_state->fb)
+		goto out;
+
+	/* No configuring new scaling in the fast path. */
+	if (crtc_w != plane_state->crtc_w ||
+	    crtc_h != plane_state->crtc_h ||
+	    src_w != plane_state->src_w ||
+	    src_h != plane_state->src_h) {
+		goto out;
+	}
+
+	/* Set the cursor's position on the screen.  This is the
+	 * expected change from the drm_mode_cursor_universal()
+	 * helper.
+	 */
+	plane_state->crtc_x = crtc_x;
+	plane_state->crtc_y = crtc_y;
+
+	/* Allow changing the start position within the cursor BO, if
+	 * that matters.
+	 */
+	plane_state->src_x = src_x;
+	plane_state->src_y = src_y;
+
+	/* Update the display list based on the new crtc_x/y. */
+	vc4_plane_atomic_check(plane, plane_state);
+
+	/* Note that we can't just call vc4_plane_write_dlist()
+	 * because that would smash the context data that the HVS is
+	 * currently using.
+	 */
+	writel(vc4_state->dlist[vc4_state->pos0_offset],
+	       &vc4_state->hw_dlist[vc4_state->pos0_offset]);
+	writel(vc4_state->dlist[vc4_state->pos2_offset],
+	       &vc4_state->hw_dlist[vc4_state->pos2_offset]);
+	writel(vc4_state->dlist[vc4_state->ptr0_offset],
+	       &vc4_state->hw_dlist[vc4_state->ptr0_offset]);
+
+	return 0;
+
+out:
+	return drm_atomic_helper_update_plane(plane, crtc, fb,
+					      crtc_x, crtc_y,
+					      crtc_w, crtc_h,
+					      src_x, src_y,
+					      src_w, src_h);
+}
+
 static const struct drm_plane_funcs vc4_plane_funcs = {
-	.update_plane = drm_atomic_helper_update_plane,
+	.update_plane = vc4_update_plane,
 	.disable_plane = drm_atomic_helper_disable_plane,
 	.destroy = vc4_plane_destroy,
 	.set_property = NULL,
-- 
2.7.3

From cd30019db690e3a92fe5d7d771352f118a105f82 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 28 Dec 2015 13:25:41 -0800
Subject: [PATCH 11/36] drm/vc4: Make the CRTCs cooperate on allocating display
 lists.

So far, we've only ever lit up one CRTC, so this has been fine.  To
extend to more displays or more planes, we need to make sure we don't
run our display lists into each other.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_crtc.c | 115 +++++++++++++++++++++++------------------
 drivers/gpu/drm/vc4/vc4_drv.h  |   8 ++-
 drivers/gpu/drm/vc4/vc4_hvs.c  |  13 +++++
 3 files changed, 84 insertions(+), 52 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_crtc.c b/drivers/gpu/drm/vc4/vc4_crtc.c
index 6ae5abc..9032c06 100644
--- a/drivers/gpu/drm/vc4/vc4_crtc.c
+++ b/drivers/gpu/drm/vc4/vc4_crtc.c
@@ -49,22 +49,27 @@ struct vc4_crtc {
 	/* Which HVS channel we're using for our CRTC. */
 	int channel;
 
-	/* Pointer to the actual hardware display list memory for the
-	 * crtc.
-	 */
-	u32 __iomem *dlist;
-
-	u32 dlist_size; /* in dwords */
-
 	struct drm_pending_vblank_event *event;
 };
 
+struct vc4_crtc_state {
+	struct drm_crtc_state base;
+	/* Dlist area for this CRTC configuration. */
+	struct drm_mm_node mm;
+};
+
 static inline struct vc4_crtc *
 to_vc4_crtc(struct drm_crtc *crtc)
 {
 	return (struct vc4_crtc *)crtc;
 }
 
+static inline struct vc4_crtc_state *
+to_vc4_crtc_state(struct drm_crtc_state *crtc_state)
+{
+	return (struct vc4_crtc_state *)crtc_state;
+}
+
 struct vc4_crtc_data {
 	/* Which channel of the HVS this pixelvalve sources from. */
 	int hvs_channel;
@@ -335,11 +340,13 @@ static void vc4_crtc_enable(struct drm_crtc *crtc)
 static int vc4_crtc_atomic_check(struct drm_crtc *crtc,
 				 struct drm_crtc_state *state)
 {
+	struct vc4_crtc_state *vc4_state = to_vc4_crtc_state(state);
 	struct drm_device *dev = crtc->dev;
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
 	struct drm_plane *plane;
-	struct vc4_crtc *vc4_crtc = to_vc4_crtc(crtc);
+	unsigned long flags;
 	u32 dlist_count = 0;
+	int ret;
 
 	/* The pixelvalve can only feed one encoder (and encoders are
 	 * 1:1 with connectors.)
@@ -362,18 +369,12 @@ static int vc4_crtc_atomic_check(struct drm_crtc *crtc,
 
 	dlist_count++; /* Account for SCALER_CTL0_END. */
 
-	if (!vc4_crtc->dlist || dlist_count > vc4_crtc->dlist_size) {
-		vc4_crtc->dlist = ((u32 __iomem *)vc4->hvs->dlist +
-				   HVS_BOOTLOADER_DLIST_END);
-		vc4_crtc->dlist_size = ((SCALER_DLIST_SIZE >> 2) -
-					HVS_BOOTLOADER_DLIST_END);
-
-		if (dlist_count > vc4_crtc->dlist_size) {
-			DRM_DEBUG_KMS("dlist too large for CRTC (%d > %d).\n",
-				      dlist_count, vc4_crtc->dlist_size);
-			return -EINVAL;
-		}
-	}
+	spin_lock_irqsave(&vc4->hvs->mm_lock, flags);
+	ret = drm_mm_insert_node(&vc4->hvs->dlist_mm, &vc4_state->mm,
+				 dlist_count, 1, 0);
+	spin_unlock_irqrestore(&vc4->hvs->mm_lock, flags);
+	if (ret)
+		return ret;
 
 	return 0;
 }
@@ -384,47 +385,29 @@ static void vc4_crtc_atomic_flush(struct drm_crtc *crtc,
 	struct drm_device *dev = crtc->dev;
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
 	struct vc4_crtc *vc4_crtc = to_vc4_crtc(crtc);
+	struct vc4_crtc_state *vc4_state = to_vc4_crtc_state(crtc->state);
 	struct drm_plane *plane;
 	bool debug_dump_regs = false;
-	u32 __iomem *dlist_next = vc4_crtc->dlist;
+	u32 __iomem *dlist_start = vc4->hvs->dlist + vc4_state->mm.start;
+	u32 __iomem *dlist_next = dlist_start;
 
 	if (debug_dump_regs) {
 		DRM_INFO("CRTC %d HVS before:\n", drm_crtc_index(crtc));
 		vc4_hvs_dump_state(dev);
 	}
 
-	/* Copy all the active planes' dlist contents to the hardware dlist.
-	 *
-	 * XXX: If the new display list was large enough that it
-	 * overlapped a currently-read display list, we need to do
-	 * something like disable scanout before putting in the new
-	 * list.  For now, we're safe because we only have the two
-	 * planes.
-	 */
+	/* Copy all the active planes' dlist contents to the hardware dlist. */
 	drm_atomic_crtc_for_each_plane(plane, crtc) {
 		dlist_next += vc4_plane_write_dlist(plane, dlist_next);
 	}
 
-	if (dlist_next == vc4_crtc->dlist) {
-		/* If no planes were enabled, use the SCALER_CTL0_END
-		 * at the start of the display list memory (in the
-		 * bootloader section).  We'll rewrite that
-		 * SCALER_CTL0_END, just in case, though.
-		 */
-		writel(SCALER_CTL0_END, vc4->hvs->dlist);
-		HVS_WRITE(SCALER_DISPLISTX(vc4_crtc->channel), 0);
-	} else {
-		writel(SCALER_CTL0_END, dlist_next);
-		dlist_next++;
-
-		HVS_WRITE(SCALER_DISPLISTX(vc4_crtc->channel),
-			  (u32 __iomem *)vc4_crtc->dlist -
-			  (u32 __iomem *)vc4->hvs->dlist);
-
-		/* Make the next display list start after ours. */
-		vc4_crtc->dlist_size -= (dlist_next - vc4_crtc->dlist);
-		vc4_crtc->dlist = dlist_next;
-	}
+	writel(SCALER_CTL0_END, dlist_next);
+	dlist_next++;
+
+	WARN_ON_ONCE(dlist_next - dlist_start != vc4_state->mm.size);
+
+	HVS_WRITE(SCALER_DISPLISTX(vc4_crtc->channel),
+		  vc4_state->mm.start);
 
 	if (debug_dump_regs) {
 		DRM_INFO("CRTC %d HVS after:\n", drm_crtc_index(crtc));
@@ -590,6 +573,36 @@ static int vc4_page_flip(struct drm_crtc *crtc,
 		return drm_atomic_helper_page_flip(crtc, fb, event, flags);
 }
 
+static struct drm_crtc_state *vc4_crtc_duplicate_state(struct drm_crtc *crtc)
+{
+	struct vc4_crtc_state *vc4_state;
+
+	vc4_state = kzalloc(sizeof(*vc4_state), GFP_KERNEL);
+	if (!vc4_state)
+		return NULL;
+
+	__drm_atomic_helper_crtc_duplicate_state(crtc, &vc4_state->base);
+	return &vc4_state->base;
+}
+
+static void vc4_crtc_destroy_state(struct drm_crtc *crtc,
+				   struct drm_crtc_state *state)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(crtc->dev);
+	struct vc4_crtc_state *vc4_state = to_vc4_crtc_state(state);
+
+	if (vc4_state->mm.allocated) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&vc4->hvs->mm_lock, flags);
+		drm_mm_remove_node(&vc4_state->mm);
+		spin_unlock_irqrestore(&vc4->hvs->mm_lock, flags);
+
+	}
+
+	__drm_atomic_helper_crtc_destroy_state(crtc, state);
+}
+
 static const struct drm_crtc_funcs vc4_crtc_funcs = {
 	.set_config = drm_atomic_helper_set_config,
 	.destroy = vc4_crtc_destroy,
@@ -598,8 +611,8 @@ static const struct drm_crtc_funcs vc4_crtc_funcs = {
 	.cursor_set = NULL, /* handled by drm_mode_cursor_universal */
 	.cursor_move = NULL, /* handled by drm_mode_cursor_universal */
 	.reset = drm_atomic_helper_crtc_reset,
-	.atomic_duplicate_state = drm_atomic_helper_crtc_duplicate_state,
-	.atomic_destroy_state = drm_atomic_helper_crtc_destroy_state,
+	.atomic_duplicate_state = vc4_crtc_duplicate_state,
+	.atomic_destroy_state = vc4_crtc_destroy_state,
 };
 
 static const struct drm_crtc_helper_funcs vc4_crtc_helper_funcs = {
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index 51a6333..38a31c7 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -154,7 +154,13 @@ struct vc4_v3d {
 struct vc4_hvs {
 	struct platform_device *pdev;
 	void __iomem *regs;
-	void __iomem *dlist;
+	u32 __iomem *dlist;
+
+	/* Memory manager for CRTCs to allocate space in the display
+	 * list.  Units are dwords.
+	 */
+	struct drm_mm dlist_mm;
+	spinlock_t mm_lock;
 };
 
 struct vc4_plane {
diff --git a/drivers/gpu/drm/vc4/vc4_hvs.c b/drivers/gpu/drm/vc4/vc4_hvs.c
index 8098c5b..9e43554 100644
--- a/drivers/gpu/drm/vc4/vc4_hvs.c
+++ b/drivers/gpu/drm/vc4/vc4_hvs.c
@@ -119,6 +119,17 @@ static int vc4_hvs_bind(struct device *dev, struct device *master, void *data)
 
 	hvs->dlist = hvs->regs + SCALER_DLIST_START;
 
+	spin_lock_init(&hvs->mm_lock);
+
+	/* Set up the HVS display list memory manager.  We never
+	 * overwrite the setup from the bootloader (just 128b out of
+	 * our 16K), since we don't want to scramble the screen when
+	 * transitioning from the firmware's boot setup to runtime.
+	 */
+	drm_mm_init(&hvs->dlist_mm,
+		    HVS_BOOTLOADER_DLIST_END,
+		    (SCALER_DLIST_SIZE >> 2) - HVS_BOOTLOADER_DLIST_END);
+
 	vc4->hvs = hvs;
 	return 0;
 }
@@ -129,6 +140,8 @@ static void vc4_hvs_unbind(struct device *dev, struct device *master,
 	struct drm_device *drm = dev_get_drvdata(master);
 	struct vc4_dev *vc4 = drm->dev_private;
 
+	drm_mm_takedown(&vc4->hvs->dlist_mm);
+
 	vc4->hvs = NULL;
 }
 
-- 
2.7.3

From 7934fe9bdbbe2ffb4bcfe656a22a8f9f4e3d266a Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 28 Dec 2015 14:45:25 -0800
Subject: [PATCH 12/36] drm/vc4: Fix which value is being used for source image
 size.

This doesn't matter yet since we only allow 1:1 scaling, but the
comment clearly says we should be using the source size.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_plane.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c
index 713ec00..d9c9290 100644
--- a/drivers/gpu/drm/vc4/vc4_plane.c
+++ b/drivers/gpu/drm/vc4/vc4_plane.c
@@ -47,6 +47,8 @@ struct vc4_plane_state {
 
 	/* Clipped coordinates of the plane on the display. */
 	int crtc_x, crtc_y, crtc_w, crtc_h;
+	/* Clipped size of the area scanned from in the FB. */
+	u32 src_w, src_h;
 
 	/* Offset to start scanning out from the start of the plane's
 	 * BO.
@@ -170,11 +172,6 @@ static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state)
 
 	vc4_state->offset = fb->offsets[0];
 
-	vc4_state->crtc_x = state->crtc_x;
-	vc4_state->crtc_y = state->crtc_y;
-	vc4_state->crtc_w = state->crtc_w;
-	vc4_state->crtc_h = state->crtc_h;
-
 	if (state->crtc_w << 16 != state->src_w ||
 	    state->crtc_h << 16 != state->src_h) {
 		/* We don't support scaling yet, which involves
@@ -185,17 +182,25 @@ static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state)
 		return -EINVAL;
 	}
 
+	vc4_state->src_w = state->src_w >> 16;
+	vc4_state->src_h = state->src_h >> 16;
+
+	vc4_state->crtc_x = state->crtc_x;
+	vc4_state->crtc_y = state->crtc_y;
+	vc4_state->crtc_w = state->crtc_w;
+	vc4_state->crtc_h = state->crtc_h;
+
 	if (vc4_state->crtc_x < 0) {
 		vc4_state->offset += (drm_format_plane_cpp(fb->pixel_format,
 							   0) *
 				      -vc4_state->crtc_x);
-		vc4_state->crtc_w += vc4_state->crtc_x;
+		vc4_state->src_w += vc4_state->crtc_x;
 		vc4_state->crtc_x = 0;
 	}
 
 	if (vc4_state->crtc_y < 0) {
 		vc4_state->offset += fb->pitches[0] * -vc4_state->crtc_y;
-		vc4_state->crtc_h += vc4_state->crtc_y;
+		vc4_state->src_h += vc4_state->crtc_y;
 		vc4_state->crtc_y = 0;
 	}
 
@@ -244,8 +249,8 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
 				      SCALER_POS2_ALPHA_MODE_PIPELINE :
 				      SCALER_POS2_ALPHA_MODE_FIXED,
 				      SCALER_POS2_ALPHA_MODE) |
-			VC4_SET_FIELD(vc4_state->crtc_w, SCALER_POS2_WIDTH) |
-			VC4_SET_FIELD(vc4_state->crtc_h, SCALER_POS2_HEIGHT));
+			VC4_SET_FIELD(vc4_state->src_w, SCALER_POS2_WIDTH) |
+			VC4_SET_FIELD(vc4_state->src_h, SCALER_POS2_HEIGHT));
 
 	/* Position Word 3: Context.  Written by the HVS. */
 	vc4_dlist_write(vc4_state, 0xc0c0c0c0);
-- 
2.7.3

From e710e8e1d13c85a635c09168df6d008955ac5a4e Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 20 Oct 2015 16:06:57 +0100
Subject: [PATCH 13/36] drm/vc4: Add support for scaling of display planes.

This implements a simple policy for choosing scaling modes
(trapezoidal for decimation, PPF for magnification), and a single PPF
filter (Mitchell/Netravali's recommendation).

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_drv.h   |   4 +
 drivers/gpu/drm/vc4/vc4_hvs.c   |  84 +++++++++++++
 drivers/gpu/drm/vc4/vc4_plane.c | 253 +++++++++++++++++++++++++++++++++++++---
 drivers/gpu/drm/vc4/vc4_regs.h  |  46 ++++++++
 4 files changed, 374 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index 38a31c7..83db0b7 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -160,7 +160,11 @@ struct vc4_hvs {
 	 * list.  Units are dwords.
 	 */
 	struct drm_mm dlist_mm;
+	/* Memory manager for the LBM memory used by HVS scaling. */
+	struct drm_mm lbm_mm;
 	spinlock_t mm_lock;
+
+	struct drm_mm_node mitchell_netravali_filter;
 };
 
 struct vc4_plane {
diff --git a/drivers/gpu/drm/vc4/vc4_hvs.c b/drivers/gpu/drm/vc4/vc4_hvs.c
index 9e43554..6fbab1c 100644
--- a/drivers/gpu/drm/vc4/vc4_hvs.c
+++ b/drivers/gpu/drm/vc4/vc4_hvs.c
@@ -100,12 +100,76 @@ int vc4_hvs_debugfs_regs(struct seq_file *m, void *unused)
 }
 #endif
 
+/* The filter kernel is composed of dwords each containing 3 9-bit
+ * signed integers packed next to each other.
+ */
+#define VC4_INT_TO_COEFF(coeff) (coeff & 0x1ff)
+#define VC4_PPF_FILTER_WORD(c0, c1, c2)				\
+	((((c0) & 0x1ff) << 0) |				\
+	 (((c1) & 0x1ff) << 9) |				\
+	 (((c2) & 0x1ff) << 18))
+
+/* The whole filter kernel is arranged as the coefficients 0-16 going
+ * up, then a pad, then 17-31 going down and reversed within the
+ * dwords.  This means that a linear phase kernel (where it's
+ * symmetrical at the boundary between 15 and 16) has the last 5
+ * dwords matching the first 5, but reversed.
+ */
+#define VC4_LINEAR_PHASE_KERNEL(c0, c1, c2, c3, c4, c5, c6, c7, c8,	\
+				c9, c10, c11, c12, c13, c14, c15)	\
+	{VC4_PPF_FILTER_WORD(c0, c1, c2),				\
+	 VC4_PPF_FILTER_WORD(c3, c4, c5),				\
+	 VC4_PPF_FILTER_WORD(c6, c7, c8),				\
+	 VC4_PPF_FILTER_WORD(c9, c10, c11),				\
+	 VC4_PPF_FILTER_WORD(c12, c13, c14),				\
+	 VC4_PPF_FILTER_WORD(c15, c15, 0)}
+
+#define VC4_LINEAR_PHASE_KERNEL_DWORDS 6
+#define VC4_KERNEL_DWORDS (VC4_LINEAR_PHASE_KERNEL_DWORDS * 2 - 1)
+
+/* Recommended B=1/3, C=1/3 filter choice from Mitchell/Netravali.
+ * http://www.cs.utexas.edu/~fussell/courses/cs384g/lectures/mitchell/Mitchell.pdf
+ */
+static const u32 mitchell_netravali_1_3_1_3_kernel[] =
+	VC4_LINEAR_PHASE_KERNEL(0, -2, -6, -8, -10, -8, -3, 2, 18,
+				50, 82, 119, 155, 187, 213, 227);
+
+static int vc4_hvs_upload_linear_kernel(struct vc4_hvs *hvs,
+					struct drm_mm_node *space,
+					const u32 *kernel)
+{
+	int ret, i;
+	u32 __iomem *dst_kernel;
+
+	ret = drm_mm_insert_node(&hvs->dlist_mm, space, VC4_KERNEL_DWORDS, 1,
+				 0);
+	if (ret) {
+		DRM_ERROR("Failed to allocate space for filter kernel: %d\n",
+			  ret);
+		return ret;
+	}
+
+	dst_kernel = hvs->dlist + space->start;
+
+	for (i = 0; i < VC4_KERNEL_DWORDS; i++) {
+		if (i < VC4_LINEAR_PHASE_KERNEL_DWORDS)
+			writel(kernel[i], &dst_kernel[i]);
+		else {
+			writel(kernel[VC4_KERNEL_DWORDS - i - 1],
+			       &dst_kernel[i]);
+		}
+	}
+
+	return 0;
+}
+
 static int vc4_hvs_bind(struct device *dev, struct device *master, void *data)
 {
 	struct platform_device *pdev = to_platform_device(dev);
 	struct drm_device *drm = dev_get_drvdata(master);
 	struct vc4_dev *vc4 = drm->dev_private;
 	struct vc4_hvs *hvs = NULL;
+	int ret;
 
 	hvs = devm_kzalloc(&pdev->dev, sizeof(*hvs), GFP_KERNEL);
 	if (!hvs)
@@ -130,6 +194,22 @@ static int vc4_hvs_bind(struct device *dev, struct device *master, void *data)
 		    HVS_BOOTLOADER_DLIST_END,
 		    (SCALER_DLIST_SIZE >> 2) - HVS_BOOTLOADER_DLIST_END);
 
+	/* Set up the HVS LBM memory manager.  We could have some more
+	 * complicated data structure that allowed reuse of LBM areas
+	 * between planes when they don't overlap on the screen, but
+	 * for now we just allocate globally.
+	 */
+	drm_mm_init(&hvs->lbm_mm, 0, 96 * 1024);
+
+	/* Upload filter kernels.  We only have the one for now, so we
+	 * keep it around for the lifetime of the driver.
+	 */
+	ret = vc4_hvs_upload_linear_kernel(hvs,
+					   &hvs->mitchell_netravali_filter,
+					   mitchell_netravali_1_3_1_3_kernel);
+	if (ret)
+		return ret;
+
 	vc4->hvs = hvs;
 	return 0;
 }
@@ -140,7 +220,11 @@ static void vc4_hvs_unbind(struct device *dev, struct device *master,
 	struct drm_device *drm = dev_get_drvdata(master);
 	struct vc4_dev *vc4 = drm->dev_private;
 
+	if (vc4->hvs->mitchell_netravali_filter.allocated)
+		drm_mm_remove_node(&vc4->hvs->mitchell_netravali_filter);
+
 	drm_mm_takedown(&vc4->hvs->dlist_mm);
+	drm_mm_takedown(&vc4->hvs->lbm_mm);
 
 	vc4->hvs = NULL;
 }
diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c
index d9c9290..7c2d697 100644
--- a/drivers/gpu/drm/vc4/vc4_plane.c
+++ b/drivers/gpu/drm/vc4/vc4_plane.c
@@ -24,6 +24,12 @@
 #include "drm_fb_cma_helper.h"
 #include "drm_plane_helper.h"
 
+enum vc4_scaling_mode {
+	VC4_SCALING_NONE,
+	VC4_SCALING_TPZ,
+	VC4_SCALING_PPF,
+};
+
 struct vc4_plane_state {
 	struct drm_plane_state base;
 	/* System memory copy of the display list for this element, computed
@@ -47,13 +53,19 @@ struct vc4_plane_state {
 
 	/* Clipped coordinates of the plane on the display. */
 	int crtc_x, crtc_y, crtc_w, crtc_h;
-	/* Clipped size of the area scanned from in the FB. */
-	u32 src_w, src_h;
+	/* Clipped area being scanned from in the FB. */
+	u32 src_x, src_y, src_w, src_h;
+
+	enum vc4_scaling_mode x_scaling, y_scaling;
+	bool is_unity;
 
 	/* Offset to start scanning out from the start of the plane's
 	 * BO.
 	 */
 	u32 offset;
+
+	/* Our allocation in LBM for temporary storage during scaling. */
+	struct drm_mm_node lbm;
 };
 
 static inline struct vc4_plane_state *
@@ -90,6 +102,16 @@ static const struct hvs_format *vc4_get_hvs_format(u32 drm_format)
 	return NULL;
 }
 
+static enum vc4_scaling_mode vc4_get_scaling_mode(u32 src, u32 dst)
+{
+	if (dst > src)
+		return VC4_SCALING_PPF;
+	else if (dst < src)
+		return VC4_SCALING_TPZ;
+	else
+		return VC4_SCALING_NONE;
+}
+
 static bool plane_enabled(struct drm_plane_state *state)
 {
 	return state->fb && state->crtc;
@@ -106,6 +128,8 @@ static struct drm_plane_state *vc4_plane_duplicate_state(struct drm_plane *plane
 	if (!vc4_state)
 		return NULL;
 
+	memset(&vc4_state->lbm, 0, sizeof(vc4_state->lbm));
+
 	__drm_atomic_helper_plane_duplicate_state(plane, &vc4_state->base);
 
 	if (vc4_state->dlist) {
@@ -125,8 +149,17 @@ static struct drm_plane_state *vc4_plane_duplicate_state(struct drm_plane *plane
 static void vc4_plane_destroy_state(struct drm_plane *plane,
 				    struct drm_plane_state *state)
 {
+	struct vc4_dev *vc4 = to_vc4_dev(plane->dev);
 	struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);
 
+	if (vc4_state->lbm.allocated) {
+		unsigned long irqflags;
+
+		spin_lock_irqsave(&vc4->hvs->mm_lock, irqflags);
+		drm_mm_remove_node(&vc4_state->lbm);
+		spin_unlock_irqrestore(&vc4->hvs->mm_lock, irqflags);
+	}
+
 	kfree(vc4_state->dlist);
 	__drm_atomic_helper_plane_destroy_state(plane, &vc4_state->base);
 	kfree(state);
@@ -165,23 +198,60 @@ static void vc4_dlist_write(struct vc4_plane_state *vc4_state, u32 val)
 	vc4_state->dlist[vc4_state->dlist_count++] = val;
 }
 
+/* Returns the scl0/scl1 field based on whether the dimensions need to
+ * be up/down/non-scaled.
+ *
+ * This is a replication of a table from the spec.
+ */
+static u32 vc4_get_scl_field(struct drm_plane_state *state)
+{
+	struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);
+
+	switch (vc4_state->x_scaling << 2 | vc4_state->y_scaling) {
+	case VC4_SCALING_PPF << 2 | VC4_SCALING_PPF:
+		return SCALER_CTL0_SCL_H_PPF_V_PPF;
+	case VC4_SCALING_TPZ << 2 | VC4_SCALING_PPF:
+		return SCALER_CTL0_SCL_H_TPZ_V_PPF;
+	case VC4_SCALING_PPF << 2 | VC4_SCALING_TPZ:
+		return SCALER_CTL0_SCL_H_PPF_V_TPZ;
+	case VC4_SCALING_TPZ << 2 | VC4_SCALING_TPZ:
+		return SCALER_CTL0_SCL_H_TPZ_V_TPZ;
+	case VC4_SCALING_PPF << 2 | VC4_SCALING_NONE:
+		return SCALER_CTL0_SCL_H_PPF_V_NONE;
+	case VC4_SCALING_NONE << 2 | VC4_SCALING_PPF:
+		return SCALER_CTL0_SCL_H_NONE_V_PPF;
+	case VC4_SCALING_NONE << 2 | VC4_SCALING_TPZ:
+		return SCALER_CTL0_SCL_H_NONE_V_TPZ;
+	case VC4_SCALING_TPZ << 2 | VC4_SCALING_NONE:
+		return SCALER_CTL0_SCL_H_TPZ_V_NONE;
+	default:
+	case VC4_SCALING_NONE << 2 | VC4_SCALING_NONE:
+		/* The unity case is independently handled by
+		 * SCALER_CTL0_UNITY.
+		 */
+		return 0;
+	}
+}
+
 static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state)
 {
+	struct drm_plane *plane = state->plane;
 	struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);
 	struct drm_framebuffer *fb = state->fb;
+	u32 subpixel_src_mask = (1 << 16) - 1;
 
 	vc4_state->offset = fb->offsets[0];
 
-	if (state->crtc_w << 16 != state->src_w ||
-	    state->crtc_h << 16 != state->src_h) {
-		/* We don't support scaling yet, which involves
-		 * allocating the LBM memory for scaling temporary
-		 * storage, and putting filter kernels in the HVS
-		 * context.
-		 */
+	/* We don't support subpixel source positioning for scaling. */
+	if ((state->src_x & subpixel_src_mask) ||
+	    (state->src_y & subpixel_src_mask) ||
+	    (state->src_w & subpixel_src_mask) ||
+	    (state->src_h & subpixel_src_mask)) {
 		return -EINVAL;
 	}
 
+	vc4_state->src_x = state->src_x >> 16;
+	vc4_state->src_y = state->src_y >> 16;
 	vc4_state->src_w = state->src_w >> 16;
 	vc4_state->src_h = state->src_h >> 16;
 
@@ -190,6 +260,23 @@ static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state)
 	vc4_state->crtc_w = state->crtc_w;
 	vc4_state->crtc_h = state->crtc_h;
 
+	vc4_state->x_scaling = vc4_get_scaling_mode(vc4_state->src_w,
+						    vc4_state->crtc_w);
+	vc4_state->y_scaling = vc4_get_scaling_mode(vc4_state->src_h,
+						    vc4_state->crtc_h);
+	vc4_state->is_unity = (vc4_state->x_scaling == VC4_SCALING_NONE &&
+			       vc4_state->y_scaling == VC4_SCALING_NONE);
+
+	/* No configuring scaling on the cursor plane, since it gets
+	   non-vblank-synced updates, and scaling requires requires
+	   LBM changes which have to be vblank-synced.
+	 */
+	if (plane->type == DRM_PLANE_TYPE_CURSOR && !vc4_state->is_unity)
+		return -EINVAL;
+
+	/* Clamp the on-screen start x/y to 0.  The hardware doesn't
+	 * support negative y, and negative x wastes bandwidth.
+	 */
 	if (vc4_state->crtc_x < 0) {
 		vc4_state->offset += (drm_format_plane_cpp(fb->pixel_format,
 							   0) *
@@ -207,6 +294,87 @@ static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state)
 	return 0;
 }
 
+static void vc4_write_tpz(struct vc4_plane_state *vc4_state, u32 src, u32 dst)
+{
+	u32 scale, recip;
+
+	scale = (1 << 16) * src / dst;
+
+	/* The specs note that while the reciprocal would be defined
+	 * as (1<<32)/scale, ~0 is close enough.
+	 */
+	recip = ~0 / scale;
+
+	vc4_dlist_write(vc4_state,
+			VC4_SET_FIELD(scale, SCALER_TPZ0_SCALE) |
+			VC4_SET_FIELD(0, SCALER_TPZ0_IPHASE));
+	vc4_dlist_write(vc4_state,
+			VC4_SET_FIELD(recip, SCALER_TPZ1_RECIP));
+}
+
+static void vc4_write_ppf(struct vc4_plane_state *vc4_state, u32 src, u32 dst)
+{
+	u32 scale = (1 << 16) * src / dst;
+
+	vc4_dlist_write(vc4_state,
+			SCALER_PPF_AGC |
+			VC4_SET_FIELD(scale, SCALER_PPF_SCALE) |
+			VC4_SET_FIELD(0, SCALER_PPF_IPHASE));
+}
+
+static u32 vc4_lbm_size(struct drm_plane_state *state)
+{
+	struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);
+	/* This is the worst case number.  One of the two sizes will
+	 * be used depending on the scaling configuration.
+	 */
+	u32 pix_per_line = max(vc4_state->src_w, (u32)vc4_state->crtc_w);
+	u32 lbm;
+
+	if (vc4_state->is_unity)
+		return 0;
+	else if (vc4_state->y_scaling == VC4_SCALING_TPZ)
+		lbm = pix_per_line * 8;
+	else {
+		/* In special cases, this multiplier might be 12. */
+		lbm = pix_per_line * 16;
+	}
+
+	lbm = roundup(lbm, 32);
+
+	return lbm;
+}
+
+static void vc4_write_scaling_parameters(struct drm_plane_state *state)
+{
+	struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);
+
+	/* Ch0 H-PPF Word 0: Scaling Parameters */
+	if (vc4_state->x_scaling == VC4_SCALING_PPF) {
+		vc4_write_ppf(vc4_state,
+			      vc4_state->src_w, vc4_state->crtc_w);
+	}
+
+	/* Ch0 V-PPF Words 0-1: Scaling Parameters, Context */
+	if (vc4_state->y_scaling == VC4_SCALING_PPF) {
+		vc4_write_ppf(vc4_state,
+			      vc4_state->src_h, vc4_state->crtc_h);
+		vc4_dlist_write(vc4_state, 0xc0c0c0c0);
+	}
+
+	/* Ch0 H-TPZ Words 0-1: Scaling Parameters, Recip */
+	if (vc4_state->x_scaling == VC4_SCALING_TPZ) {
+		vc4_write_tpz(vc4_state,
+			      vc4_state->src_w, vc4_state->crtc_w);
+	}
+
+	/* Ch0 V-TPZ Words 0-2: Scaling Parameters, Recip, Context */
+	if (vc4_state->y_scaling == VC4_SCALING_TPZ) {
+		vc4_write_tpz(vc4_state,
+			      vc4_state->src_h, vc4_state->crtc_h);
+		vc4_dlist_write(vc4_state, 0xc0c0c0c0);
+	}
+}
 
 /* Writes out a full display list for an active plane to the plane's
  * private dlist state.
@@ -214,22 +382,50 @@ static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state)
 static int vc4_plane_mode_set(struct drm_plane *plane,
 			      struct drm_plane_state *state)
 {
+	struct vc4_dev *vc4 = to_vc4_dev(plane->dev);
 	struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);
 	struct drm_framebuffer *fb = state->fb;
 	struct drm_gem_cma_object *bo = drm_fb_cma_get_gem_obj(fb, 0);
 	u32 ctl0_offset = vc4_state->dlist_count;
 	const struct hvs_format *format = vc4_get_hvs_format(fb->pixel_format);
+	u32 scl;
+	u32 lbm_size;
+	unsigned long irqflags;
 	int ret;
 
 	ret = vc4_plane_setup_clipping_and_scaling(state);
 	if (ret)
 		return ret;
 
+	/* Allocate the LBM memory that the HVS will use for temporary
+	 * storage due to our scaling/format conversion.
+	 */
+	lbm_size = vc4_lbm_size(state);
+	if (lbm_size) {
+		if (!vc4_state->lbm.allocated) {
+			spin_lock_irqsave(&vc4->hvs->mm_lock, irqflags);
+			ret = drm_mm_insert_node(&vc4->hvs->lbm_mm,
+						 &vc4_state->lbm,
+						 lbm_size, 32, 0);
+			spin_unlock_irqrestore(&vc4->hvs->mm_lock, irqflags);
+		} else {
+			WARN_ON_ONCE(lbm_size != vc4_state->lbm.size);
+		}
+	}
+
+	if (ret)
+		return ret;
+
+	scl = vc4_get_scl_field(state);
+
+	/* Control word */
 	vc4_dlist_write(vc4_state,
 			SCALER_CTL0_VALID |
 			(format->pixel_order << SCALER_CTL0_ORDER_SHIFT) |
 			(format->hvs << SCALER_CTL0_PIXEL_FORMAT_SHIFT) |
-			SCALER_CTL0_UNITY);
+			(vc4_state->is_unity ? SCALER_CTL0_UNITY : 0) |
+			VC4_SET_FIELD(scl, SCALER_CTL0_SCL0) |
+			VC4_SET_FIELD(scl, SCALER_CTL0_SCL1));
 
 	/* Position Word 0: Image Positions and Alpha Value */
 	vc4_state->pos0_offset = vc4_state->dlist_count;
@@ -238,9 +434,14 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
 			VC4_SET_FIELD(vc4_state->crtc_x, SCALER_POS0_START_X) |
 			VC4_SET_FIELD(vc4_state->crtc_y, SCALER_POS0_START_Y));
 
-	/* Position Word 1: Scaled Image Dimensions.
-	 * Skipped due to SCALER_CTL0_UNITY scaling.
-	 */
+	/* Position Word 1: Scaled Image Dimensions. */
+	if (!vc4_state->is_unity) {
+		vc4_dlist_write(vc4_state,
+				VC4_SET_FIELD(vc4_state->crtc_w,
+					      SCALER_POS1_SCL_WIDTH) |
+				VC4_SET_FIELD(vc4_state->crtc_h,
+					      SCALER_POS1_SCL_HEIGHT));
+	}
 
 	/* Position Word 2: Source Image Size, Alpha Mode */
 	vc4_state->pos2_offset = vc4_state->dlist_count;
@@ -266,6 +467,32 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
 	vc4_dlist_write(vc4_state,
 			VC4_SET_FIELD(fb->pitches[0], SCALER_SRC_PITCH));
 
+	if (!vc4_state->is_unity) {
+		/* LBM Base Address. */
+		if (vc4_state->y_scaling != VC4_SCALING_NONE)
+			vc4_dlist_write(vc4_state, vc4_state->lbm.start);
+
+		vc4_write_scaling_parameters(state);
+
+		/* If any PPF setup was done, then all the kernel
+		 * pointers get uploaded.
+		 */
+		if (vc4_state->x_scaling == VC4_SCALING_PPF ||
+		    vc4_state->y_scaling == VC4_SCALING_PPF) {
+			u32 kernel = VC4_SET_FIELD(vc4->hvs->mitchell_netravali_filter.start,
+						   SCALER_PPF_KERNEL_OFFSET);
+
+			/* HPPF plane 0 */
+			vc4_dlist_write(vc4_state, kernel);
+			/* VPPF plane 0 */
+			vc4_dlist_write(vc4_state, kernel);
+			/* HPPF plane 1 */
+			vc4_dlist_write(vc4_state, kernel);
+			/* VPPF plane 1 */
+			vc4_dlist_write(vc4_state, kernel);
+		}
+	}
+
 	vc4_state->dlist[ctl0_offset] |=
 		VC4_SET_FIELD(vc4_state->dlist_count, SCALER_CTL0_SIZE);
 
diff --git a/drivers/gpu/drm/vc4/vc4_regs.h b/drivers/gpu/drm/vc4/vc4_regs.h
index 7c29993..a5b544d 100644
--- a/drivers/gpu/drm/vc4/vc4_regs.h
+++ b/drivers/gpu/drm/vc4/vc4_regs.h
@@ -552,6 +552,21 @@ enum hvs_pixel_format {
 #define SCALER_CTL0_ORDER_MASK			VC4_MASK(14, 13)
 #define SCALER_CTL0_ORDER_SHIFT			13
 
+#define SCALER_CTL0_SCL1_MASK			VC4_MASK(10, 8)
+#define SCALER_CTL0_SCL1_SHIFT			8
+
+#define SCALER_CTL0_SCL0_MASK			VC4_MASK(7, 5)
+#define SCALER_CTL0_SCL0_SHIFT			5
+
+#define SCALER_CTL0_SCL_H_PPF_V_PPF		0
+#define SCALER_CTL0_SCL_H_TPZ_V_PPF		1
+#define SCALER_CTL0_SCL_H_PPF_V_TPZ		2
+#define SCALER_CTL0_SCL_H_TPZ_V_TPZ		3
+#define SCALER_CTL0_SCL_H_PPF_V_NONE		4
+#define SCALER_CTL0_SCL_H_NONE_V_PPF		5
+#define SCALER_CTL0_SCL_H_NONE_V_TPZ		6
+#define SCALER_CTL0_SCL_H_TPZ_V_NONE		7
+
 /* Set to indicate no scaling. */
 #define SCALER_CTL0_UNITY			BIT(4)
 
@@ -567,6 +582,12 @@ enum hvs_pixel_format {
 #define SCALER_POS0_START_X_MASK		VC4_MASK(11, 0)
 #define SCALER_POS0_START_X_SHIFT		0
 
+#define SCALER_POS1_SCL_HEIGHT_MASK		VC4_MASK(27, 16)
+#define SCALER_POS1_SCL_HEIGHT_SHIFT		16
+
+#define SCALER_POS1_SCL_WIDTH_MASK		VC4_MASK(11, 0)
+#define SCALER_POS1_SCL_WIDTH_SHIFT		0
+
 #define SCALER_POS2_ALPHA_MODE_MASK		VC4_MASK(31, 30)
 #define SCALER_POS2_ALPHA_MODE_SHIFT		30
 #define SCALER_POS2_ALPHA_MODE_PIPELINE		0
@@ -580,6 +601,31 @@ enum hvs_pixel_format {
 #define SCALER_POS2_WIDTH_MASK			VC4_MASK(11, 0)
 #define SCALER_POS2_WIDTH_SHIFT			0
 
+#define SCALER_TPZ0_VERT_RECALC			BIT(31)
+#define SCALER_TPZ0_SCALE_MASK			VC4_MASK(28, 8)
+#define SCALER_TPZ0_SCALE_SHIFT			8
+#define SCALER_TPZ0_IPHASE_MASK			VC4_MASK(7, 0)
+#define SCALER_TPZ0_IPHASE_SHIFT		0
+#define SCALER_TPZ1_RECIP_MASK			VC4_MASK(15, 0)
+#define SCALER_TPZ1_RECIP_SHIFT			0
+
+/* Skips interpolating coefficients to 64 phases, so just 8 are used.
+ * Required for nearest neighbor.
+ */
+#define SCALER_PPF_NOINTERP			BIT(31)
+/* Replaces the highest valued coefficient with one that makes all 4
+ * sum to unity.
+ */
+#define SCALER_PPF_AGC				BIT(30)
+#define SCALER_PPF_SCALE_MASK			VC4_MASK(24, 8)
+#define SCALER_PPF_SCALE_SHIFT			8
+#define SCALER_PPF_IPHASE_MASK			VC4_MASK(6, 0)
+#define SCALER_PPF_IPHASE_SHIFT			0
+
+#define SCALER_PPF_KERNEL_OFFSET_MASK		VC4_MASK(13, 0)
+#define SCALER_PPF_KERNEL_OFFSET_SHIFT		0
+#define SCALER_PPF_KERNEL_UNCACHED		BIT(31)
+
 #define SCALER_SRC_PITCH_MASK			VC4_MASK(15, 0)
 #define SCALER_SRC_PITCH_SHIFT			0
 
-- 
2.7.3

From c1f11c3b1a4379841341911c379237d3a3870607 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 20 Oct 2015 13:59:15 +0100
Subject: [PATCH 14/36] drm/vc4: Add support for a few more RGB display
 plane formats.

These were all touch-tested with modetest.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_plane.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c
index 7c2d697..013ebff 100644
--- a/drivers/gpu/drm/vc4/vc4_plane.c
+++ b/drivers/gpu/drm/vc4/vc4_plane.c
@@ -88,6 +88,22 @@ static const struct hvs_format {
 		.drm = DRM_FORMAT_ARGB8888, .hvs = HVS_PIXEL_FORMAT_RGBA8888,
 		.pixel_order = HVS_PIXEL_ORDER_ABGR, .has_alpha = true,
 	},
+	{
+		.drm = DRM_FORMAT_RGB565, .hvs = HVS_PIXEL_FORMAT_RGB565,
+		.pixel_order = HVS_PIXEL_ORDER_XRGB, .has_alpha = false,
+	},
+	{
+		.drm = DRM_FORMAT_BGR565, .hvs = HVS_PIXEL_FORMAT_RGB565,
+		.pixel_order = HVS_PIXEL_ORDER_XBGR, .has_alpha = false,
+	},
+	{
+		.drm = DRM_FORMAT_ARGB1555, .hvs = HVS_PIXEL_FORMAT_RGBA5551,
+		.pixel_order = HVS_PIXEL_ORDER_ABGR, .has_alpha = true,
+	},
+	{
+		.drm = DRM_FORMAT_XRGB1555, .hvs = HVS_PIXEL_FORMAT_RGBA5551,
+		.pixel_order = HVS_PIXEL_ORDER_ABGR, .has_alpha = false,
+	},
 };
 
 static const struct hvs_format *vc4_get_hvs_format(u32 drm_format)
-- 
2.7.3

From 149a88adaedd0bea6c6f2f12dcf893d740be2ebb Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 30 Dec 2015 12:25:44 -0800
Subject: [PATCH 15/36] drm/vc4: Add support for YUV planes.

This supports 420 and 422 subsampling with 2 or 3 planes, tested with
modetest.  It doesn't set up chroma subsampling position (which it
appears KMS doesn't deal with yet).

The LBM memory is overallocated in many cases, but apparently the docs
aren't quite correct and I'll probably need to look at the hardware
source to really figure it out.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_plane.c | 256 +++++++++++++++++++++++++++++++---------
 drivers/gpu/drm/vc4/vc4_regs.h  |  56 ++++++++-
 2 files changed, 253 insertions(+), 59 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c
index 013ebff..7b0c72a 100644
--- a/drivers/gpu/drm/vc4/vc4_plane.c
+++ b/drivers/gpu/drm/vc4/vc4_plane.c
@@ -54,15 +54,19 @@ struct vc4_plane_state {
 	/* Clipped coordinates of the plane on the display. */
 	int crtc_x, crtc_y, crtc_w, crtc_h;
 	/* Clipped area being scanned from in the FB. */
-	u32 src_x, src_y, src_w, src_h;
+	u32 src_x, src_y;
 
-	enum vc4_scaling_mode x_scaling, y_scaling;
+	u32 src_w[2], src_h[2];
+
+	/* Scaling selection for the RGB/Y plane and the Cb/Cr planes. */
+	enum vc4_scaling_mode x_scaling[2], y_scaling[2];
 	bool is_unity;
+	bool is_yuv;
 
 	/* Offset to start scanning out from the start of the plane's
 	 * BO.
 	 */
-	u32 offset;
+	u32 offsets[3];
 
 	/* Our allocation in LBM for temporary storage during scaling. */
 	struct drm_mm_node lbm;
@@ -79,6 +83,7 @@ static const struct hvs_format {
 	u32 hvs; /* HVS_FORMAT_* */
 	u32 pixel_order;
 	bool has_alpha;
+	bool flip_cbcr;
 } hvs_formats[] = {
 	{
 		.drm = DRM_FORMAT_XRGB8888, .hvs = HVS_PIXEL_FORMAT_RGBA8888,
@@ -104,6 +109,32 @@ static const struct hvs_format {
 		.drm = DRM_FORMAT_XRGB1555, .hvs = HVS_PIXEL_FORMAT_RGBA5551,
 		.pixel_order = HVS_PIXEL_ORDER_ABGR, .has_alpha = false,
 	},
+	{
+		.drm = DRM_FORMAT_YUV422,
+		.hvs = HVS_PIXEL_FORMAT_YCBCR_YUV422_3PLANE,
+	},
+	{
+		.drm = DRM_FORMAT_YVU422,
+		.hvs = HVS_PIXEL_FORMAT_YCBCR_YUV422_3PLANE,
+		.flip_cbcr = true,
+	},
+	{
+		.drm = DRM_FORMAT_YUV420,
+		.hvs = HVS_PIXEL_FORMAT_YCBCR_YUV420_3PLANE,
+	},
+	{
+		.drm = DRM_FORMAT_YVU420,
+		.hvs = HVS_PIXEL_FORMAT_YCBCR_YUV420_3PLANE,
+		.flip_cbcr = true,
+	},
+	{
+		.drm = DRM_FORMAT_NV12,
+		.hvs = HVS_PIXEL_FORMAT_YCBCR_YUV420_2PLANE,
+	},
+	{
+		.drm = DRM_FORMAT_NV16,
+		.hvs = HVS_PIXEL_FORMAT_YCBCR_YUV422_2PLANE,
+	},
 };
 
 static const struct hvs_format *vc4_get_hvs_format(u32 drm_format)
@@ -219,11 +250,11 @@ static void vc4_dlist_write(struct vc4_plane_state *vc4_state, u32 val)
  *
  * This is a replication of a table from the spec.
  */
-static u32 vc4_get_scl_field(struct drm_plane_state *state)
+static u32 vc4_get_scl_field(struct drm_plane_state *state, int plane)
 {
 	struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);
 
-	switch (vc4_state->x_scaling << 2 | vc4_state->y_scaling) {
+	switch (vc4_state->x_scaling[plane] << 2 | vc4_state->y_scaling[plane]) {
 	case VC4_SCALING_PPF << 2 | VC4_SCALING_PPF:
 		return SCALER_CTL0_SCL_H_PPF_V_PPF;
 	case VC4_SCALING_TPZ << 2 | VC4_SCALING_PPF:
@@ -254,9 +285,16 @@ static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state)
 	struct drm_plane *plane = state->plane;
 	struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);
 	struct drm_framebuffer *fb = state->fb;
+	struct drm_gem_cma_object *bo = drm_fb_cma_get_gem_obj(fb, 0);
 	u32 subpixel_src_mask = (1 << 16) - 1;
+	u32 format = fb->pixel_format;
+	int num_planes = drm_format_num_planes(format);
+	u32 h_subsample = 1;
+	u32 v_subsample = 1;
+	int i;
 
-	vc4_state->offset = fb->offsets[0];
+	for (i = 0; i < num_planes; i++)
+		vc4_state->offsets[i] = bo->paddr + fb->offsets[i];
 
 	/* We don't support subpixel source positioning for scaling. */
 	if ((state->src_x & subpixel_src_mask) ||
@@ -268,20 +306,48 @@ static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state)
 
 	vc4_state->src_x = state->src_x >> 16;
 	vc4_state->src_y = state->src_y >> 16;
-	vc4_state->src_w = state->src_w >> 16;
-	vc4_state->src_h = state->src_h >> 16;
+	vc4_state->src_w[0] = state->src_w >> 16;
+	vc4_state->src_h[0] = state->src_h >> 16;
 
 	vc4_state->crtc_x = state->crtc_x;
 	vc4_state->crtc_y = state->crtc_y;
 	vc4_state->crtc_w = state->crtc_w;
 	vc4_state->crtc_h = state->crtc_h;
 
-	vc4_state->x_scaling = vc4_get_scaling_mode(vc4_state->src_w,
-						    vc4_state->crtc_w);
-	vc4_state->y_scaling = vc4_get_scaling_mode(vc4_state->src_h,
-						    vc4_state->crtc_h);
-	vc4_state->is_unity = (vc4_state->x_scaling == VC4_SCALING_NONE &&
-			       vc4_state->y_scaling == VC4_SCALING_NONE);
+	vc4_state->x_scaling[0] = vc4_get_scaling_mode(vc4_state->src_w[0],
+						       vc4_state->crtc_w);
+	vc4_state->y_scaling[0] = vc4_get_scaling_mode(vc4_state->src_h[0],
+						       vc4_state->crtc_h);
+
+	if (num_planes > 1) {
+		vc4_state->is_yuv = true;
+
+		h_subsample = drm_format_horz_chroma_subsampling(format);
+		v_subsample = drm_format_vert_chroma_subsampling(format);
+		vc4_state->src_w[1] = vc4_state->src_w[0] / h_subsample;
+		vc4_state->src_h[1] = vc4_state->src_h[0] / v_subsample;
+
+		vc4_state->x_scaling[1] =
+			vc4_get_scaling_mode(vc4_state->src_w[1],
+					     vc4_state->crtc_w);
+		vc4_state->y_scaling[1] =
+			vc4_get_scaling_mode(vc4_state->src_h[1],
+					     vc4_state->crtc_h);
+
+		/* YUV conversion requires that scaling be enabled,
+		 * even on a plane that's otherwise 1:1.  Choose TPZ
+		 * for simplicity.
+		 */
+		if (vc4_state->x_scaling[0] == VC4_SCALING_NONE)
+			vc4_state->x_scaling[0] = VC4_SCALING_TPZ;
+		if (vc4_state->y_scaling[0] == VC4_SCALING_NONE)
+			vc4_state->y_scaling[0] = VC4_SCALING_TPZ;
+	}
+
+	vc4_state->is_unity = (vc4_state->x_scaling[0] == VC4_SCALING_NONE &&
+			       vc4_state->y_scaling[0] == VC4_SCALING_NONE &&
+			       vc4_state->x_scaling[1] == VC4_SCALING_NONE &&
+			       vc4_state->y_scaling[1] == VC4_SCALING_NONE);
 
 	/* No configuring scaling on the cursor plane, since it gets
 	   non-vblank-synced updates, and scaling requires requires
@@ -294,16 +360,27 @@ static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state)
 	 * support negative y, and negative x wastes bandwidth.
 	 */
 	if (vc4_state->crtc_x < 0) {
-		vc4_state->offset += (drm_format_plane_cpp(fb->pixel_format,
-							   0) *
-				      -vc4_state->crtc_x);
-		vc4_state->src_w += vc4_state->crtc_x;
+		for (i = 0; i < num_planes; i++) {
+			u32 cpp = drm_format_plane_cpp(fb->pixel_format, i);
+			u32 subs = ((i == 0) ? 1 : h_subsample);
+
+			vc4_state->offsets[i] += (cpp *
+						  (-vc4_state->crtc_x) / subs);
+		}
+		vc4_state->src_w[0] += vc4_state->crtc_x;
+		vc4_state->src_w[1] += vc4_state->crtc_x / h_subsample;
 		vc4_state->crtc_x = 0;
 	}
 
 	if (vc4_state->crtc_y < 0) {
-		vc4_state->offset += fb->pitches[0] * -vc4_state->crtc_y;
-		vc4_state->src_h += vc4_state->crtc_y;
+		for (i = 0; i < num_planes; i++) {
+			u32 subs = ((i == 0) ? 1 : v_subsample);
+
+			vc4_state->offsets[i] += (fb->pitches[i] *
+						  (-vc4_state->crtc_y) / subs);
+		}
+		vc4_state->src_h[0] += vc4_state->crtc_y;
+		vc4_state->src_h[1] += vc4_state->crtc_y / v_subsample;
 		vc4_state->crtc_y = 0;
 	}
 
@@ -344,15 +421,23 @@ static u32 vc4_lbm_size(struct drm_plane_state *state)
 	/* This is the worst case number.  One of the two sizes will
 	 * be used depending on the scaling configuration.
 	 */
-	u32 pix_per_line = max(vc4_state->src_w, (u32)vc4_state->crtc_w);
+	u32 pix_per_line = max(vc4_state->src_w[0], (u32)vc4_state->crtc_w);
 	u32 lbm;
 
-	if (vc4_state->is_unity)
-		return 0;
-	else if (vc4_state->y_scaling == VC4_SCALING_TPZ)
-		lbm = pix_per_line * 8;
-	else {
-		/* In special cases, this multiplier might be 12. */
+	if (!vc4_state->is_yuv) {
+		if (vc4_state->is_unity)
+			return 0;
+		else if (vc4_state->y_scaling[0] == VC4_SCALING_TPZ)
+			lbm = pix_per_line * 8;
+		else {
+			/* In special cases, this multiplier might be 12. */
+			lbm = pix_per_line * 16;
+		}
+	} else {
+		/* There are cases for this going down to a multiplier
+		 * of 2, but according to the firmware source, the
+		 * table in the docs is somewhat wrong.
+		 */
 		lbm = pix_per_line * 16;
 	}
 
@@ -361,33 +446,34 @@ static u32 vc4_lbm_size(struct drm_plane_state *state)
 	return lbm;
 }
 
-static void vc4_write_scaling_parameters(struct drm_plane_state *state)
+static void vc4_write_scaling_parameters(struct drm_plane_state *state,
+					 int channel)
 {
 	struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);
 
 	/* Ch0 H-PPF Word 0: Scaling Parameters */
-	if (vc4_state->x_scaling == VC4_SCALING_PPF) {
+	if (vc4_state->x_scaling[channel] == VC4_SCALING_PPF) {
 		vc4_write_ppf(vc4_state,
-			      vc4_state->src_w, vc4_state->crtc_w);
+			      vc4_state->src_w[channel], vc4_state->crtc_w);
 	}
 
 	/* Ch0 V-PPF Words 0-1: Scaling Parameters, Context */
-	if (vc4_state->y_scaling == VC4_SCALING_PPF) {
+	if (vc4_state->y_scaling[channel] == VC4_SCALING_PPF) {
 		vc4_write_ppf(vc4_state,
-			      vc4_state->src_h, vc4_state->crtc_h);
+			      vc4_state->src_h[channel], vc4_state->crtc_h);
 		vc4_dlist_write(vc4_state, 0xc0c0c0c0);
 	}
 
 	/* Ch0 H-TPZ Words 0-1: Scaling Parameters, Recip */
-	if (vc4_state->x_scaling == VC4_SCALING_TPZ) {
+	if (vc4_state->x_scaling[channel] == VC4_SCALING_TPZ) {
 		vc4_write_tpz(vc4_state,
-			      vc4_state->src_w, vc4_state->crtc_w);
+			      vc4_state->src_w[channel], vc4_state->crtc_w);
 	}
 
 	/* Ch0 V-TPZ Words 0-2: Scaling Parameters, Recip, Context */
-	if (vc4_state->y_scaling == VC4_SCALING_TPZ) {
+	if (vc4_state->y_scaling[channel] == VC4_SCALING_TPZ) {
 		vc4_write_tpz(vc4_state,
-			      vc4_state->src_h, vc4_state->crtc_h);
+			      vc4_state->src_h[channel], vc4_state->crtc_h);
 		vc4_dlist_write(vc4_state, 0xc0c0c0c0);
 	}
 }
@@ -401,13 +487,13 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
 	struct vc4_dev *vc4 = to_vc4_dev(plane->dev);
 	struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);
 	struct drm_framebuffer *fb = state->fb;
-	struct drm_gem_cma_object *bo = drm_fb_cma_get_gem_obj(fb, 0);
 	u32 ctl0_offset = vc4_state->dlist_count;
 	const struct hvs_format *format = vc4_get_hvs_format(fb->pixel_format);
-	u32 scl;
+	int num_planes = drm_format_num_planes(format->drm);
+	u32 scl0, scl1;
 	u32 lbm_size;
 	unsigned long irqflags;
-	int ret;
+	int ret, i;
 
 	ret = vc4_plane_setup_clipping_and_scaling(state);
 	if (ret)
@@ -432,7 +518,19 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
 	if (ret)
 		return ret;
 
-	scl = vc4_get_scl_field(state);
+	/* SCL1 is used for Cb/Cr scaling of planar formats.  Single-plane
+	 * formats keep their scaling state in index 0, and scl1 is set
+	 * to scl0 so both channels of the scaler do the same thing.
+	 * For YUV, the Y plane needs to be put in channel 1 and Cb/Cr
+	 * in channel 0, so we swap the scl fields here.
+	 */
+	if (num_planes == 1) {
+		scl0 = vc4_get_scl_field(state, 0);
+		scl1 = scl0;
+	} else {
+		scl0 = vc4_get_scl_field(state, 1);
+		scl1 = vc4_get_scl_field(state, 0);
+	}
 
 	/* Control word */
 	vc4_dlist_write(vc4_state,
@@ -440,8 +538,8 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
 			(format->pixel_order << SCALER_CTL0_ORDER_SHIFT) |
 			(format->hvs << SCALER_CTL0_PIXEL_FORMAT_SHIFT) |
 			(vc4_state->is_unity ? SCALER_CTL0_UNITY : 0) |
-			VC4_SET_FIELD(scl, SCALER_CTL0_SCL0) |
-			VC4_SET_FIELD(scl, SCALER_CTL0_SCL1));
+			VC4_SET_FIELD(scl0, SCALER_CTL0_SCL0) |
+			VC4_SET_FIELD(scl1, SCALER_CTL0_SCL1));
 
 	/* Position Word 0: Image Positions and Alpha Value */
 	vc4_state->pos0_offset = vc4_state->dlist_count;
@@ -466,35 +564,68 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
 				      SCALER_POS2_ALPHA_MODE_PIPELINE :
 				      SCALER_POS2_ALPHA_MODE_FIXED,
 				      SCALER_POS2_ALPHA_MODE) |
-			VC4_SET_FIELD(vc4_state->src_w, SCALER_POS2_WIDTH) |
-			VC4_SET_FIELD(vc4_state->src_h, SCALER_POS2_HEIGHT));
+			VC4_SET_FIELD(vc4_state->src_w[0], SCALER_POS2_WIDTH) |
+			VC4_SET_FIELD(vc4_state->src_h[0], SCALER_POS2_HEIGHT));
 
 	/* Position Word 3: Context.  Written by the HVS. */
 	vc4_dlist_write(vc4_state, 0xc0c0c0c0);
 
-	/* Pointer Word 0: RGB / Y Pointer */
+
+	/* Pointer Word 0/1/2: RGB / Y / Cb / Cr Pointers
+	 *
+	 * The pointers may be any byte address.
+	 */
 	vc4_state->ptr0_offset = vc4_state->dlist_count;
-	vc4_dlist_write(vc4_state, bo->paddr + vc4_state->offset);
+	if (!format->flip_cbcr) {
+		for (i = 0; i < num_planes; i++)
+			vc4_dlist_write(vc4_state, vc4_state->offsets[i]);
+	} else {
+		WARN_ON_ONCE(num_planes != 3);
+		vc4_dlist_write(vc4_state, vc4_state->offsets[0]);
+		vc4_dlist_write(vc4_state, vc4_state->offsets[2]);
+		vc4_dlist_write(vc4_state, vc4_state->offsets[1]);
+	}
 
-	/* Pointer Context Word 0: Written by the HVS */
-	vc4_dlist_write(vc4_state, 0xc0c0c0c0);
+	/* Pointer Context Word 0/1/2: Written by the HVS */
+	for (i = 0; i < num_planes; i++)
+		vc4_dlist_write(vc4_state, 0xc0c0c0c0);
 
-	/* Pitch word 0: Pointer 0 Pitch */
-	vc4_dlist_write(vc4_state,
-			VC4_SET_FIELD(fb->pitches[0], SCALER_SRC_PITCH));
+	/* Pitch word 0/1/2 */
+	for (i = 0; i < num_planes; i++) {
+		vc4_dlist_write(vc4_state,
+				VC4_SET_FIELD(fb->pitches[i], SCALER_SRC_PITCH));
+	}
+
+	/* Colorspace conversion words */
+	if (vc4_state->is_yuv) {
+		vc4_dlist_write(vc4_state, SCALER_CSC0_ITR_R_601_5);
+		vc4_dlist_write(vc4_state, SCALER_CSC1_ITR_R_601_5);
+		vc4_dlist_write(vc4_state, SCALER_CSC2_ITR_R_601_5);
+	}
 
 	if (!vc4_state->is_unity) {
 		/* LBM Base Address. */
-		if (vc4_state->y_scaling != VC4_SCALING_NONE)
+		if (vc4_state->y_scaling[0] != VC4_SCALING_NONE ||
+		    vc4_state->y_scaling[1] != VC4_SCALING_NONE) {
 			vc4_dlist_write(vc4_state, vc4_state->lbm.start);
+		}
 
-		vc4_write_scaling_parameters(state);
+		if (num_planes > 1) {
+			/* Emit Cb/Cr as channel 0 and Y as channel
+			 * 1. This matches how we set up scl0/scl1
+			 * above.
+			 */
+			vc4_write_scaling_parameters(state, 1);
+		}
+		vc4_write_scaling_parameters(state, 0);
 
 		/* If any PPF setup was done, then all the kernel
 		 * pointers get uploaded.
 		 */
-		if (vc4_state->x_scaling == VC4_SCALING_PPF ||
-		    vc4_state->y_scaling == VC4_SCALING_PPF) {
+		if (vc4_state->x_scaling[0] == VC4_SCALING_PPF ||
+		    vc4_state->y_scaling[0] == VC4_SCALING_PPF ||
+		    vc4_state->x_scaling[1] == VC4_SCALING_PPF ||
+		    vc4_state->y_scaling[1] == VC4_SCALING_PPF) {
 			u32 kernel = VC4_SET_FIELD(vc4->hvs->mitchell_netravali_filter.start,
 						   SCALER_PPF_KERNEL_OFFSET);
 
@@ -698,6 +829,7 @@ struct drm_plane *vc4_plane_init(struct drm_device *dev,
 	struct drm_plane *plane = NULL;
 	struct vc4_plane *vc4_plane;
 	u32 formats[ARRAY_SIZE(hvs_formats)];
+	u32 num_formats = 0;
 	int ret = 0;
 	unsigned i;
 
@@ -708,12 +840,20 @@ struct drm_plane *vc4_plane_init(struct drm_device *dev,
 		goto fail;
 	}
 
-	for (i = 0; i < ARRAY_SIZE(hvs_formats); i++)
-		formats[i] = hvs_formats[i].drm;
+	for (i = 0; i < ARRAY_SIZE(hvs_formats); i++) {
+		/* Don't allow YUV in cursor planes, since that means
+		 * turning on the scaler, which we don't allow for the
+		 * cursor.
+		 */
+		if (type != DRM_PLANE_TYPE_CURSOR ||
+		    hvs_formats[i].hvs < HVS_PIXEL_FORMAT_YCBCR_YUV420_3PLANE) {
+			formats[num_formats++] = hvs_formats[i].drm;
+		}
+	}
 	plane = &vc4_plane->base;
 	ret = drm_universal_plane_init(dev, plane, 0xff,
 				       &vc4_plane_funcs,
-				       formats, ARRAY_SIZE(formats),
+				       formats, num_formats,
 				       type, NULL);
 
 	drm_plane_helper_add(plane, &vc4_plane_helper_funcs);
diff --git a/drivers/gpu/drm/vc4/vc4_regs.h b/drivers/gpu/drm/vc4/vc4_regs.h
index a5b544d..bf42a8e 100644
--- a/drivers/gpu/drm/vc4/vc4_regs.h
+++ b/drivers/gpu/drm/vc4/vc4_regs.h
@@ -519,7 +519,12 @@ enum hvs_pixel_format {
 	HVS_PIXEL_FORMAT_RGB888 = 5,
 	HVS_PIXEL_FORMAT_RGBA6666 = 6,
 	/* 32bpp */
-	HVS_PIXEL_FORMAT_RGBA8888 = 7
+	HVS_PIXEL_FORMAT_RGBA8888 = 7,
+
+	HVS_PIXEL_FORMAT_YCBCR_YUV420_3PLANE = 8,
+	HVS_PIXEL_FORMAT_YCBCR_YUV420_2PLANE = 9,
+	HVS_PIXEL_FORMAT_YCBCR_YUV422_3PLANE = 10,
+	HVS_PIXEL_FORMAT_YCBCR_YUV422_2PLANE = 11,
 };
 
 /* Note: the LSB is the rightmost character shown.  Only valid for
@@ -601,6 +606,55 @@ enum hvs_pixel_format {
 #define SCALER_POS2_WIDTH_MASK			VC4_MASK(11, 0)
 #define SCALER_POS2_WIDTH_SHIFT			0
 
+/* Color Space Conversion words.  Some values are S2.8 signed
+ * integers, except that the 2 integer bits map as {0x0: 0, 0x1: 1,
+ * 0x2: 2, 0x3: -1}
+ */
+/* bottom 8 bits of S2.8 contribution of Cr to Blue */
+#define SCALER_CSC0_COEF_CR_BLU_MASK		VC4_MASK(31, 24)
+#define SCALER_CSC0_COEF_CR_BLU_SHIFT		24
+/* Signed offset to apply to Y before CSC. (Y' = Y + YY_OFS) */
+#define SCALER_CSC0_COEF_YY_OFS_MASK		VC4_MASK(23, 16)
+#define SCALER_CSC0_COEF_YY_OFS_SHIFT		16
+/* Signed offset to apply to CB before CSC (Cb' = Cb - 128 + CB_OFS). */
+#define SCALER_CSC0_COEF_CB_OFS_MASK		VC4_MASK(15, 8)
+#define SCALER_CSC0_COEF_CB_OFS_SHIFT		8
+/* Signed offset to apply to CR before CSC (Cr' = Cr - 128 + CR_OFS). */
+#define SCALER_CSC0_COEF_CR_OFS_MASK		VC4_MASK(7, 0)
+#define SCALER_CSC0_COEF_CR_OFS_SHIFT		0
+#define SCALER_CSC0_ITR_R_601_5			0x00f00000
+#define SCALER_CSC0_ITR_R_709_3			0x00f00000
+#define SCALER_CSC0_JPEG_JFIF			0x00000000
+
+/* S2.8 contribution of Cb to Green */
+#define SCALER_CSC1_COEF_CB_GRN_MASK		VC4_MASK(31, 22)
+#define SCALER_CSC1_COEF_CB_GRN_SHIFT		22
+/* S2.8 contribution of Cr to Green */
+#define SCALER_CSC1_COEF_CR_GRN_MASK		VC4_MASK(21, 12)
+#define SCALER_CSC1_COEF_CR_GRN_SHIFT		12
+/* S2.8 contribution of Y to all of RGB */
+#define SCALER_CSC1_COEF_YY_ALL_MASK		VC4_MASK(11, 2)
+#define SCALER_CSC1_COEF_YY_ALL_SHIFT		2
+/* top 2 bits of S2.8 contribution of Cr to Blue */
+#define SCALER_CSC1_COEF_CR_BLU_MASK		VC4_MASK(1, 0)
+#define SCALER_CSC1_COEF_CR_BLU_SHIFT		0
+#define SCALER_CSC1_ITR_R_601_5			0xe73304a8
+#define SCALER_CSC1_ITR_R_709_3			0xf2b784a8
+#define SCALER_CSC1_JPEG_JFIF			0xea34a400
+
+/* S2.8 contribution of Cb to Red */
+#define SCALER_CSC2_COEF_CB_RED_MASK		VC4_MASK(29, 20)
+#define SCALER_CSC2_COEF_CB_RED_SHIFT		20
+/* S2.8 contribution of Cr to Red */
+#define SCALER_CSC2_COEF_CR_RED_MASK		VC4_MASK(19, 10)
+#define SCALER_CSC2_COEF_CR_RED_SHIFT		10
+/* S2.8 contribution of Cb to Blue */
+#define SCALER_CSC2_COEF_CB_BLU_MASK		VC4_MASK(9, 0)
+#define SCALER_CSC2_COEF_CB_BLU_SHIFT		0
+#define SCALER_CSC2_ITR_R_601_5			0x00066204
+#define SCALER_CSC2_ITR_R_709_3			0x00072a1c
+#define SCALER_CSC2_JPEG_JFIF			0x000599c5
+
 #define SCALER_TPZ0_VERT_RECALC			BIT(31)
 #define SCALER_TPZ0_SCALE_MASK			VC4_MASK(28, 8)
 #define SCALER_TPZ0_SCALE_SHIFT			8
-- 
2.7.3

From 47ed1ee3dbfde89297b81bc09f1b483f4da1b06d Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 29 Feb 2016 17:53:00 -0800
Subject: [PATCH 16/36] drm/vc4: Let gpiolib know that we're OK with sleeping
 for HPD.

Fixes an error thrown every few seconds when we poll HPD when it's on
an I2C to GPIO expander.

Signed-off-by: Eric Anholt <eric@anholt.net>
Tested-by: Daniel Stone <daniels@collabora.com>
---
 drivers/gpu/drm/vc4/vc4_hdmi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c
index 56272ca..6bcf51d 100644
--- a/drivers/gpu/drm/vc4/vc4_hdmi.c
+++ b/drivers/gpu/drm/vc4/vc4_hdmi.c
@@ -166,7 +166,7 @@ vc4_hdmi_connector_detect(struct drm_connector *connector, bool force)
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
 
 	if (vc4->hdmi->hpd_gpio) {
-		if (gpio_get_value(vc4->hdmi->hpd_gpio))
+		if (gpio_get_value_cansleep(vc4->hdmi->hpd_gpio))
 			return connector_status_connected;
 		else
 			return connector_status_disconnected;
-- 
2.7.3

From 24b28acef42486c282bc58e977cbfc66191a8f38 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 29 Feb 2016 17:53:01 -0800
Subject: [PATCH 17/36] drm/vc4: Respect GPIO_ACTIVE_LOW on HDMI HPD if set in
 the devicetree.

The original Raspberry Pi had the GPIO active high, but the later
models are active low.  The DT GPIO bindings allow specifying the
active flag, except that it doesn't get propagated to the gpiodesc, so
you have to handle it yourself.

Signed-off-by: Eric Anholt <eric@anholt.net>
Tested-by: Daniel Stone <daniels@collabora.com>
---
 drivers/gpu/drm/vc4/vc4_hdmi.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c
index 6bcf51d..d8b8649 100644
--- a/drivers/gpu/drm/vc4/vc4_hdmi.c
+++ b/drivers/gpu/drm/vc4/vc4_hdmi.c
@@ -47,6 +47,7 @@ struct vc4_hdmi {
 	void __iomem *hdmicore_regs;
 	void __iomem *hd_regs;
 	int hpd_gpio;
+	bool hpd_active_low;
 
 	struct clk *pixel_clock;
 	struct clk *hsm_clock;
@@ -166,7 +167,8 @@ vc4_hdmi_connector_detect(struct drm_connector *connector, bool force)
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
 
 	if (vc4->hdmi->hpd_gpio) {
-		if (gpio_get_value_cansleep(vc4->hdmi->hpd_gpio))
+		if (gpio_get_value_cansleep(vc4->hdmi->hpd_gpio) ^
+		    vc4->hdmi->hpd_active_low)
 			return connector_status_connected;
 		else
 			return connector_status_disconnected;
@@ -517,11 +519,17 @@ static int vc4_hdmi_bind(struct device *dev, struct device *master, void *data)
 	 * we'll use the HDMI core's register.
 	 */
 	if (of_find_property(dev->of_node, "hpd-gpios", &value)) {
-		hdmi->hpd_gpio = of_get_named_gpio(dev->of_node, "hpd-gpios", 0);
+		enum of_gpio_flags hpd_gpio_flags;
+
+		hdmi->hpd_gpio = of_get_named_gpio_flags(dev->of_node,
+							 "hpd-gpios", 0,
+							 &hpd_gpio_flags);
 		if (hdmi->hpd_gpio < 0) {
 			ret = hdmi->hpd_gpio;
 			goto err_unprepare_hsm;
 		}
+
+		hdmi->hpd_active_low = hpd_gpio_flags & OF_GPIO_ACTIVE_LOW;
 	}
 
 	vc4->hdmi = hdmi;
-- 
2.7.3

From ed6836e411dd559a811dd063509a01772f4fe00f Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 4 Mar 2016 12:32:07 -0800
Subject: [PATCH 19/36] drm/vc4: Recognize a more specific compatible string
 for V3D.

The Raspberry Pi Foundation's firmware updates are shipping device
trees using the old string, so we'll keep recognizing that as this rev
of V3D.  Still, we should use a more specific name in the upstream DT
to clarify which board is being supported, in case we do other revs of
V3D in the future.

Signed-off-by: Eric Anholt <eric@anholt.net>
Acked-by: Stephen Warren <swarren@wwwdotorg.org>
---
 drivers/gpu/drm/vc4/vc4_v3d.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c
index 31de5d1..e6d3c60 100644
--- a/drivers/gpu/drm/vc4/vc4_v3d.c
+++ b/drivers/gpu/drm/vc4/vc4_v3d.c
@@ -268,6 +268,7 @@ static int vc4_v3d_dev_remove(struct platform_device *pdev)
 }
 
 static const struct of_device_id vc4_v3d_dt_match[] = {
+	{ .compatible = "brcm,bcm2835-v3d" },
 	{ .compatible = "brcm,vc4-v3d" },
 	{}
 };
-- 
2.7.3

From 55acd7db60c8247d926969b705373765c26c1f44 Mon Sep 17 00:00:00 2001
From: Martin Sperl <kernel@martin.sperl.org>
Date: Fri, 11 Sep 2015 11:22:05 +0000
Subject: [PATCH 21/36] ARM: bcm2835: add the auxiliary spi1 and spi2 to the
 device tree

This enables the use of the auxiliary spi1 and spi2 devices
on the bcm2835 SOC.

Note that this requires the use of the new clk-bcm2835-aux to work.

Signed-off-by: Martin Sperl <kernel@martin.sperl.org>
Acked-by: Stephen Warren <swarren@wwwdotorg.org>
[anholt: Rebased on 2835.dtsi -> 283x.dtsi change]
Signed-off-by: Eric Anholt <eric@anholt.net>
---
 arch/arm/boot/dts/bcm283x.dtsi | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/arch/arm/boot/dts/bcm283x.dtsi b/arch/arm/boot/dts/bcm283x.dtsi
index 971e741..f0d4573 100644
--- a/arch/arm/boot/dts/bcm283x.dtsi
+++ b/arch/arm/boot/dts/bcm283x.dtsi
@@ -1,5 +1,6 @@
 #include <dt-bindings/pinctrl/bcm2835.h>
 #include <dt-bindings/clock/bcm2835.h>
+#include <dt-bindings/clock/bcm2835-aux.h>
 #include "skeleton.dtsi"
 
 /* This include file covers the common peripherals and configuration between
@@ -159,6 +160,26 @@
 			clocks = <&clocks BCM2835_CLOCK_VPU>;
 		};
 
+		spi1: spi@7e215080 {
+			compatible = "brcm,bcm2835-aux-spi";
+			reg = <0x7e215080 0x40>;
+			interrupts = <1 29>;
+			clocks = <&aux BCM2835_AUX_CLOCK_SPI1>;
+			#address-cells = <1>;
+			#size-cells = <0>;
+			status = "disabled";
+		};
+
+		spi2: spi@7e2150c0 {
+			compatible = "brcm,bcm2835-aux-spi";
+			reg = <0x7e2150c0 0x40>;
+			interrupts = <1 29>;
+			clocks = <&aux BCM2835_AUX_CLOCK_SPI2>;
+			#address-cells = <1>;
+			#size-cells = <0>;
+			status = "disabled";
+		};
+
 		sdhci: sdhci@7e300000 {
 			compatible = "brcm,bcm2835-sdhci";
 			reg = <0x7e300000 0x100>;
-- 
2.7.3

From 3e0a385cebe3b0d9338cab356c6a11daaa64f808 Mon Sep 17 00:00:00 2001
From: Remi Pommarel <repk@triplefau.lt>
Date: Mon, 21 Dec 2015 21:12:59 +0100
Subject: [PATCH 22/36] ARM: bcm2835: Add PWM clock support to the device tree

Signed-off-by: Remi Pommarel <repk@triplefau.lt>
[anholt: Rebased on 2835.dtsi -> 283x.dtsi change]
Signed-off-by: Eric Anholt <eric@anholt.net>
---
 arch/arm/boot/dts/bcm2835-rpi.dtsi |  4 ++++
 arch/arm/boot/dts/bcm283x.dtsi     | 10 ++++++++++
 2 files changed, 14 insertions(+)

diff --git a/arch/arm/boot/dts/bcm2835-rpi.dtsi b/arch/arm/boot/dts/bcm2835-rpi.dtsi
index 3afb9fe..a584a93 100644
--- a/arch/arm/boot/dts/bcm2835-rpi.dtsi
+++ b/arch/arm/boot/dts/bcm2835-rpi.dtsi
@@ -58,3 +58,7 @@
 	status = "okay";
 	bus-width = <4>;
 };
+
+&pwm {
+	status = "okay";
+};
diff --git a/arch/arm/boot/dts/bcm283x.dtsi b/arch/arm/boot/dts/bcm283x.dtsi
index f0d4573..e4a2792 100644
--- a/arch/arm/boot/dts/bcm283x.dtsi
+++ b/arch/arm/boot/dts/bcm283x.dtsi
@@ -180,6 +180,16 @@
 			status = "disabled";
 		};
 
+		pwm: pwm@7e20c000 {
+			compatible = "brcm,bcm2835-pwm";
+			reg = <0x7e20c000 0x28>;
+			clocks = <&clocks BCM2835_CLOCK_PWM>;
+			assigned-clocks = <&clocks BCM2835_CLOCK_PWM>;
+			assigned-clock-rates = <10000000>;
+			#pwm-cells = <2>;
+			status = "disabled";
+		};
+
 		sdhci: sdhci@7e300000 {
 			compatible = "brcm,bcm2835-sdhci";
 			reg = <0x7e300000 0x100>;
-- 
2.7.3

From 84416a8360e7c31e6ba9f7775c077bd5f3fe32de Mon Sep 17 00:00:00 2001
From: Lubomir Rintel <lkundrak@v3.sk>
Date: Mon, 25 Jan 2016 21:40:06 +0100
Subject: [PATCH 23/36] ARM: bcm2835: dt: Add Raspberry Pi Model A

This one is essentially the same as revision 2 B board (with the I2S on
P5 header).

Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>
Acked-by: Stephen Warren <swarren@wwwdotorg.org>
[anholt: Rebased on bcm2835.dtsi -> bcm283x.dtsi change]
Signed-off-by: Eric Anholt <eric@anholt.net>
---
 arch/arm/boot/dts/Makefile          |  1 +
 arch/arm/boot/dts/bcm2835-rpi-a.dts | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+)
 create mode 100644 arch/arm/boot/dts/bcm2835-rpi-a.dts

diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile
index a4a6d70..d000814 100644
--- a/arch/arm/boot/dts/Makefile
+++ b/arch/arm/boot/dts/Makefile
@@ -60,6 +60,7 @@ dtb-$(CONFIG_ARCH_AXXIA) += \
 	axm5516-amarillo.dtb
 dtb-$(CONFIG_ARCH_BCM2835) += \
 	bcm2835-rpi-b.dtb \
+	bcm2835-rpi-a.dtb \
 	bcm2835-rpi-b-rev2.dtb \
 	bcm2835-rpi-b-plus.dtb \
 	bcm2835-rpi-a-plus.dtb \
diff --git a/arch/arm/boot/dts/bcm2835-rpi-a.dts b/arch/arm/boot/dts/bcm2835-rpi-a.dts
new file mode 100644
index 0000000..ddbbbbd
--- /dev/null
+++ b/arch/arm/boot/dts/bcm2835-rpi-a.dts
@@ -0,0 +1,24 @@
+/dts-v1/;
+#include "bcm2835.dtsi"
+#include "bcm2835-rpi.dtsi"
+
+/ {
+	compatible = "raspberrypi,model-a", "brcm,bcm2835";
+	model = "Raspberry Pi Model A";
+
+	leds {
+		act {
+			gpios = <&gpio 16 1>;
+		};
+	};
+};
+
+&gpio {
+	pinctrl-0 = <&gpioout &alt0 &i2s_alt2 &alt3>;
+
+	/* I2S interface */
+	i2s_alt2: i2s_alt2 {
+		brcm,pins = <28 29 30 31>;
+		brcm,function = <BCM2835_FSEL_ALT2>;
+	};
+};
-- 
2.7.3

From f6fd06c97f0d6d8398b8caba1b6879fa7e0284ba Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Wed, 16 Dec 2015 16:26:49 -0800
Subject: [PATCH 24/36] ARM: bcm2835: Add the Raspberry Pi power domain driver
 to the DT.

This connects the USB driver to the USB power domain, so that USB can
actually be turned on at boot if the bootloader didn't do it for us.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Eric Anholt <eric@anholt.net>
Reviewed-by: Kevin Hilman <khilman@linaro.org>
---
 arch/arm/boot/dts/bcm2835-rpi.dtsi | 12 ++++++++++++
 arch/arm/boot/dts/bcm283x.dtsi     |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/arm/boot/dts/bcm2835-rpi.dtsi b/arch/arm/boot/dts/bcm2835-rpi.dtsi
index a584a93..76bdbca 100644
--- a/arch/arm/boot/dts/bcm2835-rpi.dtsi
+++ b/arch/arm/boot/dts/bcm2835-rpi.dtsi
@@ -1,3 +1,5 @@
+#include <dt-bindings/power/raspberrypi-power.h>
+
 / {
 	memory {
 		reg = <0 0x10000000>;
@@ -18,6 +20,12 @@
 			compatible = "raspberrypi,bcm2835-firmware";
 			mboxes = <&mailbox>;
 		};
+
+		power: power {
+			compatible = "raspberrypi,bcm2835-power";
+			firmware = <&firmware>;
+			#power-domain-cells = <1>;
+		};
 	};
 };
 
@@ -62,3 +70,7 @@
 &pwm {
 	status = "okay";
 };
+
+&usb {
+	power-domains = <&power RPI_POWER_DOMAIN_USB>;
+};
diff --git a/arch/arm/boot/dts/bcm283x.dtsi b/arch/arm/boot/dts/bcm283x.dtsi
index e4a2792..e69a6cf 100644
--- a/arch/arm/boot/dts/bcm283x.dtsi
+++ b/arch/arm/boot/dts/bcm283x.dtsi
@@ -218,7 +218,7 @@
 			status = "disabled";
 		};
 
-		usb@7e980000 {
+		usb: usb@7e980000 {
 			compatible = "brcm,bcm2835-usb";
 			reg = <0x7e980000 0x10000>;
 			interrupts = <1 9>;
-- 
2.7.3

From 580146c680ac7b706ac54d7d7db8204bee3d2e93 Mon Sep 17 00:00:00 2001
From: Martin Sperl <kernel@martin.sperl.org>
Date: Fri, 12 Feb 2016 11:14:25 +0000
Subject: [PATCH 25/36] ARM: bcm2835: add bcm2835-aux-uart support to DT

Add bcm2835-aux-uart support to the device tree.

Signed-off-by: Martin Sperl <kernel@martin.sperl.org>
Signed-off-by: Eric Anholt <eric@anholt.net>
---
 arch/arm/boot/dts/bcm283x.dtsi | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/arm/boot/dts/bcm283x.dtsi b/arch/arm/boot/dts/bcm283x.dtsi
index e69a6cf..fc67964 100644
--- a/arch/arm/boot/dts/bcm283x.dtsi
+++ b/arch/arm/boot/dts/bcm283x.dtsi
@@ -160,6 +160,14 @@
 			clocks = <&clocks BCM2835_CLOCK_VPU>;
 		};
 
+		uart1: serial@7e215040 {
+			compatible = "brcm,bcm2835-aux-uart";
+			reg = <0x7e215040 0x40>;
+			interrupts = <1 29>;
+			clocks = <&aux BCM2835_AUX_CLOCK_UART>;
+			status = "disabled";
+		};
+
 		spi1: spi@7e215080 {
 			compatible = "brcm,bcm2835-aux-spi";
 			reg = <0x7e215080 0x40>;
-- 
2.7.3

From 41135d2ce60509e53306e5b76afab98ddc15951b Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 2 Mar 2015 14:36:16 -0800
Subject: [PATCH 26/36] ARM: bcm2835: Add VC4 to the device tree.

VC4 is the GPU (display and 3D) present on the 283x.

v2: Sort by register address, mark HDMI as disabled by default in the
    SoC file and enable it from -rpi.
v3: Add references to the pixel/HSM clocks for HDMI.  Rename
    compatibility strings and clean up node names.
v4: Fix comment marking pv0's interrupt as pwa2 instead of pwa0.
    Rename hpd-gpio to hpd-gpios.
v5: Rebase on bcm283x.dtsi change, add v3d.
v6: Make HDMI reference the power domain.
v7: Fix the HDMI HPD gpios active value and HDMI enable for each RPI
    board.  Change V3D compatible string to 2835.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 arch/arm/boot/dts/bcm2835-rpi-a-plus.dts |  4 +++
 arch/arm/boot/dts/bcm2835-rpi-a.dts      |  4 +++
 arch/arm/boot/dts/bcm2835-rpi-b-plus.dts |  4 +++
 arch/arm/boot/dts/bcm2835-rpi-b-rev2.dts |  4 +++
 arch/arm/boot/dts/bcm2835-rpi-b.dts      |  4 +++
 arch/arm/boot/dts/bcm2835-rpi.dtsi       |  9 ++++++
 arch/arm/boot/dts/bcm2836-rpi-2-b.dts    |  4 +++
 arch/arm/boot/dts/bcm283x.dtsi           | 47 ++++++++++++++++++++++++++++++++
 8 files changed, 80 insertions(+)

diff --git a/arch/arm/boot/dts/bcm2835-rpi-a-plus.dts b/arch/arm/boot/dts/bcm2835-rpi-a-plus.dts
index 228614f..35ff4e7a 100644
--- a/arch/arm/boot/dts/bcm2835-rpi-a-plus.dts
+++ b/arch/arm/boot/dts/bcm2835-rpi-a-plus.dts
@@ -29,3 +29,7 @@
 		brcm,function = <BCM2835_FSEL_ALT0>;
 	};
 };
+
+&hdmi {
+	hpd-gpios = <&gpio 46 GPIO_ACTIVE_LOW>;
+};
diff --git a/arch/arm/boot/dts/bcm2835-rpi-a.dts b/arch/arm/boot/dts/bcm2835-rpi-a.dts
index ddbbbbd..306a84e 100644
--- a/arch/arm/boot/dts/bcm2835-rpi-a.dts
+++ b/arch/arm/boot/dts/bcm2835-rpi-a.dts
@@ -22,3 +22,7 @@
 		brcm,function = <BCM2835_FSEL_ALT2>;
 	};
 };
+
+&hdmi {
+	hpd-gpios = <&gpio 46 GPIO_ACTIVE_HIGH>;
+};
diff --git a/arch/arm/boot/dts/bcm2835-rpi-b-plus.dts b/arch/arm/boot/dts/bcm2835-rpi-b-plus.dts
index ef54050..57d313b 100644
--- a/arch/arm/boot/dts/bcm2835-rpi-b-plus.dts
+++ b/arch/arm/boot/dts/bcm2835-rpi-b-plus.dts
@@ -29,3 +29,7 @@
 		brcm,function = <BCM2835_FSEL_ALT0>;
 	};
 };
+
+&hdmi {
+	hpd-gpios = <&gpio 46 GPIO_ACTIVE_LOW>;
+};
diff --git a/arch/arm/boot/dts/bcm2835-rpi-b-rev2.dts b/arch/arm/boot/dts/bcm2835-rpi-b-rev2.dts
index 86f1f2f..cf2774e 100644
--- a/arch/arm/boot/dts/bcm2835-rpi-b-rev2.dts
+++ b/arch/arm/boot/dts/bcm2835-rpi-b-rev2.dts
@@ -22,3 +22,7 @@
 		brcm,function = <BCM2835_FSEL_ALT2>;
 	};
 };
+
+&hdmi {
+	hpd-gpios = <&gpio 46 GPIO_ACTIVE_LOW>;
+};
diff --git a/arch/arm/boot/dts/bcm2835-rpi-b.dts b/arch/arm/boot/dts/bcm2835-rpi-b.dts
index 4859e9d..8b15f9c 100644
--- a/arch/arm/boot/dts/bcm2835-rpi-b.dts
+++ b/arch/arm/boot/dts/bcm2835-rpi-b.dts
@@ -16,3 +16,7 @@
 &gpio {
 	pinctrl-0 = <&gpioout &alt0 &alt3>;
 };
+
+&hdmi {
+	hpd-gpios = <&gpio 46 GPIO_ACTIVE_HIGH>;
+};
diff --git a/arch/arm/boot/dts/bcm2835-rpi.dtsi b/arch/arm/boot/dts/bcm2835-rpi.dtsi
index 76bdbca..caf2707 100644
--- a/arch/arm/boot/dts/bcm2835-rpi.dtsi
+++ b/arch/arm/boot/dts/bcm2835-rpi.dtsi
@@ -74,3 +74,12 @@
 &usb {
 	power-domains = <&power RPI_POWER_DOMAIN_USB>;
 };
+
+&v3d {
+	power-domains = <&power RPI_POWER_DOMAIN_V3D>;
+};
+
+&hdmi {
+	power-domains = <&power RPI_POWER_DOMAIN_HDMI>;
+	status = "okay";
+};
diff --git a/arch/arm/boot/dts/bcm2836-rpi-2-b.dts b/arch/arm/boot/dts/bcm2836-rpi-2-b.dts
index ff94666..c4743f4 100644
--- a/arch/arm/boot/dts/bcm2836-rpi-2-b.dts
+++ b/arch/arm/boot/dts/bcm2836-rpi-2-b.dts
@@ -33,3 +33,7 @@
 		brcm,function = <BCM2835_FSEL_ALT0>;
 	};
 };
+
+&hdmi {
+	hpd-gpios = <&gpio 46 GPIO_ACTIVE_LOW>;
+};
diff --git a/arch/arm/boot/dts/bcm283x.dtsi b/arch/arm/boot/dts/bcm283x.dtsi
index fc67964..bbe4eab 100644
--- a/arch/arm/boot/dts/bcm283x.dtsi
+++ b/arch/arm/boot/dts/bcm283x.dtsi
@@ -1,6 +1,7 @@
 #include <dt-bindings/pinctrl/bcm2835.h>
 #include <dt-bindings/clock/bcm2835.h>
 #include <dt-bindings/clock/bcm2835-aux.h>
+#include <dt-bindings/gpio/gpio.h>
 #include "skeleton.dtsi"
 
 /* This include file covers the common peripherals and configuration between
@@ -153,6 +154,18 @@
 			status = "disabled";
 		};
 
+		pixelvalve@7e206000 {
+			compatible = "brcm,bcm2835-pixelvalve0";
+			reg = <0x7e206000 0x100>;
+			interrupts = <2 13>; /* pwa0 */
+		};
+
+		pixelvalve@7e207000 {
+			compatible = "brcm,bcm2835-pixelvalve1";
+			reg = <0x7e207000 0x100>;
+			interrupts = <2 14>; /* pwa1 */
+		};
+
 		aux: aux@0x7e215000 {
 			compatible = "brcm,bcm2835-aux";
 			#clock-cells = <1>;
@@ -206,6 +219,12 @@
 			status = "disabled";
 		};
 
+		hvs@7e400000 {
+			compatible = "brcm,bcm2835-hvs";
+			reg = <0x7e400000 0x6000>;
+			interrupts = <2 1>;
+		};
+
 		i2c1: i2c@7e804000 {
 			compatible = "brcm,bcm2835-i2c";
 			reg = <0x7e804000 0x1000>;
@@ -226,11 +245,39 @@
 			status = "disabled";
 		};
 
+		pixelvalve@7e807000 {
+			compatible = "brcm,bcm2835-pixelvalve2";
+			reg = <0x7e807000 0x100>;
+			interrupts = <2 10>; /* pixelvalve */
+		};
+
+		hdmi: hdmi@7e902000 {
+			compatible = "brcm,bcm2835-hdmi";
+			reg = <0x7e902000 0x600>,
+			      <0x7e808000 0x100>;
+			interrupts = <2 8>, <2 9>;
+			ddc = <&i2c2>;
+			clocks = <&clocks BCM2835_PLLH_PIX>,
+				 <&clocks BCM2835_CLOCK_HSM>;
+			clock-names = "pixel", "hdmi";
+			status = "disabled";
+		};
+
 		usb: usb@7e980000 {
 			compatible = "brcm,bcm2835-usb";
 			reg = <0x7e980000 0x10000>;
 			interrupts = <1 9>;
 		};
+
+		v3d: v3d@7ec00000 {
+			compatible = "brcm,bcm2835-v3d";
+			reg = <0x7ec00000 0x1000>;
+			interrupts = <1 10>;
+		};
+
+		vc4: gpu {
+			compatible = "brcm,bcm2835-vc4";
+		};
 	};
 
 	clocks {
-- 
2.7.3

From da77f737f9f5a487f3a1f80f8546585ee18cd7b9 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 4 Mar 2016 10:39:28 -0800
Subject: [PATCH 27/36] dt-bindings: Add root properties for Raspberry Pi 3

Signed-off-by: Eric Anholt <eric@anholt.net>
Acked-by: Rob Herring <robh@kernel.org>
---
 Documentation/devicetree/bindings/arm/bcm/brcm,bcm2835.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm2835.txt b/Documentation/devicetree/bindings/arm/bcm/brcm,bcm2835.txt
index 11d3056..6ffe087 100644
--- a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm2835.txt
+++ b/Documentation/devicetree/bindings/arm/bcm/brcm,bcm2835.txt
@@ -30,6 +30,10 @@ Raspberry Pi 2 Model B
 Required root node properties:
 compatible = "raspberrypi,2-model-b", "brcm,bcm2836";
 
+Raspberry Pi 3 Model B
+Required root node properties:
+compatible = "raspberrypi,3-model-b", "brcm,bcm2837";
+
 Raspberry Pi Compute Module
 Required root node properties:
 compatible = "raspberrypi,compute-module", "brcm,bcm2835";
-- 
2.7.3

From b76b1cdf2e569cceab41dcf3b3f6a90965d0a02c Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 4 Mar 2016 10:39:29 -0800
Subject: [PATCH 28/36] ARM: bcm2835: Add devicetree for the Raspberry Pi 3.

For now this doesn't support the new hardware present on the Pi 3 (BT,
wifi, GPIO expander).  Since the GPIO expander isn't supported, we
also don't have the LEDs like the other board files do.

Signed-off-by: Eric Anholt <eric@anholt.net>
Acked-by: Stephen Warren <swarren@wwwdotorg.org>
---
 arch/arm/boot/dts/Makefile            |  3 +-
 arch/arm/boot/dts/bcm2837-rpi-3-b.dts | 22 ++++++++++++
 arch/arm/boot/dts/bcm2837.dtsi        | 68 +++++++++++++++++++++++++++++++++++
 3 files changed, 92 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm/boot/dts/bcm2837-rpi-3-b.dts
 create mode 100644 arch/arm/boot/dts/bcm2837.dtsi

diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile
index d000814..a8a0767 100644
--- a/arch/arm/boot/dts/Makefile
+++ b/arch/arm/boot/dts/Makefile
@@ -64,7 +64,8 @@ dtb-$(CONFIG_ARCH_BCM2835) += \
 	bcm2835-rpi-b-rev2.dtb \
 	bcm2835-rpi-b-plus.dtb \
 	bcm2835-rpi-a-plus.dtb \
-	bcm2836-rpi-2-b.dtb
+	bcm2836-rpi-2-b.dtb \
+	bcm2837-rpi-3-b.dtb
 dtb-$(CONFIG_ARCH_BCM_5301X) += \
 	bcm4708-asus-rt-ac56u.dtb \
 	bcm4708-asus-rt-ac68u.dtb \
diff --git a/arch/arm/boot/dts/bcm2837-rpi-3-b.dts b/arch/arm/boot/dts/bcm2837-rpi-3-b.dts
new file mode 100644
index 0000000..5e8eafd
--- /dev/null
+++ b/arch/arm/boot/dts/bcm2837-rpi-3-b.dts
@@ -0,0 +1,22 @@
+/dts-v1/;
+#include "bcm2837.dtsi"
+#include "bcm2835-rpi.dtsi"
+
+/ {
+	compatible = "raspberrypi,3-model-b", "brcm,bcm2837";
+	model = "Raspberry Pi 3 Model B";
+
+	memory {
+		reg = <0 0x40000000>;
+	};
+};
+
+&gpio {
+	pinctrl-0 = <&gpioout &alt0 &i2s_alt0 &alt3>;
+
+	/* I2S interface */
+	i2s_alt0: i2s_alt0 {
+		brcm,pins = <28 29 30 31>;
+		brcm,function = <BCM2835_FSEL_ALT2>;
+	};
+};
diff --git a/arch/arm/boot/dts/bcm2837.dtsi b/arch/arm/boot/dts/bcm2837.dtsi
new file mode 100644
index 0000000..2f36722
--- /dev/null
+++ b/arch/arm/boot/dts/bcm2837.dtsi
@@ -0,0 +1,68 @@
+#include "bcm283x.dtsi"
+
+/ {
+	compatible = "brcm,bcm2836";
+
+	soc {
+		ranges = <0x7e000000 0x3f000000 0x1000000>,
+			 <0x40000000 0x40000000 0x00001000>;
+		dma-ranges = <0xc0000000 0x00000000 0x3f000000>;
+
+		local_intc: local_intc {
+			compatible = "brcm,bcm2836-l1-intc";
+			reg = <0x40000000 0x100>;
+			interrupt-controller;
+			#interrupt-cells = <1>;
+			interrupt-parent = <&local_intc>;
+		};
+	};
+
+	timer {
+		compatible = "arm,armv7-timer";
+		interrupt-parent = <&local_intc>;
+		interrupts = <0>, // PHYS_SECURE_PPI
+			     <1>, // PHYS_NONSECURE_PPI
+			     <3>, // VIRT_PPI
+			     <2>; // HYP_PPI
+		always-on;
+	};
+
+	cpus: cpus {
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		cpu0: cpu@0 {
+			device_type = "cpu";
+			compatible = "arm,cortex-a53";
+			reg = <0>;
+		};
+
+		cpu1: cpu@1 {
+			device_type = "cpu";
+			compatible = "arm,cortex-a53";
+			reg = <1>;
+		};
+
+		cpu2: cpu@2 {
+			device_type = "cpu";
+			compatible = "arm,cortex-a53";
+			reg = <2>;
+		};
+
+		cpu3: cpu@3 {
+			device_type = "cpu";
+			compatible = "arm,cortex-a53";
+			reg = <3>;
+		};
+	};
+};
+
+/* Make the BCM2835-style global interrupt controller be a child of the
+ * CPU-local interrupt controller.
+ */
+&intc {
+	compatible = "brcm,bcm2836-armctrl-ic";
+	reg = <0x7e00b200 0x200>;
+	interrupt-parent = <&local_intc>;
+	interrupts = <8>;
+};
-- 
2.7.3

From 43aa67b7bccfb189a3e57832f08710c98fe707c6 Mon Sep 17 00:00:00 2001
From: Martin Sperl <kernel@martin.sperl.org>
Date: Sun, 17 Jan 2016 12:15:28 +0000
Subject: [PATCH 29/36] ARM: bcm2835: follow dt uart node-naming convention

This patch fixes the naming of the device tree node: uart@7e201000
to conform to the standard of: serial@7e201000

Signed-off-by: Martin Sperl <kernel@martin.sperl.org>
---
 arch/arm/boot/dts/bcm283x.dtsi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/boot/dts/bcm283x.dtsi b/arch/arm/boot/dts/bcm283x.dtsi
index bbe4eab..31cc2f2 100644
--- a/arch/arm/boot/dts/bcm283x.dtsi
+++ b/arch/arm/boot/dts/bcm283x.dtsi
@@ -113,7 +113,7 @@
 			#interrupt-cells = <2>;
 		};
 
-		uart0: uart@7e201000 {
+		uart0: serial@7e201000 {
 			compatible = "brcm,bcm2835-pl011", "arm,pl011", "arm,primecell";
 			reg = <0x7e201000 0x1000>;
 			interrupts = <2 25>;
-- 
2.7.3

From 72b53a14be5ff0bda535faefa09bc9726acbe1ff Mon Sep 17 00:00:00 2001
From: Martin Sperl <kernel@martin.sperl.org>
Date: Sun, 17 Jan 2016 12:15:29 +0000
Subject: [PATCH 30/36] dt/bindings: serial: bcm2835: add binding documentation
 for bcm2835-aux-uart

Add binding documentation for the bcm2835-aux-uart driver.

Signed-off-by: Martin Sperl <kernel@martin.sperl.org>

Changelog:
	V2->V3: fixed naming convention for node
Acked-by: Rob Herring <robh@kernel.org>
Acked-by: Eric Anholt <eric@anholt.net>
---
 .../bindings/serial/brcm,bcm2835-aux-uart.txt          | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/serial/brcm,bcm2835-aux-uart.txt

diff --git a/Documentation/devicetree/bindings/serial/brcm,bcm2835-aux-uart.txt b/Documentation/devicetree/bindings/serial/brcm,bcm2835-aux-uart.txt
new file mode 100644
index 0000000..b5cc629
--- /dev/null
+++ b/Documentation/devicetree/bindings/serial/brcm,bcm2835-aux-uart.txt
@@ -0,0 +1,18 @@
+* BCM2835 AUXILIARY UART
+
+Required properties:
+
+- compatible: "brcm,bcm2835-aux-uart"
+- reg: The base address of the UART register bank.
+- interrupts: A single interrupt specifier.
+- clocks: Clock driving the hardware; used to figure out the baud rate
+  divisor.
+
+Example:
+
+	uart1: serial@7e215040 {
+		compatible = "brcm,bcm2835-aux-uart";
+		reg = <0x7e215040 0x40>;
+		interrupts = <1 29>;
+		clocks = <&aux BCM2835_AUX_CLOCK_UART>;
+	};
-- 
2.7.3

From 285a4ac466d3712b50f5c0d29bf5874476f00c30 Mon Sep 17 00:00:00 2001
From: Martin Sperl <kernel@martin.sperl.org>
Date: Sun, 17 Jan 2016 12:15:30 +0000
Subject: [PATCH 31/36] serial: bcm2835: add driver for bcm2835-aux-uart

The bcm2835 SOC contains an auxiliary uart, which is very close
to the ns16550 with some differences.

The big difference is that the uart HW is not using an internal divider
of 16 but 8, which results in an effective baud-rate being twice
the requested baud-rate.

This driver handles this device correctly and handles the difference in
the HW divider by scaling up the clock by a factor of 2.

The approach to write a separate (wrapper) driver instead of using a
multiplying clock and "ns16550" as compatibility in the device-tree
has been recommended by Stephen Warren.

Signed-off-by: Martin Sperl <kernel@martin.sperl.org>

Changelog:
	V1->V2: made an explicit bcm2835-aux-uart driver
		not controlling the settings via DT only
	V2->V3: added comments on UART capabilities
		applied recommendations by Stefan Wahren
		keep registered line-id in bcm2835aux_data
Acked-by: Eric Anholt <eric@anholt.net>
---
 drivers/tty/serial/8250/8250_bcm2835aux.c | 146 ++++++++++++++++++++++++++++++
 drivers/tty/serial/8250/Kconfig           |  24 +++++
 drivers/tty/serial/8250/Makefile          |   1 +
 3 files changed, 171 insertions(+)
 create mode 100644 drivers/tty/serial/8250/8250_bcm2835aux.c

diff --git a/drivers/tty/serial/8250/8250_bcm2835aux.c b/drivers/tty/serial/8250/8250_bcm2835aux.c
new file mode 100644
index 0000000..ecf89f1
--- /dev/null
+++ b/drivers/tty/serial/8250/8250_bcm2835aux.c
@@ -0,0 +1,146 @@
+/*
+ * Serial port driver for BCM2835AUX UART
+ *
+ * Copyright (C) 2016 Martin Sperl <kernel@martin.sperl.org>
+ *
+ * Based on 8250_lpc18xx.c:
+ * Copyright (C) 2015 Joachim Eastwood <manabian@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/clk.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+
+#include "8250.h"
+
+struct bcm2835aux_data {
+	struct uart_8250_port uart;	/* port template passed to the 8250 core */
+	struct clk *clk;		/* UART clock obtained in probe */
+	int line;			/* line returned when registering the port */
+};
+
+/* Gather clk, irq and MMIO resources, then register the 8250 port. */
+static int bcm2835aux_serial_probe(struct platform_device *pdev)
+{
+	struct bcm2835aux_data *data;
+	struct resource *res;
+	int ret;
+
+	/* allocate the custom structure */
+	data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	/* initialize data */
+	spin_lock_init(&data->uart.port.lock);
+	data->uart.capabilities = UART_CAP_FIFO;
+	data->uart.port.dev = &pdev->dev;
+	data->uart.port.regshift = 2;
+	data->uart.port.type = PORT_16550;
+	data->uart.port.iotype = UPIO_MEM;
+	data->uart.port.fifosize = 8;
+	data->uart.port.flags = UPF_SHARE_IRQ |
+				UPF_FIXED_PORT |
+				UPF_FIXED_TYPE |
+				UPF_SKIP_TEST;
+
+	/* get the clock - it is only enabled further down, as a last step */
+	data->clk = devm_clk_get(&pdev->dev, NULL);
+	ret = PTR_ERR_OR_ZERO(data->clk);
+	if (ret) {
+		dev_err(&pdev->dev, "could not get clk: %d\n", ret);
+		return ret;
+	}
+
+	/* get the interrupt */
+	data->uart.port.irq = platform_get_irq(pdev, 0);
+	if (data->uart.port.irq < 0) {
+		dev_err(&pdev->dev, "irq not found - %i\n",
+			data->uart.port.irq);
+		return data->uart.port.irq;
+	}
+
+	/* map the main registers */
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(&pdev->dev, "memory resource not found\n");
+		return -EINVAL;
+	}
+	data->uart.port.membase = devm_ioremap_resource(&pdev->dev, res);
+	ret = PTR_ERR_OR_ZERO(data->uart.port.membase);
+	if (ret)
+		return ret;
+
+	/* Check for a fixed line number */
+	ret = of_alias_get_id(pdev->dev.of_node, "serial");
+	if (ret >= 0)
+		data->uart.port.line = ret;
+
+	/* enable the clock as a last step */
+	ret = clk_prepare_enable(data->clk);
+	if (ret) {
+		dev_err(&pdev->dev, "unable to enable uart clock - %d\n", ret);
+		return ret;
+	}
+
+	/* the HW-clock divider for bcm2835aux is 8,
+	 * but 8250 expects a divider of 16,
+	 * so we have to multiply the actual clock by 2
+	 * to get identical baudrates.
+	 */
+	data->uart.port.uartclk = clk_get_rate(data->clk) * 2;
+
+	/* register the port */
+	ret = serial8250_register_8250_port(&data->uart);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "unable to register 8250 port - %d\n",
+			ret);
+		goto dis_clk;
+	}
+	data->line = ret;
+
+	platform_set_drvdata(pdev, data);
+
+	return 0;
+
+dis_clk:
+	clk_disable_unprepare(data->clk);
+	return ret;
+}
+
+static int bcm2835aux_serial_remove(struct platform_device *pdev)
+{
+	struct bcm2835aux_data *data = platform_get_drvdata(pdev);
+
+	/* unregister the line that was returned when the port was registered */
+	serial8250_unregister_port(data->line);
+	clk_disable_unprepare(data->clk);
+	return 0;
+}
+
+static const struct of_device_id bcm2835aux_serial_match[] = {
+	{ .compatible = "brcm,bcm2835-aux-uart" },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, bcm2835aux_serial_match);
+
+static struct platform_driver bcm2835aux_serial_driver = {
+	.driver = {
+		.name = "bcm2835-aux-uart",
+		.of_match_table = bcm2835aux_serial_match,
+	},
+	.probe  = bcm2835aux_serial_probe,
+	.remove = bcm2835aux_serial_remove,
+};
+module_platform_driver(bcm2835aux_serial_driver);
+
+MODULE_DESCRIPTION("BCM2835 auxiliar UART driver");
+MODULE_AUTHOR("Martin Sperl <kernel@martin.sperl.org>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig
index b03cb517..67ad6b0 100644
--- a/drivers/tty/serial/8250/Kconfig
+++ b/drivers/tty/serial/8250/Kconfig
@@ -272,6 +272,30 @@ config SERIAL_8250_ACORN
 	  system, say Y to this option.  The driver can handle 1, 2, or 3 port
 	  cards.  If unsure, say N.
 
+config SERIAL_8250_BCM2835AUX
+	tristate "BCM2835 auxiliar mini UART support"
+	depends on ARCH_BCM2835 || COMPILE_TEST
+	depends on SERIAL_8250 && SERIAL_8250_SHARE_IRQ
+	help
+	  Support for the BCM2835 auxiliar mini UART.
+
+	  Features and limitations of the UART are
+	    Registers are similar to 16550 registers,
+	      set bits in the control registers that are unsupported
+	      are ignored and read back as 0
+	    7/8 bit operation with 1 start and 1 stop bit
+	    8 symbols deep fifo for rx and tx
+	    SW controlled RTS and SW readable CTS
+	    Clock rate derived from system clock
+	    Uses 8 times oversampling (compared to 16 times for 16550)
+	    Missing break detection (but break generation)
+	    Missing framing error detection
+	    Missing parity bit
+	    Missing receive time-out interrupt
+	    Missing DCD, DSR, DTR and RI signals
+
+	  If unsure, say N.
+
 config SERIAL_8250_FSL
 	bool
 	depends on SERIAL_8250_CONSOLE
diff --git a/drivers/tty/serial/8250/Makefile b/drivers/tty/serial/8250/Makefile
index b9b9bca..5c1869f 100644
--- a/drivers/tty/serial/8250/Makefile
+++ b/drivers/tty/serial/8250/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_SERIAL_8250_PCI)		+= 8250_pci.o
 obj-$(CONFIG_SERIAL_8250_HP300)		+= 8250_hp300.o
 obj-$(CONFIG_SERIAL_8250_CS)		+= serial_cs.o
 obj-$(CONFIG_SERIAL_8250_ACORN)		+= 8250_acorn.o
+obj-$(CONFIG_SERIAL_8250_BCM2835AUX)	+= 8250_bcm2835aux.o
 obj-$(CONFIG_SERIAL_8250_CONSOLE)	+= 8250_early.o
 obj-$(CONFIG_SERIAL_8250_FOURPORT)	+= 8250_fourport.o
 obj-$(CONFIG_SERIAL_8250_ACCENT)	+= 8250_accent.o
-- 
2.7.3

From 528285e99c25249456023d28f521689bf9e9eb8b Mon Sep 17 00:00:00 2001
From: Peter Robinson <pbrobinson@gmail.com>
Date: Wed, 30 Mar 2016 09:35:13 +0100
Subject: [PATCH 32/36] drop usb power domain support for the moment, kills usb

---
 arch/arm/boot/dts/bcm2835-rpi.dtsi | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/arm/boot/dts/bcm2835-rpi.dtsi b/arch/arm/boot/dts/bcm2835-rpi.dtsi
index caf2707..b1e8145 100644
--- a/arch/arm/boot/dts/bcm2835-rpi.dtsi
+++ b/arch/arm/boot/dts/bcm2835-rpi.dtsi
@@ -71,10 +71,6 @@
 	status = "okay";
 };
 
-&usb {
-	power-domains = <&power RPI_POWER_DOMAIN_USB>;
-};
-
 &v3d {
 	power-domains = <&power RPI_POWER_DOMAIN_V3D>;
 };
-- 
2.7.3

From 6af83c5ff7f5514f32b1b3fa6d8d7dfe77e3acce Mon Sep 17 00:00:00 2001
From: Stefan Wahren <stefan.wahren@i2se.com>
Date: Sun, 17 Jan 2016 14:59:00 +0000
Subject: [PATCH 33/36] mmc: sdhci-iproc: Clean up platform allocations if
 sdhci init fails

This patch adopts the changes from 475c9e43bfa7 ("mmc: sdhci-bcm2835:
Clean up platform allocations if sdhci init fails") to sdhci-iproc.

Signed-off-by: Stefan Wahren <stefan.wahren@i2se.com>
Acked-by: Scott Branden <sbranden@broadcom.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/sdhci-iproc.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/mmc/host/sdhci-iproc.c b/drivers/mmc/host/sdhci-iproc.c
index 3b423b0..e22060a 100644
--- a/drivers/mmc/host/sdhci-iproc.c
+++ b/drivers/mmc/host/sdhci-iproc.c
@@ -213,7 +213,11 @@ static int sdhci_iproc_probe(struct platform_device *pdev)
 		host->caps1 = iproc_host->data->caps1;
 	}
 
-	return sdhci_add_host(host);
+	ret = sdhci_add_host(host);
+	if (ret)
+		goto err;
+
+	return 0;
 
 err:
 	sdhci_pltfm_free(pdev);
-- 
2.7.3

From 1565145761d5d94991e4763001c9e60c655818f1 Mon Sep 17 00:00:00 2001
From: Stefan Wahren <stefan.wahren@i2se.com>
Date: Sun, 17 Jan 2016 14:59:01 +0000
Subject: [PATCH 34/36] mmc: sdhci-iproc: Actually enable the clock

The RPi firmware-based clocks driver can actually disable
unused clocks, so when switching to use it we ended up losing
our MMC clock once all devices were probed.

This patch adopts the changes from 1e5a0a9a58e2 ("mmc: sdhci-bcm2835:
Actually enable the clock") to sdhci-iproc.

Signed-off-by: Stefan Wahren <stefan.wahren@i2se.com>
Acked-by: Scott Branden <sbranden@broadcom.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/sdhci-iproc.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/mmc/host/sdhci-iproc.c b/drivers/mmc/host/sdhci-iproc.c
index e22060a..55bc348 100644
--- a/drivers/mmc/host/sdhci-iproc.c
+++ b/drivers/mmc/host/sdhci-iproc.c
@@ -207,6 +207,11 @@ static int sdhci_iproc_probe(struct platform_device *pdev)
 		ret = PTR_ERR(pltfm_host->clk);
 		goto err;
 	}
+	ret = clk_prepare_enable(pltfm_host->clk);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to enable host clk\n");
+		goto err;
+	}
 
 	if (iproc_host->data->pdata->quirks & SDHCI_QUIRK_MISSING_CAPS) {
 		host->caps = iproc_host->data->caps;
@@ -215,10 +220,12 @@ static int sdhci_iproc_probe(struct platform_device *pdev)
 
 	ret = sdhci_add_host(host);
 	if (ret)
-		goto err;
+		goto err_clk;
 
 	return 0;
 
+err_clk:
+	clk_disable_unprepare(pltfm_host->clk);
 err:
 	sdhci_pltfm_free(pdev);
 	return ret;
-- 
2.7.3

From 49ebf153a97a0840c1e54f934411aceb93bbdee4 Mon Sep 17 00:00:00 2001
From: Stefan Wahren <stefan.wahren@i2se.com>
Date: Wed, 27 Jan 2016 22:25:40 +0000
Subject: [PATCH 35/36] mmc: sdhci-iproc: define MMC caps in platform data

This patch moves the definition of the MMC capabilities
from the probe function into iproc platform data. After
that we are able to add support for another platform more
easily.

Signed-off-by: Stefan Wahren <stefan.wahren@i2se.com>
Suggested-by: Stephen Warren <swarren@wwwdotorg.org>
Acked-by: Scott Branden <sbranden@broadcom.com>
Acked-by: Stephen Warren <swarren@wwwdotorg.org>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/sdhci-iproc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/mmc/host/sdhci-iproc.c b/drivers/mmc/host/sdhci-iproc.c
index 55bc348..cdc6c4a 100644
--- a/drivers/mmc/host/sdhci-iproc.c
+++ b/drivers/mmc/host/sdhci-iproc.c
@@ -26,6 +26,7 @@ struct sdhci_iproc_data {
 	const struct sdhci_pltfm_data *pdata;
 	u32 caps;
 	u32 caps1;
+	u32 mmc_caps;
 };
 
 struct sdhci_iproc_host {
@@ -165,6 +166,7 @@ static const struct sdhci_iproc_data iproc_data = {
 	.pdata = &sdhci_iproc_pltfm_data,
 	.caps = 0x05E90000,
 	.caps1 = 0x00000064,
+	.mmc_caps = MMC_CAP_1_8V_DDR,
 };
 
 static const struct of_device_id sdhci_iproc_of_match[] = {
@@ -199,8 +201,7 @@ static int sdhci_iproc_probe(struct platform_device *pdev)
 	mmc_of_parse(host->mmc);
 	sdhci_get_of_property(pdev);
 
-	/* Enable EMMC 1/8V DDR capable */
-	host->mmc->caps |= MMC_CAP_1_8V_DDR;
+	host->mmc->caps |= iproc_host->data->mmc_caps;
 
 	pltfm_host->clk = devm_clk_get(&pdev->dev, NULL);
 	if (IS_ERR(pltfm_host->clk)) {
-- 
2.7.3

From 208897fb02fa78d06f960916bc3781c8a060ab72 Mon Sep 17 00:00:00 2001
From: Stefan Wahren <stefan.wahren@i2se.com>
Date: Wed, 27 Jan 2016 22:25:41 +0000
Subject: [PATCH 36/36] mmc: sdhci-iproc: add bcm2835 support

Scott Branden from Broadcom said that the BCM2835 eMMC IP core is
very similar to IPROC and shares most of the quirks. So use this driver
instead of a separate one.

The sdhci-iproc contains a better workaround for the clock domain
crossing problem which doesn't need any delays. This results in a
better write performance.

Btw we get rid of the SDHCI_CAPABILITIES hack in the sdhci_readl
function.

Suggested-by: Scott Branden <sbranden@broadcom.com>
Signed-off-by: Stefan Wahren <stefan.wahren@i2se.com>
Acked-by: Eric Anholt <eric@anholt.net>
Acked-by: Scott Branden <sbranden@broadcom.com>
Acked-by: Stephen Warren <swarren@wwwdotorg.org>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/Kconfig       |  6 +++---
 drivers/mmc/host/sdhci-iproc.c | 15 +++++++++++++++
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig
index 1526b8a..60de1e4 100644
--- a/drivers/mmc/host/Kconfig
+++ b/drivers/mmc/host/Kconfig
@@ -318,15 +318,15 @@ config MMC_SDHCI_F_SDH30
 	  If unsure, say N.
 
 config MMC_SDHCI_IPROC
-	tristate "SDHCI platform support for the iProc SD/MMC Controller"
-	depends on ARCH_BCM_IPROC || COMPILE_TEST
+	tristate "SDHCI support for the BCM2835 & iProc SD/MMC Controller"
+	depends on ARCH_BCM2835 || ARCH_BCM_IPROC || COMPILE_TEST
 	depends on MMC_SDHCI_PLTFM
 	default ARCH_BCM_IPROC
 	select MMC_SDHCI_IO_ACCESSORS
 	help
 	  This selects the iProc SD/MMC controller.
 
-	  If you have an IPROC platform with SD or MMC devices,
+	  If you have a BCM2835 or IPROC platform with SD or MMC devices,
 	  say Y or M here.
 
 	  If unsure, say N.
diff --git a/drivers/mmc/host/sdhci-iproc.c b/drivers/mmc/host/sdhci-iproc.c
index cdc6c4a..871c92c 100644
--- a/drivers/mmc/host/sdhci-iproc.c
+++ b/drivers/mmc/host/sdhci-iproc.c
@@ -169,7 +169,22 @@ static const struct sdhci_iproc_data iproc_data = {
 	.mmc_caps = MMC_CAP_1_8V_DDR,
 };
 
+static const struct sdhci_pltfm_data sdhci_bcm2835_pltfm_data = {
+	.quirks = SDHCI_QUIRK_BROKEN_CARD_DETECTION |
+		  SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK |
+		  SDHCI_QUIRK_MISSING_CAPS,
+	.ops = &sdhci_iproc_ops,
+};
+
+static const struct sdhci_iproc_data bcm2835_data = {
+	.pdata = &sdhci_bcm2835_pltfm_data,
+	.caps = SDHCI_CAN_VDD_330,	/* probe copies this to host->caps */
+	.caps1 = 0x00000000,		/* probe copies this to host->caps1 */
+	.mmc_caps = 0x00000000,		/* nothing extra OR'ed into mmc->caps */
+};
+
 static const struct of_device_id sdhci_iproc_of_match[] = {
+	{ .compatible = "brcm,bcm2835-sdhci", .data = &bcm2835_data },
 	{ .compatible = "brcm,sdhci-iproc-cygnus", .data = &iproc_data },
 	{ }
 };
-- 
2.7.3