diff --git a/0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch b/0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch new file mode 100644 index 0000000..1511e4a --- /dev/null +++ b/0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch @@ -0,0 +1,141 @@ +From 7a7662fe09eb2ccd2eb93ce7261aa47c86111b4d Mon Sep 17 00:00:00 2001 +From: Karol Herbst +Date: Tue, 24 Mar 2020 21:29:23 +0100 +Subject: [PATCH 1/2] drm/nouveau: workaround runpm fail by disabling PCI power + management on certain intel bridges + +Fixes the infamous 'runtime PM' bug many users are facing on Laptops with +Nvidia Pascal GPUs by skipping said PCI power state changes on the GPU. + +Depending on the used kernel there might be messages like those in demsg: + +"nouveau 0000:01:00.0: Refused to change power state, currently in D3" +"nouveau 0000:01:00.0: can't change power state from D3cold to D0 (config +space inaccessible)" +followed by backtraces of kernel crashes or timeouts within nouveau. + +It's still unkown why this issue exists, but this is a reliable workaround +and solves a very annoying issue for user having to choose between a +crashing kernel or higher power consumption of their Laptops. + +Signed-off-by: Karol Herbst +Cc: Bjorn Helgaas +Cc: Lyude Paul +Cc: Rafael J. Wysocki +Cc: Mika Westerberg +Cc: linux-pci@vger.kernel.org +Cc: linux-pm@vger.kernel.org +Cc: dri-devel@lists.freedesktop.org +Cc: nouveau@lists.freedesktop.org +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=205623 +Signed-off-by: Ben Skeggs +--- + drivers/gpu/drm/nouveau/nouveau_drm.c | 63 +++++++++++++++++++++++++++ + drivers/gpu/drm/nouveau/nouveau_drv.h | 2 + + 2 files changed, 65 insertions(+) + +diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c +index 6b1629c14dd7..ca4087f5a15b 100644 +--- a/drivers/gpu/drm/nouveau/nouveau_drm.c ++++ b/drivers/gpu/drm/nouveau/nouveau_drm.c +@@ -618,6 +618,64 @@ nouveau_drm_device_fini(struct drm_device *dev) + kfree(drm); + } + ++/* ++ * On some Intel PCIe bridge controllers doing a ++ * D0 -> D3hot -> D3cold -> D0 sequence causes Nvidia GPUs to not reappear. ++ * Skipping the intermediate D3hot step seems to make it work again. This is ++ * probably caused by not meeting the expectation the involved AML code has ++ * when the GPU is put into D3hot state before invoking it. ++ * ++ * This leads to various manifestations of this issue: ++ * - AML code execution to power on the GPU hits an infinite loop (as the ++ * code waits on device memory to change). ++ * - kernel crashes, as all PCI reads return -1, which most code isn't able ++ * to handle well enough. ++ * ++ * In all cases dmesg will contain at least one line like this: ++ * 'nouveau 0000:01:00.0: Refused to change power state, currently in D3' ++ * followed by a lot of nouveau timeouts. ++ * ++ * In the \_SB.PCI0.PEG0.PG00._OFF code deeper down writes bit 0x80 to the not ++ * documented PCI config space register 0x248 of the Intel PCIe bridge ++ * controller (0x1901) in order to change the state of the PCIe link between ++ * the PCIe port and the GPU. There are alternative code paths using other ++ * registers, which seem to work fine (executed pre Windows 8): ++ * - 0xbc bit 0x20 (publicly available documentation claims 'reserved') ++ * - 0xb0 bit 0x10 (link disable) ++ * Changing the conditions inside the firmware by poking into the relevant ++ * addresses does resolve the issue, but it seemed to be ACPI private memory ++ * and not any device accessible memory at all, so there is no portable way of ++ * changing the conditions. ++ * On a XPS 9560 that means bits [0,3] on \CPEX need to be cleared. ++ * ++ * The only systems where this behavior can be seen are hybrid graphics laptops ++ * with a secondary Nvidia Maxwell, Pascal or Turing GPU. It's unclear whether ++ * this issue only occurs in combination with listed Intel PCIe bridge ++ * controllers and the mentioned GPUs or other devices as well. ++ * ++ * documentation on the PCIe bridge controller can be found in the ++ * "7th Generation IntelĀ® Processor Families for H Platforms Datasheet Volume 2" ++ * Section "12 PCI Express* Controller (x16) Registers" ++ */ ++ ++static void quirk_broken_nv_runpm(struct pci_dev *pdev) ++{ ++ struct drm_device *dev = pci_get_drvdata(pdev); ++ struct nouveau_drm *drm = nouveau_drm(dev); ++ struct pci_dev *bridge = pci_upstream_bridge(pdev); ++ ++ if (!bridge || bridge->vendor != PCI_VENDOR_ID_INTEL) ++ return; ++ ++ switch (bridge->device) { ++ case 0x1901: ++ drm->old_pm_cap = pdev->pm_cap; ++ pdev->pm_cap = 0; ++ NV_INFO(drm, "Disabling PCI power management to avoid bug\n"); ++ break; ++ } ++} ++ + static int nouveau_drm_probe(struct pci_dev *pdev, + const struct pci_device_id *pent) + { +@@ -699,6 +757,7 @@ static int nouveau_drm_probe(struct pci_dev *pdev, + if (ret) + goto fail_drm_dev_init; + ++ quirk_broken_nv_runpm(pdev); + return 0; + + fail_drm_dev_init: +@@ -734,7 +793,11 @@ static void + nouveau_drm_remove(struct pci_dev *pdev) + { + struct drm_device *dev = pci_get_drvdata(pdev); ++ struct nouveau_drm *drm = nouveau_drm(dev); + ++ /* revert our workaround */ ++ if (drm->old_pm_cap) ++ pdev->pm_cap = drm->old_pm_cap; + nouveau_drm_device_remove(dev); + pci_disable_device(pdev); + } +diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h +index c2c332fbde97..2a6519737800 100644 +--- a/drivers/gpu/drm/nouveau/nouveau_drv.h ++++ b/drivers/gpu/drm/nouveau/nouveau_drv.h +@@ -140,6 +140,8 @@ struct nouveau_drm { + + struct list_head clients; + ++ u8 old_pm_cap; ++ + struct { + struct agp_bridge_data *bridge; + u32 base; +-- +2.25.1 + diff --git a/0002-drm-nouveau-gr-gp107-gp108-implement-workaround-for-.patch b/0002-drm-nouveau-gr-gp107-gp108-implement-workaround-for-.patch new file mode 100644 index 0000000..5548000 --- /dev/null +++ b/0002-drm-nouveau-gr-gp107-gp108-implement-workaround-for-.patch @@ -0,0 +1,68 @@ +From 37b556606d1217b4367e622d88cef11c65764386 Mon Sep 17 00:00:00 2001 +From: Ben Skeggs +Date: Tue, 31 Mar 2020 16:08:44 +1000 +Subject: [PATCH 2/2] drm/nouveau/gr/gp107,gp108: implement workaround for HW + hanging during init + +Certain boards with GP107/GP108 chipsets hang (often, but randomly) for +unknown reasons during GR initialisation. + +The first tell-tale symptom of this issue is: + +nouveau 0000:01:00.0: bus: MMIO read of 00000000 FAULT at 409800 [ TIMEOUT ] + +appearing in dmesg, likely followed by many other failures being logged. + +Karol found this WAR for the issue a while back, but efforts to isolate +the root cause and proper fix have not yielded success so far. I've +modified the original patch to include a few more details, limit it to +GP107/GP108 by default, and added a config option to override this choice. + +Signed-off-by: Ben Skeggs +Reviewed-by: Karol Herbst +--- + .../gpu/drm/nouveau/nvkm/engine/gr/gf100.c | 26 +++++++++++++++++++ + 1 file changed, 26 insertions(+) + +diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c +index dd8f85b8b3a7..f2f5636efac4 100644 +--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c ++++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c +@@ -1981,8 +1981,34 @@ gf100_gr_init_(struct nvkm_gr *base) + { + struct gf100_gr *gr = gf100_gr(base); + struct nvkm_subdev *subdev = &base->engine.subdev; ++ struct nvkm_device *device = subdev->device; ++ bool reset = device->chipset == 0x137 || device->chipset == 0x138; + u32 ret; + ++ /* On certain GP107/GP108 boards, we trigger a weird issue where ++ * GR will stop responding to PRI accesses after we've asked the ++ * SEC2 RTOS to boot the GR falcons. This happens with far more ++ * frequency when cold-booting a board (ie. returning from D3). ++ * ++ * The root cause for this is not known and has proven difficult ++ * to isolate, with many avenues being dead-ends. ++ * ++ * A workaround was discovered by Karol, whereby putting GR into ++ * reset for an extended period right before initialisation ++ * prevents the problem from occuring. ++ * ++ * XXX: As RM does not require any such workaround, this is more ++ * of a hack than a true fix. ++ */ ++ reset = nvkm_boolopt(device->cfgopt, "NvGrResetWar", reset); ++ if (reset) { ++ nvkm_mask(device, 0x000200, 0x00001000, 0x00000000); ++ nvkm_rd32(device, 0x000200); ++ msleep(50); ++ nvkm_mask(device, 0x000200, 0x00001000, 0x00001000); ++ nvkm_rd32(device, 0x000200); ++ } ++ + nvkm_pmu_pgob(gr->base.engine.subdev.device->pmu, false); + + ret = nvkm_falcon_get(&gr->fecs.falcon, subdev); +-- +2.25.1 + diff --git a/kernel.spec b/kernel.spec index 5e9abbc..5632562 100644 --- a/kernel.spec +++ b/kernel.spec @@ -904,6 +904,12 @@ Patch511: e1000e-bump-up-timeout-to-wait-when-ME-un-configure-ULP-mode.patch # https://bugzilla.redhat.com/show_bug.cgi?id=1820196 Patch512: 0001-ALSA-hda-realtek-Add-quirk-for-Lenovo-Carbon-X1-8th-.patch +# nouveau runpm and secboot fixes +# Accepted nouveau upstream https://github.com/skeggsb/nouveau/commit/f5755e7069d4acbcce1a93692421f358241ead7b +Patch513: 0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch +# Accepted nouveau upstream https://github.com/skeggsb/nouveau/commit/41c6a13e8143af71928749ea9895d2ebc2fb4ffd +Patch514: 0002-drm-nouveau-gr-gp107-gp108-implement-workaround-for-.patch + # END OF PATCH DEFINITIONS %endif @@ -2999,6 +3005,9 @@ fi # # %changelog +* Tue Apr 07 2020 Karol Herbst +- Add patches to fix nouveau issues preventing booting the installer or system + * Fri Apr 3 2020 Peter Robinson - Raspberry Pi HDMI mode validation fix - Raspberry Pi 4 rev 1.2 mmc fix