From 2053f7db7af8c5ab2aefe9759df3505e6b840379 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pecio?= Date: Tue, 7 Jun 2016 12:34:45 +0200 Subject: [PATCH 001/813] USB: OHCI: Don't mark EDs as ED_OPER if scheduling fails commit c66f59ee5050447b3da92d36f5385a847990a894 upstream. Since ed_schedule begins with marking the ED as "operational", the ED may be left in such state even if scheduling actually fails. This allows future submission attempts to smuggle this ED to the hardware behind the scheduler's back and without linking it to the ohci->eds_in_use list. The former causes bandwidth saturation and data loss on isoc endpoints, the latter crashes the kernel when attempt is made to unlink such ED from this list. Fix ed_schedule to update ED state only on successful return. Signed-off-by: Michal Pecio Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/ohci-q.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/host/ohci-q.c b/drivers/usb/host/ohci-q.c index d029bbe9eb36..641fed609911 100644 --- a/drivers/usb/host/ohci-q.c +++ b/drivers/usb/host/ohci-q.c @@ -183,7 +183,6 @@ static int ed_schedule (struct ohci_hcd *ohci, struct ed *ed) { int branch; - ed->state = ED_OPER; ed->ed_prev = NULL; ed->ed_next = NULL; ed->hwNextED = 0; @@ -259,6 +258,8 @@ static int ed_schedule (struct ohci_hcd *ohci, struct ed *ed) /* the HC may not see the schedule updates yet, but if it does * then they'll be properly ordered. */ + + ed->state = ED_OPER; return 0; } From bab5a36c1917216f0c94b521d008bdca393cc409 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 12 Jun 2016 12:31:53 +0200 Subject: [PATCH 002/813] x86/quirks: Apply nvidia_bugs quirk only on root bus commit 447d29d1d3aed839e74c2401ef63387780ac51ed upstream. Since the following commit: 8659c406ade3 ("x86: only scan the root bus in early PCI quirks") ... early quirks are only applied to devices on the root bus. The motivation was to prevent application of the nvidia_bugs quirk on secondary buses. We're about to reintroduce scanning of secondary buses for a quirk to reset the Broadcom 4331 wireless card on 2011/2012 Macs. To prevent regressions, open code the requirement to apply nvidia_bugs only on the root bus. Signed-off-by: Lukas Wunner Cc: Andy Lutomirski Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/4d5477c1d76b2f0387a780f2142bbcdd9fee869b.1465690253.git.lukas@wunner.de Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/early-quirks.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index db9a675e751b..b714e6325e60 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -75,6 +75,13 @@ static void __init nvidia_bugs(int num, int slot, int func) { #ifdef CONFIG_ACPI #ifdef CONFIG_X86_IO_APIC + /* + * Only applies to Nvidia root ports (bus 0) and not to + * Nvidia graphics cards with PCI ports on secondary buses. + */ + if (num) + return; + /* * All timer overrides on Nvidia are * wrong unless HPET is enabled. From dd4eb74efbd22006c99ae7ff45ef9ef676bb5715 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 12 Jun 2016 12:31:53 +0200 Subject: [PATCH 003/813] x86/quirks: Reintroduce scanning of secondary buses commit 850c321027c2e31d0afc71588974719a4b565550 upstream. 
We used to scan secondary buses until the following commit that was applied in 2009: 8659c406ade3 ("x86: only scan the root bus in early PCI quirks") which commit constrained early quirks to the root bus only. Its motivation was to prevent application of the nvidia_bugs quirk on secondary buses. We're about to add a quirk to reset the Broadcom 4331 wireless card on 2011/2012 Macs, which is located on a secondary bus behind a PCIe root port. To facilitate that, reintroduce scanning of secondary buses. The commit message of 8659c406ade3 notes that scanning only the root bus "saves quite some unnecessary scanning work". The algorithm used prior to 8659c406ade3 was particularly time consuming because it scanned buses 0 to 31 brute force. To avoid lengthening boot time, employ a recursive strategy which only scans buses that are actually reachable from the root bus. Yinghai Lu pointed out that the secondary bus number read from a bridge's config space may be invalid, in particular a value of 0 would cause an infinite loop. The PCI core goes beyond that and recurses to a child bus only if its bus number is greater than the parent bus number (see pci_scan_bridge()). Since the root bus is numbered 0, this implies that secondary buses may not be 0. Do the same on early scanning. If this algorithm is found to significantly impact boot time or cause infinite loops on broken hardware, it would be possible to limit its recursion depth: The Broadcom 4331 quirk applies at depth 1, all others at depth 0, so the bus need not be scanned deeper than that for now. An alternative approach would be to revert to scanning only the root bus, and apply the Broadcom 4331 quirk to the root ports 8086:1c12, 8086:1e12 and 8086:1e16. Apple always positioned the card behind either of these three ports. The quirk would then check presence of the card in slot 0 below the root port and do its deed. Signed-off-by: Lukas Wunner Cc: Andy Lutomirski Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Cc: linux-pci@vger.kernel.org Link: http://lkml.kernel.org/r/f0daa70dac1a9b2483abdb31887173eb6ab77bdf.1465690253.git.lukas@wunner.de Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/early-quirks.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index b714e6325e60..ca82b2e1eabe 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -609,12 +609,6 @@ struct chipset { void (*f)(int num, int slot, int func); }; -/* - * Only works for devices on the root bus. If you add any devices - * not on bus 0 readd another loop level in early_quirks(). But - * be careful because at least the Nvidia quirk here relies on - * only matching on bus 0. - */ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, @@ -647,6 +641,8 @@ static struct chipset early_qrk[] __initdata = { {} }; +static void __init early_pci_scan_bus(int bus); + /** * check_dev_quirk - apply early quirks to a given PCI device * @num: bus number @@ -655,7 +651,7 @@ static struct chipset early_qrk[] __initdata = { * * Check the vendor & device ID against the early quirks table. 
* - * If the device is single function, let early_quirks() know so we don't + * If the device is single function, let early_pci_scan_bus() know so we don't * poke at this device again. */ static int __init check_dev_quirk(int num, int slot, int func) @@ -664,6 +660,7 @@ static int __init check_dev_quirk(int num, int slot, int func) u16 vendor; u16 device; u8 type; + u8 sec; int i; class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); @@ -691,25 +688,36 @@ static int __init check_dev_quirk(int num, int slot, int func) type = read_pci_config_byte(num, slot, func, PCI_HEADER_TYPE); + + if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) { + sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS); + if (sec > num) + early_pci_scan_bus(sec); + } + if (!(type & 0x80)) return -1; return 0; } -void __init early_quirks(void) +static void __init early_pci_scan_bus(int bus) { int slot, func; - if (!early_pci_allowed()) - return; - /* Poor man's PCI discovery */ - /* Only scan the root bus */ for (slot = 0; slot < 32; slot++) for (func = 0; func < 8; func++) { /* Only probe function 0 on single fn devices */ - if (check_dev_quirk(0, slot, func)) + if (check_dev_quirk(bus, slot, func)) break; } } + +void __init early_quirks(void) +{ + if (!early_pci_allowed()) + return; + + early_pci_scan_bus(0); +} From ba1eebc72dc6cf8995562e534a337b965b66ef3b Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 12 Jun 2016 12:31:53 +0200 Subject: [PATCH 004/813] x86/quirks: Add early quirk to reset Apple AirPort card MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit abb2bafd295fe962bbadc329dbfb2146457283ac upstream. The EFI firmware on Macs contains a full-fledged network stack for downloading OS X images from osrecovery.apple.com. Unfortunately on Macs introduced 2011 and 2012, EFI brings up the Broadcom 4331 wireless card on every boot and leaves it enabled even after ExitBootServices has been called. The card continues to assert its IRQ line, causing spurious interrupts if the IRQ is shared. It also corrupts memory by DMAing received packets, allowing for remote code execution over the air. This only stops when a driver is loaded for the wireless card, which may be never if the driver is not installed or blacklisted. The issue seems to be constrained to the Broadcom 4331. Chris Milsted has verified that the newer Broadcom 4360 built into the MacBookPro11,3 (2013/2014) does not exhibit this behaviour. The chances that Apple will ever supply a firmware fix for the older machines appear to be zero. The solution is to reset the card on boot by writing to a reset bit in its mmio space. This must be done as an early quirk and not as a plain vanilla PCI quirk to successfully combat memory corruption by DMAed packets: Matthew Garrett found out in 2012 that the packets are written to EfiBootServicesData memory (http://mjg59.dreamwidth.org/11235.html). This type of memory is made available to the page allocator by efi_free_boot_services(). Plain vanilla PCI quirks run much later, in subsys initcall level. In-between a time window would be open for memory corruption. Random crashes occurring in this time window and attributed to DMAed packets have indeed been observed in the wild by Chris Bainbridge. 
When Matthew Garrett analyzed the memory corruption issue in 2012, he sought to fix it with a grub quirk which transitions the card to D3hot: http://git.savannah.gnu.org/cgit/grub.git/commit/?id=9d34bb85da56 This approach does not help users with other bootloaders and while it may prevent DMAed packets, it does not cure the spurious interrupts emanating from the card. Unfortunately the card's mmio space is inaccessible in D3hot, so to reset it, we have to undo the effect of Matthew's grub patch and transition the card back to D0. Note that the quirk takes a few shortcuts to reduce the amount of code: The size of BAR 0 and the location of the PM capability is identical on all affected machines and therefore hardcoded. Only the address of BAR 0 differs between models. Also, it is assumed that the BCMA core currently mapped is the 802.11 core. The EFI driver seems to always take care of this. Michael Büsch, Bjorn Helgaas and Matt Fleming contributed feedback towards finding the best solution to this problem. The following should be a comprehensive list of affected models: iMac13,1 2012 21.5" [Root Port 00:1c.3 = 8086:1e16] iMac13,2 2012 27" [Root Port 00:1c.3 = 8086:1e16] Macmini5,1 2011 i5 2.3 GHz [Root Port 00:1c.1 = 8086:1c12] Macmini5,2 2011 i5 2.5 GHz [Root Port 00:1c.1 = 8086:1c12] Macmini5,3 2011 i7 2.0 GHz [Root Port 00:1c.1 = 8086:1c12] Macmini6,1 2012 i5 2.5 GHz [Root Port 00:1c.1 = 8086:1e12] Macmini6,2 2012 i7 2.3 GHz [Root Port 00:1c.1 = 8086:1e12] MacBookPro8,1 2011 13" [Root Port 00:1c.1 = 8086:1c12] MacBookPro8,2 2011 15" [Root Port 00:1c.1 = 8086:1c12] MacBookPro8,3 2011 17" [Root Port 00:1c.1 = 8086:1c12] MacBookPro9,1 2012 15" [Root Port 00:1c.1 = 8086:1e12] MacBookPro9,2 2012 13" [Root Port 00:1c.1 = 8086:1e12] MacBookPro10,1 2012 15" [Root Port 00:1c.1 = 8086:1e12] MacBookPro10,2 2012 13" [Root Port 00:1c.1 = 8086:1e12] For posterity, spurious interrupts caused by the Broadcom 4331 wireless card resulted in splats like this (stacktrace omitted): irq 17: nobody cared (try booting with the "irqpoll" option) handlers: [] pcie_isr [] sdhci_irq [sdhci] threaded [] sdhci_thread_irq [sdhci] [] azx_interrupt [snd_hda_codec] Disabling IRQ #17 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=79301 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=111781 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=728916 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=895951#c16 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1009819 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1098621 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1149632#c5 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1279130 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1332732 Tested-by: Konstantin Simanov # [MacBookPro8,1] Tested-by: Lukas Wunner # [MacBookPro9,1] Tested-by: Bryan Paradis # [MacBookPro9,2] Tested-by: Andrew Worsley # [MacBookPro10,1] Tested-by: Chris Bainbridge # [MacBookPro10,2] Signed-off-by: Lukas Wunner Acked-by: Rafał Miłecki Acked-by: Matt Fleming Cc: Andy Lutomirski Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris Milsted Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Matthew Garrett Cc: Michael Buesch Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Cc: b43-dev@lists.infradead.org Cc: linux-pci@vger.kernel.org Cc: linux-wireless@vger.kernel.org Link: http://lkml.kernel.org/r/48d0972ac82a53d460e5fce77a07b2560db95203.1465690253.git.lukas@wunner.de [ Did minor readability edits. 
] Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/early-quirks.c | 64 ++++++++++++++++++++++++++++++++++ drivers/bcma/bcma_private.h | 2 -- include/linux/bcma/bcma.h | 1 + 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index ca82b2e1eabe..9fdf1d330727 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -11,7 +11,11 @@ #include #include +#include +#include #include +#include +#include #include #include #include @@ -21,6 +25,9 @@ #include #include #include +#include + +#define dev_err(msg) pr_err("pci 0000:%02x:%02x.%d: %s", bus, slot, func, msg) static void __init fix_hypertransport_config(int num, int slot, int func) { @@ -596,6 +603,61 @@ static void __init force_disable_hpet(int num, int slot, int func) #endif } +#define BCM4331_MMIO_SIZE 16384 +#define BCM4331_PM_CAP 0x40 +#define bcma_aread32(reg) ioread32(mmio + 1 * BCMA_CORE_SIZE + reg) +#define bcma_awrite32(reg, val) iowrite32(val, mmio + 1 * BCMA_CORE_SIZE + reg) + +static void __init apple_airport_reset(int bus, int slot, int func) +{ + void __iomem *mmio; + u16 pmcsr; + u64 addr; + int i; + + if (!dmi_match(DMI_SYS_VENDOR, "Apple Inc.")) + return; + + /* Card may have been put into PCI_D3hot by grub quirk */ + pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL); + + if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { + pmcsr &= ~PCI_PM_CTRL_STATE_MASK; + write_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL, pmcsr); + mdelay(10); + + pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL); + if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { + dev_err("Cannot power up Apple AirPort card\n"); + return; + } + } + + addr = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); + addr |= (u64)read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_1) << 32; + addr &= PCI_BASE_ADDRESS_MEM_MASK; + + mmio = early_ioremap(addr, BCM4331_MMIO_SIZE); + if (!mmio) { + dev_err("Cannot iomap Apple AirPort card\n"); + return; + } + + pr_info("Resetting Apple AirPort card (left enabled by EFI)\n"); + + for (i = 0; bcma_aread32(BCMA_RESET_ST) && i < 30; i++) + udelay(10); + + bcma_awrite32(BCMA_RESET_CTL, BCMA_RESET_CTL_RESET); + bcma_aread32(BCMA_RESET_CTL); + udelay(1); + + bcma_awrite32(BCMA_RESET_CTL, 0); + bcma_aread32(BCMA_RESET_CTL); + udelay(10); + + early_iounmap(mmio, BCM4331_MMIO_SIZE); +} #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 @@ -638,6 +700,8 @@ static struct chipset early_qrk[] __initdata = { */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, + { PCI_VENDOR_ID_BROADCOM, 0x4331, + PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, {} }; diff --git a/drivers/bcma/bcma_private.h b/drivers/bcma/bcma_private.h index 38f156745d53..71df8f2afc6c 100644 --- a/drivers/bcma/bcma_private.h +++ b/drivers/bcma/bcma_private.h @@ -8,8 +8,6 @@ #include #include -#define BCMA_CORE_SIZE 0x1000 - #define bcma_err(bus, fmt, ...) \ pr_err("bus%d: " fmt, (bus)->num, ##__VA_ARGS__) #define bcma_warn(bus, fmt, ...) 
\ diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index 3feb1b2d75d8..14cd6f77e284 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -156,6 +156,7 @@ struct bcma_host_ops { #define BCMA_CORE_DEFAULT 0xFFF #define BCMA_MAX_NR_CORES 16 +#define BCMA_CORE_SIZE 0x1000 /* Chip IDs of PCIe devices */ #define BCMA_CHIP_ID_BCM4313 0x4313 From 1b60fcdcf9bc09d3f86084b17d75c050cfed13a8 Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Thu, 12 May 2016 16:54:08 +0200 Subject: [PATCH 005/813] dmaengine: at_xdmac: align descriptors on 64 bits commit 4a9723e8df68cfce4048517ee32e37f78854b6fb upstream. Having descriptors aligned on 64 bits allows update CNDA and CUBC in an atomic way. Signed-off-by: Ludovic Desroches Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver") Reviewed-by: Nicolas Ferre Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/at_xdmac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index 02f9aa4ebe05..6251969d1cf9 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -242,7 +242,7 @@ struct at_xdmac_lld { u32 mbr_dus; /* Destination Microblock Stride Register */ }; - +/* 64-bit alignment needed to update CNDA and CUBC registers in an atomic way. */ struct at_xdmac_desc { struct at_xdmac_lld lld; enum dma_transfer_direction direction; @@ -253,7 +253,7 @@ struct at_xdmac_desc { unsigned int xfer_size; struct list_head descs_list; struct list_head xfer_node; -}; +} __aligned(sizeof(u64)); static inline void __iomem *at_xdmac_chan_reg_base(struct at_xdmac *atxdmac, unsigned int chan_nb) { From 5ce7333f41a81e82d54537e5057224a3cdc78342 Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Thu, 12 May 2016 16:54:09 +0200 Subject: [PATCH 006/813] dmaengine: at_xdmac: fix residue corruption commit 53398f488821c2b5b15291e3debec6ad33f75d3d upstream. An unexpected value of CUBC can lead to a corrupted residue. A more complex sequence is needed to detect an inaccurate value for NCA or CUBC. Signed-off-by: Ludovic Desroches Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver") Reviewed-by: Nicolas Ferre Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/at_xdmac.c | 54 +++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index 6251969d1cf9..6bbfbba66a55 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -1388,6 +1388,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, u32 cur_nda, check_nda, cur_ubc, mask, value; u8 dwidth = 0; unsigned long flags; + bool initd; ret = dma_cookie_status(chan, cookie, txstate); if (ret == DMA_COMPLETE) @@ -1423,34 +1424,43 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, } /* - * When processing the residue, we need to read two registers but we - * can't do it in an atomic way. AT_XDMAC_CNDA is used to find where - * we stand in the descriptor list and AT_XDMAC_CUBC is used - * to know how many data are remaining for the current descriptor. - * Since the dma channel is not paused to not loose data, between the - * AT_XDMAC_CNDA and AT_XDMAC_CUBC read, we may have change of - * descriptor. - * For that reason, after reading AT_XDMAC_CUBC, we check if we are - * still using the same descriptor by reading a second time - * AT_XDMAC_CNDA. 
If AT_XDMAC_CNDA has changed, it means we have to - * read again AT_XDMAC_CUBC. + * The easiest way to compute the residue should be to pause the DMA + * but doing this can lead to miss some data as some devices don't + * have FIFO. + * We need to read several registers because: + * - DMA is running therefore a descriptor change is possible while + * reading these registers + * - When the block transfer is done, the value of the CUBC register + * is set to its initial value until the fetch of the next descriptor. + * This value will corrupt the residue calculation so we have to skip + * it. + * + * INITD -------- ------------ + * |____________________| + * _______________________ _______________ + * NDA @desc2 \/ @desc3 + * _______________________/\_______________ + * __________ ___________ _______________ + * CUBC 0 \/ MAX desc1 \/ MAX desc2 + * __________/\___________/\_______________ + * + * Since descriptors are aligned on 64 bits, we can assume that + * the update of NDA and CUBC is atomic. * Memory barriers are used to ensure the read order of the registers. - * A max number of retries is set because unlikely it can never ends if - * we are transferring a lot of data with small buffers. + * A max number of retries is set because unlikely it could never ends. */ - cur_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc; - rmb(); - cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC); for (retry = 0; retry < AT_XDMAC_RESIDUE_MAX_RETRIES; retry++) { - rmb(); check_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc; - - if (likely(cur_nda == check_nda)) - break; - - cur_nda = check_nda; + rmb(); + initd = !!(at_xdmac_chan_read(atchan, AT_XDMAC_CC) & AT_XDMAC_CC_INITD); rmb(); cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC); + rmb(); + cur_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc; + rmb(); + + if ((check_nda == cur_nda) && initd) + break; } if (unlikely(retry >= AT_XDMAC_RESIDUE_MAX_RETRIES)) { From 6b373d53e8b99b7e1d4568914851a21bbd07d3bc Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Thu, 12 May 2016 16:54:10 +0200 Subject: [PATCH 007/813] dmaengine: at_xdmac: double FIFO flush needed to compute residue commit 9295c41d77ca93aac79cfca6fa09fa1ca5cab66f upstream. Due to the way CUBC register is updated, a double flush is needed to compute an accurate residue. First flush aim is to get data from the DMA FIFO and second one ensures that we won't report data which are not in memory. Signed-off-by: Ludovic Desroches Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver") Reviewed-by: Nicolas Ferre Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/at_xdmac.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index 6bbfbba66a55..e44a1bfb0250 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -1413,7 +1413,16 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, residue = desc->xfer_size; /* * Flush FIFO: only relevant when the transfer is source peripheral - * synchronized. + * synchronized. Flush is needed before reading CUBC because data in + * the FIFO are not reported by CUBC. Reporting a residue of the + * transfer length while we have data in FIFO can cause issue. + * Usecase: atmel USART has a timeout which means I have received + * characters but there is no more character received for a while. On + * timeout, it requests the residue. 
If the data are in the DMA FIFO, + * we will return a residue of the transfer length. It means no data + * received. If an application is waiting for these data, it will hang + * since we won't have another USART timeout without receiving new + * data. */ mask = AT_XDMAC_CC_TYPE | AT_XDMAC_CC_DSYNC; value = AT_XDMAC_CC_TYPE_PER_TRAN | AT_XDMAC_CC_DSYNC_PER2MEM; @@ -1468,6 +1477,19 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, goto spin_unlock; } + /* + * Flush FIFO: only relevant when the transfer is source peripheral + * synchronized. Another flush is needed here because CUBC is updated + * when the controller sends the data write command. It can lead to + * report data that are not written in the memory or the device. The + * FIFO flush ensures that data are really written. + */ + if ((desc->lld.mbr_cfg & mask) == value) { + at_xdmac_write(atxdmac, AT_XDMAC_GSWF, atchan->mask); + while (!(at_xdmac_chan_read(atchan, AT_XDMAC_CIS) & AT_XDMAC_CIS_FIS)) + cpu_relax(); + } + /* * Remove size of all microblocks already transferred and the current * one. Then add the remaining size to transfer of the current From 5b3114b2af2fe1cf6d465d594faefaea6c1f328b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 24 Jun 2016 14:49:37 -0700 Subject: [PATCH 008/813] mm, sl[au]b: add __GFP_ATOMIC to the GFP reclaim mask commit e838a45f9392a5bd2be1cd3ab0b16ae85857461c upstream. Commit d0164adc89f6 ("mm, page_alloc: distinguish between being unable to sleep, unwilling to sleep and avoiding waking kswapd") modified __GFP_WAIT to explicitly identify the difference between atomic callers and those that were unwilling to sleep. Later the definition was removed entirely. The GFP_RECLAIM_MASK is the set of flags that affect watermark checking and reclaim behaviour but __GFP_ATOMIC was never added. Without it, atomic users of the slab allocator strip the __GFP_ATOMIC flag and cannot access the page allocator atomic reserves. This patch addresses the problem. The user-visible impact depends on the workload but potentially atomic allocations unnecessarily fail without this path. Link: http://lkml.kernel.org/r/20160610093832.GK2527@techsingularity.net Signed-off-by: Mel Gorman Reported-by: Marcin Wojtas Acked-by: Vlastimil Babka Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/internal.h b/mm/internal.h index 38e24b89e4c4..6979b2bd3227 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -22,7 +22,8 @@ */ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ - __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC) + __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ + __GFP_ATOMIC) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) From 21e9f8977968f2adfbea1f91786d26b8080c80d5 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 24 Jun 2016 14:50:10 -0700 Subject: [PATCH 009/813] mm, compaction: abort free scanner if split fails commit a4f04f2c6955aff5e2c08dcb40aca247ff4d7370 upstream. If the memory compaction free scanner cannot successfully split a free page (only possible due to per-zone low watermark), terminate the free scanner rather than continuing to scan memory needlessly. If the watermark is insufficient for a free page of order <= cc->order, then terminate the scanner since all future splits will also likely fail. 
This prevents the compaction freeing scanner from scanning all memory on very large zones (very noticeable for zones > 128GB, for instance) when all splits will likely fail while holding zone->lock. compaction_alloc() iterating a 128GB zone has been benchmarked to take over 400ms on some systems whereas any free page isolated and ready to be split ends up failing in split_free_page() because of the low watermark check and thus the iteration continues. The next time compaction occurs, the freeing scanner will likely start at the end of the zone again since no success was made previously and we get the same lengthy iteration until the zone is brought above the low watermark. All thp page faults can take >400ms in such a state without this fix. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1606211820350.97086@chino.kir.corp.google.com Signed-off-by: David Rientjes Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Joonsoo Kim Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/compaction.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 7881e072dc33..3aed7ade3482 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -475,25 +475,23 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); + if (!isolated) + break; + total_isolated += isolated; + cc->nr_freepages += isolated; for (i = 0; i < isolated; i++) { list_add(&page->lru, freelist); page++; } - - /* If a page was split, advance to the end of it */ - if (isolated) { - cc->nr_freepages += isolated; - if (!strict && - cc->nr_migratepages <= cc->nr_freepages) { - blockpfn += isolated; - break; - } - - blockpfn += isolated - 1; - cursor += isolated - 1; - continue; + if (!strict && cc->nr_migratepages <= cc->nr_freepages) { + blockpfn += isolated; + break; } + /* Advance to the end of split page */ + blockpfn += isolated - 1; + cursor += isolated - 1; + continue; isolate_fail: if (strict) @@ -503,6 +501,9 @@ isolate_fail: } + if (locked) + spin_unlock_irqrestore(&cc->zone->lock, flags); + /* * There is a tiny chance that we have read bogus compound_order(), * so be careful to not go outside of the pageblock. @@ -524,9 +525,6 @@ isolate_fail: if (strict && blockpfn < end_pfn) total_isolated = 0; - if (locked) - spin_unlock_irqrestore(&cc->zone->lock, flags); - /* Update the pageblock-skip if the whole pageblock was scanned */ if (blockpfn == end_pfn) update_pageblock_skip(cc, valid_page, total_isolated, false); @@ -966,6 +964,7 @@ static void isolate_freepages(struct compact_control *cc) block_end_pfn = block_start_pfn, block_start_pfn -= pageblock_nr_pages, isolate_start_pfn = block_start_pfn) { + unsigned long isolated; /* * This can iterate a massively long zone without finding any @@ -990,8 +989,12 @@ static void isolate_freepages(struct compact_control *cc) continue; /* Found a block suitable for isolating free pages from. 
*/ - isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, freelist, false); + isolated = isolate_freepages_block(cc, &isolate_start_pfn, + block_end_pfn, freelist, false); + /* If isolation failed early, do not continue needlessly */ + if (!isolated && isolate_start_pfn < block_end_pfn && + cc->nr_migratepages > cc->nr_freepages) + break; /* * If we isolated enough freepages, or aborted due to async From d32978b8f50e2a1f14dd9a0dd64f563638dafb5a Mon Sep 17 00:00:00 2001 From: Torsten Hilbrich Date: Fri, 24 Jun 2016 14:50:18 -0700 Subject: [PATCH 010/813] fs/nilfs2: fix potential underflow in call to crc32_le commit 63d2f95d63396059200c391ca87161897b99e74a upstream. The value `bytes' comes from the filesystem which is about to be mounted. We cannot trust that the value is always in the range we expect it to be. Check its value before using it to calculate the length for the crc32_le call. It value must be larger (or equal) sumoff + 4. This fixes a kernel bug when accidentially mounting an image file which had the nilfs2 magic value 0x3434 at the right offset 0x406 by chance. The bytes 0x01 0x00 were stored at 0x408 and were interpreted as a s_bytes value of 1. This caused an underflow when substracting sumoff + 4 (20) in the call to crc32_le. BUG: unable to handle kernel paging request at ffff88021e600000 IP: crc32_le+0x36/0x100 ... Call Trace: nilfs_valid_sb.part.5+0x52/0x60 [nilfs2] nilfs_load_super_block+0x142/0x300 [nilfs2] init_nilfs+0x60/0x390 [nilfs2] nilfs_mount+0x302/0x520 [nilfs2] mount_fs+0x38/0x160 vfs_kern_mount+0x67/0x110 do_mount+0x269/0xe00 SyS_mount+0x9f/0x100 entry_SYSCALL_64_fastpath+0x16/0x71 Link: http://lkml.kernel.org/r/1466778587-5184-2-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Torsten Hilbrich Tested-by: Torsten Hilbrich Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/nilfs2/the_nilfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 69bd801afb53..37e49cb2ac4c 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -443,7 +443,7 @@ static int nilfs_valid_sb(struct nilfs_super_block *sbp) if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC) return 0; bytes = le16_to_cpu(sbp->s_bytes); - if (bytes > BLOCK_SIZE) + if (bytes < sumoff + 4 || bytes > BLOCK_SIZE) return 0; crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp, sumoff); From 41a3b3cbb6846247f36e09f96f3680f94791f8b8 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 14 Jul 2016 12:06:50 -0700 Subject: [PATCH 011/813] mm, compaction: prevent VM_BUG_ON when terminating freeing scanner commit a46cbf3bc53b6a93fb84a5ffb288c354fa807954 upstream. It's possible to isolate some freepages in a pageblock and then fail split_free_page() due to the low watermark check. In this case, we hit VM_BUG_ON() because the freeing scanner terminated early without a contended lock or enough freepages. This should never have been a VM_BUG_ON() since it's not a fatal condition. It should have been a VM_WARN_ON() at best, or even handled gracefully. Regardless, we need to terminate anytime the full pageblock scan was not done. The logic belongs in isolate_freepages_block(), so handle its state gracefully by terminating the pageblock loop and making a note to restart at the same pageblock next time since it was not possible to complete the scan this time. 
[rientjes@google.com: don't rescan pages in a pageblock] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1607111244150.83138@chino.kir.corp.google.com Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1606291436300.145590@chino.kir.corp.google.com Signed-off-by: David Rientjes Reported-by: Minchan Kim Tested-by: Minchan Kim Cc: Joonsoo Kim Cc: Hugh Dickins Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/compaction.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 3aed7ade3482..dba02dec7195 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -964,8 +964,6 @@ static void isolate_freepages(struct compact_control *cc) block_end_pfn = block_start_pfn, block_start_pfn -= pageblock_nr_pages, isolate_start_pfn = block_start_pfn) { - unsigned long isolated; - /* * This can iterate a massively long zone without finding any * suitable migration targets, so periodically check if we need @@ -989,36 +987,30 @@ static void isolate_freepages(struct compact_control *cc) continue; /* Found a block suitable for isolating free pages from. */ - isolated = isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, freelist, false); - /* If isolation failed early, do not continue needlessly */ - if (!isolated && isolate_start_pfn < block_end_pfn && - cc->nr_migratepages > cc->nr_freepages) - break; + isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn, + freelist, false); /* - * If we isolated enough freepages, or aborted due to async - * compaction being contended, terminate the loop. - * Remember where the free scanner should restart next time, - * which is where isolate_freepages_block() left off. - * But if it scanned the whole pageblock, isolate_start_pfn - * now points at block_end_pfn, which is the start of the next - * pageblock. - * In that case we will however want to restart at the start - * of the previous pageblock. + * If we isolated enough freepages, or aborted due to lock + * contention, terminate. */ if ((cc->nr_freepages >= cc->nr_migratepages) || cc->contended) { - if (isolate_start_pfn >= block_end_pfn) + if (isolate_start_pfn >= block_end_pfn) { + /* + * Restart at previous pageblock if more + * freepages can be isolated next time. + */ isolate_start_pfn = block_start_pfn - pageblock_nr_pages; + } break; - } else { + } else if (isolate_start_pfn < block_end_pfn) { /* - * isolate_freepages_block() should not terminate - * prematurely unless contended, or isolated enough + * If isolation failed early, do not continue + * needlessly. */ - VM_BUG_ON(isolate_start_pfn < block_end_pfn); + break; } } From e534d9261acee101807f838e495d43a9d7d83cb6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 14 Jul 2016 12:07:20 -0700 Subject: [PATCH 012/813] mm, meminit: always return a valid node from early_pfn_to_nid commit e4568d3803852d00effd41dcdd489e726b998879 upstream. early_pfn_to_nid can return node 0 if a PFN is invalid on machines that has no node 0. 
A machine with only node 1 was observed to crash with the following message: BUG: unable to handle kernel paging request at 000000000002a3c8 PGD 0 Modules linked in: Hardware name: Supermicro H8DSP-8/H8DSP-8, BIOS 080011 06/30/2006 task: ffffffff81c0d500 ti: ffffffff81c00000 task.ti: ffffffff81c00000 RIP: reserve_bootmem_region+0x6a/0xef CR2: 000000000002a3c8 CR3: 0000000001c06000 CR4: 00000000000006b0 Call Trace: free_all_bootmem+0x4b/0x12a mem_init+0x70/0xa3 start_kernel+0x25b/0x49b The problem is that early_page_uninitialised uses the early_pfn_to_nid helper which returns node 0 for invalid PFNs. No caller of early_pfn_to_nid cares except early_page_uninitialised. This patch has early_pfn_to_nid always return a valid node. Link: http://lkml.kernel.org/r/1468008031-3848-3-git-send-email-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 99c1738684ec..ce9d0d47ddc0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1057,7 +1057,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn) spin_lock(&early_pfn_lock); nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); if (nid < 0) - nid = 0; + nid = first_online_node; spin_unlock(&early_pfn_lock); return nid; From becdfa32eeaf230253b20490179134d1bb898c34 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 14 Jul 2016 12:07:23 -0700 Subject: [PATCH 013/813] mm, meminit: ensure node is online before checking whether pages are uninitialised commit ef70b6f41cda6270165a6f27b2548ed31cfa3cb2 upstream. early_page_uninitialised looks up an arbitrary PFN. While a machine without node 0 will boot with "mm, page_alloc: Always return a valid node from early_pfn_to_nid", it works because it assumes that nodes are always in PFN order. This is not guaranteed so this patch adds robustness by always checking if the node being checked is online. Link: http://lkml.kernel.org/r/1468008031-3848-4-git-send-email-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ce9d0d47ddc0..2bcdfbf8c36d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -275,7 +275,9 @@ static inline void reset_deferred_meminit(pg_data_t *pgdat) /* Returns true if the struct page for the pfn is uninitialised */ static inline bool __meminit early_page_uninitialised(unsigned long pfn) { - if (pfn >= NODE_DATA(early_pfn_to_nid(pfn))->first_deferred_pfn) + int nid = early_pfn_to_nid(pfn); + + if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) return true; return false; From f2e7c1f79f13a6a073d98f4e69fe81aa230f2607 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 14 Jul 2016 12:07:29 -0700 Subject: [PATCH 014/813] vmlinux.lds: account for destructor sections commit e41f501d391265ff568f3e49d6128cc30856a36f upstream. If CONFIG_KASAN is enabled and gcc is configured with --disable-initfini-array and/or gold linker is used, gcc emits .ctors/.dtors and .text.startup/.text.exit sections instead of .init_array/.fini_array. .dtors section is not explicitly accounted in the linker script and messes vvar/percpu layout. 
We want: ffffffff822bfd80 D _edata ffffffff822c0000 D __vvar_beginning_hack ffffffff822c0000 A __vvar_page ffffffff822c0080 0000000000000098 D vsyscall_gtod_data ffffffff822c1000 A __init_begin ffffffff822c1000 D init_per_cpu__irq_stack_union ffffffff822c1000 A __per_cpu_load ffffffff822d3000 D init_per_cpu__gdt_page We got: ffffffff8279a600 D _edata ffffffff8279b000 A __vvar_page ffffffff8279c000 A __init_begin ffffffff8279c000 D init_per_cpu__irq_stack_union ffffffff8279c000 A __per_cpu_load ffffffff8279e000 D __vvar_beginning_hack ffffffff8279e080 0000000000000098 D vsyscall_gtod_data ffffffff827ae000 D init_per_cpu__gdt_page This happens because __vvar_page and .vvar get different addresses in arch/x86/kernel/vmlinux.lds.S: . = ALIGN(PAGE_SIZE); __vvar_page = .; .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { /* work around gold bug 13023 */ __vvar_beginning_hack = .; Discard .dtors/.fini_array/.text.exit, since we don't call dtors. Merge .text.startup into init text. Link: http://lkml.kernel.org/r/1467386363-120030-1-git-send-email-dvyukov@google.com Signed-off-by: Dmitry Vyukov Reviewed-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/asm-generic/vmlinux.lds.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c4bd0e2c173c..ef2e8c97e183 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -531,15 +531,19 @@ #define INIT_TEXT \ *(.init.text) \ + *(.text.startup) \ MEM_DISCARD(init.text) #define EXIT_DATA \ *(.exit.data) \ + *(.fini_array) \ + *(.dtors) \ MEM_DISCARD(exit.data) \ MEM_DISCARD(exit.rodata) #define EXIT_TEXT \ *(.exit.text) \ + *(.text.exit) \ MEM_DISCARD(exit.text) #define EXIT_CALL \ From 78edebc495bbd8e3c2cced6a937467140a4fd52b Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 20 Jul 2016 15:45:08 -0700 Subject: [PATCH 015/813] pps: do not crash when failed to register commit 368301f2fe4b07e5fb71dba3cc566bc59eb6705f upstream. With this command sequence: modprobe plip modprobe pps_parport rmmod pps_parport the partport_pps modules causes this crash: BUG: unable to handle kernel NULL pointer dereference at (null) IP: parport_detach+0x1d/0x60 [pps_parport] Oops: 0000 [#1] SMP ... Call Trace: parport_unregister_driver+0x65/0xc0 [parport] SyS_delete_module+0x187/0x210 The sequence that builds up to this is: 1) plip is loaded and takes the parport device for exclusive use: plip0: Parallel port at 0x378, using IRQ 7. 2) pps_parport then fails to grab the device: pps_parport: parallel port PPS client parport0: cannot grant exclusive access for device pps_parport pps_parport: couldn't register with parport0 3) rmmod of pps_parport is then killed because it tries to access pardev->name, but pardev (taken from port->cad) is NULL. So add a check for NULL in the test there too. 
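The crash reduces to dereferencing port->cad, which is still NULL because no client ever managed to grab the port. A minimal user-space sketch of the guard the fix adds (struct pardevice and the helper below are simplified stand-ins for the kernel types, not driver code):

  #include <stdio.h>
  #include <string.h>

  struct pardevice { const char *name; };

  /* Stands in for port->cad: stays NULL when pps_parport could not
   * claim the port (plip already holds exclusive access). */
  static struct pardevice *cad;

  static void detach_like(const char *modname)
  {
          struct pardevice *pardev = cad;

          /* The fix: test for NULL before touching pardev->name. */
          if (!pardev || strcmp(pardev->name, modname) != 0)
                  return; /* not our port */

          printf("detaching %s\n", pardev->name);
  }

  int main(void)
  {
          detach_like("pps_parport"); /* pardev is NULL: returns, no oops */
          return 0;
  }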
Link: http://lkml.kernel.org/r/20160714115245.12651-1-jslaby@suse.cz Signed-off-by: Jiri Slaby Acked-by: Rodolfo Giometti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- drivers/pps/clients/pps_parport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pps/clients/pps_parport.c b/drivers/pps/clients/pps_parport.c index 38a8bbe74810..83797d89c30f 100644 --- a/drivers/pps/clients/pps_parport.c +++ b/drivers/pps/clients/pps_parport.c @@ -195,7 +195,7 @@ static void parport_detach(struct parport *port) struct pps_client_pp *device; /* FIXME: oooh, this is ugly! */ - if (strcmp(pardev->name, KBUILD_MODNAME)) + if (!pardev || strcmp(pardev->name, KBUILD_MODNAME)) /* not our port */ return; From dc20f3244ae920430d9d9f19939a13a0279380ca Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 9 Jun 2016 15:20:05 +0300 Subject: [PATCH 016/813] kernel/sysrq, watchdog, sched/core: Reset watchdog on all CPUs while processing sysrq-w commit 57675cb976eff977aefb428e68e4e0236d48a9ff upstream. Lengthy output of sysrq-w may take a lot of time on slow serial console. Currently we reset NMI-watchdog on the current CPU to avoid spurious lockup messages. Sometimes this doesn't work since softlockup watchdog might trigger on another CPU which is waiting for an IPI to proceed. We reset softlockup watchdogs on all CPUs, but we do this only after listing all tasks, and this may be too late on a busy system. So, reset watchdogs CPUs earlier, in for_each_process_thread() loop. Signed-off-by: Andrey Ryabinin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1465474805-14641-1-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6c0cdb5a73f8..67d1e1597d9c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4951,14 +4951,16 @@ void show_state_filter(unsigned long state_filter) /* * reset the NMI-timeout, listing all files on a slow * console might take a lot of time: + * Also, reset softlockup watchdogs on all CPUs, because + * another CPU might be blocked waiting for us to process + * an IPI. */ touch_nmi_watchdog(); + touch_all_softlockup_watchdogs(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); } - touch_all_softlockup_watchdogs(); - #ifdef CONFIG_SCHED_DEBUG sysrq_sched_debug_show(); #endif From e0bc4e7e1c876f8e58ba381bf5194a8b8c8d448f Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Thu, 23 Jun 2016 11:00:39 +0300 Subject: [PATCH 017/813] arc: unwind: warn only once if DW2_UNWIND is disabled commit 9bd54517ee86cb164c734f72ea95aeba4804f10b upstream. If CONFIG_ARC_DW2_UNWIND is disabled every time arc_unwind_core() gets called following message gets printed in debug console: ----------------->8--------------- CONFIG_ARC_DW2_UNWIND needs to be enabled ----------------->8--------------- That message makes sense if user indeed wants to see a backtrace or get nice function call-graphs in perf but what if user disabled unwinder for the purpose? Why pollute his debug console? So instead we'll warn user about possibly missing feature once and let him decide if that was what he or she really wanted. 
Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta Signed-off-by: Greg Kroah-Hartman --- arch/arc/kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c index 001de4ce711e..11b50959f20e 100644 --- a/arch/arc/kernel/stacktrace.c +++ b/arch/arc/kernel/stacktrace.c @@ -142,7 +142,7 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs, * prelogue is setup (callee regs saved and then fp set and not other * way around */ - pr_warn("CONFIG_ARC_DW2_UNWIND needs to be enabled\n"); + pr_warn_once("CONFIG_ARC_DW2_UNWIND needs to be enabled\n"); return 0; #endif From 6bce4d0eb37b1c4268b728985e98dbdcd9592632 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Tue, 28 Jun 2016 09:42:25 +0530 Subject: [PATCH 018/813] ARC: unwind: ensure that .debug_frame is generated (vs. .eh_frame) commit f52e126cc7476196f44f3c313b7d9f0699a881fc upstream. With recent binutils update to support dwarf CFI pseudo-ops in gas, we now get .eh_frame vs. .debug_frame. Although the call frame info is exactly the same in both, the CIE differs, which the current kernel unwinder can't cope with. This broke both the kernel unwinder as well as loadable modules (latter because of a new unhandled relo R_ARC_32_PCREL from .rela.eh_frame in the module loader) The ideal solution would be to switch unwinder to .eh_frame. For now however we can make do by just ensureing .debug_frame is generated by removing -fasynchronous-unwind-tables .eh_frame generated with -gdwarf-2 -fasynchronous-unwind-tables .debug_frame generated with -gdwarf-2 Fixes STAR 9001058196 Signed-off-by: Vineet Gupta Signed-off-by: Greg Kroah-Hartman --- arch/arc/Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arc/Makefile b/arch/arc/Makefile index aeb19021099e..209d8451e23d 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -48,8 +48,6 @@ endif endif -cflags-$(CONFIG_ARC_DW2_UNWIND) += -fasynchronous-unwind-tables - # By default gcc 4.8 generates dwarf4 which kernel unwinder can't grok ifeq ($(atleast_gcc48),y) cflags-$(CONFIG_ARC_DW2_UNWIND) += -gdwarf-2 From 66af4230b41fe0fe4d88e011a849437f176e7732 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Tue, 21 Jun 2016 14:26:36 -0400 Subject: [PATCH 019/813] xen/pciback: Fix conf_space read/write overlap check. commit 02ef871ecac290919ea0c783d05da7eedeffc10e upstream. Current overlap check is evaluating to false a case where a filter field is fully contained (proper subset) of a r/w request. This change applies classical overlap check instead to include all the scenarios. More specifically, for (Hilscher GmbH CIFX 50E-DP(M/S)) device driver the logic is such that the entire confspace is read and written in 4 byte chunks. In this case as an example, CACHE_LINE_SIZE, LATENCY_TIMER and PCI_BIST are arriving together in one call to xen_pcibk_config_write() with offset == 0xc and size == 4. With the exsisting overlap check the LATENCY_TIMER field (offset == 0xd, length == 1) is fully contained in the write request and hence is excluded from write, which is incorrect. 
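The flaw is easy to demonstrate in isolation. The stand-alone sketch below (illustrative only; the helper names are invented) replays the example above: a 4-byte write at offset 0xc fully contains the 1-byte LATENCY_TIMER field at offset 0xd, which the old containment-style test misses while the classical interval-overlap test catches it:

  #include <stdio.h>

  /* Old check: flags a field only if the request starts or ends inside
   * it, so a field strictly inside the request slips through. */
  static int old_overlap(unsigned int req_start, unsigned int req_end,
                         unsigned int field_start, unsigned int field_end)
  {
          return (req_start >= field_start && req_start < field_end) ||
                 (req_end > field_start && req_end <= field_end);
  }

  /* Classical overlap test applied by the fix: the half-open intervals
   * [req_start, req_end) and [field_start, field_end) intersect iff
   * each one starts before the other ends. */
  static int new_overlap(unsigned int req_start, unsigned int req_end,
                         unsigned int field_start, unsigned int field_end)
  {
          return req_end > field_start && field_end > req_start;
  }

  int main(void)
  {
          unsigned int req_start = 0xc, req_end = 0xc + 4;     /* 4-byte write */
          unsigned int field_start = 0xd, field_end = 0xd + 1; /* LATENCY_TIMER */

          printf("old: %d new: %d\n",
                 old_overlap(req_start, req_end, field_start, field_end),
                 new_overlap(req_start, req_end, field_start, field_end));
          /* prints "old: 0 new: 1": only the new test sees the field */
          return 0;
  }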
Signed-off-by: Andrey Grodzovsky Reviewed-by: Boris Ostrovsky Reviewed-by: Jan Beulich Signed-off-by: David Vrabel Signed-off-by: Greg Kroah-Hartman --- drivers/xen/xen-pciback/conf_space.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/xen/xen-pciback/conf_space.c b/drivers/xen/xen-pciback/conf_space.c index 9c234209d8b5..47a4177b16d2 100644 --- a/drivers/xen/xen-pciback/conf_space.c +++ b/drivers/xen/xen-pciback/conf_space.c @@ -183,8 +183,7 @@ int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size, field_start = OFFSET(cfg_entry); field_end = OFFSET(cfg_entry) + field->size; - if ((req_start >= field_start && req_start < field_end) - || (req_end > field_start && req_end <= field_end)) { + if (req_end > field_start && field_end > req_start) { err = conf_space_read(dev, cfg_entry, field_start, &tmp_val); if (err) @@ -230,8 +229,7 @@ int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size, u32 value) field_start = OFFSET(cfg_entry); field_end = OFFSET(cfg_entry) + field->size; - if ((req_start >= field_start && req_start < field_end) - || (req_end > field_start && req_end <= field_end)) { + if (req_end > field_start && field_end > req_start) { tmp_val = 0; err = xen_pcibk_config_read(dev, field_start, From d1e6344e0b97f8c7d2e14a5bd892d0180274e0c5 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Jul 2016 01:23:57 -0600 Subject: [PATCH 020/813] xenbus: don't BUG() on user mode induced condition commit 0beef634b86a1350c31da5fcc2992f0d7c8a622b upstream. Inability to locate a user mode specified transaction ID should not lead to a kernel crash. For other than XS_TRANSACTION_START also don't issue anything to xenbus if the specified ID doesn't match that of any active transaction. Signed-off-by: Jan Beulich Signed-off-by: David Vrabel Signed-off-by: Greg Kroah-Hartman --- drivers/xen/xenbus/xenbus_dev_frontend.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c index 9433e46518c8..531e76474983 100644 --- a/drivers/xen/xenbus/xenbus_dev_frontend.c +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c @@ -316,11 +316,18 @@ static int xenbus_write_transaction(unsigned msg_type, rc = -ENOMEM; goto out; } + } else { + list_for_each_entry(trans, &u->transactions, list) + if (trans->handle.id == u->u.msg.tx_id) + break; + if (&trans->list == &u->transactions) + return -ESRCH; } reply = xenbus_dev_request_and_reply(&u->u.msg); if (IS_ERR(reply)) { - kfree(trans); + if (msg_type == XS_TRANSACTION_START) + kfree(trans); rc = PTR_ERR(reply); goto out; } @@ -333,12 +340,7 @@ static int xenbus_write_transaction(unsigned msg_type, list_add(&trans->list, &u->transactions); } } else if (u->u.msg.type == XS_TRANSACTION_END) { - list_for_each_entry(trans, &u->transactions, list) - if (trans->handle.id == u->u.msg.tx_id) - break; - BUG_ON(&trans->list == &u->transactions); list_del(&trans->list); - kfree(trans); } From ee8b7ff00d5b8fc931a63d31349e9a7189cc72d7 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Jul 2016 01:32:04 -0600 Subject: [PATCH 021/813] xenbus: don't bail early from xenbus_dev_request_and_reply() commit 7469be95a487319514adce2304ad2af3553d2fc9 upstream. xenbus_dev_request_and_reply() needs to track whether a transaction is open. For XS_TRANSACTION_START messages it calls transaction_start() and for XS_TRANSACTION_END messages it calls transaction_end(). 
If sending an XS_TRANSACTION_START message fails or responds with an error, the transaction is not open and transaction_end() must be called. If sending an XS_TRANSACTION_END message fails, the transaction is still open, but if an error response is returned the transaction is closed. Commit 027bd7e89906 ("xen/xenbus: Avoid synchronous wait on XenBus stalling shutdown/restart") introduced a regression where failed XS_TRANSACTION_START messages were leaving the transaction open. This can cause problems with suspend (and migration) as all transactions must be closed before suspending. It appears that the problematic change was added accidentally, so just remove it. Signed-off-by: Jan Beulich Cc: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel Signed-off-by: Greg Kroah-Hartman --- drivers/xen/xenbus/xenbus_xs.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index ba804f3d8278..ce65591b4168 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -250,9 +250,6 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) mutex_unlock(&xs_state.request_mutex); - if (IS_ERR(ret)) - return ret; - if ((msg->type == XS_TRANSACTION_END) || ((req_msg.type == XS_TRANSACTION_START) && (msg->type == XS_ERROR))) From 90bed827ea910f82ab17ee154f501b5ae71617e6 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:44:07 -0400 Subject: [PATCH 022/813] ALSA: timer: Fix leak in SNDRV_TIMER_IOCTL_PARAMS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit cec8f96e49d9be372fdb0c3836dcf31ec71e457e upstream. The stack object “tread” has a total size of 32 bytes. Its field “event” and “val” both contain 4 bytes padding. These 8 bytes padding bytes are sent to user without being initialized. Signed-off-by: Kangjie Lu Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index 7c6155f5865b..245c5f340ae5 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1746,6 +1746,7 @@ static int snd_timer_user_params(struct file *file, if (tu->timeri->flags & SNDRV_TIMER_IFLG_EARLY_EVENT) { if (tu->tread) { struct snd_timer_tread tread; + memset(&tread, 0, sizeof(tread)); tread.event = SNDRV_TIMER_EVENT_EARLY; tread.tstamp.tv_sec = 0; tread.tstamp.tv_nsec = 0; From 3e6af33c73fb7ec7be8dedd01047162ef64a26a5 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:44:20 -0400 Subject: [PATCH 023/813] ALSA: timer: Fix leak in events via snd_timer_user_ccallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 9a47e9cff994f37f7f0dbd9ae23740d0f64f9fe6 upstream. The stack object “r1” has a total size of 32 bytes. Its field “event” and “val” both contain 4 bytes padding. These 8 bytes padding bytes are sent to user without being initialized.
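All three timer leaks in this group share one mechanism, which plain C reproduces. In the sketch below (a stand-in layout assuming a 64-bit ABI, not the actual ALSA definition), the compiler pads each 4-byte field up to the 8-byte alignment of the timestamp, and only zeroing the whole object clears those hidden bytes before a byte-wise copy to user space:

  #include <stdio.h>
  #include <string.h>

  struct tread_like {
          int event;                      /* 4 bytes + 4 bytes of padding */
          struct { long sec, nsec; } tstamp;
          unsigned int val;               /* 4 bytes + 4 bytes of tail padding */
  };

  int main(void)
  {
          struct tread_like t;            /* stack garbage, padding included */

          /* The fix: without this memset, assigning the named fields
           * below still leaves 8 uninitialized padding bytes that a
           * byte-wise copy to user space would expose. */
          memset(&t, 0, sizeof(t));
          t.event = 1;
          t.tstamp.sec = 0;
          t.tstamp.nsec = 0;
          t.val = 0;

          printf("total=%zu payload=%zu padding=%zu\n", sizeof(t),
                 sizeof(int) + sizeof(t.tstamp) + sizeof(unsigned int),
                 sizeof(t) - sizeof(int) - sizeof(t.tstamp) - sizeof(unsigned int));
          /* on LP64 prints "total=32 payload=24 padding=8" */
          return 0;
  }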
Signed-off-by: Kangjie Lu Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index 245c5f340ae5..3a5e0dd9cebe 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1247,6 +1247,7 @@ static void snd_timer_user_ccallback(struct snd_timer_instance *timeri, tu->tstamp = *tstamp; if ((tu->filter & (1 << event)) == 0 || !tu->tread) return; + memset(&r1, 0, sizeof(r1)); r1.event = event; r1.tstamp = *tstamp; r1.val = resolution; From 8fd58e050f90ed5d5161413c75a8a8271934566c Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:44:32 -0400 Subject: [PATCH 024/813] ALSA: timer: Fix leak in events via snd_timer_user_tinterrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e4ec8cc8039a7063e24204299b462bd1383184a5 upstream. The stack object “r1” has a total size of 32 bytes. Its field “event” and “val” both contain 4 bytes padding. These 8 bytes padding bytes are sent to user without being initialized. Signed-off-by: Kangjie Lu Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index 3a5e0dd9cebe..637d034bb084 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1282,6 +1282,7 @@ static void snd_timer_user_tinterrupt(struct snd_timer_instance *timeri, } if ((tu->filter & (1 << SNDRV_TIMER_EVENT_RESOLUTION)) && tu->last_resolution != resolution) { + memset(&r1, 0, sizeof(r1)); r1.event = SNDRV_TIMER_EVENT_RESOLUTION; r1.tstamp = tstamp; r1.val = resolution; From 12a83f6702402803190450218c0a80a1d5fb2b09 Mon Sep 17 00:00:00 2001 From: Sinclair Yeh Date: Thu, 23 Jun 2016 17:37:34 -0700 Subject: [PATCH 025/813] Input: vmmouse - remove port reservation commit 60842ef8128e7bf58c024814cd0dc14319232b6c upstream. The VMWare EFI BIOS will expose port 0x5658 as an ACPI resource. This causes the port to be reserved by the APCI module as the system comes up, making it unavailable to be reserved again by other drivers, thus preserving this VMWare port for special use in a VMWare guest. This port is designed to be shared among multiple VMWare services, such as the VMMOUSE. Because of this, VMMOUSE should not try to reserve this port on its own. The VMWare non-EFI BIOS does not do this to preserve compatibility with existing/legacy VMs. It is known that there is small chance a VM may be configured such that these ports get reserved by other non-VMWare devices, and if this ever happens, the result is undefined. 
Signed-off-by: Sinclair Yeh Reviewed-by: Thomas Hellstrom Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/vmmouse.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/drivers/input/mouse/vmmouse.c b/drivers/input/mouse/vmmouse.c index a3f0f5a47490..0f586780ceb4 100644 --- a/drivers/input/mouse/vmmouse.c +++ b/drivers/input/mouse/vmmouse.c @@ -355,18 +355,11 @@ int vmmouse_detect(struct psmouse *psmouse, bool set_properties) return -ENXIO; } - if (!request_region(VMMOUSE_PROTO_PORT, 4, "vmmouse")) { - psmouse_dbg(psmouse, "VMMouse port in use.\n"); - return -EBUSY; - } - /* Check if the device is present */ response = ~VMMOUSE_PROTO_MAGIC; VMMOUSE_CMD(GETVERSION, 0, version, response, dummy1, dummy2); - if (response != VMMOUSE_PROTO_MAGIC || version == 0xffffffffU) { - release_region(VMMOUSE_PROTO_PORT, 4); + if (response != VMMOUSE_PROTO_MAGIC || version == 0xffffffffU) return -ENXIO; - } if (set_properties) { psmouse->vendor = VMMOUSE_VENDOR; @@ -374,8 +367,6 @@ int vmmouse_detect(struct psmouse *psmouse, bool set_properties) psmouse->model = version; } - release_region(VMMOUSE_PROTO_PORT, 4); - return 0; } @@ -394,7 +385,6 @@ static void vmmouse_disconnect(struct psmouse *psmouse) psmouse_reset(psmouse); input_unregister_device(priv->abs_dev); kfree(priv); - release_region(VMMOUSE_PROTO_PORT, 4); } /** @@ -438,15 +428,10 @@ int vmmouse_init(struct psmouse *psmouse) struct input_dev *rel_dev = psmouse->dev, *abs_dev; int error; - if (!request_region(VMMOUSE_PROTO_PORT, 4, "vmmouse")) { - psmouse_dbg(psmouse, "VMMouse port in use.\n"); - return -EBUSY; - } - psmouse_reset(psmouse); error = vmmouse_enable(psmouse); if (error) - goto release_region; + return error; priv = kzalloc(sizeof(*priv), GFP_KERNEL); abs_dev = input_allocate_device(); @@ -502,8 +487,5 @@ init_fail: kfree(priv); psmouse->private = NULL; -release_region: - release_region(VMMOUSE_PROTO_PORT, 4); - return error; } From 4bc476735615a766b6f3014984b3d06378f8f26a Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 21 Jun 2016 16:09:00 -0700 Subject: [PATCH 026/813] Input: elantech - add more IC body types to the list commit 226ba707744a51acb4244724e09caacb1d96aed9 upstream. The touchpad in HP Pavilion 14-ab057ca reports its version as 12 and according to Elan both 11 and 12 are valid IC types and should be identified as hw_version 4. Reported-by: Patrick Lessard Tested-by: Patrick Lessard Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/elantech.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c index 78f93cf68840..be5b399da5d3 100644 --- a/drivers/input/mouse/elantech.c +++ b/drivers/input/mouse/elantech.c @@ -1568,13 +1568,7 @@ static int elantech_set_properties(struct elantech_data *etd) case 5: etd->hw_version = 3; break; - case 6: - case 7: - case 8: - case 9: - case 10: - case 13: - case 14: + case 6 ... 14: etd->hw_version = 4; break; default: From 526410bc85d3f9e30515b2086eaea6440231ba48 Mon Sep 17 00:00:00 2001 From: Cameron Gutman Date: Thu, 23 Jun 2016 10:24:42 -0700 Subject: [PATCH 027/813] Input: xpad - fix oops when attaching an unknown Xbox One gamepad commit c7f1429389ec1aa25e042bb13451385fbb596f8c upstream. Xbox One controllers have multiple interfaces which all have the same class, subclass, and protocol. One of these interfaces has only a single endpoint.
When Xpad attempts to bind to this interface, it causes an oops when trying to initialize the output URB by accessing the second endpoint's descriptor. This situation was avoided for known Xbox One devices by checking the XTYPE constant associated with the VID and PID tuple. However, this breaks when new or previously unknown Xbox One controllers are attached to the system. This change addresses the problem by deriving the XTYPE for Xbox One controllers based on the interface protocol before checking the interface number. Fixes: 1a48ff81b391 ("Input: xpad - add support for Xbox One controllers") Signed-off-by: Cameron Gutman Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/joystick/xpad.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index fd4100d56d8c..35e444b4b8b0 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -1206,16 +1206,6 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id break; } - if (xpad_device[i].xtype == XTYPE_XBOXONE && - intf->cur_altsetting->desc.bInterfaceNumber != 0) { - /* - * The Xbox One controller lists three interfaces all with the - * same interface class, subclass and protocol. Differentiate by - * interface number. - */ - return -ENODEV; - } - xpad = kzalloc(sizeof(struct usb_xpad), GFP_KERNEL); if (!xpad) return -ENOMEM; @@ -1246,6 +1236,8 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id if (intf->cur_altsetting->desc.bInterfaceClass == USB_CLASS_VENDOR_SPEC) { if (intf->cur_altsetting->desc.bInterfaceProtocol == 129) xpad->xtype = XTYPE_XBOX360W; + else if (intf->cur_altsetting->desc.bInterfaceProtocol == 208) + xpad->xtype = XTYPE_XBOXONE; else xpad->xtype = XTYPE_XBOX360; } else { @@ -1260,6 +1252,17 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id xpad->mapping |= MAP_STICKS_TO_NULL; } + if (xpad->xtype == XTYPE_XBOXONE && + intf->cur_altsetting->desc.bInterfaceNumber != 0) { + /* + * The Xbox One controller lists three interfaces all with the + * same interface class, subclass and protocol. Differentiate by + * interface number. + */ + error = -ENODEV; + goto err_free_in_urb; + } + error = xpad_init_output(intf, xpad); if (error) goto err_free_in_urb; From 1dbdba62abb49c84981354d821b22e0b6562232a Mon Sep 17 00:00:00 2001 From: Ping Cheng Date: Thu, 23 Jun 2016 10:54:17 -0700 Subject: [PATCH 028/813] Input: wacom_w8001 - w8001_MAX_LENGTH should be 13 commit 12afb34400eb2b301f06b2aa3535497d14faee59 upstream. Somehow the patch that added two-finger touch support forgot to update W8001_MAX_LENGTH from 11 to 13.
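[ For illustration: a sketch of why the constant matters, with the buffer logic simplified and the packet sizes taken from the commit text. With the old maximum of 11, a 13-byte two-finger packet could never be assembled: ]

    #include <stdio.h>

    #define W8001_MAX_LENGTH 13     /* was 11 */

    static int feed_packet(const unsigned char *pkt, int len)
    {
        unsigned char buf[W8001_MAX_LENGTH];

        if (len > (int)sizeof(buf))
            return -1;              /* packet silently dropped */
        for (int i = 0; i < len; i++)
            buf[i] = pkt[i];
        printf("parsed %d-byte packet starting 0x%02x\n", len, buf[0]);
        return 0;
    }

    int main(void)
    {
        unsigned char single_touch[11] = { 0x80 }; /* fit the old max too */
        unsigned char two_finger[13] = { 0x80 };   /* need the new max */

        feed_packet(single_touch, sizeof(single_touch));
        feed_packet(two_finger, sizeof(two_finger));
        return 0;
    }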
Signed-off-by: Ping Cheng Reviewed-by: Peter Hutterer Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/touchscreen/wacom_w8001.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/wacom_w8001.c b/drivers/input/touchscreen/wacom_w8001.c index 2792ca397dd0..3ed0ce1e4dcb 100644 --- a/drivers/input/touchscreen/wacom_w8001.c +++ b/drivers/input/touchscreen/wacom_w8001.c @@ -27,7 +27,7 @@ MODULE_AUTHOR("Jaya Kumar "); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); -#define W8001_MAX_LENGTH 11 +#define W8001_MAX_LENGTH 13 #define W8001_LEAD_MASK 0x80 #define W8001_LEAD_BYTE 0x80 #define W8001_TAB_MASK 0x40 From c2e5023425c82caf1957e401a5183b9e62f43ebb Mon Sep 17 00:00:00 2001 From: Cameron Gutman Date: Wed, 29 Jun 2016 09:51:35 -0700 Subject: [PATCH 029/813] Input: xpad - validate USB endpoint count during probe commit caca925fca4fb30c67be88cacbe908eec6721e43 upstream. This prevents a malicious USB device from causing an oops. Signed-off-by: Cameron Gutman Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/joystick/xpad.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 35e444b4b8b0..2b2f9d66c2c7 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -1200,6 +1200,9 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id int ep_irq_in_idx; int i, error; + if (intf->cur_altsetting->desc.bNumEndpoints != 2) + return -ENODEV; + for (i = 0; xpad_device[i].idVendor; i++) { if ((le16_to_cpu(udev->descriptor.idVendor) == xpad_device[i].idVendor) && (le16_to_cpu(udev->descriptor.idProduct) == xpad_device[i].idProduct)) From aab045e9a95b139aa628498787ce132ade04ff47 Mon Sep 17 00:00:00 2001 From: Michael Welling Date: Wed, 20 Jul 2016 10:02:07 -0700 Subject: [PATCH 030/813] Input: tsc200x - report proper input_dev name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e9003c9cfaa17d26991688268b04244adb67ee2b upstream. Passes input_id struct to the common probe function for the tsc200x drivers instead of just the bustype. This allows for the use of the product variable to set the input_dev->name variable according to the type of touchscreen used. Note that when we introduced support for TSC2004 we started calling everything TSC200X, so let's keep this quirk. 
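[ For illustration: a user-space sketch of the naming scheme the patch introduces, with snprintf() standing in for the kernel's devm_kasprintf(). The legacy "TSC200X" string is kept for the 2004, while other parts format the name from the product id: ]

    #include <stdio.h>

    static void print_name(int product)
    {
        char name[32];

        if (product == 2004)    /* keep the historical quirk */
            snprintf(name, sizeof(name), "TSC200X touchscreen");
        else
            snprintf(name, sizeof(name), "TSC%04d touchscreen", product);
        printf("%d -> \"%s\"\n", product, name);
    }

    int main(void)
    {
        print_name(2004);       /* "TSC200X touchscreen" */
        print_name(2005);       /* "TSC2005 touchscreen" */
        return 0;
    }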
Signed-off-by: Michael Welling Acked-by: Pavel Machek Acked-by: Pali Rohár Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/touchscreen/tsc2004.c | 7 ++++++- drivers/input/touchscreen/tsc2005.c | 7 ++++++- drivers/input/touchscreen/tsc200x-core.c | 15 ++++++++++++--- drivers/input/touchscreen/tsc200x-core.h | 2 +- 4 files changed, 25 insertions(+), 6 deletions(-) diff --git a/drivers/input/touchscreen/tsc2004.c b/drivers/input/touchscreen/tsc2004.c index 7295c198aa08..6fe55d598fac 100644 --- a/drivers/input/touchscreen/tsc2004.c +++ b/drivers/input/touchscreen/tsc2004.c @@ -22,6 +22,11 @@ #include #include "tsc200x-core.h" +static const struct input_id tsc2004_input_id = { + .bustype = BUS_I2C, + .product = 2004, +}; + static int tsc2004_cmd(struct device *dev, u8 cmd) { u8 tx = TSC200X_CMD | TSC200X_CMD_12BIT | cmd; @@ -42,7 +47,7 @@ static int tsc2004_probe(struct i2c_client *i2c, const struct i2c_device_id *id) { - return tsc200x_probe(&i2c->dev, i2c->irq, BUS_I2C, + return tsc200x_probe(&i2c->dev, i2c->irq, &tsc2004_input_id, devm_regmap_init_i2c(i2c, &tsc200x_regmap_config), tsc2004_cmd); } diff --git a/drivers/input/touchscreen/tsc2005.c b/drivers/input/touchscreen/tsc2005.c index b9f593dfd2ef..f2c5f0e47f77 100644 --- a/drivers/input/touchscreen/tsc2005.c +++ b/drivers/input/touchscreen/tsc2005.c @@ -24,6 +24,11 @@ #include #include "tsc200x-core.h" +static const struct input_id tsc2005_input_id = { + .bustype = BUS_SPI, + .product = 2005, +}; + static int tsc2005_cmd(struct device *dev, u8 cmd) { u8 tx = TSC200X_CMD | TSC200X_CMD_12BIT | cmd; @@ -62,7 +67,7 @@ static int tsc2005_probe(struct spi_device *spi) if (error) return error; - return tsc200x_probe(&spi->dev, spi->irq, BUS_SPI, + return tsc200x_probe(&spi->dev, spi->irq, &tsc2005_input_id, devm_regmap_init_spi(spi, &tsc200x_regmap_config), tsc2005_cmd); } diff --git a/drivers/input/touchscreen/tsc200x-core.c b/drivers/input/touchscreen/tsc200x-core.c index 15240c1ee850..dfa7f1c4f545 100644 --- a/drivers/input/touchscreen/tsc200x-core.c +++ b/drivers/input/touchscreen/tsc200x-core.c @@ -450,7 +450,7 @@ static void tsc200x_close(struct input_dev *input) mutex_unlock(&ts->mutex); } -int tsc200x_probe(struct device *dev, int irq, __u16 bustype, +int tsc200x_probe(struct device *dev, int irq, const struct input_id *tsc_id, struct regmap *regmap, int (*tsc200x_cmd)(struct device *dev, u8 cmd)) { @@ -547,9 +547,18 @@ int tsc200x_probe(struct device *dev, int irq, __u16 bustype, snprintf(ts->phys, sizeof(ts->phys), "%s/input-ts", dev_name(dev)); - input_dev->name = "TSC200X touchscreen"; + if (tsc_id->product == 2004) { + input_dev->name = "TSC200X touchscreen"; + } else { + input_dev->name = devm_kasprintf(dev, GFP_KERNEL, + "TSC%04d touchscreen", + tsc_id->product); + if (!input_dev->name) + return -ENOMEM; + } + input_dev->phys = ts->phys; - input_dev->id.bustype = bustype; + input_dev->id = *tsc_id; input_dev->dev.parent = dev; input_dev->evbit[0] = BIT(EV_ABS) | BIT(EV_KEY); input_dev->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH); diff --git a/drivers/input/touchscreen/tsc200x-core.h b/drivers/input/touchscreen/tsc200x-core.h index 7a482d102614..49a63a3c6840 100644 --- a/drivers/input/touchscreen/tsc200x-core.h +++ b/drivers/input/touchscreen/tsc200x-core.h @@ -70,7 +70,7 @@ extern const struct regmap_config tsc200x_regmap_config; extern const struct dev_pm_ops tsc200x_pm_ops; -int tsc200x_probe(struct device *dev, int irq, __u16 bustype, +int tsc200x_probe(struct device *dev, int irq, 
const struct input_id *tsc_id, struct regmap *regmap, int (*tsc200x_cmd)(struct device *dev, u8 cmd)); int tsc200x_remove(struct device *dev); From ca3455867e4a98d34b65400ee82c7c3dacce510c Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Fri, 27 May 2016 14:17:10 +0800 Subject: [PATCH 031/813] pvclock: Add CPU barriers to get correct version value commit 749d088b8e7f4b9826ede02b9a043e417fa84aa1 upstream. Protocol for the "version" fields is: the hypervisor raises it (making it odd) before it starts updating the fields and raises it again (making it even) when it is done. Thus the guest can make sure the time values it got are consistent by checking the version before and after reading them. Add CPU barriers after getting the version value, just like the function vread_pvclock() does, because all of the callees in this function are inline. Fixes: 502dfeff239e8313bfbe906ca0a1a6827ac8481b Signed-off-by: Minfei Huang Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pvclock.h | 2 ++ arch/x86/kernel/pvclock.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 7a6bed5c08bc..baad72e4c100 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -76,6 +76,8 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, u8 ret_flags; version = src->version; + /* Make the latest version visible */ + smp_rmb(); offset = pvclock_get_nsec_offset(src); ret = src->system_time + offset; diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 2f355d229a58..bf0ce75735b0 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -66,6 +66,8 @@ u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src) do { version = __pvclock_read_cycles(src, &ret, &flags); + /* Make sure that the version double-check is last. */ + smp_rmb(); } while ((src->version & 1) || version != src->version); return flags & valid_flags; @@ -80,6 +82,8 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) do { version = __pvclock_read_cycles(src, &ret, &flags); + /* Make sure that the version double-check is last. */ + smp_rmb(); } while ((src->version & 1) || version != src->version); if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) { From 6701df3c0a3672faef3e2cfbc4747254e603324a Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 31 May 2016 14:17:06 -0700 Subject: [PATCH 032/813] pinctrl: single: Fix missing flush of posted write for a wakeirq commit 0ac3c0a4025f41748a083bdd4970cb3ede802b15 upstream. With many repeated suspend/resume cycles, the pin specific wakeirq may not always work on omaps. This is because the write to enable the pin interrupt may not have reached the device over the interconnect before suspend happens. Let's fix the issue with a flush of posted write with a readback.
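[ For illustration: a sketch of the posted-write pattern. The volatile pointer stands in for an ioremap()ed register and the global is only a user-space stand-in; on real hardware the read back forces the buffered store out to the device before the CPU suspends: ]

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t fake_device_reg;    /* stand-in for a mapped register */

    static void enable_wakeirq(volatile uint32_t *reg, uint32_t mask)
    {
        *reg |= mask;   /* posted: the store may still be in flight... */
        (void)*reg;     /* ...read it back to flush it to the device */
    }

    int main(void)
    {
        enable_wakeirq(&fake_device_reg, 1u << 3);
        printf("reg = 0x%08x\n", fake_device_reg);
        return 0;
    }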
Reported-by: Nishanth Menon Signed-off-by: Tony Lindgren Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/pinctrl/pinctrl-single.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c index 23b6b8c29a99..73d8d47ea465 100644 --- a/drivers/pinctrl/pinctrl-single.c +++ b/drivers/pinctrl/pinctrl-single.c @@ -1576,6 +1576,9 @@ static inline void pcs_irq_set(struct pcs_soc_data *pcs_soc, else mask &= ~soc_mask; pcs->write(mask, pcswi->reg); + + /* flush posted write */ + mask = pcs->read(pcswi->reg); raw_spin_unlock(&pcs->lock); } From 8f808f122f445fe1b391d1ee047dadf16437c749 Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Wed, 1 Jun 2016 22:21:53 +0300 Subject: [PATCH 033/813] pinctrl: imx: Do not treat a PIN without MUX register as an error commit ba562d5e54fd3136bfea0457add3675850247774 upstream. Some PINs do not have a MUX register; this is not an error. It is necessary to allow the configuration of the remaining PINs to continue, otherwise the whole PIN group will be configured incorrectly. Signed-off-by: Alexander Shiyan Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/pinctrl/freescale/pinctrl-imx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/freescale/pinctrl-imx.c b/drivers/pinctrl/freescale/pinctrl-imx.c index 1029aa7889b5..398ec45aadef 100644 --- a/drivers/pinctrl/freescale/pinctrl-imx.c +++ b/drivers/pinctrl/freescale/pinctrl-imx.c @@ -207,9 +207,9 @@ static int imx_pmx_set(struct pinctrl_dev *pctldev, unsigned selector, pin_reg = &info->pin_regs[pin_id]; if (pin_reg->mux_reg == -1) { - dev_err(ipctl->dev, "Pin(%s) does not support mux function\n", + dev_dbg(ipctl->dev, "Pin(%s) does not support mux function\n", info->pins[pin_id].name); - return -EINVAL; + continue; } if (info->flags & SHARE_MUX_CONF_REG) { From 75d6026fd7d605a668ef532193c2bae707a4316c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 May 2016 15:42:13 -0400 Subject: [PATCH 034/813] cgroup: set css->id to -1 during init commit 8fa3b8d689a54d6d04ff7803c724fb7aca6ce98e upstream. If percpu_ref initialization fails during css_create(), the free path can end up trying to free css->id of zero. As ID 0 is unused, it doesn't cause a critical breakage but it does trigger a warning message. Fix it by setting css->id to -1 from init_and_link_css(). Signed-off-by: Tejun Heo Cc: Wenwei Tao Fixes: 01e586598b22 ("cgroup: release css->id after css_free") Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/cgroup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1c9d701f7a72..a3424f28aaf4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4793,6 +4793,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, memset(css, 0, sizeof(*css)); css->cgroup = cgrp; css->ss = ss; + css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); css->serial_nr = css_serial_nr_next++; From acbda596b5cdf4784938fe6feca43b28c2dc8a53 Mon Sep 17 00:00:00 2001 From: Rhyland Klein Date: Thu, 9 Jun 2016 17:28:39 -0400 Subject: [PATCH 035/813] power_supply: power_supply_read_temp only if use_cnt > 0 commit 5bc28b93a36e3cb3acc2870fb75cb6ffb182fece upstream. Change power_supply_read_temp() to use power_supply_get_property() so that it will check the use_cnt and ensure it is > 0.
The use_cnt will be incremented at the end of __power_supply_register, so this will block the case where get_property can be called before the supply is fully registered. This fixes the issue shown in the stack below: [ 1.452598] power_supply_read_temp+0x78/0x80 [ 1.458680] thermal_zone_get_temp+0x5c/0x11c [ 1.464765] thermal_zone_device_update+0x34/0xb4 [ 1.471195] thermal_zone_device_register+0x87c/0x8cc [ 1.477974] __power_supply_register+0x364/0x424 [ 1.484317] power_supply_register_no_ws+0x10/0x18 [ 1.490833] bq27xxx_battery_setup+0x10c/0x164 [ 1.497003] bq27xxx_battery_i2c_probe+0xd0/0x1b0 [ 1.503435] i2c_device_probe+0x174/0x240 [ 1.509172] driver_probe_device+0x1fc/0x29c [ 1.515167] __driver_attach+0xa4/0xa8 [ 1.520643] bus_for_each_dev+0x58/0x98 [ 1.526204] driver_attach+0x20/0x28 [ 1.531505] bus_add_driver+0x1c8/0x22c [ 1.537067] driver_register+0x68/0x108 [ 1.542630] i2c_register_driver+0x38/0x7c [ 1.548457] bq27xxx_battery_i2c_driver_init+0x18/0x20 [ 1.555321] do_one_initcall+0x38/0x12c [ 1.560886] kernel_init_freeable+0x148/0x1ec [ 1.566972] kernel_init+0x10/0xfc [ 1.572101] ret_from_fork+0x10/0x40 Also make the same change to ps_get_max_charge_cntl_limit() and ps_get_cur_chrage_cntl_limit() to be safe. Lastly, change the return value of power_supply_get_property() to -EAGAIN from -ENODEV if use_cnt <= 0. Fixes: 297d716f6260 ("power_supply: Change ownership from driver to core") Signed-off-by: Rhyland Klein Reviewed-by: Krzysztof Kozlowski Signed-off-by: Sebastian Reichel Signed-off-by: Greg Kroah-Hartman --- drivers/power/power_supply_core.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/power/power_supply_core.c b/drivers/power/power_supply_core.c index 456987c88baa..b13cd074c52a 100644 --- a/drivers/power/power_supply_core.c +++ b/drivers/power/power_supply_core.c @@ -565,11 +565,12 @@ static int power_supply_read_temp(struct thermal_zone_device *tzd, WARN_ON(tzd == NULL); psy = tzd->devdata; - ret = psy->desc->get_property(psy, POWER_SUPPLY_PROP_TEMP, &val); + ret = power_supply_get_property(psy, POWER_SUPPLY_PROP_TEMP, &val); + if (ret) + return ret; /* Convert tenths of degree Celsius to milli degree Celsius. */ - if (!ret) - *temp = val.intval * 100; + *temp = val.intval * 100; return ret; } @@ -612,10 +613,12 @@ static int ps_get_max_charge_cntl_limit(struct thermal_cooling_device *tcd, int ret; psy = tcd->devdata; - ret = psy->desc->get_property(psy, - POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT_MAX, &val); - if (!ret) - *state = val.intval; + ret = power_supply_get_property(psy, + POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT_MAX, &val); + if (ret) + return ret; + + *state = val.intval; return ret; } @@ -628,10 +631,12 @@ static int ps_get_cur_chrage_cntl_limit(struct thermal_cooling_device *tcd, int ret; psy = tcd->devdata; - ret = psy->desc->get_property(psy, - POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT, &val); - if (!ret) - *state = val.intval; + ret = power_supply_get_property(psy, + POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT, &val); + if (ret) + return ret; + + *state = val.intval; return ret; } From a3bdfa7b6185c677921df51cd2e44ab8aa656a2d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 1 Jul 2016 14:56:07 +0200 Subject: [PATCH 036/813] locks: use file_inode() commit 6343a2120862f7023006c8091ad95c1f16a32077 upstream. (Another one for the f_path debacle.) ltp fcntl33 testcase caused an Oops in selinux_file_send_sigiotask.
The reason is that generic_add_lease() used filp->f_path.dentry->inode while all the others use file_inode(). This makes a difference for files opened on overlayfs since the former will point to the overlay inode the latter to the underlying inode. So generic_add_lease() added the lease to the overlay inode and generic_delete_lease() removed it from the underlying inode. When the file was released the lease remained on the overlay inode's lock list, resulting in use after free. Reported-by: Eryu Guan Fixes: 4bacc9c9234c ("overlayfs: Make f_path always point to the overlay and f_inode to the underlay") Signed-off-by: Miklos Szeredi Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/locks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/locks.c b/fs/locks.c index 6333263b7bc8..8eddae23e10b 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1602,7 +1602,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr { struct file_lock *fl, *my_fl = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; bool is_deleg = (*flp)->fl_flags & FL_DELEG; int error; From ae159a027893a8bdeaab0c5f863ca6e57b4468d3 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 5 Jul 2016 17:32:29 -0400 Subject: [PATCH 037/813] Revert "ecryptfs: forbid opening files without mmap handler" commit 78c4e172412de5d0456dc00d2b34050aa0b683b5 upstream. This reverts commit 2f36db71009304b3f0b95afacd8eba1f9f046b87. It fixed a local root exploit but also introduced a dependency on the lower file system implementing an mmap operation just to open a file, which is a bit of a heavy hammer. The right fix is to have mmap depend on the existence of the mmap handler instead. Signed-off-by: Jeff Mahoney Signed-off-by: Tyler Hicks Signed-off-by: Greg Kroah-Hartman --- fs/ecryptfs/kthread.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index e818f5ac7a26..866bb18efefe 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -25,7 +25,6 @@ #include #include #include -#include #include "ecryptfs_kernel.h" struct ecryptfs_open_req { @@ -148,7 +147,7 @@ int ecryptfs_privileged_open(struct file **lower_file, flags |= IS_RDONLY(d_inode(lower_dentry)) ? O_RDONLY : O_RDWR; (*lower_file) = dentry_open(&req.path, flags, cred); if (!IS_ERR(*lower_file)) - goto have_file; + goto out; if ((flags & O_ACCMODE) == O_RDONLY) { rc = PTR_ERR((*lower_file)); goto out; @@ -166,16 +165,8 @@ int ecryptfs_privileged_open(struct file **lower_file, mutex_unlock(&ecryptfs_kthread_ctl.mux); wake_up(&ecryptfs_kthread_ctl.wait); wait_for_completion(&req.done); - if (IS_ERR(*lower_file)) { + if (IS_ERR(*lower_file)) rc = PTR_ERR(*lower_file); - goto out; - } -have_file: - if ((*lower_file)->f_op->mmap == NULL) { - fput(*lower_file); - *lower_file = NULL; - rc = -EMEDIUMTYPE; - } out: return rc; } From ed5c955e31ff07fa74738b6e0d94c5c17ebbf7c7 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 5 Jul 2016 17:32:30 -0400 Subject: [PATCH 038/813] ecryptfs: don't allow mmap when the lower fs doesn't support it commit f0fe970df3838c202ef6c07a4c2b36838ef0a88b upstream. There are legitimate reasons to disallow mmap on certain files, notably in sysfs or procfs. We shouldn't emulate mmap support on file systems that don't offer support natively. 
CVE-2016-1583 Signed-off-by: Jeff Mahoney [tyhicks: clean up f_op check by using ecryptfs_file_to_lower()] Signed-off-by: Tyler Hicks Signed-off-by: Greg Kroah-Hartman --- fs/ecryptfs/file.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index feef8a9c4de7..11309683d65f 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -170,6 +170,19 @@ out: return rc; } +static int ecryptfs_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct file *lower_file = ecryptfs_file_to_lower(file); + /* + * Don't allow mmap on top of file systems that don't support it + * natively. If FILESYSTEM_MAX_STACK_DEPTH > 2 or ecryptfs + * allows recursive mounting, this will need to be extended. + */ + if (!lower_file->f_op->mmap) + return -ENODEV; + return generic_file_mmap(file, vma); +} + /** * ecryptfs_open * @inode: inode speciying file to open @@ -364,7 +377,7 @@ const struct file_operations ecryptfs_main_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ecryptfs_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, From 26015f0ad252dd1ae397a6b6e9400ca868f4e584 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 15 Jul 2016 00:22:07 -0400 Subject: [PATCH 039/813] ext4: verify extent header depth commit 7bc9491645118c9461bd21099c31755ff6783593 upstream. Although the extent tree depth of 5 should be enough for the worst case of 2^32 extents of length 1, the extent tree code does not currently merge nodes which are less than half-full with a sibling node, or shrink the tree depth if possible. So it's possible, at least in theory, for the tree depth to be greater than 5. However, even in the worst case, a tree depth of 32 is highly unlikely, and if the file system is maliciously corrupted, an insanely large eh_depth can cause memory allocation failures that will trigger kernel warnings (here, eh_depth = 65280): JBD2: ext4.exe wants too many credits credits:195849 rsv_credits:0 max:256 ------------[ cut here ]------------ WARNING: CPU: 0 PID: 50 at fs/jbd2/transaction.c:293 start_this_handle+0x569/0x580 CPU: 0 PID: 50 Comm: ext4.exe Not tainted 4.7.0-rc5+ #508 Stack: 604a8947 625badd8 0002fd09 00000000 60078643 00000000 62623910 601bf9bc 62623970 6002fc84 626239b0 900000125 Call Trace: [<6001c2dc>] show_stack+0xdc/0x1a0 [<601bf9bc>] dump_stack+0x2a/0x2e [<6002fc84>] __warn+0x114/0x140 [<6002fdff>] warn_slowpath_null+0x1f/0x30 [<60165829>] start_this_handle+0x569/0x580 [<60165d4e>] jbd2__journal_start+0x11e/0x220 [<60146690>] __ext4_journal_start_sb+0x60/0xa0 [<60120a81>] ext4_truncate+0x131/0x3a0 [<60123677>] ext4_setattr+0x757/0x840 [<600d5d0f>] notify_change+0x16f/0x2a0 [<600b2b16>] do_truncate+0x76/0xc0 [<600c3e56>] path_openat+0x806/0x1300 [<600c55c9>] do_filp_open+0x89/0xf0 [<600b4074>] do_sys_open+0x134/0x1e0 [<600b4140>] SyS_open+0x20/0x30 [<6001ea68>] handle_syscall+0x88/0x90 [<600295fd>] userspace+0x3fd/0x500 [<6001ac55>] fork_handler+0x85/0x90 ---[ end trace 08b0b88b6387a244 ]--- [ Commit message modified and the extent tree depth check changed from 5 to 32 -- tytso ] Cc: Darrick J.
Wong Signed-off-by: Vegard Nossum Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/extents.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3578b25fccfd..62880586ed85 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -469,6 +469,10 @@ static int __ext4_ext_check(const char *function, unsigned int line, error_msg = "invalid extent entries"; goto corrupted; } + if (unlikely(depth > 32)) { + error_msg = "too large eh_depth"; + goto corrupted; + } /* Verify checksum on non-root extent tree nodes */ if (ext_depth(inode) != depth && !ext4_extent_block_csum_verify(inode, eh)) { From 7d9f345ca6810010456e99c7cdea112b500b9547 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 29 Jun 2016 10:54:23 +0200 Subject: [PATCH 040/813] 9p: use file_dentry() commit b403f0e37a11f84f7ceaf40b0075499e5bcfd220 upstream. v9fs may be used as lower layer of overlayfs and accessing f_path.dentry can lead to a crash. In this case it's a NULL pointer dereference in p9_fid_create(). Fix by replacing direct access of file->f_path.dentry with the file_dentry() accessor, which will always return a native object. Reported-by: Alessio Igor Bogani Signed-off-by: Miklos Szeredi Tested-by: Alessio Igor Bogani Fixes: 4bacc9c9234c ("overlayfs: Make f_path always point to the overlay and f_inode to the underlay") Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/9p/vfs_file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 7bf835f85bc8..12ceaf52dae6 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -74,7 +74,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) v9fs_proto_dotu(v9ses)); fid = file->private_data; if (!fid) { - fid = v9fs_fid_clone(file->f_path.dentry); + fid = v9fs_fid_clone(file_dentry(file)); if (IS_ERR(fid)) return PTR_ERR(fid); @@ -100,7 +100,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) * because we want write after unlink usecase * to work. */ - fid = v9fs_writeback_fid(file->f_path.dentry); + fid = v9fs_writeback_fid(file_dentry(file)); if (IS_ERR(fid)) { err = PTR_ERR(fid); mutex_unlock(&v9inode->v_mutex); @@ -516,7 +516,7 @@ v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma) * because we want write after unlink usecase * to work. */ - fid = v9fs_writeback_fid(filp->f_path.dentry); + fid = v9fs_writeback_fid(file_dentry(filp)); if (IS_ERR(fid)) { retval = PTR_ERR(fid); mutex_unlock(&v9inode->v_mutex); From df582d46960cc3151bc2dc564af9128b4c7f9b4b Mon Sep 17 00:00:00 2001 From: Andrey Ulanov Date: Fri, 15 Apr 2016 14:24:41 -0700 Subject: [PATCH 041/813] namespace: update event counter when umounting a deleted dentry commit e06b933e6ded42384164d28a2060b7f89243b895 upstream. - m_start() in fs/namespace.c expects that ns->event is incremented each time a mount added or removed from ns->list. - umount_tree() removes items from the list but does not increment event counter, expecting that it's done before the function is called. - There are some codepaths that call umount_tree() without updating "event" counter. e.g. from __detach_mounts(). - When this happens m_start may reuse a cached mount structure that no longer belongs to ns->list (i.e. use after free which usually leads to infinite loop). This change fixes the above problem by incrementing global event counter before invoking umount_tree(). 
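[ For illustration: a toy generation-counter scheme showing the invariant the patch restores. A cached iterator position may be reused only while the counter is unchanged; forgetting to bump it on removal lets a reader chase stale entries: ]

    #include <stdio.h>

    static int list[8] = { 1, 2, 3 };
    static int nr = 3;
    static long event;                  /* bumped on every mutation */

    static long cached_event = -1;
    static int cached_idx;

    static int *iter_start(void)
    {
        if (cached_event == event && cached_idx < nr)
            return &list[cached_idx];   /* safe: nothing changed */
        cached_event = event;           /* stale: revalidate from scratch */
        cached_idx = 0;
        return nr ? &list[0] : NULL;
    }

    static void remove_last(void)
    {
        nr--;
        event++;    /* the increment the buggy path was missing */
    }

    int main(void)
    {
        cached_idx = 2;
        cached_event = event;
        remove_last();                  /* invalidates the cached position */
        printf("restart at %p\n", (void *)iter_start());
        return 0;
    }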
Change-Id: I622c8e84dcb9fb63542372c5dbf0178ee86bb589 Signed-off-by: Andrey Ulanov Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/namespace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/namespace.c b/fs/namespace.c index 33064fcbfff9..5be02a0635be 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1562,6 +1562,7 @@ void __detach_mounts(struct dentry *dentry) goto out_unlock; lock_mount_hash(); + event++; while (!hlist_empty(&mp->m_list)) { mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); if (mnt->mnt.mnt_flags & MNT_UMOUNT) { From 873b6e316a581c976a29a1d8c889c76394d7393d Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Mon, 13 Jun 2016 17:46:49 +0000 Subject: [PATCH 042/813] spi: sunxi: fix transfer timeout commit 719bd6542044efd9b338a53dba1bef45f40ca169 upstream. The transfer timeout is fixed at 1000 ms. Reading a 4Mbyte flash over a 1MHz SPI bus takes way longer than that. Calculate the timeout from the actual time the transfer is supposed to take and multiply by 2 for good measure. Signed-off-by: Michal Suchanek Acked-by: Maxime Ripard Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- drivers/spi/spi-sun4i.c | 10 +++++++++- drivers/spi/spi-sun6i.c | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-sun4i.c b/drivers/spi/spi-sun4i.c index fbb0a4d74e91..836328730457 100644 --- a/drivers/spi/spi-sun4i.c +++ b/drivers/spi/spi-sun4i.c @@ -170,6 +170,7 @@ static int sun4i_spi_transfer_one(struct spi_master *master, { struct sun4i_spi *sspi = spi_master_get_devdata(master); unsigned int mclk_rate, div, timeout; + unsigned int start, end, tx_time; unsigned int tx_len = 0; int ret = 0; u32 reg; @@ -279,9 +280,16 @@ static int sun4i_spi_transfer_one(struct spi_master *master, reg = sun4i_spi_read(sspi, SUN4I_CTL_REG); sun4i_spi_write(sspi, SUN4I_CTL_REG, reg | SUN4I_CTL_XCH); + tx_time = max(tfr->len * 8 * 2 / (tfr->speed_hz / 1000), 100U); + start = jiffies; timeout = wait_for_completion_timeout(&sspi->done, - msecs_to_jiffies(1000)); + msecs_to_jiffies(tx_time)); + end = jiffies; if (!timeout) { + dev_warn(&master->dev, + "%s: timeout transferring %u bytes@%iHz for %i(%i)ms", + dev_name(&spi->dev), tfr->len, tfr->speed_hz, + jiffies_to_msecs(end - start), tx_time); ret = -ETIMEDOUT; goto out; } diff --git a/drivers/spi/spi-sun6i.c b/drivers/spi/spi-sun6i.c index ac48f59705a8..e77add01b0e9 100644 --- a/drivers/spi/spi-sun6i.c +++ b/drivers/spi/spi-sun6i.c @@ -160,6 +160,7 @@ static int sun6i_spi_transfer_one(struct spi_master *master, { struct sun6i_spi *sspi = spi_master_get_devdata(master); unsigned int mclk_rate, div, timeout; + unsigned int start, end, tx_time; unsigned int tx_len = 0; int ret = 0; u32 reg; @@ -269,9 +270,16 @@ static int sun6i_spi_transfer_one(struct spi_master *master, reg = sun6i_spi_read(sspi, SUN6I_TFR_CTL_REG); sun6i_spi_write(sspi, SUN6I_TFR_CTL_REG, reg | SUN6I_TFR_CTL_XCH); + tx_time = max(tfr->len * 8 * 2 / (tfr->speed_hz / 1000), 100U); + start = jiffies; timeout = wait_for_completion_timeout(&sspi->done, - msecs_to_jiffies(1000)); + msecs_to_jiffies(tx_time)); + end = jiffies; if (!timeout) { + dev_warn(&master->dev, + "%s: timeout transferring %u bytes@%iHz for %i(%i)ms", + dev_name(&spi->dev), tfr->len, tfr->speed_hz, + jiffies_to_msecs(end - start), tx_time); ret = -ETIMEDOUT; goto out; } From 9162d29bc48a43a30217fc1fd939023a75ce604b Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Mon, 13 Jun 2016 17:46:49 +0000 Subject: [PATCH 043/813] spi: sun4i: fix FIFO
limit commit 6d9fe44bd73d567d04d3a68a2d2fa521ab9532f2 upstream. When testing SPI without DMA I noticed that filling the FIFO on the spi controller causes a timeout. Always leave room for one byte in the FIFO. Signed-off-by: Michal Suchanek Acked-by: Maxime Ripard Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- drivers/spi/spi-sun4i.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/spi/spi-sun4i.c b/drivers/spi/spi-sun4i.c index 836328730457..39d7c7c70112 100644 --- a/drivers/spi/spi-sun4i.c +++ b/drivers/spi/spi-sun4i.c @@ -177,7 +177,10 @@ static int sun4i_spi_transfer_one(struct spi_master *master, /* We don't support transfer larger than the FIFO */ if (tfr->len > SUN4I_FIFO_DEPTH) - return -EINVAL; + return -EMSGSIZE; + + if (tfr->tx_buf && tfr->len >= SUN4I_FIFO_DEPTH) + return -EMSGSIZE; reinit_completion(&sspi->done); sspi->tx_buf = tfr->tx_buf; @@ -270,8 +273,12 @@ static int sun4i_spi_transfer_one(struct spi_master *master, sun4i_spi_write(sspi, SUN4I_BURST_CNT_REG, SUN4I_BURST_CNT(tfr->len)); sun4i_spi_write(sspi, SUN4I_XMIT_CNT_REG, SUN4I_XMIT_CNT(tx_len)); - /* Fill the TX FIFO */ - sun4i_spi_fill_fifo(sspi, SUN4I_FIFO_DEPTH); + /* + * Fill the TX FIFO + * Filling the FIFO fully causes timeout for some reason + * at least on spi2 on A10s + */ + sun4i_spi_fill_fifo(sspi, SUN4I_FIFO_DEPTH - 1); /* Enable the interrupts */ sun4i_spi_write(sspi, SUN4I_INT_CTL_REG, SUN4I_INT_CTL_TC); From 68f99031897d63ae4937b0f945475dc6782afde4 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Tue, 17 May 2016 20:57:50 +0200 Subject: [PATCH 044/813] clk: rockchip: initialize flags of clk_init_data in mmc-phase clock commit 595144c1141c951a3c6bb9004ae6a2bc29aad66f upstream. The flags element of clk_init_data was never initialized for mmc-phase clocks, resulting in the element containing a random value and thus possibly enabling unwanted clock flags. Fixes: 89bf26cbc1a0 ("clk: rockchip: Add support for the mmc clock phases using the framework") Signed-off-by: Heiko Stuebner Signed-off-by: Greg Kroah-Hartman --- drivers/clk/rockchip/clk-mmc-phase.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/clk/rockchip/clk-mmc-phase.c b/drivers/clk/rockchip/clk-mmc-phase.c index 2685644826a0..33c20c6b45af 100644 --- a/drivers/clk/rockchip/clk-mmc-phase.c +++ b/drivers/clk/rockchip/clk-mmc-phase.c @@ -153,6 +153,7 @@ struct clk *rockchip_clk_register_mmc(const char *name, return NULL; init.name = name; + init.flags = 0; init.num_parents = num_parents; init.parent_names = parent_names; init.ops = &rockchip_mmc_clk_ops; From 69ca969a2626dc4b3bb83b953c053a01e3b9f7e6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 21 Jun 2016 16:58:46 +0300 Subject: [PATCH 045/813] platform/chrome: cros_ec_dev - double fetch bug in ioctl commit 096cdc6f52225835ff503f987a0d68ef770bb78e upstream. We verify "u_cmd.outsize" and "u_cmd.insize" but we need to make sure that those values have not changed between the two copy_from_user() calls. Otherwise it could lead to a buffer overflow. Additionally, cros_ec_cmd_xfer() can set s_cmd->insize to a lower value. We should use the new smaller value so we don't copy too much data to the user.
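[ For illustration: a user-space sketch of the double-fetch pattern, with memcpy() standing in for copy_from_user(). The second fetch must be re-checked against the first, since a racing thread can rewrite the header in between: ]

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    struct cmd { uint32_t outsize, insize; uint8_t data[64]; };

    static struct cmd user_mem;   /* attacker-controlled, may change anytime */

    static int do_ioctl(void)
    {
        struct cmd first, second;

        memcpy(&first, &user_mem, sizeof(first));     /* first fetch */
        if (first.outsize > sizeof(first.data) ||
            first.insize > sizeof(first.data))
            return -1;

        /* window: a racing thread may rewrite user_mem here */

        memcpy(&second, &user_mem, sizeof(second));   /* second fetch */
        if (second.outsize != first.outsize ||
            second.insize != first.insize)
            return -1;    /* the re-check the patch adds */

        printf("safe to use %u/%u bytes\n", second.outsize, second.insize);
        return 0;
    }

    int main(void)
    {
        user_mem.outsize = 16;
        user_mem.insize = 16;
        return do_ioctl();
    }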
Reported-by: Pengfei Wang Fixes: a841178445bb ('mfd: cros_ec: Use a zero-length array for command data') Signed-off-by: Dan Carpenter Reviewed-by: Kees Cook Tested-by: Gwendal Grignou Signed-off-by: Olof Johansson Signed-off-by: Greg Kroah-Hartman --- drivers/platform/chrome/cros_ec_dev.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/platform/chrome/cros_ec_dev.c b/drivers/platform/chrome/cros_ec_dev.c index d45cd254ed1c..2b331d5b9e79 100644 --- a/drivers/platform/chrome/cros_ec_dev.c +++ b/drivers/platform/chrome/cros_ec_dev.c @@ -147,13 +147,19 @@ static long ec_device_ioctl_xcmd(struct cros_ec_dev *ec, void __user *arg) goto exit; } + if (u_cmd.outsize != s_cmd->outsize || + u_cmd.insize != s_cmd->insize) { + ret = -EINVAL; + goto exit; + } + s_cmd->command += ec->cmd_offset; ret = cros_ec_cmd_xfer(ec->ec_dev, s_cmd); /* Only copy data to userland if data was received. */ if (ret < 0) goto exit; - if (copy_to_user(arg, s_cmd, sizeof(*s_cmd) + u_cmd.insize)) + if (copy_to_user(arg, s_cmd, sizeof(*s_cmd) + s_cmd->insize)) ret = -EFAULT; exit: kfree(s_cmd); From b782756a66d302dfc8f3b9786672c965eae35a17 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 4 Jul 2016 14:07:16 +0200 Subject: [PATCH 046/813] qeth: delete napi struct when removing a qeth device commit 7831b4ff0d926e0deeaabef9db8800ed069a2757 upstream. A qeth_card contains a napi_struct linked to the net_device during device probing. This struct must be deleted when removing the qeth device, otherwise Panic on oops can occur when qeth devices are repeatedly removed and added. Fixes: a1c3ed4c9ca ("qeth: NAPI support for l2 and l3 discipline") Signed-off-by: Ursula Braun Tested-by: Alexander Klein Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/s390/net/qeth_l2_main.c | 1 + drivers/s390/net/qeth_l3_main.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 8f1b091e1732..12b2cb7769f9 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -1051,6 +1051,7 @@ static void qeth_l2_remove_device(struct ccwgroup_device *cgdev) qeth_l2_set_offline(cgdev); if (card->dev) { + netif_napi_del(&card->napi); unregister_netdev(card->dev); card->dev = NULL; } diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 543960e96b42..50cec6b13d27 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -3246,6 +3246,7 @@ static void qeth_l3_remove_device(struct ccwgroup_device *cgdev) qeth_l3_set_offline(cgdev); if (card->dev) { + netif_napi_del(&card->napi); unregister_netdev(card->dev); card->dev = NULL; } From 5161144c3a9d6ea775b293edbb8523deaeff4442 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 1 Jul 2016 00:39:35 -0700 Subject: [PATCH 047/813] block: fix use-after-free in sys_ioprio_get() commit 8ba8682107ee2ca3347354e018865d8e1967c5f4 upstream. get_task_ioprio() accesses the task->io_context without holding the task lock and thus can race with exit_io_context(), leading to a use-after-free. 
The reproducer below hits this within a few seconds on my 4-core QEMU VM: #define _GNU_SOURCE #include <assert.h> #include <sys/syscall.h> #include <sys/wait.h> #include <unistd.h> int main(int argc, char **argv) { pid_t pid, child; long nproc, i; /* ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); */ syscall(SYS_ioprio_set, 1, 0, 0x6000); nproc = sysconf(_SC_NPROCESSORS_ONLN); for (i = 0; i < nproc; i++) { pid = fork(); assert(pid != -1); if (pid == 0) { for (;;) { pid = fork(); assert(pid != -1); if (pid == 0) { _exit(0); } else { child = wait(NULL); assert(child == pid); } } } pid = fork(); assert(pid != -1); if (pid == 0) { for (;;) { /* ioprio_get(IOPRIO_WHO_PGRP, 0); */ syscall(SYS_ioprio_get, 2, 0); } } } for (;;) { /* ioprio_get(IOPRIO_WHO_PGRP, 0); */ syscall(SYS_ioprio_get, 2, 0); } return 0; } This gets us KASAN dumps like this: [ 35.526914] ================================================================== [ 35.530009] BUG: KASAN: out-of-bounds in get_task_ioprio+0x7b/0x90 at addr ffff880066f34e6c [ 35.530009] Read of size 2 by task ioprio-gpf/363 [ 35.530009] ============================================================================= [ 35.530009] BUG blkdev_ioc (Not tainted): kasan: bad access detected [ 35.530009] ----------------------------------------------------------------------------- [ 35.530009] Disabling lock debugging due to kernel taint [ 35.530009] INFO: Allocated in create_task_io_context+0x2b/0x370 age=0 cpu=0 pid=360 [ 35.530009] ___slab_alloc+0x55d/0x5a0 [ 35.530009] __slab_alloc.isra.20+0x2b/0x40 [ 35.530009] kmem_cache_alloc_node+0x84/0x200 [ 35.530009] create_task_io_context+0x2b/0x370 [ 35.530009] get_task_io_context+0x92/0xb0 [ 35.530009] copy_process.part.8+0x5029/0x5660 [ 35.530009] _do_fork+0x155/0x7e0 [ 35.530009] SyS_clone+0x19/0x20 [ 35.530009] do_syscall_64+0x195/0x3a0 [ 35.530009] return_from_SYSCALL_64+0x0/0x6a [ 35.530009] INFO: Freed in put_io_context+0xe7/0x120 age=0 cpu=0 pid=1060 [ 35.530009] __slab_free+0x27b/0x3d0 [ 35.530009] kmem_cache_free+0x1fb/0x220 [ 35.530009] put_io_context+0xe7/0x120 [ 35.530009] put_io_context_active+0x238/0x380 [ 35.530009] exit_io_context+0x66/0x80 [ 35.530009] do_exit+0x158e/0x2b90 [ 35.530009] do_group_exit+0xe5/0x2b0 [ 35.530009] SyS_exit_group+0x1d/0x20 [ 35.530009] entry_SYSCALL_64_fastpath+0x1a/0xa4 [ 35.530009] INFO: Slab 0xffffea00019bcd00 objects=20 used=4 fp=0xffff880066f34ff0 flags=0x1fffe0000004080 [ 35.530009] INFO: Object 0xffff880066f34e58 @offset=3672 fp=0x0000000000000001 [ 35.530009] ================================================================== Fix it by grabbing the task lock while we poke at the io_context. Reported-by: Dmitry Vyukov Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/ioprio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/ioprio.c b/block/ioprio.c index cc7800e9eb44..01b8116298a1 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -150,8 +150,10 @@ static int get_task_ioprio(struct task_struct *p) if (ret) goto out; ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + task_lock(p); if (p->io_context) ret = p->io_context->ioprio; + task_unlock(p); out: return ret; } From d29e5fa5859c37ae2076f1a0fa28d894e2857249 Mon Sep 17 00:00:00 2001 From: Taras Kondratiuk Date: Wed, 13 Jul 2016 22:05:38 +0000 Subject: [PATCH 048/813] mmc: block: fix packed command header endianness commit f68381a70bb2b26c31b13fdaf67c778f92fd32b4 upstream. The code that fills the packed command header assumes that the CPU runs in little-endian mode.
Hence the header is malformed in big-endian mode and causes MMC data transfer errors: [ 563.200828] mmcblk0: error -110 transferring data, sector 2048, nr 8, cmd response 0x900, card status 0xc40 [ 563.219647] mmcblk0: packed cmd failed, nr 2, sectors 16, failure index: -1 Convert header data to LE. Signed-off-by: Taras Kondratiuk Fixes: ce39f9d17c14 ("mmc: support packed write command for eMMC4.5 devices") Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/card/block.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index c641c202fe7e..64950035613b 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -1767,8 +1767,8 @@ static void mmc_blk_packed_hdr_wrq_prep(struct mmc_queue_req *mqrq, packed_cmd_hdr = packed->cmd_hdr; memset(packed_cmd_hdr, 0, sizeof(packed->cmd_hdr)); - packed_cmd_hdr[0] = (packed->nr_entries << 16) | - (PACKED_CMD_WR << 8) | PACKED_CMD_VER; + packed_cmd_hdr[0] = cpu_to_le32((packed->nr_entries << 16) | + (PACKED_CMD_WR << 8) | PACKED_CMD_VER); hdr_blocks = mmc_large_sector(card) ? 8 : 1; /* @@ -1782,14 +1782,14 @@ static void mmc_blk_packed_hdr_wrq_prep(struct mmc_queue_req *mqrq, ((brq->data.blocks * brq->data.blksz) >= card->ext_csd.data_tag_unit_size); /* Argument of CMD23 */ - packed_cmd_hdr[(i * 2)] = + packed_cmd_hdr[(i * 2)] = cpu_to_le32( (do_rel_wr ? MMC_CMD23_ARG_REL_WR : 0) | (do_data_tag ? MMC_CMD23_ARG_TAG_REQ : 0) | - blk_rq_sectors(prq); + blk_rq_sectors(prq)); /* Argument of CMD18 or CMD25 */ - packed_cmd_hdr[((i * 2)) + 1] = + packed_cmd_hdr[((i * 2)) + 1] = cpu_to_le32( mmc_card_blockaddr(card) ? - blk_rq_pos(prq) : blk_rq_pos(prq) << 9; + blk_rq_pos(prq) : blk_rq_pos(prq) << 9); packed->blocks += blk_rq_sectors(prq); i++; } From 34bf12312bd4222a1b945be3f58173edc8aa3f22 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Jun 2016 15:53:54 +0200 Subject: [PATCH 049/813] sched/fair: Fix effective_load() to consistently use smoothed load commit 7dd4912594daf769a46744848b05bd5bc6d62469 upstream. Starting with the following commit: fde7d22e01aa ("sched/fair: Fix overly small weight for interactive group entities") calc_tg_weight() doesn't compute the right value as expected by effective_load(). The difference is in the 'correction' term. In order to ensure \Sum rw_j >= rw_i we cannot use tg->load_avg directly, since that might be lagging a correction on the current cfs_rq->avg.load_avg value. Therefore we use tg->load_avg - cfs_rq->tg_load_avg_contrib + cfs_rq->avg.load_avg. Now, per the referenced commit, calc_tg_weight() doesn't use cfs_rq->avg.load_avg, as is later used in @w, but uses cfs_rq->load.weight instead. So stop using calc_tg_weight() and do it explicitly. The effects of this bug are wake_affine() making randomly poor choices in cgroup-intense workloads. 
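[ For illustration: the corrected arithmetic with made-up load figures. Replacing the stale per-CPU contribution in the group sum with the current smoothed load is what keeps \Sum rw_j >= rw_i: ]

    #include <stdio.h>

    int main(void)
    {
        long wg = 0, wl = 1024;             /* weights being added */
        long tg_load_avg = 3000;            /* possibly stale group sum */
        long tg_load_avg_contrib = 1200;    /* this rq's stale share of it */
        long cfs_rq_load_avg = 1500;        /* this rq's current smoothed load */

        /* W = wg + \Sum rw_j, with the stale contribution replaced */
        long W = wg + tg_load_avg - tg_load_avg_contrib + cfs_rq_load_avg;
        /* w = rw_i + wl */
        long w = cfs_rq_load_avg + wl;

        printf("W = %ld, w = %ld, Sum rw_j >= rw_i holds: %s\n",
               W, w, W - wg >= w - wl ? "yes" : "no");
        return 0;
    }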
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: fde7d22e01aa ("sched/fair: Fix overly small weight for interactive group entities") Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched/fair.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 51c615279b23..b8b516c37bf1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -687,8 +687,6 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); -static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) { @@ -4594,19 +4592,24 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) return wl; for_each_sched_entity(se) { - long w, W; + struct cfs_rq *cfs_rq = se->my_q; + long W, w = cfs_rq_load_avg(cfs_rq); - tg = se->my_q->tg; + tg = cfs_rq->tg; /* * W = @wg + \Sum rw_j */ - W = wg + calc_tg_weight(tg, se->my_q); + W = wg + atomic_long_read(&tg->load_avg); + + /* Ensure \Sum rw_j >= rw_i */ + W -= cfs_rq->tg_load_avg_contrib; + W += w; /* * w = rw_i + @wl */ - w = cfs_rq_load_avg(se->my_q) + wl; + w += wl; /* * wl = S * s'_i; see (2) From b82c78948a5311ca3952900dffbee5c932c2d03b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 4 Jul 2016 16:49:48 +0200 Subject: [PATCH 050/813] ovl: handle ATTR_KILL* commit b99c2d913810e56682a538c9f2394d76fca808f8 upstream. Before 4bacc9c9234c ("overlayfs: Make f_path...") file->f_path pointed to the underlying file, hence suid/sgid removal on write worked fine. After that patch file->f_path pointed to the overlay file, and the file mode bits weren't copied to overlay_inode->i_mode. So the suid/sgid removal simply stopped working. The fix is to copy the mode bits, but then ovl_setattr() needs to clear ATTR_MODE to avoid the BUG() in notify_change(). So do this first, then in the next patch copy the mode. Reported-by: Eryu Guan Signed-off-by: Miklos Szeredi Fixes: 4bacc9c9234c ("overlayfs: Make f_path always point to the overlay and f_inode to the underlay") Cc: Eric Schultz Cc: Eric Hameleers [backported by Eric Hameleers as seen in https://bugzilla.kernel.org/show_bug.cgi?id=150711] Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 0597820f5d9d..4f729ffff75d 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -63,6 +63,9 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (!err) { upperdentry = ovl_dentry_upper(dentry); + if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) + attr->ia_valid &= ~ATTR_MODE; + mutex_lock(&upperdentry->d_inode->i_mutex); err = notify_change(upperdentry, attr, NULL); if (!err) From b4fedbef96b8d29f336d355d2b0858518e405090 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 3 Dec 2015 23:33:18 +0100 Subject: [PATCH 051/813] perf/x86: fix PEBS issues on Intel Atom/Core2 commit 1424a09a9e1839285e948d4ea9fdfca26c9a2086 upstream. This patch fixes broken PEBS support on Intel Atom and Core2 due to wrong pointer arithmetic in intel_pmu_drain_pebs_core(). The get_next_pebs_record_by_bit() was called on PEBS format fmt0 which does not use the pebs_record_nhm layout. 
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: kan.liang@intel.com Fixes: 21509084f999 ("perf/x86/intel: Handle multiple records in the PEBS buffer") Link: http://lkml.kernel.org/r/1449182000-31524-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 7abb2b88572e..1e7de3cefc9c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -1110,6 +1110,13 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit) void *at; u64 pebs_status; + /* + * fmt0 does not have a status bitfield (does not use + * perf_record_nhm format) + */ + if (x86_pmu.intel_cap.pebs_format < 1) + return base; + if (base == NULL) return NULL; @@ -1195,7 +1202,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) if (!event->attr.precise_ip) return; - n = (top - at) / x86_pmu.pebs_record_size; + n = top - at; if (n <= 0) return; From 63b9e0f32f72892de7064c6888484b881ddbb42f Mon Sep 17 00:00:00 2001 From: Wolfgang Grandegger Date: Mon, 13 Jun 2016 15:44:19 +0200 Subject: [PATCH 052/813] can: at91_can: RX queue could get stuck at high bus load commit 43200a4480cbbe660309621817f54cbb93907108 upstream. At high bus load it could happen that "at91_poll()" enters with all RX message boxes filled up. If then at the end the "quota" is exceeded as well, "rx_next" will not be reset to the first RX mailbox and hence the interrupts remain disabled. Signed-off-by: Wolfgang Grandegger Tested-by: Amr Bekhit Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/at91_can.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c index 8b3275d7792a..8f5e93cb7975 100644 --- a/drivers/net/can/at91_can.c +++ b/drivers/net/can/at91_can.c @@ -712,9 +712,10 @@ static int at91_poll_rx(struct net_device *dev, int quota) /* upper group completed, look again in lower */ if (priv->rx_next > get_mb_rx_low_last(priv) && - quota > 0 && mb > get_mb_rx_last(priv)) { + mb > get_mb_rx_last(priv)) { priv->rx_next = get_mb_rx_first(priv); - goto again; + if (quota > 0) + goto again; } return received; From 1cee72ed4856504fd597145ce10b29751c4d27a1 Mon Sep 17 00:00:00 2001 From: Thor Thayer Date: Thu, 16 Jun 2016 11:10:19 -0500 Subject: [PATCH 053/813] can: c_can: Update D_CAN TX and RX functions to 32 bit - fix Altera Cyclone access commit 427460c83cdf55069eee49799a0caef7dde8df69 upstream. When testing CAN write floods on Altera's CycloneV, the first 2 bytes are sometimes 0x00, 0x00 or corrupted instead of the values sent. Also observed bytes 4 & 5 were corrupted in some cases. The D_CAN Data registers are 32 bits and changing from 16 bit writes to 32 bit writes fixes the problem. Testing performed on Altera CycloneV (D_CAN). Requesting tests on other C_CAN & D_CAN platforms. 
Reported-by: Richard Andrysek Signed-off-by: Thor Thayer Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/c_can/c_can.c | 38 ++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/drivers/net/can/c_can/c_can.c b/drivers/net/can/c_can/c_can.c index f91b094288da..e3dccd3200d5 100644 --- a/drivers/net/can/c_can/c_can.c +++ b/drivers/net/can/c_can/c_can.c @@ -332,9 +332,23 @@ static void c_can_setup_tx_object(struct net_device *dev, int iface, priv->write_reg(priv, C_CAN_IFACE(MSGCTRL_REG, iface), ctrl); - for (i = 0; i < frame->can_dlc; i += 2) { - priv->write_reg(priv, C_CAN_IFACE(DATA1_REG, iface) + i / 2, - frame->data[i] | (frame->data[i + 1] << 8)); + if (priv->type == BOSCH_D_CAN) { + u32 data = 0, dreg = C_CAN_IFACE(DATA1_REG, iface); + + for (i = 0; i < frame->can_dlc; i += 4, dreg += 2) { + data = (u32)frame->data[i]; + data |= (u32)frame->data[i + 1] << 8; + data |= (u32)frame->data[i + 2] << 16; + data |= (u32)frame->data[i + 3] << 24; + priv->write_reg32(priv, dreg, data); + } + } else { + for (i = 0; i < frame->can_dlc; i += 2) { + priv->write_reg(priv, + C_CAN_IFACE(DATA1_REG, iface) + i / 2, + frame->data[i] | + (frame->data[i + 1] << 8)); + } } } @@ -402,10 +416,20 @@ static int c_can_read_msg_object(struct net_device *dev, int iface, u32 ctrl) } else { int i, dreg = C_CAN_IFACE(DATA1_REG, iface); - for (i = 0; i < frame->can_dlc; i += 2, dreg ++) { - data = priv->read_reg(priv, dreg); - frame->data[i] = data; - frame->data[i + 1] = data >> 8; + if (priv->type == BOSCH_D_CAN) { + for (i = 0; i < frame->can_dlc; i += 4, dreg += 2) { + data = priv->read_reg32(priv, dreg); + frame->data[i] = data; + frame->data[i + 1] = data >> 8; + frame->data[i + 2] = data >> 16; + frame->data[i + 3] = data >> 24; + } + } else { + for (i = 0; i < frame->can_dlc; i += 2, dreg++) { + data = priv->read_reg(priv, dreg); + frame->data[i] = data; + frame->data[i + 1] = data >> 8; + } } } From 864844524efebf19da164ed38f25aa3fb3a2d2de Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 21 Jun 2016 12:14:07 +0200 Subject: [PATCH 054/813] can: fix handling of unmodifiable configuration options fix commit bce271f255dae8335dc4d2ee2c4531e09cc67f5a upstream. With upstream commit bb208f144cf3f59 (can: fix handling of unmodifiable configuration options) a new can_validate() function was introduced. When invoking 'ip link set can0 type can' without any configuration data can_validate() tries to validate the content without taking into account that there's totally no content. This patch adds a check for missing content. Reported-by: ajneu Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/dev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 910c12e2638e..348dd5001fa4 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -798,6 +798,9 @@ static int can_validate(struct nlattr *tb[], struct nlattr *data[]) * - control mode with CAN_CTRLMODE_FD set */ + if (!data) + return 0; + if (data[IFLA_CAN_CTRLMODE]) { struct can_ctrlmode *cm = nla_data(data[IFLA_CAN_CTRLMODE]); From d9e1886bddeb99038c127f384c254a7c4997ecc5 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 21 Jun 2016 15:45:47 +0200 Subject: [PATCH 055/813] can: fix oops caused by wrong rtnl dellink usage commit 25e1ed6e64f52a692ba3191c4fde650aab3ecc07 upstream. 
For 'real' hardware CAN devices the netlink interface is used to set CAN specific communication parameters. Real CAN hardware can not be created nor removed with the ip tool ... This patch adds a private dellink function for the CAN device driver interface that does just nothing. It's a follow up to commit 993e6f2fd ("can: fix oops caused by wrong rtnl newlink usage") but for dellink. Reported-by: ajneu Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 348dd5001fa4..ad535a854e5c 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -1011,6 +1011,11 @@ static int can_newlink(struct net *src_net, struct net_device *dev, return -EOPNOTSUPP; } +static void can_dellink(struct net_device *dev, struct list_head *head) +{ + return; +} + static struct rtnl_link_ops can_link_ops __read_mostly = { .kind = "can", .maxtype = IFLA_CAN_MAX, @@ -1019,6 +1024,7 @@ static struct rtnl_link_ops can_link_ops __read_mostly = { .validate = can_validate, .newlink = can_newlink, .changelink = can_changelink, + .dellink = can_dellink, .get_size = can_get_size, .fill_info = can_fill_info, .get_xstats_size = can_get_xstats_size, From b3a061d1d8288e89a899653fff4ef021df8ed2b3 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sun, 3 Jul 2016 10:54:54 +0200 Subject: [PATCH 056/813] RDS: fix rds_tcp_init() error path commit 3dad5424adfb346c871847d467f97dcdca64ea97 upstream. If register_pernet_subsys() fails, we shouldn't try to call unregister_pernet_subsys(). Fixes: 467fa15356 ("RDS-TCP: Support multiple RDS-TCP listen endpoints, one per netns.") Cc: Sowmini Varadhan Cc: David S. Miller Signed-off-by: Vegard Nossum Acked-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/rds/tcp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 9d6ddbacd875..18e50a8fc05f 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -421,7 +421,7 @@ static int rds_tcp_init(void) ret = rds_tcp_recv_init(); if (ret) - goto out_slab; + goto out_pernet; ret = rds_trans_register(&rds_tcp_transport); if (ret) @@ -433,8 +433,9 @@ static int rds_tcp_init(void) out_recv: rds_tcp_recv_exit(); -out_slab: +out_pernet: unregister_pernet_subsys(&rds_tcp_net_ops); +out_slab: kmem_cache_destroy(rds_tcp_conn_slab); out: return ret; From c4c2a8f5b740e3ce527357fba43c68dfc3e982ba Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 23 Jun 2016 15:05:26 -0400 Subject: [PATCH 057/813] SCSI: fix new bug in scsi_dev_info_list string matching commit 5e7ff2ca7f2da55fe777167849d0c93403bd0dc8 upstream. Commit b704f70ce200 ("SCSI: fix bug in scsi_dev_info_list matching") changed the way vendor- and model-string matching was carried out in the routine that looks up entries in a SCSI devinfo list. The new matching code failed to take into account the case of a maximum-length string; in such cases it could end up testing for a terminating '\0' byte beyond the end of the memory allocated to the string. This out-of-bounds bug was detected by UBSAN. I don't know if anybody has actually encountered this bug. The symptom would be that a device entry in the blacklist might not be matched properly if it contained an 8-character vendor name or a 16-character model name. Such entries certainly exist in scsi_static_device_list. 
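The failure mode is easy to reproduce in isolation. A self-contained sketch with an 8-byte vendor field (illustrative struct, not the real scsi_dev_info_list definitions):

    #include <stdio.h>
    #include <string.h>

    struct devinfo { char vendor[8]; }; /* max-length field, NUL not guaranteed */

    static int vendor_matches(const struct devinfo *d, const char *skip, size_t vmax)
    {
        if (memcmp(d->vendor, skip, vmax))
            return 0;
        /* Guard before probing for '\0': with vmax == sizeof(d->vendor),
         * an unguarded d->vendor[vmax] reads one byte past the field. */
        if (vmax < sizeof(d->vendor) && d->vendor[vmax])
            return 0;
        return 1;
    }

    int main(void)
    {
        struct devinfo d;

        memcpy(d.vendor, "LONGNAME", 8); /* exactly 8 characters, no NUL */
        printf("%d\n", vendor_matches(&d, "LONGNAME", 8)); /* prints 1 */
        return 0;
    }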
This patch fixes the problem by adding a check for a maximum-length string before the '\0' test. Signed-off-by: Alan Stern Fixes: b704f70ce200 ("SCSI: fix bug in scsi_dev_info_list matching") Tested-by: Wilfried Klaebe Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/scsi_devinfo.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/scsi_devinfo.c b/drivers/scsi/scsi_devinfo.c index 93cbefa75b26..11cdb172cfaf 100644 --- a/drivers/scsi/scsi_devinfo.c +++ b/drivers/scsi/scsi_devinfo.c @@ -426,7 +426,7 @@ static struct scsi_dev_info_list *scsi_dev_info_list_find(const char *vendor, * here, and we don't know what device it is * trying to work with, leave it as-is. */ - vmax = 8; /* max length of vendor */ + vmax = sizeof(devinfo->vendor); vskip = vendor; while (vmax > 0 && *vskip == ' ') { vmax--; @@ -436,7 +436,7 @@ static struct scsi_dev_info_list *scsi_dev_info_list_find(const char *vendor, while (vmax > 0 && vskip[vmax - 1] == ' ') --vmax; - mmax = 16; /* max length of model */ + mmax = sizeof(devinfo->model); mskip = model; while (mmax > 0 && *mskip == ' ') { mmax--; @@ -452,10 +452,12 @@ static struct scsi_dev_info_list *scsi_dev_info_list_find(const char *vendor, * Behave like the older version of get_device_flags. */ if (memcmp(devinfo->vendor, vskip, vmax) || - devinfo->vendor[vmax]) + (vmax < sizeof(devinfo->vendor) && + devinfo->vendor[vmax])) continue; if (memcmp(devinfo->model, mskip, mmax) || - devinfo->model[mmax]) + (mmax < sizeof(devinfo->model) && + devinfo->model[mmax])) continue; return devinfo; } else { From 87271783380afbd50d13333fadb1e3a93017d5da Mon Sep 17 00:00:00 2001 From: Brian King Date: Mon, 27 Jun 2016 09:09:40 -0500 Subject: [PATCH 058/813] ipr: Clear interrupt on croc/crocodile when running with LSI commit 54e430bbd490e18ab116afa4cd90dcc45787b3df upstream. If we fall back to using LSI on the Croc or Crocodile chip we need to clear the interrupt so we don't hang the system. Tested-by: Benjamin Herrenschmidt Signed-off-by: Brian King Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/ipr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index 43ac62623bf2..7a58128a0000 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -10095,6 +10095,7 @@ static int ipr_probe_ioa(struct pci_dev *pdev, ioa_cfg->intr_flag = IPR_USE_MSI; else { ioa_cfg->intr_flag = IPR_USE_LSI; + ioa_cfg->clear_isr = 1; ioa_cfg->nvectors = 1; dev_info(&pdev->dev, "Cannot enable MSI.\n"); } From d863bec646a590584eabcb40550bff0708c26b0d Mon Sep 17 00:00:00 2001 From: James Patrick-Evans Date: Fri, 15 Jul 2016 16:40:45 +0100 Subject: [PATCH 059/813] media: fix airspy usb probe error path commit aa93d1fee85c890a34f2510a310e55ee76a27848 upstream. Fix a memory leak on probe error of the airspy usb device driver. The problem is triggered when more than 64 usb devices register with v4l2 of type VFL_TYPE_SDR or VFL_TYPE_SUBDEV. The memory leak is caused by the probe function of the airspy driver mishandling errors and not freeing the corresponding control structures when an error occurs registering the device with the v4l2 core. A BadUSB device can emulate 64 of these devices, and then, through continual emulated connect/disconnect of the 65th device, cause the kernel to run out of RAM and crash, thus causing a local DoS vulnerability.
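The underlying pattern is the usual goto-based unwind in a probe function: each failure label must release everything set up before the failing step. A minimal user-space sketch of that pattern (hypothetical names, not the airspy code):

    #include <stdio.h>
    #include <stdlib.h>

    struct dev { int *controls; };

    static int register_video(void) { return -1; /* simulate failure */ }

    static int probe(struct dev *d)
    {
        int ret;

        d->controls = malloc(sizeof(*d->controls));
        if (!d->controls)
            return -1;

        ret = register_video();
        if (ret)
            goto err_free_controls; /* must unwind the controls too */
        return 0;

    err_free_controls:
        free(d->controls); /* the release the buggy path skipped */
        d->controls = NULL;
        return ret;
    }

    int main(void)
    {
        struct dev d;

        printf("probe: %d\n", probe(&d));
        return 0;
    }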
Fixes CVE-2016-5400 Signed-off-by: James Patrick-Evans Reviewed-by: Kees Cook Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- drivers/media/usb/airspy/airspy.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/media/usb/airspy/airspy.c b/drivers/media/usb/airspy/airspy.c index 565a59310747..34b35ebd60ac 100644 --- a/drivers/media/usb/airspy/airspy.c +++ b/drivers/media/usb/airspy/airspy.c @@ -1073,7 +1073,7 @@ static int airspy_probe(struct usb_interface *intf, if (ret) { dev_err(s->dev, "Failed to register as video device (%d)\n", ret); - goto err_unregister_v4l2_dev; + goto err_free_controls; } dev_info(s->dev, "Registered as %s\n", video_device_node_name(&s->vdev)); @@ -1082,7 +1082,6 @@ static int airspy_probe(struct usb_interface *intf, err_free_controls: v4l2_ctrl_handler_free(&s->hdl); -err_unregister_v4l2_dev: v4l2_device_unregister(&s->v4l2_dev); err_free_mem: kfree(s); From 470f47fcf2a5f9c22081c1c4708e6948e3c2dc13 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 8 Jul 2016 01:39:11 +0300 Subject: [PATCH 060/813] posix_cpu_timer: Exit early when process has been reaped commit 2c13ce8f6b2f6fd9ba2f9261b1939fc0f62d1307 upstream. Variable "now" seems to be genuinely used uninitialized if the branch if (CPUCLOCK_PERTHREAD(timer->it_clock)) { is not taken and the branch if (unlikely(sighand == NULL)) { is taken. In this case the process has been reaped and the timer is marked as disarmed anyway. So none of the postprocessing of the sample is required. Return right away. Signed-off-by: Alexey Dobriyan Link: http://lkml.kernel.org/r/20160707223911.GA26483@p183.telecom.by Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/time/posix-cpu-timers.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index f5e86d282d52..80016b329d94 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -808,6 +808,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) timer->it.cpu.expires = 0; sample_to_timespec(timer->it_clock, timer->it.cpu.expires, &itp->it_value); + return; } else { cpu_timer_sample_group(timer->it_clock, p, &now); unlock_task_sighand(p, &flags); From 1b0b5ca8f498a29c8646a3fd3bd5accbb8f8a156 Mon Sep 17 00:00:00 2001 From: Lukasz Gemborowski Date: Mon, 27 Jun 2016 12:57:47 +0200 Subject: [PATCH 061/813] i2c: mux: reg: wrong condition checked for of_address_to_resource return value commit 22ebf00eb56fe77922de8138aa9af9996582c2b3 upstream.
of_address_to_resource() returns 0 on a successful call, but devm_ioremap_resource() is called only if it returns a non-zero value. Signed-off-by: Lukasz Gemborowski Reviewed-by: Alexander Sverdlin Signed-off-by: Wolfram Sang Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/muxes/i2c-mux-reg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/muxes/i2c-mux-reg.c b/drivers/i2c/muxes/i2c-mux-reg.c index 5fbd5bd0878f..49fc2c7e560a 100644 --- a/drivers/i2c/muxes/i2c-mux-reg.c +++ b/drivers/i2c/muxes/i2c-mux-reg.c @@ -150,7 +150,7 @@ static int i2c_mux_reg_probe_dt(struct regmux *mux, mux->data.idle_in_use = true; /* map address from "reg" if exists */ - if (of_address_to_resource(np, 0, &res)) { + if (of_address_to_resource(np, 0, &res) == 0) { mux->data.reg_size = resource_size(&res); mux->data.reg = devm_ioremap_resource(&pdev->dev, &res); if (IS_ERR(mux->data.reg)) From 79cc80f89c4219fc03644c3b30602184a44fb54e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 18 Jul 2016 18:40:00 -0400 Subject: [PATCH 062/813] libata: LITE-ON CX1-JB256-HP needs lower max_sectors commit 1488a1e3828d60d74c9b802a05e24c0487babe4e upstream. Since 34b48db66e08 ("block: remove artifical max_hw_sectors cap"), max_sectors is no longer limited to BLK_DEF_MAX_SECTORS and LITE-ON CX1-JB256-HP keeps timing out with higher max_sectors. Revert it to the previous value. Signed-off-by: Tejun Heo Reported-by: dgerasimov@gmail.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=121671 Fixes: 34b48db66e08 ("block: remove artifical max_hw_sectors cap") Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- drivers/ata/libata-core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index b79cb10e289e..bd370c98f77d 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4138,6 +4138,12 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { */ { "ST380013AS", "3.20", ATA_HORKAGE_MAX_SEC_1024 }, + /* + * Device times out with higher max sects. + * https://bugzilla.kernel.org/show_bug.cgi?id=121671 + */ + { "LITEON CX1-JB256-HP", NULL, ATA_HORKAGE_MAX_SEC_1024 }, + /* Devices we expect to fail diagnostics */ /* Devices where NCQ should be avoided */ From 032951d32c13b7564dfba82758260cb7aa1149d2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 19 Jul 2016 03:50:28 +0200 Subject: [PATCH 063/813] libceph: apply new_state before new_up_client on incrementals commit 930c532869774ebf8af9efe9484c597f896a7d46 upstream. Currently, osd_weight and osd_state fields are updated in the encoding order. This is wrong, because an incremental map may look like e.g. new_up_client: { osd=6, addr=... } # set osd_state and addr new_state: { osd=6, xorstate=EXISTS } # clear osd_state Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After applying new_up_client, osd_state is changed to EXISTS | UP. Carrying on with the new_state update, we flip EXISTS and leave osd6 in a weird "!EXISTS but UP" state.
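The flag arithmetic behind that state can be verified with a few lines of C (the constants are stand-ins for the CEPH_OSD_* flag values, chosen only for illustration):

    #include <stdio.h>

    #define EXISTS 0x1
    #define UP     0x2

    int main(void)
    {
        unsigned int state = EXISTS; /* osd6 exists but is down */

        state |= EXISTS | UP;        /* new_up_client applied first */
        state ^= EXISTS;             /* new_state: xorstate=EXISTS */
        printf("exists=%d up=%d\n", !!(state & EXISTS), !!(state & UP));
        /* prints exists=0 up=1 -- the "!EXISTS but UP" state */
        return 0;
    }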
A non-existent OSD is considered down by the mapping code 2087 for (i = 0; i < pg->pg_temp.len; i++) { 2088 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { 2089 if (ceph_can_shift_osds(pi)) 2090 continue; 2091 2092 temp->osds[temp->size++] = CRUSH_ITEM_NONE; and so requests get directed to the second OSD in the set instead of the first, resulting in OSD-side errors like: [WRN] : client.4239 192.168.122.21:0/2444980242 misdirected client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680 and hung rbds on the client: [ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0) [ 493.566805] rbd: rbd0: result -6 xferred 400000 [ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688 The fix is to decouple application from the decoding and: - apply new_weight first - apply new_state before new_up_client - twiddle osd_state flags if marking in - clear out some of the state if osd is destroyed Fixes: http://tracker.ceph.com/issues/14901 Signed-off-by: Ilya Dryomov Reviewed-by: Josh Durgin Signed-off-by: Greg Kroah-Hartman --- net/ceph/osdmap.c | 156 +++++++++++++++++++++++++++++++++------------- 1 file changed, 113 insertions(+), 43 deletions(-) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 7d8f581d9f1f..ddc3573894b0 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1191,6 +1191,115 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) return map; } +/* + * Encoding order is (new_up_client, new_state, new_weight). Need to + * apply in the (new_weight, new_state, new_up_client) order, because + * an incremental map may look like e.g. + * + * new_up_client: { osd=6, addr=... } # set osd_state and addr + * new_state: { osd=6, xorstate=EXISTS } # clear osd_state + */ +static int decode_new_up_state_weight(void **p, void *end, + struct ceph_osdmap *map) +{ + void *new_up_client; + void *new_state; + void *new_weight_end; + u32 len; + + new_up_client = *p; + ceph_decode_32_safe(p, end, len, e_inval); + len *= sizeof(u32) + sizeof(struct ceph_entity_addr); + ceph_decode_need(p, end, len, e_inval); + *p += len; + + new_state = *p; + ceph_decode_32_safe(p, end, len, e_inval); + len *= sizeof(u32) + sizeof(u8); + ceph_decode_need(p, end, len, e_inval); + *p += len; + + /* new_weight */ + ceph_decode_32_safe(p, end, len, e_inval); + while (len--) { + s32 osd; + u32 w; + + ceph_decode_need(p, end, 2*sizeof(u32), e_inval); + osd = ceph_decode_32(p); + w = ceph_decode_32(p); + BUG_ON(osd >= map->max_osd); + pr_info("osd%d weight 0x%x %s\n", osd, w, + w == CEPH_OSD_IN ? "(in)" : + (w == CEPH_OSD_OUT ? "(out)" : "")); + map->osd_weight[osd] = w; + + /* + * If we are marking in, set the EXISTS, and clear the + * AUTOOUT and NEW bits. 
+ */ + if (w) { + map->osd_state[osd] |= CEPH_OSD_EXISTS; + map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT | + CEPH_OSD_NEW); + } + } + new_weight_end = *p; + + /* new_state (up/down) */ + *p = new_state; + len = ceph_decode_32(p); + while (len--) { + s32 osd; + u8 xorstate; + int ret; + + osd = ceph_decode_32(p); + xorstate = ceph_decode_8(p); + if (xorstate == 0) + xorstate = CEPH_OSD_UP; + BUG_ON(osd >= map->max_osd); + if ((map->osd_state[osd] & CEPH_OSD_UP) && + (xorstate & CEPH_OSD_UP)) + pr_info("osd%d down\n", osd); + if ((map->osd_state[osd] & CEPH_OSD_EXISTS) && + (xorstate & CEPH_OSD_EXISTS)) { + pr_info("osd%d does not exist\n", osd); + map->osd_weight[osd] = CEPH_OSD_IN; + ret = set_primary_affinity(map, osd, + CEPH_OSD_DEFAULT_PRIMARY_AFFINITY); + if (ret) + return ret; + memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr)); + map->osd_state[osd] = 0; + } else { + map->osd_state[osd] ^= xorstate; + } + } + + /* new_up_client */ + *p = new_up_client; + len = ceph_decode_32(p); + while (len--) { + s32 osd; + struct ceph_entity_addr addr; + + osd = ceph_decode_32(p); + ceph_decode_copy(p, &addr, sizeof(addr)); + ceph_decode_addr(&addr); + BUG_ON(osd >= map->max_osd); + pr_info("osd%d up\n", osd); + map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP; + map->osd_addr[osd] = addr; + } + + *p = new_weight_end; + return 0; + +e_inval: + return -EINVAL; +} + /* * decode and apply an incremental map update. */ @@ -1290,49 +1399,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, __remove_pg_pool(&map->pg_pools, pi); } - /* new_up */ - ceph_decode_32_safe(p, end, len, e_inval); - while (len--) { - u32 osd; - struct ceph_entity_addr addr; - ceph_decode_32_safe(p, end, osd, e_inval); - ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval); - ceph_decode_addr(&addr); - pr_info("osd%d up\n", osd); - BUG_ON(osd >= map->max_osd); - map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS; - map->osd_addr[osd] = addr; - } - - /* new_state */ - ceph_decode_32_safe(p, end, len, e_inval); - while (len--) { - u32 osd; - u8 xorstate; - ceph_decode_32_safe(p, end, osd, e_inval); - xorstate = **(u8 **)p; - (*p)++; /* clean flag */ - if (xorstate == 0) - xorstate = CEPH_OSD_UP; - if (xorstate & CEPH_OSD_UP) - pr_info("osd%d down\n", osd); - if (osd < map->max_osd) - map->osd_state[osd] ^= xorstate; - } - - /* new_weight */ - ceph_decode_32_safe(p, end, len, e_inval); - while (len--) { - u32 osd, off; - ceph_decode_need(p, end, sizeof(u32)*2, e_inval); - osd = ceph_decode_32(p); - off = ceph_decode_32(p); - pr_info("osd%d weight 0x%x %s\n", osd, off, - off == CEPH_OSD_IN ? "(in)" : - (off == CEPH_OSD_OUT ? "(out)" : "")); - if (osd < map->max_osd) - map->osd_weight[osd] = off; - } + /* new_up_client, new_state, new_weight */ + err = decode_new_up_state_weight(p, end, map); + if (err) + goto bad; /* new_pg_temp */ err = decode_new_pg_temp(p, end, map); From 703cfaf375e83159d2113774faa53d2c68c86d67 Mon Sep 17 00:00:00 2001 From: Dmitri Epshtein Date: Wed, 6 Jul 2016 04:18:58 +0200 Subject: [PATCH 064/813] net: mvneta: set real interrupt per packet for tx_done commit 06708f81528725148473c0869d6af5f809c6824b upstream. Commit aebea2ba0f74 ("net: mvneta: fix Tx interrupt delay") intended to set coalescing threshold to a value guaranteeing interrupt generation per each sent packet, so that buffers can be released with no delay. In fact setting threshold to '1' was wrong, because it causes interrupt every two packets. 
According to the documentation, the reason is the following: an interrupt occurs once the sent-buffers counter reaches a value higher than the one specified in MVNETA_TXQ_SIZE_REG(q). This behavior was confirmed during tests. Also when testing the SoC working as a NAS device, better performance was observed with int-per-packet, as it strongly depends on the fact that all transmitted packets are released immediately. This commit makes the NETA controller work in interrupt-per-sent-packet mode by setting the coalescing threshold to 0. Signed-off-by: Dmitri Epshtein Signed-off-by: Marcin Wojtas Fixes: aebea2ba0f74 ("net: mvneta: fix Tx interrupt delay") Acked-by: Willy Tarreau Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/marvell/mvneta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index a4ac6fedac75..71ec9cb08e06 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -226,7 +226,7 @@ /* Various constants */ /* Coalescing */ -#define MVNETA_TXDONE_COAL_PKTS 1 +#define MVNETA_TXDONE_COAL_PKTS 0 /* interrupt per packet */ #define MVNETA_RX_COAL_PKTS 32 #define MVNETA_RX_COAL_USEC 100 From c800964923a365289152304bdc047f0d470dbcab Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 28 Jun 2016 18:55:23 +0300 Subject: [PATCH 065/813] intel_th: pci: Add Kaby Lake PCH-H support commit 7a1a47ce35821b40f5b2ce46379ba14393bc3873 upstream. This adds Intel(R) Trace Hub PCI ID for Kaby Lake PCH-H. Signed-off-by: Alexander Shishkin Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/intel_th/pci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c index 641e87936064..d57a2f75dccf 100644 --- a/drivers/hwtracing/intel_th/pci.c +++ b/drivers/hwtracing/intel_th/pci.c @@ -67,6 +67,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xa126), .driver_data = (kernel_ulong_t)0, }, + { + /* Kaby Lake PCH-H */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xa2a6), + .driver_data = (kernel_ulong_t)0, + }, { 0 }, }; From 33f9cff6ec2fbfcf0b40d4328d292c745185fdf4 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 30 Jun 2016 11:51:44 +0300 Subject: [PATCH 066/813] intel_th: Fix a deadlock in modprobing commit a36aa80f3cb2540fb1dbad6240852de4365a2e82 upstream. Driver initialization tries to request a hub (GTH) driver module from its probe callback, resulting in a deadlock. This patch solves the problem by adding a deferred work for requesting the hub module.
Signed-off-by: Alexander Shishkin Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/intel_th/core.c | 35 ++++++++++++++++++++++++++- drivers/hwtracing/intel_th/intel_th.h | 3 +++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/drivers/hwtracing/intel_th/core.c b/drivers/hwtracing/intel_th/core.c index 165d3001c301..c6ec5c62b7a9 100644 --- a/drivers/hwtracing/intel_th/core.c +++ b/drivers/hwtracing/intel_th/core.c @@ -419,6 +419,38 @@ static struct intel_th_subdevice { }, }; +#ifdef CONFIG_MODULES +static void __intel_th_request_hub_module(struct work_struct *work) +{ + struct intel_th *th = container_of(work, struct intel_th, + request_module_work); + + request_module("intel_th_%s", th->hub->name); +} + +static int intel_th_request_hub_module(struct intel_th *th) +{ + INIT_WORK(&th->request_module_work, __intel_th_request_hub_module); + schedule_work(&th->request_module_work); + + return 0; +} + +static void intel_th_request_hub_module_flush(struct intel_th *th) +{ + flush_work(&th->request_module_work); +} +#else +static inline int intel_th_request_hub_module(struct intel_th *th) +{ + return -EINVAL; +} + +static inline void intel_th_request_hub_module_flush(struct intel_th *th) +{ +} +#endif /* CONFIG_MODULES */ + static int intel_th_populate(struct intel_th *th, struct resource *devres, unsigned int ndevres, int irq) { @@ -488,7 +520,7 @@ static int intel_th_populate(struct intel_th *th, struct resource *devres, /* need switch driver to be loaded to enumerate the rest */ if (subdev->type == INTEL_TH_SWITCH && !req) { th->hub = thdev; - err = request_module("intel_th_%s", subdev->name); + err = intel_th_request_hub_module(th); if (!err) req++; } @@ -603,6 +635,7 @@ void intel_th_free(struct intel_th *th) { int i; + intel_th_request_hub_module_flush(th); for (i = 0; i < TH_SUBDEVICE_MAX; i++) if (th->thdev[i] != th->hub) intel_th_device_remove(th->thdev[i]); diff --git a/drivers/hwtracing/intel_th/intel_th.h b/drivers/hwtracing/intel_th/intel_th.h index 57fd72b20fae..d03a6cd1c65d 100644 --- a/drivers/hwtracing/intel_th/intel_th.h +++ b/drivers/hwtracing/intel_th/intel_th.h @@ -197,6 +197,9 @@ struct intel_th { int id; int major; +#ifdef CONFIG_MODULES + struct work_struct request_module_work; +#endif /* CONFIG_MODULES */ #ifdef CONFIG_INTEL_TH_DEBUG struct dentry *dbg; #endif From 8e510cd92199e863bd457f6b56ad85e00dfb3cb3 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 3 Aug 2016 13:44:27 +0200 Subject: [PATCH 067/813] vfs: fix deadlock in file_remove_privs() on overlayfs commit c1892c37769cf89c7e7ba57528ae2ccb5d153c9b upstream. file_remove_privs() is called with inode lock on file_inode(), which proceeds to calling notify_change() on file->f_path.dentry. Which triggers the WARN_ON_ONCE(!inode_is_locked(inode)) in addition to deadlocking later when ovl_setattr tries to lock the underlying inode again. Fix this mess by not mixing the layers, but doing everything on underlying dentry/inode. 
Signed-off-by: Miklos Szeredi Fixes: 07a2daab49c5 ("ovl: Copy up underlying inode's ->i_mode to overlay inode") Signed-off-by: Greg Kroah-Hartman --- fs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 1be5f9003eb3..b0edef500590 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1733,8 +1733,8 @@ static int __remove_privs(struct dentry *dentry, int kill) */ int file_remove_privs(struct file *file) { - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = d_inode(dentry); + struct dentry *dentry = file_dentry(file); + struct inode *inode = file_inode(file); int kill; int error = 0; @@ -1742,7 +1742,7 @@ int file_remove_privs(struct file *file) if (IS_NOSEC(inode)) return 0; - kill = file_needs_remove_privs(file); + kill = dentry_needs_remove_privs(dentry); if (kill < 0) return kill; if (kill) From 133cec911c639d2cdf544ed602442951f702e08c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 10 Aug 2016 11:49:43 +0200 Subject: [PATCH 068/813] Linux 4.4.17 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index da7621cadc8e..76d34f763a41 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 16 +SUBLEVEL = 17 EXTRAVERSION = NAME = Blurry Fish Butt From 656ab59892afdd4808602b9d2725d40d19f62d95 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Jul 2016 16:15:14 +0200 Subject: [PATCH 069/813] arm64: vmlinux.lds: make __rela_offset and __dynsym_offset ABSOLUTE Due to the untyped KIMAGE_VADDR constant, the linker may not notice that the __rela_offset and __dynsym_offset expressions are absolute values (i.e., are not subject to relocation). This does not matter for KASLR, but it does confuse kallsyms in relative mode, since it uses the lowest non-absolute symbol address as the anchor point, and expects all other symbol addresses to be within 4 GB of it. Fix this by qualifying these expressions as ABSOLUTE() explicitly. Fixes: 0cd3defe0af4 ("arm64: kernel: perform relocation processing from ID map") Cc: Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon (cherry picked from commit d6732fc402c2665f61e72faf206a0268e65236e9) Signed-off-by: Ard Biesheuvel --- arch/arm64/kernel/vmlinux.lds.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index ab2c6df55a0a..63f6607ba842 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -168,9 +168,9 @@ SECTIONS *(.hash) } - __rela_offset = ADDR(.rela) - KIMAGE_VADDR; + __rela_offset = ABSOLUTE(ADDR(.rela) - KIMAGE_VADDR); __rela_size = SIZEOF(.rela); - __dynsym_offset = ADDR(.dynsym) - KIMAGE_VADDR; + __dynsym_offset = ABSOLUTE(ADDR(.dynsym) - KIMAGE_VADDR); . = ALIGN(SEGMENT_ALIGN); __init_end = .; From f56fa8cba132fbf27e2339d52679cb8c48dabc50 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 24 Jul 2016 14:00:13 +0200 Subject: [PATCH 070/813] arm64: relocatable: suppress R_AARCH64_ABS64 relocations in vmlinux The linker routines that we rely on to produce a relocatable PIE binary treat it as a shared ELF object in some ways, i.e., it emits symbol based R_AARCH64_ABS64 relocations into the final binary since doing so would be appropriate when linking a shared library that is subject to symbol preemption. 
(This means that an executable can override certain symbols that are exported by a shared library it is linked with, and that the shared library *must* update all its internal references as well, and point them to the version provided by the executable.) Symbol preemption does not occur for OS hosted PIE executables, let alone for vmlinux, and so we would prefer to get rid of these symbol based relocations. This would allow us to simplify the relocation routines, and to strip the .dynsym, .dynstr and .hash sections from the binary. (Note that these are tiny, and are placed in the .init segment, but they clutter up the vmlinux binary.) Note that these R_AARCH64_ABS64 relocations are only emitted for absolute references to symbols defined in the linker script, all other relocatable quantities are covered by anonymous R_AARCH64_RELATIVE relocations that simply list the offsets to all 64-bit values in the binary that need to be fixed up based on the offset between the link time and run time addresses. Fortunately, GNU ld has a -Bsymbolic option, which is intended for shared libraries to allow them to ignore symbol preemption, and unconditionally bind all internal symbol references to their own definitions. So set it for our PIE binary as well, and get rid of the associated sections and the relocation code that processes them. Signed-off-by: Ard Biesheuvel [will: fixed conflict with __dynsym_offset linker script entry] Signed-off-by: Will Deacon (cherry picked from commit 08cc55b2afd97a654f71b3bebf8bb0ec89fdc498) Signed-off-by: Ard Biesheuvel --- arch/arm64/Makefile | 2 +- arch/arm64/kernel/head.S | 21 +++------------------ arch/arm64/kernel/vmlinux.lds.S | 11 +---------- 3 files changed, 5 insertions(+), 29 deletions(-) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 304dcc3da06f..0a9bf4500852 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -16,7 +16,7 @@ OBJCOPYFLAGS :=-O binary -R .note -R .note.gnu.build-id -R .comment -S GZFLAGS :=-9 ifneq ($(CONFIG_RELOCATABLE),) -LDFLAGS_vmlinux += -pie +LDFLAGS_vmlinux += -pie -Bsymbolic endif KBUILD_DEFCONFIG := defconfig diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 491ad4124615..9890d04a96cb 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -717,40 +717,25 @@ __primary_switch: * Iterate over each entry in the relocation table, and apply the * relocations in place. */ - ldr w8, =__dynsym_offset // offset to symbol table ldr w9, =__rela_offset // offset to reloc table ldr w10, =__rela_size // size of reloc table mov_q x11, KIMAGE_VADDR // default virtual offset add x11, x11, x23 // actual virtual offset - add x8, x8, x11 // __va(.dynsym) add x9, x9, x11 // __va(.rela) add x10, x9, x10 // __va(.rela) + sizeof(.rela) 0: cmp x9, x10 - b.hs 2f + b.hs 1f ldp x11, x12, [x9], #24 ldr x13, [x9, #-8] cmp w12, #R_AARCH64_RELATIVE - b.ne 1f + b.ne 0b add x13, x13, x23 // relocate str x13, [x11, x23] b 0b -1: cmp w12, #R_AARCH64_ABS64 - b.ne 0b - add x12, x12, x12, lsl #1 // symtab offset: 24x top word - add x12, x8, x12, lsr #(32 - 3) // ... shifted into bottom word - ldrsh w14, [x12, #6] // Elf64_Sym::st_shndx - ldr x15, [x12, #8] // Elf64_Sym::st_value - cmp w14, #-0xf // SHN_ABS (0xfff1) ?
- add x14, x15, x23 // relocate - csel x15, x14, x15, ne - add x15, x13, x15 - str x15, [x11, x23] - b 0b - -2: +1: #endif ldr x8, =__primary_switched br x8 diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 63f6607ba842..ac925e54e7eb 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -93,6 +93,7 @@ SECTIONS *(.discard) *(.discard.*) *(.interp .dynamic) + *(.dynsym .dynstr .hash) } . = KIMAGE_VADDR + TEXT_OFFSET; @@ -158,19 +159,9 @@ SECTIONS .rela : ALIGN(8) { *(.rela .rela*) } - .dynsym : ALIGN(8) { - *(.dynsym) - } - .dynstr : { - *(.dynstr) - } - .hash : { - *(.hash) - } __rela_offset = ABSOLUTE(ADDR(.rela) - KIMAGE_VADDR); __rela_size = SIZEOF(.rela); - __dynsym_offset = ABSOLUTE(ADDR(.dynsym) - KIMAGE_VADDR); . = ALIGN(SEGMENT_ALIGN); __init_end = .; From 72c2d3bccaba4a0a4de354f9d2d24eccd05bfccf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 10 Jul 2016 10:04:02 +0200 Subject: [PATCH 071/813] tcp: make challenge acks less predictable [ Upstream commit 75ff39ccc1bd5d3c455b6822ab09e533c551f758 ] Yue Cao claims that current host rate limiting of challenge ACKS (RFC 5961) could leak enough information to allow a patient attacker to hijack TCP sessions. He will soon provide details in an academic paper. This patch increases the default limit from 100 to 1000, and adds some randomization so that the attacker can no longer hijack sessions without spending a considerable amount of probes. Based on initial analysis and patch from Linus. Note that we also have per socket rate limiting, so it is tempting to remove the host limit in the future. v2: randomize the count of challenge acks per second, not the period. Fixes: 282f23c6ee34 ("tcp: implement RFC 5961 3.2") Reported-by: Yue Cao Signed-off-by: Eric Dumazet Suggested-by: Linus Torvalds Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d4c51158470f..05f10df6ee86 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -89,7 +89,7 @@ int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); /* rfc5961 challenge ack rate limiting */ -int sysctl_tcp_challenge_ack_limit = 100; +int sysctl_tcp_challenge_ack_limit = 1000; int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; @@ -3427,7 +3427,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) static u32 challenge_timestamp; static unsigned int challenge_count; struct tcp_sock *tp = tcp_sk(sk); - u32 now; + u32 count, now; /* First check our per-socket dupack rate limit. */ if (tcp_oow_rate_limited(sock_net(sk), skb, @@ -3435,13 +3435,18 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) &tp->last_oow_ack_time)) return; - /* Then check the check host-wide RFC 5961 rate limit. */ + /* Then check host-wide RFC 5961 rate limit. 
*/ now = jiffies / HZ; if (now != challenge_timestamp) { + u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1; + challenge_timestamp = now; - challenge_count = 0; + WRITE_ONCE(challenge_count, half + + prandom_u32_max(sysctl_tcp_challenge_ack_limit)); } - if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { + count = READ_ONCE(challenge_count); + if (count > 0) { + WRITE_ONCE(challenge_count, count - 1); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); tcp_send_ack(sk); } From 5413f1a526d2d51d7a5768133c90936c017165c6 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 14 Jul 2016 11:38:40 -0400 Subject: [PATCH 072/813] tcp: enable per-socket rate limiting of all 'challenge acks' [ Upstream commit 083ae308280d13d187512b9babe3454342a7987e ] The per-socket rate limit for 'challenge acks' was introduced in the context of limiting ack loops: commit f2b2c582e824 ("tcp: mitigate ACK loops for connections as tcp_sock") And I think it can be extended to rate limit all 'challenge acks' on a per-socket basis. Since we have the global tcp_challenge_ack_limit, this patch allows for tcp_challenge_ack_limit to be set to a large value and effectively rely on the per-socket limit, or set tcp_challenge_ack_limit to a lower value and still prevents a single connections from consuming the entire challenge ack quota. It further moves in the direction of eliminating the global limit at some point, as Eric Dumazet has suggested. This a follow-up to: Subject: tcp: make challenge acks less predictable Cc: Eric Dumazet Cc: David S. Miller Cc: Neal Cardwell Cc: Yuchung Cheng Cc: Yue Cao Signed-off-by: Jason Baron Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 05f10df6ee86..12b98e257c5f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3390,6 +3390,23 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 return flag; } +static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, + u32 *last_oow_ack_time) +{ + if (*last_oow_ack_time) { + s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); + + if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { + NET_INC_STATS_BH(net, mib_idx); + return true; /* rate-limited: don't send yet! */ + } + } + + *last_oow_ack_time = tcp_time_stamp; + + return false; /* not rate-limited: go ahead, send dupack now! */ +} + /* Return true if we're currently rate-limiting out-of-window ACKs and * thus shouldn't send a dupack right now. We rate-limit dupacks in * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS @@ -3403,21 +3420,9 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, /* Data packets without SYNs are not likely part of an ACK loop. */ if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) && !tcp_hdr(skb)->syn) - goto not_rate_limited; + return false; - if (*last_oow_ack_time) { - s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); - - if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { - NET_INC_STATS_BH(net, mib_idx); - return true; /* rate-limited: don't send yet! */ - } - } - - *last_oow_ack_time = tcp_time_stamp; - -not_rate_limited: - return false; /* not rate-limited: go ahead, send dupack now! 
*/ + return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time); } /* RFC 5961 7 [ACK Throttling] */ @@ -3430,9 +3435,9 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) u32 count, now; /* First check our per-socket dupack rate limit. */ - if (tcp_oow_rate_limited(sock_net(sk), skb, - LINUX_MIB_TCPACKSKIPPEDCHALLENGE, - &tp->last_oow_ack_time)) + if (__tcp_oow_rate_limited(sock_net(sk), + LINUX_MIB_TCPACKSKIPPEDCHALLENGE, + &tp->last_oow_ack_time)) return; /* Then check host-wide RFC 5961 rate limit. */ From a9c221859696f976ba47ba39178af1175e4558e0 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 10 Jul 2016 21:11:55 +0300 Subject: [PATCH 073/813] ipv4: reject RTNH_F_DEAD and RTNH_F_LINKDOWN from user space [ Upstream commit 80610229ef7b26615dbb6cb6e873709a60bacc9f ] Vegard Nossum is reporting for a crash in fib_dump_info when nh_dev = NULL and fib_nhs == 1: Pid: 50, comm: netlink.exe Not tainted 4.7.0-rc5+ RIP: 0033:[<00000000602b3d18>] RSP: 0000000062623890 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 000000006261b800 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000024 RDI: 000000006245ba00 RBP: 00000000626238f0 R08: 000000000000029c R09: 0000000000000000 R10: 0000000062468038 R11: 000000006245ba00 R12: 000000006245ba00 R13: 00000000625f96c0 R14: 00000000601e16f0 R15: 0000000000000000 Kernel panic - not syncing: Kernel mode fault at addr 0x2e0, ip 0x602b3d18 CPU: 0 PID: 50 Comm: netlink.exe Not tainted 4.7.0-rc5+ #581 Stack: 626238f0 960226a02 00000400 000000fe 62623910 600afca7 62623970 62623a48 62468038 00000018 00000000 00000000 Call Trace: [<602b3e93>] rtmsg_fib+0xd3/0x190 [<602b6680>] fib_table_insert+0x260/0x500 [<602b0e5d>] inet_rtm_newroute+0x4d/0x60 [<60250def>] rtnetlink_rcv_msg+0x8f/0x270 [<60267079>] netlink_rcv_skb+0xc9/0xe0 [<60250d4b>] rtnetlink_rcv+0x3b/0x50 [<60265400>] netlink_unicast+0x1a0/0x2c0 [<60265e47>] netlink_sendmsg+0x3f7/0x470 [<6021dc9a>] sock_sendmsg+0x3a/0x90 [<6021e0d0>] ___sys_sendmsg+0x300/0x360 [<6021fa64>] __sys_sendmsg+0x54/0xa0 [<6021fac0>] SyS_sendmsg+0x10/0x20 [<6001ea68>] handle_syscall+0x88/0x90 [<600295fd>] userspace+0x3fd/0x500 [<6001ac55>] fork_handler+0x85/0x90 $ addr2line -e vmlinux -i 0x602b3d18 include/linux/inetdevice.h:222 net/ipv4/fib_semantics.c:1264 Problem happens when RTNH_F_LINKDOWN is provided from user space when creating routes that do not use the flag, catched with netlink fuzzer. Currently, the kernel allows user space to set both flags to nh_flags and fib_flags but this is not intentional, the assumption was that they are not set. Fix this by rejecting both flags with EINVAL. Reported-by: Vegard Nossum Fixes: 0eeb075fad73 ("net: ipv4 sysctl option to ignore routes when nexthop link is down") Signed-off-by: Julian Anastasov Cc: Andy Gospodarek Cc: Dinesh Dutt Cc: Scott Feldman Reviewed-by: Andy Gospodarek Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/fib_semantics.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 2b68418c7198..ffe95d954007 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -479,6 +479,9 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, if (!rtnh_ok(rtnh, remaining)) return -EINVAL; + if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) + return -EINVAL; + nexthop_nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; nexthop_nh->nh_oif = rtnh->rtnh_ifindex; @@ -1003,6 +1006,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (fib_props[cfg->fc_type].scope > cfg->fc_scope) goto err_inval; + if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) + goto err_inval; + #ifdef CONFIG_IP_ROUTE_MULTIPATH if (cfg->fc_mp) { nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len); From 0020fa536cc610216f80b798b9a1c9b13c3a37fb Mon Sep 17 00:00:00 2001 From: Beniamino Galvani Date: Wed, 13 Jul 2016 18:25:08 +0200 Subject: [PATCH 074/813] bonding: set carrier off for devices created through netlink [ Upstream commit 005db31d5f5f7c31cfdc43505d77eb3ca5cf8ec6 ] Commit e826eafa65c6 ("bonding: Call netif_carrier_off after register_netdevice") moved netif_carrier_off() from bond_init() to bond_create(), but the latter is called only for initial default devices and ones created through sysfs: $ modprobe bonding $ echo +bond1 > /sys/class/net/bonding_masters $ ip link add bond2 type bond $ grep "MII Status" /proc/net/bonding/* /proc/net/bonding/bond0:MII Status: down /proc/net/bonding/bond1:MII Status: down /proc/net/bonding/bond2:MII Status: up Ensure that carrier is initially off also for devices created through netlink. Signed-off-by: Beniamino Galvani Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/bonding/bond_netlink.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index db760e84119f..b8df0f5e8c25 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -446,7 +446,11 @@ static int bond_newlink(struct net *src_net, struct net_device *bond_dev, if (err < 0) return err; - return register_netdevice(bond_dev); + err = register_netdevice(bond_dev); + + netif_carrier_off(bond_dev); + + return err; } static size_t bond_get_size(const struct net_device *bond_dev) From 863c8bb8be39ad11f0d9d66a431b3d9ca5c11dd7 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 15 Jul 2016 15:42:52 -0700 Subject: [PATCH 075/813] net: bgmac: Fix infinite loop in bgmac_dma_tx_add() [ Upstream commit e86663c475d384ab5f46cb5637e9b7ad08c5c505 ] Nothing is decrementing the index "i" while we are cleaning up the fragments we could not successfully transmit. Fixes: 9cde94506eacf ("bgmac: implement scatter/gather support") Reported-by: coverity (CID 1352048) Signed-off-by: Florian Fainelli Signed-off-by: David S.
Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/broadcom/bgmac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c index 28f7610b03fe..c32f5d32f811 100644 --- a/drivers/net/ethernet/broadcom/bgmac.c +++ b/drivers/net/ethernet/broadcom/bgmac.c @@ -219,7 +219,7 @@ err_dma: dma_unmap_single(dma_dev, slot->dma_addr, skb_headlen(skb), DMA_TO_DEVICE); - while (i > 0) { + while (i-- > 0) { int index = (ring->end + i) % BGMAC_TX_RING_SLOTS; struct bgmac_slot_info *slot = &ring->slots[index]; u32 ctl1 = le32_to_cpu(ring->cpu_base[index].ctl1); From fc9b7c086b6743aa4b1a70ada58352c665ada49a Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sat, 23 Jul 2016 07:43:50 +0200 Subject: [PATCH 076/813] net/irda: fix NULL pointer dereference on memory allocation failure [ Upstream commit d3e6952cfb7ba5f4bfa29d4803ba91f96ce1204d ] I ran into this: kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 2 PID: 2012 Comm: trinity-c3 Not tainted 4.7.0-rc7+ #19 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 task: ffff8800b745f2c0 ti: ffff880111740000 task.ti: ffff880111740000 RIP: 0010:[] [] irttp_connect_request+0x36/0x710 RSP: 0018:ffff880111747bb8 EFLAGS: 00010286 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000069dd8358 RDX: 0000000000000009 RSI: 0000000000000027 RDI: 0000000000000048 RBP: ffff880111747c00 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000069dd8358 R11: 1ffffffff0759723 R12: 0000000000000000 R13: ffff88011a7e4780 R14: 0000000000000027 R15: 0000000000000000 FS: 00007fc738404700(0000) GS:ffff88011af00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fc737fdfb10 CR3: 0000000118087000 CR4: 00000000000006e0 Stack: 0000000000000200 ffff880111747bd8 ffffffff810ee611 ffff880119f1f220 ffff880119f1f4f8 ffff880119f1f4f0 ffff88011a7e4780 ffff880119f1f232 ffff880119f1f220 ffff880111747d58 ffffffff82bca542 0000000000000000 Call Trace: [] irda_connect+0x562/0x1190 [] SYSC_connect+0x202/0x2a0 [] SyS_connect+0x9/0x10 [] do_syscall_64+0x19c/0x410 [] entry_SYSCALL64_slow_path+0x25/0x25 Code: 41 89 ca 48 89 e5 41 57 41 56 41 55 41 54 41 89 d7 53 48 89 fb 48 83 c7 48 48 89 fa 41 89 f6 48 c1 ea 03 48 83 ec 20 4c 8b 65 10 <0f> b6 04 02 84 c0 74 08 84 c0 0f 8e 4c 04 00 00 80 7b 48 00 74 RIP [] irttp_connect_request+0x36/0x710 RSP ---[ end trace 4cda2588bc055b30 ]--- The problem is that irda_open_tsap() can fail and leave self->tsap = NULL, and then irttp_connect_request() almost immediately dereferences it. Cc: stable@vger.kernel.org Signed-off-by: Vegard Nossum Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/irda/af_irda.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 923abd6b3064..8d2f7c9b491d 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1024,8 +1024,11 @@ static int irda_connect(struct socket *sock, struct sockaddr *uaddr, } /* Check if we have opened a local TSAP */ - if (!self->tsap) - irda_open_tsap(self, LSAP_ANY, addr->sir_name); + if (!self->tsap) { + err = irda_open_tsap(self, LSAP_ANY, addr->sir_name); + if (err) + goto out; + } /* Move to connecting socket, start sending Connect Requests */ sock->state = SS_CONNECTING; From e23696bc441f5e4fefb18e81d51069632480f64a Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Mon, 25 Jul 2016 19:07:46 +0300 Subject: [PATCH 077/813] qed: Fix setting/clearing bit in completion bitmap [ Upstream commit 59d3f1ceb69b54569685d0c34dff16a1e0816b19 ] Slowpath completion handling is incorrectly changing SPQ_RING_SIZE bits instead of a single one. Fixes: 76a9a3642a0b ("qed: fix handling of concurrent ramrods") Signed-off-by: Manish Chopra Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/qlogic/qed/qed_spq.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_spq.c b/drivers/net/ethernet/qlogic/qed/qed_spq.c index 3dd548ab8df1..40365cb1abe6 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_spq.c +++ b/drivers/net/ethernet/qlogic/qed/qed_spq.c @@ -794,13 +794,12 @@ int qed_spq_completion(struct qed_hwfn *p_hwfn, * in a bitmap and increasing the chain consumer only * for the first successive completed entries. */ - bitmap_set(p_spq->p_comp_bitmap, pos, SPQ_RING_SIZE); + __set_bit(pos, p_spq->p_comp_bitmap); while (test_bit(p_spq->comp_bitmap_idx, p_spq->p_comp_bitmap)) { - bitmap_clear(p_spq->p_comp_bitmap, - p_spq->comp_bitmap_idx, - SPQ_RING_SIZE); + __clear_bit(p_spq->comp_bitmap_idx, + p_spq->p_comp_bitmap); p_spq->comp_bitmap_idx++; qed_chain_return_produced(&p_spq->chain); } From 9c946c931b63068c4197d9d0b4d24635418bc67d Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Fri, 29 Jul 2016 09:34:02 -0400 Subject: [PATCH 078/813] tcp: consider recv buf for the initial window scale [ Upstream commit f626300a3e776ccc9671b0dd94698fb3aa315966 ] tcp_select_initial_window() intends to advertise a window scaling for the maximum possible window size. To do so, it considers the maximum of net.ipv4.tcp_rmem[2] and net.core.rmem_max as the only possible upper-bounds. However, users with CAP_NET_ADMIN can use SO_RCVBUFFORCE to set the socket's receive buffer size to values larger than net.ipv4.tcp_rmem[2] and net.core.rmem_max. Thus, SO_RCVBUFFORCE is effectively ignored by tcp_select_initial_window(). To fix this, consider the maximum of net.ipv4.tcp_rmem[2], net.core.rmem_max and socket's initial buffer space. Fixes: b0573dea1fb3 ("[NET]: Introduce SO_{SND,RCV}BUFFORCE socket options") Signed-off-by: Soheil Hassas Yeganeh Suggested-by: Neal Cardwell Acked-by: Neal Cardwell Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7c9883ab56e5..660c967ba84a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -239,7 +239,8 @@ void tcp_select_initial_window(int __space, __u32 mss, /* Set window scaling on max possible window * See RFC1323 for an explanation of the limit to 14 */ - space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max); + space = max_t(u32, space, sysctl_tcp_rmem[2]); + space = max_t(u32, space, sysctl_rmem_max); space = min_t(u32, space, *window_clamp); while (space > 65535 && (*rcv_wscale) < 14) { space >>= 1; From 694dfd0ef02ded5b6fbea03a12350ee8a74921d5 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 31 May 2016 03:33:57 +0100 Subject: [PATCH 079/813] ipath: Restrict use of the write() interface Commit e6bd18f57aad ("IB/security: Restrict use of the write() interface") fixed a security problem with various write() implementations in the Infiniband subsystem. In older kernel versions the ipath_write() function has the same problem and needs the same restriction. (The ipath driver has been completely removed upstream.) Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rdma/ipath/ipath_file_ops.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/staging/rdma/ipath/ipath_file_ops.c b/drivers/staging/rdma/ipath/ipath_file_ops.c index 13c3cd11ab92..05d30f433b19 100644 --- a/drivers/staging/rdma/ipath/ipath_file_ops.c +++ b/drivers/staging/rdma/ipath/ipath_file_ops.c @@ -45,6 +45,8 @@ #include #include +#include + #include "ipath_kernel.h" #include "ipath_common.h" #include "ipath_user_sdma.h" @@ -2243,6 +2245,9 @@ static ssize_t ipath_write(struct file *fp, const char __user *data, ssize_t ret = 0; void *dest; + if (WARN_ON_ONCE(!ib_safe_file_access(fp))) + return -EACCES; + if (count < sizeof(cmd.type)) { ret = -EINVAL; goto bail; From 5a6f9d06d844763261f89850f33a4b84cfc0f1c1 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 1 Dec 2015 10:16:42 +0100 Subject: [PATCH 080/813] scsi: ignore errors from scsi_dh_add_device() commit 221255aee67ec1c752001080aafec0c4e9390d95 upstream. device handler initialisation might fail due to a number of reasons. But as device_handlers are optional this shouldn't cause us to disable the device entirely. So just ignore errors from scsi_dh_add_device(). Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Hannes Reinecke Signed-off-by: Martin K. 
Petersen Cc: Laura Abbott Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/scsi_sysfs.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index f7ae898833dd..7232d43e2207 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -1058,11 +1058,12 @@ int scsi_sysfs_add_sdev(struct scsi_device *sdev) } error = scsi_dh_add_device(sdev); - if (error) { + if (error) + /* + * device_handler is optional, so any error can be ignored + */ sdev_printk(KERN_INFO, sdev, "failed to add device handler: %d\n", error); - return error; - } device_enable_async_suspend(&sdev->sdev_dev); error = device_add(&sdev->sdev_dev); From 02170f4afcb4514270fcd39cec05650b7858c605 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Wed, 3 Feb 2016 01:00:29 +0100 Subject: [PATCH 081/813] PNP: Add Haswell-ULT to Intel MCH size workaround commit ed1f0eeebaeeb7f790e9e7642116a208581e5bfc upstream. Add device ID 0x0a04 for Haswell-ULT to the list of devices with MCH problems. From a Lenovo ThinkPad T440S: [ 0.188604] pnp: PnP ACPI init [ 0.189044] system 00:00: [mem 0x00000000-0x0009ffff] could not be reserved [ 0.189048] system 00:00: [mem 0x000c0000-0x000c3fff] could not be reserved [ 0.189050] system 00:00: [mem 0x000c4000-0x000c7fff] could not be reserved [ 0.189052] system 00:00: [mem 0x000c8000-0x000cbfff] could not be reserved [ 0.189054] system 00:00: [mem 0x000cc000-0x000cffff] could not be reserved [ 0.189056] system 00:00: [mem 0x000d0000-0x000d3fff] has been reserved [ 0.189058] system 00:00: [mem 0x000d4000-0x000d7fff] has been reserved [ 0.189060] system 00:00: [mem 0x000d8000-0x000dbfff] has been reserved [ 0.189061] system 00:00: [mem 0x000dc000-0x000dffff] has been reserved [ 0.189063] system 00:00: [mem 0x000e0000-0x000e3fff] could not be reserved [ 0.189065] system 00:00: [mem 0x000e4000-0x000e7fff] could not be reserved [ 0.189067] system 00:00: [mem 0x000e8000-0x000ebfff] could not be reserved [ 0.189069] system 00:00: [mem 0x000ec000-0x000effff] could not be reserved [ 0.189071] system 00:00: [mem 0x000f0000-0x000fffff] could not be reserved [ 0.189073] system 00:00: [mem 0x00100000-0xdf9fffff] could not be reserved [ 0.189075] system 00:00: [mem 0xfec00000-0xfed3ffff] could not be reserved [ 0.189078] system 00:00: [mem 0xfed4c000-0xffffffff] could not be reserved [ 0.189082] system 00:00: Plug and Play ACPI device, IDs PNP0c01 (active) [ 0.189216] system 00:01: [io 0x1800-0x189f] could not be reserved [ 0.189220] system 00:01: [io 0x0800-0x087f] has been reserved [ 0.189222] system 00:01: [io 0x0880-0x08ff] has been reserved [ 0.189224] system 00:01: [io 0x0900-0x097f] has been reserved [ 0.189226] system 00:01: [io 0x0980-0x09ff] has been reserved [ 0.189229] system 00:01: [io 0x0a00-0x0a7f] has been reserved [ 0.189231] system 00:01: [io 0x0a80-0x0aff] has been reserved [ 0.189233] system 00:01: [io 0x0b00-0x0b7f] has been reserved [ 0.189235] system 00:01: [io 0x0b80-0x0bff] has been reserved [ 0.189238] system 00:01: [io 0x15e0-0x15ef] has been reserved [ 0.189240] system 00:01: [io 0x1600-0x167f] has been reserved [ 0.189242] system 00:01: [io 0x1640-0x165f] has been reserved [ 0.189246] system 00:01: [mem 0xf8000000-0xfbffffff] could not be reserved [ 0.189249] system 00:01: [mem 0x00000000-0x00000fff] could not be reserved [ 0.189251] system 00:01: [mem 0xfed1c000-0xfed1ffff] has been reserved [ 0.189254] system 00:01: [mem 0xfed10000-0xfed13fff] has been reserved [ 0.189256] system 00:01: [mem 
0xfed18000-0xfed18fff] has been reserved [ 0.189258] system 00:01: [mem 0xfed19000-0xfed19fff] has been reserved [ 0.189261] system 00:01: [mem 0xfed45000-0xfed4bfff] has been reserved [ 0.189264] system 00:01: Plug and Play ACPI device, IDs PNP0c02 (active) [....] [ 0.583653] resource sanity check: requesting [mem 0xfed10000-0xfed15fff], which spans more than pnp 00:01 [mem 0xfed10000-0xfed13fff] [ 0.583654] ------------[ cut here ]------------ [ 0.583660] WARNING: CPU: 0 PID: 1 at arch/x86/mm/ioremap.c:198 __ioremap_caller+0x2c5/0x380() [ 0.583661] Info: mapping multiple BARs. Your kernel is fine. [ 0.583662] Modules linked in: [ 0.583666] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.3.3-303.fc23.x86_64 #1 [ 0.583668] Hardware name: LENOVO 20AR001GXS/20AR001GXS, BIOS GJET86WW (2.36 ) 12/04/2015 [ 0.583670] 0000000000000000 0000000014cf7e59 ffff880214a1baf8 ffffffff813a625f [ 0.583673] ffff880214a1bb40 ffff880214a1bb30 ffffffff810a07c2 00000000fed10000 [ 0.583675] ffffc90000cb8000 0000000000006000 0000000000000000 ffff8800d6381040 [ 0.583678] Call Trace: [ 0.583683] [] dump_stack+0x44/0x55 [ 0.583686] [] warn_slowpath_common+0x82/0xc0 [ 0.583688] [] warn_slowpath_fmt+0x5c/0x80 [ 0.583692] [] ? iomem_map_sanity_check+0xba/0xd0 [ 0.583695] [] __ioremap_caller+0x2c5/0x380 [ 0.583698] [] ioremap_nocache+0x17/0x20 [ 0.583701] [] snb_uncore_imc_init_box+0x79/0xb0 [ 0.583705] [] uncore_pci_probe+0xd0/0x1b0 [ 0.583707] [] local_pci_probe+0x45/0xa0 [ 0.583710] [] pci_device_probe+0xfd/0x140 [ 0.583713] [] driver_probe_device+0x222/0x480 [ 0.583715] [] __driver_attach+0x84/0x90 [ 0.583717] [] ? driver_probe_device+0x480/0x480 [ 0.583720] [] bus_for_each_dev+0x6c/0xc0 [ 0.583722] [] driver_attach+0x1e/0x20 [ 0.583724] [] bus_add_driver+0x1eb/0x280 [ 0.583727] [] ? uncore_cpu_setup+0x12/0x12 [ 0.583729] [] driver_register+0x60/0xe0 [ 0.583733] [] __pci_register_driver+0x4c/0x50 [ 0.583736] [] intel_uncore_init+0xe2/0x2e6 [ 0.583738] [] ? uncore_cpu_setup+0x12/0x12 [ 0.583741] [] do_one_initcall+0xb3/0x200 [ 0.583745] [] ? parse_args+0x1a0/0x4a0 [ 0.583749] [] kernel_init_freeable+0x189/0x223 [ 0.583752] [] ? rest_init+0x80/0x80 [ 0.583754] [] kernel_init+0xe/0xe0 [ 0.583758] [] ret_from_fork+0x3f/0x70 [ 0.583760] [] ? rest_init+0x80/0x80 [ 0.583765] ---[ end trace 077c426a39e018aa ]--- 00:00.0 Host bridge [0600]: Intel Corporation Haswell-ULT DRAM Controller [8086:0a04] (rev 0b) Subsystem: Lenovo Device [17aa:220c] Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx- Status: Cap+ 66MHz- UDF- FastB2B+ ParErr- DEVSEL=fast >TAbort- SERR- Kernel driver in use: hsw_uncore Link: https://bugzilla.redhat.com/show_bug.cgi?id=1300955 Tested-by: Signed-off-by: Josh Boyer Signed-off-by: Rafael J. 
Wysocki Cc: Laura Abbott Signed-off-by: Greg Kroah-Hartman --- drivers/pnp/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pnp/quirks.c b/drivers/pnp/quirks.c index 943c1cb9566c..f5444f7ecc41 100644 --- a/drivers/pnp/quirks.c +++ b/drivers/pnp/quirks.c @@ -342,6 +342,7 @@ static void quirk_amd_mmconfig_area(struct pnp_dev *dev) /* Device IDs of parts that have 32KB MCH space */ static const unsigned int mch_quirk_devices[] = { 0x0154, /* Ivy Bridge */ + 0x0a04, /* Haswell-ULT */ 0x0c00, /* Haswell */ }; From d71d4aceae67acf0dd95fa288439d9801f76c9cb Mon Sep 17 00:00:00 2001 From: Christophe Le Roy Date: Fri, 11 Dec 2015 09:13:42 +0100 Subject: [PATCH 082/813] PNP: Add Broadwell to Intel MCH size workaround commit a77060f07ffc6ac978e280e738302f3e5572a99e upstream. Add device ID 0x1604 for Broadwell to commit cb171f7abb9a ("PNP: Work around BIOS defects in Intel MCH area reporting"). >From a Lenovo ThinkPad T550: system 00:01: [io 0x1800-0x189f] could not be reserved system 00:01: [io 0x0800-0x087f] has been reserved system 00:01: [io 0x0880-0x08ff] has been reserved system 00:01: [io 0x0900-0x097f] has been reserved system 00:01: [io 0x0980-0x09ff] has been reserved system 00:01: [io 0x0a00-0x0a7f] has been reserved system 00:01: [io 0x0a80-0x0aff] has been reserved system 00:01: [io 0x0b00-0x0b7f] has been reserved system 00:01: [io 0x0b80-0x0bff] has been reserved system 00:01: [io 0x15e0-0x15ef] has been reserved system 00:01: [io 0x1600-0x167f] has been reserved system 00:01: [io 0x1640-0x165f] has been reserved system 00:01: [mem 0xf8000000-0xfbffffff] could not be reserved system 00:01: [mem 0xfed1c000-0xfed1ffff] has been reserved system 00:01: [mem 0xfed10000-0xfed13fff] has been reserved system 00:01: [mem 0xfed18000-0xfed18fff] has been reserved system 00:01: [mem 0xfed19000-0xfed19fff] has been reserved system 00:01: [mem 0xfed45000-0xfed4bfff] has been reserved system 00:01: Plug and Play ACPI device, IDs PNP0c02 (active) [...] resource sanity check: requesting [mem 0xfed10000-0xfed15fff], which spans more than pnp 00:01 [mem 0xfed10000-0xfed13fff] ------------[ cut here ]------------ WARNING: CPU: 2 PID: 1 at /build/linux-CrHvZ_/linux-4.2.6/arch/x86/mm/ioremap.c:198 __ioremap_caller+0x2ee/0x360() Info: mapping multiple BARs. Your kernel is fine. Modules linked in: CPU: 2 PID: 1 Comm: swapper/0 Not tainted 4.2.0-1-amd64 #1 Debian 4.2.6-1 Hardware name: LENOVO 20CKCTO1WW/20CKCTO1WW, BIOS N11ET34W (1.10 ) 08/20/2015 0000000000000000 ffffffff817e6868 ffffffff8154e2f6 ffff8802241efbf8 ffffffff8106e5b1 ffffc90000e98000 0000000000006000 ffffc90000e98000 0000000000006000 0000000000000000 ffffffff8106e62a ffffffff817e68c8 Call Trace: [] ? dump_stack+0x40/0x50 [] ? warn_slowpath_common+0x81/0xb0 [] ? warn_slowpath_fmt+0x4a/0x50 [] ? iomem_map_sanity_check+0xb3/0xc0 [] ? __ioremap_caller+0x2ee/0x360 [] ? snb_uncore_imc_init_box+0x66/0x90 [] ? uncore_pci_probe+0xc8/0x1a0 [] ? local_pci_probe+0x3f/0xa0 [] ? pci_device_probe+0xc4/0x110 [] ? driver_probe_device+0x1ee/0x450 [] ? __driver_attach+0x7b/0x80 [] ? driver_probe_device+0x450/0x450 [] ? bus_for_each_dev+0x5a/0x90 [] ? bus_add_driver+0x1f1/0x290 [] ? uncore_cpu_setup+0xc/0xc [] ? driver_register+0x5f/0xe0 [] ? intel_uncore_init+0xcc/0x2b0 [] ? uncore_cpu_setup+0xc/0xc [] ? do_one_initcall+0xce/0x200 [] ? parse_args+0x140/0x4e0 [] ? kernel_init_freeable+0x162/0x1e8 [] ? rest_init+0x80/0x80 [] ? kernel_init+0xe/0xf0 [] ? ret_from_fork+0x3f/0x70 [] ? 
rest_init+0x80/0x80 ---[ end trace 472e7959536abf12 ]--- 00:00.0 Host bridge: Intel Corporation Broadwell-U Host Bridge -OPI (rev 09) Subsystem: Lenovo Device 2223 Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx- Status: Cap+ 66MHz- UDF- FastB2B+ ParErr- DEVSEL=fast >TAbort- SERR- Kernel driver in use: bdw_uncore 00: 86 80 04 16 06 00 90 20 09 00 00 06 00 00 00 00 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 20: 00 00 00 00 00 00 00 00 00 00 00 00 aa 17 23 22 30: 00 00 00 00 e0 00 00 00 00 00 00 00 00 00 00 00 Signed-off-by: Christophe Le Roy Signed-off-by: Rafael J. Wysocki Cc: Laura Abbott Signed-off-by: Greg Kroah-Hartman --- drivers/pnp/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pnp/quirks.c b/drivers/pnp/quirks.c index f5444f7ecc41..d28e3ab9479c 100644 --- a/drivers/pnp/quirks.c +++ b/drivers/pnp/quirks.c @@ -344,6 +344,7 @@ static const unsigned int mch_quirk_devices[] = { 0x0154, /* Ivy Bridge */ 0x0a04, /* Haswell-ULT */ 0x0c00, /* Haswell */ + 0x1604, /* Broadwell */ }; static struct pci_dev *get_intel_host(void) From 6e1242497cdf8274b8a27f24325634089c77285e Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 8 Jan 2016 17:58:49 +0100 Subject: [PATCH 083/813] HID: sony: do not bail out when the sixaxis refuses the output report commit 19f4c2ba869517048add62c202f9645b6adf5dfb upstream. When setting the operational mode, some third party (Speedlink Strike-FX) gamepads refuse the output report. Failing here means we refuse to initialize the gamepad while this should be harmless. The weird part is that the initial commit that added this: a7de9b8 ("HID: sony: Enable Gasia third-party PS3 controllers") mentions this very same controller as one requiring this output report. Anyway, it's broken for one user at least, so let's change it. We will report an error, but at least the controller should work. And no, these devices present themselves as legacy Sony controllers (VID:PID of 054C:0268, as in the official ones) so there are no ways of discriminating them from the official ones. https://bugzilla.redhat.com/show_bug.cgi?id=1255325 Reported-and-tested-by: Max Fedotov Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina Cc: Laura Abbott Signed-off-by: Greg Kroah-Hartman --- drivers/hid/hid-sony.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c index 774cd2210566..21febbb0d84e 100644 --- a/drivers/hid/hid-sony.c +++ b/drivers/hid/hid-sony.c @@ -1418,8 +1418,10 @@ static int sixaxis_set_operational_usb(struct hid_device *hdev) } ret = hid_hw_output_report(hdev, buf, 1); - if (ret < 0) - hid_err(hdev, "can't set operational mode: step 3\n"); + if (ret < 0) { + hid_info(hdev, "can't set operational mode: step 3, ignoring\n"); + ret = 0; + } out: kfree(buf); From 979a61a02992e2029fcedcdf32c05050aa652c9c Mon Sep 17 00:00:00 2001 From: Hector Marco-Gisbert Date: Thu, 10 Mar 2016 20:51:00 +0100 Subject: [PATCH 084/813] x86/mm/32: Enable full randomization on i386 and X86_32 commit 8b8addf891de8a00e4d39fc32f93f7c5eb8feceb upstream. Currently on i386 and on X86_64 when emulating X86_32 in legacy mode, only the stack and the executable are randomized but not other mmapped files (libraries, vDSO, etc.). This patch enables randomization for the libraries, vDSO and mmap requests on i386 and in X86_32 in legacy mode. By default on i386 there are 8 bits for the randomization of the libraries, vDSO and mmaps which only uses 1MB of VA. 
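As a sketch of where that 1MB figure comes from (assuming the usual 4 KiB page size; this is illustrative arithmetic, not kernel code):

	#include <stdio.h>

	int main(void)
	{
		/* 8 random bits at page granularity: 2^8 slots x 4 KiB/page */
		unsigned long bits = 8, page_shift = 12;
		unsigned long span = (1UL << bits) << page_shift;

		printf("randomization span: %lu KiB\n", span >> 10); /* 1024 */
		return 0;
	}
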
This patch preserves the original randomness, using 1MB of VA out of 3GB or 4GB. We think that 1MB out of 3GB is not a big cost for having the ASLR. The first obvious security benefit is that all objects are randomized (not only the stack and the executable) in legacy mode which highly increases the ASLR effectiveness, otherwise the attackers may use these non-randomized areas. But also sensitive setuid/setgid applications are more secure because currently, attackers can disable the randomization of these applications by setting the ulimit stack to "unlimited". This is a very old and widely known trick to disable the ASLR in i386 which has been allowed for too long. Another trick used to disable the ASLR was to set the ADDR_NO_RANDOMIZE personality flag, but fortunately this doesn't work on setuid/setgid applications because there is security checks which clear Security-relevant flags. This patch always randomizes the mmap_legacy_base address, removing the possibility to disable the ASLR by setting the stack to "unlimited". Signed-off-by: Hector Marco-Gisbert Acked-by: Ismael Ripoll Ripoll Acked-by: Kees Cook Acked-by: Arjan van de Ven Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: kees Cook Link: http://lkml.kernel.org/r/1457639460-5242-1-git-send-email-hecmargi@upv.es Signed-off-by: Ingo Molnar Cc: Laura Abbott Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/mmap.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 844b06d67df4..307f60ecfc6d 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -93,18 +93,6 @@ static unsigned long mmap_base(unsigned long rnd) return PAGE_ALIGN(TASK_SIZE - gap - rnd); } -/* - * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 - * does, but not when emulating X86_32 - */ -static unsigned long mmap_legacy_base(unsigned long rnd) -{ - if (mmap_is_ia32()) - return TASK_UNMAPPED_BASE; - else - return TASK_UNMAPPED_BASE + rnd; -} - /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: @@ -116,7 +104,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); - mm->mmap_legacy_base = mmap_legacy_base(random_factor); + mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor; if (mmap_is_legacy()) { mm->mmap_base = mm->mmap_legacy_base; From 3088903a55f218c0d3758de086ede3901b8711b0 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 9 Jun 2016 16:56:28 +0300 Subject: [PATCH 085/813] i2c: i801: Allow ACPI SystemIO OpRegion to conflict with PCI BAR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a7ae81952cdab56a1277bd2f9ed7284c0f575120 upstream. Many Intel systems the BIOS declares a SystemIO OpRegion below the SMBus PCI device as can be seen in ACPI DSDT table from Lenovo Yoga 900: Device (SBUS) { OperationRegion (SMBI, SystemIO, (SBAR << 0x05), 0x10) Field (SMBI, ByteAcc, NoLock, Preserve) { HSTS, 8, Offset (0x02), HCON, 8, HCOM, 8, TXSA, 8, DAT0, 8, DAT1, 8, HBDR, 8, PECR, 8, RXSA, 8, SDAT, 16 } There are also bunch of AML methods that that the BIOS can use to access these fields. Most of the systems in question AML methods accessing the SMBI OpRegion are never used. 
Now, because of this SMBI OpRegion many systems fail to load the SMBus driver with an error looking like one below: ACPI Warning: SystemIO range 0x0000000000003040-0x000000000000305F conflicts with OpRegion 0x0000000000003040-0x000000000000304F (\_SB.PCI0.SBUS.SMBI) (20160108/utaddress-255) ACPI: If an ACPI driver is available for this device, you should use it instead of the native driver The reason is that this SMBI OpRegion conflicts with the PCI BAR used by the SMBus driver. It turns out that we can install a custom SystemIO address space handler for the SMBus device to intercept all accesses through that OpRegion. This allows us to share the PCI BAR with the AML code if it for some reason is using it. We do not expect that this OpRegion handler will ever be called but if it is we print a warning and prevent all access from the SMBus driver itself. Link: https://bugzilla.kernel.org/show_bug.cgi?id=110041 Reported-by: Andy Lutomirski Reported-by: Pali Rohár Suggested-by: Rafael J. Wysocki Signed-off-by: Mika Westerberg Acked-by: Rafael J. Wysocki Reviewed-by: Jean Delvare Reviewed-by: Benjamin Tissoires Tested-by: Pali Rohár Tested-by: Jean Delvare Signed-off-by: Wolfram Sang Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-i801.c | 103 +++++++++++++++++++++++++++++++--- 1 file changed, 94 insertions(+), 9 deletions(-) diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index 27fa0cb09538..85f39cc3e276 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -244,6 +244,13 @@ struct i801_priv { struct platform_device *mux_pdev; #endif struct platform_device *tco_pdev; + + /* + * If set to true the host controller registers are reserved for + * ACPI AML use. Protected by acpi_lock. + */ + bool acpi_reserved; + struct mutex acpi_lock; }; #define FEATURE_SMBUS_PEC (1 << 0) @@ -714,9 +721,15 @@ static s32 i801_access(struct i2c_adapter *adap, u16 addr, { int hwpec; int block = 0; - int ret, xact = 0; + int ret = 0, xact = 0; struct i801_priv *priv = i2c_get_adapdata(adap); + mutex_lock(&priv->acpi_lock); + if (priv->acpi_reserved) { + mutex_unlock(&priv->acpi_lock); + return -EBUSY; + } + hwpec = (priv->features & FEATURE_SMBUS_PEC) && (flags & I2C_CLIENT_PEC) && size != I2C_SMBUS_QUICK && size != I2C_SMBUS_I2C_BLOCK_DATA; @@ -773,7 +786,8 @@ static s32 i801_access(struct i2c_adapter *adap, u16 addr, default: dev_err(&priv->pci_dev->dev, "Unsupported transaction %d\n", size); - return -EOPNOTSUPP; + ret = -EOPNOTSUPP; + goto out; } if (hwpec) /* enable/disable hardware PEC */ @@ -796,11 +810,11 @@ static s32 i801_access(struct i2c_adapter *adap, u16 addr, ~(SMBAUXCTL_CRC | SMBAUXCTL_E32B), SMBAUXCTL(priv)); if (block) - return ret; + goto out; if (ret) - return ret; + goto out; if ((read_write == I2C_SMBUS_WRITE) || (xact == I801_QUICK)) - return 0; + goto out; switch (xact & 0x7f) { case I801_BYTE: /* Result put in SMBHSTDAT0 */ @@ -812,7 +826,10 @@ static s32 i801_access(struct i2c_adapter *adap, u16 addr, (inb_p(SMBHSTDAT1(priv)) << 8); break; } - return 0; + +out: + mutex_unlock(&priv->acpi_lock); + return ret; } @@ -1249,6 +1266,72 @@ static void i801_add_tco(struct i801_priv *priv) priv->tco_pdev = pdev; } +#ifdef CONFIG_ACPI +static acpi_status +i801_acpi_io_handler(u32 function, acpi_physical_address address, u32 bits, + u64 *value, void *handler_context, void *region_context) +{ + struct i801_priv *priv = handler_context; + struct pci_dev *pdev = priv->pci_dev; + acpi_status status; + + /* + * Once BIOS AML code touches 
the OpRegion we warn and inhibit any + * further access from the driver itself. This device is now owned + * by the system firmware. + */ + mutex_lock(&priv->acpi_lock); + + if (!priv->acpi_reserved) { + priv->acpi_reserved = true; + + dev_warn(&pdev->dev, "BIOS is accessing SMBus registers\n"); + dev_warn(&pdev->dev, "Driver SMBus register access inhibited\n"); + } + + if ((function & ACPI_IO_MASK) == ACPI_READ) + status = acpi_os_read_port(address, (u32 *)value, bits); + else + status = acpi_os_write_port(address, (u32)*value, bits); + + mutex_unlock(&priv->acpi_lock); + + return status; +} + +static int i801_acpi_probe(struct i801_priv *priv) +{ + struct acpi_device *adev; + acpi_status status; + + adev = ACPI_COMPANION(&priv->pci_dev->dev); + if (adev) { + status = acpi_install_address_space_handler(adev->handle, + ACPI_ADR_SPACE_SYSTEM_IO, i801_acpi_io_handler, + NULL, priv); + if (ACPI_SUCCESS(status)) + return 0; + } + + return acpi_check_resource_conflict(&priv->pci_dev->resource[SMBBAR]); +} + +static void i801_acpi_remove(struct i801_priv *priv) +{ + struct acpi_device *adev; + + adev = ACPI_COMPANION(&priv->pci_dev->dev); + if (!adev) + return; + + acpi_remove_address_space_handler(adev->handle, + ACPI_ADR_SPACE_SYSTEM_IO, i801_acpi_io_handler); +} +#else +static inline int i801_acpi_probe(struct i801_priv *priv) { return 0; } +static inline void i801_acpi_remove(struct i801_priv *priv) { } +#endif + static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) { unsigned char temp; @@ -1266,6 +1349,7 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) priv->adapter.dev.parent = &dev->dev; ACPI_COMPANION_SET(&priv->adapter.dev, ACPI_COMPANION(&dev->dev)); priv->adapter.retries = 3; + mutex_init(&priv->acpi_lock); priv->pci_dev = dev; switch (dev->device) { @@ -1328,10 +1412,8 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) return -ENODEV; } - err = acpi_check_resource_conflict(&dev->resource[SMBBAR]); - if (err) { + if (i801_acpi_probe(priv)) return -ENODEV; - } err = pcim_iomap_regions(dev, 1 << SMBBAR, dev_driver_string(&dev->dev)); @@ -1340,6 +1422,7 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) "Failed to request SMBus region 0x%lx-0x%Lx\n", priv->smba, (unsigned long long)pci_resource_end(dev, SMBBAR)); + i801_acpi_remove(priv); return err; } @@ -1404,6 +1487,7 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) err = i2c_add_adapter(&priv->adapter); if (err) { dev_err(&dev->dev, "Failed to add SMBus adapter\n"); + i801_acpi_remove(priv); return err; } @@ -1422,6 +1506,7 @@ static void i801_remove(struct pci_dev *dev) i801_del_mux(priv); i2c_del_adapter(&priv->adapter); + i801_acpi_remove(priv); pci_write_config_byte(dev, SMBHSTCFG, priv->original_hstcfg); platform_device_unregister(priv->tco_pdev); From 66e5d7b47c864f1821041f77752930ec3b8dfc22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Mon, 7 Mar 2016 21:15:36 +0100 Subject: [PATCH 086/813] cdc_ncm: do not call usbnet_link_change from cdc_ncm_bind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4d06dd537f95683aba3651098ae288b7cbff8274 upstream. usbnet_link_change will call schedule_work and should be avoided if bind is failing. Otherwise we will end up with scheduled work referring to a netdev which has gone away. 
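In sketch form, the hazard is deferred work outliving the object it points to when the bind path fails (illustrative names only, not the driver's actual code):

	#include <linux/errno.h>
	#include <linux/types.h>
	#include <linux/workqueue.h>

	struct demo_dev {
		struct work_struct kevent;	/* what usbnet_link_change() queues */
	};

	static int demo_bind(struct demo_dev *dev, bool setup_ok)
	{
		schedule_work(&dev->kevent);	/* kevent now references dev */
		if (!setup_ok)
			return -ENODEV;		/* probe unwinds and frees dev, but
						 * the queued work still points at it */
		return 0;
	}
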
Instead of making the call conditional, we can just defer it to usbnet_probe, using the driver_info flag made for this purpose. Fixes: 8a34b0ae8778 ("usbnet: cdc_ncm: apply usbnet_link_change") Reported-by: Andrey Konovalov Suggested-by: Linus Torvalds Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/cdc_ncm.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index a790d5f90b83..e0e94b855bbe 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -952,8 +952,6 @@ EXPORT_SYMBOL_GPL(cdc_ncm_select_altsetting); static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf) { - int ret; - /* MBIM backwards compatible function? */ if (cdc_ncm_select_altsetting(intf) != CDC_NCM_COMM_ALTSETTING_NCM) return -ENODEV; @@ -962,16 +960,7 @@ static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf) * Additionally, generic NCM devices are assumed to accept arbitrarily * placed NDP. */ - ret = cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM, 0); - - /* - * We should get an event when network connection is "connected" or - * "disconnected". Set network connection in "disconnected" state - * (carrier is OFF) during attach, so the IP network stack does not - * start IPv6 negotiation and more. - */ - usbnet_link_change(dev, 0, 0); - return ret; + return cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM, 0); } static void cdc_ncm_align_tail(struct sk_buff *skb, size_t modulus, size_t remainder, size_t max) @@ -1554,7 +1543,8 @@ static void cdc_ncm_status(struct usbnet *dev, struct urb *urb) static const struct driver_info cdc_ncm_info = { .description = "CDC NCM", - .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET, + .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET + | FLAG_LINK_INTR, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, @@ -1567,7 +1557,7 @@ static const struct driver_info cdc_ncm_info = { static const struct driver_info wwan_info = { .description = "Mobile Broadband Network Device", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET - | FLAG_WWAN, + | FLAG_LINK_INTR | FLAG_WWAN, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, @@ -1580,7 +1570,7 @@ static const struct driver_info wwan_info = { static const struct driver_info wwan_noarp_info = { .description = "Mobile Broadband Network Device (NO ARP)", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET - | FLAG_WWAN | FLAG_NOARP, + | FLAG_LINK_INTR | FLAG_WWAN | FLAG_NOARP, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, From 0107ea0e0928c8a077f0f912c809f2b86fa7496c Mon Sep 17 00:00:00 2001 From: Dave Weinstein Date: Thu, 28 Jul 2016 11:55:41 -0700 Subject: [PATCH 087/813] arm: oabi compat: add missing access checks commit 7de249964f5578e67b99699c5f0b405738d820a2 upstream. Add access checks to sys_oabi_epoll_wait() and sys_oabi_semtimedop(). This fixes CVE-2016-3857, a local privilege escalation under CONFIG_OABI_COMPAT. 
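The checks follow the standard pattern for syscalls that size a kernel allocation from a user-supplied count: bound the count so the multiplication cannot overflow, then verify the user range before copying. A minimal sketch, with illustrative names and the access_ok(VERIFY_*) signature used by kernels of this vintage:

	#include <linux/errno.h>
	#include <linux/kernel.h>
	#include <linux/slab.h>
	#include <linux/uaccess.h>

	struct demo_op { int arg; };	/* stand-in for the real element type */

	static long demo_copy_in(const struct demo_op __user *uops, int nops)
	{
		struct demo_op *kops;

		if (nops < 1 || nops > INT_MAX / sizeof(*kops))
			return -EINVAL;		/* kmalloc size cannot overflow */
		if (!access_ok(VERIFY_READ, uops, sizeof(*uops) * nops))
			return -EFAULT;		/* whole range must be user memory */

		kops = kmalloc(sizeof(*kops) * nops, GFP_KERNEL);
		if (!kops)
			return -ENOMEM;
		if (copy_from_user(kops, uops, sizeof(*kops) * nops)) {
			kfree(kops);
			return -EFAULT;
		}
		/* ... operate on kops ... */
		kfree(kops);
		return 0;
	}
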
Reported-by: Chiachih Wu Reviewed-by: Kees Cook Reviewed-by: Nicolas Pitre Signed-off-by: Dave Weinstein Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/arm/kernel/sys_oabi-compat.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c index 087acb569b63..5f221acd21ae 100644 --- a/arch/arm/kernel/sys_oabi-compat.c +++ b/arch/arm/kernel/sys_oabi-compat.c @@ -279,8 +279,12 @@ asmlinkage long sys_oabi_epoll_wait(int epfd, mm_segment_t fs; long ret, err, i; - if (maxevents <= 0 || maxevents > (INT_MAX/sizeof(struct epoll_event))) + if (maxevents <= 0 || + maxevents > (INT_MAX/sizeof(*kbuf)) || + maxevents > (INT_MAX/sizeof(*events))) return -EINVAL; + if (!access_ok(VERIFY_WRITE, events, sizeof(*events) * maxevents)) + return -EFAULT; kbuf = kmalloc(sizeof(*kbuf) * maxevents, GFP_KERNEL); if (!kbuf) return -ENOMEM; @@ -317,6 +321,8 @@ asmlinkage long sys_oabi_semtimedop(int semid, if (nsops < 1 || nsops > SEMOPM) return -EINVAL; + if (!access_ok(VERIFY_READ, tsops, sizeof(*tsops) * nsops)) + return -EFAULT; sops = kmalloc(sizeof(*sops) * nsops, GFP_KERNEL); if (!sops) return -ENOMEM; From cca36a7dad58fc7a95944319e48162194ead6f00 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 27 Jul 2016 11:43:37 +0100 Subject: [PATCH 088/813] KEYS: 64-bit MIPS needs to use compat_sys_keyctl for 32-bit userspace commit 20f06ed9f61a185c6dabd662c310bed6189470df upstream. MIPS64 needs to use compat_sys_keyctl for 32-bit userspace rather than calling sys_keyctl. The latter will work in a lot of cases, thereby hiding the issue. Reported-by: Stephan Mueller Signed-off-by: David Howells Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: linux-security-module@vger.kernel.org Cc: keyrings@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/13832/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/kernel/scall64-n32.S | 2 +- arch/mips/kernel/scall64-o32.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index 5a69eb48d0a8..ee93d5fe61d7 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -344,7 +344,7 @@ EXPORT(sysn32_call_table) PTR sys_ni_syscall /* available, was setaltroot */ PTR sys_add_key PTR sys_request_key - PTR sys_keyctl /* 6245 */ + PTR compat_sys_keyctl /* 6245 */ PTR sys_set_thread_area PTR sys_inotify_init PTR sys_inotify_add_watch diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index e4b6d7c97822..b77052ec6fb2 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -500,7 +500,7 @@ EXPORT(sys32_call_table) PTR sys_ni_syscall /* available, was setaltroot */ PTR sys_add_key /* 4280 */ PTR sys_request_key - PTR sys_keyctl + PTR compat_sys_keyctl PTR sys_set_thread_area PTR sys_inotify_init PTR sys_inotify_add_watch /* 4285 */ From 4cf8f0b0b3e635d8a17f19ef3a183c4c95a4af39 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Mon, 13 Jun 2016 17:03:48 +0200 Subject: [PATCH 089/813] Revert "s390/kdump: Clear subchannel ID to signal non-CCW/SCSI IPL" commit 5419447e2142d6ed68c9f5c1a28630b3a290a845 upstream. This reverts commit 852ffd0f4e23248b47531058e531066a988434b5. There are use cases where an intermediate boot kernel (1) uses kexec to boot the final production kernel (2). For this scenario we should provide the original boot information to the production kernel (2). 
Therefore clearing the boot information during kexec() should not be done. Reported-by: Steffen Maier Signed-off-by: Michael Holzheu Reviewed-by: Heiko Carstens Signed-off-by: Martin Schwidefsky Signed-off-by: Greg Kroah-Hartman --- arch/s390/kernel/ipl.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index b1f0a90f933b..42570d8fb265 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -2070,13 +2070,6 @@ void s390_reset_system(void (*fn_pre)(void), S390_lowcore.program_new_psw.addr = PSW_ADDR_AMODE | (unsigned long) s390_base_pgm_handler; - /* - * Clear subchannel ID and number to signal new kernel that no CCW or - * SCSI IPL has been done (for kexec and kdump) - */ - S390_lowcore.subchannel_id = 0; - S390_lowcore.subchannel_nr = 0; - /* Store status at absolute zero */ store_status(); From 6090bfb684a9985e29c3c0aae52a4b93f967e90f Mon Sep 17 00:00:00 2001 From: John Johansen Date: Wed, 18 Nov 2015 11:41:05 -0800 Subject: [PATCH 090/813] apparmor: fix ref count leak when profile sha1 hash is read commit 0b938a2e2cf0b0a2c8bac9769111545aff0fee97 upstream. Signed-off-by: John Johansen Acked-by: Seth Arnold Signed-off-by: Greg Kroah-Hartman --- security/apparmor/apparmorfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index ad4fa49ad1db..9068369f8a1b 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -331,6 +331,7 @@ static int aa_fs_seq_hash_show(struct seq_file *seq, void *v) seq_printf(seq, "%.2x", profile->hash[i]); seq_puts(seq, "\n"); } + aa_put_profile(profile); return 0; } From 93f84c8864658c740d205624ab9d23ceca235e46 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 3 Jul 2016 17:01:26 -0400 Subject: [PATCH 091/813] random: strengthen input validation for RNDADDTOENTCNT commit 86a574de4590ffe6fd3f3ca34cdcf655a78e36ec upstream. Don't allow RNDADDTOENTCNT or RNDADDENTROPY to accept a negative entropy value. 
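(The ioctl argument is a plain int, so the kernel has to do the sign check itself; a well-behaved caller looks roughly like the snippet below, noting that both ioctls require CAP_SYS_ADMIN:)

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/random.h>

	int main(void)
	{
		int fd = open("/dev/random", O_RDWR);
		int bits = 64;			/* must be >= 0 */

		if (fd < 0 || ioctl(fd, RNDADDTOENTCNT, &bits) < 0)
			return 1;		/* negative values now get -EINVAL */
		return 0;
	}
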
It doesn't make any sense to subtract from the entropy counter, and it can trigger a warning: random: negative entropy/overflow: pool input count -40000 ------------[ cut here ]------------ WARNING: CPU: 3 PID: 6828 at drivers/char/random.c:670[< none >] credit_entropy_bits+0x21e/0xad0 drivers/char/random.c:670 Modules linked in: CPU: 3 PID: 6828 Comm: a.out Not tainted 4.7.0-rc4+ #4 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 ffffffff880b58e0 ffff88005dd9fcb0 ffffffff82cc838f ffffffff87158b40 fffffbfff1016b1c 0000000000000000 0000000000000000 ffffffff87158b40 ffffffff83283dae 0000000000000009 ffff88005dd9fcf8 ffffffff8136d27f Call Trace: [< inline >] __dump_stack lib/dump_stack.c:15 [] dump_stack+0x12e/0x18f lib/dump_stack.c:51 [] __warn+0x19f/0x1e0 kernel/panic.c:516 [] warn_slowpath_null+0x2c/0x40 kernel/panic.c:551 [] credit_entropy_bits+0x21e/0xad0 drivers/char/random.c:670 [< inline >] credit_entropy_bits_safe drivers/char/random.c:734 [] random_ioctl+0x21d/0x250 drivers/char/random.c:1546 [< inline >] vfs_ioctl fs/ioctl.c:43 [] do_vfs_ioctl+0x18c/0xff0 fs/ioctl.c:674 [< inline >] SYSC_ioctl fs/ioctl.c:689 [] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:680 [] entry_SYSCALL_64_fastpath+0x23/0xc1 arch/x86/entry/entry_64.S:207 ---[ end trace 5d4902b2ba842f1f ]--- This was triggered using the test program: // autogenerated by syzkaller (http://github.com/google/syzkaller) int main() { int fd = open("/dev/random", O_RDWR); int val = -5000; ioctl(fd, RNDADDTOENTCNT, &val); return 0; } It's harmless in that (a) only root can trigger it, and (b) after complaining the code never does let the entropy count go negative, but it's better to simply not allow this userspace from passing in a negative entropy value altogether. Google-Bug-Id: #29575089 Reported-By: Dmitry Vyukov Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- drivers/char/random.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index d0da5d852d41..0227b0465b40 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -722,15 +722,18 @@ retry: } } -static void credit_entropy_bits_safe(struct entropy_store *r, int nbits) +static int credit_entropy_bits_safe(struct entropy_store *r, int nbits) { const int nbits_max = (int)(~0U >> (ENTROPY_SHIFT + 1)); + if (nbits < 0) + return -EINVAL; + /* Cap the value to avoid overflows */ nbits = min(nbits, nbits_max); - nbits = max(nbits, -nbits_max); credit_entropy_bits(r, nbits); + return 0; } /********************************************************************* @@ -1542,8 +1545,7 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg) return -EPERM; if (get_user(ent_count, p)) return -EFAULT; - credit_entropy_bits_safe(&input_pool, ent_count); - return 0; + return credit_entropy_bits_safe(&input_pool, ent_count); case RNDADDENTROPY: if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1557,8 +1559,7 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg) size); if (retval < 0) return retval; - credit_entropy_bits_safe(&input_pool, ent_count); - return 0; + return credit_entropy_bits_safe(&input_pool, ent_count); case RNDZAPENTCNT: case RNDCLEARPOOL: /* From 5c7d0f49cf1492866fa619af4538f56938abe07d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 16 Apr 2016 15:16:07 -0700 Subject: [PATCH 092/813] devpts: clean up interface to pty drivers commit 67245ff332064c01b760afa7a384ccda024bfd24 upstream. 
This gets rid of the horrible notion of having that struct inode *ptmx_inode be the linchpin of the interface between the pty code and devpts. By de-emphasizing the ptmx inode, a lot of things actually get cleaner, and we will have a much saner way forward. In particular, this will allow us to associate with any particular devpts instance at open-time, and not be artificially tied to one particular ptmx inode. The patch itself is actually fairly straightforward, and apart from some locking and return path cleanups it's pretty mechanical: - the interfaces that devpts exposes all take "struct pts_fs_info *" instead of "struct inode *ptmx_inode" now. NOTE! The "struct pts_fs_info" thing is a completely opaque structure as far as the pty driver is concerned: it's still declared entirely internally to devpts. So the pty code can't actually access it in any way, just pass it as a "cookie" to the devpts code. - the "look up the pts fs info" is now a single clear operation, that also does the reference count increment on the pts superblock. So "devpts_add/del_ref()" is gone, and replaced by a "lookup and get ref" operation (devpts_get_ref(inode)), along with a "put ref" op (devpts_put_ref()). - the pty master "tty->driver_data" field now contains the pts_fs_info, not the ptmx inode. - because we don't care about the ptmx inode any more as some kind of base index, the ref counting can now drop the inode games - it just gets the ref on the superblock. - the pts_fs_info now has a back-pointer to the super_block. That's so that we can easily look up the information we actually need. Although quite often, the pts fs info was actually all we wanted, and not having to look it up based on some magical inode makes things more straightforward. In particular, now that "devpts_get_ref(inode)" operation should really be the *only* place we need to look up what devpts instance we're associated with, and we do it exactly once, at ptmx_open() time. The other side of this is that one ptmx node could now be associated with multiple different devpts instances - you could have a single /dev/ptmx node, and then have multiple mount namespaces with their own instances of devpts mounted on /dev/pts/. And that's all perfectly sane in a model where we just look up the pts instance at open time. This will eventually allow us to get rid of our odd single-vs-multiple pts instance model, but this patch in itself changes no semantics, only an internal binding model. Cc: Eric Biederman Cc: Peter Anvin Cc: Andy Lutomirski Cc: Al Viro Cc: Peter Hurley Cc: Serge Hallyn Cc: Willy Tarreau Cc: Aurelien Jarno Cc: Alan Cox Cc: Jann Horn Cc: Greg KH Cc: Jiri Slaby Cc: Florian Weimer Signed-off-by: Linus Torvalds Cc: Francesco Ruggeri Cc: "Herton R. 
Krzesinski" Signed-off-by: Greg Kroah-Hartman --- drivers/tty/pty.c | 63 +++++++++++++++++++-------------------- fs/devpts/inode.c | 49 +++++++++++++++--------------- include/linux/devpts_fs.h | 34 +++++++-------------- 3 files changed, 64 insertions(+), 82 deletions(-) diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 7865228f664f..807d80145686 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -679,14 +679,14 @@ static void pty_unix98_remove(struct tty_driver *driver, struct tty_struct *tty) /* this is called once with whichever end is closed last */ static void pty_unix98_shutdown(struct tty_struct *tty) { - struct inode *ptmx_inode; + struct pts_fs_info *fsi; if (tty->driver->subtype == PTY_TYPE_MASTER) - ptmx_inode = tty->driver_data; + fsi = tty->driver_data; else - ptmx_inode = tty->link->driver_data; - devpts_kill_index(ptmx_inode, tty->index); - devpts_del_ref(ptmx_inode); + fsi = tty->link->driver_data; + devpts_kill_index(fsi, tty->index); + devpts_put_ref(fsi); } static const struct tty_operations ptm_unix98_ops = { @@ -738,6 +738,7 @@ static const struct tty_operations pty_unix98_ops = { static int ptmx_open(struct inode *inode, struct file *filp) { + struct pts_fs_info *fsi; struct tty_struct *tty; struct inode *slave_inode; int retval; @@ -752,47 +753,41 @@ static int ptmx_open(struct inode *inode, struct file *filp) if (retval) return retval; + fsi = devpts_get_ref(inode, filp); + retval = -ENODEV; + if (!fsi) + goto out_free_file; + /* find a device that is not in use. */ mutex_lock(&devpts_mutex); - index = devpts_new_index(inode); - if (index < 0) { - retval = index; - mutex_unlock(&devpts_mutex); - goto err_file; - } - + index = devpts_new_index(fsi); mutex_unlock(&devpts_mutex); + retval = index; + if (index < 0) + goto out_put_ref; + + mutex_lock(&tty_mutex); tty = tty_init_dev(ptm_driver, index); - - if (IS_ERR(tty)) { - retval = PTR_ERR(tty); - goto out; - } - /* The tty returned here is locked so we can safely drop the mutex */ mutex_unlock(&tty_mutex); - set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */ - tty->driver_data = inode; + retval = PTR_ERR(tty); + if (IS_ERR(tty)) + goto out; /* - * In the case where all references to ptmx inode are dropped and we - * still have /dev/tty opened pointing to the master/slave pair (ptmx - * is closed/released before /dev/tty), we must make sure that the inode - * is still valid when we call the final pty_unix98_shutdown, thus we - * hold an additional reference to the ptmx inode. For the same /dev/tty - * last close case, we also need to make sure the super_block isn't - * destroyed (devpts instance unmounted), before /dev/tty is closed and - * on its release devpts_kill_index is called. 
+ * From here on out, the tty is "live", and the index and + * fsi will be killed/put by the tty_release() */ - devpts_add_ref(inode); + set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */ + tty->driver_data = fsi; tty_add_file(tty, filp); - slave_inode = devpts_pty_new(inode, + slave_inode = devpts_pty_new(fsi, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index), index, tty->link); if (IS_ERR(slave_inode)) { @@ -811,12 +806,14 @@ static int ptmx_open(struct inode *inode, struct file *filp) return 0; err_release: tty_unlock(tty); + // This will also put-ref the fsi tty_release(inode, filp); return retval; out: - mutex_unlock(&tty_mutex); - devpts_kill_index(inode, index); -err_file: + devpts_kill_index(fsi, index); +out_put_ref: + devpts_put_ref(fsi); +out_free_file: tty_free_file(filp); return retval; } diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 706de324f2a6..c82edb049117 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -128,6 +128,7 @@ static const match_table_t tokens = { struct pts_fs_info { struct ida allocated_ptys; struct pts_mount_opts mount_opts; + struct super_block *sb; struct dentry *ptmx_dentry; }; @@ -358,7 +359,7 @@ static const struct super_operations devpts_sops = { .show_options = devpts_show_options, }; -static void *new_pts_fs_info(void) +static void *new_pts_fs_info(struct super_block *sb) { struct pts_fs_info *fsi; @@ -369,6 +370,7 @@ static void *new_pts_fs_info(void) ida_init(&fsi->allocated_ptys); fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE; fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; + fsi->sb = sb; return fsi; } @@ -384,7 +386,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_op = &devpts_sops; s->s_time_gran = 1; - s->s_fs_info = new_pts_fs_info(); + s->s_fs_info = new_pts_fs_info(s); if (!s->s_fs_info) goto fail; @@ -524,17 +526,14 @@ static struct file_system_type devpts_fs_type = { * to the System V naming convention */ -int devpts_new_index(struct inode *ptmx_inode) +int devpts_new_index(struct pts_fs_info *fsi) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi; int index; int ida_ret; - if (!sb) + if (!fsi) return -ENODEV; - fsi = DEVPTS_SB(sb); retry: if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) return -ENOMEM; @@ -564,11 +563,8 @@ retry: return index; } -void devpts_kill_index(struct inode *ptmx_inode, int idx) +void devpts_kill_index(struct pts_fs_info *fsi, int idx) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi = DEVPTS_SB(sb); - mutex_lock(&allocated_ptys_lock); ida_remove(&fsi->allocated_ptys, idx); pty_count--; @@ -578,21 +574,25 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx) /* * pty code needs to hold extra references in case of last /dev/tty close */ - -void devpts_add_ref(struct inode *ptmx_inode) +struct pts_fs_info *devpts_get_ref(struct inode *ptmx_inode, struct file *file) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct super_block *sb; + struct pts_fs_info *fsi; + + sb = pts_sb_from_inode(ptmx_inode); + if (!sb) + return NULL; + fsi = DEVPTS_SB(sb); + if (!fsi) + return NULL; atomic_inc(&sb->s_active); - ihold(ptmx_inode); + return fsi; } -void devpts_del_ref(struct inode *ptmx_inode) +void devpts_put_ref(struct pts_fs_info *fsi) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); - - iput(ptmx_inode); - deactivate_super(sb); + deactivate_super(fsi->sb); } /** @@ -604,22 +604,21 @@ void devpts_del_ref(struct inode *ptmx_inode) * * The created inode is returned. 
Remove it from /dev/pts/ by devpts_pty_kill. */ -struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, +struct inode *devpts_pty_new(struct pts_fs_info *fsi, dev_t device, int index, void *priv) { struct dentry *dentry; - struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct super_block *sb; struct inode *inode; struct dentry *root; - struct pts_fs_info *fsi; struct pts_mount_opts *opts; char s[12]; - if (!sb) + if (!fsi) return ERR_PTR(-ENODEV); + sb = fsi->sb; root = sb->s_root; - fsi = DEVPTS_SB(sb); opts = &fsi->mount_opts; inode = new_inode(sb); diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index e0ee0b3000b2..358a4db72a27 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -15,38 +15,24 @@ #include +struct pts_fs_info; + #ifdef CONFIG_UNIX98_PTYS -int devpts_new_index(struct inode *ptmx_inode); -void devpts_kill_index(struct inode *ptmx_inode, int idx); -void devpts_add_ref(struct inode *ptmx_inode); -void devpts_del_ref(struct inode *ptmx_inode); +/* Look up a pts fs info and get a ref to it */ +struct pts_fs_info *devpts_get_ref(struct inode *, struct file *); +void devpts_put_ref(struct pts_fs_info *); + +int devpts_new_index(struct pts_fs_info *); +void devpts_kill_index(struct pts_fs_info *, int); + /* mknod in devpts */ -struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, - void *priv); +struct inode *devpts_pty_new(struct pts_fs_info *, dev_t, int, void *); /* get private structure */ void *devpts_get_priv(struct inode *pts_inode); /* unlink */ void devpts_pty_kill(struct inode *inode); -#else - -/* Dummy stubs in the no-pty case */ -static inline int devpts_new_index(struct inode *ptmx_inode) { return -EINVAL; } -static inline void devpts_kill_index(struct inode *ptmx_inode, int idx) { } -static inline void devpts_add_ref(struct inode *ptmx_inode) { } -static inline void devpts_del_ref(struct inode *ptmx_inode) { } -static inline struct inode *devpts_pty_new(struct inode *ptmx_inode, - dev_t device, int index, void *priv) -{ - return ERR_PTR(-EINVAL); -} -static inline void *devpts_get_priv(struct inode *pts_inode) -{ - return NULL; -} -static inline void devpts_pty_kill(struct inode *inode) { } - #endif From 8f5b8210fff0e8469c056b82490d786bc6bde92a Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Mar 2016 15:41:57 -0600 Subject: [PATCH 093/813] x86/mm/pat: Add support of non-default PAT MSR setting commit 02f037d641dc6672be5cfe7875a48ab99b95b154 upstream. In preparation for fixing a regression caused by: 9cd25aac1f44 ("x86/mm/pat: Emulate PAT when it is disabled")' ... PAT needs to support a case that PAT MSR is initialized with a non-default value. When pat_init() is called and PAT is disabled, it initializes the PAT table with the BIOS default value. Xen, however, sets PAT MSR with a non-default value to enable WC. This causes inconsistency between the PAT table and PAT MSR when PAT is set to disable on Xen. Change pat_init() to handle the PAT disable cases properly. Add init_cache_modes() to handle two cases when PAT is set to disable. 1. CPU supports PAT: Set PAT table to be consistent with PAT MSR. 2. CPU does not support PAT: Set PAT table to be consistent with PWT and PCD bits in a PTE. Note, __init_cache_modes(), renamed from pat_init_cache_modes(), will be changed to a static function in a later patch. 
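Condensed to its decision logic, the disabled-PAT handling described above amounts to the following (a sketch only; BIOS_DEFAULT_PAT stands in for the PWT/PCD emulation table built in the real code, which appears in the diff below):

	u64 pat = 0;

	if (boot_cpu_has(X86_FEATURE_PAT))
		rdmsrl(MSR_IA32_CR_PAT, pat);	/* case 1: mirror what the MSR holds */
	if (!pat)
		pat = BIOS_DEFAULT_PAT;		/* case 2, or MSR reads 0: emulate the
						 * PWT/PCD-only BIOS default table */
	__init_cache_modes(pat);		/* PAT table now matches reality */
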
Signed-off-by: Toshi Kani Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Toshi Kani Cc: elliott@hpe.com Cc: konrad.wilk@oracle.com Cc: paul.gortmaker@windriver.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1458769323-24491-2-git-send-email-toshi.kani@hpe.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pat.h | 2 +- arch/x86/mm/pat.c | 73 +++++++++++++++++++++++++++----------- arch/x86/xen/enlighten.c | 2 +- 3 files changed, 55 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index ca6c228d5e62..97ea55bc2b54 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -6,7 +6,7 @@ bool pat_enabled(void); extern void pat_init(void); -void pat_init_cache_modes(u64); +void __init_cache_modes(u64); extern int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 188e3e07eeeb..86066ffb014a 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -180,7 +180,7 @@ static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg) * configuration. * Using lower indices is preferred, so we start with highest index. */ -void pat_init_cache_modes(u64 pat) +void __init_cache_modes(u64 pat) { enum page_cache_mode cache; char pat_msg[33]; @@ -206,9 +206,6 @@ static void pat_bsp_init(u64 pat) return; } - if (!pat_enabled()) - goto done; - rdmsrl(MSR_IA32_CR_PAT, tmp_pat); if (!tmp_pat) { pat_disable("PAT MSR is 0, disabled."); @@ -217,15 +214,11 @@ static void pat_bsp_init(u64 pat) wrmsrl(MSR_IA32_CR_PAT, pat); -done: - pat_init_cache_modes(pat); + __init_cache_modes(pat); } static void pat_ap_init(u64 pat) { - if (!pat_enabled()) - return; - if (!cpu_has_pat) { /* * If this happens we are on a secondary CPU, but switched to @@ -237,18 +230,32 @@ static void pat_ap_init(u64 pat) wrmsrl(MSR_IA32_CR_PAT, pat); } -void pat_init(void) +static void init_cache_modes(void) { - u64 pat; - struct cpuinfo_x86 *c = &boot_cpu_data; + u64 pat = 0; + static int init_cm_done; - if (!pat_enabled()) { + if (init_cm_done) + return; + + if (boot_cpu_has(X86_FEATURE_PAT)) { + /* + * CPU supports PAT. Set PAT table to be consistent with + * PAT MSR. This case supports "nopat" boot option, and + * virtual machine environments which support PAT without + * MTRRs. In specific, Xen has unique setup to PAT MSR. + * + * If PAT MSR returns 0, it is considered invalid and emulates + * as No PAT. + */ + rdmsrl(MSR_IA32_CR_PAT, pat); + } + + if (!pat) { /* * No PAT. Emulate the PAT table that corresponds to the two - * cache bits, PWT (Write Through) and PCD (Cache Disable). This - * setup is the same as the BIOS default setup when the system - * has PAT but the "nopat" boot option has been specified. This - * emulated PAT table is used when MSR_IA32_CR_PAT returns 0. + * cache bits, PWT (Write Through) and PCD (Cache Disable). + * This setup is also the same as the BIOS default setup. 
* * PTE encoding: * @@ -265,10 +272,36 @@ void pat_init(void) */ pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) | PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC); + } - } else if ((c->x86_vendor == X86_VENDOR_INTEL) && - (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || - ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { + __init_cache_modes(pat); + + init_cm_done = 1; +} + +/** + * pat_init - Initialize PAT MSR and PAT table + * + * This function initializes PAT MSR and PAT table with an OS-defined value + * to enable additional cache attributes, WC and WT. + * + * This function must be called on all CPUs using the specific sequence of + * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this + * procedure for PAT. + */ +void pat_init(void) +{ + u64 pat; + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (!pat_enabled()) { + init_cache_modes(); + return; + } + + if ((c->x86_vendor == X86_VENDOR_INTEL) && + (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || + ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { /* * PAT support with the lower four entries. Intel Pentium 2, * 3, M, and 4 are affected by PAT errata, which makes the diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index beab8c706ac9..cf8d1bcabc56 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1632,7 +1632,7 @@ asmlinkage __visible void __init xen_start_kernel(void) * configuration. */ rdmsrl(MSR_IA32_CR_PAT, pat); - pat_init_cache_modes(pat); + __init_cache_modes(pat); /* keep using Xen gdt for now; no urgent need to change it */ From d50e8b108ef8980bd193de587d984e986be2ecc1 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Mar 2016 15:41:58 -0600 Subject: [PATCH 094/813] x86/mm/pat: Add pat_disable() interface commit 224bb1e5d67ba0f2872c98002d6a6f991ac6fd4a upstream. In preparation for fixing a regression caused by: 9cd25aac1f44 ("x86/mm/pat: Emulate PAT when it is disabled") ... PAT needs to provide an interface that prevents the OS from initializing the PAT MSR. PAT MSR initialization must be done on all CPUs using the specific sequence of operations defined in the Intel SDM. This requires MTRRs to be enabled since pat_init() is called as part of MTRR init from mtrr_rendezvous_handler(). Make pat_disable() as the interface that prevents the OS from initializing the PAT MSR. MTRR will call this interface when it cannot provide the SDM-defined sequence to initialize PAT. This also assures that pat_disable() called from pat_bsp_init() will set the PAT table properly when CPU does not support PAT. Signed-off-by: Toshi Kani Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Robert Elliott Cc: Toshi Kani Cc: konrad.wilk@oracle.com Cc: paul.gortmaker@windriver.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1458769323-24491-3-git-send-email-toshi.kani@hpe.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pat.h | 1 + arch/x86/mm/pat.c | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 97ea55bc2b54..0ad356c066ef 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -5,6 +5,7 @@ #include bool pat_enabled(void); +void pat_disable(const char *reason); extern void pat_init(void); void __init_cache_modes(u64); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 86066ffb014a..a10dd4fcd538 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -39,11 +39,22 @@ static bool boot_cpu_done; static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT); +static void init_cache_modes(void); -static inline void pat_disable(const char *reason) +void pat_disable(const char *reason) { + if (!__pat_enabled) + return; + + if (boot_cpu_done) { + WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n"); + return; + } + __pat_enabled = 0; pr_info("x86/PAT: %s\n", reason); + + init_cache_modes(); } static int __init nopat(char *str) From 32c854288949a34f4dc08655d0c4b0294916e6c0 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Mar 2016 15:41:59 -0600 Subject: [PATCH 095/813] x86/mm/pat: Replace cpu_has_pat with boot_cpu_has() commit d63dcf49cf5ae5605f4d14229e3888e104f294b1 upstream. Borislav Petkov suggested: > Please use on init paths boot_cpu_has(X86_FEATURE_PAT) and on fast > paths static_cpu_has(X86_FEATURE_PAT). No more of that cpu_has_XXX > ugliness. Replace the use of cpu_has_pat on init paths with boot_cpu_has(). Suggested-by: Borislav Petkov Signed-off-by: Toshi Kani Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Robert Elliott Cc: Toshi Kani Cc: konrad.wilk@oracle.com Cc: paul.gortmaker@windriver.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1458769323-24491-4-git-send-email-toshi.kani@hpe.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/pat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index a10dd4fcd538..869bb3f03a2c 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -212,7 +212,7 @@ static void pat_bsp_init(u64 pat) { u64 tmp_pat; - if (!cpu_has_pat) { + if (!boot_cpu_has(X86_FEATURE_PAT)) { pat_disable("PAT not supported by CPU."); return; } @@ -230,7 +230,7 @@ static void pat_bsp_init(u64 pat) static void pat_ap_init(u64 pat) { - if (!cpu_has_pat) { + if (!boot_cpu_has(X86_FEATURE_PAT)) { /* * If this happens we are on a secondary CPU, but switched to * PAT on the boot CPU. We have no way to undo PAT. From 594055cf63d2ed5b06387f91ce9505ae651fc38d Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Mar 2016 15:42:00 -0600 Subject: [PATCH 096/813] x86/mtrr: Fix Xorg crashes in Qemu sessions commit edfe63ec97ed8d4496225f7ba54c9ce4207c5431 upstream. A Xorg failure on qemu32 was reported as a regression [1] caused by commit 9cd25aac1f44 ("x86/mm/pat: Emulate PAT when it is disabled"). This patch fixes the Xorg crash. 
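In sketch form, the fix ensures that a CPU without MTRRs ends up with pat_enabled() reporting false, so a write-combine request degrades to uncached instead of silently becoming WT (illustrative, not the patched code):

	if (!pat_enabled())
		/* PAT never initialized: WC is not representable */
		return _PAGE_CACHE_MODE_UC_MINUS;
	return _PAGE_CACHE_MODE_WC;
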
Negative effects of this regression were the following two failures [2] in Xorg on QEMU with QEMU CPU model "qemu32" (-cpu qemu32), which were triggered by the fact that its virtual CPU does not support MTRRs. #1. copy_process() failed in the check in reserve_pfn_range() copy_process copy_mm dup_mm dup_mmap copy_page_range track_pfn_copy reserve_pfn_range A WC map request was tracked as WC in memtype, which set a PTE as UC (pgprot) per __cachemode2pte_tbl[]. This led to this error in reserve_pfn_range() called from track_pfn_copy(), which obtained a pgprot from a PTE. It converts pgprot to page_cache_mode, which does not necessarily result in the original page_cache_mode since __cachemode2pte_tbl[] redirects multiple types to UC. #2. error path in copy_process() then hit WARN_ON_ONCE in untrack_pfn(). x86/PAT: Xorg:509 map pfn expected mapping type uncached- minus for [mem 0xfd000000-0xfdffffff], got write-combining Call Trace: dump_stack warn_slowpath_common ? untrack_pfn ? untrack_pfn warn_slowpath_null untrack_pfn ? __kunmap_atomic unmap_single_vma ? pagevec_move_tail_fn unmap_vmas exit_mmap mmput copy_process.part.47 _do_fork SyS_clone do_syscall_32_irqs_on entry_INT80_32 These negative effects are caused by two separate bugs, but they can be addressed in separate patches. Fixing the pat_init() issue described below addresses the root cause, and avoids Xorg to hit these cases. When the CPU does not support MTRRs, MTRR does not call pat_init(), which leaves PAT enabled without initializing PAT. This pat_init() issue is a long-standing issue, but manifested as issue #1 (and then hit issue #2) with the above-mentioned commit because the memtype now tracks cache attribute with 'page_cache_mode'. This pat_init() issue existed before the commit, but we used pgprot in memtype. Hence, we did not have issue #1 before. But WC request resulted in WT in effect because WC pgrot is actually WT when PAT is not initialized. This is not how it was designed to work. When PAT is set to disable properly, WC is converted to UC. The use of WT can result in a system crash if the target range does not support WT. Fortunately, nobody ran into such issue before. To fix this pat_init() issue, PAT code has been enhanced to provide pat_disable() interface. Call this interface when MTRRs are disabled. By setting PAT to disable properly, PAT bypasses the memtype check, and avoids issue #1. [1]: https://lkml.org/lkml/2016/3/3/828 [2]: https://lkml.org/lkml/2016/3/4/775 Signed-off-by: Toshi Kani Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Toshi Kani Cc: elliott@hpe.com Cc: konrad.wilk@oracle.com Cc: paul.gortmaker@windriver.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1458769323-24491-5-git-send-email-toshi.kani@hpe.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/mtrr.h | 6 +++++- arch/x86/kernel/cpu/mtrr/main.c | 10 +++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index b94f6f64e23d..dbff1456d215 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -24,6 +24,7 @@ #define _ASM_X86_MTRR_H #include +#include /* @@ -83,9 +84,12 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { } +static inline void mtrr_bp_init(void) +{ + pat_disable("MTRRs disabled, skipping PAT initialization too."); +} #define mtrr_ap_init() do {} while (0) -#define mtrr_bp_init() do {} while (0) #define set_mtrr_aps_delayed_init() do {} while (0) #define mtrr_aps_init() do {} while (0) #define mtrr_bp_restore() do {} while (0) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index f891b4750f04..1b3417db125b 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -759,8 +759,16 @@ void __init mtrr_bp_init(void) } } - if (!mtrr_enabled()) + if (!mtrr_enabled()) { pr_info("MTRR: Disabled\n"); + + /* + * PAT initialization relies on MTRR's rendezvous handler. + * Skip PAT init until the handler can initialize both + * features independently. + */ + pat_disable("MTRRs disabled, skipping PAT initialization too."); + } } void mtrr_ap_init(void) From a23b299b4a7d0083a3bdb61d1586956f817e8961 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Mar 2016 15:42:01 -0600 Subject: [PATCH 097/813] x86/mtrr: Fix PAT init handling when MTRR is disabled commit ad025a73f0e9344ac73ffe1b74c184033e08e7d5 upstream. get_mtrr_state() calls pat_init() on BSP even if MTRR is disabled. This results in calling pat_init() on BSP only since APs do not call pat_init() when MTRR is disabled. This inconsistency between BSP and APs leads to undefined behavior. Make BSP's calling condition to pat_init() consistent with AP's, mtrr_ap_init() and mtrr_aps_init(). Signed-off-by: Toshi Kani Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Toshi Kani Cc: elliott@hpe.com Cc: konrad.wilk@oracle.com Cc: paul.gortmaker@windriver.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1458769323-24491-6-git-send-email-toshi.kani@hpe.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/mtrr/generic.c | 24 ++++++++++++++---------- arch/x86/kernel/cpu/mtrr/main.c | 3 +++ arch/x86/kernel/cpu/mtrr/mtrr.h | 1 + 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 3b533cf37c74..b5624fafa44a 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -444,11 +444,24 @@ static void __init print_mtrr_state(void) pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); } +/* PAT setup for BP. 
We need to go through sync steps here */ +void __init mtrr_bp_pat_init(void) +{ + unsigned long flags; + + local_irq_save(flags); + prepare_set(); + + pat_init(); + + post_set(); + local_irq_restore(flags); +} + /* Grab all of the MTRR state for this CPU into *state */ bool __init get_mtrr_state(void) { struct mtrr_var_range *vrs; - unsigned long flags; unsigned lo, dummy; unsigned int i; @@ -481,15 +494,6 @@ bool __init get_mtrr_state(void) mtrr_state_set = 1; - /* PAT setup for BP. We need to go through sync steps here */ - local_irq_save(flags); - prepare_set(); - - pat_init(); - - post_set(); - local_irq_restore(flags); - return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED); } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 1b3417db125b..fa77ac8291f0 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -752,6 +752,9 @@ void __init mtrr_bp_init(void) /* BIOS may override */ __mtrr_enabled = get_mtrr_state(); + if (mtrr_enabled()) + mtrr_bp_pat_init(); + if (mtrr_cleanup(phys_addr)) { changed_by_mtrr_cleanup = 1; mtrr_if->set_all(); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 951884dcc433..6c7ced07d16d 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -52,6 +52,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); bool get_mtrr_state(void); +void mtrr_bp_pat_init(void); extern void set_mtrr_ops(const struct mtrr_ops *ops); From 26b340ea33f49af99449607c20c97fa3f499c5fa Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Mar 2016 15:42:02 -0600 Subject: [PATCH 098/813] x86/xen, pat: Remove PAT table init code from Xen commit 88ba281108ed0c25c9d292b48bd3f272fcb90dd0 upstream. Xen supports PAT without MTRRs for its guests. In order to enable WC attribute, it was necessary for xen_start_kernel() to call pat_init_cache_modes() to update PAT table before starting guest kernel. Now that the kernel initializes PAT table to the BIOS handoff state when MTRR is disabled, this Xen-specific PAT init code is no longer necessary. Delete it from xen_start_kernel(). Also change __init_cache_modes() to a static function since PAT table should not be tweaked by other modules. Signed-off-by: Toshi Kani Reviewed-by: Thomas Gleixner Acked-by: Juergen Gross Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Toshi Kani Cc: elliott@hpe.com Cc: paul.gortmaker@windriver.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1458769323-24491-7-git-send-email-toshi.kani@hpe.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pat.h | 1 - arch/x86/mm/pat.c | 2 +- arch/x86/xen/enlighten.c | 9 --------- 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 0ad356c066ef..0b1ff4c1c14e 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -7,7 +7,6 @@ bool pat_enabled(void); void pat_disable(const char *reason); extern void pat_init(void); -void __init_cache_modes(u64); extern int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 869bb3f03a2c..9222e6ae449a 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -191,7 +191,7 @@ static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg) * configuration. * Using lower indices is preferred, so we start with highest index. */ -void __init_cache_modes(u64 pat) +static void __init_cache_modes(u64 pat) { enum page_cache_mode cache; char pat_msg[33]; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index cf8d1bcabc56..ffa41591bff9 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -74,7 +74,6 @@ #include #include #include -#include #include #ifdef CONFIG_ACPI @@ -1519,7 +1518,6 @@ asmlinkage __visible void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; unsigned long initrd_start = 0; - u64 pat; int rc; if (!xen_start_info) @@ -1627,13 +1625,6 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_start_info->nr_pages); xen_reserve_special_pages(); - /* - * Modify the cache mode translation tables to match Xen's PAT - * configuration. - */ - rdmsrl(MSR_IA32_CR_PAT, pat); - __init_cache_modes(pat); - /* keep using Xen gdt for now; no urgent need to change it */ #ifdef CONFIG_X86_32 From e270fdc5237154293d0b58d85ee6584742f98aeb Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Mar 2016 15:42:03 -0600 Subject: [PATCH 099/813] x86/pat: Document the PAT initialization sequence commit b6350c21cfe8aa9d65e189509a23c0ea4b8362c2 upstream. Update PAT documentation to describe how PAT is initialized under various configurations. Signed-off-by: Toshi Kani Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Toshi Kani Cc: elliott@hpe.com Cc: konrad.wilk@oracle.com Cc: paul.gortmaker@windriver.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1458769323-24491-8-git-send-email-toshi.kani@hpe.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- Documentation/x86/pat.txt | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/Documentation/x86/pat.txt b/Documentation/x86/pat.txt index 54944c71b819..2a4ee6302122 100644 --- a/Documentation/x86/pat.txt +++ b/Documentation/x86/pat.txt @@ -196,3 +196,35 @@ Another, more verbose way of getting PAT related debug messages is with "debugpat" boot parameter. With this parameter, various debug messages are printed to dmesg log. 
+PAT Initialization
+------------------
+
+The following table describes how PAT is initialized under various
+configurations. The PAT MSR must be updated by Linux in order to support WC
+and WT attributes. Otherwise, the PAT MSR has the value programmed in it
+by the firmware. Note, Xen enables the WC attribute in the PAT MSR for guests.
+
+ MTRR PAT   Call Sequence              PAT State  PAT MSR
+ =========================================================
+ E    E     MTRR -> PAT init           Enabled    OS
+ E    D     MTRR -> PAT init           Disabled   -
+ D    E     MTRR -> PAT disable        Disabled   BIOS
+ D    D     MTRR -> PAT disable        Disabled   -
+ -    np/E  PAT  -> PAT disable        Disabled   BIOS
+ -    np/D  PAT  -> PAT disable        Disabled   -
+ E    !P/E  MTRR -> PAT init           Disabled   BIOS
+ D    !P/E  MTRR -> PAT disable        Disabled   BIOS
+ !M   !P/E  MTRR stub -> PAT disable   Disabled   BIOS
+
+ Legend
+ ------------------------------------------------
+ E         Feature enabled in CPU
+ D         Feature disabled/unsupported in CPU
+ np        "nopat" boot option specified
+ !P        CONFIG_X86_PAT option unset
+ !M        CONFIG_MTRR option unset
+ Enabled   PAT state set to enabled
+ Disabled  PAT state set to disabled
+ OS        PAT initializes PAT MSR with OS setting
+ BIOS      PAT keeps PAT MSR with BIOS setting
+
From fb93281fa225923c89cf94db59abfd98bba4709f Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Mon, 11 Apr 2016 13:36:00 -0600 Subject: [PATCH 100/813] x86/mm/pat: Fix BUG_ON() in mmap_mem() on QEMU/i386 commit 1886297ce0c8d563a08c8a8c4c0b97743e06cd37 upstream. The following BUG_ON() crash was reported on QEMU/i386: kernel BUG at arch/x86/mm/physaddr.c:79! Call Trace: phys_mem_access_prot_allowed mmap_mem ? mmap_region mmap_region do_mmap vm_mmap_pgoff SyS_mmap_pgoff do_int80_syscall_32 entry_INT80_32 after commit: edfe63ec97ed ("x86/mtrr: Fix Xorg crashes in Qemu sessions") PAT is now set to the disabled state when MTRRs are disabled, which reactivated the __pa(high_memory) check in phys_mem_access_prot_allowed(). When CONFIG_DEBUG_VIRTUAL is set, __pa() calls __phys_addr(), which in turn calls slow_virt_to_phys() for 'high_memory'. Because 'high_memory' is set to (the max direct mapped virt addr + 1), it is not a valid virtual address. Hence, slow_virt_to_phys() returns 0 and hits the BUG_ON. Using __pa_nodebug() instead of __pa() would fix this BUG_ON. However, this code block, originally written for Pentiums and earlier, is no longer adequate since a 32-bit Xen guest has MTRRs disabled and supports ZONE_HIGHMEM. In this setup, this code sets the UC attribute for accessing RAM in the high memory range. Delete this code block as it has been unused for a long time. Reported-by: kernel test robot Reviewed-by: Borislav Petkov Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: David Vrabel Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1460403360-25441-1-git-send-email-toshi.kani@hpe.com Link: https://lkml.org/lkml/2016/4/1/608 Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/pat.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 9222e6ae449a..6ad687d104ca 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -777,25 +777,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, if (file->f_flags & O_DSYNC) pcm = _PAGE_CACHE_MODE_UC_MINUS; -#ifdef CONFIG_X86_32 - /* - * On the PPro and successors, the MTRRs are used to set - * memory types for physical addresses outside main memory, - * so blindly setting UC or PWT on those pages is wrong. - * For Pentiums and earlier, the surround logic should disable - * caching for the high addresses through the KEN pin, but - * we maintain the tradition of paranoia in this code. - */ - if (!pat_enabled() && - !(boot_cpu_has(X86_FEATURE_MTRR) || - boot_cpu_has(X86_FEATURE_K6_MTRR) || - boot_cpu_has(X86_FEATURE_CYRIX_ARR) || - boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) && - (pfn << PAGE_SHIFT) >= __pa(high_memory)) { - pcm = _PAGE_CACHE_MODE_UC; - } -#endif - *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | cachemode2protval(pcm)); return 1; From 821d5e6b8aed558a989514ea85fa14e097111cf0 Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Mon, 8 Feb 2016 11:05:28 -0800 Subject: [PATCH 101/813] drm/i915: Pretend cursor is always on for ILK-style WM calculations (v2) commit e2e407dc093f530b771ee8bf8fe1be41e3cea8b3 upstream. Due to our lack of two-step watermark programming, our driver has historically pretended that the cursor plane is always on for the purpose of watermark calculations; this helps avoid serious flickering when the cursor turns off/on (e.g., when the user moves the mouse pointer to a different screen). That workaround was accidentally dropped as we started working toward atomic watermark updates. Since we still aren't quite there yet with two-stage updates, we need to resurrect the workaround and treat the cursor as always active. v2: Tweak cursor width calculations slightly to more closely match the logic we used before the atomic overhaul began. 
(Ville) Cc: simdev11@outlook.com Cc: manfred.kitzbichler@gmail.com Cc: drm-intel-fixes@lists.freedesktop.org Reported-by: simdev11@outlook.com Reported-by: manfred.kitzbichler@gmail.com Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93892 Fixes: 43d59eda1 ("drm/i915: Eliminate usage of plane_wm_parameters from ILK-style WM code (v2)") Signed-off-by: Matt Roper Link: http://patchwork.freedesktop.org/patch/msgid/1454479611-6804-1-git-send-email-matthew.d.roper@intel.com (cherry picked from commit b2435692dbb709d4c8ff3b2f2815c9b8423b72bb) Signed-off-by: Jani Nikula Link: http://patchwork.freedesktop.org/patch/msgid/1454958328-30129-1-git-send-email-matthew.d.roper@intel.com Tested-by: Jay Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/intel_pm.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 62284e45d531..eb434881ddbc 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -1789,16 +1789,20 @@ static uint32_t ilk_compute_cur_wm(const struct intel_crtc_state *cstate, const struct intel_plane_state *pstate, uint32_t mem_value) { - int bpp = pstate->base.fb ? pstate->base.fb->bits_per_pixel / 8 : 0; + /* + * We treat the cursor plane as always-on for the purposes of watermark + * calculation. Until we have two-stage watermark programming merged, + * this is necessary to avoid flickering. + */ + int cpp = 4; + int width = pstate->visible ? pstate->base.crtc_w : 64; - if (!cstate->base.active || !pstate->visible) + if (!cstate->base.active) return 0; return ilk_wm_method2(ilk_pipe_pixel_rate(cstate), cstate->base.adjusted_mode.crtc_htotal, - drm_rect_width(&pstate->dst), - bpp, - mem_value); + width, cpp, mem_value); } /* Only for WM_LP. */ From 3cde0e742e29d112aca58731a77d8a3aee386fb8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 27 Jul 2016 11:42:38 +0100 Subject: [PATCH 102/813] x86/syscalls/64: Add compat_sys_keyctl for 32-bit userspace commit f7d665627e103e82d34306c7d3f6f46f387c0d8b upstream. x86_64 needs to use compat_sys_keyctl for 32-bit userspace rather than calling sys_keyctl(). The latter will work in a lot of cases, thereby hiding the issue. Reported-by: Stephan Mueller Tested-by: Stephan Mueller Signed-off-by: David Howells Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keyrings@vger.kernel.org Cc: linux-security-module@vger.kernel.org Link: http://lkml.kernel.org/r/146961615805.14395.5581949237156769439.stgit@warthog.procyon.org.uk Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/syscalls/syscall_32.tbl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index f17705e1332c..e62f4401e792 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -294,7 +294,7 @@ # 285 sys_setaltroot 286 i386 add_key sys_add_key 287 i386 request_key sys_request_key -288 i386 keyctl sys_keyctl +288 i386 keyctl sys_keyctl compat_sys_keyctl 289 i386 ioprio_set sys_ioprio_set 290 i386 ioprio_get sys_ioprio_get 291 i386 inotify_init sys_inotify_init From 9a95c0cfc6f21b9ac66269d4782ea5a0f58cdf91 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 29 Jul 2016 10:40:31 +0200 Subject: [PATCH 103/813] block: fix use-after-free in seq file commit 77da160530dd1dc94f6ae15a981f24e5f0021e84 upstream. I got a KASAN report of use-after-free: ================================================================== BUG: KASAN: use-after-free in klist_iter_exit+0x61/0x70 at addr ffff8800b6581508 Read of size 8 by task trinity-c1/315 ============================================================================= BUG kmalloc-32 (Not tainted): kasan: bad access detected ----------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Allocated in disk_seqf_start+0x66/0x110 age=144 cpu=1 pid=315 ___slab_alloc+0x4f1/0x520 __slab_alloc.isra.58+0x56/0x80 kmem_cache_alloc_trace+0x260/0x2a0 disk_seqf_start+0x66/0x110 traverse+0x176/0x860 seq_read+0x7e3/0x11a0 proc_reg_read+0xbc/0x180 do_loop_readv_writev+0x134/0x210 do_readv_writev+0x565/0x660 vfs_readv+0x67/0xa0 do_preadv+0x126/0x170 SyS_preadv+0xc/0x10 do_syscall_64+0x1a1/0x460 return_from_SYSCALL_64+0x0/0x6a INFO: Freed in disk_seqf_stop+0x42/0x50 age=160 cpu=1 pid=315 __slab_free+0x17a/0x2c0 kfree+0x20a/0x220 disk_seqf_stop+0x42/0x50 traverse+0x3b5/0x860 seq_read+0x7e3/0x11a0 proc_reg_read+0xbc/0x180 do_loop_readv_writev+0x134/0x210 do_readv_writev+0x565/0x660 vfs_readv+0x67/0xa0 do_preadv+0x126/0x170 SyS_preadv+0xc/0x10 do_syscall_64+0x1a1/0x460 return_from_SYSCALL_64+0x0/0x6a CPU: 1 PID: 315 Comm: trinity-c1 Tainted: G B 4.7.0+ #62 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 ffffea0002d96000 ffff880119b9f918 ffffffff81d6ce81 ffff88011a804480 ffff8800b6581500 ffff880119b9f948 ffffffff8146c7bd ffff88011a804480 ffffea0002d96000 ffff8800b6581500 fffffffffffffff4 ffff880119b9f970 Call Trace: [] dump_stack+0x65/0x84 [] print_trailer+0x10d/0x1a0 [] object_err+0x2f/0x40 [] kasan_report_error+0x221/0x520 [] __asan_report_load8_noabort+0x3e/0x40 [] klist_iter_exit+0x61/0x70 [] class_dev_iter_exit+0x9/0x10 [] disk_seqf_stop+0x3a/0x50 [] seq_read+0x4b2/0x11a0 [] proc_reg_read+0xbc/0x180 [] do_loop_readv_writev+0x134/0x210 [] do_readv_writev+0x565/0x660 [] vfs_readv+0x67/0xa0 [] do_preadv+0x126/0x170 [] SyS_preadv+0xc/0x10 This problem can occur in the following situation: open() - pread() - .seq_start() - iter = kmalloc() // succeeds - seqf->private = iter - .seq_stop() - kfree(seqf->private) - pread() - .seq_start() - iter = kmalloc() // fails - .seq_stop() - class_dev_iter_exit(seqf->private) 
// boom! old pointer As the comment in disk_seqf_stop() says, stop is called even if start failed, so we need to reinitialise the private pointer to NULL when seq iteration stops. An alternative would be to set the private pointer to NULL when the kmalloc() in disk_seqf_start() fails. Signed-off-by: Vegard Nossum Acked-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/genhd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/genhd.c b/block/genhd.c index e5cafa51567c..d2a1d43bf9fa 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -831,6 +831,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v) if (iter) { class_dev_iter_exit(iter); kfree(iter); + seqf->private = NULL; } } From 62659f0b9ed71ffb8a1e66a42eb52ab8ddadb77a Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 2 Aug 2016 14:03:07 -0700 Subject: [PATCH 104/813] sysv, ipc: fix security-layer leaking commit 9b24fef9f0410fb5364245d6cc2bd044cc064007 upstream. Commit 53dad6d3a8e5 ("ipc: fix race with LSMs") updated ipc_rcu_putref() to receive rcu freeing function but used generic ipc_rcu_free() instead of msg_rcu_free() which does security cleaning. Running LTP msgsnd06 with kmemleak gives the following: cat /sys/kernel/debug/kmemleak unreferenced object 0xffff88003c0a11f8 (size 8): comm "msgsnd06", pid 1645, jiffies 4294672526 (age 6.549s) hex dump (first 8 bytes): 1b 00 00 00 01 00 00 00 ........ backtrace: kmemleak_alloc+0x23/0x40 kmem_cache_alloc_trace+0xe1/0x180 selinux_msg_queue_alloc_security+0x3f/0xd0 security_msg_queue_alloc+0x2e/0x40 newque+0x4e/0x150 ipcget+0x159/0x1b0 SyS_msgget+0x39/0x40 entry_SYSCALL_64_fastpath+0x13/0x8f Manfred Spraul suggested to fix sem.c as well and Davidlohr Bueso to only use ipc_rcu_free in case of security allocation failure in newary() Fixes: 53dad6d3a8e ("ipc: fix race with LSMs") Link: http://lkml.kernel.org/r/1470083552-22966-1-git-send-email-fabf@skynet.be Signed-off-by: Fabian Frederick Cc: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- ipc/msg.c | 2 +- ipc/sem.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index 1471db9a7e61..c6521c205cb4 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -680,7 +680,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, rcu_read_lock(); ipc_lock_object(&msq->q_perm); - ipc_rcu_putref(msq, ipc_rcu_free); + ipc_rcu_putref(msq, msg_rcu_free); /* raced with RMID? 
*/ if (!ipc_valid_object(&msq->q_perm)) { err = -EIDRM; diff --git a/ipc/sem.c b/ipc/sem.c index b471e5a3863d..20d07008ad5e 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -442,7 +442,7 @@ static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns static inline void sem_lock_and_putref(struct sem_array *sma) { sem_lock(sma, NULL, -1); - ipc_rcu_putref(sma, ipc_rcu_free); + ipc_rcu_putref(sma, sem_rcu_free); } static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) @@ -1385,7 +1385,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, rcu_read_unlock(); sem_io = ipc_alloc(sizeof(ushort)*nsems); if (sem_io == NULL) { - ipc_rcu_putref(sma, ipc_rcu_free); + ipc_rcu_putref(sma, sem_rcu_free); return -ENOMEM; } @@ -1419,20 +1419,20 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, if (nsems > SEMMSL_FAST) { sem_io = ipc_alloc(sizeof(ushort)*nsems); if (sem_io == NULL) { - ipc_rcu_putref(sma, ipc_rcu_free); + ipc_rcu_putref(sma, sem_rcu_free); return -ENOMEM; } } if (copy_from_user(sem_io, p, nsems*sizeof(ushort))) { - ipc_rcu_putref(sma, ipc_rcu_free); + ipc_rcu_putref(sma, sem_rcu_free); err = -EFAULT; goto out_free; } for (i = 0; i < nsems; i++) { if (sem_io[i] > SEMVMX) { - ipc_rcu_putref(sma, ipc_rcu_free); + ipc_rcu_putref(sma, sem_rcu_free); err = -ERANGE; goto out_free; } @@ -1722,7 +1722,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) /* step 2: allocate new undo structure */ new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL); if (!new) { - ipc_rcu_putref(sma, ipc_rcu_free); + ipc_rcu_putref(sma, sem_rcu_free); return ERR_PTR(-ENOMEM); } From 3d1c64d81fd887ec0cac56f0299c2234a5450011 Mon Sep 17 00:00:00 2001 From: Alexey Kuznetsov Date: Tue, 19 Jul 2016 12:48:01 -0700 Subject: [PATCH 105/813] fuse: fsync() did not return IO errors commit ac7f052b9e1534c8248f814b6f0068ad8d4a06d2 upstream. Due to implementation of fuse writeback filemap_write_and_wait_range() does not catch errors. We have to do this directly after fuse_sync_writes() Signed-off-by: Alexey Kuznetsov Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi Fixes: 4d99ff8f12eb ("fuse: Turn writeback cache on") Signed-off-by: Greg Kroah-Hartman --- fs/fuse/file.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c2e340d6ec6e..82f714229b1f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -462,6 +462,21 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, goto out; fuse_sync_writes(inode); + + /* + * Due to implementation of fuse writeback + * filemap_write_and_wait_range() does not catch errors. + * We have to do this directly after fuse_sync_writes() + */ + if (test_bit(AS_ENOSPC, &file->f_mapping->flags) && + test_and_clear_bit(AS_ENOSPC, &file->f_mapping->flags)) + err = -ENOSPC; + if (test_bit(AS_EIO, &file->f_mapping->flags) && + test_and_clear_bit(AS_EIO, &file->f_mapping->flags)) + err = -EIO; + if (err) + goto out; + err = sync_inode_metadata(inode, 1); if (err) goto out; From 9ca5f11d9261e7ed491e425b2efde5e9cecf1447 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Tue, 19 Jul 2016 18:12:26 -0700 Subject: [PATCH 106/813] fuse: fuse_flush must check mapping->flags for errors commit 9ebce595f63a407c5cec98f98f9da8459b73740a upstream. fuse_flush() calls write_inode_now() that triggers writeback, but actual writeback will happen later, on fuse_sync_writes(). 
If an error happens, fuse_writepage_end() will set an error bit in mapping->flags. So, we have to check mapping->flags after fuse_sync_writes(). Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi Fixes: 4d99ff8f12eb ("fuse: Turn writeback cache on") Signed-off-by: Greg Kroah-Hartman --- fs/fuse/file.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 82f714229b1f..d58d4c0af0ce 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -417,6 +417,15 @@ static int fuse_flush(struct file *file, fl_owner_t id) fuse_sync_writes(inode); mutex_unlock(&inode->i_mutex); + if (test_bit(AS_ENOSPC, &file->f_mapping->flags) && + test_and_clear_bit(AS_ENOSPC, &file->f_mapping->flags)) + err = -ENOSPC; + if (test_bit(AS_EIO, &file->f_mapping->flags) && + test_and_clear_bit(AS_EIO, &file->f_mapping->flags)) + err = -EIO; + if (err) + return err; + req = fuse_get_req_nofail_nopages(fc, file); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; From b6e0a217f621c62a2abd3ce4c6ae8146c8122e98 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Mon, 25 Jul 2016 21:17:04 +0800 Subject: [PATCH 107/813] fuse: fix wrong assignment of ->flags in fuse_send_init() commit 9446385f05c9af25fed53dbed3cc75763730be52 upstream. FUSE_HAS_IOCTL_DIR should be assigned to ->flags; using FUSE_IOCTL_DIR there appears to have been a typo. Signed-off-by: Wei Fang Signed-off-by: Miklos Szeredi Fixes: 69fe05c90ed5 ("fuse: add missing INIT flags") Signed-off-by: Greg Kroah-Hartman --- fs/fuse/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2913db2a5b99..0d5e8e59b390 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -926,7 +926,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | - FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | + FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT; req->in.h.opcode = FUSE_INIT; From 92f71339bceeda3a13b71e9663bf422bf3d3e941 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Wed, 6 Jul 2016 11:32:20 +0800 Subject: [PATCH 108/813] fs/dcache.c: avoid soft-lockup in dput() commit 47be61845c775643f1aa4d2a54343549f943c94c upstream. We triggered a soft lockup under a stress test that opened/accessed/wrote/closed one file concurrently on more than five different CPUs: WARN: soft lockup - CPU#0 stuck for 11s! [who:30631] ... [] dput+0x100/0x298 [] terminate_walk+0x4c/0x60 [] path_lookupat+0x5cc/0x7a8 [] filename_lookup+0x38/0xf0 [] user_path_at_empty+0x78/0xd0 [] user_path_at+0x1c/0x28 [] SyS_faccessat+0xb4/0x230 The ->d_lock trylock may fail many times because of concurrent operations, and dput() may execute for a long time. Fix this by replacing cpu_relax() with cond_resched(). dput() used to be sleepable, so making it sleepable again should be safe.
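The resulting retry pattern is worth seeing in isolation. Below is a minimal sketch, not the actual fs/dcache.c code: struct object and try_release() are hypothetical stand-ins for the dentry and dentry_kill(), and only the cpu_relax() to cond_resched() change of shape matters.

    #include <linux/sched.h>

    struct object;                                  /* hypothetical stand-in for a dentry */
    struct object *try_release(struct object *obj); /* stand-in for dentry_kill() */

    /*
     * Minimal sketch of the dput() retry loop after this patch.
     * try_release() returns the object again when its trylocks fail;
     * instead of spinning on cpu_relax(), the caller now yields with
     * cond_resched() between attempts, so the CPU holding the
     * contended lock can make progress.
     */
    static void put_object(struct object *obj)
    {
            might_sleep();
    repeat:
            obj = try_release(obj);
            if (obj) {
                    cond_resched();
                    goto repeat;
            }
    }

The might_sleep() annotation mirrors the one the patch adds to dput(): callers must now tolerate rescheduling, which is why the changelog points out that dput() used to be sleepable.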
Signed-off-by: Wei Fang Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/dcache.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 108d7d810be3..71b6056ad35d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -578,7 +578,6 @@ static struct dentry *dentry_kill(struct dentry *dentry) failed: spin_unlock(&dentry->d_lock); - cpu_relax(); return dentry; /* try again with same dentry */ } @@ -752,6 +751,8 @@ void dput(struct dentry *dentry) return; repeat: + might_sleep(); + rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); @@ -783,8 +784,10 @@ repeat: kill_it: dentry = dentry_kill(dentry); - if (dentry) + if (dentry) { + cond_resched(); goto repeat; + } } EXPORT_SYMBOL(dput); From 148fbb966837725e6ff8f151ae6053521d04882c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 15 Jun 2016 22:27:05 +0800 Subject: [PATCH 109/813] crypto: gcm - Filter out async ghash if necessary commit b30bdfa86431afbafe15284a3ad5ac19b49b88e3 upstream. As it is if you ask for a sync gcm you may actually end up with an async one because it does not filter out async implementations of ghash. This patch fixes this by adding the necessary filter when looking for ghash. Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- crypto/gcm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crypto/gcm.c b/crypto/gcm.c index bec329b3de8d..d9ea5f9c0574 100644 --- a/crypto/gcm.c +++ b/crypto/gcm.c @@ -639,7 +639,9 @@ static int crypto_gcm_create_common(struct crypto_template *tmpl, ghash_alg = crypto_find_alg(ghash_name, &crypto_ahash_type, CRYPTO_ALG_TYPE_HASH, - CRYPTO_ALG_TYPE_AHASH_MASK); + CRYPTO_ALG_TYPE_AHASH_MASK | + crypto_requires_sync(algt->type, + algt->mask)); if (IS_ERR(ghash_alg)) return PTR_ERR(ghash_alg); From 08bb036c9d82ec70fd88c7e08345373a97f98637 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 12 Jul 2016 13:17:57 +0800 Subject: [PATCH 110/813] crypto: scatterwalk - Fix test in scatterwalk_done commit 5f070e81bee35f1b7bd1477bb223a873ff657803 upstream. When there is more data to be processed, the current test in scatterwalk_done may prevent us from calling pagedone even when we should. In particular, if we're on an SG entry spanning multiple pages where the last page is not a full page, we will incorrectly skip calling pagedone on the second last page. This patch fixes this by adding a separate test for whether we've reached the end of a page. Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- crypto/scatterwalk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crypto/scatterwalk.c b/crypto/scatterwalk.c index ea5815c5e128..bc769c448d4a 100644 --- a/crypto/scatterwalk.c +++ b/crypto/scatterwalk.c @@ -72,7 +72,8 @@ static void scatterwalk_pagedone(struct scatter_walk *walk, int out, void scatterwalk_done(struct scatter_walk *walk, int out, int more) { - if (!(scatterwalk_pagelen(walk) & (PAGE_SIZE - 1)) || !more) + if (!more || walk->offset >= walk->sg->offset + walk->sg->length || + !(walk->offset & (PAGE_SIZE - 1))) scatterwalk_pagedone(walk, out, more); } EXPORT_SYMBOL_GPL(scatterwalk_done); From 9e38db20d794504bb52f9592c90cdc8754f97251 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 30 Jun 2016 11:53:46 -0400 Subject: [PATCH 111/813] ext4: check for extents that wrap around commit f70749ca42943faa4d4dcce46dfdcaadb1d0c4b6 upstream. 
An extent with lblock = 4294967295 and len = 1 will pass the ext4_valid_extent() test: ext4_lblk_t last = lblock + len - 1; if (len == 0 || lblock > last) return 0; since last = 4294967295 + 1 - 1 = 4294967295. This would later trigger the BUG_ON(es->es_lblk + es->es_len < es->es_lblk) in ext4_es_end(). We can simplify the test by removing the - 1 altogether and using lblock + len <= lblock instead: if len = 0, then lblock + 0 == lblock and the extent is rejected, and if len > 0, then lblock + len must be greater than lblock to pass (i.e. it must not overflow). Fixes: 5946d0893 ("ext4: check for overlapping extents in ext4_valid_extent_entries()") Fixes: 2f974865f ("ext4: check for zero length extent explicitly") Cc: Eryu Guan Signed-off-by: Phil Turnbull Signed-off-by: Vegard Nossum Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/extents.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 62880586ed85..8eac7d586997 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -376,9 +376,13 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); - ext4_lblk_t last = lblock + len - 1; - if (len == 0 || lblock > last) + /* + * We allow neither: + * - zero length + * - overflow/wrap-around + */ + if (lblock + len <= lblock) return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } From 5a7f477c725e866729307ff87011f8dd812a3cdf Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 4 Jul 2016 10:14:01 -0400 Subject: [PATCH 112/813] ext4: fix deadlock during page writeback commit 646caa9c8e196880b41cd3e3d33a2ebc752bdb85 upstream. Commit 06bd3c36a733 (ext4: fix data exposure after a crash) uncovered a deadlock in ext4_writepages() which was previously much harder to hit. After this commit, xfstest generic/130 reproduces the deadlock on small filesystems. The problem happens when ext4_do_update_inode() sets the LARGE_FILE feature and marks the current inode handle as synchronous. That subsequently causes ext4_journal_stop(), called from ext4_writepages(), to block waiting for the transaction commit while still holding page locks, a reference to the io_end, and a prepared bio in the mpd structure, each of which can block the transaction commit from completing and thus results in a deadlock. Fix the problem by releasing the page locks, the io_end reference, and submitting the prepared bio before calling ext4_journal_stop(). [ Changed to defer the call to ext4_journal_stop() only if the handle is synchronous. --tytso ] Reported-and-tested-by: Eryu Guan Signed-off-by: Theodore Ts'o Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e31d762eedce..6af24fe4ae2d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2589,13 +2589,36 @@ retry: done = true; } } - ext4_journal_stop(handle); + /* + * Caution: If the handle is synchronous, + * ext4_journal_stop() can wait for transaction commit + * to finish which may depend on writeback of pages to + * complete or on page lock to be released. In that + * case, we have to wait until after we have + * submitted all the IO, released page locks we hold, + * and dropped io_end reference (for extent conversion + * to be able to complete) before stopping the handle.
+ */ + if (!ext4_handle_valid(handle) || handle->h_sync == 0) { + ext4_journal_stop(handle); + handle = NULL; + } /* Submit prepared bio */ ext4_io_submit(&mpd.io_submit); /* Unlock pages we didn't use */ mpage_release_unused_pages(&mpd, give_up_on_write); - /* Drop our io_end reference we got from init */ - ext4_put_io_end(mpd.io_submit.io_end); + /* + * Drop our io_end reference we got from init. We have + * to be careful and use deferred io_end finishing if + * we are still holding the transaction as we can + * release the last reference to io_end which may end + * up doing unwritten extent conversion. + */ + if (handle) { + ext4_put_io_end_defer(mpd.io_submit.io_end); + ext4_journal_stop(handle); + } else + ext4_put_io_end(mpd.io_submit.io_end); if (ret == -ENOSPC && sbi->s_journal) { /* From 175f36cb34d4b06ca2384073f2b741db2e0f915b Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Mon, 4 Jul 2016 11:03:00 -0400 Subject: [PATCH 113/813] ext4: don't call ext4_should_journal_data() on the journal inode commit 6a7fd522a7c94cdef0a3b08acf8e6702056e635c upstream. If ext4_fill_super() fails early, it's possible for ext4_evict_inode() to call ext4_should_journal_data() before superblock options and flags are fully set up. In that case, the iput() on the journal inode can end up causing a BUG(). Work around this problem by reordering the tests so we only call ext4_should_journal_data() after we know it's not the journal inode. Fixes: 2d859db3e4 ("ext4: fix data corruption in inodes with journalled data") Fixes: 2b405bfa84 ("ext4: fix data=journal fast mount/umount hang") Cc: Jan Kara Signed-off-by: Vegard Nossum Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6af24fe4ae2d..9a5ad0f0d3ed 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -205,9 +205,9 @@ void ext4_evict_inode(struct inode *inode) * Note that directories do not have this problem because they * don't use page cache. */ - if (ext4_should_journal_data(inode) && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && - inode->i_ino != EXT4_JOURNAL_INO) { + if (inode->i_ino != EXT4_JOURNAL_INO && + ext4_should_journal_data(inode) && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; From f8d4d52ce410c804d56fab866fa9fd2ec04d8d6e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 5 Jul 2016 20:01:52 -0400 Subject: [PATCH 114/813] ext4: validate s_reserved_gdt_blocks on mount commit 5b9554dc5bf008ae7f68a52e3d7e76c0920938a2 upstream. If s_reserved_gdt_blocks is extremely large, it's possible for ext4_init_block_bitmap(), which is called when ext4 sets up an uninitialized block bitmap, to corrupt random kernel memory. Add the same checks which e2fsck has --- it must never be larger than blocksize / sizeof(__u32) --- and then add a backup check in ext4_init_block_bitmap() in case the superblock gets modified after the file system is mounted. 
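The arithmetic behind the backup check is simple enough to demonstrate standalone. The sketch below is a hypothetical userspace helper, not the ext4 code; it shows the same bound that ext4_init_block_bitmap() now enforces, where a corrupted count inflates the number of bits to set past the buffer that backs the bitmap.

    #include <stdint.h>
    #include <string.h>

    /*
     * Before setting 'nbits' bits in a 'bufsize'-byte bitmap, check
     * that nbits/8 fits in the buffer.  An insane on-disk value
     * inflates nbits and would otherwise scribble past the end of
     * the buffer, exactly like a huge s_reserved_gdt_blocks.
     */
    static int init_bitmap(uint8_t *buf, size_t bufsize, unsigned int nbits)
    {
            unsigned int bit;

            if ((nbits >> 3) >= bufsize)
                    return -1;      /* treat as corrupted metadata */

            memset(buf, 0, bufsize);
            for (bit = 0; bit < nbits; bit++)
                    buf[bit >> 3] |= 1u << (bit & 7);
            return 0;
    }

The mount-time limit of blocksize / sizeof(__u32) matches e2fsck's rule: one block can hold at most blocksize / 4 32-bit block numbers, so no more reserved GDT blocks than that can ever be addressed from a single block of pointers.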
Reported-by: Vegard Nossum Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/balloc.c | 3 +++ fs/ext4/super.c | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index fe1f50fe764f..f97110461c19 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -208,6 +208,9 @@ static int ext4_init_block_bitmap(struct super_block *sb, memset(bh->b_data, 0, sb->s_blocksize); bit_max = ext4_num_base_meta_clusters(sb, block_group); + if ((bit_max >> 3) >= bh->b_size) + return -EFSCORRUPTED; + for (bit = 0; bit < bit_max; bit++) ext4_set_bit(bit, bh->b_data); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 852c26806af2..2d7b5462bcaa 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3372,6 +3372,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) { + ext4_msg(sb, KERN_ERR, + "Number of reserved GDT blocks insanely large: %d", + le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); + goto failed_mount; + } + if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { if (blocksize != PAGE_SIZE) { ext4_msg(sb, KERN_ERR, From db82c747482bab275cd639ed0007ee27ec0c35a1 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 14 Jul 2016 23:21:35 -0400 Subject: [PATCH 115/813] ext4: short-cut orphan cleanup on error commit c65d5c6c81a1f27dec5f627f67840726fcd146de upstream. If we encounter a filesystem error during orphan cleanup, we should stop. Otherwise, we may end up in an infinite loop where the same inode is processed again and again. EXT4-fs (loop0): warning: checktime reached, running e2fsck is recommended EXT4-fs error (device loop0): ext4_mb_generate_buddy:758: group 2, block bitmap and bg descriptor inconsistent: 6117 vs 0 free clusters Aborting journal on device loop0-8. EXT4-fs (loop0): Remounting filesystem read-only EXT4-fs error (device loop0) in ext4_free_blocks:4895: Journal has aborted EXT4-fs error (device loop0) in ext4_do_update_inode:4893: Journal has aborted EXT4-fs error (device loop0) in ext4_do_update_inode:4893: Journal has aborted EXT4-fs error (device loop0) in ext4_ext_remove_space:3068: IO failure EXT4-fs error (device loop0) in ext4_ext_truncate:4667: Journal has aborted EXT4-fs error (device loop0) in ext4_orphan_del:2927: Journal has aborted EXT4-fs error (device loop0) in ext4_do_update_inode:4893: Journal has aborted EXT4-fs (loop0): Inode 16 (00000000618192a0): orphan list check failed! [...] EXT4-fs (loop0): Inode 16 (0000000061819748): orphan list check failed! [...] EXT4-fs (loop0): Inode 16 (0000000061819bf0): orphan list check failed! [...] See-also: c9eb13a9105 ("ext4: fix hang when processing corrupted orphaned inode list") Cc: Jan Kara Signed-off-by: Vegard Nossum Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2d7b5462bcaa..c542ebcf7a92 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2240,6 +2240,16 @@ static void ext4_orphan_cleanup(struct super_block *sb, while (es->s_last_orphan) { struct inode *inode; + /* + * We may have encountered an error during cleanup; if + * so, skip the rest. 
+ */ + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + es->s_last_orphan = 0; + break; + } + inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); if (IS_ERR(inode)) { es->s_last_orphan = 0; From 3a22cf0c7b597f7139d3fdd27fa70aa55aa6d977 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 14 Jul 2016 23:02:47 -0400 Subject: [PATCH 116/813] ext4: fix reference counting bug on block allocation error commit 554a5ccc4e4a20c5f3ec859de0842db4b4b9c77e upstream. If we hit this error when mounted with errors=continue or errors=remount-ro: EXT4-fs error (device loop0): ext4_mb_mark_diskspace_used:2940: comm ext4.exe: Allocating blocks 5090-6081 which overlap fs metadata then ext4_mb_new_blocks() will call ext4_mb_release_context() and try to continue. However, ext4_mb_release_context() is the wrong thing to call here since we are still actually using the allocation context. Instead, just error out. We could retry the allocation, but there is a possibility of getting stuck in an infinite loop instead, so this seems safer. [ Fixed up so we don't return EAGAIN to userspace. --tytso ] Fixes: 8556e8f3b6 ("ext4: Don't allow new groups to be added during block allocation") Signed-off-by: Vegard Nossum Signed-off-by: Theodore Ts'o Cc: Aneesh Kumar K.V Signed-off-by: Greg Kroah-Hartman --- fs/ext4/mballoc.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cf734170daa9..c4dcac8a018d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2932,7 +2932,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata", block, block+len); /* File system mounted not to panic on error - * Fix the bitmap and repeat the block allocation + * Fix the bitmap and return EFSCORRUPTED * We leak some of the blocks here. */ ext4_lock_group(sb, ac->ac_b_ex.fe_group); @@ -2941,7 +2941,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ext4_unlock_group(sb, ac->ac_b_ex.fe_group); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!err) - err = -EAGAIN; + err = -EFSCORRUPTED; goto out_err; } @@ -4506,18 +4506,7 @@ repeat: } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); - if (*errp == -EAGAIN) { - /* - * drop the reference that we took - * in ext4_mb_use_best_found - */ - ext4_mb_release_context(ac); - ac->ac_b_ex.fe_group = 0; - ac->ac_b_ex.fe_start = 0; - ac->ac_b_ex.fe_len = 0; - ac->ac_status = AC_STATUS_CONTINUE; - goto repeat; - } else if (*errp) { + if (*errp) { ext4_discard_allocated_blocks(ac); goto errout; } else { From 8627c7750a66a46d56d3564e1e881aa53764497c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jul 2016 15:44:57 -0700 Subject: [PATCH 117/813] mm: memcontrol: fix cgroup creation failure after many small jobs commit 73f576c04b9410ed19660f74f97521bee6e1c546 upstream. The memory controller has quite a bit of state that usually outlives the cgroup and pins its CSS until said state disappears. At the same time it imposes a 16-bit limit on the CSS ID space to economically store IDs in the wild. Consequently, when we use cgroups to contain frequent but small and short-lived jobs that leave behind some page cache, we quickly run into the 64k limitations of outstanding CSSs. 
Creating a new cgroup fails with -ENOSPC while there are only a few, or even no user-visible cgroups in existence. Although pinning CSSs past cgroup removal is common, there are only two instances that actually need an ID after a cgroup is deleted: cache shadow entries and swapout records. Cache shadow entries reference the ID weakly and can deal with the CSS having disappeared when it's looked up later. They pose no hurdle. Swap-out records do need to pin the css to hierarchically attribute swapins after the cgroup has been deleted; though the only pages that remain swapped out after offlining are tmpfs/shmem pages. And those references are under the user's control, so they are manageable. This patch introduces a private 16-bit memcg ID and switches swap and cache shadow entries over to using that. This ID can then be recycled after offlining when the CSS remains pinned only by objects that don't specifically need it. This script demonstrates the problem by faulting one cache page in a new cgroup and deleting it again: set -e mkdir -p pages for x in `seq 128000`; do [ $((x % 1000)) -eq 0 ] && echo $x mkdir /cgroup/foo echo $$ >/cgroup/foo/cgroup.procs echo trex >pages/$x echo $$ >/cgroup/cgroup.procs rmdir /cgroup/foo done When run on an unpatched kernel, we eventually run out of possible IDs even though there are no visible cgroups: [root@ham ~]# ./cssidstress.sh [...] 65000 mkdir: cannot create directory '/cgroup/foo': No space left on device After this patch, the IDs get released upon cgroup destruction and the cache and css objects get released once memory reclaim kicks in. [hannes@cmpxchg.org: init the IDR] Link: http://lkml.kernel.org/r/20160621154601.GA22431@cmpxchg.org Fixes: b2052564e66d ("mm: memcontrol: continue cache reclaim from offlined groups") Link: http://lkml.kernel.org/r/20160617162516.GD19084@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: John Garcia Reviewed-by: Vladimir Davydov Acked-by: Tejun Heo Cc: Nikolay Borisov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Michal Hocko Signed-off-by: Greg Kroah-Hartman --- include/linux/memcontrol.h | 8 ++++ mm/memcontrol.c | 91 ++++++++++++++++++++++++++++++-------- mm/slab_common.c | 4 +- 3 files changed, 83 insertions(+), 20 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index cd0e2413c358..435fd8426b8a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -174,6 +174,11 @@ struct mem_cgroup_thresholds { struct mem_cgroup_threshold_ary *spare; }; +struct mem_cgroup_id { + int id; + atomic_t ref; +}; + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide @@ -183,6 +188,9 @@ struct mem_cgroup_thresholds { struct mem_cgroup { struct cgroup_subsys_state css; + /* Private memcg ID. Used to ID objects that outlive the cgroup */ + struct mem_cgroup_id id; + /* Accounted resources */ struct page_counter memory; struct page_counter memsw; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 67648e6b2ac8..e139c982b143 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -272,21 +272,7 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { - return memcg->css.id; -} - -/* - * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller is responsible for calling - * css_tryget_online() if the mem_cgroup is used for charging. 
(dropping - * refcnt from swap can be called against removed memcg.) - */ -static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) -{ - struct cgroup_subsys_state *css; - - css = css_from_id(id, &memory_cgrp_subsys); - return mem_cgroup_from_css(css); + return memcg->id.id; } /* Writing them here to avoid exposing memcg's inner layout */ @@ -4124,6 +4110,60 @@ static struct cftype mem_cgroup_legacy_files[] = { { }, /* terminate */ }; +/* + * Private memory cgroup IDR + * + * Swap-out records and page cache shadow entries need to store memcg + * references in constrained space, so we maintain an ID space that is + * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of + * memory-controlled cgroups to 64k. + * + * However, there usually are many references to the offline CSS after + * the cgroup has been destroyed, such as page cache or reclaimable + * slab objects, that don't need to hang on to the ID. We want to keep + * those dead CSSs from occupying IDs, or we might quickly exhaust the + * relatively small ID space and prevent the creation of new cgroups + * even when there are far fewer than 64k cgroups - possibly none. + * + * Maintain a private 16-bit ID space for memcg, and allow the ID to + * be freed and recycled when it's no longer needed, which is usually + * when the CSS is offlined. + * + * The only exceptions to that are records of swapped out tmpfs/shmem + * pages that need to be attributed to live ancestors on swapin. But + * those references are manageable from userspace. + */ + +static DEFINE_IDR(mem_cgroup_idr); + +static void mem_cgroup_id_get(struct mem_cgroup *memcg) +{ + atomic_inc(&memcg->id.ref); +} + +static void mem_cgroup_id_put(struct mem_cgroup *memcg) +{ + if (atomic_dec_and_test(&memcg->id.ref)) { + idr_remove(&mem_cgroup_idr, memcg->id.id); + memcg->id.id = 0; + + /* Memcg ID pins CSS */ + css_put(&memcg->css); + } +} + +/** + * mem_cgroup_from_id - look up a memcg from a memcg id + * @id: the memcg id to look up + * + * Caller must hold rcu_read_lock().
+ */ +struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return idr_find(&mem_cgroup_idr, id); +} + static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; @@ -4178,6 +4218,12 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (memcg_wb_domain_init(memcg, GFP_KERNEL)) goto out_free_stat; + memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, + 1, MEM_CGROUP_ID_MAX, + GFP_KERNEL); + if (memcg->id.id < 0) + goto out_free_stat; + return memcg; out_free_stat: @@ -4263,9 +4309,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); #endif + idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return &memcg->css; free_out: + idr_remove(&mem_cgroup_idr, memcg->id.id); __mem_cgroup_free(memcg); return ERR_PTR(error); } @@ -4277,8 +4325,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); int ret; - if (css->id > MEM_CGROUP_ID_MAX) - return -ENOSPC; + /* Online state pins memcg ID, memcg ID pins CSS */ + mem_cgroup_id_get(mem_cgroup_from_css(css)); + css_get(css); if (!parent) return 0; @@ -4352,6 +4401,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_deactivate_kmem(memcg); wb_memcg_offline(memcg); + + mem_cgroup_id_put(memcg); } static void mem_cgroup_css_released(struct cgroup_subsys_state *css) @@ -5685,6 +5736,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!memcg) return; + mem_cgroup_id_get(memcg); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); VM_BUG_ON_PAGE(oldid, page); mem_cgroup_swap_statistics(memcg, true); @@ -5703,6 +5755,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON(!irqs_disabled()); mem_cgroup_charge_statistics(memcg, page, -1); memcg_check_events(memcg, page); + + if (!mem_cgroup_is_root(memcg)) + css_put(&memcg->css); } /** @@ -5726,7 +5781,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memsw, 1); mem_cgroup_swap_statistics(memcg, false); - css_put(&memcg->css); + mem_cgroup_id_put(memcg); } rcu_read_unlock(); } diff --git a/mm/slab_common.c b/mm/slab_common.c index 3c6a86b4ec25..bec2fce9fafc 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -521,8 +521,8 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out_unlock; cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); - cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - css->id, memcg_name_buf); + cache_name = kasprintf(GFP_KERNEL, "%s(%llu:%s)", root_cache->name, + css->serial_nr, memcg_name_buf); if (!cache_name) goto out_unlock; From a0fddee3fb342a4150c83c36e317660663691a72 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 11 Aug 2016 15:33:00 -0700 Subject: [PATCH 118/813] mm: memcontrol: fix swap counter leak on swapout from offline cgroup commit 1f47b61fb4077936465dcde872a4e5cc4fe708da upstream. An offline memory cgroup might have anonymous memory or shmem left charged to it and no swap. Since only swap entries pin the id of an offline cgroup, such a cgroup will have no id and so an attempt to swapout its anon/shmem will not store memory cgroup info in the swap cgroup map. As a result, memcg->swap or memcg->memsw will never get uncharged from it and any of its ascendants. 
Fix this by always charging swapout to the first ancestor cgroup that hasn't released its id yet. [hannes@cmpxchg.org: add comment to mem_cgroup_swapout] [vdavydov@virtuozzo.com: use WARN_ON_ONCE() in mem_cgroup_id_get_online()] Link: http://lkml.kernel.org/r/20160803123445.GJ13263@esperanza Fixes: 73f576c04b941 ("mm: memcontrol: fix cgroup creation failure after many small jobs") Link: http://lkml.kernel.org/r/5336daa5c9a32e776067773d9da655d2dc126491.1470219853.git.vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: [3.19+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Michal Hocko Signed-off-by: Greg Kroah-Hartman --- mm/memcontrol.c | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e139c982b143..27eaee2acaae 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4141,6 +4141,24 @@ static void mem_cgroup_id_get(struct mem_cgroup *memcg) atomic_inc(&memcg->id.ref); } +static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) +{ + while (!atomic_inc_not_zero(&memcg->id.ref)) { + /* + * The root cgroup cannot be destroyed, so its refcount must + * always be >= 1. + */ + if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { + VM_BUG_ON(1); + break; + } + memcg = parent_mem_cgroup(memcg); + if (!memcg) + memcg = root_mem_cgroup; + } + return memcg; +} + static void mem_cgroup_id_put(struct mem_cgroup *memcg) { if (atomic_dec_and_test(&memcg->id.ref)) { @@ -5721,7 +5739,7 @@ subsys_initcall(mem_cgroup_init); */ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { - struct mem_cgroup *memcg; + struct mem_cgroup *memcg, *swap_memcg; unsigned short oldid; VM_BUG_ON_PAGE(PageLRU(page), page); @@ -5736,16 +5754,27 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!memcg) return; - mem_cgroup_id_get(memcg); - oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + /* + * In case the memcg owning these pages has been offlined and doesn't + * have an ID allocated to it anymore, charge the closest online + * ancestor for the swap instead and transfer the memory+swap charge. + */ + swap_memcg = mem_cgroup_id_get_online(memcg); + oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg)); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(memcg, true); + mem_cgroup_swap_statistics(swap_memcg, true); page->mem_cgroup = NULL; if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, 1); + if (memcg != swap_memcg) { + if (!mem_cgroup_is_root(swap_memcg)) + page_counter_charge(&swap_memcg->memsw, 1); + page_counter_uncharge(&memcg->memsw, 1); + } + /* * Interrupts should be disabled here because the caller holds the * mapping->tree_lock lock which is taken with interrupts-off. It is From eccccb42d44f44badcfbdbb4e21a4f30d9694666 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 11 Aug 2016 15:33:03 -0700 Subject: [PATCH 119/813] mm: memcontrol: fix memcg id ref counter on swap charge move commit 615d66c37c755c49ce022c9e5ac0875d27d2603d upstream. Since commit 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after many small jobs") swap entries do not pin memcg->css.refcnt directly. Instead, they pin memcg->id.ref. So we should adjust the reference counters accordingly when moving swap charges between cgroups.
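The shape of the fix is a batched version of the get/put helpers introduced by the commit above. A minimal sketch of the pattern with hypothetical names follows; the real mem_cgroup_id_get_many()/_put_many() helpers are in the diff below, and release_id() stands in for the idr_remove() plus css_put() that the real put helper performs when the last reference is gone.

    #include <linux/atomic.h>

    void release_id(void);      /* hypothetical: recycle the ID */

    /* Take n ID references in one step. */
    static void id_get_many(atomic_t *ref, unsigned int n)
    {
            atomic_add(n, ref);
    }

    /* Drop n ID references; recycle the ID when the count hits zero. */
    static void id_put_many(atomic_t *ref, unsigned int n)
    {
            if (atomic_sub_and_test(n, ref))
                    release_id();
    }

Moving mc.moved_swap entries between cgroups must transfer that many ID references in one step; taking or dropping css references instead is exactly the imbalance being fixed.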
Fixes: 73f576c04b941 ("mm: memcontrol: fix cgroup creation failure after many small jobs") Link: http://lkml.kernel.org/r/9ce297c64954a42dc90b543bc76106c4a94f07e8.1470219853.git.vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Michal Hocko Signed-off-by: Greg Kroah-Hartman --- mm/memcontrol.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 27eaee2acaae..6b90d184e9c0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4136,9 +4136,9 @@ static struct cftype mem_cgroup_legacy_files[] = { static DEFINE_IDR(mem_cgroup_idr); -static void mem_cgroup_id_get(struct mem_cgroup *memcg) +static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { - atomic_inc(&memcg->id.ref); + atomic_add(n, &memcg->id.ref); } static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) @@ -4159,9 +4159,9 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) return memcg; } -static void mem_cgroup_id_put(struct mem_cgroup *memcg) +static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { - if (atomic_dec_and_test(&memcg->id.ref)) { + if (atomic_sub_and_test(n, &memcg->id.ref)) { idr_remove(&mem_cgroup_idr, memcg->id.id); memcg->id.id = 0; @@ -4170,6 +4170,16 @@ static void mem_cgroup_id_put(struct mem_cgroup *memcg) } } +static inline void mem_cgroup_id_get(struct mem_cgroup *memcg) +{ + mem_cgroup_id_get_many(memcg, 1); +} + +static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) +{ + mem_cgroup_id_put_many(memcg, 1); +} + /** * mem_cgroup_from_id - look up a memcg from a memcg id * @id: the memcg id to look up @@ -4854,6 +4864,8 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.from)) page_counter_uncharge(&mc.from->memsw, mc.moved_swap); + mem_cgroup_id_put_many(mc.from, mc.moved_swap); + /* * we charged both to->memory and to->memsw, so we * should uncharge to->memory. @@ -4861,9 +4873,9 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.to)) page_counter_uncharge(&mc.to->memory, mc.moved_swap); - css_put_many(&mc.from->css, mc.moved_swap); + mem_cgroup_id_get_many(mc.to, mc.moved_swap); + css_put_many(&mc.to->css, mc.moved_swap); - /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } memcg_oom_recover(from); From e4884275a4bb1cbce5a24a507c3e267c887dc1bd Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 16 Aug 2016 09:31:54 +0200 Subject: [PATCH 120/813] Linux 4.4.18 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 76d34f763a41..eaedea88a8a7 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 17 +SUBLEVEL = 18 EXTRAVERSION = NAME = Blurry Fish Butt From df9e01a6c5d4d31aeb109c60fd964da6b5312629 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Sun, 12 Jun 2016 17:37:52 -0700 Subject: [PATCH 121/813] android-recommended.cfg: enable fstack-protector-strong If compiler has stack protector support, set CONFIG_CC_STACKPROTECTOR_STRONG. 
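-fstack-protector-strong widens the set of functions that get a stack canary: where the plain protector only instruments functions with sufficiently large character arrays, the strong variant also covers functions with any local array and functions where a local's address escapes. An illustrative userspace example, not from the kernel tree:

    #include <stdio.h>

    /*
     * With -fstack-protector (default ssp-buffer-size), this function
     * gets no canary: there is no char buffer.  With
     * -fstack-protector-strong it is instrumented, because the
     * address of the local 'v' escapes to scanf().
     */
    int read_val(void)
    {
            int v = 0;

            if (scanf("%d", &v) != 1)
                    return -1;
            return v;
    }

With CONFIG_CC_STACKPROTECTOR_STRONG=y the kernel build passes -fstack-protector-strong, trading a small size and time cost for canaries in functions like the one above.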
Bug: 28967314 Change-Id: I588c2d544250e9e4b5082b43c237b8f85b7313ca Signed-off-by: Jeff Vander Stoep --- android/configs/android-recommended.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index c3222a77ba24..2f1ef077aa9e 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -11,6 +11,7 @@ CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_CC_STACKPROTECTOR_STRONG=y CONFIG_COMPACTION=y CONFIG_DEBUG_RODATA=y CONFIG_DM_UEVENT=y From 6fcda640710c9972321673f8290cdbb5bdfd06b5 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 1 Jun 2016 10:28:49 -0700 Subject: [PATCH 122/813] ANDROID: sdcardfs: fix itnull.cocci warnings List_for_each_entry has the property that the first argument is always bound to a real list element, never NULL, so testing dentry is not needed. Generated by: scripts/coccinelle/iterators/itnull.cocci Cc: Daniel Rosenberg Signed-off-by: Julia Lawall Signed-off-by: Fengguang Wu Signed-off-by: Guenter Roeck --- fs/sdcardfs/derived_perm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 128b3e56851f..41e0e11b3c35 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -112,7 +112,7 @@ void get_derived_permission(struct dentry *parent, struct dentry *dentry) void get_derive_permissions_recursive(struct dentry *parent) { struct dentry *dentry; list_for_each_entry(dentry, &parent->d_subdirs, d_child) { - if (dentry && dentry->d_inode) { + if (dentry->d_inode) { mutex_lock(&dentry->d_inode->i_mutex); get_derived_permission(parent, dentry); fix_derived_permission(dentry->d_inode); From d1b7a4749f8b9c3bc020413f9a4c02524b15c96b Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 23 Feb 2016 11:03:12 +0000 Subject: [PATCH 123/813] UPSTREAM: KEYS: Fix ASN.1 indefinite length object parsing (cherry pick from commit 23c8a812dc3c621009e4f0e5342aa4e2ede1ceaa) This fixes CVE-2016-0758. In the ASN.1 decoder, when the length field of an ASN.1 value is extracted, it isn't validated against the remaining amount of data before being added to the cursor. With a sufficiently large size indicated, the check: datalen - dp < 2 may then fail due to integer overflow. Fix this by checking the length indicated against the amount of remaining data in both places a definite length is determined. Whilst we're at it, make the following changes: (1) Check the maximum size of extended length does not exceed the capacity of the variable it's being stored in (len) rather than the type that variable is assumed to be (size_t). (2) Compare the EOC tag to the symbolic constant ASN1_EOC rather than the integer 0. (3) To reduce confusion, move the initialisation of len outside of: for (len = 0; n > 0; n--) { since it doesn't have anything to do with the loop counter n. Signed-off-by: David Howells Reviewed-by: Mimi Zohar Acked-by: David Woodhouse Acked-by: Peter Jones Change-Id: If760bc3b8ab0e59fefc24fa687514324348fb8e8 Bug: 29814470 --- lib/asn1_decoder.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c index 2b3f46c049d4..554522934c44 100644 --- a/lib/asn1_decoder.c +++ b/lib/asn1_decoder.c @@ -74,7 +74,7 @@ next_tag: /* Extract a tag from the data */ tag = data[dp++]; - if (tag == 0) { + if (tag == ASN1_EOC) { /* It appears to be an EOC. 
*/ if (data[dp++] != 0) goto invalid_eoc; @@ -96,10 +96,8 @@ next_tag: /* Extract the length */ len = data[dp++]; - if (len <= 0x7f) { - dp += len; - goto next_tag; - } + if (len <= 0x7f) + goto check_length; if (unlikely(len == ASN1_INDEFINITE_LENGTH)) { /* Indefinite length */ @@ -110,14 +108,18 @@ next_tag: } n = len - 0x80; - if (unlikely(n > sizeof(size_t) - 1)) + if (unlikely(n > sizeof(len) - 1)) goto length_too_long; if (unlikely(n > datalen - dp)) goto data_overrun_error; - for (len = 0; n > 0; n--) { + len = 0; + for (; n > 0; n--) { len <<= 8; len |= data[dp++]; } +check_length: + if (len > datalen - dp) + goto data_overrun_error; dp += len; goto next_tag; From 2f6175aefb22ea7bbc1244f32c8b80abe599b72c Mon Sep 17 00:00:00 2001 From: Riley Andrews Date: Fri, 5 Jun 2015 18:59:29 -0700 Subject: [PATCH 124/813] cpuset: Add allow_attach hook for cpusets on android. This patch provides a allow_attach hook for cpusets, which resolves lots of the following logcat noise. W SchedPolicy: add_tid_to_cgroup failed to write '2816' (Permission denied); fd=29 W ActivityManager: Failed setting process group of 2816 to 0 W System.err: java.lang.IllegalArgumentException W System.err: at android.os.Process.setProcessGroup(Native Method) W System.err: at com.android.server.am.ActivityManagerService.applyOomAdjLocked(ActivityManagerService.java:18763) W System.err: at com.android.server.am.ActivityManagerService.updateOomAdjLocked(ActivityManagerService.java:19028) W System.err: at com.android.server.am.ActivityManagerService.updateOomAdjLocked(ActivityManagerService.java:19106) W System.err: at com.android.server.am.ActiveServices.serviceDoneExecutingLocked(ActiveServices.java:2015) W System.err: at com.android.server.am.ActiveServices.publishServiceLocked(ActiveServices.java:905) W System.err: at com.android.server.am.ActivityManagerService.publishService(ActivityManagerService.java:16065) W System.err: at android.app.ActivityManagerNative.onTransact(ActivityManagerNative.java:1007) W System.err: at com.android.server.am.ActivityManagerService.onTransact(ActivityManagerService.java:2493) W System.err: at android.os.Binder.execTransact(Binder.java:453) Change-Id: Ic1b61b2bbb7ce74c9e9422b5e22ee9078251de21 [Ported to 4.4, added commit message] Signed-off-by: John Stultz --- kernel/cpuset.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 11eaf14b52c2..a65d63463420 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2074,12 +2074,30 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&cpuset_mutex); } +static int cpuset_allow_attach(struct cgroup_taskset *tset) +{ + const struct cred *cred = current_cred(), *tcred; + struct task_struct *task; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(task, css, tset) { + tcred = __task_cred(task); + + if ((current != task) && !capable(CAP_SYS_ADMIN) && + cred->euid.val != tcred->uid.val && cred->euid.val != tcred->suid.val) + return -EACCES; + } + + return 0; +} + struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, + .allow_attach = cpuset_allow_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, .post_attach = cpuset_post_attach, From 0e1b4ef27e57a8c3c9687c5c2da9dc773ab46672 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 5 Jul 2016 22:12:36 -0700 Subject: [PATCH 125/813] UPSTREAM: ppp: 
defer netns reference release for ppp channel

(cherry picked from commit 205e1e255c479f3fd77446415706463b282f94e4)

Matt reported that we have a NULL pointer dereference in ppp_pernet()
from ppp_connect_channel(), i.e. pch->chan_net is NULL.

This is because a parallel ppp_unregister_channel() can happen while we
are in ppp_connect_channel(), during which pch->chan_net is set to NULL.
Since we need a reference to net per channel, it makes sense to sync the
refcnt with the lifetime of the channel, therefore we should release
this reference when we destroy it.

Fixes: 1f461dcdd296 ("ppp: take reference on channels netns")
Reported-by: Matt Bennett
Cc: Paul Mackerras
Cc: linux-ppp@vger.kernel.org
Cc: Guillaume Nault
Cc: Cyrill Gorcunov
Signed-off-by: Cong Wang
Reviewed-by: Cyrill Gorcunov
Signed-off-by: David S. Miller

Fixes: Iee0015eca5bd181954bb4896a3720f7549c5ed0b
  ("UPSTREAM: ppp: take reference on channels netns")
Signed-off-by: Amit Pundir
Change-Id: I24d0bb6f349ab3829f63cfe935ed97b6913a3508
---
 drivers/net/ppp/ppp_generic.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index 174e06ec7c2f..e5bb870b5461 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -2390,8 +2390,6 @@ ppp_unregister_channel(struct ppp_channel *chan)
 	spin_lock_bh(&pn->all_channels_lock);
 	list_del(&pch->list);
 	spin_unlock_bh(&pn->all_channels_lock);
-	put_net(pch->chan_net);
-	pch->chan_net = NULL;
 
 	pch->file.dead = 1;
 	wake_up_interruptible(&pch->file.rwait);
@@ -2984,6 +2982,9 @@ ppp_disconnect_channel(struct channel *pch)
  */
 static void ppp_destroy_channel(struct channel *pch)
 {
+	put_net(pch->chan_net);
+	pch->chan_net = NULL;
+
 	atomic_dec(&channel_count);
 
 	if (!pch->file.dead) {

From a819ad8088b54cd5f904a134d0775a1ad1f0246f Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Thu, 14 Jul 2016 11:20:55 -0700
Subject: [PATCH 126/813] FROMLIST: proc: Relax /proc/<pid>/timerslack_ns capability requirements

When an interface to allow a task to change another task's timerslack
was first proposed, it was suggested that something greater than
CAP_SYS_NICE would be needed, as a task could be delayed further than
what normally could be done with nice adjustments.

So CAP_SYS_PTRACE was adopted instead for what became the
/proc/<pid>/timerslack_ns interface. However, for Android (where this
feature originates), giving the system_server CAP_SYS_PTRACE would
allow it to observe and modify all tasks' memory. This is considered
too high a privilege level for only needing to change the timerslack.

After some discussion, it was realized that a CAP_SYS_NICE process can
set a task as SCHED_FIFO, so they could fork some spinning processes
and set them all SCHED_FIFO 99, in effect delaying all other tasks for
an infinite amount of time.

So as a CAP_SYS_NICE task can already cause trouble for other tasks,
using it as a required capability for accessing and modifying
/proc/<pid>/timerslack_ns seems sufficient.

Thus, this patch loosens the capability requirements to CAP_SYS_NICE
and removes CAP_SYS_PTRACE, simplifying some of the code flow as well.

This is technically an ABI change, but as the feature just landed in
4.6, I suspect no one is yet using it.

Cc: Kees Cook
Cc: "Serge E.
Hallyn" Cc: Andrew Morton Cc: Thomas Gleixner CC: Arjan van de Ven Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Todd Kjos Cc: Colin Cross Cc: Nick Kralevich Cc: Dmitry Shmidt Cc: Elliott Hughes Cc: Android Kernel Team Reviewed-by: Nick Kralevich Acked-by: Serge Hallyn Acked-by: Kees Cook Signed-off-by: John Stultz FROMLIST URL: https://lkml.org/lkml/2016/7/21/522 Change-Id: Ia75481402e3948165a1b7c1551c539530cb25509 (Cherry picked against common/android-4.4) Signed-off-by: John Stultz --- fs/proc/base.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 67b6d7e2313f..f9600828c2e0 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2261,16 +2261,19 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, if (!p) return -ESRCH; - if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { - task_lock(p); - if (slack_ns == 0) - p->timer_slack_ns = p->default_timer_slack_ns; - else - p->timer_slack_ns = slack_ns; - task_unlock(p); - } else + if (!capable(CAP_SYS_NICE)) { count = -EPERM; + goto out; + } + task_lock(p); + if (slack_ns == 0) + p->timer_slack_ns = p->default_timer_slack_ns; + else + p->timer_slack_ns = slack_ns; + task_unlock(p); + +out: put_task_struct(p); return count; @@ -2280,19 +2283,22 @@ static int timerslack_ns_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; - int err = 0; + int err = 0; p = get_proc_task(inode); if (!p) return -ESRCH; - if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { - task_lock(p); - seq_printf(m, "%llu\n", p->timer_slack_ns); - task_unlock(p); - } else + if (!capable(CAP_SYS_NICE)) { err = -EPERM; + goto out; + } + task_lock(p); + seq_printf(m, "%llu\n", p->timer_slack_ns); + task_unlock(p); + +out: put_task_struct(p); return err; From f3758151ca099b3988de8c1cb12f0716a5cb99b4 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 14 Jul 2016 17:22:19 -0700 Subject: [PATCH 127/813] FROMLIST: proc: Add LSM hook checks to /proc//timerslack_ns As requested, this patch checks the existing LSM hooks task_getscheduler/task_setscheduler when reading or modifying the task's timerslack value. Previous versions added new get/settimerslack LSM hooks, but since they checked the same PROCESS__SET/GETSCHED values as existing hooks, it was suggested we just use the existing ones. Cc: Kees Cook Cc: "Serge E. 
Hallyn" Cc: Andrew Morton Cc: Thomas Gleixner CC: Arjan van de Ven Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Todd Kjos Cc: Colin Cross Cc: Nick Kralevich Cc: Dmitry Shmidt Cc: Elliott Hughes Cc: James Morris Cc: Android Kernel Team Cc: linux-security-module@vger.kernel.org Cc: selinux@tycho.nsa.gov Signed-off-by: John Stultz FROMLIST URL: https://lkml.org/lkml/2016/7/21/523 Change-Id: Id157d10e2fe0b85f1be45035a6117358a42af028 (Cherry picked back to common/android-4.4) Signed-off-by: John Stultz --- fs/proc/base.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index f9600828c2e0..308d4a321874 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2266,6 +2266,12 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, goto out; } + err = security_task_setscheduler(p); + if (err) { + count = err; + goto out; + } + task_lock(p); if (slack_ns == 0) p->timer_slack_ns = p->default_timer_slack_ns; @@ -2294,6 +2300,10 @@ static int timerslack_ns_show(struct seq_file *m, void *v) goto out; } + err = security_task_getscheduler(p); + if (err) + goto out; + task_lock(p); seq_printf(m, "%llu\n", p->timer_slack_ns); task_unlock(p); From 1b1e3f6d7c321a8df0807c719bb63a49ed96e0e9 Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Fri, 15 Jul 2016 12:39:13 +0200 Subject: [PATCH 128/813] BACKPORT: brcmfmac: defer DPC processing during probe The sdio dpc starts processing when in SDIOD_STATE_DATA. This state was entered right after firmware download. This patch moves that transition just before enabling sdio interrupt handling thus avoiding watchdog expiry which would put the bus to sleep while probing. Change-Id: I09c60752374b8145da78000935062be08c5c8a52 Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo Signed-off-by: Dmitry Shmidt --- drivers/net/wireless/brcm80211/brcmfmac/sdio.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/brcm80211/brcmfmac/sdio.c index 7e74ac3ad815..bcf29bf6f727 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/sdio.c @@ -3401,10 +3401,6 @@ static int brcmf_sdio_download_firmware(struct brcmf_sdio *bus, goto err; } - /* Allow full data communication using DPC from now on. */ - brcmf_sdiod_change_state(bus->sdiodev, BRCMF_SDIOD_DATA); - bcmerror = 0; - err: brcmf_sdio_clkctl(bus, CLK_SDONLY, false); sdio_release_host(bus->sdiodev->func[1]); @@ -4112,6 +4108,9 @@ static void brcmf_sdio_firmware_callback(struct device *dev, } if (err == 0) { + /* Allow full data communication using DPC from now on. */ + brcmf_sdiod_change_state(bus->sdiodev, BRCMF_SDIOD_DATA); + err = brcmf_sdiod_intr_register(sdiodev); if (err != 0) brcmf_err("intr register failed:%d\n", err); From 58612bdce3a8c547ae648f035dbe15c476576bae Mon Sep 17 00:00:00 2001 From: Anson Jacob Date: Sun, 31 Jul 2016 22:30:14 -0400 Subject: [PATCH 129/813] usb: gadget: f_accessory: remove duplicate endpoint alloc usb_ep_autoconfig is called twice for allocating bulk out endpoint. Removed the unwanted call. 
Fixes Issue: 67180 Change-Id: I03e87a86fbbbc85831ff7f0496adf038d1de2956 Signed-off-by: Anson Jacob --- drivers/usb/gadget/function/f_accessory.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c index c62123560143..2ca16a577542 100644 --- a/drivers/usb/gadget/function/f_accessory.c +++ b/drivers/usb/gadget/function/f_accessory.c @@ -531,15 +531,6 @@ static int create_bulk_endpoints(struct acc_dev *dev, ep->driver_data = dev; /* claim the endpoint */ dev->ep_out = ep; - ep = usb_ep_autoconfig(cdev->gadget, out_desc); - if (!ep) { - DBG(cdev, "usb_ep_autoconfig for ep_out failed\n"); - return -ENODEV; - } - DBG(cdev, "usb_ep_autoconfig for ep_out got %s\n", ep->name); - ep->driver_data = dev; /* claim the endpoint */ - dev->ep_out = ep; - /* now allocate requests for our endpoints */ for (i = 0; i < TX_REQ_MAX; i++) { req = acc_request_new(dev->ep_in, BULK_BUFFER_SIZE); From ee665a37d5251d8af6e2904f08f7cfd6fd35ca72 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Sun, 31 Jul 2016 17:07:46 +0530 Subject: [PATCH 130/813] Revert "panic: Add board ID to panic output" This reverts commit 4e09c510185cb4db2277ce81cce81b7aa06bea45. I checked for the usage of this debug helper in AOSP common kernels as well as vendor kernels (e.g exynos, msm, mediatek, omap, tegra, x86, x86_64) hosted at https://android.googlesource.com/kernel/ and I found out that other than few fairly obsolete Omap trees (for tuna & Glass) and Exynos tree (for Manta), there is no active user of this debug helper. So we can safely remove this helper code. Signed-off-by: Amit Pundir --- include/linux/kernel.h | 4 ---- kernel/panic.c | 8 -------- 2 files changed, 12 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2955e672391d..924853d33a13 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -830,8 +830,4 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } /* OTHER_WRITABLE? Generally considered a bad idea. */ \ BUILD_BUG_ON_ZERO((perms) & 2) + \ (perms)) - -/* To identify board information in panic logs, set this */ -extern char *mach_panic_string; - #endif diff --git a/kernel/panic.c b/kernel/panic.c index 223564d3e1f8..41e2b54f36b5 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -28,9 +28,6 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 -/* Machine specific panic information string */ -char *mach_panic_string; - int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask; static int pause_on_oops; @@ -415,11 +412,6 @@ late_initcall(init_oops_id); void print_oops_end_marker(void) { init_oops_id(); - - if (mach_panic_string) - printk(KERN_WARNING "Board Information: %s\n", - mach_panic_string); - pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); } From 96cb71b8c592d760ad2e22432052510615987c41 Mon Sep 17 00:00:00 2001 From: James Carr Date: Fri, 29 Jul 2016 19:02:16 -0700 Subject: [PATCH 131/813] Implement memory_state_time, used by qcom,cpubw New driver memory_state_time tracks time spent in different DDR frequency and bandwidth states. Memory drivers such as qcom,cpubw can post updated state to the driver after registering a callback. Processed by a workqueue Bandwidth buckets are read in from device tree in the relevant qualcomm section, can be defined in any quantity and spacing. The data is exposed at /sys/kernel/memory_state_time, able to be read by the Android framework. 
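As a sketch of the client side (a hypothetical caller, not part of this patch; the helpers it uses are declared in the new include/linux/memory-state-time.h), a memory driver registers a source once and then posts state changes through the returned block:

  #include <linux/memory-state-time.h>

  static struct memory_state_update_block *bw_src;

  static int my_bus_init(void)	/* hypothetical driver init hook */
  {
  	bw_src = memory_state_register_bandwidth_source();
  	return bw_src ? 0 : -ENOMEM;
  }

  static void my_bus_set_bandwidth(int mbps)	/* hypothetical scaling hook */
  {
  	if (bw_src)
  		UPDATE_MEMORY_STATE(bw_src, mbps);	/* processed on the driver's workqueue */
  }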
Functionality is behind a config option CONFIG_MEMORY_STATE_TIME Change-Id: I4fee165571cb975fb9eacbc9aada5e6d7dd748f0 Signed-off-by: James Carr --- .../bindings/misc/memory-state-time.txt | 8 + android/configs/android-recommended.cfg | 1 + drivers/misc/Kconfig | 6 + drivers/misc/Makefile | 1 + drivers/misc/memory_state_time.c | 454 ++++++++++++++++++ include/linux/memory-state-time.h | 42 ++ 6 files changed, 512 insertions(+) create mode 100644 Documentation/devicetree/bindings/misc/memory-state-time.txt create mode 100644 drivers/misc/memory_state_time.c create mode 100644 include/linux/memory-state-time.h diff --git a/Documentation/devicetree/bindings/misc/memory-state-time.txt b/Documentation/devicetree/bindings/misc/memory-state-time.txt new file mode 100644 index 000000000000..c99a506c030d --- /dev/null +++ b/Documentation/devicetree/bindings/misc/memory-state-time.txt @@ -0,0 +1,8 @@ +Memory bandwidth and frequency state tracking + +Required properties: +- compatible : should be: + "memory-state-time" +- freq-tbl: Should contain entries with each frequency in Hz. +- bw-buckets: Should contain upper-bound limits for each bandwidth bucket in Mbps. + Must match the framework power_profile.xml for the device. diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index 2f1ef077aa9e..3465a848d74d 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -119,6 +119,7 @@ CONFIG_TIMER_STATS=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_UHID=y +CONFIG_MEMORY_STATE_TIME=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_HIDDEV=y diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index ca7463544c72..06eddc0cb24f 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -531,6 +531,12 @@ config UID_CPUTIME help Per UID based cpu time statistics exported to /proc/uid_cputime +config MEMORY_STATE_TIME + tristate "Memory freq/bandwidth time statistics" + depends on PROFILING + help + Memory time statistics exported to /sys/kernel/memory_state_time + source "drivers/misc/c2port/Kconfig" source "drivers/misc/eeprom/Kconfig" source "drivers/misc/cb710/Kconfig" diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index e5142b836aee..b76b4c9fe104 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -57,3 +57,4 @@ obj-$(CONFIG_ECHO) += echo/ obj-$(CONFIG_VEXPRESS_SYSCFG) += vexpress-syscfg.o obj-$(CONFIG_CXL_BASE) += cxl/ obj-$(CONFIG_UID_CPUTIME) += uid_cputime.o +obj-$(CONFIG_MEMORY_STATE_TIME) += memory_state_time.o diff --git a/drivers/misc/memory_state_time.c b/drivers/misc/memory_state_time.c new file mode 100644 index 000000000000..34c797a06a31 --- /dev/null +++ b/drivers/misc/memory_state_time.c @@ -0,0 +1,454 @@ +/* drivers/misc/memory_state_time.c + * + * Copyright (C) 2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define KERNEL_ATTR_RO(_name) \ +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +#define KERNEL_ATTR_RW(_name) \ +static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +#define FREQ_HASH_BITS 4 +DECLARE_HASHTABLE(freq_hash_table, FREQ_HASH_BITS); + +static DEFINE_MUTEX(mem_lock); + +#define TAG "memory_state_time" +#define BW_NODE "/soc/memory-state-time" +#define FREQ_TBL "freq-tbl" +#define BW_TBL "bw-buckets" +#define NUM_SOURCES "num-sources" + +#define LOWEST_FREQ 2 + +static int curr_bw; +static int curr_freq; +static u32 *bw_buckets; +static u32 *freq_buckets; +static int num_freqs; +static int num_buckets; +static int registered_bw_sources; +static u64 last_update; +static bool init_success; +static struct workqueue_struct *memory_wq; +static u32 num_sources = 10; +static int *bandwidths; + +struct freq_entry { + int freq; + u64 *buckets; /* Bandwidth buckets. */ + struct hlist_node hash; +}; + +struct queue_container { + struct work_struct update_state; + int value; + u64 time_now; + int id; + struct mutex *lock; +}; + +static int find_bucket(int bw) +{ + int i; + + if (bw_buckets != NULL) { + for (i = 0; i < num_buckets; i++) { + if (bw_buckets[i] > bw) { + pr_debug("Found bucket %d for bandwidth %d\n", + i, bw); + return i; + } + } + return num_buckets - 1; + } + return 0; +} + +static u64 get_time_diff(u64 time_now) +{ + u64 ms; + + ms = time_now - last_update; + last_update = time_now; + return ms; +} + +static ssize_t show_stat_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int i, j; + int len = 0; + struct freq_entry *freq_entry; + + for (i = 0; i < num_freqs; i++) { + hash_for_each_possible(freq_hash_table, freq_entry, hash, + freq_buckets[i]) { + if (freq_entry->freq == freq_buckets[i]) { + len += scnprintf(buf + len, PAGE_SIZE - len, + "%d ", freq_buckets[i]); + if (len >= PAGE_SIZE) + break; + for (j = 0; j < num_buckets; j++) { + len += scnprintf(buf + len, + PAGE_SIZE - len, + "%llu ", + freq_entry->buckets[j]); + } + len += scnprintf(buf + len, PAGE_SIZE - len, + "\n"); + } + } + } + pr_debug("Current Time: %llu\n", ktime_get_boot_ns()); + return len; +} +KERNEL_ATTR_RO(show_stat); + +static void update_table(u64 time_now) +{ + struct freq_entry *freq_entry; + + pr_debug("Last known bw %d freq %d\n", curr_bw, curr_freq); + hash_for_each_possible(freq_hash_table, freq_entry, hash, curr_freq) { + if (curr_freq == freq_entry->freq) { + freq_entry->buckets[find_bucket(curr_bw)] + += get_time_diff(time_now); + break; + } + } +} + +static bool freq_exists(int freq) +{ + int i; + + for (i = 0; i < num_freqs; i++) { + if (freq == freq_buckets[i]) + return true; + } + return false; +} + +static int calculate_total_bw(int bw, int index) +{ + int i; + int total_bw = 0; + + pr_debug("memory_state_time New bw %d for id %d\n", bw, index); + bandwidths[index] = bw; + for (i = 0; i < registered_bw_sources; i++) + total_bw += bandwidths[i]; + return total_bw; +} + +static void freq_update_do_work(struct work_struct *work) +{ + struct queue_container *freq_state_update + = container_of(work, struct queue_container, + update_state); + if (freq_state_update) { + mutex_lock(&mem_lock); + update_table(freq_state_update->time_now); + curr_freq = freq_state_update->value; + mutex_unlock(&mem_lock); + kfree(freq_state_update); + } +} + 
+static void bw_update_do_work(struct work_struct *work) +{ + struct queue_container *bw_state_update + = container_of(work, struct queue_container, + update_state); + if (bw_state_update) { + mutex_lock(&mem_lock); + update_table(bw_state_update->time_now); + curr_bw = calculate_total_bw(bw_state_update->value, + bw_state_update->id); + mutex_unlock(&mem_lock); + kfree(bw_state_update); + } +} + +static void memory_state_freq_update(struct memory_state_update_block *ub, + int value) +{ + if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) { + if (freq_exists(value) && init_success) { + struct queue_container *freq_container + = kmalloc(sizeof(struct queue_container), + GFP_KERNEL); + if (!freq_container) + return; + INIT_WORK(&freq_container->update_state, + freq_update_do_work); + freq_container->time_now = ktime_get_boot_ns(); + freq_container->value = value; + pr_debug("Scheduling freq update in work queue\n"); + queue_work(memory_wq, &freq_container->update_state); + } else { + pr_debug("Freq does not exist.\n"); + } + } +} + +static void memory_state_bw_update(struct memory_state_update_block *ub, + int value) +{ + if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) { + if (init_success) { + struct queue_container *bw_container + = kmalloc(sizeof(struct queue_container), + GFP_KERNEL); + if (!bw_container) + return; + INIT_WORK(&bw_container->update_state, + bw_update_do_work); + bw_container->time_now = ktime_get_boot_ns(); + bw_container->value = value; + bw_container->id = ub->id; + pr_debug("Scheduling bandwidth update in work queue\n"); + queue_work(memory_wq, &bw_container->update_state); + } + } +} + +struct memory_state_update_block *memory_state_register_frequency_source(void) +{ + struct memory_state_update_block *block; + + if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) { + pr_debug("Allocating frequency source\n"); + block = kmalloc(sizeof(struct memory_state_update_block), + GFP_KERNEL); + if (!block) + return NULL; + block->update_call = memory_state_freq_update; + return block; + } + pr_err("Config option disabled.\n"); + return NULL; +} +EXPORT_SYMBOL_GPL(memory_state_register_frequency_source); + +struct memory_state_update_block *memory_state_register_bandwidth_source(void) +{ + struct memory_state_update_block *block; + + if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) { + pr_debug("Allocating bandwidth source %d\n", + registered_bw_sources); + block = kmalloc(sizeof(struct memory_state_update_block), + GFP_KERNEL); + if (!block) + return NULL; + block->update_call = memory_state_bw_update; + if (registered_bw_sources < num_sources) { + block->id = registered_bw_sources++; + } else { + pr_err("Unable to allocate source; max number reached\n"); + kfree(block); + return NULL; + } + return block; + } + pr_err("Config option disabled.\n"); + return NULL; +} +EXPORT_SYMBOL_GPL(memory_state_register_bandwidth_source); + +/* Buckets are designated by their maximum. + * Returns the buckets decided by the capability of the device. 
+ */ +static int get_bw_buckets(struct device *dev) +{ + int ret, lenb; + struct device_node *node = dev->of_node; + + of_property_read_u32(node, NUM_SOURCES, &num_sources); + if (of_find_property(node, BW_TBL, &lenb)) { + bandwidths = devm_kzalloc(dev, + sizeof(*bandwidths) * num_sources, GFP_KERNEL); + if (!bandwidths) + return -ENOMEM; + lenb /= sizeof(*bw_buckets); + bw_buckets = devm_kzalloc(dev, lenb * sizeof(*bw_buckets), + GFP_KERNEL); + if (!bw_buckets) { + devm_kfree(dev, bandwidths); + return -ENOMEM; + } + ret = of_property_read_u32_array(node, BW_TBL, bw_buckets, + lenb); + if (ret < 0) { + devm_kfree(dev, bandwidths); + devm_kfree(dev, bw_buckets); + pr_err("Unable to read bandwidth table from device tree.\n"); + return ret; + } + } + curr_bw = 0; + num_buckets = lenb; + return 0; +} + +/* Adds struct freq_entry nodes to the hashtable for each compatible frequency. + * Returns the supported number of frequencies. + */ +static int freq_buckets_init(struct device *dev) +{ + struct freq_entry *freq_entry; + int i; + int ret, lenf; + struct device_node *node = dev->of_node; + + if (of_find_property(node, FREQ_TBL, &lenf)) { + lenf /= sizeof(*freq_buckets); + freq_buckets = devm_kzalloc(dev, lenf * sizeof(*freq_buckets), + GFP_KERNEL); + if (!freq_buckets) + return -ENOMEM; + pr_debug("freqs found len %d\n", lenf); + ret = of_property_read_u32_array(node, FREQ_TBL, freq_buckets, + lenf); + if (ret < 0) { + devm_kfree(dev, freq_buckets); + pr_err("Unable to read frequency table from device tree.\n"); + return ret; + } + pr_debug("ret freq %d\n", ret); + } + num_freqs = lenf; + curr_freq = freq_buckets[LOWEST_FREQ]; + + for (i = 0; i < num_freqs; i++) { + freq_entry = devm_kzalloc(dev, sizeof(struct freq_entry), + GFP_KERNEL); + if (!freq_entry) + return -ENOMEM; + freq_entry->buckets = devm_kzalloc(dev, sizeof(u64)*num_buckets, + GFP_KERNEL); + if (!freq_entry->buckets) { + devm_kfree(dev, freq_entry); + return -ENOMEM; + } + pr_debug("memory_state_time Adding freq to ht %d\n", + freq_buckets[i]); + freq_entry->freq = freq_buckets[i]; + hash_add(freq_hash_table, &freq_entry->hash, freq_buckets[i]); + } + return 0; +} + +struct kobject *memory_kobj; +EXPORT_SYMBOL_GPL(memory_kobj); + +static struct attribute *memory_attrs[] = { + &show_stat_attr.attr, + NULL +}; + +static struct attribute_group memory_attr_group = { + .attrs = memory_attrs, +}; + +static int memory_state_time_probe(struct platform_device *pdev) +{ + int error; + + error = get_bw_buckets(&pdev->dev); + if (error) + return error; + error = freq_buckets_init(&pdev->dev); + if (error) + return error; + last_update = ktime_get_boot_ns(); + init_success = true; + + pr_debug("memory_state_time initialized with num_freqs %d\n", + num_freqs); + return 0; +} + +static const struct of_device_id match_table[] = { + { .compatible = "memory-state-time" }, + {} +}; + +static struct platform_driver memory_state_time_driver = { + .probe = memory_state_time_probe, + .driver = { + .name = "memory-state-time", + .of_match_table = match_table, + .owner = THIS_MODULE, + }, +}; + +static int __init memory_state_time_init(void) +{ + int error; + + hash_init(freq_hash_table); + memory_wq = create_singlethread_workqueue("memory_wq"); + if (!memory_wq) { + pr_err("Unable to create workqueue.\n"); + return -EINVAL; + } + /* + * Create sys/kernel directory for memory_state_time. 
+ */ + memory_kobj = kobject_create_and_add(TAG, kernel_kobj); + if (!memory_kobj) { + pr_err("Unable to allocate memory_kobj for sysfs directory.\n"); + error = -ENOMEM; + goto wq; + } + error = sysfs_create_group(memory_kobj, &memory_attr_group); + if (error) { + pr_err("Unable to create sysfs folder.\n"); + goto kobj; + } + + error = platform_driver_register(&memory_state_time_driver); + if (error) { + pr_err("Unable to register memory_state_time platform driver.\n"); + goto group; + } + return 0; + +group: sysfs_remove_group(memory_kobj, &memory_attr_group); +kobj: kobject_put(memory_kobj); +wq: destroy_workqueue(memory_wq); + return error; +} +module_init(memory_state_time_init); diff --git a/include/linux/memory-state-time.h b/include/linux/memory-state-time.h new file mode 100644 index 000000000000..d2212b027866 --- /dev/null +++ b/include/linux/memory-state-time.h @@ -0,0 +1,42 @@ +/* include/linux/memory-state-time.h + * + * Copyright (C) 2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include + +#define UPDATE_MEMORY_STATE(BLOCK, VALUE) BLOCK->update_call(BLOCK, VALUE) + +struct memory_state_update_block; + +typedef void (*memory_state_update_fn_t)(struct memory_state_update_block *ub, + int value); + +/* This struct is populated when you pass it to a memory_state_register* + * function. The update_call function is used for an update and defined in the + * typedef memory_state_update_fn_t + */ +struct memory_state_update_block { + memory_state_update_fn_t update_call; + int id; +}; + +/* Register a frequency struct memory_state_update_block to provide updates to + * memory_state_time about frequency changes using its update_call function. + */ +struct memory_state_update_block *memory_state_register_frequency_source(void); + +/* Register a bandwidth struct memory_state_update_block to provide updates to + * memory_state_time about bandwidth changes using its update_call function. + */ +struct memory_state_update_block *memory_state_register_bandwidth_source(void); From 5d4ac07c90e822c1bd60d8cbc4a472c4d3a097ee Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Wed, 9 Jun 2010 17:47:38 -0500 Subject: [PATCH 132/813] CHROMIUM: dm: boot time specification of dm= This is a wrap-up of three patches pending upstream approval. I'm bundling them because they are interdependent, and it'll be easier to drop it on rebase later. 1. dm: allow a dm-fs-style device to be shared via dm-ioctl Integrates feedback from Alisdair, Mike, and Kiyoshi. Two main changes occur here: - One function is added which allows for a programmatically created mapped device to be inserted into the dm-ioctl hash table. This binds the device to a name and, optional, uuid which is needed by udev and allows for userspace management of the mapped device. - dm_table_complete() was extended to handle all of the final functional changes required for the table to be operational once called. 2. init: boot to device-mapper targets without an initr* Add a dm= kernel parameter modeled after the md= parameter from do_mounts_md. 
It allows for device-mapper targets to be configured at boot time for
use early in the boot process (as the root device or otherwise). It
also replaces /dev/XXX calls with major:minor opportunistically.

The format is dm="name uuid ro,table line 1,table line 2,...". The
parser expects the comma to be safe to use as a newline substitute but,
otherwise, uses the normal separator of space. Some attempt has been
made to make it forgiving of additional spaces (using skip_spaces()).

A mapped device created during boot will be assigned a minor of 0 and
may be accessed via /dev/dm-0.

An example dm-linear root with no uuid may look like:

root=/dev/dm-0 dm="lroot none ro, 0 4096 linear /dev/ubdb 0, 4096 4096 linear /dev/ubdc 0"

Once udev is started, /dev/dm-0 will become /dev/mapper/lroot.

Older upstream threads:
http://marc.info/?l=dm-devel&m=127429492521964&w=2
http://marc.info/?l=dm-devel&m=127429499422096&w=2
http://marc.info/?l=dm-devel&m=127429493922000&w=2

Latest upstream threads:
https://patchwork.kernel.org/patch/104859/
https://patchwork.kernel.org/patch/104860/
https://patchwork.kernel.org/patch/104861/

Bug: 27175947
Signed-off-by: Will Drewry
Review URL: http://codereview.chromium.org/2020011
Change-Id: I92bd53432a11241228d2e5ac89a3b20d19b05a31
---
 Documentation/device-mapper/boot.txt |  42 +++
 Documentation/kernel-parameters.txt  |   6 +
 drivers/md/dm-ioctl.c                |  39 +++
 drivers/md/dm-table.c                |   1 +
 include/linux/device-mapper.h        |   6 +
 init/Makefile                        |   1 +
 init/do_mounts.c                     |   1 +
 init/do_mounts.h                     |  10 +
 init/do_mounts_dm.c                  | 410 +++++++++++++++++++++++++++
 9 files changed, 516 insertions(+)
 create mode 100644 Documentation/device-mapper/boot.txt
 create mode 100644 init/do_mounts_dm.c

diff --git a/Documentation/device-mapper/boot.txt b/Documentation/device-mapper/boot.txt
new file mode 100644
index 000000000000..adcaad5e5e32
--- /dev/null
+++ b/Documentation/device-mapper/boot.txt
@@ -0,0 +1,42 @@
+Boot time creation of mapped devices
+====================================
+
+It is possible to configure a device mapper device to act as the root
+device for your system in two ways.
+
+The first is to build an initial ramdisk which boots to a minimal
+userspace which configures the device, then pivot_root(8) into it.
+
+For simple device mapper configurations, it is possible to boot directly
+using the following kernel command line:
+
+dm="<name> <uuid> <ro>,table line 1,...,table line n"
+
+name = the name to associate with the device
+	after boot, udev, if used, will use that name to label
+	the device node.
+uuid = may be 'none' or the UUID desired for the device.
+ro = may be "ro" or "rw". If "ro", the device and device table will be
+	marked read-only.
+
+Each table line may be as normal when using the dmsetup tool except for
+two variations:
+1. Any use of commas will be interpreted as a newline
+2. Quotation marks cannot be escaped and cannot be used without
+   terminating the dm= argument.
+
+Unless renamed by udev, the device node created will be dm-0 as the
+first minor number for the device-mapper is used during early creation.
+
+Example
+=======
+
+- Booting to a linear array made up of user-mode linux block devices:
+
+  dm="lroot none rw, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" \
+  root=/dev/dm-0
+
+Will boot to a rw dm-linear target of 8192 sectors split across two
+block devices identified by their major:minor numbers. After boot, udev
+will rename this target to /dev/mapper/lroot (depending on the rules).
+No uuid was assigned.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 554f3844d499..77a0d624b786 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -56,6 +56,7 @@ parameter is applicable: BLACKFIN Blackfin architecture is enabled. CLK Common clock infrastructure is enabled. CMA Contiguous Memory Area support is enabled. + DM Device mapper support is enabled. DRM Direct Rendering Management support is enabled. DYNAMIC_DEBUG Build in debug messages and enable them at runtime EDD BIOS Enhanced Disk Drive Services (EDD) is enabled @@ -915,6 +916,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. dis_ucode_ldr [X86] Disable the microcode loader. + dm= [DM] Allows early creation of a device-mapper device. + See Documentation/device-mapper/boot.txt. + + dmasound= [HW,OSS] Sound subsystem buff + dma_debug=off If the kernel is compiled with DMA_API_DEBUG support, this option disables the debugging code at boot. diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 80a439543259..bc5e9a5b1f30 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1923,6 +1923,45 @@ void dm_interface_exit(void) dm_hash_exit(); } + +/** + * dm_ioctl_export - Permanently export a mapped device via the ioctl interface + * @md: Pointer to mapped_device + * @name: Buffer (size DM_NAME_LEN) for name + * @uuid: Buffer (size DM_UUID_LEN) for uuid or NULL if not desired + */ +int dm_ioctl_export(struct mapped_device *md, const char *name, + const char *uuid) +{ + int r = 0; + struct hash_cell *hc; + + if (!md) { + r = -ENXIO; + goto out; + } + + /* The name and uuid can only be set once. */ + mutex_lock(&dm_hash_cells_mutex); + hc = dm_get_mdptr(md); + mutex_unlock(&dm_hash_cells_mutex); + if (hc) { + DMERR("%s: already exported", dm_device_name(md)); + r = -ENXIO; + goto out; + } + + r = dm_hash_insert(name, uuid, md); + if (r) { + DMERR("%s: could not bind to '%s'", dm_device_name(md), name); + goto out; + } + + /* Let udev know we've changed. */ + dm_kobject_uevent(md, KOBJ_CHANGE, dm_get_event_nr(md)); +out: + return r; +} /** * dm_copy_name_and_uuid - Copy mapped device name & uuid into supplied buffers * @md: Pointer to mapped_device diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index cb5d0daf53bb..b3d78bba3a79 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 899ab9f8549e..b874d5b61ffc 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -382,6 +382,12 @@ void dm_put(struct mapped_device *md); void dm_set_mdptr(struct mapped_device *md, void *ptr); void *dm_get_mdptr(struct mapped_device *md); +/* + * Export the device via the ioctl interface (uses mdptr). + */ +int dm_ioctl_export(struct mapped_device *md, const char *name, + const char *uuid); + /* * A device can still be used while suspended, but I/O is deferred. 
*/ diff --git a/init/Makefile b/init/Makefile index 692b91f1c1d4..243f61de2cba 100644 --- a/init/Makefile +++ b/init/Makefile @@ -15,6 +15,7 @@ mounts-y := do_mounts.o mounts-$(CONFIG_BLK_DEV_RAM) += do_mounts_rd.o mounts-$(CONFIG_BLK_DEV_INITRD) += do_mounts_initrd.o mounts-$(CONFIG_BLK_DEV_MD) += do_mounts_md.o +mounts-$(CONFIG_BLK_DEV_DM) += do_mounts_dm.o # dependencies on generated files need to be listed explicitly $(obj)/version.o: include/generated/compile.h diff --git a/init/do_mounts.c b/init/do_mounts.c index dea5de95c2dd..1902a1c80831 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -566,6 +566,7 @@ void __init prepare_namespace(void) wait_for_device_probe(); md_run_setup(); + dm_run_setup(); if (saved_root_name[0]) { root_device_name = saved_root_name; diff --git a/init/do_mounts.h b/init/do_mounts.h index f5b978a9bb92..09d22862e8c3 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -74,3 +74,13 @@ void md_run_setup(void); static inline void md_run_setup(void) {} #endif + +#ifdef CONFIG_BLK_DEV_DM + +void dm_run_setup(void); + +#else + +static inline void dm_run_setup(void) {} + +#endif diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c new file mode 100644 index 000000000000..0fd3411533f3 --- /dev/null +++ b/init/do_mounts_dm.c @@ -0,0 +1,410 @@ +/* do_mounts_dm.c + * Copyright (C) 2010 The Chromium OS Authors + * All Rights Reserved. + * Based on do_mounts_md.c + * + * This file is released under the GPL. + */ +#include +#include +#include + +#include "do_mounts.h" + +#define DM_MAX_NAME 32 +#define DM_MAX_UUID 129 +#define DM_NO_UUID "none" + +#define DM_MSG_PREFIX "init" + +/* Separators used for parsing the dm= argument. */ +#define DM_FIELD_SEP ' ' +#define DM_LINE_SEP ',' + +/* + * When the device-mapper and any targets are compiled into the kernel + * (not a module), one target may be created and used as the root device at + * boot time with the parameters given with the boot line dm=... + * The code for that is here. + */ + +struct dm_setup_target { + sector_t begin; + sector_t length; + char *type; + char *params; + /* simple singly linked list */ + struct dm_setup_target *next; +}; + +static struct { + int minor; + int ro; + char name[DM_MAX_NAME]; + char uuid[DM_MAX_UUID]; + char *targets; + struct dm_setup_target *target; + int target_count; +} dm_setup_args __initdata; + +static __initdata int dm_early_setup; + +static size_t __init get_dm_option(char *str, char **next, char sep) +{ + size_t len = 0; + char *endp = NULL; + + if (!str) + return 0; + + endp = strchr(str, sep); + if (!endp) { /* act like strchrnul */ + len = strlen(str); + endp = str + len; + } else { + len = endp - str; + } + + if (endp == str) + return 0; + + if (!next) + return len; + + if (*endp == 0) { + /* Don't advance past the nul. 
*/ + *next = endp; + } else { + *next = endp + 1; + } + return len; +} + +static int __init dm_setup_args_init(void) +{ + dm_setup_args.minor = 0; + dm_setup_args.ro = 0; + dm_setup_args.target = NULL; + dm_setup_args.target_count = 0; + return 0; +} + +static int __init dm_setup_cleanup(void) +{ + struct dm_setup_target *target = dm_setup_args.target; + struct dm_setup_target *old_target = NULL; + while (target) { + kfree(target->type); + kfree(target->params); + old_target = target; + target = target->next; + kfree(old_target); + dm_setup_args.target_count--; + } + BUG_ON(dm_setup_args.target_count); + return 0; +} + +static char * __init dm_setup_parse_device_args(char *str) +{ + char *next = NULL; + size_t len = 0; + + /* Grab the logical name of the device to be exported to udev */ + len = get_dm_option(str, &next, DM_FIELD_SEP); + if (!len) { + DMERR("failed to parse device name"); + goto parse_fail; + } + len = min(len + 1, sizeof(dm_setup_args.name)); + strlcpy(dm_setup_args.name, str, len); /* includes nul */ + str = skip_spaces(next); + + /* Grab the UUID value or "none" */ + len = get_dm_option(str, &next, DM_FIELD_SEP); + if (!len) { + DMERR("failed to parse device uuid"); + goto parse_fail; + } + len = min(len + 1, sizeof(dm_setup_args.uuid)); + strlcpy(dm_setup_args.uuid, str, len); + str = skip_spaces(next); + + /* Determine if the table/device will be read only or read-write */ + if (!strncmp("ro,", str, 3)) { + dm_setup_args.ro = 1; + } else if (!strncmp("rw,", str, 3)) { + dm_setup_args.ro = 0; + } else { + DMERR("failed to parse table mode"); + goto parse_fail; + } + str = skip_spaces(str + 3); + + return str; + +parse_fail: + return NULL; +} + +static void __init dm_substitute_devices(char *str, size_t str_len) +{ + char *candidate = str; + char *candidate_end = str; + char old_char; + size_t len = 0; + dev_t dev; + + if (str_len < 3) + return; + + while (str && *str) { + candidate = strchr(str, '/'); + if (!candidate) + break; + + /* Avoid embedded slashes */ + if (candidate != str && *(candidate - 1) != DM_FIELD_SEP) { + str = strchr(candidate, DM_FIELD_SEP); + continue; + } + + len = get_dm_option(candidate, &candidate_end, DM_FIELD_SEP); + str = skip_spaces(candidate_end); + if (len < 3 || len > 37) /* name_to_dev_t max; maj:mix min */ + continue; + + /* Temporarily terminate with a nul */ + candidate_end--; + old_char = *candidate_end; + *candidate_end = '\0'; + + DMDEBUG("converting candidate device '%s' to dev_t", candidate); + /* Use the boot-time specific device naming */ + dev = name_to_dev_t(candidate); + *candidate_end = old_char; + + DMDEBUG(" -> %u", dev); + /* No suitable replacement found */ + if (!dev) + continue; + + /* Rewrite the /dev/path as a major:minor */ + len = snprintf(candidate, len, "%u:%u", MAJOR(dev), MINOR(dev)); + if (!len) { + DMERR("error substituting device major/minor."); + break; + } + candidate += len; + /* Pad out with spaces (fixing our nul) */ + while (candidate < candidate_end) + *(candidate++) = DM_FIELD_SEP; + } +} + +static int __init dm_setup_parse_targets(char *str) +{ + char *next = NULL; + size_t len = 0; + struct dm_setup_target **target = NULL; + + /* Targets are defined as per the table format but with a + * comma as a newline separator. 
*/ + target = &dm_setup_args.target; + while (str && *str) { + *target = kzalloc(sizeof(struct dm_setup_target), GFP_KERNEL); + if (!*target) { + DMERR("failed to allocate memory for target %d", + dm_setup_args.target_count); + goto parse_fail; + } + dm_setup_args.target_count++; + + (*target)->begin = simple_strtoull(str, &next, 10); + if (!next || *next != DM_FIELD_SEP) { + DMERR("failed to parse starting sector for target %d", + dm_setup_args.target_count - 1); + goto parse_fail; + } + str = skip_spaces(next + 1); + + (*target)->length = simple_strtoull(str, &next, 10); + if (!next || *next != DM_FIELD_SEP) { + DMERR("failed to parse length for target %d", + dm_setup_args.target_count - 1); + goto parse_fail; + } + str = skip_spaces(next + 1); + + len = get_dm_option(str, &next, DM_FIELD_SEP); + if (!len || + !((*target)->type = kstrndup(str, len, GFP_KERNEL))) { + DMERR("failed to parse type for target %d", + dm_setup_args.target_count - 1); + goto parse_fail; + } + str = skip_spaces(next); + + len = get_dm_option(str, &next, DM_LINE_SEP); + if (!len || + !((*target)->params = kstrndup(str, len, GFP_KERNEL))) { + DMERR("failed to parse params for target %d", + dm_setup_args.target_count - 1); + goto parse_fail; + } + str = skip_spaces(next); + + /* Before moving on, walk through the copied target and + * attempt to replace all /dev/xxx with the major:minor number. + * It may not be possible to resolve them traditionally at + * boot-time. */ + dm_substitute_devices((*target)->params, len); + + target = &((*target)->next); + } + DMDEBUG("parsed %d targets", dm_setup_args.target_count); + + return 0; + +parse_fail: + return 1; +} + +/* + * Parse the command-line parameters given our kernel, but do not + * actually try to invoke the DM device now; that is handled by + * dm_setup_drive after the low-level disk drivers have initialised. + * dm format is as follows: + * dm="name uuid fmode,[table line 1],[table line 2],..." + * May be used with root=/dev/dm-0 as it always uses the first dm minor. + */ + +static int __init dm_setup(char *str) +{ + dm_setup_args_init(); + + str = dm_setup_parse_device_args(str); + if (!str) { + DMDEBUG("str is NULL"); + goto parse_fail; + } + + /* Target parsing is delayed until we have dynamic memory */ + dm_setup_args.targets = str; + + printk(KERN_INFO "dm: will configure '%s' on dm-%d\n", + dm_setup_args.name, dm_setup_args.minor); + + dm_early_setup = 1; + return 1; + +parse_fail: + printk(KERN_WARNING "dm: Invalid arguments supplied to dm=.\n"); + return 0; +} + + +static void __init dm_setup_drive(void) +{ + struct mapped_device *md = NULL; + struct dm_table *table = NULL; + struct dm_setup_target *target; + char *uuid = dm_setup_args.uuid; + fmode_t fmode = FMODE_READ; + + /* Finish parsing the targets. */ + if (dm_setup_parse_targets(dm_setup_args.targets)) + goto parse_fail; + + if (dm_create(dm_setup_args.minor, &md)) { + DMDEBUG("failed to create the device"); + goto dm_create_fail; + } + DMDEBUG("created device '%s'", dm_device_name(md)); + + /* In addition to flagging the table below, the disk must be + * set explicitly ro/rw. 
*/ + set_disk_ro(dm_disk(md), dm_setup_args.ro); + + if (!dm_setup_args.ro) + fmode |= FMODE_WRITE; + if (dm_table_create(&table, fmode, dm_setup_args.target_count, md)) { + DMDEBUG("failed to create the table"); + goto dm_table_create_fail; + } + + target = dm_setup_args.target; + while (target) { + DMINFO("adding target '%llu %llu %s %s'", + (unsigned long long) target->begin, + (unsigned long long) target->length, target->type, + target->params); + if (dm_table_add_target(table, target->type, target->begin, + target->length, target->params)) { + DMDEBUG("failed to add the target to the table"); + goto add_target_fail; + } + target = target->next; + } + + if (dm_table_complete(table)) { + DMDEBUG("failed to complete the table"); + goto table_complete_fail; + } + + /* Suspend the device so that we can bind it to the table. */ + if (dm_suspend(md, 0)) { + DMDEBUG("failed to suspend the device pre-bind"); + goto suspend_fail; + } + + /* Bind the table to the device. This is the only way to associate + * md->map with the table and set the disk capacity directly. */ + if (dm_swap_table(md, table)) { /* should return NULL. */ + DMDEBUG("failed to bind the device to the table"); + goto table_bind_fail; + } + + /* Finally, resume and the device should be ready. */ + if (dm_resume(md)) { + DMDEBUG("failed to resume the device"); + goto resume_fail; + } + + /* Export the dm device via the ioctl interface */ + if (!strcmp(DM_NO_UUID, dm_setup_args.uuid)) + uuid = NULL; + if (dm_ioctl_export(md, dm_setup_args.name, uuid)) { + DMDEBUG("failed to export device with given name and uuid"); + goto export_fail; + } + printk(KERN_INFO "dm: dm-%d is ready\n", dm_setup_args.minor); + + dm_setup_cleanup(); + return; + +export_fail: +resume_fail: +table_bind_fail: +suspend_fail: +table_complete_fail: +add_target_fail: + dm_table_put(table); +dm_table_create_fail: + dm_put(md); +dm_create_fail: + dm_setup_cleanup(); +parse_fail: + printk(KERN_WARNING "dm: starting dm-%d (%s) failed\n", + dm_setup_args.minor, dm_setup_args.name); +} + +__setup("dm=", dm_setup); + +void __init dm_run_setup(void) +{ + if (!dm_early_setup) + return; + printk(KERN_INFO "dm: attempting early device configuration.\n"); + dm_setup_drive(); +} From e33d750b4ebc99a7804d096a28c29beccd77ee85 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 8 Feb 2016 16:47:41 -0800 Subject: [PATCH 133/813] ANDROID: dm: Rebase on top of 4.1 1. "dm: optimize use SRCU and RCU" removes the use of dm_table_put. 2. "dm: remove request-based logic from make_request_fn wrapper" necessitates calling dm_setup_md_queue or else the request_queue's make_request_fn pointer ends being unset. [ 7.711600] Internal error: Oops - bad mode: 0 [#1] PREEMPT SMP [ 7.717519] CPU: 1 PID: 1 Comm: swapper/0 Tainted: G W 4.1.15-02273-gb057d16-dirty #33 [ 7.726559] Hardware name: HiKey Development Board (DT) [ 7.731779] task: ffffffc005f8acc0 ti: ffffffc005f8c000 task.ti: ffffffc005f8c000 [ 7.739257] PC is at 0x0 [ 7.741787] LR is at generic_make_request+0x8c/0x108 .... 
[ 9.082931] Call trace: [ 9.085372] [< (null)>] (null) [ 9.090074] [] submit_bio+0x98/0x1e0 [ 9.095212] [] _submit_bh+0x120/0x1f0 [ 9.096165] cfg80211: Calling CRDA to update world regulatory domain [ 9.106781] [] __bread_gfp+0x94/0x114 [ 9.112004] [] ext4_fill_super+0x18c/0x2d64 [ 9.117750] [] mount_bdev+0x194/0x1c0 [ 9.122973] [] ext4_mount+0x14/0x1c [ 9.128021] [] mount_fs+0x3c/0x194 [ 9.132985] [] vfs_kern_mount+0x4c/0x134 [ 9.138467] [] do_mount+0x204/0xbbc [ 9.143514] [] SyS_mount+0x94/0xe8 [ 9.148479] [] mount_block_root+0x120/0x24c [ 9.154222] [] mount_root+0x110/0x12c [ 9.159443] [] prepare_namespace+0x170/0x1b8 [ 9.165273] [] kernel_init_freeable+0x23c/0x260 [ 9.171365] [] kernel_init+0x10/0x118 [ 9.176589] Code: bad PC value [ 9.179807] ---[ end trace 75e1bc52ba364d13 ]--- Bug: 27175947 Signed-off-by: Badhri Jagan Sridharan Change-Id: I952d86fd1475f0825f9be1386e3497b36127abd0 --- init/do_mounts_dm.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c index 0fd3411533f3..f521bc5ae248 100644 --- a/init/do_mounts_dm.c +++ b/init/do_mounts_dm.c @@ -10,6 +10,7 @@ #include #include "do_mounts.h" +#include "../drivers/md/dm.h" #define DM_MAX_NAME 32 #define DM_MAX_UUID 129 @@ -333,6 +334,7 @@ static void __init dm_setup_drive(void) goto dm_table_create_fail; } + dm_lock_md_type(md); target = dm_setup_args.target; while (target) { DMINFO("adding target '%llu %llu %s %s'", @@ -352,6 +354,17 @@ static void __init dm_setup_drive(void) goto table_complete_fail; } + if (dm_get_md_type(md) == DM_TYPE_NONE) { + dm_set_md_type(md, dm_table_get_type(table)); + if (dm_setup_md_queue(md)) { + DMWARN("unable to set up device queue for new table."); + goto setup_md_queue_fail; + } + } else if (dm_get_md_type(md) != dm_table_get_type(table)) { + DMWARN("can't change device type after initial table load."); + goto setup_md_queue_fail; + } + /* Suspend the device so that we can bind it to the table. */ if (dm_suspend(md, 0)) { DMDEBUG("failed to suspend the device pre-bind"); @@ -380,6 +393,7 @@ static void __init dm_setup_drive(void) } printk(KERN_INFO "dm: dm-%d is ready\n", dm_setup_args.minor); + dm_unlock_md_type(md); dm_setup_cleanup(); return; @@ -387,9 +401,10 @@ export_fail: resume_fail: table_bind_fail: suspend_fail: +setup_md_queue_fail: table_complete_fail: add_target_fail: - dm_table_put(table); + dm_unlock_md_type(md); dm_table_create_fail: dm_put(md); dm_create_fail: From b8681ffa6859d3f267d625d4770987df5832a42b Mon Sep 17 00:00:00 2001 From: Jeremy Compostella Date: Mon, 2 May 2016 17:29:28 +0200 Subject: [PATCH 134/813] ANDROID: dm: fix dm_substitute_devices() When candidate is the last parameter, candidate_end points to the '\0' character and not the DM_FIELD_SEP character. In such a situation, we should not move the candidate_end pointer one character backward. 
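A minimal userspace reproduction of the boundary case (not kernel code; option_end() mirrors how get_dm_option() computes its *next pointer): when the device path is the last token on the line, the end pointer already sits on the terminating '\0', and stepping it back would clobber the path's final byte, turning "/dev/sda" into "/dev/sd":

  #include <stdio.h>
  #include <string.h>

  /* mirrors get_dm_option(): one past the separator, or the nul itself
   * when the token is the last thing on the line */
  static char *option_end(char *str, char sep)
  {
  	char *endp = strchr(str, sep);

  	if (!endp)
  		endp = str + strlen(str);
  	return *endp ? endp + 1 : endp;
  }

  int main(void)
  {
  	char mid[]  = "linear /dev/sda 0";	/* separator follows the path */
  	char last[] = "linear /dev/sda";	/* path is the last token */
  	char *bufs[] = { mid, last };
  	int i;

  	for (i = 0; i < 2; i++) {
  		char *candidate = strchr(bufs[i], '/');
  		char *end = option_end(candidate, ' ');

  		if (*end)	/* the fix: step back only onto a real separator */
  			end--;
  		*end = '\0';
  		printf("token: %s\n", candidate);	/* /dev/sda both times */
  	}
  	return 0;
  }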
Signed-off-by: Jeremy Compostella --- init/do_mounts_dm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c index f521bc5ae248..ecda58df9a19 100644 --- a/init/do_mounts_dm.c +++ b/init/do_mounts_dm.c @@ -176,7 +176,8 @@ static void __init dm_substitute_devices(char *str, size_t str_len) continue; /* Temporarily terminate with a nul */ - candidate_end--; + if (*candidate_end) + candidate_end--; old_char = *candidate_end; *candidate_end = '\0'; From 7ca5a9b8bb75bd21ec8cab38f8c6d285766c43a6 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 14 Dec 2015 20:09:39 -0800 Subject: [PATCH 135/813] ANDROID: dm: Add android verity target This device-mapper target is virtually a VERITY target. This target is set up by reading the metadata contents piggybacked to the actual data blocks in the block device. The signature of the metadata contents is verified against the key included in the system keyring. Upon success, the underlying verity target is set up. BUG: 27175947 Change-Id: I7e99644a0960ac8279f02c0158ed20999510ea97 Signed-off-by: Badhri Jagan Sridharan --- drivers/md/Kconfig | 16 + drivers/md/Makefile | 4 + drivers/md/dm-android-verity.c | 771 +++++++++++++++++++++++++++++++++ drivers/md/dm-android-verity.h | 92 ++++ drivers/md/dm-verity-target.c | 12 +- drivers/md/dm-verity.h | 12 + 6 files changed, 901 insertions(+), 6 deletions(-) create mode 100644 drivers/md/dm-android-verity.c create mode 100644 drivers/md/dm-android-verity.h diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index d8b0ab6f3753..96b419b544ed 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -500,4 +500,20 @@ config DM_LOG_WRITES If unsure, say N. +config DM_ANDROID_VERITY + bool "Android verity target support" + depends on DM_VERITY + depends on X509_CERTIFICATE_PARSER + depends on SYSTEM_TRUSTED_KEYRING + depends on PUBLIC_KEY_ALGO_RSA + depends on KEYS + depends on ASYMMETRIC_KEY_TYPE + depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE + ---help--- + This device-mapper target is virtually a VERITY target. This + target is setup by reading the metadata contents piggybacked + to the actual data blocks in the block device. The signature + of the metadata contents are verified against the key included + in the system keyring. Upon success, the underlying verity + target is setup. endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 62a65764e8e0..c8fb00d8cc36 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -68,3 +68,7 @@ endif ifeq ($(CONFIG_DM_VERITY_FEC),y) dm-verity-objs += dm-verity-fec.o endif + +ifeq ($(CONFIG_DM_ANDROID_VERITY),y) +dm-verity-objs += dm-android-verity.o +endif diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c new file mode 100644 index 000000000000..c77c9fa7a962 --- /dev/null +++ b/drivers/md/dm-android-verity.c @@ -0,0 +1,771 @@ +/* + * Copyright (C) 2015 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "dm-verity.h" +#include "dm-android-verity.h" + +static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH]; +static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH]; + +static int __init verified_boot_state_param(char *line) +{ + strlcpy(verifiedbootstate, line, sizeof(verifiedbootstate)); + return 1; +} + +__setup("androidboot.verifiedbootstate=", verified_boot_state_param); + +static int __init verity_mode_param(char *line) +{ + strlcpy(veritymode, line, sizeof(veritymode)); + return 1; +} + +__setup("androidboot.veritymode=", verity_mode_param); + +static int table_extract_mpi_array(struct public_key_signature *pks, + const void *data, size_t len) +{ + MPI mpi = mpi_read_raw_data(data, len); + + if (!mpi) { + DMERR("Error while allocating mpi array"); + return -ENOMEM; + } + + pks->mpi[0] = mpi; + pks->nr_mpi = 1; + return 0; +} + +static struct public_key_signature *table_make_digest( + enum pkey_hash_algo hash, + const void *table, + unsigned long table_len) +{ + struct public_key_signature *pks = NULL; + struct crypto_shash *tfm; + struct shash_desc *desc; + size_t digest_size, desc_size; + int ret; + + /* Allocate the hashing algorithm we're going to need and find out how + * big the hash operational data will be. + */ + tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); + if (IS_ERR(tfm)) + return ERR_CAST(tfm); + + desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); + digest_size = crypto_shash_digestsize(tfm); + + /* We allocate the hash operational data storage on the end of out + * context data and the digest output buffer on the end of that. 
+ */ + ret = -ENOMEM; + pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL); + if (!pks) + goto error; + + pks->pkey_hash_algo = hash; + pks->digest = (u8 *)pks + sizeof(*pks) + desc_size; + pks->digest_size = digest_size; + + desc = (struct shash_desc *)(pks + 1); + desc->tfm = tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + ret = crypto_shash_init(desc); + if (ret < 0) + goto error; + + ret = crypto_shash_finup(desc, table, table_len, pks->digest); + if (ret < 0) + goto error; + + crypto_free_shash(tfm); + return pks; + +error: + kfree(pks); + crypto_free_shash(tfm); + return ERR_PTR(ret); +} + +static int read_block_dev(struct bio_read *payload, struct block_device *bdev, + sector_t offset, int length) +{ + struct bio *bio; + int err = 0, i; + + payload->number_of_pages = DIV_ROUND_UP(length, PAGE_SIZE); + + bio = bio_alloc(GFP_KERNEL, payload->number_of_pages); + if (!bio) { + DMERR("Error while allocating bio"); + return -ENOMEM; + } + + bio->bi_bdev = bdev; + bio->bi_sector = offset; + + payload->page_io = kzalloc(sizeof(struct page *) * + payload->number_of_pages, GFP_KERNEL); + if (!payload->page_io) { + DMERR("page_io array alloc failed"); + err = -ENOMEM; + goto free_bio; + } + + for (i = 0; i < payload->number_of_pages; i++) { + payload->page_io[i] = alloc_page(GFP_KERNEL); + if (!payload->page_io[i]) { + DMERR("alloc_page failed"); + err = -ENOMEM; + goto free_pages; + } + if (!bio_add_page(bio, payload->page_io[i], PAGE_SIZE, 0)) { + DMERR("bio_add_page error"); + err = -EIO; + goto free_pages; + } + } + + if (!submit_bio_wait(READ, bio)) + /* success */ + goto free_bio; + DMERR("bio read failed"); + err = -EIO; + +free_pages: + for (i = 0; i < payload->number_of_pages; i++) + if (payload->page_io[i]) + __free_page(payload->page_io[i]); + kfree(payload->page_io); +free_bio: + bio_put(bio); + return err; +} + +static inline u64 fec_div_round_up(u64 x, u64 y) +{ + u64 remainder; + + return div64_u64_rem(x, y, &remainder) + + (remainder > 0 ? 1 : 0); +} + +static inline void populate_fec_metadata(struct fec_header *header, + struct fec_ecc_metadata *ecc) +{ + ecc->blocks = fec_div_round_up(le64_to_cpu(header->inp_size), + FEC_BLOCK_SIZE); + ecc->roots = le32_to_cpu(header->roots); + ecc->start = le64_to_cpu(header->inp_size); +} + +static inline int validate_fec_header(struct fec_header *header, u64 offset) +{ + /* move offset to make the sanity check work for backup header + * as well. */ + offset -= offset % FEC_BLOCK_SIZE; + if (le32_to_cpu(header->magic) != FEC_MAGIC || + le32_to_cpu(header->version) != FEC_VERSION || + le32_to_cpu(header->size) != sizeof(struct fec_header) || + le32_to_cpu(header->roots) == 0 || + le32_to_cpu(header->roots) >= FEC_RSM || + offset < le32_to_cpu(header->fec_size) || + offset - le32_to_cpu(header->fec_size) != + le64_to_cpu(header->inp_size)) + return -EINVAL; + + return 0; +} + +static int extract_fec_header(dev_t dev, struct fec_header *fec, + struct fec_ecc_metadata *ecc) +{ + u64 device_size; + struct bio_read payload; + int i, err = 0; + struct block_device *bdev; + + bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + + if (IS_ERR(bdev)) { + DMERR("bdev get error"); + return PTR_ERR(bdev); + } + + device_size = i_size_read(bdev->bd_inode); + + /* fec metadata size is a power of 2 and PAGE_SIZE + * is a power of 2 as well. 
+ */ + BUG_ON(FEC_BLOCK_SIZE > PAGE_SIZE); + /* 512 byte sector alignment */ + BUG_ON(((device_size - FEC_BLOCK_SIZE) % (1 << SECTOR_SHIFT)) != 0); + + err = read_block_dev(&payload, bdev, (device_size - + FEC_BLOCK_SIZE) / (1 << SECTOR_SHIFT), FEC_BLOCK_SIZE); + if (err) { + DMERR("Error while reading verity metadata"); + goto error; + } + + BUG_ON(sizeof(struct fec_header) > PAGE_SIZE); + memcpy(fec, page_address(payload.page_io[0]), + sizeof(*fec)); + + ecc->valid = true; + if (validate_fec_header(fec, device_size - FEC_BLOCK_SIZE)) { + /* Try the backup header */ + memcpy(fec, page_address(payload.page_io[0]) + FEC_BLOCK_SIZE + - sizeof(*fec) , + sizeof(*fec)); + if (validate_fec_header(fec, device_size - + sizeof(struct fec_header))) + ecc->valid = false; + } + + if (ecc->valid) + populate_fec_metadata(fec, ecc); + + for (i = 0; i < payload.number_of_pages; i++) + __free_page(payload.page_io[i]); + kfree(payload.page_io); + +error: + blkdev_put(bdev, FMODE_READ); + return err; +} +static void find_metadata_offset(struct fec_header *fec, + struct block_device *bdev, u64 *metadata_offset) +{ + u64 device_size; + + device_size = i_size_read(bdev->bd_inode); + + if (le32_to_cpu(fec->magic) == FEC_MAGIC) + *metadata_offset = le64_to_cpu(fec->inp_size) - + VERITY_METADATA_SIZE; + else + *metadata_offset = device_size - VERITY_METADATA_SIZE; +} + +static struct android_metadata *extract_metadata(dev_t dev, + struct fec_header *fec) +{ + struct block_device *bdev; + struct android_metadata_header *header; + struct android_metadata *uninitialized_var(metadata); + int i; + u32 table_length, copy_length, offset; + u64 metadata_offset; + struct bio_read payload; + int err = 0; + + bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + + if (IS_ERR(bdev)) { + DMERR("blkdev_get_by_dev failed"); + return ERR_CAST(bdev); + } + + find_metadata_offset(fec, bdev, &metadata_offset); + + /* Verity metadata size is a power of 2 and PAGE_SIZE + * is a power of 2 as well. + * PAGE_SIZE is also a multiple of 512 bytes. 
+ */ + if (VERITY_METADATA_SIZE > PAGE_SIZE) + BUG_ON(VERITY_METADATA_SIZE % PAGE_SIZE != 0); + /* 512 byte sector alignment */ + BUG_ON(metadata_offset % (1 << SECTOR_SHIFT) != 0); + + err = read_block_dev(&payload, bdev, metadata_offset / + (1 << SECTOR_SHIFT), VERITY_METADATA_SIZE); + if (err) { + DMERR("Error while reading verity metadata"); + metadata = ERR_PTR(err); + goto blkdev_release; + } + + header = kzalloc(sizeof(*header), GFP_KERNEL); + if (!header) { + DMERR("kzalloc failed for header"); + err = -ENOMEM; + goto free_payload; + } + + memcpy(header, page_address(payload.page_io[0]), + sizeof(*header)); + + DMINFO("bio magic_number:%u protocol_version:%d table_length:%u", + le32_to_cpu(header->magic_number), + le32_to_cpu(header->protocol_version), + le32_to_cpu(header->table_length)); + + metadata = kzalloc(sizeof(*metadata), GFP_KERNEL); + if (!metadata) { + DMERR("kzalloc for metadata failed"); + err = -ENOMEM; + goto free_header; + } + + metadata->header = header; + table_length = le32_to_cpu(header->table_length); + + if (table_length == 0 || + table_length > (VERITY_METADATA_SIZE - + sizeof(struct android_metadata_header))) + goto free_metadata; + + metadata->verity_table = kzalloc(table_length + 1, GFP_KERNEL); + + if (!metadata->verity_table) { + DMERR("kzalloc verity_table failed"); + err = -ENOMEM; + goto free_metadata; + } + + if (sizeof(struct android_metadata_header) + + table_length <= PAGE_SIZE) { + memcpy(metadata->verity_table, page_address(payload.page_io[0]) + + sizeof(struct android_metadata_header), + table_length); + } else { + copy_length = PAGE_SIZE - + sizeof(struct android_metadata_header); + memcpy(metadata->verity_table, page_address(payload.page_io[0]) + + sizeof(struct android_metadata_header), + copy_length); + table_length -= copy_length; + offset = copy_length; + i = 1; + while (table_length != 0) { + if (table_length > PAGE_SIZE) { + memcpy(metadata->verity_table + offset, + page_address(payload.page_io[i]), + PAGE_SIZE); + offset += PAGE_SIZE; + table_length -= PAGE_SIZE; + } else { + memcpy(metadata->verity_table + offset, + page_address(payload.page_io[i]), + table_length); + table_length = 0; + } + i++; + } + } + metadata->verity_table[table_length] = '\0'; + + goto free_payload; + +free_metadata: + kfree(metadata); +free_header: + kfree(header); + metadata = ERR_PTR(err); +free_payload: + for (i = 0; i < payload.number_of_pages; i++) + if (payload.page_io[i]) + __free_page(payload.page_io[i]); + kfree(payload.page_io); + + DMINFO("verity_table: %s", metadata->verity_table); +blkdev_release: + blkdev_put(bdev, FMODE_READ); + return metadata; +} + +/* helper functions to extract properties from dts */ +const char *find_dt_value(const char *name) +{ + struct device_node *firmware; + const char *value; + + firmware = of_find_node_by_path("/firmware/android"); + if (!firmware) + return NULL; + value = of_get_property(firmware, name, NULL); + of_node_put(firmware); + + return value; +} + +static bool is_unlocked(void) +{ + static const char unlocked[] = "orange"; + static const char verified_boot_prop[] = "verifiedbootstate"; + const char *value; + + value = find_dt_value(verified_boot_prop); + if (!value) + value = verifiedbootstate; + + return !strncmp(value, unlocked, sizeof(unlocked) - 1); +} + +static int verity_mode(void) +{ + static const char enforcing[] = "enforcing"; + static const char verified_mode_prop[] = "veritymode"; + const char *value; + + value = find_dt_value(verified_mode_prop); + if (!value) + value = veritymode; + if 
(!strncmp(value, enforcing, sizeof(enforcing) - 1)) + return DM_VERITY_MODE_RESTART; + + return DM_VERITY_MODE_EIO; +} + +static int verify_header(struct android_metadata_header *header) +{ + int retval = -EINVAL; + + if (is_unlocked() && le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_DISABLE) { + retval = VERITY_STATE_DISABLE; + return retval; + } + + if (!(le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_NUMBER) || + (le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_DISABLE)) { + DMERR("Incorrect magic number"); + return retval; + } + + if (le32_to_cpu(header->protocol_version) != + VERITY_METADATA_VERSION) { + DMERR("Unsupported version %u", + le32_to_cpu(header->protocol_version)); + return retval; + } + + return 0; +} + +static int verify_verity_signature(char *key_id, + struct android_metadata *metadata) +{ + key_ref_t key_ref; + struct key *key; + struct public_key_signature *pks = NULL; + int retval = -EINVAL; + + key_ref = keyring_search(make_key_ref(system_trusted_keyring, 1), + &key_type_asymmetric, key_id); + + if (IS_ERR(key_ref)) { + DMERR("keyring: key not found"); + return -ENOKEY; + } + + key = key_ref_to_ptr(key_ref); + + pks = table_make_digest(PKEY_HASH_SHA256, + (const void *)metadata->verity_table, + le32_to_cpu(metadata->header->table_length)); + + if (IS_ERR(pks)) { + DMERR("hashing failed"); + goto error; + } + + retval = table_extract_mpi_array(pks, &metadata->header->signature[0], + RSANUMBYTES); + if (retval < 0) { + DMERR("Error extracting mpi %d", retval); + goto error; + } + + retval = verify_signature(key, pks); + mpi_free(pks->rsa.s); +error: + kfree(pks); + key_put(key); + + return retval; +} + +static void handle_error(void) +{ + int mode = verity_mode(); + if (mode == DM_VERITY_MODE_RESTART) { + DMERR("triggering restart"); + kernel_restart("dm-verity device corrupted"); + } else { + DMERR("Mounting verity root failed"); + } +} + +static inline bool test_mult_overflow(sector_t a, u32 b) +{ + sector_t r = (sector_t)~0ULL; + + sector_div(r, b); + return a > r; +} + +/* + * Target parameters: + * Key id of the public key in the system keyring. + * Verity metadata's signature would be verified against + * this. If the key id contains spaces, replace them + * with '#'. + * The block device for which dm-verity is being setup. 
+ */ +static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + dev_t uninitialized_var(dev); + struct android_metadata *uninitialized_var(metadata); + int err = 0, i, mode; + char *key_id, *table_ptr, dummy, + *verity_table_args[VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS]; + /* One for specifying number of opt args and one for mode */ + sector_t data_sectors; + u32 data_block_size; + unsigned int major, minor, + no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS; + struct fec_header fec; + struct fec_ecc_metadata uninitialized_var(ecc); + char buf[FEC_ARG_LENGTH], *buf_ptr; + unsigned long long tmpll; + + if (argc != 2) { + DMERR("Incorrect number of arguments"); + handle_error(); + return -EINVAL; + } + + /* should come as one of the arguments for the verity target */ + key_id = argv[0]; + strreplace(argv[0], '#', ' '); + + if (sscanf(argv[1], "%u:%u%c", &major, &minor, &dummy) == 2) { + dev = MKDEV(major, minor); + if (MAJOR(dev) != major || MINOR(dev) != minor) { + DMERR("Incorrect bdev major minor number"); + handle_error(); + return -EOVERFLOW; + } + } + + DMINFO("key:%s dev:%s", argv[0], argv[1]); + + if (extract_fec_header(dev, &fec, &ecc)) { + DMERR("Error while extracting fec header"); + handle_error(); + return -EINVAL; + } + + metadata = extract_metadata(dev, &fec); + + if (IS_ERR(metadata)) { + DMERR("Error while extracting metadata"); + handle_error(); + return -EINVAL; + } + + err = verify_header(metadata->header); + + if (err == VERITY_STATE_DISABLE) { + DMERR("Mounting root with verity disabled"); + return -EINVAL; + } else if (err) { + DMERR("Verity header handle error"); + handle_error(); + goto free_metadata; + } + + err = verify_verity_signature(key_id, metadata); + + if (err) { + DMERR("Signature verification failed"); + handle_error(); + goto free_metadata; + } else + DMINFO("Signature verification success"); + + table_ptr = metadata->verity_table; + + for (i = 0; i < VERITY_TABLE_ARGS; i++) { + verity_table_args[i] = strsep(&table_ptr, " "); + if (verity_table_args[i] == NULL) + break; + } + + if (i != VERITY_TABLE_ARGS) { + DMERR("Verity table not in the expected format"); + err = -EINVAL; + handle_error(); + goto free_metadata; + } + + if (sscanf(verity_table_args[5], "%llu%c", &tmpll, &dummy) + != 1) { + DMERR("Verity table not in the expected format"); + handle_error(); + err = -EINVAL; + goto free_metadata; + } + + if (tmpll > ULONG_MAX) { + DMERR(" too large. Forgot to turn on CONFIG_LBDAF?"); + handle_error(); + err = -EINVAL; + goto free_metadata; + } + + data_sectors = tmpll; + + if (sscanf(verity_table_args[3], "%u%c", &data_block_size, &dummy) + != 1) { + DMERR("Verity table not in the expected format"); + handle_error(); + err = -EINVAL; + goto free_metadata; + } + + if (test_mult_overflow(data_sectors, data_block_size >> + SECTOR_SHIFT)) { + DMERR("data_sectors too large"); + handle_error(); + err = -EOVERFLOW; + goto free_metadata; + } + + data_sectors *= data_block_size >> SECTOR_SHIFT; + DMINFO("Data sectors %llu", (unsigned long long)data_sectors); + + /* update target length */ + ti->len = data_sectors; + + /*substitute data_dev and hash_dev*/ + verity_table_args[1] = argv[1]; + verity_table_args[2] = argv[1]; + + mode = verity_mode(); + + if (ecc.valid && IS_BUILTIN(CONFIG_DM_VERITY_FEC)) { + if (mode) { + err = snprintf(buf, FEC_ARG_LENGTH, + "%u %s " VERITY_TABLE_OPT_FEC_FORMAT, + 1 + VERITY_TABLE_OPT_FEC_ARGS, + mode == DM_VERITY_MODE_RESTART ? 
+ VERITY_TABLE_OPT_RESTART : VERITY_TABLE_OPT_LOGGING, + argv[1], ecc.start / FEC_BLOCK_SIZE, ecc.blocks, + ecc.roots); + } else { + err = snprintf(buf, FEC_ARG_LENGTH, + "%u " VERITY_TABLE_OPT_FEC_FORMAT, + VERITY_TABLE_OPT_FEC_ARGS, argv[1], + ecc.start / FEC_BLOCK_SIZE, ecc.blocks, ecc.roots); + } + } else if (mode) { + err = snprintf(buf, FEC_ARG_LENGTH, + "2 " VERITY_TABLE_OPT_IGNZERO " %s", + mode == DM_VERITY_MODE_RESTART ? + VERITY_TABLE_OPT_RESTART : VERITY_TABLE_OPT_LOGGING); + } else { + err = snprintf(buf, FEC_ARG_LENGTH, "1 %s", + "ignore_zero_blocks"); + } + + if (err < 0 || err >= FEC_ARG_LENGTH) + goto free_metadata; + + buf_ptr = buf; + + for (i = VERITY_TABLE_ARGS; i < (VERITY_TABLE_ARGS + + VERITY_TABLE_OPT_FEC_ARGS + 2); i++) { + verity_table_args[i] = strsep(&buf_ptr, " "); + if (verity_table_args[i] == NULL) { + no_of_args = i; + break; + } + } + + err = verity_ctr(ti, no_of_args, verity_table_args); + +free_metadata: + kfree(metadata->header); + kfree(metadata->verity_table); + kfree(metadata); + return err; +} + +static struct target_type android_verity_target = { + .name = "android-verity", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = android_verity_ctr, + .dtr = verity_dtr, + .map = verity_map, + .status = verity_status, + .ioctl = verity_ioctl, + .merge = verity_merge, + .iterate_devices = verity_iterate_devices, + .io_hints = verity_io_hints, +}; + +static int __init dm_android_verity_init(void) +{ + int r; + + r = dm_register_target(&android_verity_target); + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void __exit dm_android_verity_exit(void) +{ + dm_unregister_target(&android_verity_target); +} + +module_init(dm_android_verity_init); +module_exit(dm_android_verity_exit); diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h new file mode 100644 index 000000000000..11477ffd2243 --- /dev/null +++ b/drivers/md/dm-android-verity.h @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2015 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef DM_ANDROID_VERITY_H +#define DM_ANDROID_VERITY_H + +#include + +#define RSANUMBYTES 256 +#define VERITY_METADATA_MAGIC_NUMBER 0xb001b001 +#define VERITY_METADATA_MAGIC_DISABLE 0x46464f56 +#define VERITY_METADATA_VERSION 0 +#define VERITY_STATE_DISABLE 1 +#define DATA_BLOCK_SIZE (4 * 1024) +#define VERITY_METADATA_SIZE (8 * DATA_BLOCK_SIZE) +#define VERITY_TABLE_ARGS 10 +#define VERITY_COMMANDLINE_PARAM_LENGTH 20 + +#define FEC_MAGIC 0xFECFECFE +#define FEC_BLOCK_SIZE (4 * 1024) +#define FEC_VERSION 0 +#define FEC_RSM 255 +#define FEC_ARG_LENGTH 300 + +#define VERITY_TABLE_OPT_RESTART "restart_on_corruption" +#define VERITY_TABLE_OPT_LOGGING "ignore_corruption" +#define VERITY_TABLE_OPT_IGNZERO "ignore_zero_blocks" + +#define VERITY_TABLE_OPT_FEC_FORMAT \ + "use_fec_from_device %s fec_start %llu fec_blocks %llu fec_roots %u ignore_zero_blocks" +#define VERITY_TABLE_OPT_FEC_ARGS 9 + +#define VERITY_DEBUG 0 + +#define DM_MSG_PREFIX "android-verity" +/* + * There can be two formats. 
+ * if fec is present + * + * if fec is not present + * + */ +/* TODO: rearrange structure to reduce memory holes + * depends on userspace change. + */ +struct fec_header { + __le32 magic; + __le32 version; + __le32 size; + __le32 roots; + __le32 fec_size; + __le64 inp_size; + u8 hash[SHA256_DIGEST_SIZE]; +}; + +struct android_metadata_header { + __le32 magic_number; + __le32 protocol_version; + char signature[RSANUMBYTES]; + __le32 table_length; +}; + +struct android_metadata { + struct android_metadata_header *header; + char *verity_table; +}; + +struct fec_ecc_metadata { + bool valid; + u32 roots; + u64 blocks; + u64 rounds; + u64 start; +}; + +struct bio_read { + struct page **page_io; + int number_of_pages; +}; + +#endif /* DM_ANDROID_VERITY_H */ diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 5c5d30cb6ec5..65835f15a116 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -551,7 +551,7 @@ static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io) * Bio map function. It allocates dm_verity_io structure and bio vector and * fills them. Then it issues prefetches and the I/O. */ -static int verity_map(struct dm_target *ti, struct bio *bio) +int verity_map(struct dm_target *ti, struct bio *bio) { struct dm_verity *v = ti->private; struct dm_verity_io *io; @@ -596,7 +596,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) /* * Status: V (valid) or C (corruption found) */ -static void verity_status(struct dm_target *ti, status_type_t type, +void verity_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { struct dm_verity *v = ti->private; @@ -669,7 +669,7 @@ static int verity_prepare_ioctl(struct dm_target *ti, return 0; } -static int verity_iterate_devices(struct dm_target *ti, +int verity_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { struct dm_verity *v = ti->private; @@ -677,7 +677,7 @@ static int verity_iterate_devices(struct dm_target *ti, return fn(ti, v->data_dev, v->data_start, ti->len, data); } -static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) +void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct dm_verity *v = ti->private; @@ -690,7 +690,7 @@ static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, limits->logical_block_size); } -static void verity_dtr(struct dm_target *ti) +void verity_dtr(struct dm_target *ti) { struct dm_verity *v = ti->private; @@ -817,7 +817,7 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) * * Hex string or "-" if no salt. 
*/ -static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) +int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) { struct dm_verity *v; struct dm_arg_set as; diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index fb419f422d73..d9cf5e4939eb 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -126,4 +126,16 @@ extern int verity_hash(struct dm_verity *v, struct shash_desc *desc, extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, sector_t block, u8 *digest, bool *is_zero); +extern void verity_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen); +extern int verity_ioctl(struct dm_target *ti, unsigned cmd, + unsigned long arg); +extern int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size); +extern int verity_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data); +extern void verity_io_hints(struct dm_target *ti, struct queue_limits *limits); +extern void verity_dtr(struct dm_target *ti); +extern int verity_ctr(struct dm_target *ti, unsigned argc, char **argv); +extern int verity_map(struct dm_target *ti, struct bio *bio); #endif /* DM_VERITY_H */ From fb26d63ec51d7929cb174485b17b3fcf9fadf913 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 8 Feb 2016 16:28:43 -0800 Subject: [PATCH 136/813] ANDROID: dm-android-verity: Rebase on top of 4.1 The following upstream CLs cause minor changes to the dm-android-verity target. 1. keys: change asymmetric keys to use common hash definitions 2. block: Abstract out bvec iterator Rebase dm-android-verity on top of these changes. Bug: 27175947 Signed-off-by: Badhri Jagan Sridharan Change-Id: Icfdc3e7b3ead5de335a059cade1aca70414db415 --- drivers/md/dm-android-verity.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index c77c9fa7a962..aeb5045830d9 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -75,7 +75,7 @@ static int table_extract_mpi_array(struct public_key_signature *pks, } static struct public_key_signature *table_make_digest( - enum pkey_hash_algo hash, + enum hash_algo hash, const void *table, unsigned long table_len) { @@ -88,7 +88,7 @@ static struct public_key_signature *table_make_digest( /* Allocate the hashing algorithm we're going to need and find out how * big the hash operational data will be.
*/ - tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); + tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0); if (IS_ERR(tfm)) return ERR_CAST(tfm); @@ -143,7 +143,7 @@ static int read_block_dev(struct bio_read *payload, struct block_device *bdev, } bio->bi_bdev = bdev; - bio->bi_sector = offset; + bio->bi_iter.bi_sector = offset; payload->page_io = kzalloc(sizeof(struct page *) * payload->number_of_pages, GFP_KERNEL); @@ -505,7 +505,7 @@ static int verify_verity_signature(char *key_id, key = key_ref_to_ptr(key_ref); - pks = table_make_digest(PKEY_HASH_SHA256, + pks = table_make_digest(HASH_ALGO_SHA256, (const void *)metadata->verity_table, le32_to_cpu(metadata->header->table_length)); @@ -569,7 +569,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) u32 data_block_size; unsigned int major, minor, no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS; - struct fec_header fec; + struct fec_header uninitialized_var(fec); struct fec_ecc_metadata uninitialized_var(ecc); char buf[FEC_ARG_LENGTH], *buf_ptr; unsigned long long tmpll; From 830a4070ad6398d7251eb44045c83fe0783cae24 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 21 Mar 2016 10:55:23 -0700 Subject: [PATCH 137/813] ANDROID: dm: Mounting root as linear device when verity disabled This CL makes the android-verity target be added as a linear dm device when the bootloader is unlocked and verity is disabled. Bug: 27175947 Change-Id: Ic41ca4b8908fb2777263799cf3a3e25934d70f18 Signed-off-by: Badhri Jagan Sridharan --- drivers/md/dm-android-verity.c | 131 +++++++++++++++++++++++++++------ drivers/md/dm-android-verity.h | 3 + drivers/md/dm-linear.c | 2 +- 3 files changed, 113 insertions(+), 23 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index aeb5045830d9..db4ddf789a39 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -13,6 +13,7 @@ */ #include +#include #include #include #include @@ -43,6 +44,25 @@ static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH]; static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH]; +static bool target_added; +static bool verity_enabled = true; +struct dentry *debug_dir; +static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv); + +static struct target_type android_verity_target = { + .name = "android-verity", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = android_verity_ctr, + .dtr = verity_dtr, + .map = verity_map, + .status = verity_status, + .ioctl = verity_ioctl, + .merge = verity_merge, + .iterate_devices = verity_iterate_devices, + .io_hints = verity_io_hints, +}; + static int __init verified_boot_state_param(char *line) { strlcpy(verifiedbootstate, line, sizeof(verifiedbootstate)); @@ -549,6 +569,35 @@ static inline bool test_mult_overflow(sector_t a, u32 b) return a > r; } +static int add_as_linear_device(struct dm_target *ti, char *dev) +{ + /*Move to linear mapping defines*/ + char *linear_table_args[DM_LINEAR_ARGS]; + char offset[] = "0"; + int err = 0; + + linear_table_args[0] = dev; + linear_table_args[1] = offset; + + android_verity_target.dtr = linear_target.dtr, + android_verity_target.map = linear_target.map, + android_verity_target.status = linear_target.status, + android_verity_target.ioctl = linear_target.ioctl, + android_verity_target.merge = linear_target.merge, + android_verity_target.iterate_devices = linear_target.iterate_devices, + android_verity_target.io_hints = NULL; + + err = linear_target.ctr(ti,
DM_LINEAR_ARGS, linear_table_args); + + if (!err) { + DMINFO("Added android-verity as a linear target"); + target_added = true; + } else + DMERR("Failed to add android-verity as linear target"); + + return err; +} + /* * Target parameters: * Key id of the public key in the system keyring. @@ -613,21 +662,27 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) if (err == VERITY_STATE_DISABLE) { DMERR("Mounting root with verity disabled"); - return -EINVAL; + verity_enabled = false; + /* we would still have to parse the args to figure out + * the data blocks size. Or may be could map the entire + * partition similar to mounting the device. + */ } else if (err) { DMERR("Verity header handle error"); handle_error(); goto free_metadata; } - err = verify_verity_signature(key_id, metadata); + if (!verity_enabled) { + err = verify_verity_signature(key_id, metadata); - if (err) { - DMERR("Signature verification failed"); - handle_error(); - goto free_metadata; - } else - DMINFO("Signature verification success"); + if (err) { + DMERR("Signature verification failed"); + handle_error(); + goto free_metadata; + } else + DMINFO("Signature verification success"); + } table_ptr = metadata->verity_table; @@ -683,6 +738,12 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) /* update target length */ ti->len = data_sectors; + /* Setup linear target and free */ + if (!verity_enabled) { + err = add_as_linear_device(ti, argv[1]); + goto free_metadata; + } + /*substitute data_dev and hash_dev*/ verity_table_args[1] = argv[1]; verity_table_args[2] = argv[1]; @@ -730,6 +791,13 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) err = verity_ctr(ti, no_of_args, verity_table_args); + if (err) + DMERR("android-verity failed to mount as verity target"); + else { + target_added = true; + DMINFO("android-verity mounted as verity target"); + } + free_metadata: kfree(metadata->header); kfree(metadata->verity_table); @@ -737,33 +805,52 @@ free_metadata: return err; } -static struct target_type android_verity_target = { - .name = "android-verity", - .version = {1, 0, 0}, - .module = THIS_MODULE, - .ctr = android_verity_ctr, - .dtr = verity_dtr, - .map = verity_map, - .status = verity_status, - .ioctl = verity_ioctl, - .merge = verity_merge, - .iterate_devices = verity_iterate_devices, - .io_hints = verity_io_hints, -}; - static int __init dm_android_verity_init(void) { int r; + struct dentry *file; r = dm_register_target(&android_verity_target); if (r < 0) DMERR("register failed %d", r); + /* Tracks the status of the last added target */ + debug_dir = debugfs_create_dir("android_verity", NULL); + + if (IS_ERR_OR_NULL(debug_dir)) { + DMERR("Cannot create android_verity debugfs directory: %ld", + PTR_ERR(debug_dir)); + goto end; + } + + file = debugfs_create_bool("target_added", S_IRUGO, debug_dir, + (u32 *)&target_added); + + if (IS_ERR_OR_NULL(file)) { + DMERR("Cannot create android_verity debugfs directory: %ld", + PTR_ERR(debug_dir)); + debugfs_remove_recursive(debug_dir); + goto end; + } + + file = debugfs_create_bool("verity_enabled", S_IRUGO, debug_dir, + (u32 *)&verity_enabled); + + if (IS_ERR_OR_NULL(file)) { + DMERR("Cannot create android_verity debugfs directory: %ld", + PTR_ERR(debug_dir)); + debugfs_remove_recursive(debug_dir); + } + +end: return r; } static void __exit dm_android_verity_exit(void) { + if (!IS_ERR_OR_NULL(debug_dir)) + debugfs_remove_recursive(debug_dir); + dm_unregister_target(&android_verity_target); } 
diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index 11477ffd2243..2cf7de1b7910 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -44,6 +44,8 @@ #define VERITY_DEBUG 0 #define DM_MSG_PREFIX "android-verity" + +#define DM_LINEAR_ARGS 2 /* * There can be two formats. * if fec is present @@ -89,4 +91,5 @@ struct bio_read { int number_of_pages; }; +extern struct target_type linear_target; #endif /* DM_ANDROID_VERITY_H */ diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 05c35aacb3aa..d8bf876e7bed 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -141,7 +141,7 @@ static int linear_iterate_devices(struct dm_target *ti, return fn(ti, lc->dev, lc->start, ti->len, data); } -static struct target_type linear_target = { +struct target_type linear_target = { .name = "linear", .version = {1, 2, 1}, .module = THIS_MODULE, From d7b095701b5ce9199df6f5b7436a8c7825ce282d Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 28 Mar 2016 14:41:21 -0700 Subject: [PATCH 138/813] ANDROID: dm: Minor cleanup Compacts the linear device arguments removing the unnecessary variables. Bug: 27175947 Change-Id: I157170eebe3c0f89a68ae05870a1060f188d0da0 Signed-off-by: Badhri Jagan Sridharan --- drivers/md/dm-android-verity.c | 7 ++----- drivers/md/dm-android-verity.h | 2 ++ 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index db4ddf789a39..f6ddbee5e2d3 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -572,13 +572,10 @@ static inline bool test_mult_overflow(sector_t a, u32 b) static int add_as_linear_device(struct dm_target *ti, char *dev) { /*Move to linear mapping defines*/ - char *linear_table_args[DM_LINEAR_ARGS]; - char offset[] = "0"; + char *linear_table_args[DM_LINEAR_ARGS] = {dev, + DM_LINEAR_TARGET_OFFSET}; int err = 0; - linear_table_args[0] = dev; - linear_table_args[1] = offset; - android_verity_target.dtr = linear_target.dtr, android_verity_target.map = linear_target.map, android_verity_target.status = linear_target.status, diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index 2cf7de1b7910..fe53863c664b 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -46,6 +46,8 @@ #define DM_MSG_PREFIX "android-verity" #define DM_LINEAR_ARGS 2 +#define DM_LINEAR_TARGET_OFFSET "0" + /* * There can be two formats. 
* if fec is present From 5a22b1a31236f6e0c833e2fe7a353e4487fd2e82 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 5 Apr 2016 11:18:16 -0700 Subject: [PATCH 139/813] ANDROID: dm: rename dm-linear methods for dm-android-verity This keeps linear_target as static variable and just exposes the linear target methods for android-verity Cherry-picked: https://android-review.googlesource.com/#/c/212858 Change-Id: I4a377e417b00afd9ecccdb3e605fea31a7df112e Signed-off-by: Badhri Jagan Sridharan (cherry picked from commit a6d1b091f40b25d97849487e29ec097bc5f568dd) --- drivers/md/dm-android-verity.c | 14 +++++++------- drivers/md/dm-android-verity.h | 12 ++++++++++++ drivers/md/dm-linear.c | 26 +++++++++++++------------- 3 files changed, 32 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index f6ddbee5e2d3..b7e059595f75 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -576,15 +576,15 @@ static int add_as_linear_device(struct dm_target *ti, char *dev) DM_LINEAR_TARGET_OFFSET}; int err = 0; - android_verity_target.dtr = linear_target.dtr, - android_verity_target.map = linear_target.map, - android_verity_target.status = linear_target.status, - android_verity_target.ioctl = linear_target.ioctl, - android_verity_target.merge = linear_target.merge, - android_verity_target.iterate_devices = linear_target.iterate_devices, + android_verity_target.dtr = dm_linear_dtr, + android_verity_target.map = dm_linear_map, + android_verity_target.status = dm_linear_status, + android_verity_target.ioctl = dm_linear_ioctl, + android_verity_target.merge = dm_linear_merge, + android_verity_target.iterate_devices = dm_linear_iterate_devices, android_verity_target.io_hints = NULL; - err = linear_target.ctr(ti, DM_LINEAR_ARGS, linear_table_args); + err = dm_linear_ctr(ti, DM_LINEAR_ARGS, linear_table_args); if (!err) { DMINFO("Added android-verity as a linear target"); diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index fe53863c664b..efb796524896 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -94,4 +94,16 @@ struct bio_read { }; extern struct target_type linear_target; + +extern void dm_linear_dtr(struct dm_target *ti); +extern int dm_linear_map(struct dm_target *ti, struct bio *bio); +extern void dm_linear_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen); +extern int dm_linear_ioctl(struct dm_target *ti, unsigned int cmd, + unsigned long arg); +extern int dm_linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size); +extern int dm_linear_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data); +extern int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv); #endif /* DM_ANDROID_VERITY_H */ diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index d8bf876e7bed..74caca2888a6 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -25,7 +25,7 @@ struct linear_c { /* * Construct a linear mapping: */ -static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) +int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct linear_c *lc; unsigned long long tmp; @@ -67,7 +67,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) return ret; } -static void linear_dtr(struct dm_target *ti) +void dm_linear_dtr(struct dm_target *ti) { struct 
linear_c *lc = (struct linear_c *) ti->private; @@ -92,14 +92,14 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) linear_map_sector(ti, bio->bi_iter.bi_sector); } -static int linear_map(struct dm_target *ti, struct bio *bio) +int dm_linear_map(struct dm_target *ti, struct bio *bio) { linear_map_bio(ti, bio); return DM_MAPIO_REMAPPED; } -static void linear_status(struct dm_target *ti, status_type_t type, +void dm_linear_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { struct linear_c *lc = (struct linear_c *) ti->private; @@ -116,7 +116,7 @@ static void linear_status(struct dm_target *ti, status_type_t type, } } -static int linear_prepare_ioctl(struct dm_target *ti, +static int dm_linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) { struct linear_c *lc = (struct linear_c *) ti->private; @@ -133,7 +133,7 @@ static int linear_prepare_ioctl(struct dm_target *ti, return 0; } -static int linear_iterate_devices(struct dm_target *ti, +int dm_linear_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { struct linear_c *lc = ti->private; @@ -141,16 +141,16 @@ static int linear_iterate_devices(struct dm_target *ti, return fn(ti, lc->dev, lc->start, ti->len, data); } -struct target_type linear_target = { +static struct target_type linear_target = { .name = "linear", .version = {1, 2, 1}, .module = THIS_MODULE, - .ctr = linear_ctr, - .dtr = linear_dtr, - .map = linear_map, - .status = linear_status, - .prepare_ioctl = linear_prepare_ioctl, - .iterate_devices = linear_iterate_devices, + .ctr = dm_linear_ctr, + .dtr = dm_linear_dtr, + .map = dm_linear_map, + .status = dm_linear_status, + .prepare_ioctl = dm_linear_prepare_ioctl, + .iterate_devices = dm_linear_iterate_devices, }; int __init dm_linear_init(void) From 967ec731fff86e79d89180d56d89761b7a85860f Mon Sep 17 00:00:00 2001 From: Jeremy Compostella Date: Fri, 15 Apr 2016 13:32:54 +0200 Subject: [PATCH 140/813] ANDROID: dm: use name_to_dev_t This patch makes android_verity_ctr() parse its block device string parameter with name_to_dev_t(). It allows the use of less hardware-specific block device references, such as PARTUUID.
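For example, a dm= boot line could then reference the partition by UUID instead of a fixed device node (the UUID below is made up for illustration; name_to_dev_t() also resolves the other forms it supports, such as /dev/mmcblk0p43 or major:minor):

    dm="system none ro,0 1 android-verity Android:#7e4333f9bba00adfe0ede979e28ed1920492b40f PARTUUID=00112233-4455-6677-8899-aabbccddeeff"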
Change-Id: Idb84453e70cc11abd5ef3a0adfbb16f8b5feaf07 Signed-off-by: Jeremy Compostella --- drivers/md/dm-android-verity.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index b7e059595f75..9c26cbb5f179 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -613,8 +613,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) /* One for specifying number of opt args and one for mode */ sector_t data_sectors; u32 data_block_size; - unsigned int major, minor, - no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS; + unsigned int no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS; struct fec_header uninitialized_var(fec); struct fec_ecc_metadata uninitialized_var(ecc); char buf[FEC_ARG_LENGTH], *buf_ptr; @@ -630,13 +629,11 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) key_id = argv[0]; strreplace(argv[0], '#', ' '); - if (sscanf(argv[1], "%u:%u%c", &major, &minor, &dummy) == 2) { - dev = MKDEV(major, minor); - if (MAJOR(dev) != major || MINOR(dev) != minor) { - DMERR("Incorrect bdev major minor number"); - handle_error(); - return -EOVERFLOW; - } + dev = name_to_dev_t(argv[1]); + if (!dev) { + DMERR("no dev found for %s", argv[1]); + handle_error(); + return -EINVAL; } DMINFO("key:%s dev:%s", argv[0], argv[1]); From 091d8e6c051e76171c22d1b98c9aabf05b4f74d6 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 20 May 2016 16:44:19 -0700 Subject: [PATCH 141/813] ANDROID: dm: fix signature verification flag The bug was that the signature verification was only happening when verity was disabled. It should always happen when verity is enabled. Signed-off-by: Badhri Jagan Sridharan Change-Id: I2d9354e240d36ea06fc68c2d18d8e87b823a4c2f (cherry picked from commit 5364b5ca0b1a12a58283b51408e43fc36d4e4fe7) --- drivers/md/dm-android-verity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 9c26cbb5f179..00275a986d03 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -667,7 +667,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto free_metadata; } - if (!verity_enabled) { + if (verity_enabled) { err = verify_verity_signature(key_id, metadata); if (err) { From e14b861c590bff3b3027c75b0b8f9278463f8736 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 20 May 2016 16:45:45 -0700 Subject: [PATCH 142/813] ANDROID: dm: use default verity public key If the dm-android-verity target does not provide a key, try using the default public key from the system keyring. The default verity keyid is passed as a kernel command line argument veritykeyid=. The order of the dm-android-verity params has been reversed to facilitate the change. Old format example: dm="system none ro,0 1 android-verity Android:#7e4333f9bba00adfe0ede979e28ed1920492b40f /dev/mmcblk0p43" New formats supported: dm="system none ro,0 1 android-verity /dev/mmcblk0p43 Android:#7e4333f9bba00adfe0ede979e28ed1920492b40f" (or) dm="system none ro,0 1 android-verity /dev/mmcblk0p43" when veritykeyid= is set in the kernel command line.
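For instance, the third form assumes a boot line that carries the key id separately; the id: syntax for addressing an asymmetric key in the system keyring by its hex id is shown here as an assumption about the keyring description format:

    veritykeyid=id:7e4333f9bba00adfe0ede979e28ed1920492b40f dm="system none ro,0 1 android-verity /dev/mmcblk0p43"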
BUG: 28384658 Signed-off-by: Badhri Jagan Sridharan Change-Id: I506c89b053d835ab579e703eef2bc1f8487250de (cherry picked from commit c5c74d0327729f35b576564976885596c6d0e7fb) --- drivers/md/dm-android-verity.c | 67 ++++++++++++++++++++++++---------- drivers/md/dm-android-verity.h | 16 ++++++++ 2 files changed, 63 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 00275a986d03..097fb2b1de89 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -43,6 +43,7 @@ static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH]; static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH]; +static char veritykeyid[VERITY_DEFAULT_KEY_ID_LENGTH]; static bool target_added; static bool verity_enabled = true; @@ -79,6 +80,19 @@ static int __init verity_mode_param(char *line) __setup("androidboot.veritymode=", verity_mode_param); +static int __init verity_keyid_param(char *line) +{ + strlcpy(veritykeyid, line, sizeof(veritykeyid)); + return 1; +} + +__setup("veritykeyid=", verity_keyid_param); + +static inline bool default_verity_key_id(void) +{ + return veritykeyid[0] != '\0'; +} + static int table_extract_mpi_array(struct public_key_signature *pks, const void *data, size_t len) { @@ -608,7 +622,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) dev_t uninitialized_var(dev); struct android_metadata *uninitialized_var(metadata); int err = 0, i, mode; - char *key_id, *table_ptr, dummy, + char *key_id, *table_ptr, dummy, *target_device, *verity_table_args[VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS]; /* One for specifying number of opt args and one for mode */ sector_t data_sectors; @@ -619,24 +633,34 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) char buf[FEC_ARG_LENGTH], *buf_ptr; unsigned long long tmpll; - if (argc != 2) { + if (argc == 1) { + /* Use the default keyid */ + if (default_verity_key_id()) + key_id = veritykeyid; + else { + DMERR("veritykeyid= is not set"); + handle_error(); + return -EINVAL; + } + } else if (argc == 2) + key_id = argv[1]; + else { DMERR("Incorrect number of arguments"); handle_error(); return -EINVAL; } - /* should come as one of the arguments for the verity target */ - key_id = argv[0]; - strreplace(argv[0], '#', ' '); + strreplace(key_id, '#', ' '); + target_device = argv[0]; - dev = name_to_dev_t(argv[1]); + dev = name_to_dev_t(target_device); if (!dev) { - DMERR("no dev found for %s", argv[1]); + DMERR("no dev found for %s", target_device); handle_error(); return -EINVAL; } - DMINFO("key:%s dev:%s", argv[0], argv[1]); + DMINFO("key:%s dev:%s", key_id, target_device); if (extract_fec_header(dev, &fec, &ecc)) { DMERR("Error while extracting fec header"); @@ -734,30 +758,33 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) /* Setup linear target and free */ if (!verity_enabled) { - err = add_as_linear_device(ti, argv[1]); + err = add_as_linear_device(ti, target_device); goto free_metadata; } /*substitute data_dev and hash_dev*/ - verity_table_args[1] = argv[1]; - verity_table_args[2] = argv[1]; + verity_table_args[1] = target_device; + verity_table_args[2] = target_device; mode = verity_mode(); if (ecc.valid && IS_BUILTIN(CONFIG_DM_VERITY_FEC)) { if (mode) { err = snprintf(buf, FEC_ARG_LENGTH, - "%u %s " VERITY_TABLE_OPT_FEC_FORMAT, - 1 + VERITY_TABLE_OPT_FEC_ARGS, - mode == DM_VERITY_MODE_RESTART ? 
- VERITY_TABLE_OPT_RESTART : VERITY_TABLE_OPT_LOGGING, - argv[1], ecc.start / FEC_BLOCK_SIZE, ecc.blocks, - ecc.roots); + "%u %s " VERITY_TABLE_OPT_FEC_FORMAT, + 1 + VERITY_TABLE_OPT_FEC_ARGS, + mode == DM_VERITY_MODE_RESTART ? + VERITY_TABLE_OPT_RESTART : + VERITY_TABLE_OPT_LOGGING, + target_device, + ecc.start / FEC_BLOCK_SIZE, ecc.blocks, + ecc.roots); } else { err = snprintf(buf, FEC_ARG_LENGTH, - "%u " VERITY_TABLE_OPT_FEC_FORMAT, - VERITY_TABLE_OPT_FEC_ARGS, argv[1], - ecc.start / FEC_BLOCK_SIZE, ecc.blocks, ecc.roots); + "%u " VERITY_TABLE_OPT_FEC_FORMAT, + VERITY_TABLE_OPT_FEC_ARGS, target_device, + ecc.start / FEC_BLOCK_SIZE, ecc.blocks, + ecc.roots); } } else if (mode) { err = snprintf(buf, FEC_ARG_LENGTH, diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index efb796524896..43655ee0f813 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -27,6 +27,22 @@ #define VERITY_TABLE_ARGS 10 #define VERITY_COMMANDLINE_PARAM_LENGTH 20 +/* + * : is the format for the identifier. + * subject can either be the Common Name(CN) + Organization Name(O) or + * just the CN if the it is prefixed with O + * From https://tools.ietf.org/html/rfc5280#appendix-A + * ub-organization-name-length INTEGER ::= 64 + * ub-common-name-length INTEGER ::= 64 + * + * http://lxr.free-electrons.com/source/crypto/asymmetric_keys/x509_cert_parser.c?v=3.9#L278 + * ctx->o_size + 2 + ctx->cn_size + 1 + * + 41 characters for ":" and sha1 id + * 64 + 2 + 64 + 1 + 1 + 40 (172) + * setting VERITY_DEFAULT_KEY_ID_LENGTH to 200 characters. + */ +#define VERITY_DEFAULT_KEY_ID_LENGTH 200 + #define FEC_MAGIC 0xFECFECFE #define FEC_BLOCK_SIZE (4 * 1024) #define FEC_VERSION 0 From d5dc479e87d584f924f44a45082a221f4a21bfa2 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 17 Jun 2016 18:54:35 -0700 Subject: [PATCH 143/813] ANDROID: dm: mount as linear target if eng build eng builds don't have verity enabled, i.e., they do not even have verity metadata appended to the partition. Therefore, add rootdev as a linear device and map the entire partition if the build variant is "eng".
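The variant string reaches the driver through a new buildvariant= kernel command line parameter (see the __setup hook in the diff below); how it gets appended to the command line is device-specific, e.g. something like the following BoardConfig.mk line, shown here as an assumption:

    BOARD_KERNEL_CMDLINE += buildvariant=eng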
(Cherry-picked based on https://partner-android-review.git.corp.google.com/#/c/618690/) BUG: 29276559 Signed-off-by: Badhri Jagan Sridharan Change-Id: I8f5c2289b842b820ca04f5773525e5449bb3f355 --- drivers/md/dm-android-verity.c | 62 +++++++++++++++++++++++++++++++--- drivers/md/dm-android-verity.h | 1 + 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 097fb2b1de89..e1a8e284e7e4 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -44,6 +44,7 @@ static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH]; static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH]; static char veritykeyid[VERITY_DEFAULT_KEY_ID_LENGTH]; +static char buildvariant[BUILD_VARIANT]; static bool target_added; static bool verity_enabled = true; @@ -88,11 +89,26 @@ static int __init verity_keyid_param(char *line) __setup("veritykeyid=", verity_keyid_param); +static int __init verity_buildvariant(char *line) +{ + strlcpy(buildvariant, line, sizeof(buildvariant)); + return 1; +} + +__setup("buildvariant=", verity_buildvariant); + static inline bool default_verity_key_id(void) { return veritykeyid[0] != '\0'; } +static inline bool is_eng(void) +{ + static const char typeeng[] = "eng"; + + return !strncmp(buildvariant, typeeng, sizeof(typeeng)); +} + static int table_extract_mpi_array(struct public_key_signature *pks, const void *data, size_t len) { @@ -262,7 +278,7 @@ static int extract_fec_header(dev_t dev, struct fec_header *fec, bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); - if (IS_ERR(bdev)) { + if (IS_ERR_OR_NULL(bdev)) { DMERR("bdev get error"); return PTR_ERR(bdev); } @@ -323,6 +339,24 @@ static void find_metadata_offset(struct fec_header *fec, *metadata_offset = device_size - VERITY_METADATA_SIZE; } +static int find_size(dev_t dev, u64 *device_size) +{ + struct block_device *bdev; + + bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + if (IS_ERR_OR_NULL(bdev)) { + DMERR("blkdev_get_by_dev failed"); + return PTR_ERR(bdev); + } + + *device_size = i_size_read(bdev->bd_inode); + *device_size >>= SECTOR_SHIFT; + + DMINFO("blkdev size in sectors: %llu", *device_size); + blkdev_put(bdev, FMODE_READ); + return 0; +} + static struct android_metadata *extract_metadata(dev_t dev, struct fec_header *fec) { @@ -337,7 +371,7 @@ static struct android_metadata *extract_metadata(dev_t dev, bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); - if (IS_ERR(bdev)) { + if (IS_ERR_OR_NULL(bdev)) { DMERR("blkdev_get_by_dev failed"); return ERR_CAST(bdev); } @@ -632,12 +666,13 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) struct fec_ecc_metadata uninitialized_var(ecc); char buf[FEC_ARG_LENGTH], *buf_ptr; unsigned long long tmpll; + u64 device_size; if (argc == 1) { /* Use the default keyid */ if (default_verity_key_id()) key_id = veritykeyid; - else { + else if (!is_eng()) { DMERR("veritykeyid= is not set"); handle_error(); return -EINVAL; @@ -650,7 +685,6 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) return -EINVAL; } - strreplace(key_id, '#', ' '); target_device = argv[0]; dev = name_to_dev_t(target_device); @@ -660,6 +694,26 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) return -EINVAL; } + if (is_eng()) { + err = find_size(dev, &device_size); + if (err) { + DMERR("error finding bdev size"); + handle_error(); + return err; + } + + ti->len = device_size; + err = add_as_linear_device(ti, target_device); + if (err) 
{ + handle_error(); + return err; + } + verity_enabled = false; + return 0; + } + + strreplace(key_id, '#', ' '); + DMINFO("key:%s dev:%s", key_id, target_device); if (extract_fec_header(dev, &fec, &ecc)) { diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index 43655ee0f813..782e1c815c67 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -26,6 +26,7 @@ #define VERITY_METADATA_SIZE (8 * DATA_BLOCK_SIZE) #define VERITY_TABLE_ARGS 10 #define VERITY_COMMANDLINE_PARAM_LENGTH 20 +#define BUILD_VARIANT 20 /* * : is the format for the identifier. From 255fb5b67821db231dcc6e23da14fce3bb7ce2bb Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 27 Jun 2016 16:25:55 -0700 Subject: [PATCH 144/813] ANDROID: dm: allow adb disable-verity only in userdebug adb disable-verity was allowed when the phone is in the unlocked state. Since the driver is now aware of the build variant, honor "adb disable-verity" only in userdebug builds. (Cherry-picked from https://partner-android-review.git.corp.google.com/#/c/622117) BUG: 29276559 Signed-off-by: Badhri Jagan Sridharan Change-Id: I7ce9f38d8c7a62361392c5a8ccebb288f8a3a2ea --- drivers/md/dm-android-verity.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index e1a8e284e7e4..999e75bf2ba0 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -109,6 +109,14 @@ static inline bool is_eng(void) return !strncmp(buildvariant, typeeng, sizeof(typeeng)); } +static inline bool is_userdebug(void) +{ + static const char typeuserdebug[] = "userdebug"; + + return !strncmp(buildvariant, typeuserdebug, sizeof(typeuserdebug)); +} + + static int table_extract_mpi_array(struct public_key_signature *pks, const void *data, size_t len) { @@ -499,19 +507,6 @@ const char *find_dt_value(const char *name) return value; } -static bool is_unlocked(void) -{ - static const char unlocked[] = "orange"; - static const char verified_boot_prop[] = "verifiedbootstate"; - const char *value; - - value = find_dt_value(verified_boot_prop); - if (!value) - value = verifiedbootstate; - - return !strncmp(value, unlocked, sizeof(unlocked) - 1); -} - static int verity_mode(void) { static const char enforcing[] = "enforcing"; @@ -531,7 +526,7 @@ static int verify_header(struct android_metadata_header *header) { int retval = -EINVAL; - if (is_unlocked() && le32_to_cpu(header->magic_number) == + if (is_userdebug() && le32_to_cpu(header->magic_number) == VERITY_METADATA_MAGIC_DISABLE) { retval = VERITY_STATE_DISABLE; return retval; From 5e780ef26f1e7d56d33bb61cd32cfb6013c9f31e Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Wed, 6 Jul 2016 17:16:19 -0700 Subject: [PATCH 145/813] ANDROID: dm: android-verity: Verify header before fetching table Move header validation logic before reading the verity_table as an invalid header implies the table is invalid as well. 
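Condensed, the reordered extract_metadata() flow in the diff below becomes (sketch only, with error handling elided):

    memcpy(header, page_address(payload.page_io[0]), sizeof(*header));
    err = verify_header(header);        /* validate magic/version first */
    if (err == VERITY_STATE_DISABLE) {
        *verity_enabled = false;        /* still read the table for sizing */
        err = 0;
    } else if (err)
        goto free_header;               /* invalid header: never touch the table */
    /* only now is the verity table allocated and copied out */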
(Cherry-picked from: https://partner-android-review.git.corp.google.com/#/c/625203) BUG: 29940612 Signed-off-by: Badhri Jagan Sridharan Change-Id: Ib34d25c0854202f3e70df0a6d0ef1d96f0250c8e --- drivers/md/dm-android-verity.c | 140 +++++++++++++++++---------------- 1 file changed, 71 insertions(+), 69 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 999e75bf2ba0..1f4eb099209d 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -365,12 +365,38 @@ static int find_size(dev_t dev, u64 *device_size) return 0; } -static struct android_metadata *extract_metadata(dev_t dev, - struct fec_header *fec) +static int verify_header(struct android_metadata_header *header) +{ + int retval = -EINVAL; + + if (is_userdebug() && le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_DISABLE) + return VERITY_STATE_DISABLE; + + if (!(le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_NUMBER) || + (le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_DISABLE)) { + DMERR("Incorrect magic number"); + return retval; + } + + if (le32_to_cpu(header->protocol_version) != + VERITY_METADATA_VERSION) { + DMERR("Unsupported version %u", + le32_to_cpu(header->protocol_version)); + return retval; + } + + return 0; +} + +static int extract_metadata(dev_t dev, struct fec_header *fec, + struct android_metadata **metadata, + bool *verity_enabled) { struct block_device *bdev; struct android_metadata_header *header; - struct android_metadata *uninitialized_var(metadata); int i; u32 table_length, copy_length, offset; u64 metadata_offset; @@ -381,7 +407,7 @@ static struct android_metadata *extract_metadata(dev_t dev, if (IS_ERR_OR_NULL(bdev)) { DMERR("blkdev_get_by_dev failed"); - return ERR_CAST(bdev); + return -ENODEV; } find_metadata_offset(fec, bdev, &metadata_offset); @@ -399,7 +425,6 @@ static struct android_metadata *extract_metadata(dev_t dev, (1 << SECTOR_SHIFT), VERITY_METADATA_SIZE); if (err) { DMERR("Error while reading verity metadata"); - metadata = ERR_PTR(err); goto blkdev_release; } @@ -418,24 +443,42 @@ static struct android_metadata *extract_metadata(dev_t dev, le32_to_cpu(header->protocol_version), le32_to_cpu(header->table_length)); - metadata = kzalloc(sizeof(*metadata), GFP_KERNEL); - if (!metadata) { + err = verify_header(header); + + if (err == VERITY_STATE_DISABLE) { + DMERR("Mounting root with verity disabled"); + *verity_enabled = false; + /* we would still have to read the metadata to figure out + * the data blocks size. Or may be could map the entire + * partition similar to mounting the device. + * + * Reset error as well as the verity_enabled flag is changed. 
+ */ + err = 0; + } else if (err) + goto free_header; + + *metadata = kzalloc(sizeof(**metadata), GFP_KERNEL); + if (!*metadata) { DMERR("kzalloc for metadata failed"); err = -ENOMEM; goto free_header; } - metadata->header = header; + (*metadata)->header = header; table_length = le32_to_cpu(header->table_length); if (table_length == 0 || table_length > (VERITY_METADATA_SIZE - - sizeof(struct android_metadata_header))) + sizeof(struct android_metadata_header))) { + DMERR("table_length too long"); + err = -EINVAL; goto free_metadata; + } - metadata->verity_table = kzalloc(table_length + 1, GFP_KERNEL); + (*metadata)->verity_table = kzalloc(table_length + 1, GFP_KERNEL); - if (!metadata->verity_table) { + if (!(*metadata)->verity_table) { DMERR("kzalloc verity_table failed"); err = -ENOMEM; goto free_metadata; @@ -443,13 +486,15 @@ static struct android_metadata *extract_metadata(dev_t dev, if (sizeof(struct android_metadata_header) + table_length <= PAGE_SIZE) { - memcpy(metadata->verity_table, page_address(payload.page_io[0]) + memcpy((*metadata)->verity_table, + page_address(payload.page_io[0]) + sizeof(struct android_metadata_header), table_length); } else { copy_length = PAGE_SIZE - sizeof(struct android_metadata_header); - memcpy(metadata->verity_table, page_address(payload.page_io[0]) + memcpy((*metadata)->verity_table, + page_address(payload.page_io[0]) + sizeof(struct android_metadata_header), copy_length); table_length -= copy_length; @@ -457,13 +502,13 @@ static struct android_metadata *extract_metadata(dev_t dev, i = 1; while (table_length != 0) { if (table_length > PAGE_SIZE) { - memcpy(metadata->verity_table + offset, + memcpy((*metadata)->verity_table + offset, page_address(payload.page_io[i]), PAGE_SIZE); offset += PAGE_SIZE; table_length -= PAGE_SIZE; } else { - memcpy(metadata->verity_table + offset, + memcpy((*metadata)->verity_table + offset, page_address(payload.page_io[i]), table_length); table_length = 0; @@ -471,25 +516,23 @@ static struct android_metadata *extract_metadata(dev_t dev, i++; } } - metadata->verity_table[table_length] = '\0'; + (*metadata)->verity_table[table_length] = '\0'; + DMINFO("verity_table: %s", (*metadata)->verity_table); goto free_payload; free_metadata: - kfree(metadata); + kfree(*metadata); free_header: kfree(header); - metadata = ERR_PTR(err); free_payload: for (i = 0; i < payload.number_of_pages; i++) if (payload.page_io[i]) __free_page(payload.page_io[i]); kfree(payload.page_io); - - DMINFO("verity_table: %s", metadata->verity_table); blkdev_release: blkdev_put(bdev, FMODE_READ); - return metadata; + return err; } /* helper functions to extract properties from dts */ @@ -522,34 +565,6 @@ static int verity_mode(void) return DM_VERITY_MODE_EIO; } -static int verify_header(struct android_metadata_header *header) -{ - int retval = -EINVAL; - - if (is_userdebug() && le32_to_cpu(header->magic_number) == - VERITY_METADATA_MAGIC_DISABLE) { - retval = VERITY_STATE_DISABLE; - return retval; - } - - if (!(le32_to_cpu(header->magic_number) == - VERITY_METADATA_MAGIC_NUMBER) || - (le32_to_cpu(header->magic_number) == - VERITY_METADATA_MAGIC_DISABLE)) { - DMERR("Incorrect magic number"); - return retval; - } - - if (le32_to_cpu(header->protocol_version) != - VERITY_METADATA_VERSION) { - DMERR("Unsupported version %u", - le32_to_cpu(header->protocol_version)); - return retval; - } - - return 0; -} - static int verify_verity_signature(char *key_id, struct android_metadata *metadata) { @@ -649,7 +664,7 @@ static int add_as_linear_device(struct dm_target 
*ti, char *dev) static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) { dev_t uninitialized_var(dev); - struct android_metadata *uninitialized_var(metadata); + struct android_metadata *metadata = NULL; int err = 0, i, mode; char *key_id, *table_ptr, dummy, *target_device, *verity_table_args[VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS]; @@ -717,26 +732,11 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) return -EINVAL; } - metadata = extract_metadata(dev, &fec); + err = extract_metadata(dev, &fec, &metadata, &verity_enabled); - if (IS_ERR(metadata)) { + if (err) { DMERR("Error while extracting metadata"); handle_error(); - return -EINVAL; - } - - err = verify_header(metadata->header); - - if (err == VERITY_STATE_DISABLE) { - DMERR("Mounting root with verity disabled"); - verity_enabled = false; - /* we would still have to parse the args to figure out - * the data blocks size. Or may be could map the entire - * partition similar to mounting the device. - */ - } else if (err) { - DMERR("Verity header handle error"); - handle_error(); goto free_metadata; } @@ -869,8 +869,10 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) } free_metadata: - kfree(metadata->header); - kfree(metadata->verity_table); + if (metadata) { + kfree(metadata->header); + kfree(metadata->verity_table); + } kfree(metadata); return err; } From 8c07f4332955923c5ad64dbfa36a30c781c2fd05 Mon Sep 17 00:00:00 2001 From: Jeremy Compostella Date: Tue, 10 May 2016 13:10:20 +0200 Subject: [PATCH 146/813] ANDROID: dm verity fec: pack the fec_header structure The fec_header structure is generated at build time and stored on disk. The fec_header might be built on a 64-bit machine while it is read by a 32-bit device or the other way around. In such situations, the fec_header fields are not aligned as expected by the device and it fails to read the fec_header structure. This patch makes the fec_header packed. Change-Id: Idb84453e70cc11abd5ef3a0adfbb16f8b5feaf06 Signed-off-by: Jeremy Compostella --- drivers/md/dm-android-verity.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index 782e1c815c67..f43b02fbb475 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -72,9 +72,6 @@ * if fec is not present * */ -/* TODO: rearrange structure to reduce memory holes * depends on userspace change. */ struct fec_header { __le32 magic; __le32 version; @@ -83,7 +80,7 @@ struct fec_header { __le32 fec_size; __le64 inp_size; u8 hash[SHA256_DIGEST_SIZE]; -}; +} __attribute__((packed)); struct android_metadata_header { __le32 magic_number; From deb79fc7b5430caa64af090ef6f1e9c9c7ea5894 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 1 Aug 2016 16:49:07 -0700 Subject: [PATCH 147/813] FROMLIST: proc: Fix timerslack_ns CAP_SYS_NICE check when adjusting self In changing from checking ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS) to capable(CAP_SYS_NICE), I missed that ptrace_may_access succeeds when p == current, but the CAP_SYS_NICE check doesn't. Thus while the previous commit was intended to loosen the needed privileges to modify a process's timerslack, it needlessly restricted a task modifying its own timerslack via the proc//timerslack_ns (which is permitted also via the PR_SET_TIMERSLACK method). This patch corrects this by checking if p == current before checking the CAP_SYS_NICE value. Cc: Kees Cook Cc: "Serge E.
Hallyn" Cc: Andrew Morton Cc: Thomas Gleixner CC: Arjan van de Ven Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Todd Kjos Cc: Colin Cross Cc: Nick Kralevich Cc: Dmitry Shmidt Cc: Elliott Hughes Cc: Android Kernel Team Mailing-list-url: http://www.spinics.net/lists/kernel/msg2317488.html Change-Id: Ia3e8aff07c2d41f55b6617502d33c39b7d781aac Signed-off-by: John Stultz --- fs/proc/base.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 308d4a321874..df715a095328 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2261,15 +2261,17 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, if (!p) return -ESRCH; - if (!capable(CAP_SYS_NICE)) { - count = -EPERM; - goto out; - } + if (p != current) { + if (!capable(CAP_SYS_NICE)) { + count = -EPERM; + goto out; + } - err = security_task_setscheduler(p); - if (err) { - count = err; - goto out; + err = security_task_setscheduler(p); + if (err) { + count = err; + goto out; + } } task_lock(p); @@ -2295,14 +2297,16 @@ static int timerslack_ns_show(struct seq_file *m, void *v) if (!p) return -ESRCH; - if (!capable(CAP_SYS_NICE)) { - err = -EPERM; - goto out; - } + if (p != current) { - err = security_task_getscheduler(p); - if (err) - goto out; + if (!capable(CAP_SYS_NICE)) { + err = -EPERM; + goto out; + } + err = security_task_getscheduler(p); + if (err) + goto out; + } task_lock(p); seq_printf(m, "%llu\n", p->timer_slack_ns); From 439bce3122de3963d6911c8514fd49b0fadfe989 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Fri, 12 Aug 2016 11:24:50 +0530 Subject: [PATCH 148/813] ANDROID: net: fib: remove duplicate assignment Remove duplicate FRA_GOTO assignment. Fixes: fd2cf795f3ab ("net: core: Support UID-based routing.") Change-Id: I462c24b16fdef42ae2332571a0b95de3ef9d2e25 Signed-off-by: Amit Pundir --- include/net/fib_rules.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 55b5419cb6a7..bdd985f41022 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -89,7 +89,6 @@ struct fib_rules_ops { [FRA_FWMARK] = { .type = NLA_U32 }, \ [FRA_FWMASK] = { .type = NLA_U32 }, \ [FRA_TABLE] = { .type = NLA_U32 }, \ - [FRA_GOTO] = { .type = NLA_U32 }, \ [FRA_UID_START] = { .type = NLA_U32 }, \ [FRA_UID_END] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \ From 6acba5b6a1507f6282cbe985da7c0c3276d247d7 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 11 Aug 2016 19:13:22 +0530 Subject: [PATCH 149/813] ANDROID: net: core: fix UID-based routing Fix RTA_UID enum to match it with the Android userspace code which assumes RTA_UID=18. With this patch all Android kernel networking unit tests mentioned here https://source.android.com/devices/tech/config/kernel_network_tests.html are success. Without this patch multinetwork_test.py unit test fails. 
Change-Id: I3ff36670f7d4e5bf5f01dce584ae9d53deabb3ed Signed-off-by: Amit Pundir --- include/uapi/linux/rtnetlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 355eea225dd9..3eb02a1d6d8c 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -306,12 +306,12 @@ enum rtattr_type_t { RTA_TABLE, RTA_MARK, RTA_MFC_STATS, + RTA_UID, RTA_VIA, RTA_NEWDST, RTA_PREF, RTA_ENCAP_TYPE, RTA_ENCAP, - RTA_UID, __RTA_MAX }; From b4d9d427c64f803e0797f3ea15f9b3acf27e0201 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 4 May 2016 14:04:13 -0400 Subject: [PATCH 150/813] UPSTREAM: ecryptfs: fix handling of directory opening (cherry picked from commit 6a480a7842545ec520a91730209ec0bae41694c1) First of all, trying to open them r/w is idiocy; it's guaranteed to fail. Moreover, assigning ->f_pos and assuming that everything will work is blatantly broken - try that with e.g. tmpfs as underlying layer and watch the fireworks. There may be a non-trivial amount of state associated with current IO position, well beyond the numeric offset. Using the single struct file associated with underlying inode is really not a good idea; we ought to open one for each ecryptfs directory struct file. Additionally, file_operations both for directories and non-directories are full of pointless methods; non-directories should *not* have ->iterate(), directories should not have ->flush(), ->fasync() and ->splice_read(). Signed-off-by: Al Viro Change-Id: I4813ce803f270fdd364758ce1dc108b76eab226e Signed-off-by: Amit Pundir --- fs/ecryptfs/file.c | 71 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 16 deletions(-) diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 11309683d65f..27794b137b24 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -112,7 +112,6 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx) .sb = inode->i_sb, }; lower_file = ecryptfs_file_to_lower(file); - lower_file->f_pos = ctx->pos; rc = iterate_dir(lower_file, &buf.ctx); ctx->pos = buf.ctx.pos; if (rc < 0) @@ -236,14 +235,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); - if (d_is_dir(ecryptfs_dentry)) { - ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); - mutex_lock(&crypt_stat->cs_mutex); - crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); - mutex_unlock(&crypt_stat->cs_mutex); - rc = 0; - goto out; - } rc = read_or_initialize_metadata(ecryptfs_dentry); if (rc) goto out_put; @@ -260,6 +251,45 @@ out: return rc; } +/** + * ecryptfs_dir_open + * @inode: inode speciying file to open + * @file: Structure to return filled in + * + * Opens the file specified by inode. 
+ * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_dir_open(struct inode *inode, struct file *file) +{ + struct dentry *ecryptfs_dentry = file->f_path.dentry; + /* Private value of ecryptfs_dentry allocated in + * ecryptfs_lookup() */ + struct ecryptfs_file_info *file_info; + struct file *lower_file; + + /* Released in ecryptfs_release or end of function if failure */ + file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); + ecryptfs_set_file_private(file, file_info); + if (unlikely(!file_info)) { + ecryptfs_printk(KERN_ERR, + "Error attempting to allocate memory\n"); + return -ENOMEM; + } + lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry), + file->f_flags, current_cred()); + if (IS_ERR(lower_file)) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%pd]; rc = [%ld]\n", __func__, + ecryptfs_dentry, PTR_ERR(lower_file)); + kmem_cache_free(ecryptfs_file_info_cache, file_info); + return PTR_ERR(lower_file); + } + ecryptfs_set_file_lower(file, lower_file); + return 0; +} + static int ecryptfs_flush(struct file *file, fl_owner_t td) { struct file *lower_file = ecryptfs_file_to_lower(file); @@ -280,6 +310,19 @@ static int ecryptfs_release(struct inode *inode, struct file *file) return 0; } +static int ecryptfs_dir_release(struct inode *inode, struct file *file) +{ + fput(ecryptfs_file_to_lower(file)); + kmem_cache_free(ecryptfs_file_info_cache, + ecryptfs_file_to_private(file)); + return 0; +} + +static loff_t ecryptfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return vfs_llseek(ecryptfs_file_to_lower(file), offset, whence); +} + static int ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { @@ -359,20 +402,16 @@ const struct file_operations ecryptfs_dir_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .open = ecryptfs_open, - .flush = ecryptfs_flush, - .release = ecryptfs_release, + .open = ecryptfs_dir_open, + .release = ecryptfs_dir_release, .fsync = ecryptfs_fsync, - .fasync = ecryptfs_fasync, - .splice_read = generic_file_splice_read, - .llseek = default_llseek, + .llseek = ecryptfs_dir_llseek, }; const struct file_operations ecryptfs_main_fops = { .llseek = generic_file_llseek, .read_iter = ecryptfs_read_update_atime, .write_iter = generic_file_write_iter, - .iterate = ecryptfs_readdir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, From 8a1749a461a972b09674cb72b79a942839bc4ba4 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 9 Aug 2016 12:47:37 -0700 Subject: [PATCH 151/813] ANDROID: dm-verity: adopt changes made to dm callbacks v4.4 introduced changes to the callbacks used for dm-linear and dm-verity-target targets. Move to those headers in dm-android-verity. 
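For reference, a rough stand-in sketch of the callback shape this move implies (hypothetical model types; the real declarations live in include/linux/device-mapper.h and in the externs in the diff below):

    /* Stand-in declarations; incomplete types are fine since the
     * callback only passes pointers around. */
    struct dm_target;
    struct block_device;
    typedef unsigned int fmode_model_t;

    struct target_type_model {
            /* pre-v4.4 callbacks this patch drops:
             *   int (*ioctl)(struct dm_target *ti, unsigned cmd, unsigned long arg);
             *   int (*merge)(struct dm_target *ti, ...);
             */

            /* v4.4 replacement: the target only resolves the block device
             * and mode, and the device-mapper core forwards the ioctl itself. */
            int (*prepare_ioctl)(struct dm_target *ti,
                                 struct block_device **bdev,
                                 fmode_model_t *mode);
    };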
Verified on hikey while having BOARD_USES_RECOVERY_AS_BOOT := true BOARD_BUILD_SYSTEM_ROOT_IMAGE := true BUG: 27339727 Signed-off-by: Badhri Jagan Sridharan Change-Id: Ic64950c3b55f0a6eaa570bcedc2ace83bbf3005e --- drivers/md/dm-android-verity.c | 12 +++++------- drivers/md/dm-android-verity.h | 6 ++---- drivers/md/dm-linear.c | 2 +- drivers/md/dm-verity-target.c | 2 +- drivers/md/dm-verity.h | 6 ++---- 5 files changed, 11 insertions(+), 17 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 1f4eb099209d..15ce2a81c1f4 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -59,8 +59,7 @@ static struct target_type android_verity_target = { .dtr = verity_dtr, .map = verity_map, .status = verity_status, - .ioctl = verity_ioctl, - .merge = verity_merge, + .prepare_ioctl = verity_prepare_ioctl, .iterate_devices = verity_iterate_devices, .io_hints = verity_io_hints, }; @@ -637,8 +636,7 @@ static int add_as_linear_device(struct dm_target *ti, char *dev) android_verity_target.dtr = dm_linear_dtr, android_verity_target.map = dm_linear_map, android_verity_target.status = dm_linear_status, - android_verity_target.ioctl = dm_linear_ioctl, - android_verity_target.merge = dm_linear_merge, + android_verity_target.prepare_ioctl = dm_linear_prepare_ioctl, android_verity_target.iterate_devices = dm_linear_iterate_devices, android_verity_target.io_hints = NULL; @@ -676,7 +674,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) struct fec_ecc_metadata uninitialized_var(ecc); char buf[FEC_ARG_LENGTH], *buf_ptr; unsigned long long tmpll; - u64 device_size; + u64 uninitialized_var(device_size); if (argc == 1) { /* Use the default keyid */ @@ -896,7 +894,7 @@ static int __init dm_android_verity_init(void) } file = debugfs_create_bool("target_added", S_IRUGO, debug_dir, - (u32 *)&target_added); + &target_added); if (IS_ERR_OR_NULL(file)) { DMERR("Cannot create android_verity debugfs directory: %ld", @@ -906,7 +904,7 @@ static int __init dm_android_verity_init(void) } file = debugfs_create_bool("verity_enabled", S_IRUGO, debug_dir, - (u32 *)&verity_enabled); + &verity_enabled); if (IS_ERR_OR_NULL(file)) { DMERR("Cannot create android_verity debugfs directory: %ld", diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index f43b02fbb475..0c7ff6afec69 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -113,10 +113,8 @@ extern void dm_linear_dtr(struct dm_target *ti); extern int dm_linear_map(struct dm_target *ti, struct bio *bio); extern void dm_linear_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen); -extern int dm_linear_ioctl(struct dm_target *ti, unsigned int cmd, - unsigned long arg); -extern int dm_linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size); +extern int dm_linear_prepare_ioctl(struct dm_target *ti, + struct block_device **bdev, fmode_t *mode); extern int dm_linear_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data); extern int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv); diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 74caca2888a6..8505a771de42 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -116,7 +116,7 @@ void dm_linear_status(struct dm_target *ti, status_type_t type, } } -static int dm_linear_prepare_ioctl(struct dm_target *ti, +int 
dm_linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) { struct linear_c *lc = (struct linear_c *) ti->private; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 65835f15a116..5214ed2c7507 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -656,7 +656,7 @@ void verity_status(struct dm_target *ti, status_type_t type, } } -static int verity_prepare_ioctl(struct dm_target *ti, +int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) { struct dm_verity *v = ti->private; diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index d9cf5e4939eb..75effca400a3 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -128,10 +128,8 @@ extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, extern void verity_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen); -extern int verity_ioctl(struct dm_target *ti, unsigned cmd, - unsigned long arg); -extern int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size); +extern int verity_prepare_ioctl(struct dm_target *ti, + struct block_device **bdev, fmode_t *mode); extern int verity_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data); extern void verity_io_hints(struct dm_target *ti, struct queue_limits *limits); From b8060c794f53e364a967ce483cef8e86f4006a61 Mon Sep 17 00:00:00 2001 From: Winter Wang Date: Wed, 27 Jul 2016 10:03:19 +0800 Subject: [PATCH 152/813] UPSTREAM: usb: gadget: configfs: add mutex lock before unregister gadget There may be a race condition if f_fs calls unregister_gadget_item in ffs_closed() when unregister_gadget is called by UDC store at the same time. this leads to a kernel NULL pointer dereference: [ 310.644928] Unable to handle kernel NULL pointer dereference at virtual address 00000004 [ 310.645053] init: Service 'adbd' is being killed... [ 310.658938] pgd = c9528000 [ 310.662515] [00000004] *pgd=19451831, *pte=00000000, *ppte=00000000 [ 310.669702] Internal error: Oops: 817 [#1] PREEMPT SMP ARM [ 310.675211] Modules linked in: [ 310.678294] CPU: 0 PID: 1537 Comm: ->transport Not tainted 4.1.15-03725-g793404c #2 [ 310.685958] Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) [ 310.692493] task: c8e24200 ti: c945e000 task.ti: c945e000 [ 310.697911] PC is at usb_gadget_unregister_driver+0xb4/0xd0 [ 310.703502] LR is at __mutex_lock_slowpath+0x10c/0x16c [ 310.708648] pc : [] lr : [] psr: 600f0113 [ 311.565585] [] (usb_gadget_unregister_driver) from [] (unregister_gadget_item+0x1c/0x34) [ 311.575426] [] (unregister_gadget_item) from [] (ffs_closed+0x8c/0x9c) [ 311.583702] [] (ffs_closed) from [] (ffs_data_reset+0xc/0xa0) [ 311.591194] [] (ffs_data_reset) from [] (ffs_data_closed+0x90/0xd0) [ 311.599208] [] (ffs_data_closed) from [] (ffs_ep0_release+0xc/0x14) [ 311.607224] [] (ffs_ep0_release) from [] (__fput+0x80/0x1d0) [ 311.614635] [] (__fput) from [] (task_work_run+0xb0/0xe8) [ 311.621788] [] (task_work_run) from [] (do_work_pending+0x7c/0xa4) [ 311.629718] [] (do_work_pending) from [] (work_pending+0xc/0x20) for functions using functionFS, i.e. android adbd will close /dev/usb-ffs/adb/ep0 when usb IO thread fails, but switch adb from on to off also triggers write "none" > UDC. These 2 operations both call unregister_gadget, which will lead to the panic above. add a mutex before calling unregister_gadget for api used in f_fs. 
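The shape of the fix, modeled as a standalone pthreads sketch with hypothetical names (the real lock is the per-gadget gi->lock in configfs.c):

    #include <pthread.h>
    #include <stdbool.h>

    /* Userspace model: two independent paths (the UDC "none" store and
     * the f_fs ffs_closed() callback) can race to tear down one gadget,
     * so both must serialize on the same per-gadget lock. */
    struct gadget_model {
            pthread_mutex_t lock;
            bool registered;
    };

    static void do_unregister(struct gadget_model *gi)
    {
            /* stands in for unregister_gadget(); must run under gi->lock */
            if (gi->registered)
                    gi->registered = false;
    }

    void udc_store_path(struct gadget_model *gi)
    {
            pthread_mutex_lock(&gi->lock);  /* already held in the real driver */
            do_unregister(gi);
            pthread_mutex_unlock(&gi->lock);
    }

    void ffs_closed_path(struct gadget_model *gi)
    {
            pthread_mutex_lock(&gi->lock);  /* the lock this patch adds */
            do_unregister(gi);
            pthread_mutex_unlock(&gi->lock);
    }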
Signed-off-by: Winter Wang Signed-off-by: Felipe Balbi --- drivers/usb/gadget/configfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index b27ce0747c18..8df96cb3bb58 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -1737,7 +1737,9 @@ void unregister_gadget_item(struct config_item *item) { struct gadget_info *gi = to_gadget_info(item); + mutex_lock(&gi->lock); unregister_gadget(gi); + mutex_unlock(&gi->lock); } EXPORT_SYMBOL_GPL(unregister_gadget_item); From bbd7cf3d188c2cae5784703d767900c0a9740db7 Mon Sep 17 00:00:00 2001 From: Heinrich Schuchardt Date: Sun, 8 May 2016 23:20:59 +0200 Subject: [PATCH 153/813] usb: gadget: avoid exposing kernel stack commit ffeee83aa0461992e8a99a59db2df31933e60362 upstream. Function in_rq_cur copies random bytes from the stack. Zero the memory instead. Fixes: 132fcb460839 ("usb: gadget: Add Audio Class 2.0 Driver") Signed-off-by: Heinrich Schuchardt Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_uac2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/gadget/function/f_uac2.c b/drivers/usb/gadget/function/f_uac2.c index 044ca79d3cb5..12628dd36e55 100644 --- a/drivers/usb/gadget/function/f_uac2.c +++ b/drivers/usb/gadget/function/f_uac2.c @@ -1291,6 +1291,7 @@ in_rq_cur(struct usb_function *fn, const struct usb_ctrlrequest *cr) if (control_selector == UAC2_CS_CONTROL_SAM_FREQ) { struct cntrl_cur_lay3 c; + memset(&c, 0, sizeof(struct cntrl_cur_lay3)); if (entity_id == USB_IN_CLK_ID) c.dCUR = p_srate; From f1d7992894fafb62b790b5112630b4f081b3b454 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 28 May 2016 07:48:10 +0300 Subject: [PATCH 154/813] usb: f_fs: off by one bug in _ffs_func_bind() commit 0015f9156092d07b3ec06d37d014328419d5832e upstream. This loop is supposed to set all the .num[] values to -1 but it's off by one so it skips the first element and sets one element past the end of the array. I've cleaned up the loop a little as well. Fixes: ddf8abd25994 ('USB: f_fs: the FunctionFS driver') Acked-by: Michal Nazarewicz Signed-off-by: Dan Carpenter Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_fs.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 97ef75af9632..803c503a2e3d 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -2740,6 +2740,7 @@ static int _ffs_func_bind(struct usb_configuration *c, func->ffs->ss_descs_count; int fs_len, hs_len, ss_len, ret, i; + struct ffs_ep *eps_ptr; /* Make it a single chunk, less management later on */ vla_group(d); @@ -2788,12 +2789,9 @@ static int _ffs_func_bind(struct usb_configuration *c, ffs->raw_descs_length); memset(vla_ptr(vlabuf, d, inums), 0xff, d_inums__sz); - for (ret = ffs->eps_count; ret; --ret) { - struct ffs_ep *ptr; - - ptr = vla_ptr(vlabuf, d, eps); - ptr[ret].num = -1; - } + eps_ptr = vla_ptr(vlabuf, d, eps); + for (i = 0; i < ffs->eps_count; i++) + eps_ptr[i].num = -1; /* Save pointers * d_eps == vlabuf, func->eps used to kfree vlabuf later From 0927c5f9515278458c1b0d93ed7aa8fea66d798b Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 8 Jun 2016 16:32:50 +0900 Subject: [PATCH 155/813] usb: renesas_usbhs: protect the CFIFOSEL setting in usbhsg_ep_enable() commit 15e4292a2d21e9997fdb2b8c014cc461b3f268f0 upstream. 
This patch fixes an issue where the CFIFOSEL register value can wrongly be changed by usbhsg_ep_enable(). And then, a data transfer using CFIFO may not work correctly. For example: # modprobe g_multi file=usb-storage.bin # ifconfig usb0 192.168.1.1 up (while the USB host is sending a file to the mass storage) # ifconfig usb0 down In this case, since the u_ether.c may call usb_ep_enable() in eth_stop(), if the renesas_usbhs driver is also using CFIFO for mass storage, the mass storage may not work correctly. So, this patch adds usbhs_lock() and usbhs_unlock() calls in usbhsg_ep_enable() to protect the CFIFOSEL register. This is because: - CFIFOSEL.CURPIPE = 0 is also needed for the pipe configuration - The CFIFOSEL (fifo->sel) is already protected by usbhs_lock() Fixes: 97664a207bc2 ("usb: renesas_usbhs: shrink spin lock area") Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/renesas_usbhs/mod_gadget.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/usb/renesas_usbhs/mod_gadget.c b/drivers/usb/renesas_usbhs/mod_gadget.c index fa14198daf77..5a3abf56d56b 100644 --- a/drivers/usb/renesas_usbhs/mod_gadget.c +++ b/drivers/usb/renesas_usbhs/mod_gadget.c @@ -586,6 +586,9 @@ static int usbhsg_ep_enable(struct usb_ep *ep, struct usbhs_priv *priv = usbhsg_gpriv_to_priv(gpriv); struct usbhs_pipe *pipe; int ret = -EIO; + unsigned long flags; + + usbhs_lock(priv, flags); /* * if it already have pipe, @@ -594,7 +597,8 @@ static int usbhsg_ep_enable(struct usb_ep *ep, if (uep->pipe) { usbhs_pipe_clear(uep->pipe); usbhs_pipe_sequence_data0(uep->pipe); - return 0; + ret = 0; + goto usbhsg_ep_enable_end; } pipe = usbhs_pipe_malloc(priv, @@ -622,6 +626,9 @@ static int usbhsg_ep_enable(struct usb_ep *ep, ret = 0; } +usbhsg_ep_enable_end: + usbhs_unlock(priv, flags); + return ret; } From 09796e2cfa937e7c8a0c8f87cdae21a7168477c8 Mon Sep 17 00:00:00 2001 From: Joseph Salisbury Date: Wed, 6 Jul 2016 21:18:51 -0400 Subject: [PATCH 156/813] usb: quirks: Add no-lpm quirk for Elan commit 25b1f9acc452209ae0fcc8c1332be852b5c52f53 upstream. BugLink: http://bugs.launchpad.net/bugs/1498667 As reported in the BugLink, this device has an issue with Linux Power Management so adding a quirk. This quirk was recommended by Alan Stern: http://lkml.iu.edu/hypermail/linux/kernel/1606.2/05590.html Signed-off-by: Joseph Salisbury Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/quirks.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 944a6dca0fcb..d2e50a27140c 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -128,6 +128,9 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x04f3, 0x016f), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, + { USB_DEVICE(0x04f3, 0x0381), .driver_info = + USB_QUIRK_NO_LPM }, + { USB_DEVICE(0x04f3, 0x21b8), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, From 558b4adac08dc14106af03b87216723f1f98d08c Mon Sep 17 00:00:00 2001 From: Konrad Leszczynski Date: Mon, 8 Feb 2016 16:13:12 +0100 Subject: [PATCH 157/813] usb: dwc3: fix for the isoc transfer EP_BUSY flag commit 9cad39fe4e4a4fe95d8ea5a7b0692b0a6e89e38b upstream. commit f3af36511e60 ("usb: dwc3: gadget: always enable IOC on bulk/interrupt transfers") ended up regressing Isochronous endpoints by clearing the DWC3_EP_BUSY flag too early, which resulted in choppy audio playback over USB.
Fix that by partially reverting the original commit and making sure that we check for isochronous endpoints. Fixes: f3af36511e60 ("usb: dwc3: gadget: always enable IOC on bulk/interrupt transfers") Signed-off-by: Konrad Leszczynski Signed-off-by: Rafal Redzimski Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/gadget.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 69ffe6e8d77f..70900e6ca9bc 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -1965,6 +1965,10 @@ static int dwc3_cleanup_done_reqs(struct dwc3 *dwc, struct dwc3_ep *dep, return 1; } + if (usb_endpoint_xfer_isoc(dep->endpoint.desc)) + if ((event->status & DEPEVT_STATUS_IOC) && + (trb->ctrl & DWC3_TRB_CTRL_IOC)) + return 0; return 1; } From 3ab02b35d5ce7a161ff8ad141db207837015ad6c Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Mon, 6 Jun 2016 12:38:17 +0200 Subject: [PATCH 158/813] USB: serial: option: add support for Telit LE910 PID 0x1206 commit 3c0415fa08548e3bc63ef741762664497ab187ed upstream. This patch adds support for the 0x1206 PID of the Telit LE910. Since the interface positions are the same as the ones for the 0x1043 PID of the Telit LE922, telit_le922_blacklist_usbcfg3 is used. Signed-off-by: Daniele Palmas Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index d96d423d00e6..8e07536c233a 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -273,6 +273,7 @@ static void option_instat_callback(struct urb *urb); #define TELIT_PRODUCT_LE922_USBCFG5 0x1045 #define TELIT_PRODUCT_LE920 0x1200 #define TELIT_PRODUCT_LE910 0x1201 +#define TELIT_PRODUCT_LE910_USBCFG4 0x1206 /* ZTE PRODUCTS */ #define ZTE_VENDOR_ID 0x19d2 @@ -1198,6 +1199,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg0 }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910), .driver_info = (kernel_ulong_t)&telit_le910_blacklist }, + { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910_USBCFG4), + .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg3 }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920), .driver_info = (kernel_ulong_t)&telit_le920_blacklist }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_MF622, 0xff, 0xff, 0xff) }, /* ZTE WCDMA products */ From 97917f45211d7e5c0afd5f4dc33982c096a26722 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 8 Jun 2016 16:32:49 +0900 Subject: [PATCH 159/813] usb: renesas_usbhs: fix NULL pointer dereference in xfer_work() commit 4fdef698383db07d829da567e0e405fc41ff3a89 upstream. This patch fixes an issue where xfer_work() can cause a NULL pointer dereference if the usb cable is disconnected while a data transfer is running. In such a case, a gadget driver may call usb_ep_disable() before xfer_work() is actually called. In this case, the usbhs_pkt_pop() will call usbhsf_fifo_unselect(), and then usbhs_pipe_to_fifo() in xfer_work() will return NULL.
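The pattern of the fix, as a standalone sketch with hypothetical types (the real code takes usbhs_lock() and re-derives the fifo from the pipe before touching it):

    #include <pthread.h>
    #include <stddef.h>

    struct fifo_model { int chan; };

    /* Userspace model: deferred work must re-check, under the lock,
     * that the resource it is about to use still exists, because a
     * disconnect may have torn it down before the work item ran. */
    struct pipe_model {
            pthread_mutex_t lock;
            struct fifo_model *fifo;    /* set to NULL on disconnect */
    };

    void xfer_work_model(struct pipe_model *pipe)
    {
            struct fifo_model *fifo;

            pthread_mutex_lock(&pipe->lock);
            fifo = pipe->fifo;          /* re-read under the lock */
            if (!fifo)
                    goto out;           /* endpoint is gone: do not dereference */
            /* ...set up and issue the DMA transfer via fifo... */
    out:
            pthread_mutex_unlock(&pipe->lock);
    }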
Fixes: e73a989 ("usb: renesas_usbhs: add DMAEngine support") Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/renesas_usbhs/fifo.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/usb/renesas_usbhs/fifo.c b/drivers/usb/renesas_usbhs/fifo.c index f1893e08e51a..db565f620f82 100644 --- a/drivers/usb/renesas_usbhs/fifo.c +++ b/drivers/usb/renesas_usbhs/fifo.c @@ -808,20 +808,27 @@ static void xfer_work(struct work_struct *work) { struct usbhs_pkt *pkt = container_of(work, struct usbhs_pkt, work); struct usbhs_pipe *pipe = pkt->pipe; - struct usbhs_fifo *fifo = usbhs_pipe_to_fifo(pipe); + struct usbhs_fifo *fifo; struct usbhs_priv *priv = usbhs_pipe_to_priv(pipe); struct dma_async_tx_descriptor *desc; - struct dma_chan *chan = usbhsf_dma_chan_get(fifo, pkt); + struct dma_chan *chan; struct device *dev = usbhs_priv_to_dev(priv); enum dma_transfer_direction dir; + unsigned long flags; + usbhs_lock(priv, flags); + fifo = usbhs_pipe_to_fifo(pipe); + if (!fifo) + goto xfer_work_end; + + chan = usbhsf_dma_chan_get(fifo, pkt); dir = usbhs_pipe_is_dir_in(pipe) ? DMA_DEV_TO_MEM : DMA_MEM_TO_DEV; desc = dmaengine_prep_slave_single(chan, pkt->dma + pkt->actual, pkt->trans, dir, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!desc) - return; + goto xfer_work_end; desc->callback = usbhsf_dma_complete; desc->callback_param = pipe; @@ -829,7 +836,7 @@ static void xfer_work(struct work_struct *work) pkt->cookie = dmaengine_submit(desc); if (pkt->cookie < 0) { dev_err(dev, "Failed to submit dma descriptor\n"); - return; + goto xfer_work_end; } dev_dbg(dev, " %s %d (%d/ %d)\n", @@ -840,6 +847,9 @@ static void xfer_work(struct work_struct *work) usbhs_pipe_set_trans_count_if_bulk(pipe, pkt->trans); dma_async_issue_pending(chan); usbhs_pipe_enable(pipe); + +xfer_work_end: + usbhs_unlock(priv, flags); } /* From 4077ef4797a8ff007a2de091c9befee4882c4790 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:32:16 -0400 Subject: [PATCH 160/813] USB: usbfs: fix potential infoleak in devio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 681fef8380eb818c0b845fca5d2ab1dcbab114ee upstream. The stack object “ci” has a total size of 8 bytes. Its last 3 bytes are padding bytes which are not initialized and leaked to userland via “copy_to_user”. Signed-off-by: Kangjie Lu Signed-off-by: Chas Williams Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devio.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 38ae877c46e3..3ffb01ff6549 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -1203,10 +1203,11 @@ static int proc_getdriver(struct usb_dev_state *ps, void __user *arg) static int proc_connectinfo(struct usb_dev_state *ps, void __user *arg) { - struct usbdevfs_connectinfo ci = { - .devnum = ps->dev->devnum, - .slow = ps->dev->speed == USB_SPEED_LOW - }; + struct usbdevfs_connectinfo ci; + + memset(&ci, 0, sizeof(ci)); + ci.devnum = ps->dev->devnum; + ci.slow = ps->dev->speed == USB_SPEED_LOW; if (copy_to_user(arg, &ci, sizeof(ci))) return -EFAULT; From a5a095803c2469cd3d1fc5a11acc1a8150b7c9f4 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 20 Jun 2016 18:28:01 +0100 Subject: [PATCH 161/813] arm64: kernel: Save and restore UAO and addr_limit on exception entry commit e19a6ee2460bdd0d0055a6029383422773f9999a upstream. 
If we take an exception while at EL1, the exception handler inherits the original context's addr_limit and PSTATE.UAO values. To be consistent always reset addr_limit and PSTATE.UAO on (re-)entry to EL1. This prevents accidental re-use of the original context's addr_limit. Based on a similar patch for arm from Russell King. Cc: # 4.6- Acked-by: Will Deacon Reviewed-by: Mark Rutland Signed-off-by: James Morse Signed-off-by: Will Deacon [ backport to stop perf misusing inherited addr_limit. Removed code interacting with UAO and the irqstack ] Link: https://bugs.chromium.org/p/project-zero/issues/detail?id=822 Signed-off-by: James Morse Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/ptrace.h | 2 ++ arch/arm64/kernel/asm-offsets.c | 1 + arch/arm64/kernel/entry.S | 15 ++++++++++++++- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index a307eb6e7fa8..7f94755089e2 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -117,6 +117,8 @@ struct pt_regs { }; u64 orig_x0; u64 syscallno; + u64 orig_addr_limit; + u64 unused; // maintain 16 byte alignment }; #define arch_has_single_step() (1) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 25de8b244961..087cf9a65359 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -58,6 +58,7 @@ int main(void) DEFINE(S_PC, offsetof(struct pt_regs, pc)); DEFINE(S_ORIG_X0, offsetof(struct pt_regs, orig_x0)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); + DEFINE(S_ORIG_ADDR_LIMIT, offsetof(struct pt_regs, orig_addr_limit)); DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); BLANK(); DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 7ed3d75f6304..5a3753d09e20 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -93,7 +94,13 @@ disable_step_tsk x19, x20 // exceptions when scheduling. .else add x21, sp, #S_FRAME_SIZE - .endif + get_thread_info tsk + /* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */ + ldr x20, [tsk, #TI_ADDR_LIMIT] + str x20, [sp, #S_ORIG_ADDR_LIMIT] + mov x20, #TASK_SIZE_64 + str x20, [tsk, #TI_ADDR_LIMIT] + .endif /* \el == 0 */ mrs x22, elr_el1 mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] @@ -117,6 +124,12 @@ .endm .macro kernel_exit, el + .if \el != 0 + /* Restore the task's original addr_limit. */ + ldr x20, [sp, #S_ORIG_ADDR_LIMIT] + str x20, [tsk, #TI_ADDR_LIMIT] + .endif + ldp x21, x22, [sp, #S_PC] // load ELR, SPSR .if \el == 0 ct_user_enter From f4a65209ddfce90381787859b97034f405820d63 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 19 Jul 2016 15:07:37 +0100 Subject: [PATCH 162/813] arm64: debug: unmask PSTATE.D earlier commit 2ce39ad15182604beb6c8fa8bed5e46b59fd1082 upstream. Clearing PSTATE.D is one of the requirements for generating a debug exception. The arm64 booting protocol requires that PSTATE.D is set, since many of the debug registers (for example, the hw_breakpoint registers) are UNKNOWN out of reset and could potentially generate spurious, fatal debug exceptions in early boot code if PSTATE.D was clear. 
Once the debug registers have been safely initialised, PSTATE.D is cleared, however this is currently broken for two reasons: (1) The boot CPU clears PSTATE.D in a postcore_initcall and secondary CPUs clear PSTATE.D in secondary_start_kernel. Since the initcall runs after SMP (and the scheduler) have been initialised, there is no guarantee that it is actually running on the boot CPU. In this case, the boot CPU is left with PSTATE.D set and is not capable of generating debug exceptions. (2) In a preemptible kernel, we may explicitly schedule on the IRQ return path to EL1. If an IRQ occurs with PSTATE.D set in the idle thread, then we may schedule the kthread_init thread, run the postcore_initcall to clear PSTATE.D and then context switch back to the idle thread before returning from the IRQ. The exception return path will then restore PSTATE.D from the stack, and set it again. This patch fixes the problem by moving the clearing of PSTATE.D earlier to proc.S. This has the desirable effect of clearing it in one place for all CPUs, long before we have to worry about the scheduler or any exception handling. We ensure that the previous reset of MDSCR_EL1 has completed before unmasking the exception, so that any spurious exceptions resulting from UNKNOWN debug registers are not generated. Without this patch applied, the kprobes selftests have been seen to fail under KVM, where we end up attempting to step the OOL instruction buffer with PSTATE.D set and therefore fail to complete the step. Acked-by: Mark Rutland Reported-by: Catalin Marinas Tested-by: Marc Zyngier Signed-off-by: Will Deacon Reviewed-by: Catalin Marinas Tested-by: Catalin Marinas Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/debug-monitors.c | 1 - arch/arm64/kernel/smp.c | 1 - arch/arm64/mm/proc.S | 2 ++ 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index c1492ba1f6d1..e51f27ac13fd 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -152,7 +152,6 @@ static int debug_monitors_init(void) /* Clear the OS lock. */ on_each_cpu(clear_os_lock, NULL, 1); isb(); - local_dbg_enable(); /* Register hotplug handler. */ __register_cpu_notifier(&os_lock_nb); diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index b1adc51b2c2e..b796f873b0c4 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -188,7 +188,6 @@ asmlinkage void secondary_start_kernel(void) set_cpu_online(cpu, true); complete(&cpu_running); - local_dbg_enable(); local_irq_enable(); local_async_enable(); diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index b8f04b3f2786..1f6bb29ca53b 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -156,6 +156,8 @@ ENTRY(__cpu_setup) msr cpacr_el1, x0 // Enable FP/ASIMD mov x0, #1 << 12 // Reset mdscr_el1 and disable msr mdscr_el1, x0 // access to the DCC from EL0 + isb // Unmask debug exceptions now, + enable_dbg // since this is per-cpu reset_pmuserenr_el0 x0 // Disable PMU access from EL0 /* * Memory region attributes for LPAE: From ec2fdbebceb9d98fcf2f94a6d5a6f644491f2f76 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 21 Jul 2016 11:12:55 +0100 Subject: [PATCH 163/813] arm64: Fix incorrect per-cpu usage for boot CPU commit 9113c2aa05e9848cd4f1154abee17d4f265f012d upstream. 
In smp_prepare_boot_cpu(), we invoke cpuinfo_store_boot_cpu to store the cpuinfo in a per-cpu ptr, before initialising the per-cpu offset for the boot CPU. This patch reorders the sequence to make sure we initialise the per-cpu offset before accessing the per-cpu area. Commit 4b998ff1885eec ("arm64: Delay cpuinfo_store_boot_cpu") fixed the issue where we modified the per-cpu area even before the kernel initialises the per-cpu areas, but failed to wait until the boot cpu updated its offset. Fixes: 4b998ff1885e ("arm64: Delay cpuinfo_store_boot_cpu") Cc: # 4.4+ Cc: Will Deacon Signed-off-by: Suzuki K Poulose Acked-by: Mark Rutland Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index b796f873b0c4..f3c3d8fee5ba 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -333,8 +333,8 @@ void __init smp_cpus_done(unsigned int max_cpus) void __init smp_prepare_boot_cpu(void) { - cpuinfo_store_boot_cpu(); set_my_cpu_offset(per_cpu_offset(smp_processor_id())); + cpuinfo_store_boot_cpu(); } static u64 __init of_get_cpu_mpidr(struct device_node *dn) From 79b8ddbebf7420fa71c76d4f5f892b21ef1e4456 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Thu, 2 Jun 2016 17:48:28 -0700 Subject: [PATCH 164/813] tty: serial: msm: Don't read off end of tx fifo commit 30acf549ca1e81859a67590ab9ecfce3d1050a0b upstream. For dm uarts in pio mode tx data is transferred to the fifo register 4 bytes at a time, but care is not taken when these 4 bytes span the end of the xmit buffer so the loop might read up to 3 bytes past the buffer and then skip the actual data at the beginning of the buffer. Fix this by, analogous to the DMA case, making sure the chunk doesn't wrap the xmit buffer. Fixes: 3a878c430fd6 ("tty: serial: msm: Add TX DMA support") Cc: Ivan Ivanov Reported-by: Frank Rowand Reported-by: Nicolas Dechesne Signed-off-by: Bjorn Andersson Acked-by: Andy Gross Tested-by: Frank Rowand Reviewed-by: Stephen Boyd Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/msm_serial.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/msm_serial.c b/drivers/tty/serial/msm_serial.c index dcde955475dc..e1de4944e0ce 100644 --- a/drivers/tty/serial/msm_serial.c +++ b/drivers/tty/serial/msm_serial.c @@ -726,7 +726,7 @@ static void msm_handle_tx(struct uart_port *port) return; } - pio_count = CIRC_CNT(xmit->head, xmit->tail, UART_XMIT_SIZE); + pio_count = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE); dma_count = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE); dma_min = 1; /* Always DMA */ From 8af97d26ce2054f4914eb382be2ab1d7c994b190 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 16 Jun 2016 08:27:35 +0200 Subject: [PATCH 165/813] serial: samsung: Fix ERR pointer dereference on deferred probe commit e51e4d8a185de90424b03f30181b35f29c46a25a upstream. When the clk_get() of "uart" clock returns EPROBE_DEFER, the next re-probe finishes with success but uses invalid (ERR_PTR) values. This leads to dereferencing of ERR_PTR stored under ourport->clk: 12c30000.serial: Controller clock not found (...)
12c30000.serial: ttySAC3 at MMIO 0x12c30000 (irq = 61, base_baud = 0) is a S3C6400/10 Unable to handle kernel paging request at virtual address fffffdfb (clk_prepare) from [] (s3c24xx_serial_pm+0x20/0x128) (s3c24xx_serial_pm) from [] (uart_change_pm+0x38/0x40) (uart_change_pm) from [] (uart_add_one_port+0x31c/0x44c) (uart_add_one_port) from [] (s3c24xx_serial_probe+0x2a8/0x418) (s3c24xx_serial_probe) from [] (platform_drv_probe+0x50/0xb0) (platform_drv_probe) from [] (driver_probe_device+0x1f4/0x2b0) (driver_probe_device) from [] (bus_for_each_drv+0x44/0x8c) (bus_for_each_drv) from [] (__device_attach+0x9c/0x100) (__device_attach) from [] (bus_probe_device+0x84/0x8c) (bus_probe_device) from [] (deferred_probe_work_func+0x60/0x8c) (deferred_probe_work_func) from [] (process_one_work+0x120/0x328) (process_one_work) from [] (worker_thread+0x2c/0x4ac) (worker_thread) from [] (kthread+0xd8/0xf4) (kthread) from [] (ret_from_fork+0x14/0x3c) The first unsuccessful clk_get() causes s3c24xx_serial_init_port() to exit with failure but the s3c24xx_uart_port is left half-configured (e.g. port->mapbase is set, clk contains ERR_PTR). On next re-probe, the function s3c24xx_serial_init_port() will exit early with success because of configured port->mapbase and driver will use old values, including the ERR_PTR as clock. Fix this by cleaning the port->mapbase on error path so each re-probe will initialize all of the port settings. Fixes: 60e93575476f ("serial: samsung: enable clock before clearing pending interrupts during init") Signed-off-by: Krzysztof Kozlowski Reviewed-by: Javier Martinez Canillas Tested-by: Javier Martinez Canillas Tested-by: Kevin Hilman Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/samsung.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/tty/serial/samsung.c b/drivers/tty/serial/samsung.c index 8320173af846..237ef5573c18 100644 --- a/drivers/tty/serial/samsung.c +++ b/drivers/tty/serial/samsung.c @@ -1676,7 +1676,7 @@ static int s3c24xx_serial_init_port(struct s3c24xx_uart_port *ourport, return -ENODEV; if (port->mapbase != 0) - return 0; + return -EINVAL; /* setup info for port */ port->dev = &platdev->dev; @@ -1730,22 +1730,25 @@ static int s3c24xx_serial_init_port(struct s3c24xx_uart_port *ourport, ourport->dma = devm_kzalloc(port->dev, sizeof(*ourport->dma), GFP_KERNEL); - if (!ourport->dma) - return -ENOMEM; + if (!ourport->dma) { + ret = -ENOMEM; + goto err; + } } ourport->clk = clk_get(&platdev->dev, "uart"); if (IS_ERR(ourport->clk)) { pr_err("%s: Controller clock not found\n", dev_name(&platdev->dev)); - return PTR_ERR(ourport->clk); + ret = PTR_ERR(ourport->clk); + goto err; } ret = clk_prepare_enable(ourport->clk); if (ret) { pr_err("uart: clock failed to prepare+enable: %d\n", ret); clk_put(ourport->clk); - return ret; + goto err; } /* Keep all interrupts masked and cleared */ @@ -1761,7 +1764,12 @@ static int s3c24xx_serial_init_port(struct s3c24xx_uart_port *ourport, /* reset the fifos (and setup the uart) */ s3c24xx_serial_resetport(port, cfg); + return 0; + +err: + port->mapbase = 0; + return ret; } /* Device driver serial port probe */ From 580b1bbcc94e37d24e7469b3778338e4e988b190 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Sat, 28 May 2016 00:54:08 +0200 Subject: [PATCH 166/813] tty/serial: atmel: fix RS485 half duplex with DMA commit 0058f0871efe7b01c6f2b3046c68196ab73e96da upstream. When using DMA, half duplex doesn't work properly because rx is not stopped before starting tx. 
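The required ordering, as a minimal standalone sketch (hypothetical helpers; the real code uses atmel_stop_rx() and the SER_RS485_* flags):

    #include <stdbool.h>

    struct rs485_model {
            bool rs485_enabled;
            bool rx_during_tx;  /* SER_RS485_RX_DURING_TX in the real code */
    };

    static void stop_rx_model(struct rs485_model *p)  { (void)p; /* mask RX */ }
    static void start_tx_model(struct rs485_model *p) { (void)p; /* enable TX */ }

    /* Half duplex means the receiver is silenced before the transmitter
     * starts, and that must hold for PDC, DMA and PIO transmit paths alike. */
    void tx_start_model(struct rs485_model *p)
    {
            if (p->rs485_enabled && !p->rx_during_tx)
                    stop_rx_model(p);   /* previously skipped on the DMA path */
            start_tx_model(p);
    }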
Ensure we call atmel_stop_rx() in the DMA case. Signed-off-by: Alexandre Belloni Acked-by: Nicolas Ferre Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/atmel_serial.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c index 7bbadd176c74..7b5462eb8388 100644 --- a/drivers/tty/serial/atmel_serial.c +++ b/drivers/tty/serial/atmel_serial.c @@ -485,19 +485,21 @@ static void atmel_start_tx(struct uart_port *port) { struct atmel_uart_port *atmel_port = to_atmel_uart_port(port); - if (atmel_use_pdc_tx(port)) { - if (atmel_uart_readl(port, ATMEL_PDC_PTSR) & ATMEL_PDC_TXTEN) - /* The transmitter is already running. Yes, we really need this.*/ - return; + if (atmel_use_pdc_tx(port) && (atmel_uart_readl(port, ATMEL_PDC_PTSR) + & ATMEL_PDC_TXTEN)) + /* The transmitter is already running. Yes, we really need this.*/ + return; + if (atmel_use_pdc_tx(port) || atmel_use_dma_tx(port)) if ((port->rs485.flags & SER_RS485_ENABLED) && !(port->rs485.flags & SER_RS485_RX_DURING_TX)) atmel_stop_rx(port); + if (atmel_use_pdc_tx(port)) /* re-enable PDC transmit */ atmel_uart_writel(port, ATMEL_PDC_PTCR, ATMEL_PDC_TXTEN); - } + /* Enable interrupts */ atmel_uart_writel(port, ATMEL_US_IER, atmel_port->tx_done_mask); } From 4a2773383ff778fc050e45f975adbcdc79df003d Mon Sep 17 00:00:00 2001 From: Vignesh R Date: Thu, 9 Jun 2016 11:02:04 +0530 Subject: [PATCH 167/813] gpio: pca953x: Fix NBANK calculation for PCA9536 commit a246b8198f776a16d1d3a3bbfc2d437bad766b29 upstream. The NBANK() macro assumes that ngpios is a multiple of 8 (BANK_SZ) and hence results in 0 banks for the PCA9536, which has just 4 gpios. This is wrong, as the PCA9536 has 1 bank with 4 gpios. This results in an uninitialized PCA953X_INVERT register. Fix this by using the DIV_ROUND_UP macro in NBANK(). Signed-off-by: Vignesh R Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/gpio/gpio-pca953x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c index 2d4892cc70fb..c844d7eccb6c 100644 --- a/drivers/gpio/gpio-pca953x.c +++ b/drivers/gpio/gpio-pca953x.c @@ -86,7 +86,7 @@ MODULE_DEVICE_TABLE(acpi, pca953x_acpi_ids); #define MAX_BANK 5 #define BANK_SZ 8 -#define NBANK(chip) (chip->gpio_chip.ngpio / BANK_SZ) +#define NBANK(chip) DIV_ROUND_UP(chip->gpio_chip.ngpio, BANK_SZ) struct pca953x_chip { unsigned gpio_start; From 0b3ff17cabc6b06226f45b63c1d62fc186f3080d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 6 Jul 2016 12:50:12 +0300 Subject: [PATCH 168/813] gpio: intel-mid: Remove potentially harmful code commit 3dbd3212f81b2b410a34a922055e2da792864829 upstream. The commit d56d6b3d7d69 ("gpio: langwell: add Intel Merrifield support") doesn't look at all like proper support for Intel Merrifield and I dare to say that it distorts the behaviour of the hardware. The register map is different on Intel Merrifield, i.e. only 6 out of 8 registers have the same purpose but none of them has the same location in the address space. The current code is potentially harmful to existing hardware since it pokes registers on wrong offsets and may set some pin to be GPIO output when the connected hardware doesn't expect such. Besides the above, GPIO and pinctrl on Intel Merrifield have been located in different IP blocks. The functionality has been extended as well, i.e.
added support of level interrupts, special registers for wake capable sources and thus, in my opinion, requires a completele separate driver. If someone wondering the existing gpio-intel-mid.c would be converted to actual pinctrl (which by the fact it is now), though I wouldn't be a volunteer to do that. Fixes: d56d6b3d7d69 ("gpio: langwell: add Intel Merrifield support") Signed-off-by: Andy Shevchenko Reviewed-by: Mika Westerberg Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/gpio/gpio-intel-mid.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/drivers/gpio/gpio-intel-mid.c b/drivers/gpio/gpio-intel-mid.c index 70097472b02c..c50e930d97d3 100644 --- a/drivers/gpio/gpio-intel-mid.c +++ b/drivers/gpio/gpio-intel-mid.c @@ -17,7 +17,6 @@ * Moorestown platform Langwell chip. * Medfield platform Penwell chip. * Clovertrail platform Cloverview chip. - * Merrifield platform Tangier chip. */ #include @@ -64,10 +63,6 @@ enum GPIO_REG { /* intel_mid gpio driver data */ struct intel_mid_gpio_ddata { u16 ngpio; /* number of gpio pins */ - u32 gplr_offset; /* offset of first GPLR register from base */ - u32 flis_base; /* base address of FLIS registers */ - u32 flis_len; /* length of FLIS registers */ - u32 (*get_flis_offset)(int gpio); u32 chip_irq_type; /* chip interrupt type */ }; @@ -257,15 +252,6 @@ static const struct intel_mid_gpio_ddata gpio_cloverview_core = { .chip_irq_type = INTEL_MID_IRQ_TYPE_EDGE, }; -static const struct intel_mid_gpio_ddata gpio_tangier = { - .ngpio = 192, - .gplr_offset = 4, - .flis_base = 0xff0c0000, - .flis_len = 0x8000, - .get_flis_offset = NULL, - .chip_irq_type = INTEL_MID_IRQ_TYPE_EDGE, -}; - static const struct pci_device_id intel_gpio_ids[] = { { /* Lincroft */ @@ -292,11 +278,6 @@ static const struct pci_device_id intel_gpio_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x08f7), .driver_data = (kernel_ulong_t)&gpio_cloverview_core, }, - { - /* Tangier */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x1199), - .driver_data = (kernel_ulong_t)&gpio_tangier, - }, { 0 } }; MODULE_DEVICE_TABLE(pci, intel_gpio_ids); From e5cf298670b360550a9b772b02aaafd13652fe74 Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Thu, 28 Apr 2016 18:48:25 +0200 Subject: [PATCH 169/813] Bluetooth: hci_intel: Fix null gpio desc pointer dereference commit 32b9ccbc3522811c0e483637b85ae25f5491296f upstream. gpiod_get_optional can return either ERR_PTR or NULL pointer. NULL case is not tested and then dereferenced later in desc_to_gpio. Fix this by using non optional version which returns ERR_PTR in any error case (this is not an optional gpio). Use the same non optional version for the host-wake gpio. 
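The difference between the two getters, as a compilable userspace model (the IS_ERR() macro here mimics the kernel's; everything else is hypothetical):

    #include <errno.h>
    #include <stddef.h>

    #define MAX_ERRNO_MODEL 4095
    #define IS_ERR_MODEL(p) ((unsigned long)(p) >= (unsigned long)-MAX_ERRNO_MODEL)
    #define ERR_PTR_MODEL(e) ((void *)(long)(e))

    struct desc_model { int line; };

    /* mandatory convention: every failure is an ERR_PTR, so a single
     * IS_ERR() check in the caller covers all error cases; an *_optional
     * getter would instead return NULL for "not wired up", which an
     * IS_ERR()-only caller silently accepts and later dereferences. */
    static struct desc_model *get_mandatory_model(struct desc_model *d)
    {
            return d ? d : (struct desc_model *)ERR_PTR_MODEL(-ENOENT);
    }

    int probe_model(struct desc_model *from_firmware)
    {
            struct desc_model *d = get_mandatory_model(from_firmware);

            if (IS_ERR_MODEL(d))
                    return -1;      /* now also catches the missing-gpio case */
            return d->line;         /* safe: d can no longer be NULL */
    }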
Fixes: 765ea3abd116 ("Bluetooth: hci_intel: Retrieve host-wake IRQ") Signed-off-by: Loic Poulain Signed-off-by: Marcel Holtmann Signed-off-by: Greg Kroah-Hartman --- drivers/bluetooth/hci_intel.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/bluetooth/hci_intel.c b/drivers/bluetooth/hci_intel.c index 4a414a5a3165..b9065506a847 100644 --- a/drivers/bluetooth/hci_intel.c +++ b/drivers/bluetooth/hci_intel.c @@ -1234,8 +1234,7 @@ static int intel_probe(struct platform_device *pdev) idev->pdev = pdev; - idev->reset = devm_gpiod_get_optional(&pdev->dev, "reset", - GPIOD_OUT_LOW); + idev->reset = devm_gpiod_get(&pdev->dev, "reset", GPIOD_OUT_LOW); if (IS_ERR(idev->reset)) { dev_err(&pdev->dev, "Unable to retrieve gpio\n"); return PTR_ERR(idev->reset); @@ -1247,8 +1246,7 @@ static int intel_probe(struct platform_device *pdev) dev_err(&pdev->dev, "No IRQ, falling back to gpio-irq\n"); - host_wake = devm_gpiod_get_optional(&pdev->dev, "host-wake", - GPIOD_IN); + host_wake = devm_gpiod_get(&pdev->dev, "host-wake", GPIOD_IN); if (IS_ERR(host_wake)) { dev_err(&pdev->dev, "Unable to retrieve IRQ\n"); goto no_irq; From 4c3b381fadc08342a2f4b1182c9f710983ab299c Mon Sep 17 00:00:00 2001 From: Dan O'Donovan Date: Fri, 10 Jun 2016 13:23:34 +0100 Subject: [PATCH 170/813] pinctrl: cherryview: prevent concurrent access to GPIO controllers commit 0bd50d719b004110e791800450ad204399100a86 upstream. Due to a silicon issue on the Atom X5-Z8000 "Cherry Trail" processor series, a common lock must be used to prevent concurrent accesses across the 4 GPIO controllers managed by this driver. See Intel Atom Z8000 Processor Series Specification Update (Rev. 005), errata #CHT34, for further information. Signed-off-by: Dan O'Donovan Acked-by: Mika Westerberg Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/pinctrl/intel/pinctrl-cherryview.c | 80 ++++++++++++---------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/drivers/pinctrl/intel/pinctrl-cherryview.c b/drivers/pinctrl/intel/pinctrl-cherryview.c index 84936bae6e5e..4e377599d266 100644 --- a/drivers/pinctrl/intel/pinctrl-cherryview.c +++ b/drivers/pinctrl/intel/pinctrl-cherryview.c @@ -160,7 +160,6 @@ struct chv_pin_context { * @pctldev: Pointer to the pin controller device * @chip: GPIO chip in this pin controller * @regs: MMIO registers - * @lock: Lock to serialize register accesses * @intr_lines: Stores mapping between 16 HW interrupt wires and GPIO * offset (in GPIO number space) * @community: Community this pinctrl instance represents @@ -174,7 +173,6 @@ struct chv_pinctrl { struct pinctrl_dev *pctldev; struct gpio_chip chip; void __iomem *regs; - raw_spinlock_t lock; unsigned intr_lines[16]; const struct chv_community *community; u32 saved_intmask; @@ -659,6 +657,17 @@ static const struct chv_community *chv_communities[] = { &southeast_community, }; +/* + * Lock to serialize register accesses + * + * Due to a silicon issue, a shared lock must be used to prevent + * concurrent accesses across the 4 GPIO controllers. + * + * See Intel Atom Z8000 Processor Series Specification Update (Rev. 005), + * errata #CHT34, for further information. 
+ */ +static DEFINE_RAW_SPINLOCK(chv_lock); + static void __iomem *chv_padreg(struct chv_pinctrl *pctrl, unsigned offset, unsigned reg) { @@ -720,13 +729,13 @@ static void chv_pin_dbg_show(struct pinctrl_dev *pctldev, struct seq_file *s, u32 ctrl0, ctrl1; bool locked; - raw_spin_lock_irqsave(&pctrl->lock, flags); + raw_spin_lock_irqsave(&chv_lock, flags); ctrl0 = readl(chv_padreg(pctrl, offset, CHV_PADCTRL0)); ctrl1 = readl(chv_padreg(pctrl, offset, CHV_PADCTRL1)); locked = chv_pad_locked(pctrl, offset); - raw_spin_unlock_irqrestore(&pctrl->lock, flags); + raw_spin_unlock_irqrestore(&chv_lock, flags); if (ctrl0 & CHV_PADCTRL0_GPIOEN) { seq_puts(s, "GPIO "); @@ -789,14 +798,14 @@ static int chv_pinmux_set_mux(struct pinctrl_dev *pctldev, unsigned function, grp = &pctrl->community->groups[group]; - raw_spin_lock_irqsave(&pctrl->lock, flags); + raw_spin_lock_irqsave(&chv_lock, flags); /* Check first that the pad is not locked */ for (i = 0; i < grp->npins; i++) { if (chv_pad_locked(pctrl, grp->pins[i])) { dev_warn(pctrl->dev, "unable to set mode for locked pin %u\n", grp->pins[i]); - raw_spin_unlock_irqrestore(&pctrl->lock, flags); + raw_spin_unlock_irqrestore(&chv_lock, flags); return -EBUSY; } } @@ -839,7 +848,7 @@ static int chv_pinmux_set_mux(struct pinctrl_dev *pctldev, unsigned function, pin, altfunc->mode, altfunc->invert_oe ? "" : "not "); } - raw_spin_unlock_irqrestore(&pctrl->lock, flags); + raw_spin_unlock_irqrestore(&chv_lock, flags); return 0; } @@ -853,13 +862,13 @@ static int chv_gpio_request_enable(struct pinctrl_dev *pctldev, void __iomem *reg; u32 value; - raw_spin_lock_irqsave(&pctrl->lock, flags); + raw_spin_lock_irqsave(&chv_lock, flags); if (chv_pad_locked(pctrl, offset)) { value = readl(chv_padreg(pctrl, offset, CHV_PADCTRL0)); if (!(value & CHV_PADCTRL0_GPIOEN)) { /* Locked so cannot enable */ - raw_spin_unlock_irqrestore(&pctrl->lock, flags); + raw_spin_unlock_irqrestore(&chv_lock, flags); return -EBUSY; } } else { @@ -899,7 +908,7 @@ static int chv_gpio_request_enable(struct pinctrl_dev *pctldev, chv_writel(value, reg); } - raw_spin_unlock_irqrestore(&pctrl->lock, flags); + raw_spin_unlock_irqrestore(&chv_lock, flags); return 0; } @@ -913,13 +922,13 @@ static void chv_gpio_disable_free(struct pinctrl_dev *pctldev, void __iomem *reg; u32 value; - raw_spin_lock_irqsave(&pctrl->lock, flags); + raw_spin_lock_irqsave(&chv_lock, flags); reg = chv_padreg(pctrl, offset, CHV_PADCTRL0); value = readl(reg) & ~CHV_PADCTRL0_GPIOEN; chv_writel(value, reg); - raw_spin_unlock_irqrestore(&pctrl->lock, flags); + raw_spin_unlock_irqrestore(&chv_lock, flags); } static int chv_gpio_set_direction(struct pinctrl_dev *pctldev, @@ -931,7 +940,7 @@ static int chv_gpio_set_direction(struct pinctrl_dev *pctldev, unsigned long flags; u32 ctrl0; - raw_spin_lock_irqsave(&pctrl->lock, flags); + raw_spin_lock_irqsave(&chv_lock, flags); ctrl0 = readl(reg) & ~CHV_PADCTRL0_GPIOCFG_MASK; if (input) @@ -940,7 +949,7 @@ static int chv_gpio_set_direction(struct pinctrl_dev *pctldev, ctrl0 |= CHV_PADCTRL0_GPIOCFG_GPO << CHV_PADCTRL0_GPIOCFG_SHIFT; chv_writel(ctrl0, reg); - raw_spin_unlock_irqrestore(&pctrl->lock, flags); + raw_spin_unlock_irqrestore(&chv_lock, flags); return 0; } @@ -965,10 +974,10 @@ static int chv_config_get(struct pinctrl_dev *pctldev, unsigned pin, u16 arg = 0; u32 term; - raw_spin_lock_irqsave(&pctrl->lock, flags); + raw_spin_lock_irqsave(&chv_lock, flags); ctrl0 = readl(chv_padreg(pctrl, pin, CHV_PADCTRL0)); ctrl1 = readl(chv_padreg(pctrl, pin, CHV_PADCTRL1)); - 
raw_spin_unlock_irqrestore(&pctrl->lock, flags); + raw_spin_unlock_irqrestore(&chv_lock, flags); term = (ctrl0 & CHV_PADCTRL0_TERM_MASK) >> CHV_PADCTRL0_TERM_SHIFT; @@ -1042,7 +1051,7 @@ static int chv_config_set_pull(struct chv_pinctrl *pctrl, unsigned pin, unsigned long flags; u32 ctrl0, pull; - raw_spin_lock_irqsave(&pctrl->lock, flags); + raw_spin_lock_irqsave(&chv_lock, flags); ctrl0 = readl(reg); switch (param) { @@ -1065,7 +1074,7 @@ static int chv_config_set_pull(struct chv_pinctrl *pctrl, unsigned pin, pull = CHV_PADCTRL0_TERM_20K << CHV_PADCTRL0_TERM_SHIFT; break; default: - raw_spin_unlock_irqrestore(&pctrl->lock, flags); + raw_spin_unlock_irqrestore(&chv_lock, flags); return -EINVAL; } @@ -1083,7 +1092,7 @@ static int chv_config_set_pull(struct chv_pinctrl *pctrl, unsigned pin, pull = CHV_PADCTRL0_TERM_20K << CHV_PADCTRL0_TERM_SHIFT; break; default: - raw_spin_unlock_irqrestore(&pctrl->lock, flags); + raw_spin_unlock_irqrestore(&chv_lock, flags); return -EINVAL; } @@ -1091,12 +1100,12 @@ static int chv_config_set_pull(struct chv_pinctrl *pctrl, unsigned pin, break; default: - raw_spin_unlock_irqrestore(&pctrl->lock, flags); + raw_spin_unlock_irqrestore(&chv_lock, flags); return -EINVAL; } chv_writel(ctrl0, reg); - raw_spin_unlock_irqrestore(&pctrl->lock, flags); + raw_spin_unlock_irqrestore(&chv_lock, flags); return 0; } @@ -1162,9 +1171,9 @@ static int chv_gpio_get(struct gpio_chip *chip, unsigned offset) unsigned long flags; u32 ctrl0, cfg; - raw_spin_lock_irqsave(&pctrl->lock, flags); + raw_spin_lock_irqsave(&chv_lock, flags); ctrl0 = readl(chv_padreg(pctrl, pin, CHV_PADCTRL0)); - raw_spin_unlock_irqrestore(&pctrl->lock, flags); + raw_spin_unlock_irqrestore(&chv_lock, flags); cfg = ctrl0 & CHV_PADCTRL0_GPIOCFG_MASK; cfg >>= CHV_PADCTRL0_GPIOCFG_SHIFT; @@ -1182,7 +1191,7 @@ static void chv_gpio_set(struct gpio_chip *chip, unsigned offset, int value) void __iomem *reg; u32 ctrl0; - raw_spin_lock_irqsave(&pctrl->lock, flags); + raw_spin_lock_irqsave(&chv_lock, flags); reg = chv_padreg(pctrl, pin, CHV_PADCTRL0); ctrl0 = readl(reg); @@ -1194,7 +1203,7 @@ static void chv_gpio_set(struct gpio_chip *chip, unsigned offset, int value) chv_writel(ctrl0, reg); - raw_spin_unlock_irqrestore(&pctrl->lock, flags); + raw_spin_unlock_irqrestore(&chv_lock, flags); } static int chv_gpio_get_direction(struct gpio_chip *chip, unsigned offset) @@ -1204,9 +1213,9 @@ static int chv_gpio_get_direction(struct gpio_chip *chip, unsigned offset) u32 ctrl0, direction; unsigned long flags; - raw_spin_lock_irqsave(&pctrl->lock, flags); + raw_spin_lock_irqsave(&chv_lock, flags); ctrl0 = readl(chv_padreg(pctrl, pin, CHV_PADCTRL0)); - raw_spin_unlock_irqrestore(&pctrl->lock, flags); + raw_spin_unlock_irqrestore(&chv_lock, flags); direction = ctrl0 & CHV_PADCTRL0_GPIOCFG_MASK; direction >>= CHV_PADCTRL0_GPIOCFG_SHIFT; @@ -1244,14 +1253,14 @@ static void chv_gpio_irq_ack(struct irq_data *d) int pin = chv_gpio_offset_to_pin(pctrl, irqd_to_hwirq(d)); u32 intr_line; - raw_spin_lock(&pctrl->lock); + raw_spin_lock(&chv_lock); intr_line = readl(chv_padreg(pctrl, pin, CHV_PADCTRL0)); intr_line &= CHV_PADCTRL0_INTSEL_MASK; intr_line >>= CHV_PADCTRL0_INTSEL_SHIFT; chv_writel(BIT(intr_line), pctrl->regs + CHV_INTSTAT); - raw_spin_unlock(&pctrl->lock); + raw_spin_unlock(&chv_lock); } static void chv_gpio_irq_mask_unmask(struct irq_data *d, bool mask) @@ -1262,7 +1271,7 @@ static void chv_gpio_irq_mask_unmask(struct irq_data *d, bool mask) u32 value, intr_line; unsigned long flags; - raw_spin_lock_irqsave(&pctrl->lock, 
flags); + raw_spin_lock_irqsave(&chv_lock, flags); intr_line = readl(chv_padreg(pctrl, pin, CHV_PADCTRL0)); intr_line &= CHV_PADCTRL0_INTSEL_MASK; @@ -1275,7 +1284,7 @@ static void chv_gpio_irq_mask_unmask(struct irq_data *d, bool mask) value |= BIT(intr_line); chv_writel(value, pctrl->regs + CHV_INTMASK); - raw_spin_unlock_irqrestore(&pctrl->lock, flags); + raw_spin_unlock_irqrestore(&chv_lock, flags); } static void chv_gpio_irq_mask(struct irq_data *d) @@ -1309,7 +1318,7 @@ static unsigned chv_gpio_irq_startup(struct irq_data *d) unsigned long flags; u32 intsel, value; - raw_spin_lock_irqsave(&pctrl->lock, flags); + raw_spin_lock_irqsave(&chv_lock, flags); intsel = readl(chv_padreg(pctrl, pin, CHV_PADCTRL0)); intsel &= CHV_PADCTRL0_INTSEL_MASK; intsel >>= CHV_PADCTRL0_INTSEL_SHIFT; @@ -1324,7 +1333,7 @@ static unsigned chv_gpio_irq_startup(struct irq_data *d) irq_set_handler_locked(d, handler); pctrl->intr_lines[intsel] = offset; } - raw_spin_unlock_irqrestore(&pctrl->lock, flags); + raw_spin_unlock_irqrestore(&chv_lock, flags); } chv_gpio_irq_unmask(d); @@ -1340,7 +1349,7 @@ static int chv_gpio_irq_type(struct irq_data *d, unsigned type) unsigned long flags; u32 value; - raw_spin_lock_irqsave(&pctrl->lock, flags); + raw_spin_lock_irqsave(&chv_lock, flags); /* * Pins which can be used as shared interrupt are configured in @@ -1389,7 +1398,7 @@ static int chv_gpio_irq_type(struct irq_data *d, unsigned type) else if (type & IRQ_TYPE_LEVEL_MASK) irq_set_handler_locked(d, handle_level_irq); - raw_spin_unlock_irqrestore(&pctrl->lock, flags); + raw_spin_unlock_irqrestore(&chv_lock, flags); return 0; } @@ -1501,7 +1510,6 @@ static int chv_pinctrl_probe(struct platform_device *pdev) if (i == ARRAY_SIZE(chv_communities)) return -ENODEV; - raw_spin_lock_init(&pctrl->lock); pctrl->dev = &pdev->dev; #ifdef CONFIG_PM_SLEEP From 6d894c3f0f1cadd5649364556a9de30a9e8c0f74 Mon Sep 17 00:00:00 2001 From: Caesar Wang Date: Wed, 18 May 2016 22:41:50 +0800 Subject: [PATCH 171/813] arm64: dts: rockchip: fixes the gic400 2nd region size for rk3368 commit ad1cfdf518976447e6b0d31517bad4e3ebbce6bb upstream. The 2nd additional region is the GIC virtual cpu interface register base and size. As the gic400 manual for the rk3368 says, the cpu interface register map is as below: 0x0000 GICC_CTLR, ..., 0x00fc GICC_IIDR, 0x1000 GICC_DIR. Obviously, the region size should be greater than 0x1000. So we should make sure to include the GICC_DIR, since the kernel will access it in some cases. Fixes: b790c2cab5ca ("arm64: dts: add Rockchip rk3368 core dtsi and board dts for the r88 board") Signed-off-by: Caesar Wang Reviewed-by: Shawn Lin Signed-off-by: Greg Kroah-Hartman [added Fixes and stable-cc] Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3368.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3368.dtsi b/arch/arm64/boot/dts/rockchip/rk3368.dtsi index cc093a482aa4..8fe39e1b680e 100644 --- a/arch/arm64/boot/dts/rockchip/rk3368.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3368.dtsi @@ -517,7 +517,7 @@ #address-cells = <0>; reg = <0x0 0xffb71000 0x0 0x1000>, - <0x0 0xffb72000 0x0 0x1000>, + <0x0 0xffb72000 0x0 0x2000>, <0x0 0xffb74000 0x0 0x2000>, <0x0 0xffb76000 0x0 0x2000>; interrupts = <...>; From: Ard Biesheuvel Date: Mon, 1 Aug 2016 13:29:31 +0200 Subject: [PATCH 172/813] arm64: mm: avoid fdt_check_header() before the FDT is fully mapped commit 04a848106193b134741672f7e4e444b50c70b631 upstream.
As reported by Zijun, the fdt_check_header() call in __fixmap_remap_fdt() is not safe since it is not guaranteed that the FDT header is mapped completely. Due to the minimum alignment of 8 bytes, the only fields we can assume to be mapped are 'magic' and 'totalsize'. Since the OF layer is in charge of validating the FDT image, and we are only interested in making reasonably sure that the size field contains a meaningful value, replace the fdt_check_header() call with an explicit comparison of the magic field's value against the expected value. Reported-by: Zijun Hu Acked-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/mm/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 116ad654dd59..653735a8c58a 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -652,9 +652,9 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) /* * Check whether the physical FDT address is set and meets the minimum * alignment requirement. Since we are relying on MIN_FDT_ALIGN to be - * at least 8 bytes so that we can always access the size field of the - * FDT header after mapping the first chunk, double check here if that - * is indeed the case. + * at least 8 bytes so that we can always access the magic and size + * fields of the FDT header after mapping the first chunk, double check + * here if that is indeed the case. */ BUILD_BUG_ON(MIN_FDT_ALIGN < 8); if (!dt_phys || dt_phys % MIN_FDT_ALIGN) @@ -682,7 +682,7 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) create_mapping(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base, SWAPPER_BLOCK_SIZE, prot); - if (fdt_check_header(dt_virt) != 0) + if (fdt_magic(dt_virt) != FDT_MAGIC) return NULL; size = fdt_totalsize(dt_virt); From e1a90eb8afa42b02f46897b881b9e19d3594159c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 22 Jun 2016 14:21:59 +1000 Subject: [PATCH 173/813] KVM: PPC: Book3S HV: Pull out TM state save/restore into separate procedures commit f024ee098476a3e620232e4a78cfac505f121245 upstream. This moves the transactional memory state save and restore sequences out of the guest entry/exit paths into separate procedures. This is so that these sequences can be used in going into and out of nap in a subsequent patch. The only code changes here are (a) saving and restoring LR on the stack, since these new procedures get called with a bl instruction, (b) explicitly saving r1 into the PACA instead of assuming that HSTATE_HOST_R1(r13) is already set, and (c) removing an unnecessary and redundant setting of MSR[TM] that should have been removed by commit 9d4d0bdd9e0a ("KVM: PPC: Book3S HV: Add transactional memory support", 2013-09-24) but wasn't. Signed-off-by: Paul Mackerras Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 449 +++++++++++++----------- 1 file changed, 237 insertions(+), 212 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 463af88c95a2..5d3b14f179ae 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -655,112 +655,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION - b skip_tm -END_FTR_SECTION_IFCLR(CPU_FTR_TM) - - /* Turn on TM/FP/VSX/VMX so we can restore them.
*/ - mfmsr r5 - li r6, MSR_TM >> 32 - sldi r6, r6, 32 - or r5, r5, r6 - ori r5, r5, MSR_FP - oris r5, r5, (MSR_VEC | MSR_VSX)@h - mtmsrd r5 - - /* - * The user may change these outside of a transaction, so they must - * always be context switched. - */ - ld r5, VCPU_TFHAR(r4) - ld r6, VCPU_TFIAR(r4) - ld r7, VCPU_TEXASR(r4) - mtspr SPRN_TFHAR, r5 - mtspr SPRN_TFIAR, r6 - mtspr SPRN_TEXASR, r7 - - ld r5, VCPU_MSR(r4) - rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 - beq skip_tm /* TM not active in guest */ - - /* Make sure the failure summary is set, otherwise we'll program check - * when we trechkpt. It's possible that this might have been not set - * on a kvmppc_set_one_reg() call but we shouldn't let this crash the - * host. - */ - oris r7, r7, (TEXASR_FS)@h - mtspr SPRN_TEXASR, r7 - - /* - * We need to load up the checkpointed state for the guest. - * We need to do this early as it will blow away any GPRs, VSRs and - * some SPRs. - */ - - mr r31, r4 - addi r3, r31, VCPU_FPRS_TM - bl load_fp_state - addi r3, r31, VCPU_VRS_TM - bl load_vr_state - mr r4, r31 - lwz r7, VCPU_VRSAVE_TM(r4) - mtspr SPRN_VRSAVE, r7 - - ld r5, VCPU_LR_TM(r4) - lwz r6, VCPU_CR_TM(r4) - ld r7, VCPU_CTR_TM(r4) - ld r8, VCPU_AMR_TM(r4) - ld r9, VCPU_TAR_TM(r4) - mtlr r5 - mtcr r6 - mtctr r7 - mtspr SPRN_AMR, r8 - mtspr SPRN_TAR, r9 - - /* - * Load up PPR and DSCR values but don't put them in the actual SPRs - * till the last moment to avoid running with userspace PPR and DSCR for - * too long. - */ - ld r29, VCPU_DSCR_TM(r4) - ld r30, VCPU_PPR_TM(r4) - - std r2, PACATMSCRATCH(r13) /* Save TOC */ - - /* Clear the MSR RI since r1, r13 are all going to be foobar. */ - li r5, 0 - mtmsrd r5, 1 - - /* Load GPRs r0-r28 */ - reg = 0 - .rept 29 - ld reg, VCPU_GPRS_TM(reg)(r31) - reg = reg + 1 - .endr - - mtspr SPRN_DSCR, r29 - mtspr SPRN_PPR, r30 - - /* Load final GPRs */ - ld 29, VCPU_GPRS_TM(29)(r31) - ld 30, VCPU_GPRS_TM(30)(r31) - ld 31, VCPU_GPRS_TM(31)(r31) - - /* TM checkpointed state is now setup. All GPRs are now volatile. */ - TRECHKPT - - /* Now let's get back the state we need. */ - HMT_MEDIUM - GET_PACA(r13) - ld r29, HSTATE_DSCR(r13) - mtspr SPRN_DSCR, r29 - ld r4, HSTATE_KVM_VCPU(r13) - ld r1, HSTATE_HOST_R1(r13) - ld r2, PACATMSCRATCH(r13) - - /* Set the MSR RI since we have our registers back. */ - li r5, MSR_RI - mtmsrd r5, 1 -skip_tm: + bl kvmppc_restore_tm +END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif /* Load guest PMU registers */ @@ -841,12 +737,6 @@ BEGIN_FTR_SECTION /* Skip next section on POWER7 */ b 8f END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) - /* Turn on TM so we can access TFHAR/TFIAR/TEXASR */ - mfmsr r8 - li r0, 1 - rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG - mtmsrd r8 - /* Load up POWER8-specific registers */ ld r5, VCPU_IAMR(r4) lwz r6, VCPU_PSPB(r4) @@ -1436,106 +1326,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION - b 2f -END_FTR_SECTION_IFCLR(CPU_FTR_TM) - /* Turn on TM. */ - mfmsr r8 - li r0, 1 - rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG - mtmsrd r8 - - ld r5, VCPU_MSR(r9) - rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 - beq 1f /* TM not active in guest. */ - - li r3, TM_CAUSE_KVM_RESCHED - - /* Clear the MSR RI since r1, r13 are all going to be foobar. */ - li r5, 0 - mtmsrd r5, 1 - - /* All GPRs are volatile at this point. */ - TRECLAIM(R3) - - /* Temporarily store r13 and r9 so we have some regs to play with */ - SET_SCRATCH0(r13) - GET_PACA(r13) - std r9, PACATMSCRATCH(r13) - ld r9, HSTATE_KVM_VCPU(r13) - - /* Get a few more GPRs free. 
*/ - std r29, VCPU_GPRS_TM(29)(r9) - std r30, VCPU_GPRS_TM(30)(r9) - std r31, VCPU_GPRS_TM(31)(r9) - - /* Save away PPR and DSCR soon so don't run with user values. */ - mfspr r31, SPRN_PPR - HMT_MEDIUM - mfspr r30, SPRN_DSCR - ld r29, HSTATE_DSCR(r13) - mtspr SPRN_DSCR, r29 - - /* Save all but r9, r13 & r29-r31 */ - reg = 0 - .rept 29 - .if (reg != 9) && (reg != 13) - std reg, VCPU_GPRS_TM(reg)(r9) - .endif - reg = reg + 1 - .endr - /* ... now save r13 */ - GET_SCRATCH0(r4) - std r4, VCPU_GPRS_TM(13)(r9) - /* ... and save r9 */ - ld r4, PACATMSCRATCH(r13) - std r4, VCPU_GPRS_TM(9)(r9) - - /* Reload stack pointer and TOC. */ - ld r1, HSTATE_HOST_R1(r13) - ld r2, PACATOC(r13) - - /* Set MSR RI now we have r1 and r13 back. */ - li r5, MSR_RI - mtmsrd r5, 1 - - /* Save away checkpinted SPRs. */ - std r31, VCPU_PPR_TM(r9) - std r30, VCPU_DSCR_TM(r9) - mflr r5 - mfcr r6 - mfctr r7 - mfspr r8, SPRN_AMR - mfspr r10, SPRN_TAR - std r5, VCPU_LR_TM(r9) - stw r6, VCPU_CR_TM(r9) - std r7, VCPU_CTR_TM(r9) - std r8, VCPU_AMR_TM(r9) - std r10, VCPU_TAR_TM(r9) - - /* Restore r12 as trap number. */ - lwz r12, VCPU_TRAP(r9) - - /* Save FP/VSX. */ - addi r3, r9, VCPU_FPRS_TM - bl store_fp_state - addi r3, r9, VCPU_VRS_TM - bl store_vr_state - mfspr r6, SPRN_VRSAVE - stw r6, VCPU_VRSAVE_TM(r9) -1: - /* - * We need to save these SPRs after the treclaim so that the software - * error code is recorded correctly in the TEXASR. Also the user may - * change these outside of a transaction, so they must always be - * context switched. - */ - mfspr r5, SPRN_TFHAR - mfspr r6, SPRN_TFIAR - mfspr r7, SPRN_TEXASR - std r5, VCPU_TFHAR(r9) - std r6, VCPU_TFIAR(r9) - std r7, VCPU_TEXASR(r9) -2: + bl kvmppc_save_tm +END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif /* Increment yield count if they have a VPA */ @@ -2629,6 +2421,239 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) mr r4,r31 blr +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +/* + * Save transactional state and TM-related registers. + * Called with r9 pointing to the vcpu struct. + * This can modify all checkpointed registers, but + * restores r1, r2 and r9 (vcpu pointer) before exit. + */ +kvmppc_save_tm: + mflr r0 + std r0, PPC_LR_STKOFF(r1) + + /* Turn on TM. */ + mfmsr r8 + li r0, 1 + rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG + mtmsrd r8 + + ld r5, VCPU_MSR(r9) + rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 + beq 1f /* TM not active in guest. */ + + std r1, HSTATE_HOST_R1(r13) + li r3, TM_CAUSE_KVM_RESCHED + + /* Clear the MSR RI since r1, r13 are all going to be foobar. */ + li r5, 0 + mtmsrd r5, 1 + + /* All GPRs are volatile at this point. */ + TRECLAIM(R3) + + /* Temporarily store r13 and r9 so we have some regs to play with */ + SET_SCRATCH0(r13) + GET_PACA(r13) + std r9, PACATMSCRATCH(r13) + ld r9, HSTATE_KVM_VCPU(r13) + + /* Get a few more GPRs free. */ + std r29, VCPU_GPRS_TM(29)(r9) + std r30, VCPU_GPRS_TM(30)(r9) + std r31, VCPU_GPRS_TM(31)(r9) + + /* Save away PPR and DSCR soon so don't run with user values. */ + mfspr r31, SPRN_PPR + HMT_MEDIUM + mfspr r30, SPRN_DSCR + ld r29, HSTATE_DSCR(r13) + mtspr SPRN_DSCR, r29 + + /* Save all but r9, r13 & r29-r31 */ + reg = 0 + .rept 29 + .if (reg != 9) && (reg != 13) + std reg, VCPU_GPRS_TM(reg)(r9) + .endif + reg = reg + 1 + .endr + /* ... now save r13 */ + GET_SCRATCH0(r4) + std r4, VCPU_GPRS_TM(13)(r9) + /* ... and save r9 */ + ld r4, PACATMSCRATCH(r13) + std r4, VCPU_GPRS_TM(9)(r9) + + /* Reload stack pointer and TOC. */ + ld r1, HSTATE_HOST_R1(r13) + ld r2, PACATOC(r13) + + /* Set MSR RI now we have r1 and r13 back. 
*/ + li r5, MSR_RI + mtmsrd r5, 1 + + /* Save away checkpinted SPRs. */ + std r31, VCPU_PPR_TM(r9) + std r30, VCPU_DSCR_TM(r9) + mflr r5 + mfcr r6 + mfctr r7 + mfspr r8, SPRN_AMR + mfspr r10, SPRN_TAR + std r5, VCPU_LR_TM(r9) + stw r6, VCPU_CR_TM(r9) + std r7, VCPU_CTR_TM(r9) + std r8, VCPU_AMR_TM(r9) + std r10, VCPU_TAR_TM(r9) + + /* Restore r12 as trap number. */ + lwz r12, VCPU_TRAP(r9) + + /* Save FP/VSX. */ + addi r3, r9, VCPU_FPRS_TM + bl store_fp_state + addi r3, r9, VCPU_VRS_TM + bl store_vr_state + mfspr r6, SPRN_VRSAVE + stw r6, VCPU_VRSAVE_TM(r9) +1: + /* + * We need to save these SPRs after the treclaim so that the software + * error code is recorded correctly in the TEXASR. Also the user may + * change these outside of a transaction, so they must always be + * context switched. + */ + mfspr r5, SPRN_TFHAR + mfspr r6, SPRN_TFIAR + mfspr r7, SPRN_TEXASR + std r5, VCPU_TFHAR(r9) + std r6, VCPU_TFIAR(r9) + std r7, VCPU_TEXASR(r9) + + ld r0, PPC_LR_STKOFF(r1) + mtlr r0 + blr + +/* + * Restore transactional state and TM-related registers. + * Called with r4 pointing to the vcpu struct. + * This potentially modifies all checkpointed registers. + * It restores r1, r2, r4 from the PACA. + */ +kvmppc_restore_tm: + mflr r0 + std r0, PPC_LR_STKOFF(r1) + + /* Turn on TM/FP/VSX/VMX so we can restore them. */ + mfmsr r5 + li r6, MSR_TM >> 32 + sldi r6, r6, 32 + or r5, r5, r6 + ori r5, r5, MSR_FP + oris r5, r5, (MSR_VEC | MSR_VSX)@h + mtmsrd r5 + + /* + * The user may change these outside of a transaction, so they must + * always be context switched. + */ + ld r5, VCPU_TFHAR(r4) + ld r6, VCPU_TFIAR(r4) + ld r7, VCPU_TEXASR(r4) + mtspr SPRN_TFHAR, r5 + mtspr SPRN_TFIAR, r6 + mtspr SPRN_TEXASR, r7 + + ld r5, VCPU_MSR(r4) + rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 + beqlr /* TM not active in guest */ + std r1, HSTATE_HOST_R1(r13) + + /* Make sure the failure summary is set, otherwise we'll program check + * when we trechkpt. It's possible that this might have been not set + * on a kvmppc_set_one_reg() call but we shouldn't let this crash the + * host. + */ + oris r7, r7, (TEXASR_FS)@h + mtspr SPRN_TEXASR, r7 + + /* + * We need to load up the checkpointed state for the guest. + * We need to do this early as it will blow away any GPRs, VSRs and + * some SPRs. + */ + + mr r31, r4 + addi r3, r31, VCPU_FPRS_TM + bl load_fp_state + addi r3, r31, VCPU_VRS_TM + bl load_vr_state + mr r4, r31 + lwz r7, VCPU_VRSAVE_TM(r4) + mtspr SPRN_VRSAVE, r7 + + ld r5, VCPU_LR_TM(r4) + lwz r6, VCPU_CR_TM(r4) + ld r7, VCPU_CTR_TM(r4) + ld r8, VCPU_AMR_TM(r4) + ld r9, VCPU_TAR_TM(r4) + mtlr r5 + mtcr r6 + mtctr r7 + mtspr SPRN_AMR, r8 + mtspr SPRN_TAR, r9 + + /* + * Load up PPR and DSCR values but don't put them in the actual SPRs + * till the last moment to avoid running with userspace PPR and DSCR for + * too long. + */ + ld r29, VCPU_DSCR_TM(r4) + ld r30, VCPU_PPR_TM(r4) + + std r2, PACATMSCRATCH(r13) /* Save TOC */ + + /* Clear the MSR RI since r1, r13 are all going to be foobar. */ + li r5, 0 + mtmsrd r5, 1 + + /* Load GPRs r0-r28 */ + reg = 0 + .rept 29 + ld reg, VCPU_GPRS_TM(reg)(r31) + reg = reg + 1 + .endr + + mtspr SPRN_DSCR, r29 + mtspr SPRN_PPR, r30 + + /* Load final GPRs */ + ld 29, VCPU_GPRS_TM(29)(r31) + ld 30, VCPU_GPRS_TM(30)(r31) + ld 31, VCPU_GPRS_TM(31)(r31) + + /* TM checkpointed state is now setup. All GPRs are now volatile. */ + TRECHKPT + + /* Now let's get back the state we need. 
*/ + HMT_MEDIUM + GET_PACA(r13) + ld r29, HSTATE_DSCR(r13) + mtspr SPRN_DSCR, r29 + ld r4, HSTATE_KVM_VCPU(r13) + ld r1, HSTATE_HOST_R1(r13) + ld r2, PACATMSCRATCH(r13) + + /* Set the MSR RI since we have our registers back. */ + li r5, MSR_RI + mtmsrd r5, 1 + + ld r0, PPC_LR_STKOFF(r1) + mtlr r0 + blr +#endif + /* * We come here if we get any exception or interrupt while we are * executing host real mode code while in guest MMU context. From 41490064ad279e8364d993e0cb1117209799fd80 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 22 Jun 2016 15:52:55 +1000 Subject: [PATCH 174/813] KVM: PPC: Book3S HV: Save/restore TM state in H_CEDE commit 93d17397e4e2182fdaad503e2f9da46202c0f1c3 upstream. It turns out that if the guest does a H_CEDE while the CPU is in a transactional state, and the H_CEDE does a nap, and the nap loses the architected state of the CPU (which it is allowed to do), then we lose the checkpointed state of the virtual CPU. In addition, the transactional-memory state recorded in the MSR gets reset back to non-transactional, and when we try to return to the guest, we take a TM bad thing type of program interrupt because we are trying to transition from non-transactional to transactional with a hrfid instruction, which is not permitted. The result of the program interrupt occurring at that point is that the host CPU will hang in an infinite loop with interrupts disabled. Thus this is a denial of service vulnerability in the host which can be triggered by any guest (and depending on the guest kernel, it can potentially be triggered by unprivileged userspace in the guest). This vulnerability has been assigned the ID CVE-2016-5412. To fix this, we save the TM state before napping and restore it on exit from the nap, when handling a H_CEDE in real mode. The case where H_CEDE exits to host virtual mode is already OK (as are other hcalls which exit to host virtual mode) because the exit path saves the TM state. Signed-off-by: Paul Mackerras Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 5d3b14f179ae..974f73df00bb 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -2037,6 +2037,13 @@ _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */ /* save FP state */ bl kvmppc_save_fp +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +BEGIN_FTR_SECTION + ld r9, HSTATE_KVM_VCPU(r13) + bl kvmppc_save_tm +END_FTR_SECTION_IFSET(CPU_FTR_TM) +#endif + /* * Set DEC to the smaller of DEC and HDEC, so that we wake * no later than the end of our timeslice (HDEC interrupts @@ -2113,6 +2120,12 @@ kvm_end_cede: bl kvmhv_accumulate_time #endif +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +BEGIN_FTR_SECTION + bl kvmppc_restore_tm +END_FTR_SECTION_IFSET(CPU_FTR_TM) +#endif + /* load up FP state */ bl kvmppc_load_fp From b9ab0b3614fca6b363b2f2cfa918e7b7f74592ea Mon Sep 17 00:00:00 2001 From: Alexis Dambricourt Date: Mon, 4 Jul 2016 21:05:15 +0200 Subject: [PATCH 175/813] KVM: MTRR: fix kvm_mtrr_check_gfn_range_consistency page fault commit 30b072ce0356e8b141f4ca6da7220486fa3641d9 upstream. The following #PF may occur: [ 1403.317041] BUG: unable to handle kernel paging request at 0000000200000068 [ 1403.317045] IP: [] __mtrr_lookup_var_next+0x10/0xa0 [kvm] [ 1403.317123] Call Trace: [ 1403.317134] [] ? kvm_mtrr_check_gfn_range_consistency+0xc5/0x120 [kvm] [ 1403.317143] [] ?
tdp_page_fault+0x9f/0x2c0 [kvm] [ 1403.317152] [] ? kvm_set_msr_common+0x858/0xc00 [kvm] [ 1403.317161] [] ? x86_emulate_insn+0x273/0xd30 [kvm] [ 1403.317171] [] ? kvm_cpuid+0x34/0x190 [kvm] [ 1403.317180] [] ? kvm_mmu_page_fault+0x59/0xe0 [kvm] [ 1403.317183] [] ? vmx_handle_exit+0x1d1/0x14a0 [kvm_intel] [ 1403.317185] [] ? atomic_switch_perf_msrs+0x6f/0xa0 [kvm_intel] [ 1403.317187] [] ? vmx_vcpu_run+0x2ad/0x420 [kvm_intel] [ 1403.317196] [] ? kvm_arch_vcpu_ioctl_run+0x622/0x1550 [kvm] [ 1403.317204] [] ? kvm_arch_vcpu_load+0x59/0x210 [kvm] [ 1403.317206] [] ? __kernel_fpu_end+0x35/0x100 [ 1403.317213] [] ? kvm_vcpu_ioctl+0x316/0x5d0 [kvm] [ 1403.317215] [] ? do_sigtimedwait+0xd5/0x220 [ 1403.317217] [] ? do_vfs_ioctl+0x9d/0x5c0 [ 1403.317224] [] ? kvm_on_user_return+0x3e/0x70 [kvm] [ 1403.317225] [] ? SyS_ioctl+0x74/0x80 [ 1403.317227] [] ? entry_SYSCALL_64_fastpath+0x1e/0xa8 [ 1403.317242] RIP [] __mtrr_lookup_var_next+0x10/0xa0 [kvm] At mtrr_lookup_fixed_next(), when the condition 'if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges))' becomes true, mtrr_lookup_var_start() is called with iter->range containing garbage values from the fixed MTRR union field. Then, list_prepare_entry() does not perform the list_entry() initialization, keeping a garbage pointer in iter->range, which is accessed in the following __mtrr_lookup_var_next() call. Fixes: f571c0973e4b8c888e049b6842e4b4f93b5c609c Signed-off-by: Alexis Dambricourt Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/mtrr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index c146f3c262c3..0149ac59c273 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -539,6 +539,7 @@ static void mtrr_lookup_var_start(struct mtrr_iter *iter) iter->fixed = false; iter->start_max = iter->start; + iter->range = NULL; iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node); __mtrr_lookup_var_next(iter); From 6bd2820f906f75c4a05484a5b446e50a1f1b0e34 Mon Sep 17 00:00:00 2001 From: "Cao, Lei" Date: Fri, 15 Jul 2016 13:54:04 +0000 Subject: [PATCH 176/813] KVM: VMX: handle PML full VMEXIT that occurs during event delivery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit b244c9fc251e14a083a1cbf04bef10bd99303a76 upstream. With PML enabled, the guest will shut down if a PML full VMEXIT occurs during event delivery. According to Intel SDM 27.2.3, a PML full VMEXIT can occur when an event is being delivered through the IDT, so KVM should not exit to user space with an error. Instead, it should let EXIT_REASON_PML_FULL go through and the event will be re-injected on the next VMENTRY. Signed-off-by: Lei Cao Fixes: 843e4330573c ("KVM: VMX: Add PML support in VMX") [Shortened the summary and Cc'd stable.]
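In schematic form, the check being relaxed is the exit-reason whitelist consulted when a VMEXIT interrupts event delivery (a sketch of the logic only; report_internal_error() is a placeholder, not a KVM function, while the constants are the ones named in the patch):

	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
	    exit_reason != EXIT_REASON_EXCEPTION_NMI &&
	    exit_reason != EXIT_REASON_EPT_VIOLATION &&
	    exit_reason != EXIT_REASON_PML_FULL &&	/* added by this patch */
	    exit_reason != EXIT_REASON_TASK_SWITCH)
		return report_internal_error(vcpu);

	/* With PML_FULL whitelisted, the exit is handled normally and the
	 * interrupted event is re-injected on the next VMENTRY. */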
Signed-off-by: Radim Krčmář Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 41e7943004fe..38eb9c91e9ad 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8124,6 +8124,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if ((vectoring_info & VECTORING_INFO_VALID_MASK) && (exit_reason != EXIT_REASON_EXCEPTION_NMI && exit_reason != EXIT_REASON_EPT_VIOLATION && + exit_reason != EXIT_REASON_PML_FULL && exit_reason != EXIT_REASON_TASK_SWITCH)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; From 144941bd9907d0e616229f0b6dddcad512030407 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 8 Jul 2016 15:36:06 -0700 Subject: [PATCH 177/813] KVM: nVMX: Fix memory corruption when using VMCS shadowing commit 2f1fe81123f59271bddda673b60116bde9660385 upstream. When freeing the nested resources of a vcpu, there is an assumption that the vcpu's vmcs01 is the current VMCS on the CPU that executes nested_release_vmcs12(). If this assumption is violated, the vcpu's vmcs01 may be made active on multiple CPUs at the same time, in violation of Intel's specification. Moreover, since the vcpu's vmcs01 is not VMCLEARed on every CPU on which it is active, it can linger in a CPU's VMCS cache after it has been freed and potentially repurposed. Subsequent eviction from the CPU's VMCS cache on a capacity miss can result in memory corruption. It is not sufficient for vmx_free_vcpu() to call vmx_load_vmcs01(). If the vcpu in question was last loaded on a different CPU, it must be migrated to the current CPU before calling vmx_load_vmcs01(). Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 19 +++++++++++++++++-- virt/kvm/kvm_main.c | 2 ++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 38eb9c91e9ad..4589b6feeb7b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8737,6 +8737,22 @@ static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) put_cpu(); } +/* + * Ensure that the current vmcs of the logical processor is the + * vmcs01 of the vcpu before calling free_nested(). 
+ */ +static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int r; + + r = vcpu_load(vcpu); + BUG_ON(r); + vmx_load_vmcs01(vcpu); + free_nested(vmx); + vcpu_put(vcpu); +} + static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -8745,8 +8761,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) vmx_destroy_pml_buffer(vmx); free_vpid(vmx->vpid); leave_guest_mode(vcpu); - vmx_load_vmcs01(vcpu); - free_nested(vmx); + vmx_free_vcpu_nested(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 510df220d1b5..336ed267c407 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -142,6 +142,7 @@ int vcpu_load(struct kvm_vcpu *vcpu) put_cpu(); return 0; } +EXPORT_SYMBOL_GPL(vcpu_load); void vcpu_put(struct kvm_vcpu *vcpu) { @@ -151,6 +152,7 @@ void vcpu_put(struct kvm_vcpu *vcpu) preempt_enable(); mutex_unlock(&vcpu->mutex); } +EXPORT_SYMBOL_GPL(vcpu_put); static void ack_flush(void *_completed) { From e990e8ba92c63136f7678a8bbd656af53318343b Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 12 Jul 2016 19:57:57 +0200 Subject: [PATCH 178/813] s390/cio: allow to reset channel measurement block commit 0f5d050ceaa31b2229102211d60c149f920df3aa upstream. Prior to commit 1bc6664bdfb949bc69a08113801e7d6acbf6bc3f a call to enable_cmf for a device for which channel measurement was already enabled resulted in a reset of the measurement data. What looked like bugs at the time (a 2nd allocation was triggered but failed, reset was called regardless of previous failures, and errors have not been reported to userspace) was actually something at least one userspace tool depended on. Restore that behavior in a sane way. Fixes: 1bc6664bdfb ("s390/cio: use device_lock during cmb activation") Signed-off-by: Sebastian Ott Reviewed-by: Cornelia Huck Reviewed-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky Signed-off-by: Greg Kroah-Hartman --- drivers/s390/cio/cmf.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c index b2afad5a5682..2a34eb5f6161 100644 --- a/drivers/s390/cio/cmf.c +++ b/drivers/s390/cio/cmf.c @@ -753,6 +753,17 @@ static void reset_cmb(struct ccw_device *cdev) cmf_generic_reset(cdev); } +static int cmf_enabled(struct ccw_device *cdev) +{ + int enabled; + + spin_lock_irq(cdev->ccwlock); + enabled = !!cdev->private->cmb; + spin_unlock_irq(cdev->ccwlock); + + return enabled; +} + static struct attribute_group cmf_attr_group; static struct cmb_operations cmbops_basic = { @@ -1153,13 +1164,8 @@ static ssize_t cmb_enable_show(struct device *dev, char *buf) { struct ccw_device *cdev = to_ccwdev(dev); - int enabled; - spin_lock_irq(cdev->ccwlock); - enabled = !!cdev->private->cmb; - spin_unlock_irq(cdev->ccwlock); - - return sprintf(buf, "%d\n", enabled); + return sprintf(buf, "%d\n", cmf_enabled(cdev)); } static ssize_t cmb_enable_store(struct device *dev, @@ -1199,15 +1205,20 @@ int ccw_set_cmf(struct ccw_device *cdev, int enable) * @cdev: The ccw device to be enabled * * Returns %0 for success or a negative error value. - * + * Note: If this is called on a device for which channel measurement is already + * enabled a reset of the measurement data is triggered. 
* Context: * non-atomic */ int enable_cmf(struct ccw_device *cdev) { - int ret; + int ret = 0; device_lock(&cdev->dev); + if (cmf_enabled(cdev)) { + cmbops->reset(cdev); + goto out_unlock; + } get_device(&cdev->dev); ret = cmbops->alloc(cdev); if (ret) @@ -1226,7 +1237,7 @@ int enable_cmf(struct ccw_device *cdev) out: if (ret) put_device(&cdev->dev); - +out_unlock: device_unlock(&cdev->dev); return ret; } From 695c69bf304d7874e0b918684818711f0d7830f9 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 8 Jul 2016 20:42:04 +0200 Subject: [PATCH 179/813] intel_pstate: Fix MSR_CONFIG_TDP_x addressing in core_get_max_pstate() commit 5fc8f707a2aa40c767c3a338738b9b6fcd151ac1 upstream. If MSR_CONFIG_TDP_CONTROL is locked, we currently try to address some MSR 0x80000648 or so. Mask out the relevant level bits 0 and 1. Found while running over the Jailhouse hypervisor which became upset about this strange MSR index. Signed-off-by: Jan Kiszka Acked-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/cpufreq/intel_pstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index f53b02a6bc05..6e80e4298274 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -662,7 +662,7 @@ static int core_get_max_pstate(void) if (err) goto skip_tar; - tdp_msr = MSR_CONFIG_TDP_NOMINAL + tdp_ctrl; + tdp_msr = MSR_CONFIG_TDP_NOMINAL + (tdp_ctrl & 0x3); err = rdmsrl_safe(tdp_msr, &tdp_ratio); if (err) goto skip_tar; From 0f984405d09114d07886850868a2a17232ab0f32 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 15 Jun 2016 01:02:26 +0200 Subject: [PATCH 180/813] mfd: qcom_rpm: Fix offset error for msm8660 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 9835f1b70bb3890d38308b9be4fb9d7451ba67f1 upstream. The RPM in MSM8660/APQ8060 has different offsets to the selector ACK and request context ACK registers. Make all these register offsets part of the per-SoC data and assign the right values. The bug was found by verifying backwards against the vendor tree's out-of-tree files: all were using offsets 3,11,15,23 and a select size of 4, except the MSM8660/APQ8060, which was using offsets 3,11,19,27 and a select size of 7. All platforms apart from msm8660 were also affected by reading excess registers: since 7 was hardcoded as the number of select words, this patch makes this part dynamic as well, so we only write/read as many select words as the platform actually uses. Symptoms of this bug when using msm8660: the first RPM transaction would work, but the next would stall or raise an error since the previous transaction was not properly ACKed as the ACK words were read at the wrong offset.
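The offsets quoted above map onto the new per-SoC fields like this (a sketch assembled from the values named in this message; the field names are those the patch adds to struct qcom_rpm_data):

	/* msm8660/apq8060, the odd one out: */
	.req_ctx_off = 3,
	.req_sel_off = 11,
	.ack_ctx_off = 19,	/* all other SoCs use 15 */
	.ack_sel_off = 27,	/* all other SoCs use 23 */
	.sel_size = 7,		/* all other SoCs use 4 */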
Fixes: 58e214382bdd ("mfd: qcom-rpm: Driver for the Qualcomm RPM") Signed-off-by: Linus Walleij Reviewed-by: Björn Andersson Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- drivers/mfd/qcom_rpm.c | 50 ++++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/drivers/mfd/qcom_rpm.c b/drivers/mfd/qcom_rpm.c index 207a3bd68559..66f00c6427d3 100644 --- a/drivers/mfd/qcom_rpm.c +++ b/drivers/mfd/qcom_rpm.c @@ -34,7 +34,12 @@ struct qcom_rpm_resource { struct qcom_rpm_data { u32 version; const struct qcom_rpm_resource *resource_table; - unsigned n_resources; + unsigned int n_resources; + unsigned int req_ctx_off; + unsigned int req_sel_off; + unsigned int ack_ctx_off; + unsigned int ack_sel_off; + unsigned int sel_size; }; struct qcom_rpm { @@ -61,11 +66,7 @@ struct qcom_rpm { #define RPM_REQUEST_TIMEOUT (5 * HZ) -#define RPM_REQUEST_CONTEXT 3 -#define RPM_REQ_SELECT 11 -#define RPM_ACK_CONTEXT 15 -#define RPM_ACK_SELECTOR 23 -#define RPM_SELECT_SIZE 7 +#define RPM_MAX_SEL_SIZE 7 #define RPM_NOTIFICATION BIT(30) #define RPM_REJECTED BIT(31) @@ -157,6 +158,11 @@ static const struct qcom_rpm_data apq8064_template = { .version = 3, .resource_table = apq8064_rpm_resource_table, .n_resources = ARRAY_SIZE(apq8064_rpm_resource_table), + .req_ctx_off = 3, + .req_sel_off = 11, + .ack_ctx_off = 15, + .ack_sel_off = 23, + .sel_size = 4, }; static const struct qcom_rpm_resource msm8660_rpm_resource_table[] = { @@ -240,6 +246,11 @@ static const struct qcom_rpm_data msm8660_template = { .version = 2, .resource_table = msm8660_rpm_resource_table, .n_resources = ARRAY_SIZE(msm8660_rpm_resource_table), + .req_ctx_off = 3, + .req_sel_off = 11, + .ack_ctx_off = 19, + .ack_sel_off = 27, + .sel_size = 7, }; static const struct qcom_rpm_resource msm8960_rpm_resource_table[] = { @@ -322,6 +333,11 @@ static const struct qcom_rpm_data msm8960_template = { .version = 3, .resource_table = msm8960_rpm_resource_table, .n_resources = ARRAY_SIZE(msm8960_rpm_resource_table), + .req_ctx_off = 3, + .req_sel_off = 11, + .ack_ctx_off = 15, + .ack_sel_off = 23, + .sel_size = 4, }; static const struct qcom_rpm_resource ipq806x_rpm_resource_table[] = { @@ -362,6 +378,11 @@ static const struct qcom_rpm_data ipq806x_template = { .version = 3, .resource_table = ipq806x_rpm_resource_table, .n_resources = ARRAY_SIZE(ipq806x_rpm_resource_table), + .req_ctx_off = 3, + .req_sel_off = 11, + .ack_ctx_off = 15, + .ack_sel_off = 23, + .sel_size = 4, }; static const struct of_device_id qcom_rpm_of_match[] = { @@ -380,7 +401,7 @@ int qcom_rpm_write(struct qcom_rpm *rpm, { const struct qcom_rpm_resource *res; const struct qcom_rpm_data *data = rpm->data; - u32 sel_mask[RPM_SELECT_SIZE] = { 0 }; + u32 sel_mask[RPM_MAX_SEL_SIZE] = { 0 }; int left; int ret = 0; int i; @@ -398,12 +419,12 @@ int qcom_rpm_write(struct qcom_rpm *rpm, writel_relaxed(buf[i], RPM_REQ_REG(rpm, res->target_id + i)); bitmap_set((unsigned long *)sel_mask, res->select_id, 1); - for (i = 0; i < ARRAY_SIZE(sel_mask); i++) { + for (i = 0; i < rpm->data->sel_size; i++) { writel_relaxed(sel_mask[i], - RPM_CTRL_REG(rpm, RPM_REQ_SELECT + i)); + RPM_CTRL_REG(rpm, rpm->data->req_sel_off + i)); } - writel_relaxed(BIT(state), RPM_CTRL_REG(rpm, RPM_REQUEST_CONTEXT)); + writel_relaxed(BIT(state), RPM_CTRL_REG(rpm, rpm->data->req_ctx_off)); reinit_completion(&rpm->ack); regmap_write(rpm->ipc_regmap, rpm->ipc_offset, BIT(rpm->ipc_bit)); @@ -426,10 +447,11 @@ static irqreturn_t qcom_rpm_ack_interrupt(int irq, void *dev) u32 ack; int 
i; - ack = readl_relaxed(RPM_CTRL_REG(rpm, RPM_ACK_CONTEXT)); - for (i = 0; i < RPM_SELECT_SIZE; i++) - writel_relaxed(0, RPM_CTRL_REG(rpm, RPM_ACK_SELECTOR + i)); - writel(0, RPM_CTRL_REG(rpm, RPM_ACK_CONTEXT)); + ack = readl_relaxed(RPM_CTRL_REG(rpm, rpm->data->ack_ctx_off)); + for (i = 0; i < rpm->data->sel_size; i++) + writel_relaxed(0, + RPM_CTRL_REG(rpm, rpm->data->ack_sel_off + i)); + writel(0, RPM_CTRL_REG(rpm, rpm->data->ack_ctx_off)); if (ack & RPM_NOTIFICATION) { dev_warn(rpm->dev, "ignoring notification!\n"); From 43ef3b69698bfcd23c6608b511cd055bee491df1 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 22 Jun 2016 08:27:17 +0200 Subject: [PATCH 181/813] mfd: qcom_rpm: Parametrize also ack selector size commit f37be01e6dc606f2fcc5e95c9933d948ce19bd35 upstream. The RPM has two sets of selectors (IPC bit fields): request and acknowledge. Apparently, some models use 4*32 select bits for requests and some use 7*32, but all use 7*32 bits for the acknowledge words. So on the models with 4*32 request select bits, you can apparently send 4*32 kinds of messages but get 7*32 different replies, so on an ACK interrupt, 7*32 bits (seven 32-bit words) need to be read. This is how the vendor code apparently works. Reported-by: Stephen Boyd Signed-off-by: Linus Walleij Reviewed-by: Bjorn Andersson Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- drivers/mfd/qcom_rpm.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/mfd/qcom_rpm.c b/drivers/mfd/qcom_rpm.c index 66f00c6427d3..a867cc91657e 100644 --- a/drivers/mfd/qcom_rpm.c +++ b/drivers/mfd/qcom_rpm.c @@ -39,7 +39,8 @@ struct qcom_rpm_data { unsigned int req_sel_off; unsigned int ack_ctx_off; unsigned int ack_sel_off; - unsigned int sel_size; + unsigned int req_sel_size; + unsigned int ack_sel_size; }; struct qcom_rpm { @@ -162,7 +163,8 @@ static const struct qcom_rpm_data apq8064_template = { .req_sel_off = 11, .ack_ctx_off = 15, .ack_sel_off = 23, - .sel_size = 4, + .req_sel_size = 4, + .ack_sel_size = 7, }; static const struct qcom_rpm_resource msm8660_rpm_resource_table[] = { @@ -250,7 +252,8 @@ static const struct qcom_rpm_data msm8660_template = { .req_sel_off = 11, .ack_ctx_off = 19, .ack_sel_off = 27, - .sel_size = 7, + .req_sel_size = 7, + .ack_sel_size = 7, }; static const struct qcom_rpm_resource msm8960_rpm_resource_table[] = { @@ -337,7 +340,8 @@ static const struct qcom_rpm_data msm8960_template = { .req_sel_off = 11, .ack_ctx_off = 15, .ack_sel_off = 23, - .sel_size = 4, + .req_sel_size = 4, + .ack_sel_size = 7, }; static const struct qcom_rpm_resource ipq806x_rpm_resource_table[] = { @@ -382,7 +386,8 @@ static const struct qcom_rpm_data ipq806x_template = { .req_sel_off = 11, .ack_ctx_off = 15, .ack_sel_off = 23, - .sel_size = 4, + .req_sel_size = 4, + .ack_sel_size = 7, }; static const struct of_device_id qcom_rpm_of_match[] = { @@ -419,7 +424,7 @@ int qcom_rpm_write(struct qcom_rpm *rpm, writel_relaxed(buf[i], RPM_REQ_REG(rpm, res->target_id + i)); bitmap_set((unsigned long *)sel_mask, res->select_id, 1); - for (i = 0; i < rpm->data->sel_size; i++) { + for (i = 0; i < rpm->data->req_sel_size; i++) { writel_relaxed(sel_mask[i], RPM_CTRL_REG(rpm, rpm->data->req_sel_off + i)); } @@ -448,7 +453,7 @@ static irqreturn_t qcom_rpm_ack_interrupt(int irq, void *dev) int i; ack = readl_relaxed(RPM_CTRL_REG(rpm, rpm->data->ack_ctx_off)); - for (i = 0; i < rpm->data->sel_size; i++) + for (i = 0; i < rpm->data->ack_sel_size; i++) writel_relaxed(0, RPM_CTRL_REG(rpm,
rpm->data->ack_sel_off + i)); writel(0, RPM_CTRL_REG(rpm, rpm->data->ack_ctx_off)); From ddc35199f2ad8e30c3a018e5a71c56dc60a791cd Mon Sep 17 00:00:00 2001 From: Matthew Leach Date: Fri, 8 Jul 2016 09:04:27 -0300 Subject: [PATCH 182/813] media: usbtv: prevent access to free'd resources commit 2a00932f082aff93c3a55426e0c7af6d0ec03997 upstream. When disconnecting the usbtv device, the sound card is unregistered from ALSA and the snd member of the usbtv struct is set to NULL. If the usbtv snd_trigger work is running, this can cause a race condition where the kernel will attempt to access free'd resources, shown in [1]. This patch fixes the disconnection code by cancelling any snd_trigger work before unregistering the sound card from ALSA and checking that the snd member still exists in the work function. [1]: usb 3-1.2: USB disconnect, device number 6 BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [] process_one_work+0x30/0x480 PGD 405bbf067 PUD 405bbe067 PMD 0 Call Trace: [] worker_thread+0x48/0x4e0 [] ? process_one_work+0x480/0x480 [] ? process_one_work+0x480/0x480 [] kthread+0xd8/0xf0 [] ret_from_fork+0x22/0x40 [] ? kthread_worker_fn+0x170/0x170 ---[ end trace 0f3dac5c1a38e610 ]--- Signed-off-by: Matthew Leach Tested-by: Peter Sutton Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/usb/usbtv/usbtv-audio.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/media/usb/usbtv/usbtv-audio.c b/drivers/media/usb/usbtv/usbtv-audio.c index 78c12d22dfbb..5dab02432e82 100644 --- a/drivers/media/usb/usbtv/usbtv-audio.c +++ b/drivers/media/usb/usbtv/usbtv-audio.c @@ -278,6 +278,9 @@ static void snd_usbtv_trigger(struct work_struct *work) { struct usbtv *chip = container_of(work, struct usbtv, snd_trigger); + if (!chip->snd) + return; + if (atomic_read(&chip->snd_stream)) usbtv_audio_start(chip); else @@ -378,6 +381,8 @@ err: void usbtv_audio_free(struct usbtv *usbtv) { + cancel_work_sync(&usbtv->snd_trigger); + if (usbtv->snd && usbtv->udev) { snd_card_free(usbtv->snd); usbtv->snd = NULL; From 9e416faea82dd9231228ba5025c9d62f041e7726 Mon Sep 17 00:00:00 2001 From: Soeren Moch Date: Wed, 11 May 2016 13:49:11 -0300 Subject: [PATCH 183/813] media: dvb_ringbuffer: Add memory barriers commit ca6e6126db5494f18c6c6615060d4d803b528bff upstream. Implement memory barriers according to Documentation/circular-buffers.txt: - use smp_store_release() to update ringbuffer read/write pointers - use smp_load_acquire() to load write pointer on reader side - use ACCESS_ONCE() to load read pointer on writer side This fixes data stream corruptions observed e.g. on an ARM Cortex-A9 quad core system with different types (PCI, USB) of DVB tuners. 
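Condensed to its essence, the pairing the patch establishes looks like this (a sketch following Documentation/circular-buffers.txt, not the complete driver code):

	/* writer side: publish the data, then the write pointer */
	memcpy(rbuf->data + rbuf->pwrite, buf, todo);
	smp_store_release(&rbuf->pwrite,
			  (rbuf->pwrite + todo) % rbuf->size);

	/* reader side: observe the write pointer, then read the data */
	avail = smp_load_acquire(&rbuf->pwrite) - rbuf->pread;
	if (avail < 0)
		avail += rbuf->size;

The release/acquire pair guarantees that a reader which sees the new write pointer also sees the bytes written before it, which is exactly the ordering the corrupted streams were missing.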
Signed-off-by: Soeren Moch Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/dvb-core/dvb_ringbuffer.c | 74 ++++++++++++++++++++----- 1 file changed, 61 insertions(+), 13 deletions(-) diff --git a/drivers/media/dvb-core/dvb_ringbuffer.c b/drivers/media/dvb-core/dvb_ringbuffer.c index 1100e98a7b1d..7df7fb3738a0 100644 --- a/drivers/media/dvb-core/dvb_ringbuffer.c +++ b/drivers/media/dvb-core/dvb_ringbuffer.c @@ -55,7 +55,13 @@ void dvb_ringbuffer_init(struct dvb_ringbuffer *rbuf, void *data, size_t len) int dvb_ringbuffer_empty(struct dvb_ringbuffer *rbuf) { - return (rbuf->pread==rbuf->pwrite); + /* smp_load_acquire() to load write pointer on reader side + * this pairs with smp_store_release() in dvb_ringbuffer_write(), + * dvb_ringbuffer_write_user(), or dvb_ringbuffer_reset() + * + * for memory barriers also see Documentation/circular-buffers.txt + */ + return (rbuf->pread == smp_load_acquire(&rbuf->pwrite)); } @@ -64,7 +70,12 @@ ssize_t dvb_ringbuffer_free(struct dvb_ringbuffer *rbuf) { ssize_t free; - free = rbuf->pread - rbuf->pwrite; + /* ACCESS_ONCE() to load read pointer on writer side + * this pairs with smp_store_release() in dvb_ringbuffer_read(), + * dvb_ringbuffer_read_user(), dvb_ringbuffer_flush(), + * or dvb_ringbuffer_reset() + */ + free = ACCESS_ONCE(rbuf->pread) - rbuf->pwrite; if (free <= 0) free += rbuf->size; return free-1; @@ -76,7 +87,11 @@ ssize_t dvb_ringbuffer_avail(struct dvb_ringbuffer *rbuf) { ssize_t avail; - avail = rbuf->pwrite - rbuf->pread; + /* smp_load_acquire() to load write pointer on reader side + * this pairs with smp_store_release() in dvb_ringbuffer_write(), + * dvb_ringbuffer_write_user(), or dvb_ringbuffer_reset() + */ + avail = smp_load_acquire(&rbuf->pwrite) - rbuf->pread; if (avail < 0) avail += rbuf->size; return avail; @@ -86,14 +101,25 @@ ssize_t dvb_ringbuffer_avail(struct dvb_ringbuffer *rbuf) void dvb_ringbuffer_flush(struct dvb_ringbuffer *rbuf) { - rbuf->pread = rbuf->pwrite; + /* dvb_ringbuffer_flush() counts as read operation + * smp_load_acquire() to load write pointer + * smp_store_release() to update read pointer, this ensures that the + * correct pointer is visible for subsequent dvb_ringbuffer_free() + * calls on other cpu cores + */ + smp_store_release(&rbuf->pread, smp_load_acquire(&rbuf->pwrite)); rbuf->error = 0; } EXPORT_SYMBOL(dvb_ringbuffer_flush); void dvb_ringbuffer_reset(struct dvb_ringbuffer *rbuf) { - rbuf->pread = rbuf->pwrite = 0; + /* dvb_ringbuffer_reset() counts as read and write operation + * smp_store_release() to update read pointer + */ + smp_store_release(&rbuf->pread, 0); + /* smp_store_release() to update write pointer */ + smp_store_release(&rbuf->pwrite, 0); rbuf->error = 0; } @@ -119,12 +145,17 @@ ssize_t dvb_ringbuffer_read_user(struct dvb_ringbuffer *rbuf, u8 __user *buf, si return -EFAULT; buf += split; todo -= split; - rbuf->pread = 0; + /* smp_store_release() for read pointer update to ensure + * that buf is not overwritten until read is complete, + * this pairs with ACCESS_ONCE() in dvb_ringbuffer_free() + */ + smp_store_release(&rbuf->pread, 0); } if (copy_to_user(buf, rbuf->data+rbuf->pread, todo)) return -EFAULT; - rbuf->pread = (rbuf->pread + todo) % rbuf->size; + /* smp_store_release() to update read pointer, see above */ + smp_store_release(&rbuf->pread, (rbuf->pread + todo) % rbuf->size); return len; } @@ -139,11 +170,16 @@ void dvb_ringbuffer_read(struct dvb_ringbuffer *rbuf, u8 *buf, size_t len) memcpy(buf, rbuf->data+rbuf->pread, split); buf += 
split; todo -= split; - rbuf->pread = 0; + /* smp_store_release() for read pointer update to ensure + * that buf is not overwritten until read is complete, + * this pairs with ACCESS_ONCE() in dvb_ringbuffer_free() + */ + smp_store_release(&rbuf->pread, 0); } memcpy(buf, rbuf->data+rbuf->pread, todo); - rbuf->pread = (rbuf->pread + todo) % rbuf->size; + /* smp_store_release() to update read pointer, see above */ + smp_store_release(&rbuf->pread, (rbuf->pread + todo) % rbuf->size); } @@ -158,10 +194,16 @@ ssize_t dvb_ringbuffer_write(struct dvb_ringbuffer *rbuf, const u8 *buf, size_t memcpy(rbuf->data+rbuf->pwrite, buf, split); buf += split; todo -= split; - rbuf->pwrite = 0; + /* smp_store_release() for write pointer update to ensure that + * written data is visible on other cpu cores before the pointer + * update, this pairs with smp_load_acquire() in + * dvb_ringbuffer_empty() or dvb_ringbuffer_avail() + */ + smp_store_release(&rbuf->pwrite, 0); } memcpy(rbuf->data+rbuf->pwrite, buf, todo); - rbuf->pwrite = (rbuf->pwrite + todo) % rbuf->size; + /* smp_store_release() for write pointer update, see above */ + smp_store_release(&rbuf->pwrite, (rbuf->pwrite + todo) % rbuf->size); return len; } @@ -181,12 +223,18 @@ ssize_t dvb_ringbuffer_write_user(struct dvb_ringbuffer *rbuf, return len - todo; buf += split; todo -= split; - rbuf->pwrite = 0; + /* smp_store_release() for write pointer update to ensure that + * written data is visible on other cpu cores before the pointer + * update, this pairs with smp_load_acquire() in + * dvb_ringbuffer_empty() or dvb_ringbuffer_avail() + */ + smp_store_release(&rbuf->pwrite, 0); } status = copy_from_user(rbuf->data+rbuf->pwrite, buf, todo); if (status) return len - todo; - rbuf->pwrite = (rbuf->pwrite + todo) % rbuf->size; + /* smp_store_release() for write pointer update, see above */ + smp_store_release(&rbuf->pwrite, (rbuf->pwrite + todo) % rbuf->size); return len; } From 175845952ef0b36f89032fe4bad455e898571f52 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Sun, 3 Apr 2016 16:31:03 -0300 Subject: [PATCH 184/813] videobuf2-v4l2: Verify planes array in buffer dequeueing commit 83934b75c368f529d084815c463a7ef781dc9751 upstream. When a buffer is being dequeued using VIDIOC_DQBUF IOCTL, the exact buffer which will be dequeued is not known until the buffer has been removed from the queue. The number of planes is specific to a buffer, not to the queue. This does lead to the situation where multi-plane buffers may be requested and queued with n planes, but VIDIOC_DQBUF IOCTL may be passed an argument struct with fewer planes. __fill_v4l2_buffer() however uses the number of planes from the dequeued videobuf2 buffer, overwriting kernel memory (the m.planes array allocated in video_usercopy() in v4l2-ioctl.c) if the user provided fewer planes than the dequeued buffer had. Oops! 
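The overwrite can be pictured with a short sketch (illustrative pseudo-context only; b is the user-supplied v4l2_buffer, vb the dequeued vb2 buffer, and fill_plane() a placeholder, not a videobuf2 function):

	/* Without the check: the plane count comes from the dequeued
	 * buffer and may exceed b->length, the number of entries in the
	 * m.planes array that video_usercopy() actually allocated.
	 */
	for (plane = 0; plane < vb->num_planes; ++plane)
		fill_plane(&b->m.planes[plane], &vb->planes[plane]);

	/* With the fix, the planes array is verified before the buffer
	 * is removed from the done list:
	 */
	if (b->length < vb->num_planes)
		return -EINVAL;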
Fixes: b0e0e1f83de3 ("[media] media: videobuf2: Prepare to divide videobuf2") Signed-off-by: Sakari Ailus Acked-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/v4l2-core/videobuf2-v4l2.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/media/v4l2-core/videobuf2-v4l2.c b/drivers/media/v4l2-core/videobuf2-v4l2.c index 502984c724ff..6c441be8f893 100644 --- a/drivers/media/v4l2-core/videobuf2-v4l2.c +++ b/drivers/media/v4l2-core/videobuf2-v4l2.c @@ -67,6 +67,11 @@ static int __verify_planes_array(struct vb2_buffer *vb, const struct v4l2_buffer return 0; } +static int __verify_planes_array_core(struct vb2_buffer *vb, const void *pb) +{ + return __verify_planes_array(vb, pb); +} + /** * __verify_length() - Verify that the bytesused value for each plane fits in * the plane length and that the data offset doesn't exceed the bytesused value. @@ -432,6 +437,7 @@ static int __fill_vb2_buffer(struct vb2_buffer *vb, } static const struct vb2_buf_ops v4l2_buf_ops = { + .verify_planes_array = __verify_planes_array_core, .fill_user_buffer = __fill_v4l2_buffer, .fill_vb2_buffer = __fill_vb2_buffer, .set_timestamp = __set_timestamp, From 33201bcbc309af1d90372b078b8cc5017e5fff33 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 11 May 2016 18:44:32 -0300 Subject: [PATCH 185/813] vb2: core: Skip planes array verification if pb is NULL commit 126f40298446a82116e1f92a1aaf72b8c8228fae upstream. An earlier patch fixing an input validation issue introduced another issue: vb2_core_dqbuf() is called with pb argument value NULL in some cases, causing a NULL pointer dereference. Fix this by skipping the verification as there's nothing to verify. Fixes: e7e0c3e26587 ("[media] videobuf2-core: Check user space planes array in dqbuf") Signed-off-by: David R Signed-off-by: Sakari Ailus Reviewed-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/v4l2-core/videobuf2-core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/media/v4l2-core/videobuf2-core.c b/drivers/media/v4l2-core/videobuf2-core.c index 11f39791ec33..47f37683893a 100644 --- a/drivers/media/v4l2-core/videobuf2-core.c +++ b/drivers/media/v4l2-core/videobuf2-core.c @@ -1505,7 +1505,7 @@ static int __vb2_get_done_vb(struct vb2_queue *q, struct vb2_buffer **vb, void *pb, int nonblocking) { unsigned long flags; - int ret; + int ret = 0; /* * Wait for at least one buffer to become available on the done_list. @@ -1521,10 +1521,12 @@ static int __vb2_get_done_vb(struct vb2_queue *q, struct vb2_buffer **vb, spin_lock_irqsave(&q->done_lock, flags); *vb = list_first_entry(&q->done_list, struct vb2_buffer, done_entry); /* - * Only remove the buffer from done_list if v4l2_buffer can handle all - * the planes. + * Only remove the buffer from done_list if all planes can be + * handled. Some cases such as V4L2 file I/O and DVB have pb + * == NULL; skip the check then as there's nothing to verify. 
*/ - ret = call_bufop(q, verify_planes_array, *vb, pb); + if (pb) + ret = call_bufop(q, verify_planes_array, *vb, pb); if (!ret) list_del(&(*vb)->done_entry); spin_unlock_irqrestore(&q->done_lock, flags); From 8dbd7a3684fa0c33ed9448a237e1b0ac2cb3d52e Mon Sep 17 00:00:00 2001 From: Jonathan McDowell Date: Sat, 14 May 2016 14:01:26 -0300 Subject: [PATCH 186/813] Fix RC5 decoding with Fintek CIR chipset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit bbdb34c90aeb8b2253eae88029788ebe1d7f2fd4 upstream. Fix RC5 decoding with Fintek CIR chipset Commit e87b540be2dd02552fb9244d50ae8b4e4619a34b tightened up the RC5 decoding by adding a check for trailing silence to ensure a valid RC5 command had been received. Unfortunately the trailer length checked was 10 units and the Fintek CIR device does not want to provide details of a space longer than 6350us. This meant that RC5 remotes working on a Fintek setup on 3.16 failed on 3.17 and later. Fix this by shortening the trailer check to 6 units (allowing for a previous space in the received remote command). Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=117221 Signed-off-by: Jonathan McDowell Signed-off-by: David Härdeman Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/ir-rc5-decoder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/rc/ir-rc5-decoder.c b/drivers/media/rc/ir-rc5-decoder.c index 84fa6e9b59a1..67314c034cdb 100644 --- a/drivers/media/rc/ir-rc5-decoder.c +++ b/drivers/media/rc/ir-rc5-decoder.c @@ -29,7 +29,7 @@ #define RC5_BIT_START (1 * RC5_UNIT) #define RC5_BIT_END (1 * RC5_UNIT) #define RC5X_SPACE (4 * RC5_UNIT) -#define RC5_TRAILER (10 * RC5_UNIT) /* In reality, approx 100 */ +#define RC5_TRAILER (6 * RC5_UNIT) /* In reality, approx 100 */ enum rc5_state { STATE_INACTIVE, From f8ad7cb60062a5a0514eb720b44f164276ebc4e0 Mon Sep 17 00:00:00 2001 From: Florian Echtler Date: Tue, 31 May 2016 17:15:32 -0300 Subject: [PATCH 187/813] sur40: lower poll interval to fix occasional FPS drops to ~56 FPS commit af766ee005c496b8567976dc3eed7676443ed6de upstream. The framerate sometimes drops below 60 Hz if the poll interval is too high. Lowering it to the minimum of 1 ms fixes this. Signed-off-by: Martin Kaltenbrunner Signed-off-by: Florian Echtler Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/input/touchscreen/sur40.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/sur40.c b/drivers/input/touchscreen/sur40.c index d214f22ed305..07a262bddf57 100644 --- a/drivers/input/touchscreen/sur40.c +++ b/drivers/input/touchscreen/sur40.c @@ -126,7 +126,7 @@ struct sur40_image_header { #define VIDEO_PACKET_SIZE 16384 /* polling interval (ms) */ -#define POLL_INTERVAL 4 +#define POLL_INTERVAL 1 /* maximum number of contacts FIXME: this is a guess? */ #define MAX_CONTACTS 64 From 1fe16eaeb94068fbe002ec75cb4799a2474108fd Mon Sep 17 00:00:00 2001 From: Florian Echtler Date: Tue, 31 May 2016 17:15:33 -0300 Subject: [PATCH 188/813] sur40: fix occasional oopses on device close commit 6a8588156657e607fcfdffd46c1daae8ba88a1e5 upstream. Closing the V4L2 device sometimes triggers a kernel oops. Present patch fixes this. 
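The oops comes from a completion path touching queue state after teardown has begun. The fix below makes the video thread simply return once the stream is marked dead and, crucially, waits for all in-flight buffers before teardown proceeds. A rough userspace analogue of that teardown rule, under assumed names (a sketch of the pattern, not the sur40 code):

	#include <pthread.h>
	#include <stdbool.h>

	struct stream {
		pthread_mutex_t lock;
		pthread_cond_t idle;
		int in_flight;		/* buffers currently owned by the worker */
		bool stopping;
	};

	/* worker side: called when one queued buffer is fully processed */
	static void stream_complete_one(struct stream *s)
	{
		pthread_mutex_lock(&s->lock);
		if (--s->in_flight == 0)
			pthread_cond_signal(&s->idle);
		pthread_mutex_unlock(&s->lock);
	}

	/* teardown side: mirrors the vb2_wait_for_all_buffers() call below */
	static void stream_stop(struct stream *s)
	{
		pthread_mutex_lock(&s->lock);
		s->stopping = true;	/* the worker checks this and bails out */
		while (s->in_flight > 0)
			pthread_cond_wait(&s->idle, &s->lock);
		pthread_mutex_unlock(&s->lock);
		/* only now is it safe to release buffers the worker used */
	}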
Signed-off-by: Martin Kaltenbrunner Signed-off-by: Florian Echtler Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/input/touchscreen/sur40.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/sur40.c b/drivers/input/touchscreen/sur40.c index 07a262bddf57..45b466e3bbe8 100644 --- a/drivers/input/touchscreen/sur40.c +++ b/drivers/input/touchscreen/sur40.c @@ -441,7 +441,7 @@ static void sur40_process_video(struct sur40_state *sur40) /* return error if streaming was stopped in the meantime */ if (sur40->sequence == -1) - goto err_poll; + return; /* mark as finished */ v4l2_get_timestamp(&new_buf->vb.timestamp); @@ -730,6 +730,7 @@ static int sur40_start_streaming(struct vb2_queue *vq, unsigned int count) static void sur40_stop_streaming(struct vb2_queue *vq) { struct sur40_state *sur40 = vb2_get_drv_priv(vq); + vb2_wait_for_all_buffers(vq); sur40->sequence = -1; /* Release all active buffers */ From fb76628b66f88b6c8206fa906f524362869b5c03 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 2 Aug 2016 13:07:20 -0400 Subject: [PATCH 189/813] dm: set DMF_SUSPENDED* _before_ clearing DMF_NOFLUSH_SUSPENDING commit eaf9a7361f47727b166688a9f2096854eef60fbe upstream. Otherwise, there is potential for both DMF_SUSPENDED* and DMF_NOFLUSH_SUSPENDING to not be set during dm_suspend() -- which is definitely _not_ a valid state. This fix, in conjunction with "dm rq: fix the starting and stopping of blk-mq queues", addresses the potential for request-based DM multipath's __multipath_map() to see !dm_noflush_suspending() during suspend. Reported-by: Bart Van Assche Signed-off-by: Mike Snitzer Signed-off-by: Greg Kroah-Hartman --- drivers/md/dm.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c338aebb4ccd..a42729ebf272 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -3078,7 +3078,8 @@ static void unlock_fs(struct mapped_device *md) * Caller must hold md->suspend_lock */ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, - unsigned suspend_flags, int interruptible) + unsigned suspend_flags, int interruptible, + int dmf_suspended_flag) { bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; @@ -3145,6 +3146,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, * to finish. */ r = dm_wait_for_completion(md, interruptible); + if (!r) + set_bit(dmf_suspended_flag, &md->flags); if (noflush) clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); @@ -3206,12 +3209,10 @@ retry: map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); - r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE); + r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED); if (r) goto out_unlock; - set_bit(DMF_SUSPENDED, &md->flags); - dm_table_postsuspend_targets(map); out_unlock: @@ -3305,9 +3306,8 @@ static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_fla * would require changing .presuspend to return an error -- avoid this * until there is a need for more elaborate variants of internal suspend.
*/ - (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE); - - set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE, + DMF_SUSPENDED_INTERNALLY); dm_table_postsuspend_targets(map); } From fdec508104e768a64ac0a7d8571ceb52689deed2 Mon Sep 17 00:00:00 2001 From: Alex Hung Date: Mon, 13 Jun 2016 19:44:00 +0800 Subject: [PATCH 190/813] hp-wmi: Fix wifi cannot be hard-unblocked commit fc8a601e1175ae351f662506030f9939cb7fdbfe upstream. Several users reported that wifi cannot be unblocked, as discussed in [1]. This patch removes the use of the BIOS 2009 flag and uses the actual WMI function calls instead - the call is skipped if WMI reports it as unsupported. [1] https://bugzilla.kernel.org/show_bug.cgi?id=69131 Signed-off-by: Alex Hung Tested-by: Evgenii Shatokhin Signed-off-by: Darren Hart Signed-off-by: Greg Kroah-Hartman --- drivers/platform/x86/hp-wmi.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/hp-wmi.c b/drivers/platform/x86/hp-wmi.c index fb4dd7b3ee71..af2046c87806 100644 --- a/drivers/platform/x86/hp-wmi.c +++ b/drivers/platform/x86/hp-wmi.c @@ -723,6 +723,11 @@ static int __init hp_wmi_rfkill_setup(struct platform_device *device) if (err) return err; + err = hp_wmi_perform_query(HPWMI_WIRELESS_QUERY, 1, &wireless, + sizeof(wireless), 0); + if (err) + return err; + if (wireless & 0x1) { wifi_rfkill = rfkill_alloc("hp-wifi", &device->dev, RFKILL_TYPE_WLAN, @@ -910,7 +915,7 @@ static int __init hp_wmi_bios_setup(struct platform_device *device) gps_rfkill = NULL; rfkill2_count = 0; - if (hp_wmi_bios_2009_later() || hp_wmi_rfkill_setup(device)) + if (hp_wmi_rfkill_setup(device)) hp_wmi_rfkill2_setup(device); err = device_create_file(&device->dev, &dev_attr_display); From bddb6876b6e40d9c787b04087bcb2de2cf044fb6 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Tue, 3 May 2016 16:27:16 -0400 Subject: [PATCH 191/813] s5p-mfc: Set device name for reserved memory region devs commit 29debab0a94035a390801d1f177d171d014b7765 upstream. The devices don't have a name set, so dev_name() returns NULL, which makes it harder to identify the devices that are causing issues, for example: WARNING: CPU: 2 PID: 616 at drivers/base/core.c:251 device_release+0x8c/0x90 Device '(null)' does not have a release() function, it is broken and must be fixed. And after setting the device name: WARNING: CPU: 0 PID: 591 at drivers/base/core.c:251 device_release+0x8c/0x90 Device 's5p-mfc-l' does not have a release() function, it is broken and must be fixed.
Fixes: 6e83e6e25eb4 ("[media] s5p-mfc: Fix kernel warning on memory init") Signed-off-by: Javier Martinez Canillas Tested-by: Marek Szyprowski Signed-off-by: Sylwester Nawrocki Signed-off-by: Greg Kroah-Hartman --- drivers/media/platform/s5p-mfc/s5p_mfc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/media/platform/s5p-mfc/s5p_mfc.c b/drivers/media/platform/s5p-mfc/s5p_mfc.c index 3ffe2ecfd5ef..539e30a098e2 100644 --- a/drivers/media/platform/s5p-mfc/s5p_mfc.c +++ b/drivers/media/platform/s5p-mfc/s5p_mfc.c @@ -1041,6 +1041,8 @@ static int s5p_mfc_alloc_memdevs(struct s5p_mfc_dev *dev) mfc_err("Not enough memory\n"); return -ENOMEM; } + + dev_set_name(dev->mem_dev_l, "%s", "s5p-mfc-l"); device_initialize(dev->mem_dev_l); of_property_read_u32_array(dev->plat_dev->dev.of_node, "samsung,mfc-l", mem_info, 2); @@ -1058,6 +1060,8 @@ static int s5p_mfc_alloc_memdevs(struct s5p_mfc_dev *dev) mfc_err("Not enough memory\n"); return -ENOMEM; } + + dev_set_name(dev->mem_dev_r, "%s", "s5p-mfc-r"); device_initialize(dev->mem_dev_r); of_property_read_u32_array(dev->plat_dev->dev.of_node, "samsung,mfc-r", mem_info, 2); From 31edf03971c2dd1189024f87b349f66a473aeb4b Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Tue, 3 May 2016 16:27:17 -0400 Subject: [PATCH 192/813] s5p-mfc: Add release callback for memory region devs commit 6311f1261f59ce5e51fbe5cc3b5e7737197316ac upstream. When s5p_mfc_remove() calls put_device() for the reserved memory region devs, the driver core warns that the dev doesn't have a release callback: WARNING: CPU: 0 PID: 591 at drivers/base/core.c:251 device_release+0x8c/0x90 Device 's5p-mfc-l' does not have a release() function, it is broken and must be fixed. Also, the DMA memory declared with dma_declare_coherent_memory() isn't released, so add a dev .release that calls dma_release_declared_memory().
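Taken together, this patch and the previous one give the reserved-memory devices the two things the driver core expects from any struct device that will eventually see put_device(): a name and a release callback. A generic sketch of that shape, with made-up names rather than the s5p-mfc ones:

	#include <linux/device.h>
	#include <linux/slab.h>

	static void example_memdev_release(struct device *dev)
	{
		/* undo what the init path set up; here the device is just freed */
		kfree(dev);
	}

	static struct device *example_memdev_create(struct device *parent,
						    const char *name)
	{
		struct device *dev = kzalloc(sizeof(*dev), GFP_KERNEL);

		if (!dev)
			return NULL;

		dev_set_name(dev, "%s", name);	/* otherwise dev_name() is NULL */
		dev->parent = parent;
		dev->release = example_memdev_release;
		device_initialize(dev);
		return dev;	/* dropped later via put_device(dev) */
	}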
Fixes: 6e83e6e25eb4 ("[media] s5p-mfc: Fix kernel warning on memory init") Signed-off-by: Javier Martinez Canillas Tested-by: Marek Szyprowski Signed-off-by: Sylwester Nawrocki Signed-off-by: Greg Kroah-Hartman --- drivers/media/platform/s5p-mfc/s5p_mfc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/media/platform/s5p-mfc/s5p_mfc.c b/drivers/media/platform/s5p-mfc/s5p_mfc.c index 539e30a098e2..c8946f98ced4 100644 --- a/drivers/media/platform/s5p-mfc/s5p_mfc.c +++ b/drivers/media/platform/s5p-mfc/s5p_mfc.c @@ -1029,6 +1029,11 @@ static int match_child(struct device *dev, void *data) return !strcmp(dev_name(dev), (char *)data); } +static void s5p_mfc_memdev_release(struct device *dev) +{ + dma_release_declared_memory(dev); +} + static void *mfc_get_drv_data(struct platform_device *pdev); static int s5p_mfc_alloc_memdevs(struct s5p_mfc_dev *dev) @@ -1043,6 +1048,7 @@ static int s5p_mfc_alloc_memdevs(struct s5p_mfc_dev *dev) } dev_set_name(dev->mem_dev_l, "%s", "s5p-mfc-l"); + dev->mem_dev_l->release = s5p_mfc_memdev_release; device_initialize(dev->mem_dev_l); of_property_read_u32_array(dev->plat_dev->dev.of_node, "samsung,mfc-l", mem_info, 2); @@ -1062,6 +1068,7 @@ static int s5p_mfc_alloc_memdevs(struct s5p_mfc_dev *dev) } dev_set_name(dev->mem_dev_r, "%s", "s5p-mfc-r"); + dev->mem_dev_r->release = s5p_mfc_memdev_release; device_initialize(dev->mem_dev_r); of_property_read_u32_array(dev->plat_dev->dev.of_node, "samsung,mfc-r", mem_info, 2); From 3f32fd3a3a49780994d4f1b32816adfba75777cb Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Sat, 16 Jul 2016 02:36:38 +0300 Subject: [PATCH 193/813] i2c: efm32: fix a failure path in efm32_i2c_probe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7dd91d52a813f99a95d20f539b777e9e6198b931 upstream. There is only one failure path in efm32_i2c_probe() where clk_disable_unprepare() is missed. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Acked-by: Uwe Kleine-König Signed-off-by: Wolfram Sang Fixes: 1b5b23718b84 ("i2c: efm32: new bus driver") Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-efm32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-efm32.c b/drivers/i2c/busses/i2c-efm32.c index 8eff62738877..e253598d764c 100644 --- a/drivers/i2c/busses/i2c-efm32.c +++ b/drivers/i2c/busses/i2c-efm32.c @@ -433,7 +433,7 @@ static int efm32_i2c_probe(struct platform_device *pdev) ret = request_irq(ddata->irq, efm32_i2c_irq, 0, DRIVER_NAME, ddata); if (ret < 0) { dev_err(&pdev->dev, "failed to request irq (%d)\n", ret); - return ret; + goto err_disable_clk; } ret = i2c_add_adapter(&ddata->adapter); From 663c26074f48d08683877dc8b4256dc12ffcfe0e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 6 Jul 2016 12:08:11 +0300 Subject: [PATCH 194/813] spi: pxa2xx: Clear all RFT bits in reset_sccr1() on Intel Quark commit 152bc19e2fc2b7fce7ffbc2a9cea94b147223702 upstream. It seems the commit e5262d0568dc ("spi: spi-pxa2xx: SPI support for Intel Quark X1000") misses one place to be adapted for Intel Quark, i.e. in reset_sccr1(). Clear all RFT bits when calling reset_sccr1() on Intel Quark.
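The point of the fix is that the RX FIFO threshold field lives in different SSCR1 bits on Quark X1000 than on the other SSP variants, so the clear-mask has to be selected per type. A compilable sketch of that dispatch follows; the mask values here are placeholders, the real SSCR1_RFT/QUARK_X1000_SSCR1_RFT definitions come from the pxa2xx SSP register headers.

	#include <stdint.h>

	#define SSCR1_RFT		(0xfu << 10)	/* placeholder value */
	#define QUARK_X1000_SSCR1_RFT	(0x1fu << 11)	/* placeholder value */

	enum ssp_type { PXA25x_SSP, QUARK_X1000_SSP };

	/* clear every RX-threshold bit for this variant; the caller then
	 * ORs in the per-chip threshold, exactly as reset_sccr1() does */
	static uint32_t clear_rx_threshold(enum ssp_type type, uint32_t sccr1)
	{
		switch (type) {
		case QUARK_X1000_SSP:
			return sccr1 & ~QUARK_X1000_SSCR1_RFT;
		default:
			return sccr1 & ~SSCR1_RFT;
		}
	}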
Fixes: e5262d0568dc ("spi: spi-pxa2xx: SPI support for Intel Quark X1000") Signed-off-by: Andy Shevchenko Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- drivers/spi/spi-pxa2xx.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c index 73c8ea0b1360..3cac73e4c3e4 100644 --- a/drivers/spi/spi-pxa2xx.c +++ b/drivers/spi/spi-pxa2xx.c @@ -548,7 +548,14 @@ static void reset_sccr1(struct driver_data *drv_data) u32 sccr1_reg; sccr1_reg = pxa2xx_spi_read(drv_data, SSCR1) & ~drv_data->int_cr1; - sccr1_reg &= ~SSCR1_RFT; + switch (drv_data->ssp_type) { + case QUARK_X1000_SSP: + sccr1_reg &= ~QUARK_X1000_SSCR1_RFT; + break; + default: + sccr1_reg &= ~SSCR1_RFT; + break; + } sccr1_reg |= chip->threshold; pxa2xx_spi_write(drv_data, SSCR1, sccr1_reg); } From 5c93b99d27ffe7d1a3c6d43429bd1966b4c353dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Thu, 14 Jul 2016 10:50:23 +0200 Subject: [PATCH 195/813] Bluetooth: Fix l2cap_sock_setsockopt() with optname BT_RCVMTU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 23bc6ab0a0912146fd674a0becc758c3162baabc upstream. When we retrieve the imtu value from userspace we should use a 16-bit pointer cast instead of a 32-bit one, as it's defined that way in the headers. This fixes setsockopt calls on big-endian platforms. Signed-off-by: Amadeusz Sławiński Signed-off-by: Marcel Holtmann Signed-off-by: Greg Kroah-Hartman --- net/bluetooth/l2cap_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 1bb551527044..d9bbbded49ef 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -927,7 +927,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (get_user(opt, (u32 __user *) optval)) { + if (get_user(opt, (u16 __user *) optval)) { err = -EFAULT; break; } From 02808fd9e78d31e8e3e41cab06acc0a1153b0ef6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 10 Jun 2016 10:28:38 +0200 Subject: [PATCH 196/813] EDAC: Correct channel count limit commit bba142957e04c400440d2df83c1b3b2dfc42e220 upstream. c44696fff04f ("EDAC: Remove arbitrary limit on number of channels") lifted the arbitrary limit on memory controller channels in EDAC. However, the dynamic channel attributes dynamic_csrow_dimm_attr and dynamic_csrow_ce_count_attr remained capped at 6. This wasn't a problem except channels 6 and 7 weren't visible in sysfs on machines with more than 6 channels after the conversion to static attr groups with 2c1946b6d629 ("EDAC: Use static attribute groups for managing sysfs entries") [ without that, we're exploding in edac_create_sysfs_mci_device() because we're dereferencing out of the bounds of the dynamic_csrow_dimm_attr array. ] Add attributes for channels 6 and 7 along with a guard for the future, should more channels be required and/or to sanity check for misconfigured machines. We still need to check against the number of channels present on the MC first, as Thor reported.
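The guard added below can be read as a two-level visibility test: first against the channels the memory controller actually has, then against the size of the static attribute table itself (which ends in a NULL terminator, hence the '- 1'). A minimal standalone illustration of the same check, with invented names:

	#include <stdio.h>

	#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

	static const char *chan_attrs[] = {
		"ch0", "ch1", "ch2", "ch3", "ch4", "ch5", "ch6", "ch7",
		NULL			/* terminator, not a real slot */
	};

	static int chan_is_visible(unsigned int idx, unsigned int nr_channels)
	{
		if (idx >= nr_channels)		/* channel not present on this MC */
			return 0;
		if (idx >= ARRAY_SIZE(chan_attrs) - 1) {	/* beyond the table */
			fprintf(stderr, "unexpected channel idx %u\n", idx);
			return 0;
		}
		return 1;
	}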
Signed-off-by: Borislav Petkov Reported-by: Hironobu Ishii Tested-by: Thor Thayer Signed-off-by: Greg Kroah-Hartman --- drivers/edac/edac_mc_sysfs.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 58aed67b7eba..3c8f19f5ac81 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -313,7 +313,6 @@ static struct device_type csrow_attr_type = { * possible dynamic channel DIMM Label attribute files * */ - DEVICE_CHANNEL(ch0_dimm_label, S_IRUGO | S_IWUSR, channel_dimm_label_show, channel_dimm_label_store, 0); DEVICE_CHANNEL(ch1_dimm_label, S_IRUGO | S_IWUSR, @@ -326,6 +325,10 @@ DEVICE_CHANNEL(ch4_dimm_label, S_IRUGO | S_IWUSR, channel_dimm_label_show, channel_dimm_label_store, 4); DEVICE_CHANNEL(ch5_dimm_label, S_IRUGO | S_IWUSR, channel_dimm_label_show, channel_dimm_label_store, 5); +DEVICE_CHANNEL(ch6_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 6); +DEVICE_CHANNEL(ch7_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 7); /* Total possible dynamic DIMM Label attribute file table */ static struct attribute *dynamic_csrow_dimm_attr[] = { @@ -335,6 +338,8 @@ static struct attribute *dynamic_csrow_dimm_attr[] = { &dev_attr_legacy_ch3_dimm_label.attr.attr, &dev_attr_legacy_ch4_dimm_label.attr.attr, &dev_attr_legacy_ch5_dimm_label.attr.attr, + &dev_attr_legacy_ch6_dimm_label.attr.attr, + &dev_attr_legacy_ch7_dimm_label.attr.attr, NULL }; @@ -351,6 +356,10 @@ DEVICE_CHANNEL(ch4_ce_count, S_IRUGO, channel_ce_count_show, NULL, 4); DEVICE_CHANNEL(ch5_ce_count, S_IRUGO, channel_ce_count_show, NULL, 5); +DEVICE_CHANNEL(ch6_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 6); +DEVICE_CHANNEL(ch7_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 7); /* Total possible dynamic ce_count attribute file table */ static struct attribute *dynamic_csrow_ce_count_attr[] = { @@ -360,6 +369,8 @@ static struct attribute *dynamic_csrow_ce_count_attr[] = { &dev_attr_legacy_ch3_ce_count.attr.attr, &dev_attr_legacy_ch4_ce_count.attr.attr, &dev_attr_legacy_ch5_ce_count.attr.attr, + &dev_attr_legacy_ch6_ce_count.attr.attr, + &dev_attr_legacy_ch7_ce_count.attr.attr, NULL }; @@ -371,9 +382,16 @@ static umode_t csrow_dev_is_visible(struct kobject *kobj, if (idx >= csrow->nr_channels) return 0; + + if (idx >= ARRAY_SIZE(dynamic_csrow_ce_count_attr) - 1) { + WARN_ONCE(1, "idx: %d\n", idx); + return 0; + } + /* Only expose populated DIMMs */ if (!csrow->channels[idx]->dimm->nr_pages) return 0; + return attr->mode; } From 7a2cfda740e60097c1369c274d2772af9d073b76 Mon Sep 17 00:00:00 2001 From: Roderick Colenbrander Date: Wed, 18 May 2016 13:11:09 -0700 Subject: [PATCH 197/813] HID: uhid: fix timeout when probe races with IO commit 67f8ecc550b5bda03335f845dc869b8501d25fd0 upstream. Many devices use userspace bluetooth stacks like BlueZ or Bluedroid in combination with uhid. If any of these stacks is used with a HID device for which the driver performs a HID request as part of .probe (or technically another HID operation), this results in a deadlock situation. The deadlock results in a 5 second timeout for I/O operations in HID drivers, so isn't fatal, but none of the I/O operations have a chance of succeeding. The root cause for the problem is that uhid only allows for one request to be processed at a time per uhid instance and locks out other operations.
This means that when user space creates a new HID device through 'UHID_CREATE', which ultimately triggers '.probe' through the HID layer, any HID request (e.g. a read for calibration data) would trigger a HID operation on uhid again, but it won't go out to userspace, because it is still stuck in UHID_CREATE. In addition, bluetooth stacks are typically single threaded, so they wouldn't be able to handle any requests while waiting on uhid. Luckily the UHID spec is somewhat flexible and allows for fixing the issue without breaking user space. The idea which the patch implements, as discussed with David Herrmann, is to decouple adding of a hid device (which triggers .probe) from UHID_CREATE. The work will kick off roughly once UHID_CREATE completed (or else will wait a tiny bit of time in .probe for a lock). A HID driver has to call 'hid_hw_start()' as part of .probe once it is ready for I/O, which triggers UHID_START to user space. Any HID operations should function now within .probe and won't deadlock because userspace is stuck on UHID_CREATE. We verified this patch on Bluedroid with Android 6.0 and on desktop Linux with BlueZ stacks. Prior to the patch they had the deadlock issue. [jkosina@suse.cz: reword subject] Signed-off-by: Roderick Colenbrander Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/uhid.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c index e094c572b86e..1a2032c2c1fb 100644 --- a/drivers/hid/uhid.c +++ b/drivers/hid/uhid.c @@ -51,10 +51,26 @@ struct uhid_device { u32 report_id; u32 report_type; struct uhid_event report_buf; + struct work_struct worker; }; static struct miscdevice uhid_misc; +static void uhid_device_add_worker(struct work_struct *work) +{ + struct uhid_device *uhid = container_of(work, struct uhid_device, worker); + int ret; + + ret = hid_add_device(uhid->hid); + if (ret) { + hid_err(uhid->hid, "Cannot register HID device: error %d\n", ret); + + hid_destroy_device(uhid->hid); + uhid->hid = NULL; + uhid->running = false; + } +} + static void uhid_queue(struct uhid_device *uhid, struct uhid_event *ev) { __u8 newhead; @@ -498,18 +514,14 @@ static int uhid_dev_create2(struct uhid_device *uhid, uhid->hid = hid; uhid->running = true; - ret = hid_add_device(hid); - if (ret) { - hid_err(hid, "Cannot register HID device\n"); - goto err_hid; - } + /* Adding of a HID device is done through a worker, to allow HID drivers + * which use feature requests during .probe to work; without this they + * would be blocked on devlock, which is held by uhid_char_write.
+ */ + schedule_work(&uhid->worker); return 0; -err_hid: - hid_destroy_device(hid); - uhid->hid = NULL; - uhid->running = false; err_free: kfree(uhid->rd_data); uhid->rd_data = NULL; @@ -550,6 +562,8 @@ static int uhid_dev_destroy(struct uhid_device *uhid) uhid->running = false; wake_up_interruptible(&uhid->report_wait); + cancel_work_sync(&uhid->worker); + hid_destroy_device(uhid->hid); kfree(uhid->rd_data); @@ -612,6 +626,7 @@ static int uhid_char_open(struct inode *inode, struct file *file) init_waitqueue_head(&uhid->waitq); init_waitqueue_head(&uhid->report_wait); uhid->running = false; + INIT_WORK(&uhid->worker, uhid_device_add_worker); file->private_data = uhid; nonseekable_open(inode, file); From 54c4ddcbab7396c58e1fc745663417ad7e872137 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 29 Jul 2016 12:05:24 +0200 Subject: [PATCH 198/813] ovl: disallow overlayfs as upperdir commit 76bc8e2843b66f8205026365966b49ec6da39ae7 upstream. This does not work and does not make sense. So instead of fixing it (probably not hard) just disallow. Reported-by: Andrei Vagin Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index a1acc6004a91..70a7bbe199d0 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -376,7 +376,8 @@ static struct ovl_entry *ovl_alloc_entry(unsigned int numlower) static bool ovl_dentry_remote(struct dentry *dentry) { return dentry->d_flags & - (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE); + (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | + DCACHE_OP_REAL); } static bool ovl_dentry_weird(struct dentry *dentry) From 3633bd8220a778e41de3e75930aa21f8a51a1629 Mon Sep 17 00:00:00 2001 From: Dave Gerlach Date: Wed, 25 May 2016 15:41:28 -0500 Subject: [PATCH 199/813] remoteproc: Fix potential race condition in rproc_add commit d2e12e66a939c54ed84e5f1b6947f0c45f6c56eb upstream. rproc_add adds the newly created remoteproc to a list for use by rproc_get_by_phandle and then does some additional processing to finish adding the remoteproc. This leaves a small window of time in which the rproc is available in the list but not yet fully initialized, so if another driver comes along and gets a handle to the rproc, it will be invalid. Rearrange the code in rproc_add to make sure the rproc is added to the list only after it has been successfully initialized.
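The reordering below is an instance of a general rule: an object may be published to a lookup structure only after every field a lookup user could touch is valid. A condensed kernel-style sketch of the fixed ordering, with generic names standing in for the rproc ones and an assumed init helper:

	#include <linux/list.h>
	#include <linux/mutex.h>

	static LIST_HEAD(obj_list);
	static DEFINE_MUTEX(obj_list_mutex);

	struct obj {
		struct list_head node;
		/* ... everything a lookup user may dereference ... */
	};

	static int obj_add(struct obj *o)
	{
		int ret;

		ret = obj_init_fully(o);	/* assumed init helper */
		if (ret)
			return ret;

		/* only now make the object findable by concurrent lookups */
		mutex_lock(&obj_list_mutex);
		list_add(&o->node, &obj_list);
		mutex_unlock(&obj_list_mutex);
		return 0;
	}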
Fixes: fec47d863587 ("remoteproc: introduce rproc_get_by_phandle API") Signed-off-by: Dave Gerlach Signed-off-by: Bjorn Andersson Signed-off-by: Greg Kroah-Hartman --- drivers/remoteproc/remoteproc_core.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 9e03d158f411..4f7ce0097191 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1239,11 +1239,6 @@ int rproc_add(struct rproc *rproc) if (ret < 0) return ret; - /* expose to rproc_get_by_phandle users */ - mutex_lock(&rproc_list_mutex); - list_add(&rproc->node, &rproc_list); - mutex_unlock(&rproc_list_mutex); - dev_info(dev, "%s is available\n", rproc->name); dev_info(dev, "Note: remoteproc is still under development and considered experimental.\n"); @@ -1251,8 +1246,16 @@ int rproc_add(struct rproc *rproc) /* create debugfs entries */ rproc_create_debug_dir(rproc); + ret = rproc_add_virtio_devices(rproc); + if (ret < 0) + return ret; - return rproc_add_virtio_devices(rproc); + /* expose to rproc_get_by_phandle users */ + mutex_lock(&rproc_list_mutex); + list_add(&rproc->node, &rproc_list); + mutex_unlock(&rproc_list_mutex); + + return 0; } EXPORT_SYMBOL(rproc_add); From f2aa5d3771351ed45cf9f5ce73bc4695a09318be Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 28 Jul 2016 11:35:50 -0700 Subject: [PATCH 200/813] ARC: mm: don't lose PTE_SPECIAL in pte_modify() commit 3925a16ae980c79d1a8fd182d7f9487da1edd4dc upstream. LTP madvise05 was generating an mm splat | [ARCLinux]# /sd/ltp/testcases/bin/madvise05 | BUG: Bad page map in process madvise05 pte:80e08211 pmd:9f7d4000 | page:9fdcfc90 count:1 mapcount:-1 mapping: (null) index:0x0 flags: 0x404(referenced|reserved) | page dumped because: bad pte | addr:200b8000 vm_flags:00000070 anon_vma: (null) mapping: (null) index:1005c | file: (null) fault: (null) mmap: (null) readpage: (null) | CPU: 2 PID: 6707 Comm: madvise05 And for newer kernels, the system was rendered unusable afterwards. The problem was mprotect->pte_modify() clearing PTE_SPECIAL (which is set to identify the special zero page wired to the pte). When the pte was finally unmapped, the special casing for the zero page was not done, and instead it was treated as a "normal" page, tripping on the map counts etc. This fixes ARC STAR 9001053308 Signed-off-by: Vineet Gupta Signed-off-by: Greg Kroah-Hartman --- arch/arc/include/asm/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 57af2f05ae84..3cab04255ae0 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -110,7 +110,7 @@ #define ___DEF (_PAGE_PRESENT | _PAGE_CACHEABLE) /* Set of bits not changed in pte_modify */ -#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) +#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_SPECIAL) /* More Abbrevaited helpers */ #define PAGE_U_NONE __pgprot(___DEF) From 564e0f8b22814e1b811bbc77953c9554e2a08328 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 30 Jun 2016 11:49:01 -0400 Subject: [PATCH 201/813] jbd2: make journal y2038 safe commit abcfb5d979892fc8b12574551fc907c05fe1b11b upstream. The jbd2 journal stores the commit time in 64-bit seconds and 32-bit nanoseconds, which avoids an overflow in 2038, but it gets the numbers from current_kernel_time(), which uses 'long' seconds on 32-bit architectures.
This simply changes the code to call current_kernel_time64() so we use 64-bit seconds consistently. Signed-off-by: Arnd Bergmann Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/jbd2/commit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 36345fefa3ff..2d964ce45606 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -124,7 +124,7 @@ static int journal_submit_commit_record(journal_t *journal, struct commit_header *tmp; struct buffer_head *bh; int ret; - struct timespec now = current_kernel_time(); + struct timespec64 now = current_kernel_time64(); *cbh = NULL; From a636a9b1306587bbfab54b1e435461289a4c2c35 Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Wed, 25 May 2016 19:59:09 +0200 Subject: [PATCH 202/813] fs/cifs: make share unaccessible at root level mountable commit a6b5058fafdf508904bbf16c29b24042cef3c496 upstream. if, when mounting //HOST/share/sub/dir/foo we can query /sub/dir/foo but not any of the path components above: - store the /sub/dir/foo prefix in the cifs super_block info - in the superblock, set root dentry to the subpath dentry (instead of the share root) - set a flag in the superblock to remember it - use prefixpath when building path from a dentry fixes bso#8950 Signed-off-by: Aurelien Aptel Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifs_fs_sb.h | 4 ++++ fs/cifs/cifsfs.c | 14 ++++++++++++- fs/cifs/connect.c | 49 ++++++++++++++++++++++++++++++++++++++++++++ fs/cifs/dir.c | 20 ++++++++++++++++-- fs/cifs/inode.c | 22 ++++++++++++++++++-- 5 files changed, 104 insertions(+), 5 deletions(-) diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 3182273a3407..1418daa03d95 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -46,6 +46,9 @@ #define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */ #define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */ #define CIFS_MOUNT_MAP_SFM_CHR 0x800000 /* SFM/MAC mapping for illegal chars */ +#define CIFS_MOUNT_USE_PREFIX_PATH 0x1000000 /* make subpath with unaccessible + * root mountable + */ struct cifs_sb_info { struct rb_root tlink_tree; @@ -67,5 +70,6 @@ struct cifs_sb_info { struct backing_dev_info bdi; struct delayed_work prune_tlinks; struct rcu_head rcu; + char *prepath; }; #endif /* _CIFS_FS_SB_H */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index cbc0f4bca0c0..450578097fb7 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -686,6 +686,14 @@ cifs_do_mount(struct file_system_type *fs_type, goto out_cifs_sb; } + if (volume_info->prepath) { + cifs_sb->prepath = kstrdup(volume_info->prepath, GFP_KERNEL); + if (cifs_sb->prepath == NULL) { + root = ERR_PTR(-ENOMEM); + goto out_cifs_sb; + } + } + cifs_setup_cifs_sb(volume_info, cifs_sb); rc = cifs_mount(cifs_sb, volume_info); @@ -724,7 +732,11 @@ cifs_do_mount(struct file_system_type *fs_type, sb->s_flags |= MS_ACTIVE; } - root = cifs_get_root(volume_info, sb); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) + root = dget(sb->s_root); + else + root = cifs_get_root(volume_info, sb); + if (IS_ERR(root)) goto out_super; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5481a6eb9a95..61c3a5ab8637 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3517,6 +3517,44 @@ cifs_get_volume_info(char *mount_data, const char *devname) return volume_info; } +static int +cifs_are_all_path_components_accessible(struct TCP_Server_Info 
*server, + unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + char *full_path) +{ + int rc; + char *s; + char sep, tmp; + + sep = CIFS_DIR_SEP(cifs_sb); + s = full_path; + + rc = server->ops->is_path_accessible(xid, tcon, cifs_sb, ""); + while (rc == 0) { + /* skip separators */ + while (*s == sep) + s++; + if (!*s) + break; + /* next separator */ + while (*s && *s != sep) + s++; + + /* + * temporarily null-terminate the path at the end of + * the current component + */ + tmp = *s; + *s = 0; + rc = server->ops->is_path_accessible(xid, tcon, cifs_sb, + full_path); + *s = tmp; + } + return rc; +} + int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) { @@ -3654,6 +3692,16 @@ remote_path_check: kfree(full_path); goto mount_fail_check; } + + rc = cifs_are_all_path_components_accessible(server, + xid, tcon, cifs_sb, + full_path); + if (rc != 0) { + cifs_dbg(VFS, "cannot query dirs between root and final path, " + "enabling CIFS_MOUNT_USE_PREFIX_PATH\n"); + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; + rc = 0; + } kfree(full_path); } @@ -3923,6 +3971,7 @@ cifs_umount(struct cifs_sb_info *cifs_sb) bdi_destroy(&cifs_sb->bdi); kfree(cifs_sb->mountdata); + kfree(cifs_sb->prepath); call_rcu(&cifs_sb->rcu, delayed_free); } diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index c3eb998a99bd..b95bffcee8aa 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -84,6 +84,7 @@ build_path_from_dentry(struct dentry *direntry) struct dentry *temp; int namelen; int dfsplen; + int pplen = 0; char *full_path; char dirsep; struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); @@ -95,8 +96,12 @@ build_path_from_dentry(struct dentry *direntry) dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); else dfsplen = 0; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) + pplen = cifs_sb->prepath ? strlen(cifs_sb->prepath) + 1 : 0; + cifs_bp_rename_retry: - namelen = dfsplen; + namelen = dfsplen + pplen; seq = read_seqbegin(&rename_lock); rcu_read_lock(); for (temp = direntry; !IS_ROOT(temp);) { @@ -137,7 +142,7 @@ cifs_bp_rename_retry: } } rcu_read_unlock(); - if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) { + if (namelen != dfsplen + pplen || read_seqretry(&rename_lock, seq)) { cifs_dbg(FYI, "did not end path lookup where expected. 
namelen=%ddfsplen=%d\n", namelen, dfsplen); /* presumably this is only possible if racing with a rename @@ -153,6 +158,17 @@ cifs_bp_rename_retry: those safely to '/' if any are found in the middle of the prepath */ /* BB test paths to Windows with '/' in the midst of prepath */ + if (pplen) { + int i; + + cifs_dbg(FYI, "using cifs_sb prepath <%s>\n", cifs_sb->prepath); + memcpy(full_path+dfsplen+1, cifs_sb->prepath, pplen-1); + full_path[dfsplen] = '\\'; + for (i = 0; i < pplen-1; i++) + if (full_path[dfsplen+1+i] == '/') + full_path[dfsplen+1+i] = CIFS_DIR_SEP(cifs_sb); + } + if (dfsplen) { strncpy(full_path, tcon->treeName, dfsplen); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a329f5ba35aa..9cdeb0293267 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -982,10 +982,26 @@ struct inode *cifs_root_iget(struct super_block *sb) struct inode *inode = NULL; long rc; struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + char *path = NULL; + int len; + + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) + && cifs_sb->prepath) { + len = strlen(cifs_sb->prepath); + path = kzalloc(len + 2 /* leading sep + null */, GFP_KERNEL); + if (path == NULL) + return ERR_PTR(-ENOMEM); + path[0] = '/'; + memcpy(path+1, cifs_sb->prepath, len); + } else { + path = kstrdup("", GFP_KERNEL); + if (path == NULL) + return ERR_PTR(-ENOMEM); + } xid = get_xid(); if (tcon->unix_ext) { - rc = cifs_get_inode_info_unix(&inode, "", sb, xid); + rc = cifs_get_inode_info_unix(&inode, path, sb, xid); /* some servers mistakenly claim POSIX support */ if (rc != -EOPNOTSUPP) goto iget_no_retry; @@ -993,7 +1009,8 @@ struct inode *cifs_root_iget(struct super_block *sb) tcon->unix_ext = false; } - rc = cifs_get_inode_info(&inode, "", NULL, sb, xid, NULL); + convert_delimiter(path, CIFS_DIR_SEP(cifs_sb)); + rc = cifs_get_inode_info(&inode, path, NULL, sb, xid, NULL); iget_no_retry: if (!inode) { @@ -1022,6 +1039,7 @@ iget_no_retry: } out: + kfree(path); /* can not call macro free_xid here since in a void func * TODO: This is no longer true */ From 36e6321056ba24f004bfc16d4398e65a6651f843 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Thu, 7 Jul 2016 21:28:27 +0100 Subject: [PATCH 203/813] cifs: Check for existing directory when opening file with O_CREAT commit 8d9535b6efd86e6c07da59f97e68f44efb7fe080 upstream. When opening a file with O_CREAT flag, check to see if the file opened is an existing directory. This prevents the directory from being opened which subsequently causes a crash when the close function for directories cifs_closedir() is called which frees up the file->private_data memory while the file is still listed on the open file list for the tcon. 
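The expected semantics here are plain POSIX: open(2) with O_CREAT and a writable mode on an existing directory must fail with EISDIR rather than hand back a directory masquerading as a regular file. A small local demonstration against any filesystem (the path is illustrative):

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/stat.h>
	#include <sys/types.h>

	int main(void)
	{
		int fd;

		mkdir("/tmp/demo_dir", 0755);	/* make sure the directory exists */

		fd = open("/tmp/demo_dir", O_CREAT | O_RDWR, 0644);
		if (fd < 0)
			printf("open failed as expected: %s\n", strerror(errno));
		else
			printf("BUG: opened a directory for writing, fd=%d\n", fd);
		return 0;
	}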
Signed-off-by: Sachin Prabhu Signed-off-by: Steve French Reported-by: Xiaoli Feng Signed-off-by: Greg Kroah-Hartman --- fs/cifs/dir.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index b95bffcee8aa..26a3b389a265 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -245,6 +245,13 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, goto cifs_create_get_file_info; } + if (S_ISDIR(newinode->i_mode)) { + CIFSSMBClose(xid, tcon, fid->netfid); + iput(newinode); + rc = -EISDIR; + goto out; + } + if (!S_ISREG(newinode->i_mode)) { /* * The server may allow us to open things like @@ -415,10 +422,14 @@ cifs_create_set_dentry: if (rc != 0) { cifs_dbg(FYI, "Create worked, get_inode_info failed rc = %d\n", rc); - if (server->ops->close) - server->ops->close(xid, tcon, fid); - goto out; + goto out_err; } + + if (S_ISDIR(newinode->i_mode)) { + rc = -EISDIR; + goto out_err; + } + d_drop(direntry); d_add(direntry, newinode); @@ -426,6 +437,13 @@ out: kfree(buf); kfree(full_path); return rc; + +out_err: + if (server->ops->close) + server->ops->close(xid, tcon, fid); + if (newinode) + iput(newinode); + goto out; } int From 047617448daecf05e57498d8697acd3dbd38672d Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Tue, 19 Jul 2016 09:26:21 +0200 Subject: [PATCH 204/813] cifs: fix crash due to race in hmac(md5) handling commit bd975d1eead2558b76e1079e861eacf1f678b73b upstream. The secmech hmac(md5) structures are present in the TCP_Server_Info struct and can be shared among multiple CIFS sessions. However, the server mutex is not currently held when these structures are allocated and used, which can lead to a kernel crashes, as in the scenario below: mount.cifs(8) #1 mount.cifs(8) #2 Is secmech.sdeschmaccmd5 allocated? // false Is secmech.sdeschmaccmd5 allocated? // false secmech.hmacmd = crypto_alloc_shash.. secmech.sdeschmaccmd5 = kzalloc.. sdeschmaccmd5->shash.tfm = &secmec.hmacmd; secmech.sdeschmaccmd5 = kzalloc // sdeschmaccmd5->shash.tfm // not yet assigned crypto_shash_update() deref NULL sdeschmaccmd5->shash.tfm Unable to handle kernel paging request at virtual address 00000030 epc : 8027ba34 crypto_shash_update+0x38/0x158 ra : 8020f2e8 setup_ntlmv2_rsp+0x4bc/0xa84 Call Trace: crypto_shash_update+0x38/0x158 setup_ntlmv2_rsp+0x4bc/0xa84 build_ntlmssp_auth_blob+0xbc/0x34c sess_auth_rawntlmssp_authenticate+0xac/0x248 CIFS_SessSetup+0xf0/0x178 cifs_setup_session+0x4c/0x84 cifs_get_smb_ses+0x2c8/0x314 cifs_mount+0x38c/0x76c cifs_do_mount+0x98/0x440 mount_fs+0x20/0xc0 vfs_kern_mount+0x58/0x138 do_mount+0x1e8/0xccc SyS_mount+0x88/0xd4 syscall_common+0x30/0x54 Fix this by locking the srv_mutex around the code which uses these hmac(md5) structures. All the other secmech algos already have similar locking. 
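Stripped of the CIFS specifics, the bug is an unsynchronized check-then-allocate on state shared between sessions, and the fix is to keep the check, the allocation, and every use under the same mutex. A compact pthreads rendering of the fixed shape (names and sizes are illustrative):

	#include <pthread.h>
	#include <stdlib.h>

	struct server {
		pthread_mutex_t srv_mutex;	/* assumed already initialized */
		void *hmac_state;		/* shared, lazily allocated */
	};

	/*
	 * Without the lock, two mounts can both observe hmac_state == NULL
	 * and one of them can then use a half-constructed object. With the
	 * lock, allocation and the entire hash computation are serialized.
	 */
	static int compute_signature(struct server *srv)
	{
		int ret = 0;

		pthread_mutex_lock(&srv->srv_mutex);
		if (!srv->hmac_state) {
			srv->hmac_state = calloc(1, 64);	/* placeholder ctor */
			if (!srv->hmac_state)
				ret = -1;
		}
		if (!ret) {
			/* ... init/update/final steps go here, still locked ... */
		}
		pthread_mutex_unlock(&srv->srv_mutex);
		return ret;
	}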
Fixes: 95dc8dd14e2e84cc ("Limit allocation of crypto mechanisms to dialect which requires") Signed-off-by: Rabin Vincent Acked-by: Sachin Prabhu Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifsencrypt.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index e682b36a210f..4acbc390a7d6 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -731,24 +731,26 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) memcpy(ses->auth_key.response + baselen, tiblob, tilen); + mutex_lock(&ses->server->srv_mutex); + rc = crypto_hmacmd5_alloc(ses->server); if (rc) { cifs_dbg(VFS, "could not crypto alloc hmacmd5 rc %d\n", rc); - goto setup_ntlmv2_rsp_ret; + goto unlock; } /* calculate ntlmv2_hash */ rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp); if (rc) { cifs_dbg(VFS, "could not get v2 hash rc %d\n", rc); - goto setup_ntlmv2_rsp_ret; + goto unlock; } /* calculate first part of the client response (CR1) */ rc = CalcNTLMv2_response(ses, ntlmv2_hash); if (rc) { cifs_dbg(VFS, "Could not calculate CR1 rc: %d\n", rc); - goto setup_ntlmv2_rsp_ret; + goto unlock; } /* now calculate the session key for NTLMv2 */ @@ -757,13 +759,13 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) if (rc) { cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n", __func__); - goto setup_ntlmv2_rsp_ret; + goto unlock; } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); - goto setup_ntlmv2_rsp_ret; + goto unlock; } rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, @@ -771,7 +773,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) CIFS_HMAC_MD5_HASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not update with response\n", __func__); - goto setup_ntlmv2_rsp_ret; + goto unlock; } rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, @@ -779,6 +781,8 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); +unlock: + mutex_unlock(&ses->server->srv_mutex); setup_ntlmv2_rsp_ret: kfree(tiblob); From adc58bfd4d75183a65f806b1b5354d1b65f832c6 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sun, 24 Jul 2016 10:37:38 +0300 Subject: [PATCH 205/813] CIFS: Fix a possible invalid memory access in smb2_query_symlink() commit 7893242e2465aea6f2cbc2639da8fa5ce96e8cc2 upstream. During following a symbolic link we received err_buf from SMB2_open(). While the validity of SMB2 error response is checked previously in smb2_check_message() a symbolic link payload is not checked at all. Fix it by adding such checks. 
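The added checks boil down to one rule for parsing any server-controlled blob: validate that every offset/length pair, plus the fixed header, fits inside the bytes actually received before doing pointer arithmetic with it. A self-contained sketch of that validation; the struct is a stand-in for smb2_symlink_err_rsp, not its real wire layout:

	#include <stddef.h>
	#include <stdint.h>

	struct symlink_blob {		/* stand-in layout, not the wire format */
		uint16_t sub_off, sub_len;
		uint16_t print_off, print_len;
		uint8_t path_buffer[];	/* the offsets above index into this */
	};

	static int symlink_blob_valid(const struct symlink_blob *b, size_t recvd)
	{
		if (recvd < sizeof(*b))
			return 0;	/* shorter than the fixed header */
		if ((size_t)b->sub_off + b->sub_len > recvd - sizeof(*b))
			return 0;	/* substitute name escapes the buffer */
		if ((size_t)b->print_off + b->print_len > recvd - sizeof(*b))
			return 0;	/* print name escapes the buffer */
		return 1;
	}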
Cc: Dan Carpenter Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2ops.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 53ccdde6ff18..dd8543caa56e 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1039,6 +1039,9 @@ smb2_new_lease_key(struct cifs_fid *fid) get_random_bytes(fid->lease_key, SMB2_LEASE_KEY_SIZE); } +#define SMB2_SYMLINK_STRUCT_SIZE \ + (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp)) + static int smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, char **target_path, @@ -1051,7 +1054,10 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid fid; struct smb2_err_rsp *err_buf = NULL; struct smb2_symlink_err_rsp *symlink; - unsigned int sub_len, sub_offset; + unsigned int sub_len; + unsigned int sub_offset; + unsigned int print_len; + unsigned int print_offset; cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); @@ -1072,11 +1078,33 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, kfree(utf16_path); return -ENOENT; } + + if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) || + get_rfc1002_length(err_buf) + 4 < SMB2_SYMLINK_STRUCT_SIZE) { + kfree(utf16_path); + return -ENOENT; + } + /* open must fail on symlink - reset rc */ rc = 0; symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData; sub_len = le16_to_cpu(symlink->SubstituteNameLength); sub_offset = le16_to_cpu(symlink->SubstituteNameOffset); + print_len = le16_to_cpu(symlink->PrintNameLength); + print_offset = le16_to_cpu(symlink->PrintNameOffset); + + if (get_rfc1002_length(err_buf) + 4 < + SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) { + kfree(utf16_path); + return -ENOENT; + } + + if (get_rfc1002_length(err_buf) + 4 < + SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) { + kfree(utf16_path); + return -ENOENT; + } + *target_path = cifs_strndup_from_utf16( (char *)symlink->PathBuffer + sub_offset, sub_len, true, cifs_sb->local_nls); From f41fc0bfede5bbeca4f09d75c76c4db5d6c0d2ee Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 12 Jun 2016 18:11:51 -0400 Subject: [PATCH 206/813] random: initialize the non-blocking pool via add_hwgenerator_randomness() commit 3371f3da08cff4b75c1f2dce742d460539d6566d upstream. If we have a hardware RNG and are using the in-kernel rngd, we should use this to initialize the non-blocking pool so that getrandom(2) doesn't block unnecessarily. Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- drivers/char/random.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 0227b0465b40..802d0c840865 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1847,12 +1847,18 @@ void add_hwgenerator_randomness(const char *buffer, size_t count, { struct entropy_store *poolp = &input_pool; - /* Suspend writing if we're above the trickle threshold. - * We'll be woken up again once below random_write_wakeup_thresh, - * or when the calling thread is about to terminate. - */ - wait_event_interruptible(random_write_wait, kthread_should_stop() || + if (unlikely(nonblocking_pool.initialized == 0)) + poolp = &nonblocking_pool; + else { + /* Suspend writing if we're above the trickle + * threshold. 
We'll be woken up again once below + * random_write_wakeup_thresh, or when the calling + * thread is about to terminate. + */ + wait_event_interruptible(random_write_wait, + kthread_should_stop() || ENTROPY_BITS(&input_pool) <= random_write_wakeup_bits); + } mix_pool_bytes(poolp, buffer, count); credit_entropy_bits(poolp, entropy); } From 529025b1293b2af844348d700cf22fa585c1f2b6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 13 Jun 2016 10:10:51 -0400 Subject: [PATCH 207/813] random: print a warning for the first ten uninitialized random users commit 9b4d008787f864f17d008c9c15bbe8a0f7e2fc24 upstream. Since systemd is consistently using /dev/urandom before it is initialized, we can't see the other potentially dangerous users of /dev/urandom immediately after boot. So print the first ten such complaints instead. Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- drivers/char/random.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 802d0c840865..e5e1f1432e4d 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1460,12 +1460,16 @@ random_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) static ssize_t urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { + static int maxwarn = 10; int ret; - if (unlikely(nonblocking_pool.initialized == 0)) - printk_once(KERN_NOTICE "random: %s urandom read " - "with %d bits of entropy available\n", - current->comm, nonblocking_pool.entropy_total); + if (unlikely(nonblocking_pool.initialized == 0) && + maxwarn > 0) { + maxwarn--; + printk(KERN_NOTICE "random: %s: uninitialized urandom read " + "(%zd bytes read, %d bits of entropy available)\n", + current->comm, nbytes, nonblocking_pool.entropy_total); + } nbytes = min_t(size_t, nbytes, INT_MAX >> (ENTROPY_SHIFT + 3)); ret = extract_entropy_user(&nonblocking_pool, buf, nbytes); From f48dd2d0a75c6facd044c36fe86251027c12f0eb Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Mon, 2 May 2016 02:14:34 -0400 Subject: [PATCH 208/813] random: add interrupt callback to VMBus IRQ handler commit 4b44f2d18a330565227a7348844493c59366171e upstream. The Hyper-V Linux Integration Services use the VMBus implementation for communication with the Hypervisor. VMBus registers its own interrupt handler that completely bypasses the common Linux interrupt handling. This implies that the interrupt entropy collector is not triggered. This patch adds the interrupt entropy collection callback into the VMBus interrupt handler function. 
Signed-off-by: Stephan Mueller Signed-off-by: Stephan Mueller Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- drivers/char/random.c | 1 + drivers/hv/vmbus_drv.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/char/random.c b/drivers/char/random.c index e5e1f1432e4d..491a4dce13fe 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -948,6 +948,7 @@ void add_interrupt_randomness(int irq, int irq_flags) /* award one bit for the contents of the fast pool */ credit_entropy_bits(r, credit + 1); } +EXPORT_SYMBOL_GPL(add_interrupt_randomness); #ifdef CONFIG_BLOCK void add_disk_randomness(struct gendisk *disk) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index f19b6f7a467a..9b5440f6b3b4 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "hyperv_vmbus.h" static struct acpi_device *hv_acpi_dev; @@ -826,6 +827,8 @@ static void vmbus_isr(void) else tasklet_schedule(&msg_dpc); } + + add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0); } From ed7e1dd5abb39e0d6ce71e4914be365062bb1711 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 18 Aug 2016 10:05:29 +0100 Subject: [PATCH 209/813] MIPS: KVM: Fix mapped fault broken commpage handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit c604cffa93478f8888bec62b23d6073dad03d43a upstream. kvm_mips_handle_mapped_seg_tlb_fault() appears to map the guest page at virtual address 0 to PFN 0 if the guest has created its own mapping there. The intention is unclear, but it may have been an attempt to protect the zero page from being mapped to anything but the comm page in code paths you wouldn't expect from genuine commpage accesses (guest kernel mode cache instructions on that address, hitting trapping instructions when executing from that address with a coincidental TLB eviction during the KVM handling, and guest user mode accesses to that address). Fix this to check for mappings exactly at KVM_GUEST_COMMPAGE_ADDR (it may not be at address 0 since commit 42aa12e74e91 ("MIPS: KVM: Move commpage so 0x0 is unmapped")), and set the corresponding EntryLo to be interpreted as 0 (invalid). Fixes: 858dd5d45733 ("KVM/MIPS32: MMU/TLB operations for the Guest.") Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Radim Krčmář [james.hogan@imgtec.com: Backport to v3.17.y - v4.4.y] Signed-off-by: James Hogan Signed-off-by: Greg Kroah-Hartman --- arch/mips/kvm/tlb.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index aed0ac2a4972..d3c5715426c4 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -361,24 +361,31 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; struct kvm *kvm = vcpu->kvm; pfn_t pfn0, pfn1; + long tlb_lo[2]; - if ((tlb->tlb_hi & VPN2_MASK) == 0) { - pfn0 = 0; - pfn1 = 0; - } else { - if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo0) - >> PAGE_SHIFT) < 0) - return -1; + tlb_lo[0] = tlb->tlb_lo0; + tlb_lo[1] = tlb->tlb_lo1; - if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo1) - >> PAGE_SHIFT) < 0) - return -1; + /* + * The commpage address must not be mapped to anything else if the guest + * TLB contains entries nearby, or commpage accesses will break. 
+ */ + if (!((tlb->tlb_hi ^ KVM_GUEST_COMMPAGE_ADDR) & + VPN2_MASK & (PAGE_MASK << 1))) + tlb_lo[(KVM_GUEST_COMMPAGE_ADDR >> PAGE_SHIFT) & 1] = 0; - pfn0 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo0) - >> PAGE_SHIFT]; - pfn1 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo1) - >> PAGE_SHIFT]; - } + if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb_lo[0]) + >> PAGE_SHIFT) < 0) + return -1; + + if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb_lo[1]) + >> PAGE_SHIFT) < 0) + return -1; + + pfn0 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb_lo[0]) + >> PAGE_SHIFT]; + pfn1 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb_lo[1]) + >> PAGE_SHIFT]; if (hpa0) *hpa0 = pfn0 << PAGE_SHIFT; @@ -391,9 +398,9 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, kvm_mips_get_kernel_asid(vcpu) : kvm_mips_get_user_asid(vcpu)); entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) | - (tlb->tlb_lo0 & MIPS3_PG_D) | (tlb->tlb_lo0 & MIPS3_PG_V); + (tlb_lo[0] & MIPS3_PG_D) | (tlb_lo[0] & MIPS3_PG_V); entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) | - (tlb->tlb_lo1 & MIPS3_PG_D) | (tlb->tlb_lo1 & MIPS3_PG_V); + (tlb_lo[1] & MIPS3_PG_D) | (tlb_lo[1] & MIPS3_PG_V); kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc, tlb->tlb_lo0, tlb->tlb_lo1); From f26fac101c44492ca80d7ec65d484589299d8c99 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 18 Aug 2016 10:05:30 +0100 Subject: [PATCH 210/813] MIPS: KVM: Add missing gfn range check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8985d50382359e5bf118fdbefc859d0dbf6cebc7 upstream. kvm_mips_handle_mapped_seg_tlb_fault() calculates the guest frame number based on the guest TLB EntryLo values, however it is not range checked to ensure it lies within the guest_pmap. If the physical memory the guest refers to is out of range then dump the guest TLB and emit an internal error. 
Fixes: 858dd5d45733 ("KVM/MIPS32: MMU/TLB operations for the Guest.") Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Radim Krčmář [james.hogan@imgtec.com: Backport to v3.17.y - v4.4.y] Signed-off-by: James Hogan Signed-off-by: Greg Kroah-Hartman --- arch/mips/kvm/tlb.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index d3c5715426c4..59e885fa4c65 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -361,6 +361,7 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; struct kvm *kvm = vcpu->kvm; pfn_t pfn0, pfn1; + gfn_t gfn0, gfn1; long tlb_lo[2]; tlb_lo[0] = tlb->tlb_lo0; @@ -374,18 +375,24 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, VPN2_MASK & (PAGE_MASK << 1))) tlb_lo[(KVM_GUEST_COMMPAGE_ADDR >> PAGE_SHIFT) & 1] = 0; - if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb_lo[0]) - >> PAGE_SHIFT) < 0) + gfn0 = mips3_tlbpfn_to_paddr(tlb_lo[0]) >> PAGE_SHIFT; + gfn1 = mips3_tlbpfn_to_paddr(tlb_lo[1]) >> PAGE_SHIFT; + if (gfn0 >= kvm->arch.guest_pmap_npages || + gfn1 >= kvm->arch.guest_pmap_npages) { + kvm_err("%s: Invalid gfn: [%#llx, %#llx], EHi: %#lx\n", + __func__, gfn0, gfn1, tlb->tlb_hi); + kvm_mips_dump_guest_tlbs(vcpu); + return -1; + } + + if (kvm_mips_map_page(kvm, gfn0) < 0) return -1; - if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb_lo[1]) - >> PAGE_SHIFT) < 0) + if (kvm_mips_map_page(kvm, gfn1) < 0) return -1; - pfn0 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb_lo[0]) - >> PAGE_SHIFT]; - pfn1 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb_lo[1]) - >> PAGE_SHIFT]; + pfn0 = kvm->arch.guest_pmap[gfn0]; + pfn1 = kvm->arch.guest_pmap[gfn1]; if (hpa0) *hpa0 = pfn0 << PAGE_SHIFT; From e93dbb1cb6a1ed00c020f257780aac4750ef1c8e Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 18 Aug 2016 10:05:31 +0100 Subject: [PATCH 211/813] MIPS: KVM: Fix gfn range check in kseg0 tlb faults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 0741f52d1b980dbeb290afe67d88fc2928edd8ab upstream. Two consecutive gfns are loaded into host TLB, so ensure the range check isn't off by one if guest_pmap_npages is odd. 
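[editor's note: an illustrative user-space sketch of the off-by-one, not kernel code; the fault handler always loads an even/odd pair of frames into one host TLB line, so the highest guest_pmap index it touches is (gfn | 1)]

  #include <stdio.h>

  int main(void)
  {
          unsigned long npages = 5;     /* guest_pmap_npages, odd */
          unsigned long gfn = 4;        /* last valid index, even */

          /* the handler reads guest_pmap[gfn & ~1UL] and guest_pmap[gfn | 1UL] */
          printf("old check rejects: %d\n", gfn >= npages);          /* 0: passes */
          printf("odd partner index: %lu\n", gfn | 1UL);             /* 5: out of range */
          printf("new check rejects: %d\n", (gfn | 1UL) >= npages);  /* 1: caught */
          return 0;
  }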
Fixes: 858dd5d45733 ("KVM/MIPS32: MMU/TLB operations for the Guest.") Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Radim Krčmář [james.hogan@imgtec.com: Backport to v3.17.y - v4.4.y] Signed-off-by: James Hogan Signed-off-by: Greg Kroah-Hartman --- arch/mips/kvm/tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 59e885fa4c65..836b61aabf86 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -276,7 +276,7 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, } gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT); - if (gfn >= kvm->arch.guest_pmap_npages) { + if ((gfn | 1) >= kvm->arch.guest_pmap_npages) { kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__, gfn, badvaddr); kvm_mips_dump_host_tlbs(); From 4fa571ebe60f311c885b37a3dcfbf961e5fbd57d Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 18 Aug 2016 10:05:32 +0100 Subject: [PATCH 212/813] MIPS: KVM: Propagate kseg0/mapped tlb fault errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 9b731bcfdec4c159ad2e4312e25d69221709b96a upstream. Propagate errors from kvm_mips_handle_kseg0_tlb_fault() and kvm_mips_handle_mapped_seg_tlb_fault(), usually triggering an internal error since they normally indicate the guest accessed bad physical memory or the commpage in an unexpected way. Fixes: 858dd5d45733 ("KVM/MIPS32: MMU/TLB operations for the Guest.") Fixes: e685c689f3a8 ("KVM/MIPS32: Privileged instruction/target branch emulation.") Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Radim Krčmář [james.hogan@imgtec.com: Backport to v3.17.y - v4.4.y] Signed-off-by: James Hogan Signed-off-by: Greg Kroah-Hartman --- arch/mips/kvm/emulate.c | 40 ++++++++++++++++++++++++++++------------ arch/mips/kvm/tlb.c | 14 ++++++++++---- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index dc10c77b7500..d6476d11212e 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1629,8 +1629,14 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc, preempt_disable(); if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) { - if (kvm_mips_host_tlb_lookup(vcpu, va) < 0) - kvm_mips_handle_kseg0_tlb_fault(va, vcpu); + if (kvm_mips_host_tlb_lookup(vcpu, va) < 0 && + kvm_mips_handle_kseg0_tlb_fault(va, vcpu)) { + kvm_err("%s: handling mapped kseg0 tlb fault for %lx, vcpu: %p, ASID: %#lx\n", + __func__, va, vcpu, read_c0_entryhi()); + er = EMULATE_FAIL; + preempt_enable(); + goto done; + } } else if ((KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0) || KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) { int index; @@ -1665,14 +1671,19 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc, run, vcpu); preempt_enable(); goto dont_update_pc; - } else { - /* - * We fault an entry from the guest tlb to the - * shadow host TLB - */ - kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, - NULL, - NULL); + } + /* + * We fault an entry from the guest tlb to the + * shadow host TLB + */ + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, + NULL, NULL)) { + kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n", + __func__, va, index, vcpu, + read_c0_entryhi()); + er = EMULATE_FAIL; + preempt_enable(); + goto done; 
} } } else { @@ -2633,8 +2644,13 @@ enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause, * OK we have a Guest TLB entry, now inject it into the * shadow host TLB */ - kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, NULL, - NULL); + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, + NULL, NULL)) { + kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n", + __func__, va, index, vcpu, + read_c0_entryhi()); + er = EMULATE_FAIL; + } } } diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 836b61aabf86..7a7ed9ca01bb 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -808,10 +808,16 @@ uint32_t kvm_get_inst(uint32_t *opc, struct kvm_vcpu *vcpu) local_irq_restore(flags); return KVM_INVALID_INST; } - kvm_mips_handle_mapped_seg_tlb_fault(vcpu, - &vcpu->arch. - guest_tlb[index], - NULL, NULL); + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, + &vcpu->arch.guest_tlb[index], + NULL, NULL)) { + kvm_err("%s: handling mapped seg tlb fault failed for %p, index: %u, vcpu: %p, ASID: %#lx\n", + __func__, opc, index, vcpu, + read_c0_entryhi()); + kvm_mips_dump_guest_tlbs(vcpu); + local_irq_restore(flags); + return KVM_INVALID_INST; + } inst = *(opc); } local_irq_restore(flags); From 3d6562fded3ce875b8a7fc30eeed73b16366d77e Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Mon, 18 Jul 2016 10:41:57 -0400 Subject: [PATCH 213/813] nfs: don't create zero-length requests commit 149a4fddd0a72d526abbeac0c8deaab03559836a upstream. NFS doesn't expect requests with wb_bytes set to zero and may make unexpected decisions about how to handle that request at the page IO layer. Skip request creation if we won't have any wb_bytes in the request. Signed-off-by: Benjamin Coddington Signed-off-by: Alexey Dobriyan Reviewed-by: Weston Andros Adamson Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/write.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 7b9316406930..7a9b6e347249 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1261,6 +1261,9 @@ int nfs_updatepage(struct file *file, struct page *page, dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n", file, count, (long long)(page_file_offset(page) + offset)); + if (!count) + goto out; + if (nfs_can_extend_write(file, page, inode)) { count = max(count + offset, nfs_page_length(page)); offset = 0; @@ -1271,7 +1274,7 @@ int nfs_updatepage(struct file *file, struct page *page, nfs_set_pageerror(page); else __set_page_dirty_nobuffers(page); - +out: dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); return status; From 6dfc20babd386b8990c9ad99fa9e3afe875cba1f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2016 10:37:30 -0400 Subject: [PATCH 214/813] nfsd: Fix race between FREE_STATEID and LOCK commit 42691398be08bd1fe99326911a0aa31f2c041d53 upstream. When running LTP's nfslock01 test, the Linux client can send a LOCK and a FREE_STATEID request at the same time. 
The outcome is: Frame 324 R OPEN stateid [2,O] Frame 115004 C LOCK lockowner_is_new stateid [2,O] offset 672000 len 64 Frame 115008 R LOCK stateid [1,L] Frame 115012 C WRITE stateid [0,L] offset 672000 len 64 Frame 115016 R WRITE NFS4_OK Frame 115019 C LOCKU stateid [1,L] offset 672000 len 64 Frame 115022 R LOCKU NFS4_OK Frame 115025 C FREE_STATEID stateid [2,L] Frame 115026 C LOCK lockowner_is_new stateid [2,O] offset 672128 len 64 Frame 115029 R FREE_STATEID NFS4_OK Frame 115030 R LOCK stateid [3,L] Frame 115034 C WRITE stateid [0,L] offset 672128 len 64 Frame 115038 R WRITE NFS4ERR_BAD_STATEID In other words, the server returns stateid L in a successful LOCK reply, but it has already released it. Subsequent uses of stateid L fail. To address this, protect the generation check in nfsd4_free_stateid with the st_mutex. This should guarantee that only one of two outcomes occurs: either LOCK returns a fresh valid stateid, or FREE_STATEID returns NFS4ERR_LOCKS_HELD. Reported-by: Alexey Kodanev Fix-suggested-by: Jeff Layton Signed-off-by: Chuck Lever Tested-by: Alexey Kodanev Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ed2f64ca49de..38e353dfba4b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4882,6 +4882,32 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfs_ok; } +static __be32 +nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s) +{ + struct nfs4_ol_stateid *stp = openlockstateid(s); + __be32 ret; + + mutex_lock(&stp->st_mutex); + + ret = check_stateid_generation(stateid, &s->sc_stateid, 1); + if (ret) + goto out; + + ret = nfserr_locks_held; + if (check_for_locks(stp->st_stid.sc_file, + lockowner(stp->st_stateowner))) + goto out; + + release_lock_stateid(stp); + ret = nfs_ok; + +out: + mutex_unlock(&stp->st_mutex); + nfs4_put_stid(s); + return ret; +} + __be32 nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_free_stateid *free_stateid) @@ -4889,7 +4915,6 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stateid_t *stateid = &free_stateid->fr_stateid; struct nfs4_stid *s; struct nfs4_delegation *dp; - struct nfs4_ol_stateid *stp; struct nfs4_client *cl = cstate->session->se_client; __be32 ret = nfserr_bad_stateid; @@ -4908,18 +4933,9 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, ret = nfserr_locks_held; break; case NFS4_LOCK_STID: - ret = check_stateid_generation(stateid, &s->sc_stateid, 1); - if (ret) - break; - stp = openlockstateid(s); - ret = nfserr_locks_held; - if (check_for_locks(stp->st_stid.sc_file, - lockowner(stp->st_stateowner))) - break; - WARN_ON(!unhash_lock_stateid(stp)); + atomic_inc(&s->sc_count); spin_unlock(&cl->cl_lock); - nfs4_put_stid(s); - ret = nfs_ok; + ret = nfsd4_free_lock_stateid(stateid, s); goto out; case NFS4_REVOKED_DELEG_STID: dp = delegstateid(s); From 37cbe5b6d12580c6bb189dc3be418b681ce7d5a1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 11 Aug 2016 10:37:39 -0400 Subject: [PATCH 215/813] nfsd: don't return an unhashed lock stateid after taking mutex commit dd257933fa4b9fea66a1195f8a15111029810abc upstream. 
nfsd4_lock will take the st_mutex before working with the stateid it gets, but between the time when we drop the cl_lock and take the mutex, the stateid could become unhashed (a'la FREE_STATEID). If that happens the lock stateid returned to the client will be forgotten. Fix this by first moving the st_mutex acquisition into lookup_or_create_lock_state. Then, have it check to see if the lock stateid is still hashed after taking the mutex. If it's not, then put the stateid and try the find/create again. Signed-off-by: Jeff Layton Tested-by: Alexey Kodanev Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 38e353dfba4b..f7ea624780a7 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5502,7 +5502,7 @@ static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, - struct nfs4_ol_stateid **lst, bool *new) + struct nfs4_ol_stateid **plst, bool *new) { __be32 status; struct nfs4_file *fi = ost->st_stid.sc_file; @@ -5510,7 +5510,9 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_client *cl = oo->oo_owner.so_client; struct inode *inode = d_inode(cstate->current_fh.fh_dentry); struct nfs4_lockowner *lo; + struct nfs4_ol_stateid *lst; unsigned int strhashval; + bool hashed; lo = find_lockowner_str(cl, &lock->lk_new_owner); if (!lo) { @@ -5526,12 +5528,27 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, goto out; } - *lst = find_or_create_lock_stateid(lo, fi, inode, ost, new); - if (*lst == NULL) { +retry: + lst = find_or_create_lock_stateid(lo, fi, inode, ost, new); + if (lst == NULL) { status = nfserr_jukebox; goto out; } + + mutex_lock(&lst->st_mutex); + + /* See if it's still hashed to avoid race with FREE_STATEID */ + spin_lock(&cl->cl_lock); + hashed = !list_empty(&lst->st_perfile); + spin_unlock(&cl->cl_lock); + + if (!hashed) { + mutex_unlock(&lst->st_mutex); + nfs4_put_stid(&lst->st_stid); + goto retry; + } status = nfs_ok; + *plst = lst; out: nfs4_put_stateowner(&lo->lo_owner); return status; @@ -5598,8 +5615,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; status = lookup_or_create_lock_state(cstate, open_stp, lock, &lock_stp, &new); - if (status == nfs_ok) - mutex_lock(&lock_stp->st_mutex); } else { status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, From fa89ad55304a0053d6e9900bc76599d77b8abb50 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Fri, 23 Oct 2015 11:00:06 +0200 Subject: [PATCH 216/813] drm/i915: Don't complain about lack of ACPI video bios commit 78c3d5fa7354774b7c8638033d46c042ebae41fb upstream. Another CI fail we have for no reason. Totally unjustified since nothing fails at all. 
Signed-off-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/1445590806-23886-1-git-send-email-daniel.vetter@ffwll.ch Acked-by: Jani Nikula Signed-off-by: Daniel Vetter Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/intel_opregion.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_opregion.c b/drivers/gpu/drm/i915/intel_opregion.c index 6dc13c02c28e..e362a30776fa 100644 --- a/drivers/gpu/drm/i915/intel_opregion.c +++ b/drivers/gpu/drm/i915/intel_opregion.c @@ -682,7 +682,7 @@ static void intel_didl_outputs(struct drm_device *dev) } if (!acpi_video_bus) { - DRM_ERROR("No ACPI video bus found\n"); + DRM_DEBUG_KMS("No ACPI video bus found\n"); return; } From bd78d819b4abaf1b471ea6da8b440730d24ac897 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Fri, 20 May 2016 15:48:21 +0200 Subject: [PATCH 217/813] iommu/exynos: Suppress unbinding to prevent system failure commit b54b874fbaf5e024723e50dfb035a9916d6752b4 upstream. Removal of IOMMU driver cannot be done reliably, so Exynos IOMMU driver doesn't support this operation. It is essential for system operation, so it makes sense to prevent unbinding by disabling bind/unbind sysfs feature for SYSMMU controller driver to avoid kernel oops or trashing memory caused by such operation. Signed-off-by: Marek Szyprowski Reviewed-by: Krzysztof Kozlowski Signed-off-by: Joerg Roedel Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/exynos-iommu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index 97c41b8ab5d9..29a31eb9ace3 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -647,6 +647,7 @@ static struct platform_driver exynos_sysmmu_driver __refdata = { .name = "exynos-sysmmu", .of_match_table = sysmmu_of_match, .pm = &sysmmu_pm_ops, + .suppress_bind_attrs = true, } }; From 28a5f4cbb18b86ad40be5da53263bc1704dbdb6f Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 13 Jul 2016 13:53:21 +0000 Subject: [PATCH 218/813] iommu/vt-d: Return error code in domain_context_mapping_one() commit 5c365d18a73d3979db37006eaacefc0008869c0f upstream. In 'commit <55d940430ab9> ("iommu/vt-d: Get rid of domain->iommu_lock")', the error handling path is changed a little, which makes the function always return 0. This patch fixes this. Signed-off-by: Wei Yang Fixes: 55d940430ab9 ('iommu/vt-d: Get rid of domain->iommu_lock') Signed-off-by: Joerg Roedel Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/intel-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 6763a4dfed94..24d81308a1a6 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2032,7 +2032,7 @@ out_unlock: spin_unlock(&iommu->lock); spin_unlock_irqrestore(&device_domain_lock, flags); - return 0; + return ret; } struct domain_context_mapping_data { From 51e94ff8d90804ec1912fd71b2c7e22902a61585 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 7 Jul 2016 15:57:04 +0200 Subject: [PATCH 219/813] iommu/amd: Handle IOMMU_DOMAIN_DMA in ops->domain_free call-back commit cda7005ba2cbd0744fea343dd5b2aa637eba5b9e upstream. This domain type is not yet handled in the iommu_ops->domain_free() call-back. Fix that.
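[editor's note: a compressed user-space model of the shape of this fix, illustrative only; the names are stand-ins rather than the driver's real API. The point is that teardown must dispatch on the domain type so that DMA-ops allocator state is freed by the matching path]

  #include <stdlib.h>

  enum domain_type { DOMAIN_DMA, DOMAIN_UNMANAGED };

  struct domain {
          enum domain_type type;
          void *dma_state;        /* allocator state owned only by DMA-ops domains */
  };

  static void domain_free(struct domain *dom)
  {
          if (!dom)
                  return;

          switch (dom->type) {
          case DOMAIN_DMA:
                  free(dom->dma_state);   /* stands in for dma_ops_domain_free() */
                  break;
          default:
                  /* free page tables, GCR3 table, ... */
                  break;
          }
          free(dom);
  }

  int main(void)
  {
          struct domain *d = calloc(1, sizeof(*d));

          d->type = DOMAIN_DMA;
          d->dma_state = malloc(16);
          domain_free(d);
          return 0;
  }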
Fixes: 0bb6e243d7fb ('iommu/amd: Support IOMMU_DOMAIN_DMA type allocation') Signed-off-by: Joerg Roedel Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/amd_iommu.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index b9319b76a8a1..a2d054f96fda 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2970,9 +2970,7 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) static void amd_iommu_domain_free(struct iommu_domain *dom) { struct protection_domain *domain; - - if (!dom) - return; + struct dma_ops_domain *dma_dom; domain = to_pdomain(dom); @@ -2981,13 +2979,24 @@ static void amd_iommu_domain_free(struct iommu_domain *dom) BUG_ON(domain->dev_cnt != 0); - if (domain->mode != PAGE_MODE_NONE) - free_pagetable(domain); + if (!dom) + return; - if (domain->flags & PD_IOMMUV2_MASK) - free_gcr3_table(domain); + switch (dom->type) { + case IOMMU_DOMAIN_DMA: + dma_dom = domain->priv; + dma_ops_domain_free(dma_dom); + break; + default: + if (domain->mode != PAGE_MODE_NONE) + free_pagetable(domain); - protection_domain_free(domain); + if (domain->flags & PD_IOMMUV2_MASK) + free_gcr3_table(domain); + + protection_domain_free(domain); + break; + } } static void amd_iommu_detach_device(struct iommu_domain *dom, From df8eaed22590a484a64e4a6f2af266f59a692aed Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 13 Jul 2016 12:35:24 +0200 Subject: [PATCH 220/813] iommu/amd: Init unity mappings only for dma_ops domains commit b548e786ce47017107765bbeb0f100202525ea83 upstream. The default domain for a device might also be identity-mapped. In this case the kernel would crash when unity mappings are defined for the device. Fix that by making sure the domain is a dma_ops domain. Fixes: 0bb6e243d7fb ('iommu/amd: Support IOMMU_DOMAIN_DMA type allocation') Signed-off-by: Joerg Roedel Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/amd_iommu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index a2d054f96fda..e54e335d082b 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -352,9 +352,11 @@ static void init_iommu_group(struct device *dev) if (!domain) goto out; - dma_domain = to_pdomain(domain)->priv; + if (to_pdomain(domain)->flags == PD_DMA_OPS_MASK) { + dma_domain = to_pdomain(domain)->priv; + init_unity_mappings_for_device(dev, dma_domain); + } - init_unity_mappings_for_device(dev, dma_domain); out: iommu_group_put(group); } From c5612d4370098aa0528dfe20cc5698acaf94c4af Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 26 Jul 2016 15:18:54 +0200 Subject: [PATCH 221/813] iommu/amd: Update Alias-DTE in update_device_table() commit 3254de6bf74fe94c197c9f819fe62a3a3c36f073 upstream. Not doing so might cause IO-Page-Faults when a device uses an alias request-id and the alias-dte is left in a lower page-mode which does not cover the address allocated from the iova-allocator. 
Fixes: 492667dacc0a ('x86/amd-iommu: Remove amd_iommu_pd_table') Signed-off-by: Joerg Roedel Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/amd_iommu.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index e54e335d082b..0397985a2601 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2324,8 +2324,15 @@ static void update_device_table(struct protection_domain *domain) { struct iommu_dev_data *dev_data; - list_for_each_entry(dev_data, &domain->dev_list, list) + list_for_each_entry(dev_data, &domain->dev_list, list) { set_dte_entry(dev_data->devid, domain, dev_data->ats.enabled); + + if (dev_data->devid == dev_data->alias) + continue; + + /* There is an alias, update device table entry for it */ + set_dte_entry(dev_data->alias, domain, dev_data->ats.enabled); + } } static void update_domain(struct protection_domain *domain) From 53eaa3910ae67e497fb33188d515c14ed17a7a0e Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 19 Jul 2016 17:42:57 -0400 Subject: [PATCH 222/813] audit: fix a double fetch in audit_log_single_execve_arg() commit 43761473c254b45883a64441dd0bc85a42f3645c upstream. There is a double fetch problem in audit_log_single_execve_arg() where we first check the execve(2) arguments for any "bad" characters which would require hex encoding and then re-fetch the arguments for logging in the audit record[1]. Of course this leaves a window of opportunity for an unsavory application to munge with the data. This patch reworks things by only fetching the argument data once[2] into a buffer where it is scanned and logged into the audit record(s). In addition to fixing the double fetch, this patch improves on the original code in a few other ways: better handling of large arguments which require encoding, stricter record length checking, and some performance improvements (completely unverified, but we got rid of some strlen() calls, that's got to be a good thing). As part of the development of this patch, I've also created a basic regression test for the audit-testsuite, the test can be tracked on GitHub at the following link: * https://github.com/linux-audit/audit-testsuite/issues/25 [1] If you pay careful attention, there is actually a triple fetch problem due to a strnlen_user() call at the top of the function. [2] This is a tiny white lie, we do make a call to strnlen_user() prior to fetching the argument data. I don't like it, but due to the way the audit record is structured we really have no choice unless we copy the entire argument at once (which would require a rather wasteful allocation). The good news is that with this patch the kernel no longer relies on this strnlen_user() value for anything beyond recording it in the log, we also update it with a trustworthy value whenever possible.
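[editor's note: for readers unfamiliar with the bug class, a self-contained user-space model of a double fetch, illustrative only; copy_from_user() is modelled by a plain memcpy(), and the racing sibling thread is modelled by an in-line store]

  #include <stdio.h>
  #include <string.h>

  static char user_arg[32] = "harmless";  /* models memory shared with userspace */

  /* stand-in for copy_from_user(): every call re-reads the live buffer */
  static void fetch(char *dst, size_t n)
  {
          memcpy(dst, user_arg, n);
  }

  int main(void)
  {
          char checked[32], logged[32];
          int clean;

          fetch(checked, sizeof(checked));        /* fetch #1: validate */
          clean = (strpbrk(checked, "\x01\x02\x03") == NULL);

          /* in the kernel, another thread sharing the mm can rewrite the
           * argument here, after validation but before the second fetch */
          strcpy(user_arg, "\x01tampered");

          fetch(logged, sizeof(logged));          /* fetch #2: log */
          printf("validated clean=%d, but logged bytes start with 0x%02x\n",
                 clean, (unsigned char)logged[0]);
          return 0;
  }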
Reported-by: Pengfei Wang Signed-off-by: Paul Moore Signed-off-by: Greg Kroah-Hartman --- kernel/auditsc.c | 332 +++++++++++++++++++++++------------------------ 1 file changed, 164 insertions(+), 168 deletions(-) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b86cc04959de..48f45987dc6c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include "audit.h" @@ -82,7 +83,8 @@ #define AUDITSC_SUCCESS 1 #define AUDITSC_FAILURE 2 -/* no execve audit message should be longer than this (userspace limits) */ +/* no execve audit message should be longer than this (userspace limits), + * see the note near the top of audit_log_execve_info() about this value */ #define MAX_EXECVE_AUDIT_LEN 7500 /* max length to print of cmdline/proctitle value during audit */ @@ -988,184 +990,178 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, return rc; } -/* - * to_send and len_sent accounting are very loose estimates. We aren't - * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being - * within about 500 bytes (next page boundary) - * - * why snprintf? an int is up to 12 digits long. if we just assumed when - * logging that a[%d]= was going to be 16 characters long we would be wasting - * space in every audit message. In one 7500 byte message we can log up to - * about 1000 min size arguments. That comes down to about 50% waste of space - * if we didn't do the snprintf to find out how long arg_num_len was. - */ -static int audit_log_single_execve_arg(struct audit_context *context, - struct audit_buffer **ab, - int arg_num, - size_t *len_sent, - const char __user *p, - char *buf) -{ - char arg_num_len_buf[12]; - const char __user *tmp_p = p; - /* how many digits are in arg_num? 5 is the length of ' a=""' */ - size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5; - size_t len, len_left, to_send; - size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; - unsigned int i, has_cntl = 0, too_long = 0; - int ret; - - /* strnlen_user includes the null we don't want to send */ - len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1; - - /* - * We just created this mm, if we can't find the strings - * we just copied into it something is _very_ wrong. Similar - * for strings that are too long, we should not have created - * any. - */ - if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) { - send_sig(SIGKILL, current, 0); - return -1; - } - - /* walk the whole argument looking for non-ascii chars */ - do { - if (len_left > MAX_EXECVE_AUDIT_LEN) - to_send = MAX_EXECVE_AUDIT_LEN; - else - to_send = len_left; - ret = copy_from_user(buf, tmp_p, to_send); - /* - * There is no reason for this copy to be short. We just - * copied them here, and the mm hasn't been exposed to user- - * space yet. 
- */ - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; - } - buf[to_send] = '\0'; - has_cntl = audit_string_contains_control(buf, to_send); - if (has_cntl) { - /* - * hex messages get logged as 2 bytes, so we can only - * send half as much in each message - */ - max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2; - break; - } - len_left -= to_send; - tmp_p += to_send; - } while (len_left > 0); - - len_left = len; - - if (len > max_execve_audit_len) - too_long = 1; - - /* rewalk the argument actually logging the message */ - for (i = 0; len_left > 0; i++) { - int room_left; - - if (len_left > max_execve_audit_len) - to_send = max_execve_audit_len; - else - to_send = len_left; - - /* do we have space left to send this argument in this ab? */ - room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent; - if (has_cntl) - room_left -= (to_send * 2); - else - room_left -= to_send; - if (room_left < 0) { - *len_sent = 0; - audit_log_end(*ab); - *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE); - if (!*ab) - return 0; - } - - /* - * first record needs to say how long the original string was - * so we can be sure nothing was lost. - */ - if ((i == 0) && (too_long)) - audit_log_format(*ab, " a%d_len=%zu", arg_num, - has_cntl ? 2*len : len); - - /* - * normally arguments are small enough to fit and we already - * filled buf above when we checked for control characters - * so don't bother with another copy_from_user - */ - if (len >= max_execve_audit_len) - ret = copy_from_user(buf, p, to_send); - else - ret = 0; - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; - } - buf[to_send] = '\0'; - - /* actually log it */ - audit_log_format(*ab, " a%d", arg_num); - if (too_long) - audit_log_format(*ab, "[%d]", i); - audit_log_format(*ab, "="); - if (has_cntl) - audit_log_n_hex(*ab, buf, to_send); - else - audit_log_string(*ab, buf); - - p += to_send; - len_left -= to_send; - *len_sent += arg_num_len; - if (has_cntl) - *len_sent += to_send * 2; - else - *len_sent += to_send; - } - /* include the null we didn't log */ - return len + 1; -} - static void audit_log_execve_info(struct audit_context *context, struct audit_buffer **ab) { - int i, len; - size_t len_sent = 0; - const char __user *p; + long len_max; + long len_rem; + long len_full; + long len_buf; + long len_abuf; + long len_tmp; + bool require_data; + bool encode; + unsigned int iter; + unsigned int arg; + char *buf_head; char *buf; + const char __user *p = (const char __user *)current->mm->arg_start; - p = (const char __user *)current->mm->arg_start; + /* NOTE: this buffer needs to be large enough to hold all the non-arg + * data we put in the audit record for this argument (see the + * code below) ... at this point in time 96 is plenty */ + char abuf[96]; - audit_log_format(*ab, "argc=%d", context->execve.argc); + /* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the + * current value of 7500 is not as important as the fact that it + * is less than 8k, a setting of 7500 gives us plenty of wiggle + * room if we go over a little bit in the logging below */ + WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500); + len_max = MAX_EXECVE_AUDIT_LEN; - /* - * we need some kernel buffer to hold the userspace args. Just - * allocate one big one rather than allocating one of the right size - * for every single argument inside audit_log_single_execve_arg() - * should be <8k allocation so should be pretty safe. 
- */ - buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); - if (!buf) { + /* scratch buffer to hold the userspace args */ + buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); + if (!buf_head) { audit_panic("out of memory for argv string"); return; } + buf = buf_head; - for (i = 0; i < context->execve.argc; i++) { - len = audit_log_single_execve_arg(context, ab, i, - &len_sent, p, buf); - if (len <= 0) - break; - p += len; - } - kfree(buf); + audit_log_format(*ab, "argc=%d", context->execve.argc); + + len_rem = len_max; + len_buf = 0; + len_full = 0; + require_data = true; + encode = false; + iter = 0; + arg = 0; + do { + /* NOTE: we don't ever want to trust this value for anything + * serious, but the audit record format insists we + * provide an argument length for really long arguments, + * e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but + * to use strncpy_from_user() to obtain this value for + * recording in the log, although we don't use it + * anywhere here to avoid a double-fetch problem */ + if (len_full == 0) + len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1; + + /* read more data from userspace */ + if (require_data) { + /* can we make more room in the buffer? */ + if (buf != buf_head) { + memmove(buf_head, buf, len_buf); + buf = buf_head; + } + + /* fetch as much as we can of the argument */ + len_tmp = strncpy_from_user(&buf_head[len_buf], p, + len_max - len_buf); + if (len_tmp == -EFAULT) { + /* unable to copy from userspace */ + send_sig(SIGKILL, current, 0); + goto out; + } else if (len_tmp == (len_max - len_buf)) { + /* buffer is not large enough */ + require_data = true; + /* NOTE: if we are going to span multiple + * buffers force the encoding so we stand + * a chance at a sane len_full value and + * consistent record encoding */ + encode = true; + len_full = len_full * 2; + p += len_tmp; + } else { + require_data = false; + if (!encode) + encode = audit_string_contains_control( + buf, len_tmp); + /* try to use a trusted value for len_full */ + if (len_full < len_max) + len_full = (encode ? + len_tmp * 2 : len_tmp); + p += len_tmp + 1; + } + len_buf += len_tmp; + buf_head[len_buf] = '\0'; + + /* length of the buffer in the audit record? */ + len_abuf = (encode ? 
len_buf * 2 : len_buf + 2); + } + + /* write as much as we can to the audit log */ + if (len_buf > 0) { + /* NOTE: some magic numbers here - basically if we + * can't fit a reasonable amount of data into the + * existing audit buffer, flush it and start with + * a new buffer */ + if ((sizeof(abuf) + 8) > len_rem) { + len_rem = len_max; + audit_log_end(*ab); + *ab = audit_log_start(context, + GFP_KERNEL, AUDIT_EXECVE); + if (!*ab) + goto out; + } + + /* create the non-arg portion of the arg record */ + len_tmp = 0; + if (require_data || (iter > 0) || + ((len_abuf + sizeof(abuf)) > len_rem)) { + if (iter == 0) { + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d_len=%lu", + arg, len_full); + } + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d[%d]=", arg, iter++); + } else + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d=", arg); + WARN_ON(len_tmp >= sizeof(abuf)); + abuf[sizeof(abuf) - 1] = '\0'; + + /* log the arg in the audit record */ + audit_log_format(*ab, "%s", abuf); + len_rem -= len_tmp; + len_tmp = len_buf; + if (encode) { + if (len_abuf > len_rem) + len_tmp = len_rem / 2; /* encoding */ + audit_log_n_hex(*ab, buf, len_tmp); + len_rem -= len_tmp * 2; + len_abuf -= len_tmp * 2; + } else { + if (len_abuf > len_rem) + len_tmp = len_rem - 2; /* quotes */ + audit_log_n_string(*ab, buf, len_tmp); + len_rem -= len_tmp + 2; + /* don't subtract the "2" because we still need + * to add quotes to the remaining string */ + len_abuf -= len_tmp; + } + len_buf -= len_tmp; + buf += len_tmp; + } + + /* ready to move to the next argument? */ + if ((len_buf == 0) && !require_data) { + arg++; + iter = 0; + len_full = 0; + require_data = true; + encode = false; + } + } while (arg < context->execve.argc); + + /* NOTE: the caller handles the final audit_log_end() call */ + +out: + kfree(buf_head); } static void show_special(struct audit_context *context, int *call_panic) From 225ecdbca37ab7b8a6ec38e72739980c80c218e7 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 4 Jun 2016 12:58:39 +0200 Subject: [PATCH 223/813] ARM: dts: sunxi: Add a startup delay for fixed regulator enabled phys commit fc51b632c7b047c25807023b76f3877aed19c770 upstream. It seems that recent kernels have a shorter timeout when scanning for ethernet phys causing us to hit a timeout on boards where the phy's regulator gets enabled just before scanning, which leads to non working ethernet. A 10ms startup delay seems to be enough to fix it, this commit adds a 20ms startup delay just to be safe. This has been tested on a sun4i-a10-a1000 and sun5i-a10s-wobo-i5 board, both of which have non-working ethernet on recent kernels without this fix. 
Signed-off-by: Hans de Goede Signed-off-by: Maxime Ripard Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/sun4i-a10-a1000.dts | 1 + arch/arm/boot/dts/sun4i-a10-hackberry.dts | 1 + arch/arm/boot/dts/sun4i-a10-jesurun-q5.dts | 1 + arch/arm/boot/dts/sun5i-a10s-wobo-i5.dts | 1 + 4 files changed, 4 insertions(+) diff --git a/arch/arm/boot/dts/sun4i-a10-a1000.dts b/arch/arm/boot/dts/sun4i-a10-a1000.dts index 97570cb7f2fc..1d23527d4ecf 100644 --- a/arch/arm/boot/dts/sun4i-a10-a1000.dts +++ b/arch/arm/boot/dts/sun4i-a10-a1000.dts @@ -84,6 +84,7 @@ regulator-name = "emac-3v3"; regulator-min-microvolt = <3300000>; regulator-max-microvolt = <3300000>; + startup-delay-us = <20000>; enable-active-high; gpio = <&pio 7 15 GPIO_ACTIVE_HIGH>; }; diff --git a/arch/arm/boot/dts/sun4i-a10-hackberry.dts b/arch/arm/boot/dts/sun4i-a10-hackberry.dts index 2b17c5199151..6de83a6187d0 100644 --- a/arch/arm/boot/dts/sun4i-a10-hackberry.dts +++ b/arch/arm/boot/dts/sun4i-a10-hackberry.dts @@ -66,6 +66,7 @@ regulator-name = "emac-3v3"; regulator-min-microvolt = <3300000>; regulator-max-microvolt = <3300000>; + startup-delay-us = <20000>; enable-active-high; gpio = <&pio 7 19 GPIO_ACTIVE_HIGH>; }; diff --git a/arch/arm/boot/dts/sun4i-a10-jesurun-q5.dts b/arch/arm/boot/dts/sun4i-a10-jesurun-q5.dts index 7afc7a64eef1..e28f080b1fd5 100644 --- a/arch/arm/boot/dts/sun4i-a10-jesurun-q5.dts +++ b/arch/arm/boot/dts/sun4i-a10-jesurun-q5.dts @@ -80,6 +80,7 @@ regulator-name = "emac-3v3"; regulator-min-microvolt = <3300000>; regulator-max-microvolt = <3300000>; + startup-delay-us = <20000>; enable-active-high; gpio = <&pio 7 19 GPIO_ACTIVE_HIGH>; /* PH19 */ }; diff --git a/arch/arm/boot/dts/sun5i-a10s-wobo-i5.dts b/arch/arm/boot/dts/sun5i-a10s-wobo-i5.dts index 9fea918f949e..39731a78f087 100644 --- a/arch/arm/boot/dts/sun5i-a10s-wobo-i5.dts +++ b/arch/arm/boot/dts/sun5i-a10s-wobo-i5.dts @@ -79,6 +79,7 @@ regulator-name = "emac-3v3"; regulator-min-microvolt = <3300000>; regulator-max-microvolt = <3300000>; + startup-delay-us = <20000>; enable-active-high; gpio = <&pio 0 2 GPIO_ACTIVE_HIGH>; }; From 4a3e88a02e71cbc1dacfb958d8bffe0547d8fa2e Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 6 Jun 2016 15:17:20 -0400 Subject: [PATCH 224/813] netlabel: add address family checks to netlbl_{sock,req}_delattr() commit 0e0e36774081534783aa8eeb9f6fbddf98d3c061 upstream. It seems risky to always rely on the caller to ensure the socket's address family is correct before passing it to the NetLabel kAPI, especially since we see at least one LSM which didn't. Add address family checks to the *_delattr() functions to help prevent future problems. Reported-by: Maninder Singh Signed-off-by: Paul Moore Signed-off-by: Greg Kroah-Hartman --- net/netlabel/netlabel_kapi.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 28cddc85b700..bfa2b6d5b5cf 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -824,7 +824,11 @@ socket_setattr_return: */ void netlbl_sock_delattr(struct sock *sk) { - cipso_v4_sock_delattr(sk); + switch (sk->sk_family) { + case AF_INET: + cipso_v4_sock_delattr(sk); + break; + } } /** @@ -987,7 +991,11 @@ req_setattr_return: */ void netlbl_req_delattr(struct request_sock *req) { - cipso_v4_req_delattr(req); + switch (req->rsk_ops->family) { + case AF_INET: + cipso_v4_req_delattr(req); + break; + } } /** From bedd8d6037247002091d85d8abf4aa7e6c92ff24 Mon Sep 17 00:00:00 2001 From: "H. 
Nikolaus Schaller" Date: Tue, 2 Aug 2016 14:07:12 -0700 Subject: [PATCH 225/813] w1:omap_hdq: fix regression commit ecfaf0c42fc4306b5ec4bf6be01b66f8fe9a9733 upstream. Commit e93762bbf681 ("w1: masters: omap_hdq: add support for 1-wire mode") added a statement to clear the hdq_irqstatus flags in hdq_read_byte(). If the hdq reading process is scheduled slowly or interrupts are disabled for a while the hardware read activity might already be finished on entry of hdq_read_byte(). And hdq_isr() already has set the hdq_irqstatus to 0x6 (can be seen in debug mode) denoting that both, the TXCOMPLETE and RXCOMPLETE interrupts occurred in parallel. This means there is no need to wait and the hdq_read_byte() can just read the byte from the hdq controller. By resetting hdq_irqstatus to 0 the read process is forced to be always waiting again (because the if statement always succeeds) but the hardware will not issue another RXCOMPLETE interrupt. This results in a false timeout. After such a situation the hdq bus hangs. Link: http://lkml.kernel.org/r/b724765f87ad276a69625bc19806c8c8844c4590.1469513669.git.hns@goldelico.com Signed-off-by: H. Nikolaus Schaller Cc: Evgeniy Polyakov Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- drivers/w1/masters/omap_hdq.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/w1/masters/omap_hdq.c b/drivers/w1/masters/omap_hdq.c index 0e2f43bccf1f..0c427d6a12d1 100644 --- a/drivers/w1/masters/omap_hdq.c +++ b/drivers/w1/masters/omap_hdq.c @@ -390,8 +390,6 @@ static int hdq_read_byte(struct hdq_data *hdq_data, u8 *val) goto out; } - hdq_data->hdq_irqstatus = 0; - if (!(hdq_data->hdq_irqstatus & OMAP_HDQ_INT_STATUS_RXCOMPLETE)) { hdq_reg_merge(hdq_data, OMAP_HDQ_CTRL_STATUS, OMAP_HDQ_CTRL_STATUS_DIR | OMAP_HDQ_CTRL_STATUS_GO, From 8f811d101bd2b1c606b5ee1c3a8bd59a4f48cb8d Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 1 Jun 2016 12:54:33 -0400 Subject: [PATCH 226/813] drm/amdgpu: add a delay after ATPX dGPU power off MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f81eb1a349d47694fe1e688336ca1b40ea3e248a upstream. ATPX dGPU power control requires a 200ms delay between power off and on. This should fix dGPU failures on resume from power off. Reviewed-by: Hawking Zhang Acked-by: Christian König Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c index 5a8fbadbd27b..29adbbe225c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "amdgpu_acpi.h" @@ -256,6 +257,10 @@ static int amdgpu_atpx_set_discrete_state(struct amdgpu_atpx *atpx, u8 state) if (!info) return -EIO; kfree(info); + + /* 200ms delay is required after off */ + if (state == 0) + msleep(200); } return 0; } From 2c10a2c5cef3f3ae7dd7cd931398014c2b4c290f Mon Sep 17 00:00:00 2001 From: Lyude Date: Fri, 24 Jun 2016 17:54:32 -0400 Subject: [PATCH 227/813] drm/amdgpu: Poll for both connect/disconnect on analog connectors commit b636a1b3d624b49b23cc1be2f9f6bcbb89aca855 upstream. DRM_CONNECTOR_POLL_CONNECT only enables polling for connections, not disconnections. 
Because of this, we end up losing hotplug polling for analog connectors once they get connected. Easy way to reproduce: - Grab a machine with an AMD GPU and a VGA port - Plug a monitor into the VGA port, wait for it to update the connector from disconnected to connected - Disconnect the monitor on VGA, a hotplug event is never sent for the removal of the connector. Originally, only using DRM_CONNECTOR_POLL_CONNECT might have been a good idea since doing VGA polling can sometimes result in having to mess with the DAC voltages to figure out whether or not there's actually something there since VGA doesn't have HPD. Doing this would have the potential of showing visible artifacts on the screen every time we ran a poll while a VGA display was connected. Luckily, amdgpu_vga_detect() only resorts to this sort of polling if the poll is forced, and DRM's polling helper doesn't force its polls. Additionally, this removes some assignments to connector->polled that weren't actually doing anything. Signed-off-by: Lyude Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c index 7ef2c13921b4..930083336968 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c @@ -1690,7 +1690,6 @@ amdgpu_connector_add(struct amdgpu_device *adev, DRM_MODE_SCALE_NONE); /* no HPD on analog connectors */ amdgpu_connector->hpd.hpd = AMDGPU_HPD_NONE; - connector->polled = DRM_CONNECTOR_POLL_CONNECT; connector->interlace_allowed = true; connector->doublescan_allowed = true; break; @@ -1893,8 +1892,10 @@ amdgpu_connector_add(struct amdgpu_device *adev, } if (amdgpu_connector->hpd.hpd == AMDGPU_HPD_NONE) { - if (i2c_bus->valid) - connector->polled = DRM_CONNECTOR_POLL_CONNECT; + if (i2c_bus->valid) { + connector->polled = DRM_CONNECTOR_POLL_CONNECT | + DRM_CONNECTOR_POLL_DISCONNECT; + } } else connector->polled = DRM_CONNECTOR_POLL_HPD; From ca7eb0c3080cc3e1b6e04e04ed60ed63e6fb5667 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 8 Jul 2016 17:19:59 -0400 Subject: [PATCH 228/813] drm/amdgpu: support backlight control for UNIPHY3 commit dba6c4fa26ccf47661be5b68dba87e746fa137d8 upstream. Same interface as other UNIPHY blocks Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/atombios_encoders.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/atombios_encoders.c b/drivers/gpu/drm/amd/amdgpu/atombios_encoders.c index 1cd6de575305..542517d4e584 100644 --- a/drivers/gpu/drm/amd/amdgpu/atombios_encoders.c +++ b/drivers/gpu/drm/amd/amdgpu/atombios_encoders.c @@ -98,6 +98,7 @@ amdgpu_atombios_encoder_set_backlight_level(struct amdgpu_encoder *amdgpu_encode case ENCODER_OBJECT_ID_INTERNAL_KLDSCP_LVTMA: case ENCODER_OBJECT_ID_INTERNAL_UNIPHY1: case ENCODER_OBJECT_ID_INTERNAL_UNIPHY2: + case ENCODER_OBJECT_ID_INTERNAL_UNIPHY3: if (dig->backlight_level == 0) amdgpu_atombios_encoder_setup_dig_transmitter(encoder, ATOM_TRANSMITTER_ACTION_LCD_BLOFF, 0, 0); From 3281d1055ca82a240e6f38c5b011bc1f3f28d4b6 Mon Sep 17 00:00:00 2001 From: Lyude Date: Mon, 18 Jul 2016 11:41:37 -0400 Subject: [PATCH 229/813] drm/amdgpu: Disable RPM helpers while reprobing connectors on resume commit 23a1a9e54e71593fe5657e883662995d181d2d6b upstream.
Just about all of amdgpu's connector probing functions try to acquire runtime PM refs. If we try to do this in the context of amdgpu_resume_kms by calling drm_helper_hpd_irq_event(), we end up deadlocking the system. Since we're guaranteed to be holding the spinlock for RPM in amdgpu_resume_kms, and we already know the GPU is in working order, we need to prevent the RPM helpers from trying to run during the initial connector reprobe on resume. There's a couple of solutions I've explored for fixing this, but this one by far seems to be the simplest and most reliable (plus I'm pretty sure that's what disable_depth is there for anyway). Reproduction recipe: - Get any laptop dual GPUs using PRIME - Make sure runtime PM is enabled for amdgpu - Boot the machine - If the machine managed to boot without hanging, switch out of X to another VT. This should definitely cause X to hang infinitely. Changes since v1: - add appropriate #ifdef checks for CONFIG_PM. This is not very useful, but it appears some kernel test suites test compiling amdgpu with CONFIG_PM disabled, which results in this patch breaking the builds if we don't include this #ifdef Cc: Alex Deucher Reviewed-by: Alex Deucher Signed-off-by: Lyude Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c961fe093e12..16302f7d59f6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1793,7 +1793,23 @@ int amdgpu_resume_kms(struct drm_device *dev, bool resume, bool fbcon) } drm_kms_helper_poll_enable(dev); + + /* + * Most of the connector probing functions try to acquire runtime pm + * refs to ensure that the GPU is powered on when connector polling is + * performed. Since we're calling this from a runtime PM callback, + * trying to acquire rpm refs will cause us to deadlock. + * + * Since we're guaranteed to be holding the rpm lock, it's safe to + * temporarily disable the rpm helpers so this doesn't deadlock us. + */ +#ifdef CONFIG_PM + dev->dev->power.disable_depth++; +#endif drm_helper_hpd_irq_event(dev); +#ifdef CONFIG_PM + dev->dev->power.disable_depth--; +#endif if (fbcon) { amdgpu_fbdev_set_suspend(adev, 0); From 672138e0ce66e47b33171096dddfe296e948de37 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 27 Jul 2016 15:31:59 -0400 Subject: [PATCH 230/813] drm/amdgpu: fix firmware info version checks commit a8a04c994d41a489eb0f2899893209e04e030153 upstream. Some of the checks didn't handle frev 2 tables properly. amdgpu doesn't support any tables pre-frev 2, so drop the checks. 
Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c | 25 +++++++------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c index 9416e0f5c1db..0aaa457a1710 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c @@ -566,28 +566,19 @@ int amdgpu_atombios_get_clock_info(struct amdgpu_device *adev) le16_to_cpu(firmware_info->info.usReferenceClock); ppll->reference_div = 0; - if (crev < 2) - ppll->pll_out_min = - le16_to_cpu(firmware_info->info.usMinPixelClockPLL_Output); - else - ppll->pll_out_min = - le32_to_cpu(firmware_info->info_12.ulMinPixelClockPLL_Output); + ppll->pll_out_min = + le32_to_cpu(firmware_info->info_12.ulMinPixelClockPLL_Output); ppll->pll_out_max = le32_to_cpu(firmware_info->info.ulMaxPixelClockPLL_Output); - if (crev >= 4) { - ppll->lcd_pll_out_min = - le16_to_cpu(firmware_info->info_14.usLcdMinPixelClockPLL_Output) * 100; - if (ppll->lcd_pll_out_min == 0) - ppll->lcd_pll_out_min = ppll->pll_out_min; - ppll->lcd_pll_out_max = - le16_to_cpu(firmware_info->info_14.usLcdMaxPixelClockPLL_Output) * 100; - if (ppll->lcd_pll_out_max == 0) - ppll->lcd_pll_out_max = ppll->pll_out_max; - } else { + ppll->lcd_pll_out_min = + le16_to_cpu(firmware_info->info_14.usLcdMinPixelClockPLL_Output) * 100; + if (ppll->lcd_pll_out_min == 0) ppll->lcd_pll_out_min = ppll->pll_out_min; + ppll->lcd_pll_out_max = + le16_to_cpu(firmware_info->info_14.usLcdMaxPixelClockPLL_Output) * 100; + if (ppll->lcd_pll_out_max == 0) ppll->lcd_pll_out_max = ppll->pll_out_max; - } if (ppll->pll_out_min == 0) ppll->pll_out_min = 64800; From 4925cf140ff6208894feb0e81984c7f320025f2e Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 29 Jul 2016 18:03:42 -0400 Subject: [PATCH 231/813] drm/amdgpu/gmc7: add missing mullins case commit 7f555c8e5a84b348c2b76f4ca78eae7222354c03 upstream. Looks like this got missed when we ported the code from radeon. Reviewed-by: Edward O'Callaghan Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c index ea87033bfaf6..df17fababbd6 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c @@ -167,6 +167,7 @@ static int gmc_v7_0_init_microcode(struct amdgpu_device *adev) break; case CHIP_KAVERI: case CHIP_KABINI: + case CHIP_MULLINS: return 0; default: BUG(); } From ada3815fab5191d1c96ff07ed6c0c77ecd34dfd0 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 1 Jun 2016 12:58:36 -0400 Subject: [PATCH 232/813] drm/radeon: add a delay after ATPX dGPU power off MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d814b24fb74cb9797d70cb8053961447c5879a5c upstream. ATPX dGPU power control requires a 200ms delay between power off and on. This should fix dGPU failures on resume from power off. 
Reviewed-by: Hawking Zhang Acked-by: Christian König Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/radeon_atpx_handler.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/radeon/radeon_atpx_handler.c b/drivers/gpu/drm/radeon/radeon_atpx_handler.c index c4b4f298a283..69ce95571136 100644 --- a/drivers/gpu/drm/radeon/radeon_atpx_handler.c +++ b/drivers/gpu/drm/radeon/radeon_atpx_handler.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "radeon_acpi.h" @@ -255,6 +256,10 @@ static int radeon_atpx_set_discrete_state(struct radeon_atpx *atpx, u8 state) if (!info) return -EIO; kfree(info); + + /* 200ms delay is required after off */ + if (state == 0) + msleep(200); } return 0; } From 943682861f55e9b979aaca4be3a2d75afe7831d9 Mon Sep 17 00:00:00 2001 From: Lyude Date: Fri, 24 Jun 2016 17:54:31 -0400 Subject: [PATCH 233/813] drm/radeon: Poll for both connect/disconnect on analog connectors commit 14ff8d48f2235295dfb3117693008e367b49cdb5 upstream. DRM_CONNECTOR_POLL_CONNECT only enables polling for connections, not disconnections. Because of this, we end up losing hotplug polling for analog connectors once they get connected. Easy way to reproduce: - Grab a machine with a radeon GPU and a VGA port - Plug a monitor into the VGA port, wait for it to update the connector from disconnected to connected - Disconnect the monitor on VGA, a hotplug event is never sent for the removal of the connector. Originally, only using DRM_CONNECTOR_POLL_CONNECT might have been a good idea since doing VGA polling can sometimes result in having to mess with the DAC voltages to figure out whether or not there's actually something there since VGA doesn't have HPD. Doing this would have the potential of showing visible artifacts on the screen every time we ran a poll while a VGA display was connected. Luckily, radeon_vga_detect() only resorts to this sort of polling if the poll is forced, and DRM's polling helper doesn't force its polls. Additionally, this removes some assignments to connector->polled that weren't actually doing anything.
Signed-off-by: Lyude Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/radeon_connectors.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_connectors.c b/drivers/gpu/drm/radeon/radeon_connectors.c index 9cfc1c3e1965..30f00748ed37 100644 --- a/drivers/gpu/drm/radeon/radeon_connectors.c +++ b/drivers/gpu/drm/radeon/radeon_connectors.c @@ -2058,7 +2058,6 @@ radeon_add_atom_connector(struct drm_device *dev, RADEON_OUTPUT_CSC_BYPASS); /* no HPD on analog connectors */ radeon_connector->hpd.hpd = RADEON_HPD_NONE; - connector->polled = DRM_CONNECTOR_POLL_CONNECT; connector->interlace_allowed = true; connector->doublescan_allowed = true; break; @@ -2308,8 +2307,10 @@ radeon_add_atom_connector(struct drm_device *dev, } if (radeon_connector->hpd.hpd == RADEON_HPD_NONE) { - if (i2c_bus->valid) - connector->polled = DRM_CONNECTOR_POLL_CONNECT; + if (i2c_bus->valid) { + connector->polled = DRM_CONNECTOR_POLL_CONNECT | + DRM_CONNECTOR_POLL_DISCONNECT; + } } else connector->polled = DRM_CONNECTOR_POLL_HPD; @@ -2385,7 +2386,6 @@ radeon_add_legacy_connector(struct drm_device *dev, 1); /* no HPD on analog connectors */ radeon_connector->hpd.hpd = RADEON_HPD_NONE; - connector->polled = DRM_CONNECTOR_POLL_CONNECT; connector->interlace_allowed = true; connector->doublescan_allowed = true; break; @@ -2470,10 +2470,13 @@ radeon_add_legacy_connector(struct drm_device *dev, } if (radeon_connector->hpd.hpd == RADEON_HPD_NONE) { - if (i2c_bus->valid) - connector->polled = DRM_CONNECTOR_POLL_CONNECT; + if (i2c_bus->valid) { + connector->polled = DRM_CONNECTOR_POLL_CONNECT | + DRM_CONNECTOR_POLL_DISCONNECT; + } } else connector->polled = DRM_CONNECTOR_POLL_HPD; + connector->display_info.subpixel_order = subpixel_order; drm_connector_register(connector); } From f1cb5eb8ff01c79d8c022a1d6fce68bc28638bce Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 27 Jul 2016 15:28:56 -0400 Subject: [PATCH 234/813] drm/radeon: fix firmware info version checks commit 3edc38a0facef45ee22af8afdce3737f421f36ab upstream. Some of the checks didn't handle frev 2 tables properly. Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/radeon_atombios.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_atombios.c b/drivers/gpu/drm/radeon/radeon_atombios.c index de9a2ffcf5f7..0c5b3eeff82d 100644 --- a/drivers/gpu/drm/radeon/radeon_atombios.c +++ b/drivers/gpu/drm/radeon/radeon_atombios.c @@ -1155,7 +1155,7 @@ bool radeon_atom_get_clock_info(struct drm_device *dev) le16_to_cpu(firmware_info->info.usReferenceClock); p1pll->reference_div = 0; - if (crev < 2) + if ((frev < 2) && (crev < 2)) p1pll->pll_out_min = le16_to_cpu(firmware_info->info.usMinPixelClockPLL_Output); else @@ -1164,7 +1164,7 @@ bool radeon_atom_get_clock_info(struct drm_device *dev) p1pll->pll_out_max = le32_to_cpu(firmware_info->info.ulMaxPixelClockPLL_Output); - if (crev >= 4) { + if (((frev < 2) && (crev >= 4)) || (frev >= 2)) { p1pll->lcd_pll_out_min = le16_to_cpu(firmware_info->info_14.usLcdMinPixelClockPLL_Output) * 100; if (p1pll->lcd_pll_out_min == 0) From 4cb688506f91f53a7f1a6864524e088fb4992a65 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 8 Jul 2016 17:27:04 -0400 Subject: [PATCH 235/813] drm/radeon: support backlight control for UNIPHY3 commit d3200be6c423afa1c34f7e39e9f6d04dd5b0af9d upstream. 
Same interface as other UNIPHY blocks Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/atombios_encoders.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/radeon/atombios_encoders.c b/drivers/gpu/drm/radeon/atombios_encoders.c index 0b04b9282f56..d4ac8c837314 100644 --- a/drivers/gpu/drm/radeon/atombios_encoders.c +++ b/drivers/gpu/drm/radeon/atombios_encoders.c @@ -120,6 +120,7 @@ atombios_set_backlight_level(struct radeon_encoder *radeon_encoder, u8 level) case ENCODER_OBJECT_ID_INTERNAL_KLDSCP_LVTMA: case ENCODER_OBJECT_ID_INTERNAL_UNIPHY1: case ENCODER_OBJECT_ID_INTERNAL_UNIPHY2: + case ENCODER_OBJECT_ID_INTERNAL_UNIPHY3: if (dig->backlight_level == 0) atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_LCD_BLOFF, 0, 0); else { From 21f36ec95dcd59365ea8ab1c49739bc2abcff32e Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Tue, 12 Jul 2016 11:57:07 +1000 Subject: [PATCH 236/813] drm/nouveau: check for supported chipset before booting fbdev off the hw commit 0e67bed2c765ff0fdaec62c963881f5416fe3692 upstream. Signed-off-by: Ben Skeggs Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/nouveau/nouveau_drm.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index 1d3ee5179ab8..d236fc7c425b 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c @@ -308,7 +308,16 @@ static int nouveau_drm_probe(struct pci_dev *pdev, bool boot = false; int ret; - /* remove conflicting drivers (vesafb, efifb etc) */ + /* We need to check that the chipset is supported before booting + * fbdev off the hardware, as there's no way to put it back. + */ + ret = nvkm_device_pci_new(pdev, NULL, "error", true, false, 0, &device); + if (ret) + return ret; + + nvkm_device_del(&device); + + /* Remove conflicting drivers (vesafb, efifb etc). */ aper = alloc_apertures(3); if (!aper) return -ENOMEM; From 77f424903037180d485b6ed40273f5d8f9dccb1a Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Wed, 27 Jul 2016 19:16:39 -0400 Subject: [PATCH 237/813] drm/nouveau/gr/nv3x: fix instobj write offsets in gr setup commit d0e62ef6ed257715a88d0e5d7cd850a1695429e2 upstream. This should fix some unaligned access warnings. This is also likely to fix non-descript issues on nv30/nv34 as a result of incorrect channel setup. 
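The mistake behind this fix is easy to model outside the kernel: the
write helper takes a byte offset, so three consecutive 32-bit words
live at +0, +4 and +8, not +0, +1 and +2. A standalone sketch, with a
plain buffer standing in for the instance object and wo32() standing in
for nvkm_wo32():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for nvkm_wo32(): write a 32-bit word at a *byte* offset. */
static void wo32(uint8_t *buf, uint32_t byte_off, uint32_t val)
{
	memcpy(buf + byte_off, &val, sizeof(val));
}

int main(void)
{
	uint8_t inst[64] = { 0 };
	uint32_t first;

	/* Correct: each 16-byte stride holds words at +0, +4 and +8.
	 * Writing at i + 1 and i + 2 instead, as the old code did,
	 * overlaps the first word and corrupts the channel setup. */
	for (uint32_t i = 0; i < 32; i += 16) {
		wo32(inst, i + 0, 0x10700ff9);
		wo32(inst, i + 4, 0x0436086c);
		wo32(inst, i + 8, 0x000c001b);
	}

	memcpy(&first, inst, sizeof(first));
	printf("first word: 0x%08x\n", first);
	return 0;
}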
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=96836
Signed-off-by: Ilia Mirkin
Signed-off-by: Ben Skeggs
Signed-off-by: Greg Kroah-Hartman
---
 drivers/gpu/drm/nouveau/nvkm/engine/gr/nv30.c | 4 ++--
 drivers/gpu/drm/nouveau/nvkm/engine/gr/nv34.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv30.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv30.c
index 69de8c6259fe..f1e15a4d4f64 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv30.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv30.c
@@ -76,8 +76,8 @@ nv30_gr_chan_new(struct nvkm_gr *base, struct nvkm_fifo_chan *fifoch,
 		nvkm_wo32(chan->inst, i, 0x00040004);
 	for (i = 0x1f18; i <= 0x3088 ; i += 16) {
 		nvkm_wo32(chan->inst, i + 0, 0x10700ff9);
-		nvkm_wo32(chan->inst, i + 1, 0x0436086c);
-		nvkm_wo32(chan->inst, i + 2, 0x000c001b);
+		nvkm_wo32(chan->inst, i + 4, 0x0436086c);
+		nvkm_wo32(chan->inst, i + 8, 0x000c001b);
 	}
 	for (i = 0x30b8; i < 0x30c8; i += 4)
 		nvkm_wo32(chan->inst, i, 0x0000ffff);
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv34.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv34.c
index 2207dac23981..300f5ed5de0b 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv34.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv34.c
@@ -75,8 +75,8 @@ nv34_gr_chan_new(struct nvkm_gr *base, struct nvkm_fifo_chan *fifoch,
 		nvkm_wo32(chan->inst, i, 0x00040004);
 	for (i = 0x15ac; i <= 0x271c ; i += 16) {
 		nvkm_wo32(chan->inst, i + 0, 0x10700ff9);
-		nvkm_wo32(chan->inst, i + 1, 0x0436086c);
-		nvkm_wo32(chan->inst, i + 2, 0x000c001b);
+		nvkm_wo32(chan->inst, i + 4, 0x0436086c);
+		nvkm_wo32(chan->inst, i + 8, 0x000c001b);
 	}
 	for (i = 0x274c; i < 0x275c; i += 4)
 		nvkm_wo32(chan->inst, i, 0x0000ffff);

From 702117fe1ecde2bef728146e0a5fff689731c8c5 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka
Date: Thu, 28 Jul 2016 18:56:13 -0400
Subject: [PATCH 238/813] drm/nouveau/fbcon: fix font width not divisible by 8

commit 28668f43b8e421634e1623f72a879812288dd06b upstream.

The patch f045f459d925 ("drm/nouveau/fbcon: fix out-of-bounds memory
accesses") tries to fix some out-of-bounds memory accesses.
Unfortunately, the patch breaks the display when using fonts with a
width that is not divisible by 8.

The monochrome bitmap for each character is stored in memory by lines
from top to bottom. Each line is padded to a full byte. For example,
for a 22x11 font, each line is padded to 16 bits, so each character
consumes 44 bytes total, that is 11 32-bit words. The patch
f045f459d925 changed the logic to "dsize = ALIGN(image->width *
image->height, 32) >> 5", which is just 8 words; this is incorrect and
causes display corruption.

This patch adds the necessary padding of lines to whole bytes.

This patch should be backported to stable kernels where f045f459d925
was backported.
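The arithmetic in that message can be checked with a few lines of
standalone C; the ALIGN macro below reproduces the kernel's
power-of-two rounding locally, and the numbers match the 22x11 font
example:

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned int width = 11, height = 22; /* the 22x11 font example */

	/* Each font line is padded to a whole byte: 11 px -> 16 bits. */
	unsigned int bits_per_line = ALIGN(width, 8);
	unsigned int bytes_total = bits_per_line / 8 * height;

	/* Fixed form: pad the width first, then round the whole bitmap
	 * up to 32-bit words. */
	unsigned int dwords = ALIGN(ALIGN(width, 8) * height, 32) >> 5;

	/* Broken pre-patch form: under-counts whenever the width is not
	 * divisible by 8. */
	unsigned int broken = ALIGN(width * height, 32) >> 5;

	printf("%u bytes, %u dwords (broken count: %u)\n",
	       bytes_total, dwords, broken); /* 44 bytes, 11 vs 8 */
	return 0;
}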
Signed-off-by: Mikulas Patocka Fixes: f045f459d925 ("drm/nouveau/fbcon: fix out-of-bounds memory accesses") Signed-off-by: Ben Skeggs Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/nouveau/nv04_fbcon.c | 4 ++-- drivers/gpu/drm/nouveau/nv50_fbcon.c | 2 +- drivers/gpu/drm/nouveau/nvc0_fbcon.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nv04_fbcon.c b/drivers/gpu/drm/nouveau/nv04_fbcon.c index 8f715feadf56..f90568327468 100644 --- a/drivers/gpu/drm/nouveau/nv04_fbcon.c +++ b/drivers/gpu/drm/nouveau/nv04_fbcon.c @@ -107,11 +107,11 @@ nv04_fbcon_imageblit(struct fb_info *info, const struct fb_image *image) ((image->dx + image->width) & 0xffff)); OUT_RING(chan, bg); OUT_RING(chan, fg); - OUT_RING(chan, (image->height << 16) | image->width); + OUT_RING(chan, (image->height << 16) | ALIGN(image->width, 8)); OUT_RING(chan, (image->height << 16) | image->width); OUT_RING(chan, (image->dy << 16) | (image->dx & 0xffff)); - dsize = ALIGN(image->width * image->height, 32) >> 5; + dsize = ALIGN(ALIGN(image->width, 8) * image->height, 32) >> 5; while (dsize) { int iter_len = dsize > 128 ? 128 : dsize; diff --git a/drivers/gpu/drm/nouveau/nv50_fbcon.c b/drivers/gpu/drm/nouveau/nv50_fbcon.c index a4e259a00430..c8e096533f60 100644 --- a/drivers/gpu/drm/nouveau/nv50_fbcon.c +++ b/drivers/gpu/drm/nouveau/nv50_fbcon.c @@ -125,7 +125,7 @@ nv50_fbcon_imageblit(struct fb_info *info, const struct fb_image *image) OUT_RING(chan, 0); OUT_RING(chan, image->dy); - dwords = ALIGN(image->width * image->height, 32) >> 5; + dwords = ALIGN(ALIGN(image->width, 8) * image->height, 32) >> 5; while (dwords) { int push = dwords > 2047 ? 2047 : dwords; diff --git a/drivers/gpu/drm/nouveau/nvc0_fbcon.c b/drivers/gpu/drm/nouveau/nvc0_fbcon.c index f28315e865a5..22d32578dafd 100644 --- a/drivers/gpu/drm/nouveau/nvc0_fbcon.c +++ b/drivers/gpu/drm/nouveau/nvc0_fbcon.c @@ -125,7 +125,7 @@ nvc0_fbcon_imageblit(struct fb_info *info, const struct fb_image *image) OUT_RING (chan, 0); OUT_RING (chan, image->dy); - dwords = ALIGN(image->width * image->height, 32) >> 5; + dwords = ALIGN(ALIGN(image->width, 8) * image->height, 32) >> 5; while (dwords) { int push = dwords > 2047 ? 2047 : dwords; From 1df3e60664af6e494119509e740b074630902f8c Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 7 Jul 2016 09:41:12 +0100 Subject: [PATCH 239/813] drm: Restore double clflush on the last partial cacheline commit 396f5d62d1a5fd99421855a08ffdef8edb43c76e upstream. This effectively reverts commit afcd950cafea6e27b739fe7772cbbeed37d05b8b Author: Chris Wilson Date: Wed Jun 10 15:58:01 2015 +0100 drm: Avoid the double clflush on the last cache line in drm_clflush_virt_range() as we have observed issues with serialisation of the clflush operations on Baytrail+ Atoms with partial updates. Applying the double flush on the last cacheline forces that clflush to be ordered with respect to the previous clflush, and the mfence then protects against prefetches crossing the clflush boundary. The same issue can be demonstrated in userspace with igt/gem_exec_flush. Fixes: afcd950cafea6 (drm: Avoid the double clflush on the last cache...) 
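The shape of the fixed flush loop is worth spelling out: every
cacheline overlapping [addr, addr + length) is flushed, and the line
holding the last byte is flushed a second time so that flush is ordered
behind the previous one before the final fence. A standalone model,
with printouts standing in for clflushopt and mb() (the real primitives
are CPU instructions, not function calls):

#include <stdint.h>
#include <stdio.h>

#define CACHELINE 64

static void flushline(void *p) { printf("flush %p\n", p); }
static void fence(void) { puts("fence"); }

static void clflush_range(void *addr, unsigned long length)
{
	char *p = (char *)((uintptr_t)addr & ~(uintptr_t)(CACHELINE - 1));
	char *end = (char *)addr + length;

	fence();
	for (; p < end; p += CACHELINE)
		flushline(p);
	flushline(end - 1); /* re-flush last line: forces serialisation */
	fence();
}

int main(void)
{
	char buf[256];

	clflush_range(buf + 10, 100);
	return 0;
}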
Testcase: igt/gem_concurrent_blit Testcase: igt/gem_partial_pread_pwrite Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=92845 Signed-off-by: Chris Wilson Cc: dri-devel@lists.freedesktop.org Cc: Akash Goel Cc: Imre Deak Cc: Daniel Vetter Cc: Jason Ekstrand Reviewed-by: Mika Kuoppala Signed-off-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/1467880930-23082-6-git-send-email-chris@chris-wilson.co.uk Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/drm_cache.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c index 6743ff7dccfa..7f4a6c550319 100644 --- a/drivers/gpu/drm/drm_cache.c +++ b/drivers/gpu/drm/drm_cache.c @@ -136,6 +136,7 @@ drm_clflush_virt_range(void *addr, unsigned long length) mb(); for (; addr < end; addr += size) clflushopt(addr); + clflushopt(end - 1); /* force serialisation */ mb(); return; } From 794c90b25b332391517e3fe131cf0b8e8db87be9 Mon Sep 17 00:00:00 2001 From: Mario Kleiner Date: Wed, 6 Jul 2016 12:05:44 +0200 Subject: [PATCH 240/813] drm/edid: Add 6 bpc quirk for display AEO model 0. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e10aec652f31ec61d6a0b4d00d8ef8d2b66fa0fd upstream. Bugzilla https://bugzilla.kernel.org/show_bug.cgi?id=105331 reports that the "AEO model 0" display is driven with 8 bpc without dithering by default, which looks bad because that panel is apparently a 6 bpc DP panel with faulty EDID. A fix for this was made by commit 013dd9e03872 ("drm/i915/dp: fall back to 18 bpp when sink capability is unknown"). That commit triggers new regressions in precision for DP->DVI and DP->VGA displays. A patch is out to revert that commit, but it will revert video output for the AEO model 0 panel to 8 bpc without dithering. The EDID 1.3 of that panel, as decoded from the xrandr output attached to that bugzilla bug report, is somewhat faulty, and beyond other problems also sets the "DFP 1.x compliant TMDS" bit, which according to DFP spec means to drive the panel with 8 bpc and no dithering in absence of other colorimetry information. Try to make the original bug reporter happy despite the faulty EDID by adding a quirk to mark that panel as 6 bpc, so 6 bpc output with dithering creates a nice picture. Tested by injecting the edid from the fdo bug into a DP connector via drm_kms_helper.edid_firmware and verifying the 6 bpc + dithering is selected. This patch should be backported to stable. 
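Mechanically, such a quirk is just a vendor/product lookup plus a flag
test after EDID parsing. A self-contained model of that flow (the names
echo the DRM ones; the table is abbreviated and the flag value is
illustrative):

#include <stdio.h>
#include <string.h>

#define QUIRK_FORCE_6BPC (1u << 10)

struct edid_quirk {
	char vendor[4]; /* 3-letter PNP ID */
	int product;
	unsigned int flags;
};

static const struct edid_quirk quirks[] = {
	/* AEO model 0 reports 8 bpc, but is a 6 bpc panel */
	{ "AEO", 0, QUIRK_FORCE_6BPC },
};

static unsigned int lookup_quirks(const char *vendor, int product)
{
	for (size_t i = 0; i < sizeof(quirks) / sizeof(quirks[0]); i++)
		if (!strcmp(quirks[i].vendor, vendor) &&
		    quirks[i].product == product)
			return quirks[i].flags;
	return 0;
}

int main(void)
{
	int bpc = 8; /* what the faulty EDID claims */

	if (lookup_quirks("AEO", 0) & QUIRK_FORCE_6BPC)
		bpc = 6; /* override after the modes are parsed */
	printf("bpc = %d\n", bpc);
	return 0;
}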
Signed-off-by: Mario Kleiner
Cc: Jani Nikula
Cc: Ville Syrjälä
Cc: Daniel Vetter
Signed-off-by: Dave Airlie
Signed-off-by: Greg Kroah-Hartman
---
 drivers/gpu/drm/drm_edid.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index d5d2c03fd136..8c9ac021608f 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -73,6 +73,8 @@
 #define EDID_QUIRK_FORCE_8BPC		(1 << 8)
 /* Force 12bpc */
 #define EDID_QUIRK_FORCE_12BPC		(1 << 9)
+/* Force 6bpc */
+#define EDID_QUIRK_FORCE_6BPC		(1 << 10)

 struct detailed_mode_closure {
 	struct drm_connector *connector;
@@ -99,6 +101,9 @@ static struct edid_quirk {
 	/* Unknown Acer */
 	{ "ACR", 2423, EDID_QUIRK_FIRST_DETAILED_PREFERRED },

+	/* AEO model 0 reports 8 bpc, but is a 6 bpc panel */
+	{ "AEO", 0, EDID_QUIRK_FORCE_6BPC },
+
 	/* Belinea 10 15 55 */
 	{ "MAX", 1516, EDID_QUIRK_PREFER_LARGE_60 },
 	{ "MAX", 0x77e, EDID_QUIRK_PREFER_LARGE_60 },
@@ -3820,6 +3825,9 @@ int drm_add_edid_modes(struct drm_connector *connector, struct edid *edid)

 	drm_add_display_info(edid, &connector->display_info, connector);

+	if (quirks & EDID_QUIRK_FORCE_6BPC)
+		connector->display_info.bpc = 6;
+
 	if (quirks & EDID_QUIRK_FORCE_8BPC)
 		connector->display_info.bpc = 8;

From 3b30197a63f93c0c315b46d73120cca0d55e4973 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?=
Date: Mon, 23 May 2016 17:42:48 +0300
Subject: [PATCH 241/813] drm/i915: Never fully mask the EI up rps interrupt
 on SNB/IVB
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit a7b4667a00025ac28300737c868bd4818b6d8c4d upstream.

SNB (and IVB too I suppose) starts to misbehave if the GPU gets stuck
in an infinite batch buffer loop. The GPU apparently hogs something
critical and CPUs start to lose interrupts and whatnot. We can keep the
system limping along by unmasking some interrupts in GEN6_PMINTRMSK.
The EI up interrupt has been previously chosen for that task, so let's
never mask it.

v2: s/gen6_rps_pm_mask/gen6_sanitize_rps_pm_mask/ (Chris)

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93122
Signed-off-by: Ville Syrjälä
Reviewed-by: Chris Wilson
Signed-off-by: Chris Wilson
Link: http://patchwork.freedesktop.org/patch/msgid/1464014568-4529-1-git-send-email-ville.syrjala@linux.intel.com
(cherry picked from commit 12c100bfa5d9103b6c4d43636fee09c31e75605a)
Signed-off-by: Daniel Vetter
Signed-off-by: Greg Kroah-Hartman
---
 drivers/gpu/drm/i915/intel_pm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index eb434881ddbc..1e851e037c29 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -4526,7 +4526,8 @@ void gen6_rps_idle(struct drm_i915_private *dev_priv)
 		else
 			gen6_set_rps(dev_priv->dev, dev_priv->rps.idle_freq);
 		dev_priv->rps.last_adj = 0;
-		I915_WRITE(GEN6_PMINTRMSK, 0xffffffff);
+		I915_WRITE(GEN6_PMINTRMSK,
+			   gen6_sanitize_rps_pm_mask(dev_priv, ~0));
 	}
 	mutex_unlock(&dev_priv->rps.hw_lock);

From 24cdeed2d3b2c66d3846a74707a9a829dcd2e880 Mon Sep 17 00:00:00 2001
From: Mario Kleiner
Date: Wed, 6 Jul 2016 12:05:45 +0200
Subject: [PATCH 242/813] drm/i915/dp: Revert "drm/i915/dp: fall back to 18
 bpp when sink capability is unknown"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 196f954e250943df414efd3d632254c29be38e59 upstream.
This reverts commit 013dd9e03872 ("drm/i915/dp: fall back to 18 bpp
when sink capability is unknown")

This commit introduced a regression into stable kernels, as it reduces
output color depth to 6 bpc for any video sink connected to a
DisplayPort connector if that sink doesn't report a specific color
depth via EDID, or if our EDID parser doesn't actually recognize the
proper bpc from EDID.

Affected are active DisplayPort->VGA converters and active
DisplayPort->DVI converters. Both should be able to handle 8 bpc, but
are degraded to 6 bpc with this patch.

The reverted commit was meant to fix
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=105331

A followup patch implements a fix for that specific bug, which is
caused by a faulty EDID of the affected DP panel, by adding a new EDID
quirk for that panel.

DP 18 bpp fallback handling and other improvements to DP sink bpc
detection will be handled for future kernels in a separate series of
patches.

Please backport to stable.

Signed-off-by: Mario Kleiner
Acked-by: Jani Nikula
Cc: Ville Syrjälä
Cc: Daniel Vetter
Signed-off-by: Dave Airlie
Signed-off-by: Greg Kroah-Hartman
---
 drivers/gpu/drm/i915/intel_display.c | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index c41bc42b6fa7..3292495ee10f 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -11952,21 +11952,11 @@ connected_sink_compute_bpp(struct intel_connector *connector,
 		pipe_config->pipe_bpp = connector->base.display_info.bpc*3;
 	}

-	/* Clamp bpp to default limit on screens without EDID 1.4 */
-	if (connector->base.display_info.bpc == 0) {
-		int type = connector->base.connector_type;
-		int clamp_bpp = 24;
-
-		/* Fall back to 18 bpp when DP sink capability is unknown. */
-		if (type == DRM_MODE_CONNECTOR_DisplayPort ||
-		    type == DRM_MODE_CONNECTOR_eDP)
-			clamp_bpp = 18;
-
-		if (bpp > clamp_bpp) {
-			DRM_DEBUG_KMS("clamping display bpp (was %d) to default limit of %d\n",
-				      bpp, clamp_bpp);
-			pipe_config->pipe_bpp = clamp_bpp;
-		}
+	/* Clamp bpp to 8 on screens without EDID 1.4 */
+	if (connector->base.display_info.bpc == 0 && bpp > 24) {
+		DRM_DEBUG_KMS("clamping display bpp (was %d) to default limit of 24\n",
+			      bpp);
+		pipe_config->pipe_bpp = 24;
 	}
 }

From cfb466bd40d7822446ca0abbac263ed6be7306d7 Mon Sep 17 00:00:00 2001
From: Konstantin Neumoin
Date: Mon, 11 Jul 2016 15:28:59 +0300
Subject: [PATCH 243/813] balloon: check the number of available pages in leak
 balloon

commit 37cf99e08c6fb4dcea0f9ad2b13b6daa8c76a711 upstream.

The balloon has a special mechanism that is subscribed to the oom
notification, which leads to deflation for a fixed number of pages.
The number is always fixed even when the balloon is fully deflated.
But leak_balloon did not expect that the number of pages to deflate
could exceed the number of taken pages, and would hit a BUG in
balloon_page_dequeue once the page list became empty.

So, the simplest solution would be to check that the number of released
pages is less than or equal to the number of taken pages.

Signed-off-by: Konstantin Neumoin
Signed-off-by: Denis V. Lunev
CC: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
Signed-off-by: Greg Kroah-Hartman
---
 drivers/virtio/virtio_balloon.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 8ab6238c9299..56f7e2521202 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -196,6 +196,8 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
 	num = min(num, ARRAY_SIZE(vb->pfns));

 	mutex_lock(&vb->balloon_lock);
+	/* We can't release more pages than taken */
+	num = min(num, (size_t)vb->num_pages);
 	for (vb->num_pfns = 0; vb->num_pfns < num;
 	     vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
 		page = balloon_page_dequeue(vb_dev_info);

From 84f8a24ade96a8c1499823b160a0591199cf9d12 Mon Sep 17 00:00:00 2001
From: Laura Abbott
Date: Fri, 8 Jul 2016 12:18:50 -0700
Subject: [PATCH 244/813] ftrace/recordmcount: Work around for addition of
 metag magic but not relocations

commit b2e1c26f0b62531636509fbcb6dab65617ed8331 upstream.

glibc recently did a sync up (94e73c95d9b5 "elf.h: Sync with the gabi
webpage") that added a #define for EM_METAG but did not add the
relocations. This triggers build errors:

scripts/recordmcount.c: In function 'do_file':
scripts/recordmcount.c:466:28: error: 'R_METAG_ADDR32' undeclared
(first use in this function)
     case EM_METAG:  reltype = R_METAG_ADDR32;
                               ^~~~~~~~~~~~~~
scripts/recordmcount.c:466:28: note: each undeclared identifier is
reported only once for each function it appears in
scripts/recordmcount.c:468:20: error: 'R_METAG_NONE' undeclared (first
use in this function)
       rel_type_nop = R_METAG_NONE;
                      ^~~~~~~~~~~~

Work around this change with some more #ifdefery for the relocations.

Fedora Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1354034

Link: http://lkml.kernel.org/r/1468005530-14757-1-git-send-email-labbott@redhat.com
Cc: James Hogan
Fixes: 00512bdd4573 ("metag: ftrace support")
Reported-by: Ross Burton
Signed-off-by: Laura Abbott
Signed-off-by: Steven Rostedt
Signed-off-by: Greg Kroah-Hartman
---
 scripts/recordmcount.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c
index e167592793a7..42396a74405d 100644
--- a/scripts/recordmcount.c
+++ b/scripts/recordmcount.c
@@ -33,10 +33,17 @@
 #include
 #include

+/*
+ * glibc synced up and added the metag number but didn't add the relocations.
+ * Work around this in a crude manner for now.
+ */
 #ifndef EM_METAG
-/* Remove this when these make it to the standard system elf.h. */
 #define EM_METAG 174
+#endif
+#ifndef R_METAG_ADDR32
 #define R_METAG_ADDR32 2
+#endif
+#ifndef R_METAG_NONE
 #define R_METAG_NONE 3
 #endif

From 8660eadce2bafd16b2b2d0100190c5ac76519caf Mon Sep 17 00:00:00 2001
From: James Hogan
Date: Thu, 4 Aug 2016 17:36:08 +0100
Subject: [PATCH 245/813] metag: Fix __cmpxchg_u32 asm constraint for CMP

commit 6154c187b97ee7513046bb4eb317a89f738f13ef upstream.

The LNKGET-based atomic sequence in __cmpxchg_u32 has slightly
incorrect constraints for the return value, which under certain
circumstances can allow an address unit register to be used as the
first operand of a CMP instruction. This isn't a valid instruction,
however, as the encodings only allow a data unit to be specified. This
would result in an assembler error like the following:

  Error: failed to assemble instruction: "CMP A0.2,D0Ar6"

Fix by changing the constraint from "=&da" (assigned, early clobbered,
data or address unit register) to "=&d" (data unit register only).
The constraint for the second operand, "bd" (an op2 register where op1 is a data unit register and the instruction supports O2R) is already correct assuming the first operand is a data unit register. Other cases of CMP in inline asm have had their constraints checked, and appear to all be fine. Fixes: 6006c0d8ce94 ("metag: Atomics, locks and bitops") Signed-off-by: James Hogan Cc: linux-metag@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/metag/include/asm/cmpxchg_lnkget.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/metag/include/asm/cmpxchg_lnkget.h b/arch/metag/include/asm/cmpxchg_lnkget.h index 0154e2807ebb..2369ad394876 100644 --- a/arch/metag/include/asm/cmpxchg_lnkget.h +++ b/arch/metag/include/asm/cmpxchg_lnkget.h @@ -73,7 +73,7 @@ static inline unsigned long __cmpxchg_u32(volatile int *m, unsigned long old, " DCACHE [%2], %0\n" #endif "2:\n" - : "=&d" (temp), "=&da" (retval) + : "=&d" (temp), "=&d" (retval) : "da" (m), "bd" (old), "da" (new) : "cc" ); From 01daea925d04909561bf7c39c76e71d13ddcb2ec Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 27 Jul 2016 07:22:05 +0200 Subject: [PATCH 246/813] block: add missing group association in bio-cloning functions commit 20bd723ec6a3261df5e02250cd3a1fbb09a343f2 upstream. When a bio is cloned, the newly created bio must be associated with the same blkcg as the original bio (if BLK_CGROUP is enabled). If this operation is not performed, then the new bio is not associated with any group, and the group of the current task is returned when the group of the bio is requested. Depending on the cloning frequency, this may cause a large percentage of the bios belonging to a given group to be treated as if belonging to other groups (in most cases as if belonging to the root group). The expected group isolation may thereby be broken. This commit adds the missing association in bio-cloning functions. 
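The structure of the fix, reduced to its essence: every clone primitive
funnels through one association helper, so no path can forget the step.
A standalone model with a reference-counted group pointer (the names
loosely mirror the block-layer ones; css_get() is modeled by a bare
counter):

#include <stdio.h>

struct blkcg { const char *name; int refs; };

struct bio_model {
	struct blkcg *css; /* owning group; NULL means root */
};

/* Single helper shared by all clone paths. */
static void clone_association(struct bio_model *dst,
			      const struct bio_model *src)
{
	if (src->css) {
		src->css->refs++; /* stands in for css_get() */
		dst->css = src->css;
	}
}

static void clone_fast(struct bio_model *dst, const struct bio_model *src)
{
	dst->css = NULL;
	/* ... copy payload fields here ... */
	clone_association(dst, src);
}

int main(void)
{
	struct blkcg grp = { "grpA", 1 };
	struct bio_model orig = { &grp }, copy;

	clone_fast(&copy, &orig);
	printf("clone belongs to %s, refs=%d\n",
	       copy.css ? copy.css->name : "root", grp.refs);
	return 0;
}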
Fixes: da2f0f74cf7d ("Btrfs: add support for blkio controllers") Signed-off-by: Paolo Valente Reviewed-by: Nikolay Borisov Reviewed-by: Jeff Moyer Acked-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/bio.c | 15 +++++++++++++++ fs/btrfs/extent_io.c | 6 ------ include/linux/bio.h | 3 +++ 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/block/bio.c b/block/bio.c index d4d144363250..46e2cc1d4016 100644 --- a/block/bio.c +++ b/block/bio.c @@ -584,6 +584,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) bio->bi_rw = bio_src->bi_rw; bio->bi_iter = bio_src->bi_iter; bio->bi_io_vec = bio_src->bi_io_vec; + + bio_clone_blkcg_association(bio, bio_src); } EXPORT_SYMBOL(__bio_clone_fast); @@ -689,6 +691,8 @@ integrity_clone: } } + bio_clone_blkcg_association(bio, bio_src); + return bio; } EXPORT_SYMBOL(bio_clone_bioset); @@ -2014,6 +2018,17 @@ void bio_disassociate_task(struct bio *bio) } } +/** + * bio_clone_blkcg_association - clone blkcg association from src to dst bio + * @dst: destination bio + * @src: source bio + */ +void bio_clone_blkcg_association(struct bio *dst, struct bio *src) +{ + if (src->bi_css) + WARN_ON(bio_associate_blkcg(dst, src->bi_css)); +} + #endif /* CONFIG_BLK_CGROUP */ static void __init biovec_init_slabs(void) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9abe18763a7f..257bbdcb5df6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2786,12 +2786,6 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) btrfs_bio->csum = NULL; btrfs_bio->csum_allocated = NULL; btrfs_bio->end_io = NULL; - -#ifdef CONFIG_BLK_CGROUP - /* FIXME, put this into bio_clone_bioset */ - if (bio->bi_css) - bio_associate_blkcg(new, bio->bi_css); -#endif } return new; } diff --git a/include/linux/bio.h b/include/linux/bio.h index fbe47bc700bd..42e4e3cbb001 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -527,11 +527,14 @@ extern unsigned int bvec_nr_vecs(unsigned short idx); int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); int bio_associate_current(struct bio *bio); void bio_disassociate_task(struct bio *bio); +void bio_clone_blkcg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ static inline int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) { return 0; } static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } static inline void bio_disassociate_task(struct bio *bio) { } +static inline void bio_clone_blkcg_association(struct bio *dst, + struct bio *src) { } #endif /* CONFIG_BLK_CGROUP */ #ifdef CONFIG_HIGHMEM From 0d301856de347a43fa87833dba61d3239211429f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 31 Jul 2016 11:15:13 -0700 Subject: [PATCH 247/813] block: fix bdi vs gendisk lifetime mismatch commit df08c32ce3be5be138c1dbfcba203314a3a7cd6f upstream. The name for a bdi of a gendisk is derived from the gendisk's devt. However, since the gendisk is destroyed before the bdi it leaves a window where a new gendisk could dynamically reuse the same devt while a bdi with the same name is still live. Arrange for the bdi to hold a reference against its "owner" disk device while it is registered. 
Otherwise we can hit sysfs duplicate name collisions like the following: WARNING: CPU: 10 PID: 2078 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x64/0x80 sysfs: cannot create duplicate filename '/devices/virtual/bdi/259:1' Hardware name: HP ProLiant DL580 Gen8, BIOS P79 05/06/2015 0000000000000286 0000000002c04ad5 ffff88006f24f970 ffffffff8134caec ffff88006f24f9c0 0000000000000000 ffff88006f24f9b0 ffffffff8108c351 0000001f0000000c ffff88105d236000 ffff88105d1031e0 ffff8800357427f8 Call Trace: [] dump_stack+0x63/0x87 [] __warn+0xd1/0xf0 [] warn_slowpath_fmt+0x5f/0x80 [] sysfs_warn_dup+0x64/0x80 [] sysfs_create_dir_ns+0x7e/0x90 [] kobject_add_internal+0xaa/0x320 [] ? vsnprintf+0x34e/0x4d0 [] kobject_add+0x75/0xd0 [] ? mutex_lock+0x12/0x2f [] device_add+0x125/0x610 [] device_create_groups_vargs+0xd8/0x100 [] device_create_vargs+0x1c/0x20 [] bdi_register+0x8c/0x180 [] bdi_register_dev+0x27/0x30 [] add_disk+0x175/0x4a0 Reported-by: Yi Zhang Tested-by: Yi Zhang Signed-off-by: Dan Williams Signed-off-by: Greg Kroah-Hartman Fixed up missing 0 return in bdi_register_owner(). Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- include/linux/backing-dev-defs.h | 1 + include/linux/backing-dev.h | 1 + mm/backing-dev.c | 19 +++++++++++++++++++ 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index d2a1d43bf9fa..a5bed6bc869d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -612,7 +612,7 @@ void add_disk(struct gendisk *disk) /* Register BDI before referencing it from bdev */ bdi = &disk->queue->backing_dev_info; - bdi_register_dev(bdi, disk_devt(disk)); + bdi_register_owner(bdi, disk_to_dev(disk)); blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 1b4d69f68c33..140c29635069 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -163,6 +163,7 @@ struct backing_dev_info { wait_queue_head_t wb_waitq; struct device *dev; + struct device *owner; struct timer_list laptop_mode_wb_timer; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c82794f20110..89d3de3e096b 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -24,6 +24,7 @@ __printf(3, 4) int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); +int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner); void bdi_unregister(struct backing_dev_info *bdi); int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index cbe6f0b96f29..9ef80bf441b3 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -825,6 +825,20 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) } EXPORT_SYMBOL(bdi_register_dev); +int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner) +{ + int rc; + + rc = bdi_register(bdi, NULL, "%u:%u", MAJOR(owner->devt), + MINOR(owner->devt)); + if (rc) + return rc; + bdi->owner = owner; + get_device(owner); + return 0; +} +EXPORT_SYMBOL(bdi_register_owner); + /* * Remove bdi from bdi_list, and ensure that it is no longer visible */ @@ -849,6 +863,11 @@ void bdi_unregister(struct backing_dev_info *bdi) device_unregister(bdi->dev); bdi->dev = NULL; } + + if (bdi->owner) { + put_device(bdi->owner); + bdi->owner = NULL; + } } void bdi_exit(struct backing_dev_info *bdi) From 
7928de5185f04b970dc9505cb8caa1cb5e46fa07 Mon Sep 17 00:00:00 2001 From: Hector Palacios Date: Mon, 18 Jul 2016 10:39:18 +0200 Subject: [PATCH 248/813] mtd: nand: fix bug writing 1 byte less than page size commit 144f4c98399e2c0ca60eb414c15a2c68125c18b8 upstream. nand_do_write_ops() determines if it is writing a partial page with the formula: part_pagewr = (column || writelen < (mtd->writesize - 1)) When 'writelen' is exactly 1 byte less than the NAND page size the formula equates to zero, so the code doesn't process it as a partial write, although it should. As a consequence the function remains in the while(1) loop with 'writelen' becoming 0xffffffff and iterating endlessly. The bug may not be easy to reproduce in Linux since user space tools usually force the padding or round-up the write size to a page-size multiple. This was discovered in U-Boot where the issue can be reproduced by writing any size that is 1 byte less than a page-size multiple. For example, on a NAND with 2K page (0x800): => nand erase.part => nand write $loadaddr 7ff [Editor's note: the bug was added in commit 29072b96078f, but moved around in commit 66507c7bc8895 ("mtd: nand: Add support to use nand_base poi databuf as bounce buffer")] Fixes: 29072b96078f ("[MTD] NAND: add subpage write support") Signed-off-by: Hector Palacios Acked-by: Boris Brezillon Signed-off-by: Brian Norris Signed-off-by: Greg Kroah-Hartman --- drivers/mtd/nand/nand_base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index ce7b2cab5762..54ab48827258 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -2586,7 +2586,7 @@ static int nand_do_write_ops(struct mtd_info *mtd, loff_t to, int cached = writelen > bytes && page != blockmask; uint8_t *wbuf = buf; int use_bufpoi; - int part_pagewr = (column || writelen < (mtd->writesize - 1)); + int part_pagewr = (column || writelen < mtd->writesize); if (part_pagewr) use_bufpoi = 1; From 4733b66d45d4452155a123b12dfeba3edba0facd Mon Sep 17 00:00:00 2001 From: Jia He Date: Tue, 2 Aug 2016 14:02:31 -0700 Subject: [PATCH 249/813] mm/hugetlb: avoid soft lockup in set_max_huge_pages() commit 649920c6ab93429b94bc7c1aa7c0e8395351be32 upstream. In powerpc servers with large memory(32TB), we watched several soft lockups for hugepage under stress tests. The call traces are as follows: 1. get_page_from_freelist+0x2d8/0xd50 __alloc_pages_nodemask+0x180/0xc20 alloc_fresh_huge_page+0xb0/0x190 set_max_huge_pages+0x164/0x3b0 2. prep_new_huge_page+0x5c/0x100 alloc_fresh_huge_page+0xc8/0x190 set_max_huge_pages+0x164/0x3b0 This patch fixes such soft lockups. It is safe to call cond_resched() there because it is out of spin_lock/unlock section. Link: http://lkml.kernel.org/r/1469674442-14848-1-git-send-email-hejianet@gmail.com Signed-off-by: Jia He Reviewed-by: Naoya Horiguchi Acked-by: Michal Hocko Acked-by: Dave Hansen Cc: Mike Kravetz Cc: "Kirill A. Shutemov" Cc: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/hugetlb.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ef6963b577fd..0c31f184daf8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2170,6 +2170,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * and reducing the surplus. 
 		 */
 		spin_unlock(&hugetlb_lock);
+
+		/* yield cpu to avoid soft lockup */
+		cond_resched();
+
 		if (hstate_is_gigantic(h))
 			ret = alloc_fresh_gigantic_page(h, nodes_allowed);
 		else

From 7e4a2f979dd707742352dce6409916bc5788d7f8 Mon Sep 17 00:00:00 2001
From: Takashi Iwai
Date: Wed, 3 Aug 2016 15:13:00 +0200
Subject: [PATCH 250/813] ALSA: hda: Fix krealloc() with __GFP_ZERO usage

commit 33baefe5e72f17a6df378e48196cd8cada11deec upstream.

krealloc() doesn't always work properly with the __GFP_ZERO flag as
expected. For clearing the reallocated area, we need to clear it
explicitly instead.

Reported-by: Joe Perches
Signed-off-by: Takashi Iwai
Signed-off-by: Greg Kroah-Hartman
---
 sound/hda/array.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sound/hda/array.c b/sound/hda/array.c
index 516795baa7db..5dfa610e4471 100644
--- a/sound/hda/array.c
+++ b/sound/hda/array.c
@@ -21,13 +21,15 @@ void *snd_array_new(struct snd_array *array)
 		return NULL;
 	if (array->used >= array->alloced) {
 		int num = array->alloced + array->alloc_align;
+		int oldsize = array->alloced * array->elem_size;
 		int size = (num + 1) * array->elem_size;
 		void *nlist;
 		if (snd_BUG_ON(num >= 4096))
 			return NULL;
-		nlist = krealloc(array->list, size, GFP_KERNEL | __GFP_ZERO);
+		nlist = krealloc(array->list, size, GFP_KERNEL);
 		if (!nlist)
 			return NULL;
+		memset(nlist + oldsize, 0, size - oldsize);
 		array->list = nlist;
 		array->alloced = num;
 	}

From 39d505450f84aef77377c1fe0757e6b906132112 Mon Sep 17 00:00:00 2001
From: Hui Wang
Date: Mon, 1 Aug 2016 10:20:32 +0800
Subject: [PATCH 251/813] ALSA: hda/realtek - Can't adjust speaker's volume on
 a Dell AIO

commit dd9aa335c88003d131ac874e7f6809902de0b847 upstream.

We have a Dell AIO on which we can't adjust the speaker's volume. The
problem is that it is connected to an Audio Output node without Amp-out
capability. To fix it, we change it to be connected to a node with
Amp-out capability.

Signed-off-by: Hui Wang
Signed-off-by: Takashi Iwai
Signed-off-by: Greg Kroah-Hartman
---
 sound/pci/hda/patch_realtek.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index abcb5a6a1cd9..c3f02aa294a3 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -4674,6 +4674,22 @@ static void alc290_fixup_mono_speakers(struct hda_codec *codec,
 	}
 }

+static void alc298_fixup_speaker_volume(struct hda_codec *codec,
+					const struct hda_fixup *fix, int action)
+{
+	if (action == HDA_FIXUP_ACT_PRE_PROBE) {
+		/* The speaker is routed to Node 0x06 by mistake; as a
+		   result we can't adjust the speaker's volume, since that
+		   node has no Amp-out capability. Change the speaker's
+		   route to: Node 0x02 (Audio Output) -> Node 0x0c (Audio
+		   Mixer) -> Node 0x17 (Pin Complex); since Node 0x02 has
+		   Amp-out caps, we can adjust the speaker's volume now.
+		 */
+
+		hda_nid_t conn1[1] = { 0x0c };
+		snd_hda_override_conn_list(codec, 0x17, 1, conn1);
+	}
+}
+
 /* Hook to update amp GPIO4 for automute */
 static void alc280_hp_gpio4_automute_hook(struct hda_codec *codec,
 					  struct hda_jack_callback *jack)
@@ -4823,6 +4839,7 @@ enum {
 	ALC280_FIXUP_HP_HEADSET_MIC,
 	ALC221_FIXUP_HP_FRONT_MIC,
 	ALC292_FIXUP_TPT460,
+	ALC298_FIXUP_SPK_VOLUME,
 };

 static const struct hda_fixup alc269_fixups[] = {
@@ -5478,6 +5495,10 @@ static const struct hda_fixup alc269_fixups[] = {
 		.chained = true,
 		.chain_id = ALC293_FIXUP_LENOVO_SPK_NOISE,
 	},
+	[ALC298_FIXUP_SPK_VOLUME] = {
+		.type = HDA_FIXUP_FUNC,
+		.v.func = alc298_fixup_speaker_volume,
+	},
 };

 static const struct snd_pci_quirk alc269_fixup_tbl[] = {
@@ -5524,6 +5545,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
 	SND_PCI_QUIRK(0x1028, 0x0704, "Dell XPS 13 9350", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE),
 	SND_PCI_QUIRK(0x1028, 0x0725, "Dell Inspiron 3162", ALC255_FIXUP_DELL_SPK_NOISE),
 	SND_PCI_QUIRK(0x1028, 0x075b, "Dell XPS 13 9360", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE),
+	SND_PCI_QUIRK(0x1028, 0x075d, "Dell AIO", ALC298_FIXUP_SPK_VOLUME),
 	SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
 	SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
 	SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2),

From f39db7fb96a0f2ab9d0fb34b9cc64c272abd6c21 Mon Sep 17 00:00:00 2001
From: Maruthi Srinivas Bayyavarapu
Date: Wed, 3 Aug 2016 16:46:39 +0530
Subject: [PATCH 252/813] ALSA: hda: add AMD Bonaire AZ PCI ID with proper
 driver caps

commit fd48331f9b71d2add941adaee3619f5b8527182d upstream.

This commit fixes garbled audio on Bonaire HDMI.

Signed-off-by: Maruthi Bayyavarapu
Reviewed-by: Alex Deucher
Signed-off-by: Takashi Iwai
Signed-off-by: Greg Kroah-Hartman
---
 sound/pci/hda/hda_intel.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c
index 8218cace8fea..e769e5764cba 100644
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -2288,6 +2288,8 @@ static const struct pci_device_id azx_ids[] = {
 	{ PCI_DEVICE(0x1022, 0x780d),
 	  .driver_data = AZX_DRIVER_GENERIC | AZX_DCAPS_PRESET_ATI_SB },
 	/* ATI HDMI */
+	{ PCI_DEVICE(0x1002, 0x0002),
+	  .driver_data = AZX_DRIVER_ATIHDMI_NS | AZX_DCAPS_PRESET_ATI_HDMI_NS },
 	{ PCI_DEVICE(0x1002, 0x1308),
 	  .driver_data = AZX_DRIVER_ATIHDMI_NS | AZX_DCAPS_PRESET_ATI_HDMI_NS },
 	{ PCI_DEVICE(0x1002, 0x157a),

From 148a19e60566ac9a72381bbfd51d00a8c264949b Mon Sep 17 00:00:00 2001
From: Hui Wang
Date: Thu, 4 Aug 2016 15:28:04 +0800
Subject: [PATCH 253/813] ALSA: hda - Fix headset mic detection problem for
 two dell machines

commit 59ec4b57bcaede46546d54d037a21004b9aa5cef upstream.

One of the machines has an ALC255 on it, the other has an ALC298. The
machine with the ALC298 also has the speaker volume problem, so we add
the fixup chained to ALC298_FIXUP_SPK_VOLUME rather than adding a group
of pin definitions in the pin quirk table, since the speaker volume
problem does not happen on other machines yet.
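How a chained fixup resolves is worth a miniature: each table entry may
name a successor, and the driver walks the chain so that one quirk
match picks up several fixes in order. Standalone model (the names echo
the HDA fixup table; the mechanics are simplified):

#include <stdio.h>

struct fixup {
	const char *name;
	void (*func)(void);
	int chained;
	int chain_id; /* next fixup index when chained */
};

static void fix_spk_volume(void) { puts("reroute speaker pin"); }
static void fix_headset_mic(void) { puts("set up headset mic"); }

enum { FIX_SPK_VOLUME, FIX_HEADSET_MIC };

static const struct fixup fixups[] = {
	[FIX_SPK_VOLUME] = { "spk-volume", fix_spk_volume,
			     1, FIX_HEADSET_MIC },
	[FIX_HEADSET_MIC] = { "headset-mic", fix_headset_mic, 0, 0 },
};

static void apply_fixup(int id)
{
	for (;;) {
		const struct fixup *f = &fixups[id];

		f->func();
		if (!f->chained)
			break;
		id = f->chain_id;
	}
}

int main(void)
{
	apply_fixup(FIX_SPK_VOLUME); /* runs both fixes, in order */
	return 0;
}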
Signed-off-by: Hui Wang Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index c3f02aa294a3..f25479ba3981 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5498,6 +5498,8 @@ static const struct hda_fixup alc269_fixups[] = { [ALC298_FIXUP_SPK_VOLUME] = { .type = HDA_FIXUP_FUNC, .v.func = alc298_fixup_speaker_volume, + .chained = true, + .chain_id = ALC298_FIXUP_DELL1_MIC_NO_PRESENCE, }, }; @@ -5820,6 +5822,10 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { {0x14, 0x90170130}, {0x1b, 0x01014020}, {0x21, 0x0221103f}), + SND_HDA_PIN_QUIRK(0x10ec0255, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, + {0x14, 0x90170130}, + {0x1b, 0x02011020}, + {0x21, 0x0221103f}), SND_HDA_PIN_QUIRK(0x10ec0255, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, {0x14, 0x90170150}, {0x1b, 0x02011020}, From 02773ea7eddad4b35bc2812d3e7743ee48430d4b Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Fri, 17 Jun 2016 15:33:31 +0300 Subject: [PATCH 254/813] IB/mlx5: Fix MODIFY_QP command input structure commit e3353c268b06236d6c40fa1714c114f21f44451c upstream. Make MODIFY_QP command input structure compliant to specification Fixes: e126ba97dba9 ('mlx5: Add driver for Mellanox Connect-IB adapters') Signed-off-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- include/linux/mlx5/qp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index f079fb1a31f7..554a5ef50c39 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -534,9 +534,9 @@ struct mlx5_destroy_qp_mbox_out { struct mlx5_modify_qp_mbox_in { struct mlx5_inbox_hdr hdr; __be32 qpn; - u8 rsvd1[4]; - __be32 optparam; u8 rsvd0[4]; + __be32 optparam; + u8 rsvd1[4]; struct mlx5_qp_context ctx; }; From bae7400e1e13b1ce617eae73227218d7bff3d829 Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Sat, 4 Jun 2016 15:15:34 +0300 Subject: [PATCH 255/813] IB/mlx5: Fix entries checks in mlx5_ib_create_cq commit 9ea578528656e191c1097798a771ff08bab6f323 upstream. Number of entries shouldn't be greater than the device's max capability. This should be checked before rounding the entries number to power of two. Fixes: 51ee86a4af639 ('IB/mlx5: Fix check of number of entries...') Signed-off-by: Majd Dibbiny Signed-off-by: Noa Osherovich Signed-off-by: Leon Romanovsky Reviewed-by: Sagi Grimberg Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/mlx5/cq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 92ddae101ecc..44fec25ace65 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -763,7 +763,8 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, if (attr->flags) return ERR_PTR(-EINVAL); - if (entries < 0) + if (entries < 0 || + (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)))) return ERR_PTR(-EINVAL); entries = roundup_pow_of_two(entries + 1); From cbbfde038e42029e4dfbb303ab5f825fd743973e Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Sat, 4 Jun 2016 15:15:32 +0300 Subject: [PATCH 256/813] IB/mlx5: Fix returned values of query QP commit 0540d8148d419bf769e5aa99c77027febd8922f0 upstream. 
Some variables were not initialized properly: max_recv_wr, max_recv_sge, max_send_wr, qp_context and max_inline_data. Fixes: e126ba97dba9 ('mlx5: Add driver for Mellanox Connect-IB...') Signed-off-by: Noa Osherovich Signed-off-by: Leon Romanovsky Reviewed-by: Sagi Grimberg Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/mlx5/qp.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 307bdbca8938..d16ffc90b159 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -226,6 +226,8 @@ static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, qp->rq.max_gs = 0; qp->rq.wqe_cnt = 0; qp->rq.wqe_shift = 0; + cap->max_recv_wr = 0; + cap->max_recv_sge = 0; } else { if (ucmd) { qp->rq.wqe_cnt = ucmd->rq_wqe_count; @@ -3092,17 +3094,19 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr qp_attr->cap.max_recv_sge = qp->rq.max_gs; if (!ibqp->uobject) { - qp_attr->cap.max_send_wr = qp->sq.wqe_cnt; + qp_attr->cap.max_send_wr = qp->sq.max_post; qp_attr->cap.max_send_sge = qp->sq.max_gs; + qp_init_attr->qp_context = ibqp->qp_context; } else { qp_attr->cap.max_send_wr = 0; qp_attr->cap.max_send_sge = 0; } - /* We don't support inline sends for kernel QPs (yet), and we - * don't know what userspace's value should be. - */ - qp_attr->cap.max_inline_data = 0; + qp_init_attr->qp_type = ibqp->qp_type; + qp_init_attr->recv_cq = ibqp->recv_cq; + qp_init_attr->send_cq = ibqp->send_cq; + qp_init_attr->srq = ibqp->srq; + qp_attr->cap.max_inline_data = qp->max_inline_data; qp_init_attr->cap = qp_attr->cap; From a25be0f972d74aa4bd3efe8951eb3c66d605c650 Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Sat, 4 Jun 2016 15:15:35 +0300 Subject: [PATCH 257/813] IB/mlx5: Fix entries check in mlx5_ib_resize_cq commit 3c4c37746c919c983e439ac6a7328cd2d48c10ed upstream. Verify that number of entries is less than device capability. Add an appropriate warning message for error flow. Fixes: bde51583f49b ('IB/mlx5: Add support for resize CQ') Signed-off-by: Majd Dibbiny Signed-off-by: Noa Osherovich Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/mlx5/cq.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 44fec25ace65..8184267c7901 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -1095,11 +1095,16 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) return -ENOSYS; } - if (entries < 1) + if (entries < 1 || + entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) { + mlx5_ib_warn(dev, "wrong entries number %d, max %d\n", + entries, + 1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)); return -EINVAL; + } entries = roundup_pow_of_two(entries + 1); - if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)) + 1) + if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)) + 1) return -EINVAL; if (entries == ibcq->cqe + 1) From f868cae619b0b6e56afca0d6ee5377d5855f64f1 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 22 Jun 2016 17:27:26 +0300 Subject: [PATCH 258/813] IB/mlx5: Fix post send fence logic commit c9b254955b9f8814966f5dabd34c39d0e0a2b437 upstream. 
If the caller specified IB_SEND_FENCE in the send flags of the work
request, and no previous work request stated that the successive one
should be fenced, the work request would be executed without a fence.
This could result in the failure of RDMA read or atomic operations due
to an MR being invalidated. Fix this by adding the mlx5 enumeration for
fencing RDMA/atomic operations and fixing the logic that applies it.

Fixes: e126ba97dba9 ('mlx5: Add driver for Mellanox Connect-IB adapters')
Signed-off-by: Eli Cohen
Signed-off-by: Leon Romanovsky
Signed-off-by: Doug Ledford
Signed-off-by: Greg Kroah-Hartman
---
 drivers/infiniband/hw/mlx5/qp.c | 7 ++++---
 include/linux/mlx5/qp.h         | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index d16ffc90b159..cfcfbb6b84d7 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -2527,10 +2527,11 @@ static u8 get_fence(u8 fence, struct ib_send_wr *wr)
 			return MLX5_FENCE_MODE_SMALL_AND_FENCE;
 		else
 			return fence;
-
-	} else {
-		return 0;
+	} else if (unlikely(wr->send_flags & IB_SEND_FENCE)) {
+		return MLX5_FENCE_MODE_FENCE;
 	}
+
+	return 0;
 }

 static int begin_wqe(struct mlx5_ib_qp *qp, void **seg,
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 554a5ef50c39..a8786d27ab81 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -160,6 +160,7 @@ enum {
 enum {
 	MLX5_FENCE_MODE_NONE			= 0 << 5,
 	MLX5_FENCE_MODE_INITIATOR_SMALL		= 1 << 5,
+	MLX5_FENCE_MODE_FENCE			= 2 << 5,
 	MLX5_FENCE_MODE_STRONG_ORDERING		= 3 << 5,
 	MLX5_FENCE_MODE_SMALL_AND_FENCE		= 4 << 5,
 };

From d1859e0e73e50397d958cc27d53129c2a33e6880 Mon Sep 17 00:00:00 2001
From: Noa Osherovich
Date: Sat, 4 Jun 2016 15:15:29 +0300
Subject: [PATCH 259/813] IB/mlx5: Return PORT_ERR in Active to Initializing
 transition

commit 2788cf3bd90af3791c3195c52391bcf34fa67b40 upstream.

FW port-change events are fired on Active <-> non-Active port state
transitions only. When the port state changes from Active to
Initializing (Active -> Down -> Initializing), a single event is fired.

The HCA transitions from Down to Initializing unless prevented from
doing so, hence the driver should also propagate events when the port
state is Initializing to consumers, so they'll be aware that the port
is no longer Active and act accordingly.

Fixes: e126ba97dba9e ('mlx5: Add driver for Mellanox Connect-IB...')
Signed-off-by: Noa Osherovich
Signed-off-by: Leon Romanovsky
Signed-off-by: Doug Ledford
Signed-off-by: Greg Kroah-Hartman
---
 drivers/infiniband/hw/mlx5/main.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index fd17443aeacd..bfc940ff9c8a 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -962,14 +962,11 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
 		break;

 	case MLX5_DEV_EVENT_PORT_DOWN:
+	case MLX5_DEV_EVENT_PORT_INITIALIZED:
 		ibev.event = IB_EVENT_PORT_ERR;
 		port = (u8)param;
 		break;

-	case MLX5_DEV_EVENT_PORT_INITIALIZED:
-		/* not used by ULPs */
-		return;
-
 	case MLX5_DEV_EVENT_LID_CHANGE:
 		ibev.event = IB_EVENT_LID_CHANGE;
 		port = (u8)param;

From 041a8254284b766ba90425b1576f86f72b7dfbf2 Mon Sep 17 00:00:00 2001
From: Mark Bloch
Date: Fri, 6 May 2016 22:45:27 +0300
Subject: [PATCH 260/813] IB/SA: Use correct free function

commit 0f377d86252d11bfea941852785e3094b93601a7 upstream.

Fixes a direct call to kfree_skb when nlmsg_free should be used.
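A general note on the pairing rule this fix enforces: release a buffer
through the counterpart of the API that produced it. In the kernel,
nlmsg_free() happens to be a thin wrapper around kfree_skb(), so the
change is about layering rather than behavior. A standalone model with
stand-in allocators:

#include <stdlib.h>

struct skb { char *data; };

static struct skb *nlmsg_new(size_t len)
{
	struct skb *s = malloc(sizeof(*s));

	if (s && !(s->data = calloc(1, len))) {
		free(s);
		return NULL;
	}
	return s;
}

/* The netlink-level counterpart of nlmsg_new(); error paths should
 * call this rather than reaching below the abstraction. */
static void nlmsg_free(struct skb *s)
{
	if (s) {
		free(s->data);
		free(s);
	}
}

int main(void)
{
	struct skb *s = nlmsg_new(128);

	if (!s)
		return 1;
	/* ... suppose message construction fails here ... */
	nlmsg_free(s); /* not a lower-level free of the raw buffer */
	return 0;
}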
Fixes: 2ca546b92a02 ('IB/sa: Route SA pathrecord query through netlink') Signed-off-by: Mark Bloch Reviewed-by: Leon Romanovsky Signed-off-by: Leon Romanovsky Reviewed-by: Ira Weiny Reviewed-by: Steve Wise Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/core/sa_query.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index a95a32ba596e..d3b7ecd106f7 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -534,7 +534,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS, RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST); if (!data) { - kfree_skb(skb); + nlmsg_free(skb); return -EMSGSIZE; } From 9bb807338af3c4dcef05ad979394ec4effffeb56 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Sat, 4 Jun 2016 15:15:19 +0300 Subject: [PATCH 261/813] IB/IPoIB: Don't update neigh validity for unresolved entries commit 61c78eea9516a921799c17b4c20558e2aa780fd3 upstream. ipoib_neigh_get unconditionally updates the "alive" variable member on any packet send. This prevents the neighbor garbage collection from cleaning out a dead neighbor entry if we are still queueing packets for it. If the queue for this neighbor is full, then don't update the alive timestamp. That way the neighbor can time out even if packets are still being queued as long as none of them are being sent. Fixes: b63b70d87741 ("IPoIB: Use a private hash table for path lookup in xmit path") Signed-off-by: Erez Shitrit Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 7d3281866ffc..942dffca6a9d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1131,7 +1131,9 @@ struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr) neigh = NULL; goto out_unlock; } - neigh->alive = jiffies; + + if (likely(skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)) + neigh->alive = jiffies; goto out_unlock; } } From 1d13a91a689fc8b7f6bdbc00adc5322dc9e338d0 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Fri, 6 May 2016 22:45:24 +0300 Subject: [PATCH 262/813] IB/IWPM: Fix a potential skb leak commit 5ed935e861a4cbf2158ad3386d6d26edd60d2658 upstream. In case ibnl_put_msg fails in send_nlmsg_done, the function returns with -ENOMEM without freeing. This patch fixes this behavior. 
Fixes: 30dc5e63d6a5 ("RDMA/core: Add support for iWARP Port Mapper user space service") Signed-off-by: Mark Bloch Reviewed-by: Leon Romanovsky Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/core/iwpm_util.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index 5fb089e91353..fb43a242847b 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -634,6 +634,7 @@ static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { pr_warn("%s Unable to put NLMSG_DONE\n", __func__); + dev_kfree_skb(skb); return -ENOMEM; } nlh->nlmsg_type = NLMSG_DONE; From 155c27dd5575e9a5466587ece499c61d9089615a Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 22 Jun 2016 17:27:28 +0300 Subject: [PATCH 263/813] IB/mlx4: Fix the SQ size of an RC QP commit f2940e2c76bb554a7fbdd28ca5b90904117a9e96 upstream. When calculating the required size of an RC QP send queue, leave enough space for masked atomic operations, which require more space than "regular" atomic operation. Fixes: 6fa8f719844b ("IB/mlx4: Add support for masked atomic operations") Signed-off-by: Yishai Hadas Reviewed-by: Jack Morgenstein Reviewed-by: Eran Ben Elisha Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/mlx4/qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 13eaaf45288f..482e9783641a 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -357,7 +357,7 @@ static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags) sizeof (struct mlx4_wqe_raddr_seg); case MLX4_IB_QPT_RC: return sizeof (struct mlx4_wqe_ctrl_seg) + - sizeof (struct mlx4_wqe_atomic_seg) + + sizeof (struct mlx4_wqe_masked_atomic_seg) + sizeof (struct mlx4_wqe_raddr_seg); case MLX4_IB_QPT_SMI: case MLX4_IB_QPT_GSI: From d057209617da2af6e2d40dc1473c85813fb443f8 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 22 Jun 2016 17:27:29 +0300 Subject: [PATCH 264/813] IB/mlx4: Fix error flow when sending mads under SRIOV commit a6100603a4a87fc436199362bdb81cb849faaf6e upstream. Fix mad send error flow to prevent double freeing address handles, and leaking tx_ring entries when SRIOV is active. If ib_mad_post_send fails, the address handle pointer in the tx_ring entry must be set to NULL (or there will be a double-free) and tx_tail must be incremented (or there will be a leak of tx_ring entries). The tx_ring is handled the same way in the send-completion handler. 
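The invariant being restored here, in miniature: whether a slot retires
through the completion handler or through the error path, the same two
updates must happen, i.e. the address-handle pointer is cleared and the
tail index advances. A standalone sketch of that bookkeeping (ring size
and types are illustrative):

#include <stdio.h>

#define RING_SIZE 8

struct slot { void *ah; /* address handle owned by this entry */ };

static struct slot ring[RING_SIZE];
static unsigned int head, tail;

/* Shared retire step; the completion handler does the same thing. */
static void retire_slot(unsigned int ix)
{
	ring[ix].ah = NULL;
	tail++;
}

static int post_send(unsigned int ix, int simulate_failure)
{
	if (simulate_failure) {
		retire_slot(ix); /* error path mirrors completion path */
		return -1;
	}
	return 0;
}

int main(void)
{
	static int dummy_ah;
	unsigned int ix = head++ & (RING_SIZE - 1);

	ring[ix].ah = &dummy_ah;
	if (post_send(ix, 1))
		printf("send failed: tail=%u ah=%p (no leak, no double free)\n",
		       tail, ring[ix].ah);
	return 0;
}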
Fixes: 37bfc7c1e83f ("IB/mlx4: SR-IOV multiplex and demultiplex MADs") Signed-off-by: Yishai Hadas Reviewed-by: Jack Morgenstein Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/mlx4/mad.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 870e56b6b25f..05179f47bbde 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -526,7 +526,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, tun_tx_ix = (++tun_qp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1); spin_unlock(&tun_qp->tx_lock); if (ret) - goto out; + goto end; tun_mad = (struct mlx4_rcv_tunnel_mad *) (tun_qp->tx_ring[tun_tx_ix].buf.addr); if (tun_qp->tx_ring[tun_tx_ix].ah) @@ -595,9 +595,15 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, wr.wr.send_flags = IB_SEND_SIGNALED; ret = ib_post_send(src_qp, &wr.wr, &bad_wr); -out: - if (ret) - ib_destroy_ah(ah); + if (!ret) + return 0; + out: + spin_lock(&tun_qp->tx_lock); + tun_qp->tx_ix_tail++; + spin_unlock(&tun_qp->tx_lock); + tun_qp->tx_ring[tun_tx_ix].ah = NULL; +end: + ib_destroy_ah(ah); return ret; } @@ -1278,9 +1284,15 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, ret = ib_post_send(send_qp, &wr.wr, &bad_wr); + if (!ret) + return 0; + + spin_lock(&sqp->tx_lock); + sqp->tx_ix_tail++; + spin_unlock(&sqp->tx_lock); + sqp->tx_ring[wire_tx_ix].ah = NULL; out: - if (ret) - ib_destroy_ah(ah); + ib_destroy_ah(ah); return ret; } From fd8c10a80ee746348a59c9a98dd6a7eeec73fc28 Mon Sep 17 00:00:00 2001 From: Dotan Barak Date: Wed, 22 Jun 2016 17:27:31 +0300 Subject: [PATCH 265/813] IB/mlx4: Fix memory leak if QP creation failed commit 5b420d9cf7382c6e1512e96e02d18842d272049c upstream. When RC, UC, or RAW QPs are created, a qp object is allocated (kzalloc). If at a later point (in procedure create_qp_common) the qp creation fails, this qp object must be freed. Fixes: 1ffeb2eb8be99 ("IB/mlx4: SR-IOV IB context objects and proxy/tunnel SQP support") Signed-off-by: Dotan Barak Signed-off-by: Jack Morgenstein Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/mlx4/qp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 482e9783641a..ea1e2ddaddf5 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1162,8 +1162,10 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, { err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, 0, &qp, gfp); - if (err) + if (err) { + kfree(qp); return ERR_PTR(err); + } qp->ibqp.qp_num = qp->mqp.qpn; qp->xrcdn = xrcdn; From 2c00c2171c0d525f49e8c36d79082ee33fba8b10 Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Thu, 16 Jun 2016 10:51:46 -0700 Subject: [PATCH 266/813] of: fix memory leak related to safe_name() commit d9fc880723321dbf16b2981e3f3e916b73942210 upstream. Fix a memory leak resulting from memory allocation in safe_name(). This patch fixes all call sites of safe_name(). Mathieu Malaterre reported the memory leak on boot: On my PowerMac device-tree would generate a duplicate name: [ 0.023043] device-tree: Duplicate name in PowerPC,G4@0, renamed to "l2-cache#1" in this case a newly allocated name is generated by `safe_name`. However in this case it is never deallocated. 
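The ownership rule the fix adopts is the heart of it: make the helper
always return freshly allocated memory, even when no renaming was
needed, so every caller can free the result unconditionally. Standalone
model (strdup/snprintf stand in for the kernel's kstrdup/kasprintf):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Always returns a heap copy; the caller must free() it. */
static char *safe_name(const char *orig, int already_taken)
{
	char *name;
	size_t len;

	if (!already_taken)
		return strdup(orig); /* uniform ownership either way */

	len = strlen(orig) + 16;
	name = malloc(len);
	if (name)
		snprintf(name, len, "%s#1", orig);
	return name;
}

int main(void)
{
	char *a = safe_name("l2-cache", 0);
	char *b = safe_name("l2-cache", 1);

	printf("%s / %s\n", a, b);
	free(a); /* safe regardless of which branch ran */
	free(b);
	return 0;
}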
The bug was found using kmemleak reported as: unreferenced object 0xdf532e60 (size 32): comm "swapper", pid 1, jiffies 4294892300 (age 1993.532s) hex dump (first 32 bytes): 6c 32 2d 63 61 63 68 65 23 31 00 dd e4 dd 1e c2 l2-cache#1...... ec d4 ba ce 04 ec cc de 8e 85 e9 ca c4 ec cc 9e ................ backtrace: [] kvasprintf+0x64/0xc8 [] kasprintf+0x4c/0x5c [] safe_name.isra.1+0x80/0xc4 [] __of_attach_node_sysfs+0x6c/0x11c [] of_core_init+0x8c/0xf8 [] kernel_init_freeable+0xd4/0x208 [] kernel_init+0x24/0x11c [] ret_from_kernel_thread+0x5c/0x64 Link: https://bugzilla.kernel.org/show_bug.cgi?id=120331 Signed-off-by: Frank Rowand Reported-by: mathieu.malaterre@gmail.com Tested-by: Mathieu Malaterre Signed-off-by: Rob Herring Signed-off-by: Greg Kroah-Hartman --- drivers/of/base.c | 30 +++++++++++++++++++++--------- drivers/of/dynamic.c | 2 +- drivers/of/of_private.h | 3 +++ 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/drivers/of/base.c b/drivers/of/base.c index 017dd94f16ea..942461f36616 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -112,6 +112,7 @@ static ssize_t of_node_property_read(struct file *filp, struct kobject *kobj, return memory_read_from_buffer(buf, count, &offset, pp->value, pp->length); } +/* always return newly allocated name, caller must free after use */ static const char *safe_name(struct kobject *kobj, const char *orig_name) { const char *name = orig_name; @@ -126,9 +127,12 @@ static const char *safe_name(struct kobject *kobj, const char *orig_name) name = kasprintf(GFP_KERNEL, "%s#%i", orig_name, ++i); } - if (name != orig_name) + if (name == orig_name) { + name = kstrdup(orig_name, GFP_KERNEL); + } else { pr_warn("device-tree: Duplicate name in %s, renamed to \"%s\"\n", kobject_name(kobj), name); + } return name; } @@ -159,6 +163,7 @@ int __of_add_property_sysfs(struct device_node *np, struct property *pp) int __of_attach_node_sysfs(struct device_node *np) { const char *name; + struct kobject *parent; struct property *pp; int rc; @@ -171,15 +176,16 @@ int __of_attach_node_sysfs(struct device_node *np) np->kobj.kset = of_kset; if (!np->parent) { /* Nodes without parents are new top level trees */ - rc = kobject_add(&np->kobj, NULL, "%s", - safe_name(&of_kset->kobj, "base")); + name = safe_name(&of_kset->kobj, "base"); + parent = NULL; } else { name = safe_name(&np->parent->kobj, kbasename(np->full_name)); - if (!name || !name[0]) - return -EINVAL; - - rc = kobject_add(&np->kobj, &np->parent->kobj, "%s", name); + parent = &np->parent->kobj; } + if (!name) + return -ENOMEM; + rc = kobject_add(&np->kobj, parent, "%s", name); + kfree(name); if (rc) return rc; @@ -1753,6 +1759,12 @@ int __of_remove_property(struct device_node *np, struct property *prop) return 0; } +void __of_sysfs_remove_bin_file(struct device_node *np, struct property *prop) +{ + sysfs_remove_bin_file(&np->kobj, &prop->attr); + kfree(prop->attr.attr.name); +} + void __of_remove_property_sysfs(struct device_node *np, struct property *prop) { if (!IS_ENABLED(CONFIG_SYSFS)) @@ -1760,7 +1772,7 @@ void __of_remove_property_sysfs(struct device_node *np, struct property *prop) /* at early boot, bail here and defer setup to of_init() */ if (of_kset && of_node_is_attached(np)) - sysfs_remove_bin_file(&np->kobj, &prop->attr); + __of_sysfs_remove_bin_file(np, prop); } /** @@ -1830,7 +1842,7 @@ void __of_update_property_sysfs(struct device_node *np, struct property *newprop return; if (oldprop) - sysfs_remove_bin_file(&np->kobj, &oldprop->attr); + __of_sysfs_remove_bin_file(np, oldprop); 
__of_add_property_sysfs(np, newprop); } diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c index 53826b84e0ec..2d72ddcf534f 100644 --- a/drivers/of/dynamic.c +++ b/drivers/of/dynamic.c @@ -55,7 +55,7 @@ void __of_detach_node_sysfs(struct device_node *np) /* only remove properties if on sysfs */ if (of_node_is_attached(np)) { for_each_property_of_node(np, pp) - sysfs_remove_bin_file(&np->kobj, &pp->attr); + __of_sysfs_remove_bin_file(np, pp); kobject_del(&np->kobj); } diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h index 8e882e706cd8..46ddbee22ce3 100644 --- a/drivers/of/of_private.h +++ b/drivers/of/of_private.h @@ -81,6 +81,9 @@ extern int __of_attach_node_sysfs(struct device_node *np); extern void __of_detach_node(struct device_node *np); extern void __of_detach_node_sysfs(struct device_node *np); +extern void __of_sysfs_remove_bin_file(struct device_node *np, + struct property *prop); + /* iterators for transactions, used for overlays */ /* forward iterator */ #define for_each_transaction_entry(_oft, _te) \ From 752aaae53df7cf134f02285505174a74a2215e3e Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Thu, 23 Jun 2016 19:30:38 +0200 Subject: [PATCH 267/813] ubi: Make volume resize power cut aware commit 4946784bd3924b1374f05eebff2fd68660bae866 upstream. When the volume resize operation shrinks a volume, LEBs will be unmapped. Since unmapping will not erase these LEBs immediately we have to wait for that operation to finish. Otherwise in case of a power cut right after writing the new volume table the UBI attach process can find more LEBs than the volume table knows. This will render the UBI image unattachable. Fix this issue by waiting for erase to complete and write the new volume table afterward. Reported-by: Boris Brezillon Reviewed-by: Boris Brezillon Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- drivers/mtd/ubi/vmt.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c index 1ae17bb9b889..3ea4c022cbb9 100644 --- a/drivers/mtd/ubi/vmt.c +++ b/drivers/mtd/ubi/vmt.c @@ -488,13 +488,6 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) spin_unlock(&ubi->volumes_lock); } - /* Change volume table record */ - vtbl_rec = ubi->vtbl[vol_id]; - vtbl_rec.reserved_pebs = cpu_to_be32(reserved_pebs); - err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec); - if (err) - goto out_acc; - if (pebs < 0) { for (i = 0; i < -pebs; i++) { err = ubi_eba_unmap_leb(ubi, vol, reserved_pebs + i); @@ -512,6 +505,24 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) spin_unlock(&ubi->volumes_lock); } + /* + * When we shrink a volume we have to flush all pending (erase) work. + * Otherwise it can happen that upon next attach UBI finds a LEB with + * lnum > highest_lnum and refuses to attach. 
+ */ + if (pebs < 0) { + err = ubi_wl_flush(ubi, vol_id, UBI_ALL); + if (err) + goto out_acc; + } + + /* Change volume table record */ + vtbl_rec = ubi->vtbl[vol_id]; + vtbl_rec.reserved_pebs = cpu_to_be32(reserved_pebs); + err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec); + if (err) + goto out_acc; + vol->reserved_pebs = reserved_pebs; if (vol->vol_type == UBI_DYNAMIC_VOLUME) { vol->used_ebs = reserved_pebs; From 79d6bc128c0b3370acbfedf7da44e64678c25c27 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Mon, 4 Jul 2016 22:06:51 +0200 Subject: [PATCH 268/813] ubi: Fix early logging commit bc743f34dfa011e62edd0ea4ae8455be06c083b5 upstream. We cannot use ubi_* logging functions before the UBI object is initialized. Fixes: 3260870331 ("UBI: Extend UBI layer debug/messaging capabilities") Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- drivers/mtd/ubi/build.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 22fd19c0c5d3..f3798cc21fe5 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -869,7 +869,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, for (i = 0; i < UBI_MAX_DEVICES; i++) { ubi = ubi_devices[i]; if (ubi && mtd->index == ubi->mtd->index) { - ubi_err(ubi, "mtd%d is already attached to ubi%d", + pr_err("ubi: mtd%d is already attached to ubi%d", mtd->index, i); return -EEXIST; } @@ -884,7 +884,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, * no sense to attach emulated MTD devices, so we prohibit this. */ if (mtd->type == MTD_UBIVOLUME) { - ubi_err(ubi, "refuse attaching mtd%d - it is already emulated on top of UBI", + pr_err("ubi: refuse attaching mtd%d - it is already emulated on top of UBI", mtd->index); return -EINVAL; } @@ -895,7 +895,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, if (!ubi_devices[ubi_num]) break; if (ubi_num == UBI_MAX_DEVICES) { - ubi_err(ubi, "only %d UBI devices may be created", + pr_err("ubi: only %d UBI devices may be created", UBI_MAX_DEVICES); return -ENFILE; } @@ -905,7 +905,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, /* Make sure ubi_num is not busy */ if (ubi_devices[ubi_num]) { - ubi_err(ubi, "already exists"); + pr_err("ubi: ubi%i already exists", ubi_num); return -EEXIST; } } From 4056337b1e81a1b137aa562133dc5430cd2fd19e Mon Sep 17 00:00:00 2001 From: Iosif Harutyunov Date: Fri, 22 Jul 2016 23:22:42 +0000 Subject: [PATCH 269/813] ubi: Fix race condition between ubi device creation and udev commit 714fb87e8bc05ff78255afc0dca981e8c5242785 upstream. Install the UBI device object before we arm sysfs. Otherwise udev tries to read sysfs attributes before UBI is ready and udev rules will not match. 
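In outline, the ordering change is (a sketch of the pattern; the full
hunks follow):

	/* publish the device object first ... */
	ubi_devices[ubi_num] = ubi;

	/* ... only then arm sysfs; udev may probe as soon as this returns */
	err = uif_init(ubi, &ref);
	if (err)
		goto out_detach;	/* error path must clear ubi_devices[ubi_num] */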
Signed-off-by: Iosif Harutyunov
[rw: massaged commit message]
Signed-off-by: Richard Weinberger
Signed-off-by: Greg Kroah-Hartman
---
 drivers/mtd/ubi/build.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c
index f3798cc21fe5..27de0463226e 100644
--- a/drivers/mtd/ubi/build.c
+++ b/drivers/mtd/ubi/build.c
@@ -987,6 +987,9 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num,
 		goto out_detach;
 	}

+	/* Make device "available" before it becomes accessible via sysfs */
+	ubi_devices[ubi_num] = ubi;
+
 	err = uif_init(ubi, &ref);
 	if (err)
 		goto out_detach;
@@ -1031,7 +1034,6 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num,
 	wake_up_process(ubi->bgt_thread);
 	spin_unlock(&ubi->wl_lock);

-	ubi_devices[ubi_num] = ubi;
 	ubi_notify_all(ubi, UBI_VOLUME_ADDED, NULL);
 	return ubi_num;

@@ -1042,6 +1044,7 @@ out_uif:
 	ubi_assert(ref);
 	uif_close(ubi);
 out_detach:
+	ubi_devices[ubi_num] = NULL;
 	ubi_wl_close(ubi);
 	ubi_free_internal_volumes(ubi);
 	vfree(ubi->vtbl);

From 862312014cfd0770418760d24f980b2f45095e93 Mon Sep 17 00:00:00 2001
From: Feng Li
Date: Tue, 12 Jul 2016 06:15:44 +0800
Subject: [PATCH 270/813] iscsi-target: Fix panic when adding second TCP connection to iSCSI session

commit 8abc718de6e9e52d8a6bfdb735060554aeae25e4 upstream.

In the MC/S scenario, conn->sess has already been set to NULL in
iscsi_login_non_zero_tsih_s1 by the time the second connection gets
here, and the kernel panics. conn->sess will only be assigned in
iscsi_login_non_zero_tsih_s2, so we must check it for NULL before
dereferencing it.

Signed-off-by: Feng Li
Tested-by: Sumit Rai
Signed-off-by: Nicholas Bellinger
Signed-off-by: Greg Kroah-Hartman
---
 drivers/target/iscsi/iscsi_target_login.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c
index 96e78c823d13..316f66172335 100644
--- a/drivers/target/iscsi/iscsi_target_login.c
+++ b/drivers/target/iscsi/iscsi_target_login.c
@@ -1357,8 +1357,9 @@ static int __iscsi_target_login_thread(struct iscsi_np *np)
 	}
 	login->zero_tsih = zero_tsih;

-	conn->sess->se_sess->sup_prot_ops =
-		conn->conn_transport->iscsit_get_sup_prot_ops(conn);
+	if (conn->sess)
+		conn->sess->se_sess->sup_prot_ops =
+			conn->conn_transport->iscsit_get_sup_prot_ops(conn);

 	tpg = conn->tpg;
 	if (!tpg) {

From 6492c1c5b95658bc070d5d231bc32568b84b49bb Mon Sep 17 00:00:00 2001
From: Nicholas Bellinger
Date: Tue, 17 May 2016 22:19:10 -0700
Subject: [PATCH 271/813] target: Fix ordered task target_setup_cmd_from_cdb exception hang

commit dff0ca9ea7dc8be2181a62df4a722c32ce68ff4a upstream.

If a command with a Simple task attribute is failed due to a Unit
Attention, then a subsequent command with an Ordered task attribute
will hang forever. The reason for this is that the Unit Attention
status is checked for in target_setup_cmd_from_cdb, before the call to
target_execute_cmd, which calls target_handle_task_attr, which in turn
increments dev->simple_cmds.

However, transport_generic_request_failure still calls
transport_complete_task_attr, which will decrement dev->simple_cmds.
In this case, simple_cmds is now -1. So when a command with the
Ordered task attribute is sent, target_handle_task_attr sees that
dev->simple_cmds is not 0, so it decides it can't execute the command
until all the (nonexistent) Simple commands have completed.

Reported-by: Michael Cyr
Tested-by: Michael Cyr
Reported-by: Bryant G. Ly
Tested-by: Bryant G.
Ly Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/target_core_internal.h | 1 + drivers/target/target_core_sbc.c | 2 +- drivers/target/target_core_transport.c | 62 +++++++++++++++----------- include/target/target_core_fabric.h | 1 - 4 files changed, 37 insertions(+), 29 deletions(-) diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h index dae0750c2032..253a91bff943 100644 --- a/drivers/target/target_core_internal.h +++ b/drivers/target/target_core_internal.h @@ -148,6 +148,7 @@ sense_reason_t target_cmd_size_check(struct se_cmd *cmd, unsigned int size); void target_qf_do_work(struct work_struct *work); bool target_check_wce(struct se_device *dev); bool target_check_fua(struct se_device *dev); +void __target_execute_cmd(struct se_cmd *, bool); /* target_core_stat.c */ void target_stat_setup_dev_default_groups(struct se_device *); diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c index 98698d875742..c220bb8dfa9d 100644 --- a/drivers/target/target_core_sbc.c +++ b/drivers/target/target_core_sbc.c @@ -594,7 +594,7 @@ static sense_reason_t compare_and_write_callback(struct se_cmd *cmd, bool succes cmd->transport_state |= CMD_T_ACTIVE|CMD_T_BUSY|CMD_T_SENT; spin_unlock_irq(&cmd->t_state_lock); - __target_execute_cmd(cmd); + __target_execute_cmd(cmd, false); kfree(buf); return ret; diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index d151bc3d6971..7f4cdc82234b 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -1270,23 +1270,6 @@ target_setup_cmd_from_cdb(struct se_cmd *cmd, unsigned char *cdb) trace_target_sequencer_start(cmd); - /* - * Check for an existing UNIT ATTENTION condition - */ - ret = target_scsi3_ua_check(cmd); - if (ret) - return ret; - - ret = target_alua_state_check(cmd); - if (ret) - return ret; - - ret = target_check_reservation(cmd); - if (ret) { - cmd->scsi_status = SAM_STAT_RESERVATION_CONFLICT; - return ret; - } - ret = dev->transport->parse_cdb(cmd); if (ret == TCM_UNSUPPORTED_SCSI_OPCODE) pr_warn_ratelimited("%s/%s: Unsupported SCSI Opcode 0x%02x, sending CHECK_CONDITION.\n", @@ -1749,20 +1732,45 @@ queue_full: } EXPORT_SYMBOL(transport_generic_request_failure); -void __target_execute_cmd(struct se_cmd *cmd) +void __target_execute_cmd(struct se_cmd *cmd, bool do_checks) { sense_reason_t ret; - if (cmd->execute_cmd) { - ret = cmd->execute_cmd(cmd); - if (ret) { - spin_lock_irq(&cmd->t_state_lock); - cmd->transport_state &= ~(CMD_T_BUSY|CMD_T_SENT); - spin_unlock_irq(&cmd->t_state_lock); + if (!cmd->execute_cmd) { + ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + goto err; + } + if (do_checks) { + /* + * Check for an existing UNIT ATTENTION condition after + * target_handle_task_attr() has done SAM task attr + * checking, and possibly have already defered execution + * out to target_restart_delayed_cmds() context. 
+	 */
+	ret = target_scsi3_ua_check(cmd);
+	if (ret)
+		goto err;
-			transport_generic_request_failure(cmd, ret);
+	ret = target_alua_state_check(cmd);
+	if (ret)
+		goto err;
+
+	ret = target_check_reservation(cmd);
+	if (ret) {
+		cmd->scsi_status = SAM_STAT_RESERVATION_CONFLICT;
+		goto err;
 		}
 	}
+
+	ret = cmd->execute_cmd(cmd);
+	if (!ret)
+		return;
+err:
+	spin_lock_irq(&cmd->t_state_lock);
+	cmd->transport_state &= ~(CMD_T_BUSY|CMD_T_SENT);
+	spin_unlock_irq(&cmd->t_state_lock);
+
+	transport_generic_request_failure(cmd, ret);
 }

 static int target_write_prot_action(struct se_cmd *cmd)
@@ -1887,7 +1895,7 @@ void target_execute_cmd(struct se_cmd *cmd)
 		return;
 	}

-	__target_execute_cmd(cmd);
+	__target_execute_cmd(cmd, true);
 }
 EXPORT_SYMBOL(target_execute_cmd);

@@ -1911,7 +1919,7 @@ static void target_restart_delayed_cmds(struct se_device *dev)
 		list_del(&cmd->se_delayed_node);
 		spin_unlock(&dev->delayed_cmd_lock);

-		__target_execute_cmd(cmd);
+		__target_execute_cmd(cmd, true);

 		if (cmd->sam_task_attr == TCM_ORDERED_TAG)
 			break;
diff --git a/include/target/target_core_fabric.h b/include/target/target_core_fabric.h
index 7fb2557a760e..ce9ea736f1d7 100644
--- a/include/target/target_core_fabric.h
+++ b/include/target/target_core_fabric.h
@@ -163,7 +163,6 @@ int core_tmr_alloc_req(struct se_cmd *, void *, u8, gfp_t);
 void core_tmr_release_req(struct se_tmr_req *);
 int transport_generic_handle_tmr(struct se_cmd *);
 void transport_generic_request_failure(struct se_cmd *, sense_reason_t);
-void __target_execute_cmd(struct se_cmd *);
 int transport_lookup_tmr_lun(struct se_cmd *, u64);
 void core_allocate_nexus_loss_ua(struct se_node_acl *acl);

From 60ba156dda2c11ff7a44d78ec64abd21b9813115 Mon Sep 17 00:00:00 2001
From: Nicholas Bellinger
Date: Wed, 25 May 2016 12:25:04 -0700
Subject: [PATCH 272/813] target: Fix missing complete during ABORT_TASK + CMD_T_FABRIC_STOP

commit 5e2c956b8aa24d4f33ff7afef92d409eed164746 upstream.

During transport_generic_free_cmd() with a concurrent TMR ABORT_TASK
and shutdown CMD_T_FABRIC_STOP bit set, the caller will be blocked on
se_cmd->cmd_wait_stop completion until the final kref_put() ->
target_release_cmd_kref() has been invoked to call complete().

However, when ABORT_TASK is completed with FUNCTION_COMPLETE in
core_tmr_abort_task(), the aborted se_cmd will have already been
removed from se_sess->sess_cmd_list via list_del_init(). This results
in target_release_cmd_kref() hitting the legacy list_empty() == true
check, invoking ->release_cmd() but skipping complete() to wake up
se_cmd->cmd_wait_stop blocked earlier in transport_generic_free_cmd()
code.

To address this bug, it's safe to go ahead and drop the original
list_empty() check so that fabric_stop invokes the complete() as
expected, since list_del_init() can safely be used on an empty list.
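A minimal, self-contained sketch of why that last point holds (not
driver code): list_del_init() re-points an already-detached node at
itself, so repeating it is harmless:

	struct list_head node;

	INIT_LIST_HEAD(&node);	/* node.prev = node.next = &node */
	list_del_init(&node);	/* unlinks node from itself, then re-inits */
	list_del_init(&node);	/* still a no-op, however often it runs */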
Cc: Mike Christie
Cc: Quinn Tran
Cc: Himanshu Madhani
Cc: Christoph Hellwig
Cc: Hannes Reinecke
Tested-by: Nicholas Bellinger
Signed-off-by: Nicholas Bellinger
Signed-off-by: Greg Kroah-Hartman
---
 drivers/target/target_core_transport.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 7f4cdc82234b..93fb7c0dfa3c 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -2541,12 +2541,6 @@ static void target_release_cmd_kref(struct kref *kref)
 	bool fabric_stop;

 	spin_lock_irqsave(&se_sess->sess_cmd_lock, flags);
-	if (list_empty(&se_cmd->se_cmd_list)) {
-		spin_unlock_irqrestore(&se_sess->sess_cmd_lock, flags);
-		target_free_cmd_mem(se_cmd);
-		se_cmd->se_tfo->release_cmd(se_cmd);
-		return;
-	}

 	spin_lock(&se_cmd->t_state_lock);
 	fabric_stop = (se_cmd->transport_state & CMD_T_FABRIC_STOP);

From f318588b758514c35f0a9227195178a3b2b4b733 Mon Sep 17 00:00:00 2001
From: Nicholas Bellinger
Date: Thu, 2 Jun 2016 14:56:45 -0700
Subject: [PATCH 273/813] target: Fix race between iscsi-target connection shutdown + ABORT_TASK

commit 064cdd2d91c2805d788876082f31cc63506f22c3 upstream.

This patch fixes a race in iscsit_release_commands_from_conn() ->
iscsit_free_cmd() -> transport_generic_free_cmd() + wait_for_tasks=1,
where CMD_T_FABRIC_STOP could end up being set after the final
kref_put() is called from core_tmr_abort_task() context.

This results in transport_generic_free_cmd() blocking indefinitely on
se_cmd->cmd_wait_comp, because the target_release_cmd_kref() check for
CMD_T_FABRIC_STOP returns false.

To address this bug, make iscsit_release_commands_from_conn() do
list_splice and set CMD_T_FABRIC_STOP early while holding
iscsi_conn->cmd_lock. Also make iscsit_aborted_task() only remove
iscsi_cmd_t if CMD_T_FABRIC_STOP has not already been set. Finally in
target_release_cmd_kref(), only honor fabric_stop if CMD_T_ABORTED has
been set.

Cc: Mike Christie
Cc: Quinn Tran
Cc: Himanshu Madhani
Cc: Christoph Hellwig
Cc: Hannes Reinecke
Tested-by: Nicholas Bellinger
Signed-off-by: Nicholas Bellinger
Signed-off-by: Greg Kroah-Hartman
---
 drivers/target/iscsi/iscsi_target.c | 28 +++++++++++++++++---------
 drivers/target/target_core_transport.c | 3 ++-
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c
index 72204fbf2bb1..bd810c109277 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -492,7 +492,8 @@ static void iscsit_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
 	bool scsi_cmd = (cmd->iscsi_opcode == ISCSI_OP_SCSI_CMD);

 	spin_lock_bh(&conn->cmd_lock);
-	if (!list_empty(&cmd->i_conn_node))
+	if (!list_empty(&cmd->i_conn_node) &&
+	    !(cmd->se_cmd.transport_state & CMD_T_FABRIC_STOP))
 		list_del_init(&cmd->i_conn_node);
 	spin_unlock_bh(&conn->cmd_lock);

@@ -4194,6 +4195,7 @@ transport_err:

 static void iscsit_release_commands_from_conn(struct iscsi_conn *conn)
 {
+	LIST_HEAD(tmp_list);
 	struct iscsi_cmd *cmd = NULL, *cmd_tmp = NULL;
 	struct iscsi_session *sess = conn->sess;
 	/*
@@ -4202,18 +4204,26 @@ static void iscsit_release_commands_from_conn(struct iscsi_conn *conn)
 	 * has been reset -> returned sleeping pre-handler state.
 	 */
 	spin_lock_bh(&conn->cmd_lock);
-	list_for_each_entry_safe(cmd, cmd_tmp, &conn->conn_cmd_list, i_conn_node) {
+	list_splice_init(&conn->conn_cmd_list, &tmp_list);

-		list_del_init(&cmd->i_conn_node);
-		spin_unlock_bh(&conn->cmd_lock);
+	list_for_each_entry(cmd, &tmp_list, i_conn_node) {
+		struct se_cmd *se_cmd = &cmd->se_cmd;

-		iscsit_increment_maxcmdsn(cmd, sess);
-
-		iscsit_free_cmd(cmd, true);
-
-		spin_lock_bh(&conn->cmd_lock);
+		if (se_cmd->se_tfo != NULL) {
+			spin_lock(&se_cmd->t_state_lock);
+			se_cmd->transport_state |= CMD_T_FABRIC_STOP;
+			spin_unlock(&se_cmd->t_state_lock);
+		}
 	}
 	spin_unlock_bh(&conn->cmd_lock);
+
+	list_for_each_entry_safe(cmd, cmd_tmp, &tmp_list, i_conn_node) {
+		list_del_init(&cmd->i_conn_node);
+
+		iscsit_increment_maxcmdsn(cmd, sess);
+		iscsit_free_cmd(cmd, true);
+
+	}
 }

 static void iscsit_stop_timers_for_cmds(
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 93fb7c0dfa3c..8bd7bf6cd986 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -2543,7 +2543,8 @@ static void target_release_cmd_kref(struct kref *kref)

 	spin_lock_irqsave(&se_sess->sess_cmd_lock, flags);
 	spin_lock(&se_cmd->t_state_lock);
-	fabric_stop = (se_cmd->transport_state & CMD_T_FABRIC_STOP);
+	fabric_stop = (se_cmd->transport_state & CMD_T_FABRIC_STOP) &&
+		      (se_cmd->transport_state & CMD_T_ABORTED);
 	spin_unlock(&se_cmd->t_state_lock);

 	if (se_cmd->cmd_wait_set || fabric_stop) {

From 51d841908029ff6b892a93e4df8175162ca8dcc8 Mon Sep 17 00:00:00 2001
From: Mike Christie
Date: Thu, 2 Jun 2016 20:12:37 -0500
Subject: [PATCH 274/813] target: Fix max_unmap_lba_count calc overflow

commit ea263c7fada4af8ec7fe5fcfd6e7d7705a89351b upstream.

max_discard_sectors is only 32 bits, and some non-SCSI backend devices
will set it to the max of 0xffffffff, so we can end up overflowing
during the max_unmap_lba_count calculation.

This fixes a regression caused by my patch:

  commit 8a9ebe717a133ba7bc90b06047f43cc6b8bcb8b3
  Author: Mike Christie
  Date: Mon Jan 18 14:09:27 2016 -0600

      target: Fix WRITE_SAME/DISCARD conversion to linux 512b sectors

which can result in extra discards being sent due to the overflow,
causing max_unmap_lba_count to be smaller than what the backing device
can actually support.

Signed-off-by: Mike Christie
Reviewed-by: Bart Van Assche
Signed-off-by: Nicholas Bellinger
Signed-off-by: Greg Kroah-Hartman
---
 drivers/target/target_core_device.c | 8 +++++---
 drivers/target/target_core_file.c | 3 +--
 drivers/target/target_core_iblock.c | 3 +--
 include/target/target_core_backend.h | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
index 3436a83568ea..dcd5ed26eb18 100644
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -832,13 +832,15 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
  * in ATA and we need to set TPE=1
  */
 bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib,
-				       struct request_queue *q, int block_size)
+				       struct request_queue *q)
 {
+	int block_size = queue_logical_block_size(q);
+
 	if (!blk_queue_discard(q))
 		return false;

-	attrib->max_unmap_lba_count = (q->limits.max_discard_sectors << 9) /
-				      block_size;
+	attrib->max_unmap_lba_count =
+		q->limits.max_discard_sectors >> (ilog2(block_size) - 9);
 	/*
 	 * Currently hardcoded to 1 in Linux/SCSI code..
 	 */
diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c
index 75f0f08b2a34..79291869bce6 100644
--- a/drivers/target/target_core_file.c
+++ b/drivers/target/target_core_file.c
@@ -161,8 +161,7 @@ static int fd_configure_device(struct se_device *dev)
 		dev_size, div_u64(dev_size, fd_dev->fd_block_size),
 		fd_dev->fd_block_size);

-	if (target_configure_unmap_from_queue(&dev->dev_attrib, q,
-					      fd_dev->fd_block_size))
+	if (target_configure_unmap_from_queue(&dev->dev_attrib, q))
 		pr_debug("IFILE: BLOCK Discard support available,"
 			 " disabled by default\n");
 	/*
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index 2c53dcefff3e..4620c1dcdbc7 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -121,8 +121,7 @@ static int iblock_configure_device(struct se_device *dev)
 	dev->dev_attrib.hw_max_sectors = queue_max_hw_sectors(q);
 	dev->dev_attrib.hw_queue_depth = q->nr_requests;

-	if (target_configure_unmap_from_queue(&dev->dev_attrib, q,
-					      dev->dev_attrib.hw_block_size))
+	if (target_configure_unmap_from_queue(&dev->dev_attrib, q))
 		pr_debug("IBLOCK: BLOCK Discard support available,"
 			 " disabled by default\n");

diff --git a/include/target/target_core_backend.h b/include/target/target_core_backend.h
index 28ee5c2e6bcd..711322a8ee35 100644
--- a/include/target/target_core_backend.h
+++ b/include/target/target_core_backend.h
@@ -96,6 +96,6 @@ sense_reason_t passthrough_parse_cdb(struct se_cmd *cmd,
 bool target_sense_desc_format(struct se_device *dev);
 sector_t target_to_linux_sector(struct se_device *dev, sector_t lb);
 bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib,
-				       struct request_queue *q, int block_size);
+				       struct request_queue *q);

 #endif /* TARGET_CORE_BACKEND_H */

From f5ba9a6e48bfb2b00a912a648b69063501637ed3 Mon Sep 17 00:00:00 2001
From: Nicholas Bellinger
Date: Mon, 13 Jun 2016 22:58:09 -0700
Subject: [PATCH 275/813] target: Fix ordered task CHECK_CONDITION early exception handling

commit 410c29dfbfdf73d0d0b5d14a21868ab038eca703 upstream.

If a command with a Simple task attribute fails early,
target_setup_cmd_from_cdb returns TCM_UNSUPPORTED_SCSI_OPCODE or
TCM_INVALID_CDB_FIELD. So in the cases where target_setup_cmd_from_cdb
returns an error, we never get far enough to call target_execute_cmd
to increment simple_cmds.

Since simple_cmds isn't incremented, the result of the failure from
target_setup_cmd_from_cdb causes transport_generic_request_failure to
decrement simple_cmds, due to the call to transport_complete_task_attr.
With this, dev->simple_cmds or dev->dev_ordered_sync is now -1, not 0.
So when a subsequent command with an Ordered Task is sent, it causes a
hang, since dev->simple_cmds is at -1.

Tested-by: Bryant G. Ly
Signed-off-by: Bryant G. Ly
Tested-by: Michael Cyr
Signed-off-by: Michael Cyr
Signed-off-by: Nicholas Bellinger
Signed-off-by: Greg Kroah-Hartman
---
 drivers/target/target_core_transport.c | 7 ++++++-
 include/target/target_core_base.h | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 8bd7bf6cd986..7bc3778a1ac9 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -1815,6 +1815,8 @@ static bool target_handle_task_attr(struct se_cmd *cmd)
 	if (dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH)
 		return false;

+	cmd->se_cmd_flags |= SCF_TASK_ATTR_SET;
+
 	/*
 	 * Check for the existence of HEAD_OF_QUEUE, and if true return 1
 	 * to allow the passed struct se_cmd list of tasks to the front of the list.
@@ -1937,6 +1939,9 @@ static void transport_complete_task_attr(struct se_cmd *cmd)
 	if (dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH)
 		return;

+	if (!(cmd->se_cmd_flags & SCF_TASK_ATTR_SET))
+		goto restart;
+
 	if (cmd->sam_task_attr == TCM_SIMPLE_TAG) {
 		atomic_dec_mb(&dev->simple_cmds);
 		dev->dev_cur_ordered_id++;
@@ -1953,7 +1958,7 @@ static void transport_complete_task_attr(struct se_cmd *cmd)
 		pr_debug("Incremented dev_cur_ordered_id: %u for ORDERED\n",
 			 dev->dev_cur_ordered_id);
 	}
-
+restart:
 	target_restart_delayed_cmds(dev);
 }

diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index 689f4d207122..59081c73b296 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -139,6 +139,7 @@ enum se_cmd_flags_table {
 	SCF_COMPARE_AND_WRITE_POST = 0x00100000,
 	SCF_PASSTHROUGH_PROT_SG_TO_MEM_NOALLOC = 0x00200000,
 	SCF_ACK_KREF = 0x00400000,
+	SCF_TASK_ATTR_SET = 0x01000000,
 };

 /* struct se_dev_entry->lun_flags and struct se_lun->lun_access */

From 7484fbecff63dd81e396565cdbe28ba598219fdd Mon Sep 17 00:00:00 2001
From: KT Liao
Date: Wed, 13 Jul 2016 11:12:12 -0700
Subject: [PATCH 276/813] Input: elan_i2c - properly wake up touchpad on ASUS laptops

commit 2de4fcc64685def3e586856a2dc636df44532395 upstream.

Some ASUS laptops were shipped with touchpads that need to be woken up
first, before trying to switch them into absolute reporting mode;
otherwise the touchpad fails to work while flooding the logs with:

	elan_i2c i2c-ELAN1000:00: invalid report id data (1)

Among affected devices are Asus E202SA, N552VW, X456UF, UX305CA, and
others. We detect such devices by checking the IC type and product ID
numbers and adjusting the order of operations accordingly.

Signed-off-by: KT Liao
Reported-by: Chris Chiu
Reported-by: Vlad Glagolev
Tested-by: Vlad Glagolev
Signed-off-by: Dmitry Torokhov
Signed-off-by: Greg Kroah-Hartman
---
 drivers/input/mouse/elan_i2c_core.c | 79 +++++++++++++++++++++++------
 1 file changed, 63 insertions(+), 16 deletions(-)

diff --git a/drivers/input/mouse/elan_i2c_core.c b/drivers/input/mouse/elan_i2c_core.c
index 2f589857a039..d15b33813021 100644
--- a/drivers/input/mouse/elan_i2c_core.c
+++ b/drivers/input/mouse/elan_i2c_core.c
@@ -4,7 +4,8 @@
  * Copyright (c) 2013 ELAN Microelectronics Corp.
  *
  * Author: 林政維 (Duson Lin)
- * Version: 1.6.0
+ * Author: KT Liao
+ * Version: 1.6.2
 *
 * Based on cyapa driver:
 * copyright (c) 2011-2012 Cypress Semiconductor, Inc.
@@ -40,7 +41,7 @@ #include "elan_i2c.h" #define DRIVER_NAME "elan_i2c" -#define ELAN_DRIVER_VERSION "1.6.1" +#define ELAN_DRIVER_VERSION "1.6.2" #define ELAN_VENDOR_ID 0x04f3 #define ETP_MAX_PRESSURE 255 #define ETP_FWIDTH_REDUCE 90 @@ -199,9 +200,41 @@ static int elan_sleep(struct elan_tp_data *data) return error; } +static int elan_query_product(struct elan_tp_data *data) +{ + int error; + + error = data->ops->get_product_id(data->client, &data->product_id); + if (error) + return error; + + error = data->ops->get_sm_version(data->client, &data->ic_type, + &data->sm_version); + if (error) + return error; + + return 0; +} + +static int elan_check_ASUS_special_fw(struct elan_tp_data *data) +{ + if (data->ic_type != 0x0E) + return false; + + switch (data->product_id) { + case 0x05 ... 0x07: + case 0x09: + case 0x13: + return true; + default: + return false; + } +} + static int __elan_initialize(struct elan_tp_data *data) { struct i2c_client *client = data->client; + bool woken_up = false; int error; error = data->ops->initialize(client); @@ -210,6 +243,27 @@ static int __elan_initialize(struct elan_tp_data *data) return error; } + error = elan_query_product(data); + if (error) + return error; + + /* + * Some ASUS devices were shipped with firmware that requires + * touchpads to be woken up first, before attempting to switch + * them into absolute reporting mode. + */ + if (elan_check_ASUS_special_fw(data)) { + error = data->ops->sleep_control(client, false); + if (error) { + dev_err(&client->dev, + "failed to wake device up: %d\n", error); + return error; + } + + msleep(200); + woken_up = true; + } + data->mode |= ETP_ENABLE_ABS; error = data->ops->set_mode(client, data->mode); if (error) { @@ -218,11 +272,13 @@ static int __elan_initialize(struct elan_tp_data *data) return error; } - error = data->ops->sleep_control(client, false); - if (error) { - dev_err(&client->dev, - "failed to wake device up: %d\n", error); - return error; + if (!woken_up) { + error = data->ops->sleep_control(client, false); + if (error) { + dev_err(&client->dev, + "failed to wake device up: %d\n", error); + return error; + } } return 0; @@ -248,10 +304,6 @@ static int elan_query_device_info(struct elan_tp_data *data) { int error; - error = data->ops->get_product_id(data->client, &data->product_id); - if (error) - return error; - error = data->ops->get_version(data->client, false, &data->fw_version); if (error) return error; @@ -261,11 +313,6 @@ static int elan_query_device_info(struct elan_tp_data *data) if (error) return error; - error = data->ops->get_sm_version(data->client, &data->ic_type, - &data->sm_version); - if (error) - return error; - error = data->ops->get_version(data->client, true, &data->iap_version); if (error) return error; From 7bda3b121a7f44f34b0470c1ac3496a78769d019 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 24 Jun 2016 10:55:44 -0400 Subject: [PATCH 277/813] SUNRPC: Don't allocate a full sockaddr_storage for tracing commit db1bb44c4c7e8d49ed674dc59e5222d99c698088 upstream. We're always tracing IPv4 or IPv6 addresses, so we can save a lot of space on the ringbuffer by allocating the correct sockaddr size. Signed-off-by: Trond Myklebust Fixes: 83a712e0afef "sunrpc: add some tracepoints around ..." Signed-off-by: J. 
Bruce Fields Signed-off-by: Greg Kroah-Hartman --- include/trace/events/sunrpc.h | 47 +++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 003dca933803..5664ca07c9c7 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -529,20 +529,27 @@ TRACE_EVENT(svc_xprt_do_enqueue, TP_STRUCT__entry( __field(struct svc_xprt *, xprt) - __field_struct(struct sockaddr_storage, ss) __field(int, pid) __field(unsigned long, flags) + __dynamic_array(unsigned char, addr, xprt != NULL ? + xprt->xpt_remotelen : 0) ), TP_fast_assign( __entry->xprt = xprt; - xprt ? memcpy(&__entry->ss, &xprt->xpt_remote, sizeof(__entry->ss)) : memset(&__entry->ss, 0, sizeof(__entry->ss)); __entry->pid = rqst? rqst->rq_task->pid : 0; - __entry->flags = xprt ? xprt->xpt_flags : 0; + if (xprt) { + memcpy(__get_dynamic_array(addr), + &xprt->xpt_remote, + xprt->xpt_remotelen); + __entry->flags = xprt->xpt_flags; + } else + __entry->flags = 0; ), TP_printk("xprt=0x%p addr=%pIScp pid=%d flags=%s", __entry->xprt, - (struct sockaddr *)&__entry->ss, + __get_dynamic_array_len(addr) != 0 ? + (struct sockaddr *)__get_dynamic_array(addr) : NULL, __entry->pid, show_svc_xprt_flags(__entry->flags)) ); @@ -553,18 +560,25 @@ TRACE_EVENT(svc_xprt_dequeue, TP_STRUCT__entry( __field(struct svc_xprt *, xprt) - __field_struct(struct sockaddr_storage, ss) __field(unsigned long, flags) + __dynamic_array(unsigned char, addr, xprt != NULL ? + xprt->xpt_remotelen : 0) ), TP_fast_assign( - __entry->xprt = xprt, - xprt ? memcpy(&__entry->ss, &xprt->xpt_remote, sizeof(__entry->ss)) : memset(&__entry->ss, 0, sizeof(__entry->ss)); - __entry->flags = xprt ? xprt->xpt_flags : 0; + __entry->xprt = xprt; + if (xprt) { + memcpy(__get_dynamic_array(addr), + &xprt->xpt_remote, + xprt->xpt_remotelen); + __entry->flags = xprt->xpt_flags; + } else + __entry->flags = 0; ), TP_printk("xprt=0x%p addr=%pIScp flags=%s", __entry->xprt, - (struct sockaddr *)&__entry->ss, + __get_dynamic_array_len(addr) != 0 ? + (struct sockaddr *)__get_dynamic_array(addr) : NULL, show_svc_xprt_flags(__entry->flags)) ); @@ -592,19 +606,26 @@ TRACE_EVENT(svc_handle_xprt, TP_STRUCT__entry( __field(struct svc_xprt *, xprt) __field(int, len) - __field_struct(struct sockaddr_storage, ss) __field(unsigned long, flags) + __dynamic_array(unsigned char, addr, xprt != NULL ? + xprt->xpt_remotelen : 0) ), TP_fast_assign( __entry->xprt = xprt; - xprt ? memcpy(&__entry->ss, &xprt->xpt_remote, sizeof(__entry->ss)) : memset(&__entry->ss, 0, sizeof(__entry->ss)); __entry->len = len; - __entry->flags = xprt ? xprt->xpt_flags : 0; + if (xprt) { + memcpy(__get_dynamic_array(addr), + &xprt->xpt_remote, + xprt->xpt_remotelen); + __entry->flags = xprt->xpt_flags; + } else + __entry->flags = 0; ), TP_printk("xprt=0x%p addr=%pIScp len=%d flags=%s", __entry->xprt, - (struct sockaddr *)&__entry->ss, + __get_dynamic_array_len(addr) != 0 ? + (struct sockaddr *)__get_dynamic_array(addr) : NULL, __entry->len, show_svc_xprt_flags(__entry->flags)) ); #endif /* _TRACE_SUNRPC_H */ From e86d99f90b6fe8beb72f024e99c107fd42b65235 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Tue, 14 Jun 2016 14:59:38 +0100 Subject: [PATCH 278/813] MIPS: mm: Fix definition of R6 cache instruction commit 4f53989b0652ffe2605221c81ca8ffcfc90aed2a upstream. Commit a168b8f1cde6 ("MIPS: mm: Add MIPS R6 instruction encodings") added an incorrect definition of the redefined MIPSr6 cache instruction. 
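For context, a sketch of the encoding difference (bit positions as in
the uasm M6() macro; MIPSr6 moved CACHE under the SPECIAL3 major opcode,
and the pre-R6 CACHE major opcode is no longer valid on R6 cores):

	/* pre-R6: CACHE (0x2f) | base | op | simm16                     */
	/* MIPSr6: SPECIAL3 (0x1f) | base | op | simm9 << 7 | 0x25 func  */
	insn = (spec3_op << 26) | (base << 21) | (op << 16) |
	       ((offset & 0x1ff) << 7) | cache6_op;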
Executing any kernel code that includes this instruction results in a
reserved instruction exception and kernel panic. Fix the instruction
definition.

Fixes: a168b8f1cde6588ff7a67699fa11e01bc77a5ddd
Signed-off-by: Matt Redfearn
Cc: linux-mips@linux-mips.org
Cc: linux-kernel@vger.kernel.org
Patchwork: https://patchwork.linux-mips.org/patch/13663/
Signed-off-by: Ralf Baechle
Signed-off-by: Greg Kroah-Hartman
---
 arch/mips/mm/uasm-mips.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c
index b4a837893562..5abe51cad899 100644
--- a/arch/mips/mm/uasm-mips.c
+++ b/arch/mips/mm/uasm-mips.c
@@ -65,7 +65,7 @@ static struct insn insn_table[] = {
 #ifndef CONFIG_CPU_MIPSR6
 	{ insn_cache,  M(cache_op, 0, 0, 0, 0, 0),  RS | RT | SIMM },
 #else
-	{ insn_cache,  M6(cache_op, 0, 0, 0, cache6_op),  RS | RT | SIMM9 },
+	{ insn_cache,  M6(spec3_op, 0, 0, 0, cache6_op),  RS | RT | SIMM9 },
 #endif
 	{ insn_daddiu, M(daddiu_op, 0, 0, 0, 0, 0), RS | RT | SIMM },
 	{ insn_daddu, M(spec_op, 0, 0, 0, 0, daddu_op), RS | RT | RD },

From 0b37e9799616c34e2cd9eb8c9174e952e2825b64 Mon Sep 17 00:00:00 2001
From: Huacai Chen
Date: Fri, 22 Jul 2016 11:46:31 +0800
Subject: [PATCH 279/813] MIPS: Don't register r4k sched clock when CPUFREQ enabled

commit 07d69579e7fec27e371296d8ca9d6076fc401b5c upstream.

Don't register the r4k sched clock when CPUFREQ is enabled, because
sched clock needs a constant frequency.

Signed-off-by: Huacai Chen
Cc: John Crispin
Cc: Steven J . Hill
Cc: Fuxin Zhang
Cc: Zhangjin Wu
Cc: linux-mips@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/13820/
Signed-off-by: Ralf Baechle
Signed-off-by: Greg Kroah-Hartman
---
 arch/mips/kernel/csrc-r4k.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/mips/kernel/csrc-r4k.c b/arch/mips/kernel/csrc-r4k.c
index 1f910563fdf6..d76275da54cb 100644
--- a/arch/mips/kernel/csrc-r4k.c
+++ b/arch/mips/kernel/csrc-r4k.c
@@ -23,7 +23,7 @@ static struct clocksource clocksource_mips = {
 	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
 };

-static u64 notrace r4k_read_sched_clock(void)
+static u64 __maybe_unused notrace r4k_read_sched_clock(void)
 {
 	return read_c0_count();
 }
@@ -82,7 +82,9 @@ int __init init_r4k_clocksource(void)

 	clocksource_register_hz(&clocksource_mips, mips_hpt_frequency);

+#ifndef CONFIG_CPU_FREQ
 	sched_clock_register(r4k_read_sched_clock, 32, mips_hpt_frequency);
+#endif

 	return 0;
 }

From 23e0fce7b2e31a9672137e0887d521068f467b04 Mon Sep 17 00:00:00 2001
From: Huacai Chen
Date: Thu, 21 Jul 2016 14:27:51 +0800
Subject: [PATCH 280/813] MIPS: hpet: Increase HPET_MIN_PROG_DELTA and decrease HPET_MIN_CYCLES

commit 3ef06653987d4c4536b408321edf0e5caa2a317f upstream.

At first, we prefer to use the mips clockevent device, so we decrease
the rating of the hpet clockevent device. For hpet, if
HPET_MIN_PROG_DELTA (the minimum delta of hpet programming) is too
small and HPET_MIN_CYCLES (the threshold of the -ETIME check) is too
large, then hpet_next_event() can easily return -ETIME. After commit
c6eb3f70d44828 ("hrtimer: Get rid of hrtimer softirq") this will cause
an RCU stall.

So, HPET_MIN_PROG_DELTA must be sufficient that we don't re-trip the
-ETIME check -- if we do, we will return -ETIME, forward the next event
time, try to set it, return -ETIME again, and basically lock the system
up. Meanwhile, HPET_MIN_CYCLES doesn't need to be too large, 16 cycles
is enough.

This solution is similar to commit f9eccf24615672
("clocksource/drivers/vt8500: Increase the minimum delta").
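The lockup mode, in a simplified sketch (the shape of the retry loop the
clockevents core performs, not its actual code):

	for (;;) {
		cycles = delta_to_cycles(max(delta, min_delta));
		if (hpet_next_event(cycles, evt) == 0)
			break;		/* programmed successfully */
		/* -ETIME: push the expiry further out and retry; if the
		 * device keeps re-tripping its -ETIME check, this loop
		 * never makes progress and RCU eventually stalls */
		delta += min_delta;
	}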
By the way, this patch ensures hpet count/compare to be 32-bit long. Signed-off-by: Huacai Chen Cc: John Crispin Cc: Steven J . Hill Cc: Fuxin Zhang Cc: Zhangjin Wu Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/13819/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/loongson64/loongson-3/hpet.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/mips/loongson64/loongson-3/hpet.c b/arch/mips/loongson64/loongson-3/hpet.c index a2631a52ca99..444802e78554 100644 --- a/arch/mips/loongson64/loongson-3/hpet.c +++ b/arch/mips/loongson64/loongson-3/hpet.c @@ -13,8 +13,8 @@ #define SMBUS_PCI_REG64 0x64 #define SMBUS_PCI_REGB4 0xb4 -#define HPET_MIN_CYCLES 64 -#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) +#define HPET_MIN_CYCLES 16 +#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES * 12) static DEFINE_SPINLOCK(hpet_lock); DEFINE_PER_CPU(struct clock_event_device, hpet_clockevent_device); @@ -157,14 +157,14 @@ static int hpet_tick_resume(struct clock_event_device *evt) static int hpet_next_event(unsigned long delta, struct clock_event_device *evt) { - unsigned int cnt; - int res; + u32 cnt; + s32 res; cnt = hpet_read(HPET_COUNTER); - cnt += delta; + cnt += (u32) delta; hpet_write(HPET_T0_CMP, cnt); - res = (int)(cnt - hpet_read(HPET_COUNTER)); + res = (s32)(cnt - hpet_read(HPET_COUNTER)); return res < HPET_MIN_CYCLES ? -ETIME : 0; } @@ -230,7 +230,7 @@ void __init setup_hpet_timer(void) cd = &per_cpu(hpet_clockevent_device, cpu); cd->name = "hpet"; - cd->rating = 320; + cd->rating = 100; cd->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT; cd->set_state_shutdown = hpet_set_state_shutdown; cd->set_state_periodic = hpet_set_state_periodic; From a57a55b50558b6134a6465e61655f3d67bfa395c Mon Sep 17 00:00:00 2001 From: Chris Blake Date: Mon, 30 May 2016 07:26:37 -0500 Subject: [PATCH 281/813] PCI: Mark Atheros AR9485 and QCA9882 to avoid bus reset commit 9ac0108c2bac3f1d0255f64fb89fc27e71131b24 upstream. Similar to the AR93xx series, the AR94xx and the Qualcomm QCA988x also have the same quirk for the Bus Reset. Fixes: c3e59ee4e766 ("PCI: Mark Atheros AR93xx to avoid bus reset") Signed-off-by: Chris Blake Signed-off-by: Bjorn Helgaas Signed-off-by: Greg Kroah-Hartman --- drivers/pci/quirks.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 7e327309cf69..3c4752a288e2 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3115,13 +3115,15 @@ static void quirk_no_bus_reset(struct pci_dev *dev) } /* - * Atheros AR93xx chips do not behave after a bus reset. The device will - * throw a Link Down error on AER-capable systems and regardless of AER, - * config space of the device is never accessible again and typically - * causes the system to hang or reset when access is attempted. + * Some Atheros AR9xxx and QCA988x chips do not behave after a bus reset. + * The device will throw a Link Down error on AER-capable systems and + * regardless of AER, config space of the device is never accessible again + * and typically causes the system to hang or reset when access is attempted. 
 * http://www.spinics.net/lists/linux-pci/msg34797.html
 */
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0030, quirk_no_bus_reset);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0032, quirk_no_bus_reset);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x003c, quirk_no_bus_reset);

 static void quirk_no_pm_reset(struct pci_dev *dev)
 {

From 32b04db4f2565382a3cded17290068a4691880a4 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Mon, 13 Jun 2016 21:28:00 +0300
Subject: [PATCH 282/813] x86/platform/intel_mid_pci: Rework IRQ0 workaround

commit bb27570525a71f48347ed0e0c265063e7952bb61 upstream.

On the Intel Merrifield platform several PCI devices have a bogus
configuration, i.e. IRQ0 has been assigned to a few of them. These are
the PCI root bridge, eMMC0, the HS UART common registers, PWM, and
HDMI. The actual interrupt line can be allocated to one device
exclusively, in our case to eMMC0; the rest should cope without it,
and the known drivers for them do not use the interrupt line at all.

Rework the IRQ0 workaround, which was previously done to avoid a
conflict between eMMC0 and the HS UART common registers, to behave
differently based on the device in question, i.e. allocate the
interrupt line to eMMC0, but silently skip interrupt allocation for
the rest, except the HS UART common registers which are not used
anyway. With this rework the IOSF MBI driver in particular can be
used.

Signed-off-by: Andy Shevchenko
Acked-by: Thomas Gleixner
Cc: Bjorn Helgaas
Cc: Linus Torvalds
Cc: Peter Zijlstra
Fixes: 39d9b77b8deb ("x86/pci/intel_mid_pci: Work around for IRQ0 assignment")
Link: http://lkml.kernel.org/r/1465842481-136852-1-git-send-email-andriy.shevchenko@linux.intel.com
Signed-off-by: Ingo Molnar
Signed-off-by: Greg Kroah-Hartman
---
 arch/x86/pci/intel_mid_pci.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index 8b93e634af84..ae97f24a4371 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -37,6 +37,7 @@

 /* Quirks for the listed devices */
 #define PCI_DEVICE_ID_INTEL_MRFL_MMC	0x1190
+#define PCI_DEVICE_ID_INTEL_MRFL_HSU	0x1191

 /* Fixed BAR fields */
 #define PCIE_VNDR_CAP_ID_FIXED_BAR 0x00	/* Fixed BAR (TBD) */
@@ -224,14 +225,21 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)

 		/* Special treatment for IRQ0 */
 		if (dev->irq == 0) {
+			/*
+			 * Skip HS UART common registers device since it has
+			 * IRQ0 assigned and not used by the kernel.
+			 */
+			if (dev->device == PCI_DEVICE_ID_INTEL_MRFL_HSU)
+				return -EBUSY;
 			/*
 			 * TNG has IRQ0 assigned to eMMC controller. But there
 			 * are also other devices with bogus PCI configuration
 			 * that have IRQ0 assigned. This check ensures that
-			 * eMMC gets it.
+			 * eMMC gets it. The rest of devices still could be
+			 * enabled without interrupt line being allocated.
 			 */
 			if (dev->device != PCI_DEVICE_ID_INTEL_MRFL_MMC)
-				return -EBUSY;
+				return 0;
 		}
 		break;
 	default:

From 6b1f21a1112e96e419b075406e54dc915b4eade6 Mon Sep 17 00:00:00 2001
From: Lv Zheng
Date: Wed, 3 Aug 2016 09:00:14 +0800
Subject: [PATCH 283/813] ACPI / EC: Work around method reentrancy limit in ACPICA for _Qxx

commit e1191bd4f62d9086a1a47adc286e7fcffc1fa55c upstream.

A regression is caused by the following commit:

  Commit: 02b771b64b73226052d6e731a0987db3b47281e9
  Subject: ACPI / EC: Fix an issue caused by the serialized _Qxx evaluations

In this commit, using the system workqueue means that the number of
parallel _Qxx executions can exceed 255.
This violates the method reentrancy limit in ACPICA and generates the
following error log:

  ACPI Error: Method reached maximum reentrancy limit (255) (20150818/dsmethod-341)

This patch creates a separate workqueue and limits the number of
parallel _Qxx evaluations to a configurable value (which can be tuned
against the number of online CPUs). Since EC events are handled after
driver probe, we can create the workqueue in acpi_ec_init().

Fixes: 02b771b64b73 (ACPI / EC: Fix an issue caused by the serialized _Qxx evaluations)
Link: https://bugzilla.kernel.org/show_bug.cgi?id=135691
Reported-and-tested-by: Helen Buus
Signed-off-by: Lv Zheng
Signed-off-by: Rafael J. Wysocki
Signed-off-by: Greg Kroah-Hartman
---
 drivers/acpi/ec.c | 41 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index b420fb46669d..43f20328f830 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -101,6 +101,7 @@ enum ec_command {
 #define ACPI_EC_UDELAY_POLL	550	/* Wait 1ms for EC transaction polling */
 #define ACPI_EC_CLEAR_MAX	100	/* Maximum number of events to query
 					 * when trying to clear the EC */
+#define ACPI_EC_MAX_QUERIES	16	/* Maximum number of parallel queries */

 enum {
 	EC_FLAGS_QUERY_PENDING,		/* Query is pending */
@@ -121,6 +122,10 @@ static unsigned int ec_delay __read_mostly = ACPI_EC_DELAY;
 module_param(ec_delay, uint, 0644);
 MODULE_PARM_DESC(ec_delay, "Timeout(ms) waited until an EC command completes");

+static unsigned int ec_max_queries __read_mostly = ACPI_EC_MAX_QUERIES;
+module_param(ec_max_queries, uint, 0644);
+MODULE_PARM_DESC(ec_max_queries, "Maximum parallel _Qxx evaluations");
+
 static bool ec_busy_polling __read_mostly;
 module_param(ec_busy_polling, bool, 0644);
 MODULE_PARM_DESC(ec_busy_polling, "Use busy polling to advance EC transaction");
@@ -174,6 +179,7 @@ static void acpi_ec_event_processor(struct work_struct *work);

 struct acpi_ec *boot_ec, *first_ec;
 EXPORT_SYMBOL(first_ec);
+static struct workqueue_struct *ec_query_wq;

 static int EC_FLAGS_VALIDATE_ECDT; /* ASUStec ECDTs need to be validated */
 static int EC_FLAGS_SKIP_DSDT_SCAN; /* Not all BIOS survive early DSDT scan */
@@ -1097,7 +1103,7 @@ static int acpi_ec_query(struct acpi_ec *ec, u8 *data)
 	 * work queue execution.
*/ ec_dbg_evt("Query(0x%02x) scheduled", value); - if (!schedule_work(&q->work)) { + if (!queue_work(ec_query_wq, &q->work)) { ec_dbg_evt("Query(0x%02x) overlapped", value); result = -EBUSY; } @@ -1657,15 +1663,41 @@ static struct acpi_driver acpi_ec_driver = { }, }; +static inline int acpi_ec_query_init(void) +{ + if (!ec_query_wq) { + ec_query_wq = alloc_workqueue("kec_query", 0, + ec_max_queries); + if (!ec_query_wq) + return -ENODEV; + } + return 0; +} + +static inline void acpi_ec_query_exit(void) +{ + if (ec_query_wq) { + destroy_workqueue(ec_query_wq); + ec_query_wq = NULL; + } +} + int __init acpi_ec_init(void) { - int result = 0; + int result; + /* register workqueue for _Qxx evaluations */ + result = acpi_ec_query_init(); + if (result) + goto err_exit; /* Now register the driver for the EC */ result = acpi_bus_register_driver(&acpi_ec_driver); - if (result < 0) - return -ENODEV; + if (result) + goto err_exit; +err_exit: + if (result) + acpi_ec_query_exit(); return result; } @@ -1675,5 +1707,6 @@ static void __exit acpi_ec_exit(void) { acpi_bus_unregister_driver(&acpi_ec_driver); + acpi_ec_query_exit(); } #endif /* 0 */ From 74d55e5d96aaecbff198b3f7bcdc10c1c865ce71 Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Tue, 7 Jun 2016 20:13:08 -0300 Subject: [PATCH 284/813] lpfc: fix oops in lpfc_sli4_scmd_to_wqidx_distr() from lpfc_send_taskmgmt() commit 05a05872c8d4b4357c9d913e6d73ae64882bddf5 upstream. The lpfc_sli4_scmd_to_wqidx_distr() function expects the scsi_cmnd 'lpfc_cmd->pCmd' not to be null, and point to the midlayer command. That's not true in the .eh_(device|target|bus)_reset_handler path, because lpfc_send_taskmgmt() sends commands not from the midlayer, so does not set 'lpfc_cmd->pCmd'. That is true in the .queuecommand path because lpfc_queuecommand() stores the scsi_cmnd from midlayer in lpfc_cmd->pCmd; and lpfc_cmd is stored by lpfc_scsi_prep_cmnd() in piocbq->context1 -- which is passed to lpfc_sli4_scmd_to_wqidx_distr() as lpfc_cmd parameter. This problem can be hit on SCSI EH, and immediately with sg_reset. These 2 test-cases demonstrate the problem/fix with next-20160601. 
Test-case 1) sg_reset # strace sg_reset --device /dev/sdm <...> open("/dev/sdm", O_RDWR|O_NONBLOCK) = 3 ioctl(3, SG_SCSI_RESET, 0x3fffde6d0994 +++ killed by SIGSEGV +++ Segmentation fault # dmesg Unable to handle kernel paging request for data at address 0x00000000 Faulting instruction address: 0xd00000001c88442c Oops: Kernel access of bad area, sig: 11 [#1] <...> CPU: 104 PID: 16333 Comm: sg_reset Tainted: G W 4.7.0-rc1-next-20160601-00004-g95b89dc #6 <...> NIP [d00000001c88442c] lpfc_sli4_scmd_to_wqidx_distr+0xc/0xd0 [lpfc] LR [d00000001c826fe8] lpfc_sli_calc_ring.part.27+0x98/0xd0 [lpfc] Call Trace: [c000003c9ec876f0] [c000003c9ec87770] 0xc000003c9ec87770 (unreliable) [c000003c9ec87720] [d00000001c82e004] lpfc_sli_issue_iocb+0xd4/0x260 [lpfc] [c000003c9ec87780] [d00000001c831a3c] lpfc_sli_issue_iocb_wait+0x15c/0x5b0 [lpfc] [c000003c9ec87880] [d00000001c87f27c] lpfc_send_taskmgmt+0x24c/0x650 [lpfc] [c000003c9ec87950] [d00000001c87fd7c] lpfc_device_reset_handler+0x10c/0x200 [lpfc] [c000003c9ec87a10] [c000000000610694] scsi_try_bus_device_reset+0x44/0xc0 [c000003c9ec87a40] [c0000000006113e8] scsi_ioctl_reset+0x198/0x2c0 [c000003c9ec87bf0] [c00000000060fe5c] scsi_ioctl+0x13c/0x4b0 [c000003c9ec87c80] [c0000000006629b0] sd_ioctl+0xf0/0x120 [c000003c9ec87cd0] [c00000000046e4f8] blkdev_ioctl+0x248/0xb70 [c000003c9ec87d30] [c0000000002a1f60] block_ioctl+0x70/0x90 [c000003c9ec87d50] [c00000000026d334] do_vfs_ioctl+0xc4/0x890 [c000003c9ec87de0] [c00000000026db60] SyS_ioctl+0x60/0xc0 [c000003c9ec87e30] [c000000000009120] system_call+0x38/0x108 Instruction dump: <...> With fix: # strace sg_reset --device /dev/sdm <...> open("/dev/sdm", O_RDWR|O_NONBLOCK) = 3 ioctl(3, SG_SCSI_RESET, 0x3fffe103c554) = 0 close(3) = 0 exit_group(0) = ? +++ exited with 0 +++ # dmesg [ 424.658649] lpfc 0006:01:00.4: 4:(0):0713 SCSI layer issued Device Reset (1, 0) return x2002 Test-case 2) SCSI EH Using this debug patch to wire an SCSI EH trigger, for lpfc_scsi_cmd_iocb_cmpl(): - cmd->scsi_done(cmd); + if ((phba->pport ? 
phba->pport->cfg_log_verbose : phba->cfg_log_verbose) == 0x32100000) + printk(KERN_ALERT "lpfc: skip scsi_done()\n"); + else + cmd->scsi_done(cmd); # echo 0x32100000 > /sys/class/scsi_host/host11/lpfc_log_verbose # dd if=/dev/sdm of=/dev/null iflag=direct & <...> After a while: # dmesg lpfc 0006:01:00.4: 4:(0):3053 lpfc_log_verbose changed from 0 (x0) to 839909376 (x32100000) lpfc: skip scsi_done() <...> Unable to handle kernel paging request for data at address 0x00000000 Faulting instruction address: 0xd0000000199e448c Oops: Kernel access of bad area, sig: 11 [#1] <...> CPU: 96 PID: 28556 Comm: scsi_eh_11 Tainted: G W 4.7.0-rc1-next-20160601-00004-g95b89dc #6 <...> NIP [d0000000199e448c] lpfc_sli4_scmd_to_wqidx_distr+0xc/0xd0 [lpfc] LR [d000000019986fe8] lpfc_sli_calc_ring.part.27+0x98/0xd0 [lpfc] Call Trace: [c000000ff0d0b890] [c000000ff0d0b900] 0xc000000ff0d0b900 (unreliable) [c000000ff0d0b8c0] [d00000001998e004] lpfc_sli_issue_iocb+0xd4/0x260 [lpfc] [c000000ff0d0b920] [d000000019991a3c] lpfc_sli_issue_iocb_wait+0x15c/0x5b0 [lpfc] [c000000ff0d0ba20] [d0000000199df27c] lpfc_send_taskmgmt+0x24c/0x650 [lpfc] [c000000ff0d0baf0] [d0000000199dfd7c] lpfc_device_reset_handler+0x10c/0x200 [lpfc] [c000000ff0d0bbb0] [c000000000610694] scsi_try_bus_device_reset+0x44/0xc0 [c000000ff0d0bbe0] [c0000000006126cc] scsi_eh_ready_devs+0x49c/0x9c0 [c000000ff0d0bcb0] [c000000000614160] scsi_error_handler+0x580/0x680 [c000000ff0d0bd80] [c0000000000ae848] kthread+0x108/0x130 [c000000ff0d0be30] [c0000000000094a8] ret_from_kernel_thread+0x5c/0xb4 Instruction dump: <...> With fix: # dmesg lpfc 0006:01:00.4: 4:(0):3053 lpfc_log_verbose changed from 0 (x0) to 839909376 (x32100000) lpfc: skip scsi_done() <...> lpfc 0006:01:00.4: 4:(0):0713 SCSI layer issued Device Reset (0, 0) return x2002 <...> lpfc 0006:01:00.4: 4:(0):0723 SCSI layer issued Target Reset (1, 0) return x2002 <...> lpfc 0006:01:00.4: 4:(0):0714 SCSI layer issued Bus Reset Data: x2002 <...> lpfc 0006:01:00.4: 4:(0):3172 SCSI layer issued Host Reset Data: <...> Fixes: 8b0dff14164d ("lpfc: Add support for using block multi-queue") Signed-off-by: Mauricio Faria de Oliveira Reviewed-by: Johannes Thumshirn Acked-by: James Smart Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/lpfc/lpfc_scsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c index 4679ed4444a7..9e165bc05ee1 100644 --- a/drivers/scsi/lpfc/lpfc_scsi.c +++ b/drivers/scsi/lpfc/lpfc_scsi.c @@ -3859,7 +3859,7 @@ int lpfc_sli4_scmd_to_wqidx_distr(struct lpfc_hba *phba, uint32_t tag; uint16_t hwq; - if (shost_use_blk_mq(cmnd->device->host)) { + if (cmnd && shost_use_blk_mq(cmnd->device->host)) { tag = blk_mq_unique_tag(cmnd->request); hwq = blk_mq_unique_tag_to_hwq(tag); From ef60c9aa980b59a61c5464cc38fdfaecdec61e47 Mon Sep 17 00:00:00 2001 From: Alim Akhtar Date: Tue, 5 Jul 2016 15:28:53 +0530 Subject: [PATCH 285/813] rtc: s3c: Add s3c_rtc_{enable/disable}_clk in s3c_rtc_setfreq() commit 70c96dfac0e231424e17743bd52f6cd2ff1f2439 upstream. As per code flow s3c_rtc_setfreq() will get called with rtc clock disabled and in set_freq we perform h/w registers read/write, which results in a kernel crash on exynos7 platform while probing rtc driver. 
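The fix brackets the hardware access with an explicit clock
enable/disable pair; in sketch form (matching the hunk shown further
below):

	s3c_rtc_enable_clk(info);	/* clock must run for register I/O */
	spin_lock_irq(&info->pie_lock);
	if (info->data->set_freq)
		info->data->set_freq(info, freq);
	spin_unlock_irq(&info->pie_lock);
	s3c_rtc_disable_clk(info);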
Below is code flow:

s3c_rtc_probe()
  clk_prepare_enable(info->rtc_clk)  // rtc clock enabled
  s3c_rtc_gettime()  // will enable clk if not done, and disable it upon exit
  s3c_rtc_setfreq()  // then this will be called with clk disabled

This patch takes care of this issue by adding s3c_rtc_{enable/disable}_clk
in s3c_rtc_setfreq().

Fixes: 24e1455493da ("drivers/rtc/rtc-s3c.c: delete duplicate clock control")
Signed-off-by: Alim Akhtar
Reviewed-by: Krzysztof Kozlowski
Reviewed-by: Pankaj Dubey
Tested-by: Pankaj Dubey
Signed-off-by: Alexandre Belloni
Signed-off-by: Greg Kroah-Hartman
---
 drivers/rtc/rtc-s3c.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c
index ffb860d18701..f92528822f06 100644
--- a/drivers/rtc/rtc-s3c.c
+++ b/drivers/rtc/rtc-s3c.c
@@ -149,12 +149,14 @@ static int s3c_rtc_setfreq(struct s3c_rtc *info, int freq)
 	if (!is_power_of_2(freq))
 		return -EINVAL;
 
+	s3c_rtc_enable_clk(info);
 	spin_lock_irq(&info->pie_lock);
 	if (info->data->set_freq)
 		info->data->set_freq(info, freq);
 	spin_unlock_irq(&info->pie_lock);
+	s3c_rtc_disable_clk(info);
 
 	return 0;
 }

From c6ec712c44c9f12cded15bff56ab045e844d503b Mon Sep 17 00:00:00 2001
From: Mike Snitzer
Date: Fri, 29 Jul 2016 13:19:55 -0400
Subject: [PATCH 286/813] dm flakey: error READ bios during the down_interval

commit 99f3c90d0d85708e7401a81ce3314e50bf7f2819 upstream.

When the corrupt_bio_byte feature was introduced it caused READ bios to
no longer be errored with -EIO during the down_interval. This had to do
with the complexity of needing to submit READs if the corrupt_bio_byte
feature was used.

Fix it so READ bios are properly errored with -EIO; doing so early in
flakey_map() as long as there isn't a match for the corrupt_bio_byte
feature.

Fixes: a3998799fb4df ("dm flakey: add corrupt_bio_byte feature")
Reported-by: Akira Hayakawa
Signed-off-by: Mike Snitzer
Signed-off-by: Greg Kroah-Hartman
---
 drivers/md/dm-flakey.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 09e2afcafd2d..cd0a93df4cb7 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -289,10 +289,16 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
 		pb->bio_submitted = true;
 
 		/*
-		 * Map reads as normal.
+		 * Map reads as normal only if corrupt_bio_byte set.
 		 */
-		if (bio_data_dir(bio) == READ)
-			goto map_bio;
+		if (bio_data_dir(bio) == READ) {
+			/* If flags were specified, only corrupt those that match. */
+			if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == READ) &&
+			    all_corrupt_bio_flags_match(bio, fc))
+				goto map_bio;
+			else
+				return -EIO;
+		}
 
 		/*
 		 * Drop writes?
@@ -330,12 +336,13 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error)
 
 	/*
 	 * Corrupt successful READs while in down state.
-	 * If flags were specified, only corrupt those that match.
 	 */
-	if (fc->corrupt_bio_byte && !error && pb->bio_submitted &&
-	    (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) &&
-	    all_corrupt_bio_flags_match(bio, fc))
-		corrupt_bio_data(bio, fc);
+	if (!error && pb->bio_submitted && (bio_data_dir(bio) == READ)) {
+		if (fc->corrupt_bio_byte)
+			corrupt_bio_data(bio, fc);
+		else
+			return -EIO;
+	}
 
 	return error;
 }

From bc2318cc76df4fb80c3b5b9ec0e1633627dabb54 Mon Sep 17 00:00:00 2001
From: Ben Hutchings
Date: Thu, 28 Apr 2016 09:24:01 +0930
Subject: [PATCH 287/813] module: Invalidate signatures on force-loaded modules

commit bca014caaa6130e57f69b5bf527967aa8ee70fdd upstream.
Signing a module should only make it trusted by the specific kernel it was built for, not anything else. Loading a signed module meant for a kernel with a different ABI could have interesting effects. Therefore, treat all signatures as invalid when a module is force-loaded. Signed-off-by: Ben Hutchings Signed-off-by: Rusty Russell Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/module.c b/kernel/module.c index 0e5c71195f18..b14a4f31221f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2606,13 +2606,18 @@ static inline void kmemleak_load_module(const struct module *mod, #endif #ifdef CONFIG_MODULE_SIG -static int module_sig_check(struct load_info *info) +static int module_sig_check(struct load_info *info, int flags) { int err = -ENOKEY; const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; const void *mod = info->hdr; - if (info->len > markerlen && + /* + * Require flags == 0, as a module with version information + * removed is no longer the module that was signed + */ + if (flags == 0 && + info->len > markerlen && memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { /* We truncate the module to discard the signature */ info->len -= markerlen; @@ -2631,7 +2636,7 @@ static int module_sig_check(struct load_info *info) return err; } #else /* !CONFIG_MODULE_SIG */ -static int module_sig_check(struct load_info *info) +static int module_sig_check(struct load_info *info, int flags) { return 0; } @@ -3444,7 +3449,7 @@ static int load_module(struct load_info *info, const char __user *uargs, long err; char *after_dashes; - err = module_sig_check(info); + err = module_sig_check(info, flags); if (err) goto free_copy; From cc4860773f6f8f35a178bc9615ec52c3fbc86ca4 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 28 Apr 2016 09:24:05 +0930 Subject: [PATCH 288/813] Documentation/module-signing.txt: Note need for version info if reusing a key commit b8612e517c3c9809e1200b72c474dbfd969e5a83 upstream. Signing a module should only make it trusted by the specific kernel it was built for, not anything else. If a module signing key is used for multiple ABI-incompatible kernels, the modules need to include enough version information to distinguish them. Signed-off-by: Ben Hutchings Signed-off-by: Rusty Russell Signed-off-by: Greg Kroah-Hartman --- Documentation/module-signing.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/module-signing.txt b/Documentation/module-signing.txt index a78bf1ffa68c..39b7f612c418 100644 --- a/Documentation/module-signing.txt +++ b/Documentation/module-signing.txt @@ -271,3 +271,9 @@ Since the private key is used to sign modules, viruses and malware could use the private key to sign modules and compromise the operating system. The private key must be either destroyed or moved to a secure location and not kept in the root node of the kernel source tree. + +If you use the same private key to sign modules for multiple kernel +configurations, you must ensure that the module version information is +sufficient to prevent loading a module into a different kernel. Either +set CONFIG_MODVERSIONS=y or ensure that each configuration has a different +kernel release string by changing EXTRAVERSION or CONFIG_LOCALVERSION. 
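Taken together, these two patches tie signature trust to the exact image that was signed. As a minimal standalone sketch of the gating logic: the struct, verify_sig_sketch() and the sig_enforce handling are illustrative stand-ins for the kernel's internals, while the marker string mirrors MODULE_SIG_STRING.

    #include <errno.h>
    #include <string.h>

    #define SIG_MARKER "~Module signature appended~\n"

    struct load_info_sketch {
    	const char *hdr;	/* module image */
    	unsigned long len;	/* image length, signature included */
    	int sig_ok;
    };

    /* Stand-in for the real signature verifier. */
    static int verify_sig_sketch(const char *mod, unsigned long *len)
    {
    	(void)mod;
    	(void)len;
    	return -ENOKEY;
    }

    static int module_sig_check_sketch(struct load_info_sketch *info,
    				   int flags, int sig_enforce)
    {
    	int err = -ENOKEY;
    	const unsigned long markerlen = sizeof(SIG_MARKER) - 1;

    	/*
    	 * flags != 0 means a force-load that may have stripped version
    	 * info, so the image is no longer the one that was signed:
    	 * do not even look for the signature marker.
    	 */
    	if (flags == 0 && info->len > markerlen &&
    	    memcmp(info->hdr + info->len - markerlen, SIG_MARKER,
    		   markerlen) == 0) {
    		info->len -= markerlen;	/* truncate away the marker */
    		err = verify_sig_sketch(info->hdr, &info->len);
    	}

    	if (!err) {
    		info->sig_ok = 1;
    		return 0;
    	}

    	/* A missing signature is fatal only in enforcing mode. */
    	if (err == -ENOKEY && !sig_enforce)
    		err = 0;
    	return err;
    }

With enforcement enabled (CONFIG_MODULE_SIG_FORCE), the -ENOKEY fails the force-load outright; otherwise the module loads untrusted, exactly as an unsigned module would.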
From 85184740541c2b80b72ebfa46cfe065ec1d1058f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 20 Aug 2016 18:09:38 +0200 Subject: [PATCH 289/813] Linux 4.4.19 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index eaedea88a8a7..695c64ec160c 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 18 +SUBLEVEL = 19 EXTRAVERSION = NAME = Blurry Fish Butt From 472dd6904d7bfd38d86ec3b8071b7260b51c290d Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 19 Jul 2016 15:00:04 -0700 Subject: [PATCH 290/813] mm: Add is_migrate_cma_page Code such as hardened user copy[1] needs a way to tell if a page is CMA or not. Add is_migrate_cma_page in a similar way to is_migrate_isolate_page. [1]http://article.gmane.org/gmane.linux.kernel.mm/155238 Signed-off-by: Laura Abbott Signed-off-by: Kees Cook (cherry picked from commit 7c15d9bb8231f998ae7dc0b72415f5215459f7fb) Signed-off-by: Alex Shi --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e23a9e704536..bab4053fb795 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -65,8 +65,10 @@ enum { #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) +# define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) #else # define is_migrate_cma(migratetype) false +# define is_migrate_cma_page(_page) false #endif #define for_each_migratetype_order(order, type) \ From fdb92b0de361f9043f359a1de52e2bedd9da4599 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Jul 2016 16:19:48 -0700 Subject: [PATCH 291/813] mm: Implement stack frame object validation This creates per-architecture function arch_within_stack_frames() that should validate if a given object is contained by a kernel stack frame. Initial implementation is on x86. This is based on code from PaX. Signed-off-by: Kees Cook (cherry picked from commit 0f60a8efe4005ab5e65ce000724b04d4ca04a199) Signed-off-by: Alex Shi Conflicts: skip EBPF_JIT in arch/x86/Kconfig --- arch/Kconfig | 9 ++++++ arch/x86/Kconfig | 2 +- arch/x86/include/asm/thread_info.h | 44 ++++++++++++++++++++++++++++++ include/linux/thread_info.h | 9 ++++++ 4 files changed, 63 insertions(+), 1 deletion(-) diff --git a/arch/Kconfig b/arch/Kconfig index 4e949e58b192..d4d9845530f1 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -423,6 +423,15 @@ config CC_STACKPROTECTOR_STRONG endchoice +config HAVE_ARCH_WITHIN_STACK_FRAMES + bool + help + An architecture should select this if it can walk the kernel stack + frames to determine if an object is part of either the arguments + or local variables (i.e. that it excludes saved return addresses, + and similar) by implementing an inline arch_within_stack_frames(), + which is used by CONFIG_HARDENED_USERCOPY. 
+ config HAVE_CONTEXT_TRACKING bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9d5e3a27bef2..696ec6a54bbf 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -86,7 +86,7 @@ config X86 select HAVE_ARCH_SOFT_DIRTY if X86_64 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE - select HAVE_BPF_JIT if X86_64 + select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index c7b551028740..0c977fc124a7 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -177,6 +177,50 @@ static inline unsigned long current_stack_pointer(void) return sp; } +/* + * Walks up the stack frames to make sure that the specified object is + * entirely contained by a single stack frame. + * + * Returns: + * 1 if within a frame + * -1 if placed across a frame boundary (or outside stack) + * 0 unable to determine (no frame pointers, etc) + */ +static inline int arch_within_stack_frames(const void * const stack, + const void * const stackend, + const void *obj, unsigned long len) +{ +#if defined(CONFIG_FRAME_POINTER) + const void *frame = NULL; + const void *oldframe; + + oldframe = __builtin_frame_address(1); + if (oldframe) + frame = __builtin_frame_address(2); + /* + * low ----------------------------------------------> high + * [saved bp][saved ip][args][local vars][saved bp][saved ip] + * ^----------------^ + * allow copies only within here + */ + while (stack <= frame && frame < stackend) { + /* + * If obj + len extends past the last frame, this + * check won't pass and the next frame will be 0, + * causing us to bail out and correctly report + * the copy as invalid. + */ + if (obj + len <= frame) + return obj >= oldframe + 2 * sizeof(void *) ? 1 : -1; + oldframe = frame; + frame = *(const void * const *)frame; + } + return -1; +#else + return 0; +#endif +} + #else /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_64 diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index ff307b548ed3..5ecb68e86968 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -145,6 +145,15 @@ static inline bool test_and_clear_restore_sigmask(void) #error "no set_restore_sigmask() provided and default one won't work" #endif +#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES +static inline int arch_within_stack_frames(const void * const stack, + const void * const stackend, + const void *obj, unsigned long len) +{ + return 0; +} +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */ From 799abb4f9534fe9323c2f931c2989d4bc276b256 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 7 Jun 2016 11:05:33 -0700 Subject: [PATCH 292/813] mm: Hardened usercopy This is the start of porting PAX_USERCOPY into the mainline kernel. This is the first set of features, controlled by CONFIG_HARDENED_USERCOPY. The work is based on code by PaX Team and Brad Spengler, and an earlier port from Casey Schaufler. Additional non-slab page tests are from Rik van Riel. 
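As a hedged preview of how the frame walk above gets consumed: the simplified shape below mirrors the check_stack_object() helper that this patch adds in mm/usercopy.c (see the new file further down); it is not the exact code, and the return-value convention is taken from the comments in the diff.

    enum { BAD_STACK = -1, NOT_STACK = 0, GOOD_FRAME, GOOD_STACK };

    /* Simplified sketch of the stack-side usercopy check. */
    static int check_stack_object_sketch(const void * const stack,
    				     const void * const stackend,
    				     const void *obj, unsigned long len)
    {
    	int ret;

    	/* Entirely off this stack: nothing for the stack check to say. */
    	if (obj + len <= stack || stackend <= obj)
    		return NOT_STACK;

    	/* One end inside, one end outside: always reject. */
    	if (obj < stack || stackend < obj + len)
    		return BAD_STACK;

    	/* 1: within one frame, -1: spans frames, 0: cannot tell. */
    	ret = arch_within_stack_frames(stack, stackend, obj, len);
    	if (ret)
    		return ret;

    	return GOOD_STACK;
    }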
This patch contains the logic for validating several conditions when performing copy_to_user() and copy_from_user() on the kernel object being copied to/from: - address range doesn't wrap around - address range isn't NULL or zero-allocated (with a non-zero copy size) - if on the slab allocator: - object size must be less than or equal to copy size (when check is implemented in the allocator, which appear in subsequent patches) - otherwise, object must not span page allocations (excepting Reserved and CMA ranges) - if on the stack - object must not extend before/after the current process stack - object must be contained by a valid stack frame (when there is arch/build support for identifying stack frames) - object must not overlap with kernel text Signed-off-by: Kees Cook Tested-by: Valdis Kletnieks Tested-by: Michael Ellerman (cherry picked from commit f5509cc18daa7f82bcc553be70df2117c8eedc16) Signed-off-by: Alex Shi Conflicts: skip debug_page_ref and KCOV_INSTRUMENT in mm/Makefile --- include/linux/slab.h | 12 ++ include/linux/thread_info.h | 15 ++ mm/Makefile | 4 + mm/usercopy.c | 268 ++++++++++++++++++++++++++++++++++++ security/Kconfig | 28 ++++ 5 files changed, 327 insertions(+) create mode 100644 mm/usercopy.c diff --git a/include/linux/slab.h b/include/linux/slab.h index 2037a861e367..4ef384b172e0 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -144,6 +144,18 @@ void kfree(const void *); void kzfree(const void *); size_t ksize(const void *); +#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR +const char *__check_heap_object(const void *ptr, unsigned long n, + struct page *page); +#else +static inline const char *__check_heap_object(const void *ptr, + unsigned long n, + struct page *page) +{ + return NULL; +} +#endif + /* * Some archs want to perform DMA into kmalloc caches and need a guaranteed * alignment larger than the alignment of a 64-bit integer. diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 5ecb68e86968..0ae29ff9ccfd 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -154,6 +154,21 @@ static inline int arch_within_stack_frames(const void * const stack, } #endif +#ifdef CONFIG_HARDENED_USERCOPY +extern void __check_object_size(const void *ptr, unsigned long n, + bool to_user); + +static inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) +{ + __check_object_size(ptr, n, to_user); +} +#else +static inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) +{ } +#endif /* CONFIG_HARDENED_USERCOPY */ + #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */ diff --git a/mm/Makefile b/mm/Makefile index 2ed43191fc3b..8b532c94008f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,6 +5,9 @@ KASAN_SANITIZE_slab_common.o := n KASAN_SANITIZE_slub.o := n +# Since __builtin_frame_address does work as used, disable the warning. 
+CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address) + mmu-y := nommu.o mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ @@ -81,3 +84,4 @@ obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o +obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o diff --git a/mm/usercopy.c b/mm/usercopy.c new file mode 100644 index 000000000000..8ebae91a6b55 --- /dev/null +++ b/mm/usercopy.c @@ -0,0 +1,268 @@ +/* + * This implements the various checks for CONFIG_HARDENED_USERCOPY*, + * which are designed to protect kernel memory from needless exposure + * and overwrite under many unintended conditions. This code is based + * on PAX_USERCOPY, which is: + * + * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source + * Security Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +enum { + BAD_STACK = -1, + NOT_STACK = 0, + GOOD_FRAME, + GOOD_STACK, +}; + +/* + * Checks if a given pointer and length is contained by the current + * stack frame (if possible). + * + * Returns: + * NOT_STACK: not at all on the stack + * GOOD_FRAME: fully within a valid stack frame + * GOOD_STACK: fully on the stack (when can't do frame-checking) + * BAD_STACK: error condition (invalid stack position or bad stack frame) + */ +static noinline int check_stack_object(const void *obj, unsigned long len) +{ + const void * const stack = task_stack_page(current); + const void * const stackend = stack + THREAD_SIZE; + int ret; + + /* Object is not on the stack at all. */ + if (obj + len <= stack || stackend <= obj) + return NOT_STACK; + + /* + * Reject: object partially overlaps the stack (passing the + * the check above means at least one end is within the stack, + * so if this check fails, the other end is outside the stack). + */ + if (obj < stack || stackend < obj + len) + return BAD_STACK; + + /* Check if object is safely within a valid frame. */ + ret = arch_within_stack_frames(stack, stackend, obj, len); + if (ret) + return ret; + + return GOOD_STACK; +} + +static void report_usercopy(const void *ptr, unsigned long len, + bool to_user, const char *type) +{ + pr_emerg("kernel memory %s attempt detected %s %p (%s) (%lu bytes)\n", + to_user ? "exposure" : "overwrite", + to_user ? "from" : "to", ptr, type ? : "unknown", len); + /* + * For greater effect, it would be nice to do do_group_exit(), + * but BUG() actually hooks all the lock-breaking and per-arch + * Oops code, so that is used here instead. + */ + BUG(); +} + +/* Returns true if any portion of [ptr,ptr+n) over laps with [low,high). */ +static bool overlaps(const void *ptr, unsigned long n, unsigned long low, + unsigned long high) +{ + unsigned long check_low = (uintptr_t)ptr; + unsigned long check_high = check_low + n; + + /* Does not overlap if entirely above or entirely below. */ + if (check_low >= high || check_high < low) + return false; + + return true; +} + +/* Is this address range in the kernel text area? 
*/ +static inline const char *check_kernel_text_object(const void *ptr, + unsigned long n) +{ + unsigned long textlow = (unsigned long)_stext; + unsigned long texthigh = (unsigned long)_etext; + unsigned long textlow_linear, texthigh_linear; + + if (overlaps(ptr, n, textlow, texthigh)) + return ""; + + /* + * Some architectures have virtual memory mappings with a secondary + * mapping of the kernel text, i.e. there is more than one virtual + * kernel address that points to the kernel image. It is usually + * when there is a separate linear physical memory mapping, in that + * __pa() is not just the reverse of __va(). This can be detected + * and checked: + */ + textlow_linear = (unsigned long)__va(__pa(textlow)); + /* No different mapping: we're done. */ + if (textlow_linear == textlow) + return NULL; + + /* Check the secondary mapping... */ + texthigh_linear = (unsigned long)__va(__pa(texthigh)); + if (overlaps(ptr, n, textlow_linear, texthigh_linear)) + return ""; + + return NULL; +} + +static inline const char *check_bogus_address(const void *ptr, unsigned long n) +{ + /* Reject if object wraps past end of memory. */ + if (ptr + n < ptr) + return ""; + + /* Reject if NULL or ZERO-allocation. */ + if (ZERO_OR_NULL_PTR(ptr)) + return ""; + + return NULL; +} + +static inline const char *check_heap_object(const void *ptr, unsigned long n, + bool to_user) +{ + struct page *page, *endpage; + const void *end = ptr + n - 1; + bool is_reserved, is_cma; + + /* + * Some architectures (arm64) return true for virt_addr_valid() on + * vmalloced addresses. Work around this by checking for vmalloc + * first. + */ + if (is_vmalloc_addr(ptr)) + return NULL; + + if (!virt_addr_valid(ptr)) + return NULL; + + page = virt_to_head_page(ptr); + + /* Check slab allocator for flags and size. */ + if (PageSlab(page)) + return __check_heap_object(ptr, n, page); + + /* + * Sometimes the kernel data regions are not marked Reserved (see + * check below). And sometimes [_sdata,_edata) does not cover + * rodata and/or bss, so check each range explicitly. + */ + + /* Allow reads of kernel rodata region (if not marked as Reserved). */ + if (ptr >= (const void *)__start_rodata && + end <= (const void *)__end_rodata) { + if (!to_user) + return ""; + return NULL; + } + + /* Allow kernel data region (if not marked as Reserved). */ + if (ptr >= (const void *)_sdata && end <= (const void *)_edata) + return NULL; + + /* Allow kernel bss region (if not marked as Reserved). */ + if (ptr >= (const void *)__bss_start && + end <= (const void *)__bss_stop) + return NULL; + + /* Is the object wholly within one base page? */ + if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) == + ((unsigned long)end & (unsigned long)PAGE_MASK))) + return NULL; + + /* Allow if start and end are inside the same compound page. */ + endpage = virt_to_head_page(end); + if (likely(endpage == page)) + return NULL; + + /* + * Reject if range is entirely either Reserved (i.e. special or + * device memory), or CMA. Otherwise, reject since the object spans + * several independently allocated pages. 
+	 */
+	is_reserved = PageReserved(page);
+	is_cma = is_migrate_cma_page(page);
+	if (!is_reserved && !is_cma)
+		goto reject;
+
+	for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) {
+		page = virt_to_head_page(ptr);
+		if (is_reserved && !PageReserved(page))
+			goto reject;
+		if (is_cma && !is_migrate_cma_page(page))
+			goto reject;
+	}
+
+	return NULL;
+
+reject:
+	return "";
+}
+
+/*
+ * Validates that the given object is:
+ * - not bogus address
+ * - known-safe heap or stack object
+ * - not in kernel text
+ */
+void __check_object_size(const void *ptr, unsigned long n, bool to_user)
+{
+	const char *err;
+
+	/* Skip all tests if size is zero. */
+	if (!n)
+		return;
+
+	/* Check for invalid addresses. */
+	err = check_bogus_address(ptr, n);
+	if (err)
+		goto report;
+
+	/* Check for bad heap object. */
+	err = check_heap_object(ptr, n, to_user);
+	if (err)
+		goto report;
+
+	/* Check for bad stack object. */
+	switch (check_stack_object(ptr, n)) {
+	case NOT_STACK:
+		/* Object is not touching the current process stack. */
+		break;
+	case GOOD_FRAME:
+	case GOOD_STACK:
+		/*
+		 * Object is either in the correct frame (when it
+		 * is possible to check) or just generally on the
+		 * process stack (when frame checking not available).
+		 */
+		return;
+	default:
+		err = "";
+		goto report;
+	}
+
+	/* Check for object in kernel to avoid text exposure. */
+	err = check_kernel_text_object(ptr, n);
+	if (!err)
+		return;
+
+report:
+	report_usercopy(ptr, n, to_user, err);
+}
+EXPORT_SYMBOL(__check_object_size);
diff --git a/security/Kconfig b/security/Kconfig
index e45237897b43..46c00a674eec 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -118,6 +118,34 @@ config LSM_MMAP_MIN_ADDR
 	  this low address space will need the permission specific to the
 	  systems running LSM.
 
+config HAVE_HARDENED_USERCOPY_ALLOCATOR
+	bool
+	help
+	  The heap allocator implements __check_heap_object() for
+	  validating memory ranges against heap object sizes in
+	  support of CONFIG_HARDENED_USERCOPY.
+
+config HAVE_ARCH_HARDENED_USERCOPY
+	bool
+	help
+	  The architecture supports CONFIG_HARDENED_USERCOPY by
+	  calling check_object_size() just before performing the
+	  userspace copies in the low level implementation of
+	  copy_to_user() and copy_from_user().
+
+config HARDENED_USERCOPY
+	bool "Harden memory copies between kernel and userspace"
+	depends on HAVE_ARCH_HARDENED_USERCOPY
+	select BUG
+	help
+	  This option checks for obviously wrong memory regions when
+	  copying memory to/from the kernel (via copy_to_user() and
+	  copy_from_user() functions) by rejecting memory ranges that
+	  are larger than the specified heap object, span multiple
+	  separately allocated pages, are not on the process stack,
+	  or are part of the kernel text. This kills entire classes
+	  of heap overflow exploits and similar kernel memory exposures.
+
 source security/selinux/Kconfig
 source security/smack/Kconfig
 source security/tomoyo/Kconfig

From 662dda1b7b258dc918933d57f9249a1b5f16512c Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Thu, 17 Dec 2015 09:45:09 -0800
Subject: [PATCH 293/813] x86: reorganize SMAP handling in user space accesses

This reorganizes how we do the stac/clac instructions in the user access
code. Instead of adding the instructions directly to the same inline asm
that does the actual user level access and exception handling, add them
at a higher level.
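Concretely, the move looks roughly like this. A hedged sketch only: the real accessors are inline asm with exception-table fixups, and the *_sketch helpers are illustrative stand-ins.

    /* The stac/clac pair is hoisted out of the per-access helpers... */
    static inline void __uaccess_begin_sketch(void)
    {
    	/* stac, on SMAP-capable parts */
    }

    static inline void __uaccess_end_sketch(void)
    {
    	/* clac */
    }

    /* ...leaving each access itself unbracketed. */
    static inline int raw_user_read_sketch(unsigned long *dst,
    				       const unsigned long *src)
    {
    	*dst = *src;	/* stands in for the asm access plus fixup */
    	return 0;
    }

    /* Two accesses now share a single stac/clac pair. */
    static int read_two_words_sketch(unsigned long *dst,
    				 const unsigned long *src)
    {
    	int err;

    	__uaccess_begin_sketch();
    	err = raw_user_read_sketch(&dst[0], &src[0]);
    	if (!err)
    		err = raw_user_read_sketch(&dst[1], &src[1]);
    	__uaccess_end_sketch();

    	return err;
    }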
This is mainly preparation for the next step, where we will expose an interface to allow users to mark several accesses together as being user space accesses, but it does already clean up some code: - the inlined trivial cases of copy_in_user() now do stac/clac just once over the accesses: they used to do one pair around the user space read, and another pair around the write-back. - the {get,put}_user_ex() macros that are used with the catch/try handling don't do any stac/clac at all, because that happens in the try/catch surrounding them. Other than those two cleanups that happened naturally from the re-organization, this should not make any difference. Yet. Signed-off-by: Linus Torvalds (cherry picked from commit 11f1a4b9755f5dbc3e822a96502ebe9b044b14d8) Signed-off-by: Alex Shi --- arch/x86/include/asm/uaccess.h | 53 +++++++++++------ arch/x86/include/asm/uaccess_64.h | 94 ++++++++++++++++++++++--------- 2 files changed, 101 insertions(+), 46 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 09b1b0ab94b7..cc228f4713da 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -134,6 +134,9 @@ extern int __get_user_4(void); extern int __get_user_8(void); extern int __get_user_bad(void); +#define __uaccess_begin() stac() +#define __uaccess_end() clac() + /* * This is a type: either unsigned long, if the argument fits into * that type, or otherwise unsigned long long. @@ -193,10 +196,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) #ifdef CONFIG_X86_32 #define __put_user_asm_u64(x, addr, err, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: movl %%eax,0(%2)\n" \ "2: movl %%edx,4(%2)\n" \ - "3: " ASM_CLAC "\n" \ + "3:" \ ".section .fixup,\"ax\"\n" \ "4: movl %3,%0\n" \ " jmp 3b\n" \ @@ -207,10 +210,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) : "A" (x), "r" (addr), "i" (errret), "0" (err)) #define __put_user_asm_ex_u64(x, addr) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ - "3: " ASM_CLAC "\n" \ + "3:" \ _ASM_EXTABLE_EX(1b, 2b) \ _ASM_EXTABLE_EX(2b, 3b) \ : : "A" (x), "r" (addr)) @@ -304,6 +307,10 @@ do { \ } \ } while (0) +/* + * This doesn't do __uaccess_begin/end - the exception handling + * around it must do that. + */ #define __put_user_size_ex(x, ptr, size) \ do { \ __chk_user_ptr(ptr); \ @@ -358,9 +365,9 @@ do { \ } while (0) #define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: mov"itype" %2,%"rtype"1\n" \ - "2: " ASM_CLAC "\n" \ + "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %3,%0\n" \ " xor"itype" %"rtype"1,%"rtype"1\n" \ @@ -370,6 +377,10 @@ do { \ : "=r" (err), ltype(x) \ : "m" (__m(addr)), "i" (errret), "0" (err)) +/* + * This doesn't do __uaccess_begin/end - the exception handling + * around it must do that. 
+ */ #define __get_user_size_ex(x, ptr, size) \ do { \ __chk_user_ptr(ptr); \ @@ -400,7 +411,9 @@ do { \ #define __put_user_nocheck(x, ptr, size) \ ({ \ int __pu_err; \ + __uaccess_begin(); \ __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ + __uaccess_end(); \ __builtin_expect(__pu_err, 0); \ }) @@ -408,7 +421,9 @@ do { \ ({ \ int __gu_err; \ unsigned long __gu_val; \ + __uaccess_begin(); \ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ + __uaccess_end(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ __builtin_expect(__gu_err, 0); \ }) @@ -423,9 +438,9 @@ struct __large_struct { unsigned long buf[100]; }; * aliasing issues. */ #define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: mov"itype" %"rtype"1,%2\n" \ - "2: " ASM_CLAC "\n" \ + "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %3,%0\n" \ " jmp 2b\n" \ @@ -445,11 +460,11 @@ struct __large_struct { unsigned long buf[100]; }; */ #define uaccess_try do { \ current_thread_info()->uaccess_err = 0; \ - stac(); \ + __uaccess_begin(); \ barrier(); #define uaccess_catch(err) \ - clac(); \ + __uaccess_end(); \ (err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0); \ } while (0) @@ -547,12 +562,13 @@ extern void __cmpxchg_wrong_size(void) __typeof__(ptr) __uval = (uval); \ __typeof__(*(ptr)) __old = (old); \ __typeof__(*(ptr)) __new = (new); \ + __uaccess_begin(); \ switch (size) { \ case 1: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -566,9 +582,9 @@ extern void __cmpxchg_wrong_size(void) } \ case 2: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -582,9 +598,9 @@ extern void __cmpxchg_wrong_size(void) } \ case 4: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -601,9 +617,9 @@ extern void __cmpxchg_wrong_size(void) if (!IS_ENABLED(CONFIG_X86_64)) \ __cmpxchg_wrong_size(); \ \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -618,6 +634,7 @@ extern void __cmpxchg_wrong_size(void) default: \ __cmpxchg_wrong_size(); \ } \ + __uaccess_end(); \ *__uval = __old; \ __ret; \ }) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index f2f9b39b274a..b89c34c4019b 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -56,35 +56,49 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { - case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src, + case 1: + __uaccess_begin(); + __get_user_asm(*(u8 *)dst, (u8 __user *)src, ret, "b", "b", "=q", 1); + __uaccess_end(); return ret; - case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src, + case 2: + __uaccess_begin(); + __get_user_asm(*(u16 *)dst, (u16 __user *)src, ret, "w", "w", "=r", 2); + __uaccess_end(); return ret; - case 4:__get_user_asm(*(u32 *)dst, (u32 __user *)src, + 
case 4: + __uaccess_begin(); + __get_user_asm(*(u32 *)dst, (u32 __user *)src, ret, "l", "k", "=r", 4); + __uaccess_end(); return ret; - case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src, + case 8: + __uaccess_begin(); + __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 8); + __uaccess_end(); return ret; case 10: + __uaccess_begin(); __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 10); - if (unlikely(ret)) - return ret; - __get_user_asm(*(u16 *)(8 + (char *)dst), - (u16 __user *)(8 + (char __user *)src), - ret, "w", "w", "=r", 2); + if (likely(!ret)) + __get_user_asm(*(u16 *)(8 + (char *)dst), + (u16 __user *)(8 + (char __user *)src), + ret, "w", "w", "=r", 2); + __uaccess_end(); return ret; case 16: + __uaccess_begin(); __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 16); - if (unlikely(ret)) - return ret; - __get_user_asm(*(u64 *)(8 + (char *)dst), - (u64 __user *)(8 + (char __user *)src), - ret, "q", "", "=r", 8); + if (likely(!ret)) + __get_user_asm(*(u64 *)(8 + (char *)dst), + (u64 __user *)(8 + (char __user *)src), + ret, "q", "", "=r", 8); + __uaccess_end(); return ret; default: return copy_user_generic(dst, (__force void *)src, size); @@ -106,35 +120,51 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size) if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { - case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst, + case 1: + __uaccess_begin(); + __put_user_asm(*(u8 *)src, (u8 __user *)dst, ret, "b", "b", "iq", 1); + __uaccess_end(); return ret; - case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst, + case 2: + __uaccess_begin(); + __put_user_asm(*(u16 *)src, (u16 __user *)dst, ret, "w", "w", "ir", 2); + __uaccess_end(); return ret; - case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst, + case 4: + __uaccess_begin(); + __put_user_asm(*(u32 *)src, (u32 __user *)dst, ret, "l", "k", "ir", 4); + __uaccess_end(); return ret; - case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst, + case 8: + __uaccess_begin(); + __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 8); + __uaccess_end(); return ret; case 10: + __uaccess_begin(); __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 10); - if (unlikely(ret)) - return ret; - asm("":::"memory"); - __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst, - ret, "w", "w", "ir", 2); + if (likely(!ret)) { + asm("":::"memory"); + __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst, + ret, "w", "w", "ir", 2); + } + __uaccess_end(); return ret; case 16: + __uaccess_begin(); __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 16); - if (unlikely(ret)) - return ret; - asm("":::"memory"); - __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, - ret, "q", "", "er", 8); + if (likely(!ret)) { + asm("":::"memory"); + __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, + ret, "q", "", "er", 8); + } + __uaccess_end(); return ret; default: return copy_user_generic((__force void *)dst, src, size); @@ -160,39 +190,47 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) switch (size) { case 1: { u8 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u8 __user *)src, ret, "b", "b", "=q", 1); if (likely(!ret)) __put_user_asm(tmp, (u8 __user *)dst, ret, "b", "b", "iq", 1); + __uaccess_end(); return ret; } case 2: { u16 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u16 __user *)src, ret, "w", "w", "=r", 2); if (likely(!ret)) __put_user_asm(tmp, (u16 __user *)dst, ret, 
"w", "w", "ir", 2); + __uaccess_end(); return ret; } case 4: { u32 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u32 __user *)src, ret, "l", "k", "=r", 4); if (likely(!ret)) __put_user_asm(tmp, (u32 __user *)dst, ret, "l", "k", "ir", 4); + __uaccess_end(); return ret; } case 8: { u64 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u64 __user *)src, ret, "q", "", "=r", 8); if (likely(!ret)) __put_user_asm(tmp, (u64 __user *)dst, ret, "q", "", "er", 8); + __uaccess_end(); return ret; } default: From 798522d907ede95418a20e7153101b4659151e32 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 17 Dec 2015 09:57:27 -0800 Subject: [PATCH 294/813] Add 'unsafe' user access functions for batched accesses The naming is meant to discourage random use: the helper functions are not really any more "unsafe" than the traditional double-underscore functions (which need the address range checking), but they do need even more infrastructure around them, and should not be used willy-nilly. In addition to checking the access range, these user access functions require that you wrap the user access with a "user_acess_{begin,end}()" around it. That allows architectures that implement kernel user access control (x86: SMAP, arm64: PAN) to do the user access control in the wrapping user_access_begin/end part, and then batch up the actual user space accesses using the new interfaces. The main (and hopefully only) use for these are for core generic access helpers, initially just the generic user string functions (strnlen_user() and strncpy_from_user()). Signed-off-by: Linus Torvalds (cherry picked from commit 5b24a7a2aa2040c8c50c3b71122901d01661ff78) Signed-off-by: Alex Shi --- arch/x86/include/asm/uaccess.h | 25 +++++++++++++++++++++++++ include/linux/uaccess.h | 7 +++++++ 2 files changed, 32 insertions(+) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index cc228f4713da..ca59e4f9254e 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -762,5 +762,30 @@ copy_to_user(void __user *to, const void *from, unsigned long n) #undef __copy_from_user_overflow #undef __copy_to_user_overflow +/* + * The "unsafe" user accesses aren't really "unsafe", but the naming + * is a big fat warning: you have to not only do the access_ok() + * checking before using them, but you have to surround them with the + * user_access_begin/end() pair. 
+ */ +#define user_access_begin() __uaccess_begin() +#define user_access_end() __uaccess_end() + +#define unsafe_put_user(x, ptr) \ +({ \ + int __pu_err; \ + __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \ + __builtin_expect(__pu_err, 0); \ +}) + +#define unsafe_get_user(x, ptr) \ +({ \ + int __gu_err; \ + unsigned long __gu_val; \ + __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \ + (x) = (__force __typeof__(*(ptr)))__gu_val; \ + __builtin_expect(__gu_err, 0); \ +}) + #endif /* _ASM_X86_UACCESS_H */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 558129af828a..349557825428 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -111,4 +111,11 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); #define probe_kernel_address(addr, retval) \ probe_kernel_read(&retval, addr, sizeof(retval)) +#ifndef user_access_begin +#define user_access_begin() do { } while (0) +#define user_access_end() do { } while (0) +#define unsafe_get_user(x, ptr) __get_user(x, ptr) +#define unsafe_put_user(x, ptr) __put_user(x, ptr) +#endif + #endif /* __LINUX_UACCESS_H__ */ From ea2e77f49015ae941d7c98fbe066d387dec8fa55 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 17 Dec 2015 10:05:19 -0800 Subject: [PATCH 295/813] Use the new batched user accesses in generic user string handling This converts the generic user string functions to use the batched user access functions. It makes a big difference on Skylake, which is the first x86 microarchitecture to implement SMAP. The STAC/CLAC instructions are not very fast, and doing them for each access inside the loop that copies strings from user space (which is what the pathname handling does for every pathname the kernel uses, for example) is very inefficient. 
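The shape of the win, as a minimal sketch: one user_access_begin/end pair brackets the whole loop, and the unsafe_get_user() calls inside emit no stac/clac of their own. Byte-at-a-time for brevity where the real code goes word-at-a-time, and access_ok() on src is assumed to have passed already.

    static long strncpy_batched_sketch(char *dst, const char __user *src,
    				   long count)
    {
    	long res = 0;

    	user_access_begin();			/* one stac */
    	while (res < count) {
    		char c;

    		if (unlikely(unsafe_get_user(c, src + res)))
    			goto efault;
    		dst[res] = c;
    		if (!c)
    			break;
    		res++;
    	}
    	user_access_end();			/* one clac */
    	return res;

    efault:
    	user_access_end();
    	return -EFAULT;
    }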
Signed-off-by: Linus Torvalds (cherry picked from commit 9fd4470ff4974c41b1db43c3b355b9085af9c12a) Signed-off-by: Alex Shi --- lib/strncpy_from_user.c | 11 ++++++++--- lib/strnlen_user.c | 18 ++++++++++++++---- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index e0af6ff73d14..33840324138c 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -39,7 +39,7 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long unsigned long c, data; /* Fall back to byte-at-a-time if we get a page fault */ - if (unlikely(__get_user(c,(unsigned long __user *)(src+res)))) + if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) break; *(unsigned long *)(dst+res) = c; if (has_zero(c, &data, &constants)) { @@ -55,7 +55,7 @@ byte_at_a_time: while (max) { char c; - if (unlikely(__get_user(c,src+res))) + if (unlikely(unsafe_get_user(c,src+res))) return -EFAULT; dst[res] = c; if (!c) @@ -107,7 +107,12 @@ long strncpy_from_user(char *dst, const char __user *src, long count) src_addr = (unsigned long)src; if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; - return do_strncpy_from_user(dst, src, count, max); + long retval; + + user_access_begin(); + retval = do_strncpy_from_user(dst, src, count, max); + user_access_end(); + return retval; } return -EFAULT; } diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 3a5f2b366d84..2625943625d7 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -45,7 +45,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, src -= align; max += align; - if (unlikely(__get_user(c,(unsigned long __user *)src))) + if (unlikely(unsafe_get_user(c,(unsigned long __user *)src))) return 0; c |= aligned_byte_mask(align); @@ -61,7 +61,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, if (unlikely(max <= sizeof(unsigned long))) break; max -= sizeof(unsigned long); - if (unlikely(__get_user(c,(unsigned long __user *)(src+res)))) + if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) return 0; } res -= align; @@ -112,7 +112,12 @@ long strnlen_user(const char __user *str, long count) src_addr = (unsigned long)str; if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; - return do_strnlen_user(str, count, max); + long retval; + + user_access_begin(); + retval = do_strnlen_user(str, count, max); + user_access_end(); + return retval; } return 0; } @@ -141,7 +146,12 @@ long strlen_user(const char __user *str) src_addr = (unsigned long)str; if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; - return do_strnlen_user(str, ~0ul, max); + long retval; + + user_access_begin(); + retval = do_strnlen_user(str, ~0ul, max); + user_access_end(); + return retval; } return 0; } From 9a6d5a02d83700dfbb66ff01dd874e722b8ecefd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 23 Feb 2016 14:58:52 -0800 Subject: [PATCH 296/813] x86: fix SMAP in 32-bit environments In commit 11f1a4b9755f ("x86: reorganize SMAP handling in user space accesses") I changed how the stac/clac instructions were generated around the user space accesses, which then made it possible to do batched accesses efficiently for user string copies etc. However, in doing so, I completely spaced out, and didn't even think about the 32-bit case. 
And nobody really even seemed to notice, because SMAP doesn't even exist until modern Skylake processors, and you'd have to be crazy to run 32-bit kernels on a modern CPU. Which brings us to Andy Lutomirski. He actually tested the 32-bit kernel on new hardware, and noticed that it doesn't work. My bad. The trivial fix is to add the required uaccess begin/end markers around the raw accesses in . I feel a bit bad about this patch, just because that header file really should be cleaned up to avoid all the duplicated code in it, and this commit just expands on the problem. But this just fixes the bug without any bigger cleanup surgery. Reported-and-tested-by: Andy Lutomirski Signed-off-by: Linus Torvalds (cherry picked from commit de9e478b9d49f3a0214310d921450cf5bb4a21e6) Signed-off-by: Alex Shi --- arch/x86/include/asm/uaccess_32.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index f5dcb5204dcd..3fe0eac59462 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -48,20 +48,28 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) switch (n) { case 1: + __uaccess_begin(); __put_user_size(*(u8 *)from, (u8 __user *)to, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __put_user_size(*(u16 *)from, (u16 __user *)to, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __put_user_size(*(u32 *)from, (u32 __user *)to, 4, ret, 4); + __uaccess_end(); return ret; case 8: + __uaccess_begin(); __put_user_size(*(u64 *)from, (u64 __user *)to, 8, ret, 8); + __uaccess_end(); return ret; } } @@ -103,13 +111,19 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) switch (n) { case 1: + __uaccess_begin(); __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); return ret; } } @@ -148,13 +162,19 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) switch (n) { case 1: + __uaccess_begin(); __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); return ret; } } @@ -170,13 +190,19 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to, switch (n) { case 1: + __uaccess_begin(); __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); return ret; } } From 30e3024be4f381b3d048ec8a5989f03fb7eb043d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 22 May 2016 14:19:37 -0700 Subject: [PATCH 297/813] x86: remove pointless uaccess_32.h complexity I'm looking at trying to possibly merge the 32-bit and 64-bit versions of the x86 uaccess.h implementation, but first this needs to be cleaned up. For example, the 32-bit version of "__copy_to_user_inatomic()" is mostly the special cases for the constant size, and it's actually never relevant. 
Every user except for one aren't actually using a constant size anyway, and the one user that uses it is better off just using __put_user() instead. So get rid of the unnecessary complexity. [ The same cleanup should likely happen to __copy_from_user_inatomic() as well, but that one has a lot more users that I need to take a look at first ] Signed-off-by: Linus Torvalds (cherry picked from commit 5b09c3edecd37ec1a52fbd5ae97a19734edc7a77) Signed-off-by: Alex Shi --- arch/x86/include/asm/uaccess_32.h | 36 ---------------------- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 4 +-- 2 files changed, 1 insertion(+), 39 deletions(-) diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 3fe0eac59462..537cc883ea29 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -33,46 +33,10 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero * the specified block with access_ok() before calling this function. * The caller should also make sure he pins the user space address * so that we don't result in page fault and sleep. - * - * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault - * we return the initial request size (1, 2 or 4), as copy_*_user should do. - * If a store crosses a page boundary and gets a fault, the x86 will not write - * anything, so this is accurate. */ - static __always_inline unsigned long __must_check __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { - if (__builtin_constant_p(n)) { - unsigned long ret; - - switch (n) { - case 1: - __uaccess_begin(); - __put_user_size(*(u8 *)from, (u8 __user *)to, - 1, ret, 1); - __uaccess_end(); - return ret; - case 2: - __uaccess_begin(); - __put_user_size(*(u16 *)from, (u16 __user *)to, - 2, ret, 2); - __uaccess_end(); - return ret; - case 4: - __uaccess_begin(); - __put_user_size(*(u32 *)from, (u32 __user *)to, - 4, ret, 4); - __uaccess_end(); - return ret; - case 8: - __uaccess_begin(); - __put_user_size(*(u64 *)from, (u64 __user *)to, - 8, ret, 8); - __uaccess_end(); - return ret; - } - } return __copy_to_user_ll(to, from, n); } diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 6ed7d63a0688..201947b4377c 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -513,9 +513,7 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma, return ret; if (r->presumed_offset != offset && - __copy_to_user_inatomic(&user_relocs->presumed_offset, - &r->presumed_offset, - sizeof(r->presumed_offset))) { + __put_user(r->presumed_offset, &user_relocs->presumed_offset)) { return -EFAULT; } From 41a69b502d8ff510880646d246d17b8617b8f49a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 22 May 2016 17:21:27 -0700 Subject: [PATCH 298/813] x86: remove more uaccess_32.h complexity I'm looking at trying to possibly merge the 32-bit and 64-bit versions of the x86 uaccess.h implementation, but first this needs to be cleaned up. For example, the 32-bit version of "__copy_from_user_inatomic()" is mostly the special cases for the constant size, and it's actually almost never relevant. Most users aren't actually using a constant size anyway, and the few cases that do small constant copies are better off just using __get_user() instead. So get rid of the unnecessary complexity. 
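The caller-side pattern is mechanical; the futex hunk below is the clearest instance. A hedged before/after sketch, with the pagefault_disable()/enable() bracketing that the real caller keeps elided:

    /* Before: a bulk helper asked to move exactly one u32. */
    static int get_user_u32_old_sketch(u32 *dest, u32 __user *from)
    {
    	return __copy_from_user_inatomic(dest, from, sizeof(u32)) ?
    		-EFAULT : 0;
    }

    /* After: the typed accessor, which is simpler and optimizes better. */
    static int get_user_u32_new_sketch(u32 *dest, u32 __user *from)
    {
    	return __get_user(*dest, from) ? -EFAULT : 0;
    }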
Signed-off-by: Linus Torvalds (cherry picked from commit bd28b14591b98f696bc9f94c5ba2e598ca487dfd) Signed-off-by: Alex Shi --- arch/x86/include/asm/uaccess_32.h | 26 -------------------------- kernel/events/uprobes.c | 3 +-- kernel/futex.c | 2 +- mm/maccess.c | 3 +-- 4 files changed, 3 insertions(+), 31 deletions(-) diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 537cc883ea29..4b32da24faaf 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -65,32 +65,6 @@ __copy_to_user(void __user *to, const void *from, unsigned long n) static __always_inline unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { - /* Avoid zeroing the tail if the copy fails.. - * If 'n' is constant and 1, 2, or 4, we do still zero on a failure, - * but as the zeroing behaviour is only significant when n is not - * constant, that shouldn't be a problem. - */ - if (__builtin_constant_p(n)) { - unsigned long ret; - - switch (n) { - case 1: - __uaccess_begin(); - __get_user_size(*(u8 *)to, from, 1, ret, 1); - __uaccess_end(); - return ret; - case 2: - __uaccess_begin(); - __get_user_size(*(u16 *)to, from, 2, ret, 2); - __uaccess_end(); - return ret; - case 4: - __uaccess_begin(); - __get_user_size(*(u32 *)to, from, 4, ret, 4); - __uaccess_end(); - return ret; - } - } return __copy_from_user_ll_nozero(to, from, n); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7dad84913abf..4dcc16991b67 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1692,8 +1692,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) int result; pagefault_disable(); - result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, - sizeof(opcode)); + result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr); pagefault_enable(); if (likely(result == 0)) diff --git a/kernel/futex.c b/kernel/futex.c index 9d8163afd87c..e8af73cc51a7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -681,7 +681,7 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from) int ret; pagefault_disable(); - ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); + ret = __get_user(*dest, from); pagefault_enable(); return ret ? -EFAULT : 0; diff --git a/mm/maccess.c b/mm/maccess.c index d159b1c96e48..78f9274dd49d 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -96,8 +96,7 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) pagefault_disable(); do { - ret = __copy_from_user_inatomic(dst++, - (const void __user __force *)src++, 1); + ret = __get_user(*dst++, (const char __user __force *)src++); } while (dst[-1] && ret == 0 && src - unsafe_addr < count); dst[-1] = '\0'; From 4f80bcbe91072b5b83ff3fe0b7cb1e29cfa638be Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:04:01 -0700 Subject: [PATCH 299/813] x86/uaccess: Enable hardened usercopy Enables CONFIG_HARDENED_USERCOPY checks on x86. This is done both in copy_*_user() and __copy_*_user() because copy_*_user() actually calls down to _copy_*_user() and not __copy_*_user(). Based on code from PaX and grsecurity. 
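Roughly where the checks land, as a sketch of the call shape (the constant-size overflow checking that the diff below keeps is elided; _copy_*_user() are the underlying primitives the message refers to):

    static unsigned long copy_to_user_sketch(void __user *to, const void *from,
    					 unsigned long n)
    {
    	/* true: a kernel object is about to be exposed to userspace. */
    	check_object_size(from, n, true);
    	return _copy_to_user(to, from, n);
    }

    static unsigned long copy_from_user_sketch(void *to, const void __user *from,
    					   unsigned long n)
    {
    	/* false: a kernel object is about to be overwritten. */
    	check_object_size(to, n, false);
    	return _copy_from_user(to, from, n);
    }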
Signed-off-by: Kees Cook Tested-by: Valdis Kletnieks (cherry picked from commit 5b710f34e194c6b7710f69fdb5d798fdf35b98c1) Signed-off-by: Alex Shi --- arch/x86/Kconfig | 1 + arch/x86/include/asm/uaccess.h | 10 ++++++---- arch/x86/include/asm/uaccess_32.h | 2 ++ arch/x86/include/asm/uaccess_64.h | 2 ++ 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 696ec6a54bbf..924bbffc56f0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -77,6 +77,7 @@ config X86 select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_AOUT if X86_32 select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index ca59e4f9254e..dd73cf90fb18 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -731,9 +731,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n) * case, and do only runtime checking for non-constant sizes. */ - if (likely(sz < 0 || sz >= n)) + if (likely(sz < 0 || sz >= n)) { + check_object_size(to, n, false); n = _copy_from_user(to, from, n); - else if(__builtin_constant_p(n)) + } else if (__builtin_constant_p(n)) copy_from_user_overflow(); else __copy_from_user_overflow(sz, n); @@ -749,9 +750,10 @@ copy_to_user(void __user *to, const void *from, unsigned long n) might_fault(); /* See the comment in copy_from_user() above. */ - if (likely(sz < 0 || sz >= n)) + if (likely(sz < 0 || sz >= n)) { + check_object_size(from, n, true); n = _copy_to_user(to, from, n); - else if(__builtin_constant_p(n)) + } else if (__builtin_constant_p(n)) copy_to_user_overflow(); else __copy_to_user_overflow(sz, n); diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 4b32da24faaf..7d3bdd1ed697 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -37,6 +37,7 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero static __always_inline unsigned long __must_check __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { + check_object_size(from, n, true); return __copy_to_user_ll(to, from, n); } @@ -95,6 +96,7 @@ static __always_inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_fault(); + check_object_size(to, n, false); if (__builtin_constant_p(n)) { unsigned long ret; diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index b89c34c4019b..2957c8237c28 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -53,6 +53,7 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) { int ret = 0; + check_object_size(dst, size, false); if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { @@ -117,6 +118,7 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size) { int ret = 0; + check_object_size(src, size, true); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { From 49f10dde93814929a4e8a4283025cb4c6a6c8e50 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:06:53 -0700 Subject: [PATCH 300/813] ARM: uaccess: Enable hardened usercopy Enables CONFIG_HARDENED_USERCOPY checks on arm. Based on code from PaX and grsecurity. 
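The ordering is the interesting part: the object is validated before the uaccess window opens, so a bad copy is caught (and BUG()s) while user memory is still inaccessible. A condensed sketch of the __copy_from_user() shape from the diff below:

    static inline unsigned long copy_from_user_arm_sketch(void *to,
    		const void __user *from, unsigned long n)
    {
    	unsigned int ua_flags;

    	check_object_size(to, n, false);	/* validate first... */
    	ua_flags = uaccess_save_and_enable();	/* ...then open the window */
    	n = arm_copy_from_user(to, from, n);
    	uaccess_restore(ua_flags);
    	return n;
    }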
Signed-off-by: Kees Cook (cherry picked from commit dfd45b6103c973bfcea2341d89e36faf947dbc33) Signed-off-by: Alex Shi --- arch/arm/Kconfig | 1 + arch/arm/include/asm/uaccess.h | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 34e1569a11ee..51f1775e3adb 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -35,6 +35,7 @@ config ARM select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 + select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT) select HAVE_ARCH_TRACEHOOK select HAVE_BPF_JIT diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 35c9db857ebe..7fb59199c6bb 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -496,7 +496,10 @@ arm_copy_from_user(void *to, const void __user *from, unsigned long n); static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n) { - unsigned int __ua_flags = uaccess_save_and_enable(); + unsigned int __ua_flags; + + check_object_size(to, n, false); + __ua_flags = uaccess_save_and_enable(); n = arm_copy_from_user(to, from, n); uaccess_restore(__ua_flags); return n; @@ -511,11 +514,15 @@ static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) { #ifndef CONFIG_UACCESS_WITH_MEMCPY - unsigned int __ua_flags = uaccess_save_and_enable(); + unsigned int __ua_flags; + + check_object_size(from, n, true); + __ua_flags = uaccess_save_and_enable(); n = arm_copy_to_user(to, from, n); uaccess_restore(__ua_flags); return n; #else + check_object_size(from, n, true); return arm_copy_to_user(to, from, n); #endif } From 3308a2cca15539d1be33a0c2abb0635596ab8fdf Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:59:42 -0700 Subject: [PATCH 301/813] arm64/uaccess: Enable hardened usercopy Enables CONFIG_HARDENED_USERCOPY checks on arm64. As done by KASAN in -next, renames the low-level functions to __arch_copy_*_user() so a static inline can do additional work before the copy. 
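The rename-and-wrap pattern gives the C side a hook point without touching the assembly's behavior; a condensed sketch mirroring the uaccess.h hunk below:

    /* The asm routine keeps doing the copy, under its new __arch_ name. */
    extern unsigned long __arch_copy_to_user(void __user *to, const void *from,
    					 unsigned long n);

    /* The old name becomes a static inline that adds the check up front. */
    static inline unsigned long copy_to_user_wrapper_sketch(void __user *to,
    		const void *from, unsigned long n)
    {
    	check_object_size(from, n, true);
    	return __arch_copy_to_user(to, from, n);
    }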
Signed-off-by: Kees Cook (cherry picked from commit faf5b63e294151d6ac24ca6906d6f221bd3496cd) Signed-off-by: Alex Shi --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/uaccess.h | 29 ++++++++++++++++++++++------- arch/arm64/kernel/arm64ksyms.c | 4 ++-- arch/arm64/lib/copy_from_user.S | 4 ++-- arch/arm64/lib/copy_to_user.S | 4 ++-- 5 files changed, 29 insertions(+), 13 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 97583a1878db..8dbe3cba855c 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -49,6 +49,7 @@ config ARM64 select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_BITREVERSE + select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_HUGE_VMAP select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 0685d74572af..c3d445b42351 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -269,24 +269,39 @@ do { \ -EFAULT; \ }) -extern unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n); -extern unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n); +extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n); +extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n); extern unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n); extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n); +static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n) +{ + check_object_size(to, n, false); + return __arch_copy_from_user(to, from, n); +} + +static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) +{ + check_object_size(from, n, true); + return __arch_copy_to_user(to, from, n); +} + static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { - if (access_ok(VERIFY_READ, from, n)) - n = __copy_from_user(to, from, n); - else /* security hole - plug it */ + if (access_ok(VERIFY_READ, from, n)) { + check_object_size(to, n, false); + n = __arch_copy_from_user(to, from, n); + } else /* security hole - plug it */ memset(to, 0, n); return n; } static inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { - if (access_ok(VERIFY_WRITE, to, n)) - n = __copy_to_user(to, from, n); + if (access_ok(VERIFY_WRITE, to, n)) { + check_object_size(from, n, true); + n = __arch_copy_to_user(to, from, n); + } return n; } diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index 3b6d8cc9dfe0..c654df05b7d7 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -33,8 +33,8 @@ EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); /* user mem (segment) */ -EXPORT_SYMBOL(__copy_from_user); -EXPORT_SYMBOL(__copy_to_user); +EXPORT_SYMBOL(__arch_copy_from_user); +EXPORT_SYMBOL(__arch_copy_to_user); EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(__copy_in_user); diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 17e8306dca29..0b90497d4424 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -66,7 +66,7 @@ .endm end .req x5 
-ENTRY(__copy_from_user) +ENTRY(__arch_copy_from_user) ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) add end, x0, x2 @@ -75,7 +75,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x0, #0 // Nothing to copy ret -ENDPROC(__copy_from_user) +ENDPROC(__arch_copy_from_user) .section .fixup,"ax" .align 2 diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 21faae60f988..7a7efe255034 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -65,7 +65,7 @@ .endm end .req x5 -ENTRY(__copy_to_user) +ENTRY(__arch_copy_to_user) ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) add end, x0, x2 @@ -74,7 +74,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x0, #0 ret -ENDPROC(__copy_to_user) +ENDPROC(__arch_copy_to_user) .section .fixup,"ax" .align 2 From 434bef236c17900f7c4248c0c97676f7f2a0baa3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:09:50 -0700 Subject: [PATCH 302/813] ia64/uaccess: Enable hardened usercopy Enables CONFIG_HARDENED_USERCOPY checks on ia64. Based on code from PaX and grsecurity. Signed-off-by: Kees Cook (cherry picked from commit 73d35887e24da77e8d1321b2e92bd9b9128e2fc2) Signed-off-by: Alex Shi --- arch/ia64/Kconfig | 1 + arch/ia64/include/asm/uaccess.h | 18 +++++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index eb0249e37981..2c86a4ef6742 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -53,6 +53,7 @@ config IA64 select MODULES_USE_ELF_RELA select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_HARDENED_USERCOPY default y help The Itanium Processor Family is Intel's 64-bit successor to diff --git a/arch/ia64/include/asm/uaccess.h b/arch/ia64/include/asm/uaccess.h index 4f3fb6ccbf21..3d6b840c5c99 100644 --- a/arch/ia64/include/asm/uaccess.h +++ b/arch/ia64/include/asm/uaccess.h @@ -241,12 +241,18 @@ extern unsigned long __must_check __copy_user (void __user *to, const void __use static inline unsigned long __copy_to_user (void __user *to, const void *from, unsigned long count) { + if (!__builtin_constant_p(count)) + check_object_size(from, count, true); + return __copy_user(to, (__force void __user *) from, count); } static inline unsigned long __copy_from_user (void *to, const void __user *from, unsigned long count) { + if (!__builtin_constant_p(count)) + check_object_size(to, count, false); + return __copy_user((__force void __user *) to, from, count); } @@ -258,8 +264,11 @@ __copy_from_user (void *to, const void __user *from, unsigned long count) const void *__cu_from = (from); \ long __cu_len = (n); \ \ - if (__access_ok(__cu_to, __cu_len, get_fs())) \ - __cu_len = __copy_user(__cu_to, (__force void __user *) __cu_from, __cu_len); \ + if (__access_ok(__cu_to, __cu_len, get_fs())) { \ + if (!__builtin_constant_p(n)) \ + check_object_size(__cu_from, __cu_len, true); \ + __cu_len = __copy_user(__cu_to, (__force void __user *) __cu_from, __cu_len); \ + } \ __cu_len; \ }) @@ -270,8 +279,11 @@ __copy_from_user (void *to, const void __user *from, unsigned long count) long __cu_len = (n); \ \ __chk_user_ptr(__cu_from); \ - if (__access_ok(__cu_from, __cu_len, get_fs())) \ + if (__access_ok(__cu_from, __cu_len, get_fs())) { \ + if (!__builtin_constant_p(n)) \ + check_object_size(__cu_to, __cu_len, false); \ __cu_len = 
__copy_user((__force void __user *) __cu_to, __cu_from, __cu_len); \ + } \ __cu_len; \ }) From 225237bf68dc39bc952781d2b54732d70187fd28 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:10:01 -0700 Subject: [PATCH 303/813] powerpc/uaccess: Enable hardened usercopy Enables CONFIG_HARDENED_USERCOPY checks on powerpc. Based on code from PaX and grsecurity. Signed-off-by: Kees Cook Tested-by: Michael Ellerman (cherry picked from commit 1d3c1324746fed0e34a5b94d3ed303e7521ed603) Signed-off-by: Alex Shi --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/uaccess.h | 21 +++++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index db49e0d796b1..ec7b8f1e4822 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -160,6 +160,7 @@ config PPC select EDAC_ATOMIC_SCRUB select ARCH_HAS_DMA_SET_COHERENT_MASK select HAVE_ARCH_SECCOMP_FILTER + select HAVE_ARCH_HARDENED_USERCOPY config GENERIC_CSUM def_bool CPU_LITTLE_ENDIAN diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 2a8ebae0936b..b39a69370057 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -325,10 +325,15 @@ static inline unsigned long copy_from_user(void *to, { unsigned long over; - if (access_ok(VERIFY_READ, from, n)) + if (access_ok(VERIFY_READ, from, n)) { + if (!__builtin_constant_p(n)) + check_object_size(to, n, false); return __copy_tofrom_user((__force void __user *)to, from, n); + } if ((unsigned long)from < TASK_SIZE) { over = (unsigned long)from + n - TASK_SIZE; + if (!__builtin_constant_p(n - over)) + check_object_size(to, n - over, false); return __copy_tofrom_user((__force void __user *)to, from, n - over) + over; } @@ -340,10 +345,15 @@ static inline unsigned long copy_to_user(void __user *to, { unsigned long over; - if (access_ok(VERIFY_WRITE, to, n)) + if (access_ok(VERIFY_WRITE, to, n)) { + if (!__builtin_constant_p(n)) + check_object_size(from, n, true); return __copy_tofrom_user(to, (__force void __user *)from, n); + } if ((unsigned long)to < TASK_SIZE) { over = (unsigned long)to + n - TASK_SIZE; + if (!__builtin_constant_p(n)) + check_object_size(from, n - over, true); return __copy_tofrom_user(to, (__force void __user *)from, n - over) + over; } @@ -387,6 +397,10 @@ static inline unsigned long __copy_from_user_inatomic(void *to, if (ret == 0) return 0; } + + if (!__builtin_constant_p(n)) + check_object_size(to, n, false); + return __copy_tofrom_user((__force void __user *)to, from, n); } @@ -413,6 +427,9 @@ static inline unsigned long __copy_to_user_inatomic(void __user *to, if (ret == 0) return 0; } + if (!__builtin_constant_p(n)) + check_object_size(from, n, true); + return __copy_tofrom_user(to, (__force const void __user *)from, n); } From 17427c2db3e44dca3d074befd3829134e32dfea3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:10:13 -0700 Subject: [PATCH 304/813] sparc/uaccess: Enable hardened usercopy Enables CONFIG_HARDENED_USERCOPY checks on sparc. Based on code from PaX and grsecurity. 
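The explicit !__builtin_constant_p() gating seen in the ia64 and powerpc hunks above, and in the sparc hunks below, encodes the rule that compile-time-constant sizes are already covered by the existing static overflow checks; only runtime-sized copies need the heap-object walk. The generic entry point in the series applies the same rule, roughly (a condensed sketch, not verbatim):

	static inline void check_object_size(const void *ptr, unsigned long n,
					     bool to_user)
	{
		/* Constant sizes are validated at compile time; only
		 * runtime-sized copies take the slow heap-object check. */
		if (!__builtin_constant_p(n))
			__check_object_size(ptr, n, to_user);
	}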
Signed-off-by: Kees Cook (cherry picked from commit 9d9208a15800f9f06f102f9aac1e8b323c3b8575) Signed-off-by: Alex Shi --- arch/sparc/Kconfig | 1 + arch/sparc/include/asm/uaccess_32.h | 14 ++++++++++---- arch/sparc/include/asm/uaccess_64.h | 11 +++++++++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 56442d2d7bbc..3736be630113 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -43,6 +43,7 @@ config SPARC select ODD_RT_SIGACTION select OLD_SIGSUSPEND select ARCH_HAS_SG_CHAIN + select HAVE_ARCH_HARDENED_USERCOPY config SPARC32 def_bool !64BIT diff --git a/arch/sparc/include/asm/uaccess_32.h b/arch/sparc/include/asm/uaccess_32.h index 64ee103dc29d..4cfb77913cd2 100644 --- a/arch/sparc/include/asm/uaccess_32.h +++ b/arch/sparc/include/asm/uaccess_32.h @@ -313,22 +313,28 @@ unsigned long __copy_user(void __user *to, const void __user *from, unsigned lon static inline unsigned long copy_to_user(void __user *to, const void *from, unsigned long n) { - if (n && __access_ok((unsigned long) to, n)) + if (n && __access_ok((unsigned long) to, n)) { + if (!__builtin_constant_p(n)) + check_object_size(from, n, true); return __copy_user(to, (__force void __user *) from, n); - else + } else return n; } static inline unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { + if (!__builtin_constant_p(n)) + check_object_size(from, n, true); return __copy_user(to, (__force void __user *) from, n); } static inline unsigned long copy_from_user(void *to, const void __user *from, unsigned long n) { - if (n && __access_ok((unsigned long) from, n)) + if (n && __access_ok((unsigned long) from, n)) { + if (!__builtin_constant_p(n)) + check_object_size(to, n, false); return __copy_user((__force void __user *) to, from, n); - else + } else return n; } diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h index ea6e9a20f3ff..6069e9040388 100644 --- a/arch/sparc/include/asm/uaccess_64.h +++ b/arch/sparc/include/asm/uaccess_64.h @@ -250,8 +250,12 @@ unsigned long copy_from_user_fixup(void *to, const void __user *from, static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long size) { - unsigned long ret = ___copy_from_user(to, from, size); + unsigned long ret; + if (!__builtin_constant_p(size)) + check_object_size(to, size, false); + + ret = ___copy_from_user(to, from, size); if (unlikely(ret)) ret = copy_from_user_fixup(to, from, size); @@ -267,8 +271,11 @@ unsigned long copy_to_user_fixup(void __user *to, const void *from, static inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long size) { - unsigned long ret = ___copy_to_user(to, from, size); + unsigned long ret; + if (!__builtin_constant_p(size)) + check_object_size(from, size, true); + ret = ___copy_to_user(to, from, size); if (unlikely(ret)) ret = copy_to_user_fixup(to, from, size); return ret; From 41e3ca9b2fc29b61dda146c9ba79736ba11e28e8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 7 Jul 2016 11:38:39 -0700 Subject: [PATCH 305/813] s390/uaccess: Enable hardened usercopy Enables CONFIG_HARDENED_USERCOPY checks on s390. 
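All of these per-arch selects only advertise capability; the user-visible switch lives in security/Kconfig and is added elsewhere in this series. The wiring is roughly (condensed, not the verbatim entry):

	config HAVE_ARCH_HARDENED_USERCOPY
		bool			# selected by each architecture in these patches

	config HARDENED_USERCOPY
		bool "Harden memory copies between kernel and userspace"
		depends on HAVE_ARCH_HARDENED_USERCOPY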
Signed-off-by: Kees Cook (cherry picked from commit 97433ea4fda62349bfa42089455593cbcb57e06c) Signed-off-by: Alex Shi --- arch/s390/Kconfig | 1 + arch/s390/lib/uaccess.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 3a55f493c7da..60530fd93d6d 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -117,6 +117,7 @@ config S390 select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_EARLY_PFN_TO_NID + select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_SOFT_DIRTY diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index ae4de559e3a0..6986c20166f0 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -104,6 +104,7 @@ static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr, unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { + check_object_size(to, n, false); if (static_branch_likely(&have_mvcos)) return copy_from_user_mvcos(to, from, n); return copy_from_user_mvcp(to, from, n); @@ -177,6 +178,7 @@ static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x, unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { + check_object_size(from, n, true); if (static_branch_likely(&have_mvcos)) return copy_to_user_mvcos(to, from, n); return copy_to_user_mvcs(to, from, n); From 784bd0f8d7303ff0025e2ea44b68b6fc6323544d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:20:59 -0700 Subject: [PATCH 306/813] mm: SLAB hardened usercopy support Under CONFIG_HARDENED_USERCOPY, this adds object size checking to the SLAB allocator to catch any copies that may span objects. Based on code from PaX and grsecurity. Signed-off-by: Kees Cook Tested-by: Valdis Kletnieks (cherry picked from commit 04385fc5e8fffed84425d909a783c0f0c587d847) Signed-off-by: Alex Shi --- init/Kconfig | 1 + mm/slab.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index 235c7a2c0d20..fa031a140397 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1719,6 +1719,7 @@ choice config SLAB bool "SLAB" + select HAVE_HARDENED_USERCOPY_ALLOCATOR help The regular slab allocator that is established and known to work well in all environments. It organizes cache hot objects in diff --git a/mm/slab.c b/mm/slab.c index 4765c97ce690..24a615d42d74 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4228,6 +4228,36 @@ static int __init slab_proc_init(void) module_init(slab_proc_init); #endif +#ifdef CONFIG_HARDENED_USERCOPY +/* + * Rejects objects that are incorrectly sized. + * + * Returns NULL if check passes, otherwise const char * to name of cache + * to indicate an error. + */ +const char *__check_heap_object(const void *ptr, unsigned long n, + struct page *page) +{ + struct kmem_cache *cachep; + unsigned int objnr; + unsigned long offset; + + /* Find and validate object. */ + cachep = page->slab_cache; + objnr = obj_to_index(cachep, page, (void *)ptr); + BUG_ON(objnr >= cachep->num); + + /* Find offset within object. */ + offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep); + + /* Allow address range falling entirely within object size. 
*/ + if (offset <= cachep->object_size && n <= cachep->object_size - offset) + return NULL; + + return cachep->name; +} +#endif /* CONFIG_HARDENED_USERCOPY */ + /** * ksize - get the actual amount of memory allocated for a given object * @objp: Pointer to the object From 3ad78bad4fd43467f1fc6dff63076789b30c116b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:24:05 -0700 Subject: [PATCH 307/813] mm: SLUB hardened usercopy support Under CONFIG_HARDENED_USERCOPY, this adds object size checking to the SLUB allocator to catch any copies that may span objects. Includes a redzone handling fix discovered by Michael Ellerman. Based on code from PaX and grsecurity. Signed-off-by: Kees Cook Tested-by: Michael Ellerman Reviewed-by: Laura Abbott (cherry picked from commit ed18adc1cdd00a5c55a20fbdaed4804660772281) Signed-off-by: Alex Shi --- init/Kconfig | 1 + mm/slub.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index fa031a140397..e1d1d6936f92 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1727,6 +1727,7 @@ config SLAB config SLUB bool "SLUB (Unqueued Allocator)" + select HAVE_HARDENED_USERCOPY_ALLOCATOR help SLUB is a slab allocator that minimizes cache line usage instead of managing queues of cached objects (SLAB approach). diff --git a/mm/slub.c b/mm/slub.c index 65d5f92d51d2..fbadb3753d4d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3585,6 +3585,46 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) EXPORT_SYMBOL(__kmalloc_node); #endif +#ifdef CONFIG_HARDENED_USERCOPY +/* + * Rejects objects that are incorrectly sized. + * + * Returns NULL if check passes, otherwise const char * to name of cache + * to indicate an error. + */ +const char *__check_heap_object(const void *ptr, unsigned long n, + struct page *page) +{ + struct kmem_cache *s; + unsigned long offset; + size_t object_size; + + /* Find object and usable object size. */ + s = page->slab_cache; + object_size = slab_ksize(s); + + /* Reject impossible pointers. */ + if (ptr < page_address(page)) + return s->name; + + /* Find offset within object. */ + offset = (ptr - page_address(page)) % s->size; + + /* Adjust for redzone and reject if within the redzone. */ + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) { + if (offset < s->red_left_pad) + return s->name; + offset -= s->red_left_pad; + } + + /* Allow address range falling entirely within object size. */ + if (offset <= object_size && n <= object_size - offset) + return NULL; + + return s->name; +} +#endif /* CONFIG_HARDENED_USERCOPY */ + static size_t __ksize(const void *object) { struct page *page; From 8ef7c21dd8130e6ce469bbe2747fbc0a5d3e0488 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 24 Jun 2016 14:49:51 -0700 Subject: [PATCH 308/813] hugetlb: fix nr_pmds accounting with shared page tables commit c17b1f42594eb71b8d3eb5a6dfc907a7eb88a51d upstream. We account HugeTLB's shared page table to all processes who share it. The accounting happens during huge_pmd_share(). If somebody populates the pud entry under us, we should decrease the pagetable's refcount and decrease nr_pmds of the process. By mistake, I increase nr_pmds again in this case. :-/ It will lead to "BUG: non-zero nr_pmds on freeing mm: 2" on process' exit. Let's fix this by increasing nr_pmds only when we're sure that the page table will be used. Link: http://lkml.kernel.org/r/20160617122506.GC6534@node.shutemov.name Fixes: dc6c9a35b66b ("mm: account pmd page tables to the process") Signed-off-by: Kirill A.
Shutemov Reported-by: zhongjiang Reviewed-by: Mike Kravetz Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/hugetlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0c31f184daf8..125c7dd55322 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4213,7 +4213,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (saddr) { spte = huge_pte_offset(svma->vm_mm, saddr); if (spte) { - mm_inc_nr_pmds(mm); get_page(virt_to_page(spte)); break; } @@ -4228,9 +4227,9 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); + mm_inc_nr_pmds(mm); } else { put_page(virt_to_page(spte)); - mm_inc_nr_pmds(mm); } spin_unlock(ptl); out: From ebabe4ad97125e061396869fb038fa434c57a22a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 5 Aug 2016 15:37:39 +0200 Subject: [PATCH 309/813] x86/mm: Disable preemption during CR3 read+write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 5cf0791da5c162ebc14b01eb01631cfa7ed4fa6e upstream. There's a subtle preemption race on UP kernels: Usually current->mm (and therefore mm->pgd) stays the same during the lifetime of a task so it does not matter if a task gets preempted during the read and write of the CR3. But then, there is this scenario on x86-UP: TaskA is in do_exit() and exit_mm() sets current->mm = NULL followed by: -> mmput() -> exit_mmap() -> tlb_finish_mmu() -> tlb_flush_mmu() -> tlb_flush_mmu_tlbonly() -> tlb_flush() -> flush_tlb_mm_range() -> __flush_tlb_up() -> __flush_tlb() -> __native_flush_tlb() At this point current->mm is NULL but current->active_mm still points to the "old" mm. Now let taskB preempt taskA _after_ native_read_cr3(). TaskB has its own mm so CR3 has changed. Now preempt back to taskA. TaskA has no ->mm set so it borrows taskB's mm and so CR3 remains unchanged. Once taskA gets active it continues where it was interrupted and that means it writes its old CR3 value back. Everything is fine because userland won't need its memory anymore. Now the fun part: Let's preempt taskA one more time and get back to taskB. This time switch_mm() won't do a thing because oldmm (->active_mm) is the same as mm (as per context_switch()). So we remain with a bad CR3 / PGD and return to userland. The next thing that happens is handle_mm_fault() with an address for the execution of its code in userland. handle_mm_fault() realizes that it has a PTE with proper rights so it returns doing nothing. But the CPU looks at the wrong PGD and insists that something is wrong and faults again. And again. And one more time… This page fault loop continues until the scheduler gets tired of it and puts another task on the CPU. It gets a little difficult if the task is an RT task with a high priority. The system will either freeze or it gets fixed by the software watchdog thread which usually runs at RT-max prio. But waiting for the watchdog will increase the latency of the RT task which is no good. Fix this by disabling preemption across the critical code section. Signed-off-by: Sebastian Andrzej Siewior Acked-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Acked-by: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1470404259-26290-1-git-send-email-bigeasy@linutronix.de [ Prettified the changelog. ] Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6df2029405a3..3142218e546f 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -86,7 +86,14 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) static inline void __native_flush_tlb(void) { + /* + * If current->mm == NULL then we borrow a mm which may change during a + * task switch and therefore we must not be preempted while we write CR3 + * back: + */ + preempt_disable(); native_write_cr3(native_read_cr3()); + preempt_enable(); } static inline void __native_flush_tlb_global_irq_disabled(void) From 77b0e10991abb866b832a7caa470c95e7117befa Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 11 Aug 2016 17:45:21 +0200 Subject: [PATCH 310/813] uprobes/x86: Fix RIP-relative handling of EVEX-encoded instructions commit 68187872c76a96ed4db7bfb064272591f02e208b upstream. Since instruction decoder now supports EVEX-encoded instructions, two fixes are needed to correctly handle them in uprobes. Extended bits for MODRM.rm field need to be sanitized just like we do it for VEX3, to avoid encoding wrong register for register-relative access. EVEX has _two_ extended bits: b and x. Theoretically, EVEX.x should be ignored by the CPU (since GPRs go only up to 15, not 31), but let's be paranoid here: proper encoding for register-relative access should have EVEX.x = 1. Secondly, we should fetch vex.vvvv for EVEX too. This is now super easy because instruction decoder populates vex_prefix.bytes[2] for all flavors of (e)vex encodings, even for VEX2. Signed-off-by: Denys Vlasenko Acked-by: Masami Hiramatsu Acked-by: Srikar Dronamraju Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: H. Peter Anvin Cc: Jim Keniston Cc: Jiri Olsa Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Fixes: 8a764a875fe3 ("x86/asm/decoder: Create artificial 3rd byte for 2-byte VEX") Link: http://lkml.kernel.org/r/20160811154521.20469-1-dvlasenk@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/uprobes.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index bf4db6eaec8f..c6aace2bbe08 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -357,20 +357,22 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) *cursor &= 0xfe; } /* - * Similar treatment for VEX3 prefix. - * TODO: add XOP/EVEX treatment when insn decoder supports them + * Similar treatment for VEX3/EVEX prefix. 
+ * TODO: add XOP treatment when insn decoder supports them */ - if (insn->vex_prefix.nbytes == 3) { + if (insn->vex_prefix.nbytes >= 3) { /* * vex2: c5 rvvvvLpp (has no b bit) * vex3/xop: c4/8f rxbmmmmm wvvvvLpp * evex: 62 rxbR00mm wvvvv1pp zllBVaaa - * (evex will need setting of both b and x since - * in non-sib encoding evex.x is 4th bit of MODRM.rm) - * Setting VEX3.b (setting because it has inverted meaning): + * Setting VEX3.b (setting because it has inverted meaning). + * Setting EVEX.x since (in non-SIB encoding) EVEX.x + * is the 4th bit of MODRM.rm, and needs the same treatment. + * For VEX3-encoded insns, VEX3.x value has no effect in + * non-SIB encoding, the change is superfluous but harmless. */ cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1; - *cursor |= 0x20; + *cursor |= 0x60; } /* @@ -415,12 +417,10 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) reg = MODRM_REG(insn); /* Fetch modrm.reg */ reg2 = 0xff; /* Fetch vex.vvvv */ - if (insn->vex_prefix.nbytes == 2) - reg2 = insn->vex_prefix.bytes[1]; - else if (insn->vex_prefix.nbytes == 3) + if (insn->vex_prefix.nbytes) reg2 = insn->vex_prefix.bytes[2]; /* - * TODO: add XOP, EXEV vvvv reading. + * TODO: add XOP vvvv reading. * * vex.vvvv field is in bits 6-3, bits are inverted. * But in 32-bit mode, high-order bit may be ignored. From b529544b0165e8b4bf2b6a9b32a7efa5eb4bdc99 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 10 Aug 2016 15:59:09 -0700 Subject: [PATCH 311/813] tools/testing/nvdimm: fix SIGTERM vs hotplug crash commit d8d378fa1a0c98ecb50ca52c9bf3bc14e25aa2d2 upstream. The unit tests crash when hotplug races the previous probe. This race requires that the loading of the nfit_test module be terminated with SIGTERM, and the module to be unloaded while the ars scan is still running. In contrast to the normal nfit driver, the unit test calls acpi_nfit_init() twice to simulate hotplug, whereas the nominal case goes through the acpi_nfit_notify() event handler. The acpi_nfit_notify() path is careful to flush the previous region registration before servicing the hotplug event. The unit test was missing this guarantee. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] pwq_activate_delayed_work+0x47/0x170 [..] Call Trace: [] pwq_dec_nr_in_flight+0x66/0xa0 [] process_one_work+0x2d0/0x680 [] ? process_one_work+0x171/0x680 [] worker_thread+0x4e/0x480 [] ? process_one_work+0x680/0x680 [] ? process_one_work+0x680/0x680 [] kthread+0xf3/0x110 [] ret_from_fork+0x1f/0x40 [] ? kthread_create_on_node+0x230/0x230 Signed-off-by: Dan Williams Signed-off-by: Greg Kroah-Hartman --- tools/testing/nvdimm/test/nfit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index 51cf8256c6cd..f0d1c8ff8e8a 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -13,6 +13,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include +#include #include #include #include @@ -1246,6 +1247,7 @@ static int nfit_test_probe(struct platform_device *pdev) if (nfit_test->setup != nfit_test0_setup) return 0; + flush_work(&acpi_desc->work); nfit_test->setup_hotplug = 1; nfit_test->setup(nfit_test); From e3f2840dbab5996a1ab45cf071a68e72b39f3a9a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 1 Aug 2016 13:36:08 -0400 Subject: [PATCH 312/813] SUNRPC: Handle EADDRNOTAVAIL on connection failures commit 1f4c17a03ba7f430d63dba8c8e08ff1e2712581d upstream. 
If the connect attempt immediately fails with an EADDRNOTAVAIL error, then that means our choice of source port number was bad. This error is expected when we set the SO_REUSEPORT socket option and we have 2 sockets sharing the same source and destination address and port combinations. Signed-off-by: Trond Myklebust Fixes: 402e23b4ed9ed ("SUNRPC: Fix stupid typo in xs_sock_set_reuseport") Signed-off-by: Greg Kroah-Hartman --- net/sunrpc/xprtsock.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 027c9ef8a263..1ba417207465 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2286,6 +2286,10 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) /* SYN_SENT! */ if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; + break; + case -EADDRNOTAVAIL: + /* Source port number is unavailable. Try a new one! */ + transport->srcport = 0; } out: return ret; From 94e88c12e0fed8f5f8c86769260f284f2a4ac798 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 3 Aug 2016 20:19:48 -0400 Subject: [PATCH 313/813] SUNRPC: allow for upcalls for same uid but different gss service commit 9130b8dbc6ac20f2dc5846e1647f5b60eafab6e3 upstream. It's possible to have simultaneous upcalls for the same UID but different GSS services. In that case, we need to allow the upcall to gssd to proceed so that the same context is not used by two different GSS services. Some servers lock the use of a context to the GSS service. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- net/sunrpc/auth_gss/auth_gss.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 799e65b944b9..06095cc8815e 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -340,12 +340,14 @@ gss_release_msg(struct gss_upcall_msg *gss_msg) } static struct gss_upcall_msg * -__gss_find_upcall(struct rpc_pipe *pipe, kuid_t uid) +__gss_find_upcall(struct rpc_pipe *pipe, kuid_t uid, const struct gss_auth *auth) { struct gss_upcall_msg *pos; list_for_each_entry(pos, &pipe->in_downcall, list) { if (!uid_eq(pos->uid, uid)) continue; + if (auth && pos->auth->service != auth->service) + continue; atomic_inc(&pos->count); dprintk("RPC: %s found msg %p\n", __func__, pos); return pos; @@ -365,7 +367,7 @@ gss_add_msg(struct gss_upcall_msg *gss_msg) struct gss_upcall_msg *old; spin_lock(&pipe->lock); - old = __gss_find_upcall(pipe, gss_msg->uid); + old = __gss_find_upcall(pipe, gss_msg->uid, gss_msg->auth); if (old == NULL) { atomic_inc(&gss_msg->count); list_add(&gss_msg->list, &pipe->in_downcall); @@ -714,7 +716,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) err = -ENOENT; /* Find a matching upcall */ spin_lock(&pipe->lock); - gss_msg = __gss_find_upcall(pipe, uid); + gss_msg = __gss_find_upcall(pipe, uid, NULL); if (gss_msg == NULL) { spin_unlock(&pipe->lock); goto err_put_ctx; From 93ed332bd4105e8af3e12e12fa510728147badfc Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Fri, 23 Oct 2015 17:19:46 +1100 Subject: [PATCH 314/813] powerpc/eeh: eeh_pci_enable(): fix checking of post-request state commit 949e9b827eb4736d96df520c67d07a54c64e99b8 upstream. In eeh_pci_enable(), after making the request to set the new options, we call eeh_ops->wait_state() to check that the request finished successfully.
At the moment, if eeh_ops->wait_state() returns 0, we return 0 without checking that it reflects the expected outcome. This can lead to callers further up the chain incorrectly assuming the slot has been successfully unfrozen and continuing to attempt recovery. On powernv, this will occur if pnv_eeh_get_pe_state() or pnv_eeh_get_phb_state() return 0, which in turn occurs if the relevant OPAL call returns OPAL_EEH_STOPPED_MMIO_DMA_FREEZE or OPAL_EEH_PHB_ERROR respectively. On pseries, this will occur if pseries_eeh_get_state() returns 0, which in turn occurs if RTAS reports that the PE is in the MMIO Stopped and DMA Stopped states. Obviously, none of these cases represent a successful completion of a request to thaw MMIO or DMA. Fix the check so that a wait_state() return value of 0 won't be considered successful for the EEH_OPT_THAW_MMIO or EEH_OPT_THAW_DMA cases. Signed-off-by: Andrew Donnellan Acked-by: Gavin Shan Reviewed-by: Daniel Axtens Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/eeh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index b34e8a54f7db..98949b0df00a 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -677,7 +677,7 @@ int eeh_pci_enable(struct eeh_pe *pe, int function) /* Check if the request is finished successfully */ if (active_flag) { rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); - if (rc <= 0) + if (rc < 0) return rc; if (rc & active_flag) From 23d399881f32bb3dedb19d53ea7a1c3d816c6dc8 Mon Sep 17 00:00:00 2001 From: Piotr Karasinski Date: Sat, 6 Aug 2016 21:23:05 +0200 Subject: [PATCH 315/813] ALSA: usb-audio: Add a sample rate quirk for Creative Live! Cam Socialize HD (VF0610) commit 7627e40c66b5547e12b6c5673646ceea84797a74 upstream. VF0610 does not support reading the sample rate which leads to many lines of "cannot get freq at ep 0x82". This patch adds the USB ID (0x041E:4080) to snd_usb_get_sample_rate_quirk() list. Signed-off-by: Piotr Karasinski Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index db11ecf0b74d..f08087912cf5 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1129,6 +1129,7 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip) { /* devices which do not support reading the sample rate. */ switch (chip->usb_id) { + case USB_ID(0x041E, 0x4080): /* Creative Live Cam VF0610 */ case USB_ID(0x045E, 0x075D): /* MS Lifecam Cinema */ case USB_ID(0x045E, 0x076D): /* MS Lifecam HD-5000 */ case USB_ID(0x045E, 0x076E): /* MS Lifecam HD-5001 */ From 602857f223868365265e2d0f818b7bbbf31d48da Mon Sep 17 00:00:00 2001 From: "Vittorio Gambaletta (VittGam)" Date: Mon, 8 Aug 2016 12:35:40 +0200 Subject: [PATCH 316/813] ALSA: usb-audio: Add quirk for ELP HD USB Camera commit 41f5e3bdbf706a9e98194bf0c4b62a875c02f170 upstream. The ELP HD USB Camera (05a3:9420) needs this quirk for suppressing the unsupported sample rate inquiry. 
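For reference, the USB_ID() keys used in these quirk tables pack the vendor and product IDs into one 32-bit value, roughly (from sound/usb/usbaudio.h):

	#define USB_ID(vendor, product) (((vendor) << 16) | (product))

so the entry added below matches idVendor 0x05a3, idProduct 0x9420.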
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=98481 Signed-off-by: Vittorio Gambaletta Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index f08087912cf5..a3e1252ce242 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1140,6 +1140,7 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip) case USB_ID(0x047F, 0xAA05): /* Plantronics DA45 */ case USB_ID(0x04D8, 0xFEEA): /* Benchmark DAC1 Pre */ case USB_ID(0x0556, 0x0014): /* Phoenix Audio TMX320VC */ + case USB_ID(0x05A3, 0x9420): /* ELP HD USB Camera */ case USB_ID(0x074D, 0x3553): /* Outlaw RR2150 (Micronas UAC3553B) */ case USB_ID(0x1de7, 0x0013): /* Phoenix Audio MT202exe */ case USB_ID(0x1de7, 0x0014): /* Phoenix Audio TMX320 */ From 4919b2a3b4a906869e0e2563028b5c4d47172a2e Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 25 Jul 2016 16:59:52 +0100 Subject: [PATCH 317/813] arm64: Define AT_VECTOR_SIZE_ARCH for ARCH_DLINFO commit 3146bc64d12377a74dbda12b96ea32da3774ae07 upstream. AT_VECTOR_SIZE_ARCH should be defined with the maximum number of NEW_AUX_ENT entries that ARCH_DLINFO can contain, but it wasn't defined for arm64 at all even though ARCH_DLINFO will contain one NEW_AUX_ENT for the VDSO address. This shouldn't be a problem as AT_VECTOR_SIZE_BASE includes space for AT_BASE_PLATFORM which arm64 doesn't use, but lets define it now and add the comment above ARCH_DLINFO as found in several other architectures to remind future modifiers of ARCH_DLINFO to keep AT_VECTOR_SIZE_ARCH up to date. Fixes: f668cd1673aa ("arm64: ELF definitions") Signed-off-by: James Hogan Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/elf.h | 1 + arch/arm64/include/uapi/asm/auxvec.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index bc6492b9a924..44dd892a4bbe 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -136,6 +136,7 @@ typedef struct user_fpsimd_state elf_fpregset_t; #define SET_PERSONALITY(ex) clear_thread_flag(TIF_32BIT); +/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ #define ARCH_DLINFO \ do { \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ diff --git a/arch/arm64/include/uapi/asm/auxvec.h b/arch/arm64/include/uapi/asm/auxvec.h index 22d6d8885854..4cf0c17787a8 100644 --- a/arch/arm64/include/uapi/asm/auxvec.h +++ b/arch/arm64/include/uapi/asm/auxvec.h @@ -19,4 +19,6 @@ /* vDSO location */ #define AT_SYSINFO_EHDR 33 +#define AT_VECTOR_SIZE_ARCH 1 /* entries in ARCH_DLINFO */ + #endif From 9dddd02de0366eee747488e8b6829e265b3bf385 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 20 Aug 2016 11:51:38 +0200 Subject: [PATCH 318/813] parisc: Fix order of EREFUSED define in errno.h commit 3eb53b20d7bd1374598cfb1feaa081fcac0e76cd upstream. When building gccgo in userspace, errno.h gets parsed and the go include file sysinfo.go is generated. Since EREFUSED is defined to the same value as ECONNREFUSED, and ECONNREFUSED is defined later on in errno.h, this leads to go complaining that EREFUSED isn't defined yet. Fix this trivial problem by moving the define of EREFUSED down after ECONNREFUSED in errno.h (and clean up the indenting while touching this line). 
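The breakage is not in the C preprocessor, which resolves macros at use time regardless of definition order; it bites one-pass generators such as gccgo's, which evaluate each #define as it is read. A minimal illustration:

	#define EREFUSED ECONNREFUSED	/* fine for cpp: expands at use time... */
	#define ECONNREFUSED 239

	int e = EREFUSED;		/* ...to 239, so C code compiles */

	/* A one-pass tool that evaluates the first #define immediately,
	 * however, sees ECONNREFUSED still undefined and errors out;
	 * hence the patch moves EREFUSED below ECONNREFUSED. */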
Signed-off-by: Helge Deller Signed-off-by: Greg Kroah-Hartman --- arch/parisc/include/uapi/asm/errno.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/parisc/include/uapi/asm/errno.h b/arch/parisc/include/uapi/asm/errno.h index c0ae62520d15..274d5bc6ecce 100644 --- a/arch/parisc/include/uapi/asm/errno.h +++ b/arch/parisc/include/uapi/asm/errno.h @@ -97,10 +97,10 @@ #define ENOTCONN 235 /* Transport endpoint is not connected */ #define ESHUTDOWN 236 /* Cannot send after transport endpoint shutdown */ #define ETOOMANYREFS 237 /* Too many references: cannot splice */ -#define EREFUSED ECONNREFUSED /* for HP's NFS apparently */ #define ETIMEDOUT 238 /* Connection timed out */ #define ECONNREFUSED 239 /* Connection refused */ -#define EREMOTERELEASE 240 /* Remote peer released connection */ +#define EREFUSED ECONNREFUSED /* for HP's NFS apparently */ +#define EREMOTERELEASE 240 /* Remote peer released connection */ #define EHOSTDOWN 241 /* Host is down */ #define EHOSTUNREACH 242 /* No route to host */ From fa54eea35b3882b18da94d932b78c519c3972bd8 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 2 Aug 2016 14:16:31 +0000 Subject: [PATCH 319/813] virtio: fix memory leak in virtqueue_add() commit 58625edf9e2515ed41dac2a24fa8004030a87b87 upstream. When using the indirect buffers feature, 'desc' is allocated in virtqueue_add() but isn't freed before leaving on a ring full error, causing a memory leak. For example, it seems rather clear that this can trigger with virtio net if mergeable buffers are not used. Signed-off-by: Wei Yongjun Signed-off-by: Michael S. Tsirkin Signed-off-by: Greg Kroah-Hartman --- drivers/virtio/virtio_ring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index ee663c458b20..dc2b94142f53 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -202,6 +202,8 @@ static inline int virtqueue_add(struct virtqueue *_vq, * host should service the ring ASAP. */ if (out_sgs) vq->notify(&vq->vq); + if (indirect) + kfree(desc); END_USE(vq); return -ENOSPC; } From b4ddd78fb95ea9da6ca317806681481fae2c6dfc Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 8 Aug 2016 16:16:23 -0600 Subject: [PATCH 320/813] vfio/pci: Fix NULL pointer oops in error interrupt setup handling commit c8952a707556e04374d7b2fdb3a079d63ddf6f2f upstream. There are multiple cases in vfio_pci_set_ctx_trigger_single() where we assume we can safely read from our data pointer without actually checking whether the user has passed any data via the count field. VFIO_IRQ_SET_DATA_NONE in particular is entirely broken since we attempt to pull an int32_t file descriptor out before even checking the data type. The other data types assume the data pointer contains one element of their type as well. In part this is good news because we were previously restricted from doing much sanitization of parameters because it was missed in the past and we didn't want to break existing users. Clearly DATA_NONE is completely broken, so it must not have any users and we can fix it up completely. For DATA_BOOL and DATA_EVENTFD, we'll just protect ourselves, returning error when count is zero since we previously would have oopsed. 
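With the fixed semantics, a hypothetical userspace caller can now tear down a previously registered error-interrupt eventfd like this (a sketch; error handling omitted):

	struct vfio_irq_set irq_set = {
		.argsz = sizeof(irq_set),
		.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index = VFIO_PCI_ERR_IRQ_INDEX,
		.start = 0,
		.count = 0,	/* DATA_NONE + count == 0 now releases the eventfd */
	};

	ioctl(device_fd, VFIO_DEVICE_SET_IRQS, &irq_set);

With count == 1 and DATA_NONE, the same call keeps its loopback-testing meaning and signals the eventfd instead.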
Signed-off-by: Alex Williamson Reported-by: Chris Thompson Reviewed-by: Eric Auger Signed-off-by: Greg Kroah-Hartman --- drivers/vfio/pci/vfio_pci_intrs.c | 85 ++++++++++++++++++------------- 1 file changed, 49 insertions(+), 36 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 3b3ba15558b7..20e9a86d2dcf 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -563,67 +563,80 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev, } static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, - uint32_t flags, void *data) + unsigned int count, uint32_t flags, + void *data) { - int32_t fd = *(int32_t *)data; - - if (!(flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) - return -EINVAL; - /* DATA_NONE/DATA_BOOL enables loopback testing */ if (flags & VFIO_IRQ_SET_DATA_NONE) { - if (*ctx) - eventfd_signal(*ctx, 1); - return 0; + if (*ctx) { + if (count) { + eventfd_signal(*ctx, 1); + } else { + eventfd_ctx_put(*ctx); + *ctx = NULL; + } + return 0; + } } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { - uint8_t trigger = *(uint8_t *)data; + uint8_t trigger; + + if (!count) + return -EINVAL; + + trigger = *(uint8_t *)data; if (trigger && *ctx) eventfd_signal(*ctx, 1); + + return 0; + } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + int32_t fd; + + if (!count) + return -EINVAL; + + fd = *(int32_t *)data; + if (fd == -1) { + if (*ctx) + eventfd_ctx_put(*ctx); + *ctx = NULL; + } else if (fd >= 0) { + struct eventfd_ctx *efdctx; + + efdctx = eventfd_ctx_fdget(fd); + if (IS_ERR(efdctx)) + return PTR_ERR(efdctx); + + if (*ctx) + eventfd_ctx_put(*ctx); + + *ctx = efdctx; + } return 0; } - /* Handle SET_DATA_EVENTFD */ - if (fd == -1) { - if (*ctx) - eventfd_ctx_put(*ctx); - *ctx = NULL; - return 0; - } else if (fd >= 0) { - struct eventfd_ctx *efdctx; - efdctx = eventfd_ctx_fdget(fd); - if (IS_ERR(efdctx)) - return PTR_ERR(efdctx); - if (*ctx) - eventfd_ctx_put(*ctx); - *ctx = efdctx; - return 0; - } else - return -EINVAL; + return -EINVAL; } static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) { - if (index != VFIO_PCI_ERR_IRQ_INDEX) + if (index != VFIO_PCI_ERR_IRQ_INDEX || start != 0 || count > 1) return -EINVAL; - /* - * We should sanitize start & count, but that wasn't caught - * originally, so this IRQ index must forever ignore them :-( - */ - - return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, flags, data); + return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, + count, flags, data); } static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) { - if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count != 1) + if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count > 1) return -EINVAL; - return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger, flags, data); + return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger, + count, flags, data); } int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, From 8f5b7e3f415c208962b3fe4308486c4484d37b9f Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 15 Aug 2016 10:23:04 +0300 Subject: [PATCH 321/813] perf intel-pt: Fix occasional decoding errors when tracing system-wide commit 3d918fb13abdbeca7947578f5d7e426eafad7f5e upstream. In order to successfully decode Intel PT traces, context switch events are needed from the moment the trace starts. 
Currently that is ensured by using the 'immediate' flag which enables the switch event when it is opened. However, since commit 86c2786994bd ("perf intel-pt: Add support for PERF_RECORD_SWITCH") that might not always happen. When tracing system-wide the context switch event is added to the tracking event which was not set as 'immediate'. Change that so it is. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Fixes: 86c2786994bd ("perf intel-pt: Add support for PERF_RECORD_SWITCH") Link: http://lkml.kernel.org/r/1471245784-22580-1-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman --- tools/perf/arch/x86/util/intel-pt.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index b02af064f0f9..c53f78767568 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -499,7 +499,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, struct intel_pt_recording *ptr = container_of(itr, struct intel_pt_recording, itr); struct perf_pmu *intel_pt_pmu = ptr->intel_pt_pmu; - bool have_timing_info; + bool have_timing_info, need_immediate = false; struct perf_evsel *evsel, *intel_pt_evsel = NULL; const struct cpu_map *cpus = evlist->cpus; bool privileged = geteuid() == 0 || perf_event_paranoid() < 0; @@ -653,6 +653,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, ptr->have_sched_switch = 3; } else { opts->record_switch_events = true; + need_immediate = true; if (cpu_wide) ptr->have_sched_switch = 3; else @@ -698,6 +699,9 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, tracking_evsel->attr.freq = 0; tracking_evsel->attr.sample_period = 1; + if (need_immediate) + tracking_evsel->immediate = true; + /* In per-cpu case, always need the time of mmap events etc */ if (!cpu_map__empty(cpus)) { perf_evsel__set_sample_bit(tracking_evsel, TIME); From 47f972e0973cfddcafcb85b8284c76c85c7733b4 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 29 Jul 2016 14:59:12 -0600 Subject: [PATCH 322/813] libnvdimm, nd_blk: mask off reserved status bits commit 68202c9f0ad6e16ee806fbadbc5838d55fe5aa5c upstream. The "NVDIMM Block Window Driver Writer's Guide": http://pmem.io/documents/NVDIMM_DriverWritersGuide-July-2016.pdf ...defines the layout of the block window status register. For the July 2016 version of the spec linked to above, this happens in Figure 4 on page 26. The only bits defined in this spec are bits 31, 5, 4, 2, 1 and 0. The rest of the bits in the status register are reserved, and there is a warning following the diagram that says: Note: The driver cannot assume the value of the RESERVED bits in the status register are zero. These reserved bits need to be masked off, and the driver must avoid checking the state of those bits. This change ensures that for hardware implementations that set these reserved bits in the status register, the driver won't incorrectly fail the block I/Os. 
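The 0x80000037 constant in the hunk below is just those defined bits ORed together (the macro name here is illustrative; the patch open-codes the constant):

	/* bits 31, 5, 4, 2, 1 and 0 are defined; the rest are reserved */
	#define BW_STATUS_MASK	((1u << 31) | (1u << 5) | (1u << 4) | \
				 (1u << 2) | (1u << 1) | (1u << 0))
	/* == 0x80000000 | 0x20 | 0x10 | 0x4 | 0x2 | 0x1 == 0x80000037 */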
Reviewed-by: Lee, Chun-Yi Signed-off-by: Ross Zwisler Signed-off-by: Dan Williams Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/nfit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index 11d8209e6e5d..5230e8449d30 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -1072,11 +1072,12 @@ static u32 read_blk_stat(struct nfit_blk *nfit_blk, unsigned int bw) { struct nfit_blk_mmio *mmio = &nfit_blk->mmio[DCR]; u64 offset = nfit_blk->stat_offset + mmio->size * bw; + const u32 STATUS_MASK = 0x80000037; if (mmio->num_lines) offset = to_interleave_offset(offset, mmio); - return readl(mmio->addr.base + offset); + return readl(mmio->addr.base + offset) & STATUS_MASK; } static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw, From bb404d159f78a89bcdb6b77c3daf23dd11a258a4 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 4 Aug 2016 22:38:36 +0200 Subject: [PATCH 323/813] ALSA: hda - Manage power well properly for resume commit a52ff34e5ec61749c62c6618b76a9d6dbecee450 upstream. For SKL and later Intel chips, we control the power well on a per-codec basis via the link_power callback since the commit [03b135cebc47: ALSA: hda - remove dependency on i915 power well for SKL]. However, there are a few exceptional cases where the gfx registers are accessed from the audio driver: namely the wakeup override bit toggling at (both system and runtime) resume. This seems to cause a kernel warning when the registers are accessed while the power well is down (and likely results in bogus register accesses). This patch puts the proper power up / down sequence around the resume code so that the wakeup bit is fiddled properly while the power is up. (The other callback, sync_audio_rate, is used only in the PCM callback, so it's guaranteed to run with the power on.) Also, by this proper power up/down, the instantaneous flip of the wakeup bit in the resume callback that was introduced by the commit [033ea349a7cd: ALSA: hda - Fix Skylake codec timeout] becomes superfluous, as snd_hdac_display_power() already does it. So we can clean it up together.
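The resulting shape of both resume paths, condensed from the hunks below:

	if (chip->driver_caps & AZX_DCAPS_I915_POWERWELL) {
		snd_hdac_display_power(bus, true);	/* well up before touching registers */
		if (hda->need_i915_power)
			haswell_set_bclk(hda);
	}

	/* ... wakeup-bit handling, controller re-init ... */

	/* link-controlled chips hand the well back once resume is done */
	if ((chip->driver_caps & AZX_DCAPS_I915_POWERWELL) &&
	    !hda->need_i915_power)
		snd_hdac_display_power(bus, false);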
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=96214 Fixes: 03b135cebc47 ('ALSA: hda - remove dependency on i915 power well for SKL') Cc: # v4.2+ Tested-by: Hans de Goede Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/hda_intel.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index e769e5764cba..12f7f6fdae4d 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -944,20 +944,23 @@ static int azx_resume(struct device *dev) struct snd_card *card = dev_get_drvdata(dev); struct azx *chip; struct hda_intel *hda; + struct hdac_bus *bus; if (!card) return 0; chip = card->private_data; hda = container_of(chip, struct hda_intel, chip); + bus = azx_bus(chip); if (chip->disabled || hda->init_failed || !chip->running) return 0; - if (chip->driver_caps & AZX_DCAPS_I915_POWERWELL - && hda->need_i915_power) { - snd_hdac_display_power(azx_bus(chip), true); - haswell_set_bclk(hda); + if (chip->driver_caps & AZX_DCAPS_I915_POWERWELL) { + snd_hdac_display_power(bus, true); + if (hda->need_i915_power) + haswell_set_bclk(hda); } + if (chip->msi) if (pci_enable_msi(pci) < 0) chip->msi = 0; @@ -967,6 +970,11 @@ static int azx_resume(struct device *dev) hda_intel_init_chip(chip, true); + /* power down again for link-controlled chips */ + if ((chip->driver_caps & AZX_DCAPS_I915_POWERWELL) && + !hda->need_i915_power) + snd_hdac_display_power(bus, false); + snd_power_change_state(card, SNDRV_CTL_POWER_D0); trace_azx_resume(chip); @@ -1046,6 +1054,7 @@ static int azx_runtime_resume(struct device *dev) chip = card->private_data; hda = container_of(chip, struct hda_intel, chip); + bus = azx_bus(chip); if (chip->disabled || hda->init_failed) return 0; @@ -1053,15 +1062,9 @@ static int azx_runtime_resume(struct device *dev) return 0; if (chip->driver_caps & AZX_DCAPS_I915_POWERWELL) { - bus = azx_bus(chip); - if (hda->need_i915_power) { - snd_hdac_display_power(bus, true); + snd_hdac_display_power(bus, true); + if (hda->need_i915_power) haswell_set_bclk(hda); - } else { - /* toggle codec wakeup bit for STATESTS read */ - snd_hdac_set_codec_wakeup(bus, true); - snd_hdac_set_codec_wakeup(bus, false); - } } /* Read STATESTS before controller reset */ @@ -1081,6 +1084,11 @@ static int azx_runtime_resume(struct device *dev) azx_writew(chip, WAKEEN, azx_readw(chip, WAKEEN) & ~STATESTS_INT_MASK); + /* power down again for link-controlled chips */ + if ((chip->driver_caps & AZX_DCAPS_I915_POWERWELL) && + !hda->need_i915_power) + snd_hdac_display_power(bus, false); + trace_azx_runtime_resume(chip); return 0; } From d5537e988eeca346438116a37d6001d7e60d04a9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 24 Feb 2016 09:15:52 -0700 Subject: [PATCH 324/813] NVMe: Don't unmap controller registers on reset Commit b00a726a9fd82ddd4c10344e46f0d371e1674303 upstream. Unmapping the registers on reset or shutdown is not necessary. Keeping the mapping simplifies reset handling. This was backported to 4.4 stable tree because it prevents a race between the reset_work and the shutdown hook, that may provoke the Oops below, in the nvme_wait_ready function. The Oops is easily reproducible on systems that will kexec/reboot immediately after booting, which is actually the common use case for kexec based bootloaders, like Petitboot. This patch removes the unnecessary early unmapping of the PCI configuration in the shutdown hook, allowing a proper handling of the reset work. 
Unable to handle kernel paging request for data at address 0x0000001c Faulting instruction address: 0xd000000000720b38 cpu 0x1b: Vector: 300 (Data Access) at [c000007f7a9a38a0] pc: d000000000720b38: nvme_wait_ready+0x50/0x120 [nvme] lr: d000000000720b7c: nvme_wait_ready+0x94/0x120 [nvme] sp: c000007f7a9a3b20 msr: 9000000000009033 dar: 1c dsisr: 40000000 current = 0xc000007f7a926c80 paca = 0xc00000000fe85100 softe: 0 irq_happened: 0x01 pid = 2608, comm = kworker/27:1 enter ? for help [c000007f7a9a3bb0] d00000000072572c nvme_setup_io_queues+0xc08/0x1218 [nvme] [c000007f7a9a3c70] c00000000006bbd8 process_one_work+0x228/0x378 [c000007f7a9a3d00] c00000000006c050 worker_thread+0x2e0/0x420 [c000007f7a9a3d80] c00000000007161c kthread+0xfc/0x108 [c000007f7a9a3e30] c0000000000094b4 ret_from_kernel_thread+0x5c/0xa8 Signed-off-by: Keith Busch Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe Signed-off-by: Gabriel Krisman Bertazi [Backport to v4.4.y] Signed-off-by: Greg Kroah-Hartman --- drivers/nvme/host/pci.c | 71 +++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 28 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0c67b57be83c..289a5df0d44a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2672,10 +2672,10 @@ static int nvme_dev_add(struct nvme_dev *dev) return 0; } -static int nvme_dev_map(struct nvme_dev *dev) +static int nvme_pci_enable(struct nvme_dev *dev) { u64 cap; - int bars, result = -ENOMEM; + int result = -ENOMEM; struct pci_dev *pdev = to_pci_dev(dev->dev); if (pci_enable_device_mem(pdev)) @@ -2683,24 +2683,14 @@ static int nvme_dev_map(struct nvme_dev *dev) dev->entry[0].vector = pdev->irq; pci_set_master(pdev); - bars = pci_select_bars(pdev, IORESOURCE_MEM); - if (!bars) - goto disable_pci; - - if (pci_request_selected_regions(pdev, bars, "nvme")) - goto disable_pci; if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32))) goto disable; - dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); - if (!dev->bar) - goto disable; - if (readl(&dev->bar->csts) == -1) { result = -ENODEV; - goto unmap; + goto disable; } /* @@ -2710,7 +2700,7 @@ static int nvme_dev_map(struct nvme_dev *dev) if (!pdev->irq) { result = pci_enable_msix(pdev, dev->entry, 1); if (result < 0) - goto unmap; + goto disable; } cap = lo_hi_readq(&dev->bar->cap); @@ -2734,17 +2724,20 @@ static int nvme_dev_map(struct nvme_dev *dev) return 0; - unmap: - iounmap(dev->bar); - dev->bar = NULL; disable: pci_release_regions(pdev); - disable_pci: - pci_disable_device(pdev); + return result; } static void nvme_dev_unmap(struct nvme_dev *dev) +{ + if (dev->bar) + iounmap(dev->bar); + pci_release_regions(to_pci_dev(dev->dev)); +} + +static void nvme_pci_disable(struct nvme_dev *dev) { struct pci_dev *pdev = to_pci_dev(dev->dev); @@ -2753,12 +2746,6 @@ static void nvme_dev_unmap(struct nvme_dev *dev) else if (pdev->msix_enabled) pci_disable_msix(pdev); - if (dev->bar) { - iounmap(dev->bar); - dev->bar = NULL; - pci_release_regions(pdev); - } - if (pci_is_enabled(pdev)) pci_disable_device(pdev); } @@ -2962,7 +2949,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) nvme_dev_list_remove(dev); - if (dev->bar) { + if (pci_is_enabled(to_pci_dev(dev->dev))) { nvme_freeze_queues(dev); csts = readl(&dev->bar->csts); } @@ -2976,7 +2963,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) nvme_shutdown_ctrl(dev); nvme_disable_queue(dev, 0); } - 
nvme_dev_unmap(dev); + nvme_pci_disable(dev); for (i = dev->queue_count - 1; i >= 0; i--) nvme_clear_queue(dev->queues[i]); @@ -3136,7 +3123,7 @@ static void nvme_probe_work(struct work_struct *work) bool start_thread = false; int result; - result = nvme_dev_map(dev); + result = nvme_pci_enable(dev); if (result) goto out; @@ -3292,6 +3279,27 @@ static ssize_t nvme_sysfs_reset(struct device *dev, } static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); +static int nvme_dev_map(struct nvme_dev *dev) +{ + int bars; + struct pci_dev *pdev = to_pci_dev(dev->dev); + + bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (!bars) + return -ENODEV; + if (pci_request_selected_regions(pdev, bars, "nvme")) + return -ENODEV; + + dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); + if (!dev->bar) + goto release; + + return 0; +release: + pci_release_regions(pdev); + return -ENODEV; +} + static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { int node, result = -ENOMEM; @@ -3317,6 +3325,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_WORK(&dev->reset_work, nvme_reset_work); dev->dev = get_device(&pdev->dev); pci_set_drvdata(pdev, dev); + + result = nvme_dev_map(dev); + if (result) + goto free; + result = nvme_set_instance(dev); if (result) goto put_pci; @@ -3355,6 +3368,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) nvme_release_instance(dev); put_pci: put_device(dev->dev); + nvme_dev_unmap(dev); free: kfree(dev->queues); kfree(dev->entry); @@ -3398,6 +3412,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_free_queues(dev, 0); nvme_release_cmb(dev); nvme_release_prp_pools(dev); + nvme_dev_unmap(dev); kref_put(&dev->kref, nvme_free_dev); } From c7308f636b523f435725aa50286b0498084c36c0 Mon Sep 17 00:00:00 2001 From: "Jason S. McMullan" Date: Wed, 30 Sep 2015 15:35:05 +0900 Subject: [PATCH 325/813] PCI: Support PCIe devices with short cfg_size commit c20aecf6963d1273d8f6d61c042b4845441ca592 upstream. If a device quirk modifies the pci_dev->cfg_size to be less than PCI_CFG_SPACE_EXP_SIZE (4096), but greater than PCI_CFG_SPACE_SIZE (256), the PCI sysfs interface truncates the readable size to PCI_CFG_SPACE_SIZE. Allow sysfs access to config space up to cfg_size, even if the device doesn't support the entire 4096-byte PCIe config space. Note that pci_read_config() and pci_write_config() limit access to dev->cfg_size even though pcie_config_attr contains 4096 (the maximum size). Signed-off-by: Jason S. 
McMullan [simon: edited changelog] Signed-off-by: Simon Horman [bhelgaas: more changelog edits] Signed-off-by: Bjorn Helgaas Signed-off-by: Greg Kroah-Hartman --- drivers/pci/pci-sysfs.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index eead54cd01b2..d7508704c992 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1372,10 +1372,10 @@ int __must_check pci_create_sysfs_dev_files(struct pci_dev *pdev) if (!sysfs_initialized) return -EACCES; - if (pdev->cfg_size < PCI_CFG_SPACE_EXP_SIZE) - retval = sysfs_create_bin_file(&pdev->dev.kobj, &pci_config_attr); - else + if (pdev->cfg_size > PCI_CFG_SPACE_SIZE) retval = sysfs_create_bin_file(&pdev->dev.kobj, &pcie_config_attr); + else + retval = sysfs_create_bin_file(&pdev->dev.kobj, &pci_config_attr); if (retval) goto err; @@ -1427,10 +1427,10 @@ err_rom_file: err_resource_files: pci_remove_resource_files(pdev); err_config_file: - if (pdev->cfg_size < PCI_CFG_SPACE_EXP_SIZE) - sysfs_remove_bin_file(&pdev->dev.kobj, &pci_config_attr); - else + if (pdev->cfg_size > PCI_CFG_SPACE_SIZE) sysfs_remove_bin_file(&pdev->dev.kobj, &pcie_config_attr); + else + sysfs_remove_bin_file(&pdev->dev.kobj, &pci_config_attr); err: return retval; } @@ -1464,10 +1464,10 @@ void pci_remove_sysfs_dev_files(struct pci_dev *pdev) pci_remove_capabilities_sysfs(pdev); - if (pdev->cfg_size < PCI_CFG_SPACE_EXP_SIZE) - sysfs_remove_bin_file(&pdev->dev.kobj, &pci_config_attr); - else + if (pdev->cfg_size > PCI_CFG_SPACE_SIZE) sysfs_remove_bin_file(&pdev->dev.kobj, &pcie_config_attr); + else + sysfs_remove_bin_file(&pdev->dev.kobj, &pci_config_attr); pci_remove_resource_files(pdev); From 657170ec1fcdd8799230caac1aaf66e002ed198f Mon Sep 17 00:00:00 2001 From: "Jason S. McMullan" Date: Wed, 30 Sep 2015 15:35:06 +0900 Subject: [PATCH 326/813] PCI: Add Netronome vendor and device IDs commit a755e169031dac9ebaed03302c4921687c271d62 upstream. Device IDs for the Netronome NFP3200, NFP3240, NFP6000, and NFP6000 SR-IOV devices. Signed-off-by: Jason S. McMullan [simon: edited changelog] Signed-off-by: Simon Horman Signed-off-by: Bjorn Helgaas Signed-off-by: Greg Kroah-Hartman --- include/linux/pci_ids.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index d9ba49cedc5d..526e2c12ae59 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2495,6 +2495,12 @@ #define PCI_DEVICE_ID_KORENIX_JETCARDF2 0x1700 #define PCI_DEVICE_ID_KORENIX_JETCARDF3 0x17ff +#define PCI_VENDOR_ID_NETRONOME 0x19ee +#define PCI_DEVICE_ID_NETRONOME_NFP3200 0x3200 +#define PCI_DEVICE_ID_NETRONOME_NFP3240 0x3240 +#define PCI_DEVICE_ID_NETRONOME_NFP6000 0x6000 +#define PCI_DEVICE_ID_NETRONOME_NFP6000_VF 0x6003 + #define PCI_VENDOR_ID_QMI 0x1a32 #define PCI_VENDOR_ID_AZWAVE 0x1a3b From 2d8ffbfa2d1ef639160798d2465d474917a735ee Mon Sep 17 00:00:00 2001 From: "Jason S. McMullan" Date: Wed, 30 Sep 2015 15:35:07 +0900 Subject: [PATCH 327/813] PCI: Limit config space size for Netronome NFP6000 family commit 9f33a2ae59f24452c1076749deb615bccd435ca9 upstream. The NFP6000 has an erratum where reading/writing to PCI config space addresses above 0x600 can cause the NFP to generate PCIe completion timeouts. Limit the NFP6000's config space size to 0x600 bytes. Signed-off-by: Jason S. 
McMullan [simon: edited changelog] Signed-off-by: Simon Horman Signed-off-by: Bjorn Helgaas Signed-off-by: Greg Kroah-Hartman --- drivers/pci/quirks.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 3c4752a288e2..53d35fb14213 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -287,6 +287,17 @@ static void quirk_citrine(struct pci_dev *dev) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_CITRINE, quirk_citrine); +/* + * This chip can cause bus lockups if config addresses above 0x600 + * are read or written. + */ +static void quirk_nfp6000(struct pci_dev *dev) +{ + dev->cfg_size = 0x600; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NETRONOME, PCI_DEVICE_ID_NETRONOME_NFP6000, quirk_nfp6000); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NETRONOME, PCI_DEVICE_ID_NETRONOME_NFP6000_VF, quirk_nfp6000); + /* On IBM Crocodile ipr SAS adapters, expand BAR to system page size */ static void quirk_extend_bar_to_page(struct pci_dev *dev) { From 6bd24be19f0c5cdeee8a0782d770b9fec23ac4a2 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 11 Dec 2015 11:30:11 +0900 Subject: [PATCH 328/813] PCI: Add Netronome NFP4000 PF device ID commit 69874ec233871a62e1bc8c89e643993af93a8630 upstream. Add the device ID for the PF of the NFP4000. The device ID for the VF, 0x6003, is already present as PCI_DEVICE_ID_NETRONOME_NFP6000_VF. Signed-off-by: Simon Horman Signed-off-by: Bjorn Helgaas Signed-off-by: Greg Kroah-Hartman --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 526e2c12ae59..37f05cb1dfd6 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2498,6 +2498,7 @@ #define PCI_VENDOR_ID_NETRONOME 0x19ee #define PCI_DEVICE_ID_NETRONOME_NFP3200 0x3200 #define PCI_DEVICE_ID_NETRONOME_NFP3240 0x3240 +#define PCI_DEVICE_ID_NETRONOME_NFP4000 0x4000 #define PCI_DEVICE_ID_NETRONOME_NFP6000 0x6000 #define PCI_DEVICE_ID_NETRONOME_NFP6000_VF 0x6003 From 0bbe3343438ea9ffe661ff267e0fe35afb7c42cc Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 11 Dec 2015 11:30:12 +0900 Subject: [PATCH 329/813] PCI: Limit config space size for Netronome NFP4000 commit c2e771b02792d222cbcd9617fe71482a64f52647 upstream. Like the NFP6000, the NFP4000 has an erratum where reading/writing to PCI config space addresses above 0x600 can cause the NFP to generate PCIe completion timeouts. Limit the NFP4000's PF's config space size to 0x600 bytes as is already done for the NFP6000. The NFP4000's VF is 0x6003 (PCI_DEVICE_ID_NETRONOME_NFP6000_VF), the same device ID as the NFP6000's VF. Thus, its config space is already limited by the existing use of quirk_nfp6000().
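The quirk is effective because the PCI core clamps every config-space access to dev->cfg_size before it reaches the bus, so offsets at or above 0x600 are never issued to the device. A minimal sketch of that clamp, using hypothetical toy_* names; the real check lives in pci_read_config() in drivers/pci/pci-sysfs.c:

#include <stdio.h>
#include <stddef.h>

struct toy_pci_dev {
	unsigned int cfg_size;	/* 0x600 after quirk_nfp6000() */
};

/* How many bytes of a config-space read survive the clamp. */
static size_t cfg_read_clamped(const struct toy_pci_dev *dev,
			       size_t off, size_t count)
{
	if (off > dev->cfg_size)
		return 0;			/* entirely out of range */
	if (off + count > dev->cfg_size)
		count = dev->cfg_size - off;	/* truncate the tail */
	return count;
}

int main(void)
{
	struct toy_pci_dev nfp = { .cfg_size = 0x600 };

	/* A full 4096-byte dump stops at 0x600, so the erratum is never hit. */
	printf("read at 0x000: %zu bytes\n", cfg_read_clamped(&nfp, 0x000, 4096));
	printf("read at 0x700: %zu bytes\n", cfg_read_clamped(&nfp, 0x700, 16));
	return 0;
}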
Signed-off-by: Simon Horman Signed-off-by: Bjorn Helgaas Signed-off-by: Greg Kroah-Hartman --- drivers/pci/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 53d35fb14213..42774bc39786 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -295,6 +295,7 @@ static void quirk_nfp6000(struct pci_dev *dev) { dev->cfg_size = 0x600; } +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NETRONOME, PCI_DEVICE_ID_NETRONOME_NFP4000, quirk_nfp6000); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NETRONOME, PCI_DEVICE_ID_NETRONOME_NFP6000, quirk_nfp6000); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NETRONOME, PCI_DEVICE_ID_NETRONOME_NFP6000_VF, quirk_nfp6000); From a3043ecef71f5b880fe1b1d2aa77b3a896b86a0c Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 15 Apr 2016 14:06:57 +0300 Subject: [PATCH 330/813] mmc: sdhci-acpi: Reduce Baytrail eMMC/SD/SDIO hangs commit 6e1c7d6103fe7031035cec321307c6356809adf4 upstream. Baytrail eMMC/SD/SDIO host controllers have been known to hang. A change to a hardware setting has been found to reduce the occurrence of such hangs. This patch ensures the correct setting. This patch applies cleanly to v4.4+. It could go to earlier kernels also, so I will send backports to the stable list in due course. Signed-off-by: Adrian Hunter Cc: stable@vger.kernel.org # v4.4+ Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/Kconfig | 1 + drivers/mmc/host/sdhci-acpi.c | 81 +++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 1dee533634c9..2e6d2fff1096 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -97,6 +97,7 @@ config MMC_RICOH_MMC config MMC_SDHCI_ACPI tristate "SDHCI support for ACPI enumerated SDHCI controllers" depends on MMC_SDHCI && ACPI + select IOSF_MBI if X86 help This selects support for ACPI enumerated SDHCI controllers, identified by ACPI Compatibility ID PNP0D40 or specific diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c index 8aea3fa6938b..5a05bf400ca8 100644 --- a/drivers/mmc/host/sdhci-acpi.c +++ b/drivers/mmc/host/sdhci-acpi.c @@ -41,6 +41,11 @@ #include #include +#ifdef CONFIG_X86 +#include +#include +#endif + #include "sdhci.h" enum { @@ -146,6 +151,75 @@ static const struct sdhci_acpi_chip sdhci_acpi_chip_int = { .ops = &sdhci_acpi_ops_int, }; +#ifdef CONFIG_X86 + +static bool sdhci_acpi_byt(void) +{ + static const struct x86_cpu_id byt[] = { + { X86_VENDOR_INTEL, 6, 0x37 }, + {} + }; + + return x86_match_cpu(byt); +} + +#define BYT_IOSF_SCCEP 0x63 +#define BYT_IOSF_OCP_NETCTRL0 0x1078 +#define BYT_IOSF_OCP_TIMEOUT_BASE GENMASK(10, 8) + +static void sdhci_acpi_byt_setting(struct device *dev) +{ + u32 val = 0; + + if (!sdhci_acpi_byt()) + return; + + if (iosf_mbi_read(BYT_IOSF_SCCEP, 0x06, BYT_IOSF_OCP_NETCTRL0, + &val)) { + dev_err(dev, "%s read error\n", __func__); + return; + } + + if (!(val & BYT_IOSF_OCP_TIMEOUT_BASE)) + return; + + val &= ~BYT_IOSF_OCP_TIMEOUT_BASE; + + if (iosf_mbi_write(BYT_IOSF_SCCEP, 0x07, BYT_IOSF_OCP_NETCTRL0, + val)) { + dev_err(dev, "%s write error\n", __func__); + return; + } + + dev_dbg(dev, "%s completed\n", __func__); +} + +static bool sdhci_acpi_byt_defer(struct device *dev) +{ + if (!sdhci_acpi_byt()) + return false; + + if (!iosf_mbi_available()) + return true; + + sdhci_acpi_byt_setting(dev); + + return false; +} + +#else + +static inline void sdhci_acpi_byt_setting(struct device *dev) +{ +} + +static inline 
bool sdhci_acpi_byt_defer(struct device *dev) +{ + return false; +} + +#endif + static int bxt_get_cd(struct mmc_host *mmc) { int gpio_cd = mmc_gpio_get_cd(mmc); @@ -337,6 +411,9 @@ static int sdhci_acpi_probe(struct platform_device *pdev) if (acpi_bus_get_status(device) || !device->status.present) return -ENODEV; + if (sdhci_acpi_byt_defer(dev)) + return -EPROBE_DEFER; + hid = acpi_device_hid(device); uid = device->pnp.unique_id; @@ -460,6 +537,8 @@ static int sdhci_acpi_resume(struct device *dev) { struct sdhci_acpi_host *c = dev_get_drvdata(dev); + sdhci_acpi_byt_setting(&c->pdev->dev); + return sdhci_resume_host(c->host); } @@ -483,6 +562,8 @@ static int sdhci_acpi_runtime_resume(struct device *dev) { struct sdhci_acpi_host *c = dev_get_drvdata(dev); + sdhci_acpi_byt_setting(&c->pdev->dev); + return sdhci_runtime_resume_host(c->host); } From cd84d3158f21dca9dde35f6cfb51f31dc2f4bde9 Mon Sep 17 00:00:00 2001 From: Hoan Tran Date: Fri, 17 Jun 2016 15:16:31 -0700 Subject: [PATCH 331/813] ACPI: CPPC: Return error if _CPC is invalid on a CPU commit 8343c40d3de32ebfe8f48b043964e4ba0e7701f7 upstream. Based on 8.4.7.1 section of ACPI 6.1 specification, if the platform supports CPPC, the _CPC object must exist under all processor objects. If cpc_desc_ptr pointer is invalid on any CPUs, acpi_get_psd_map() should return error and CPPC cpufreq driver can not be registered. Signed-off-by: Hoan Tran Reviewed-by: Prashanth Prakash Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/cppc_acpi.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 6730f965b379..079b8db32dfe 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -216,8 +216,10 @@ int acpi_get_psd_map(struct cpudata **all_cpu_data) continue; cpc_ptr = per_cpu(cpc_desc_ptr, i); - if (!cpc_ptr) - continue; + if (!cpc_ptr) { + retval = -EFAULT; + goto err_ret; + } pdomain = &(cpc_ptr->domain_info); cpumask_set_cpu(i, pr->shared_cpu_map); @@ -239,8 +241,10 @@ int acpi_get_psd_map(struct cpudata **all_cpu_data) continue; match_cpc_ptr = per_cpu(cpc_desc_ptr, j); - if (!match_cpc_ptr) - continue; + if (!match_cpc_ptr) { + retval = -EFAULT; + goto err_ret; + } match_pdomain = &(match_cpc_ptr->domain_info); if (match_pdomain->domain != pdomain->domain) @@ -270,8 +274,10 @@ int acpi_get_psd_map(struct cpudata **all_cpu_data) continue; match_cpc_ptr = per_cpu(cpc_desc_ptr, j); - if (!match_cpc_ptr) - continue; + if (!match_cpc_ptr) { + retval = -EFAULT; + goto err_ret; + } match_pdomain = &(match_cpc_ptr->domain_info); if (match_pdomain->domain != pdomain->domain) From 5f009361d67ddf6d8a26e5605f083e795d49c0f4 Mon Sep 17 00:00:00 2001 From: Hoan Tran Date: Wed, 25 May 2016 12:09:23 -0700 Subject: [PATCH 332/813] ACPI / CPPC: Prevent cpc_desc_ptr points to the invalid data commit 2324d15447a9db168b1f85e3feac635b1ff8edb8 upstream. When CPPC fails to request a PCC channel, the CPC data is freed and cpc_desc_ptr points to the invalid data. Avoid this issue by moving the cpc_desc_ptr assignment after the PCC channel request. Signed-off-by: Hoan Tran Acked-by: Ashwin Chaugule Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/cppc_acpi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 079b8db32dfe..0afd1981e350 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -508,9 +508,6 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr) /* Store CPU Logical ID */ cpc_ptr->cpu_id = pr->id; - /* Plug it into this CPUs CPC descriptor. */ - per_cpu(cpc_desc_ptr, pr->id) = cpc_ptr; - /* Parse PSD data for this CPU */ ret = acpi_get_psd(cpc_ptr, handle); if (ret) @@ -523,6 +520,9 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr) goto out_free; } + /* Plug PSD data into this CPUs CPC descriptor. */ + per_cpu(cpc_desc_ptr, pr->id) = cpc_ptr; + /* Everything looks okay */ pr_debug("Parsed CPC struct for CPU: %d\n", pr->id); From fcdcf9773ea89860a14b31154592d37651c08202 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 17 Aug 2016 18:10:11 +0300 Subject: [PATCH 333/813] um: Don't discard .text.exit section commit dad2232844073295c64e9cc2d734a0ade043e0f6 upstream. Commit e41f501d3912 ("vmlinux.lds: account for destructor sections") added '.text.exit' to EXIT_TEXT which is discarded at link time by default. This breaks compilation of UML: `.text.exit' referenced in section `.fini_array' of /usr/lib/gcc/x86_64-linux-gnu/6/../../../x86_64-linux-gnu/libc.a(sdlerror.o): defined in discarded section `.text.exit' of /usr/lib/gcc/x86_64-linux-gnu/6/../../../x86_64-linux-gnu/libc.a(sdlerror.o) Apparently UML doesn't want to discard exit text, so let's place all EXIT_TEXT sections in .exit.text. Fixes: e41f501d3912 ("vmlinux.lds: account for destructor sections") Reported-by: Stefan Traby Signed-off-by: Andrey Ryabinin Acked-by: Dmitry Vyukov Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- arch/um/include/asm/common.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/include/asm/common.lds.S b/arch/um/include/asm/common.lds.S index 1dd5bd8a8c59..133055311dce 100644 --- a/arch/um/include/asm/common.lds.S +++ b/arch/um/include/asm/common.lds.S @@ -81,7 +81,7 @@ .altinstr_replacement : { *(.altinstr_replacement) } /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ - .exit.text : { *(.exit.text) } + .exit.text : { EXIT_TEXT } .exit.data : { *(.exit.data) } .preinit_array : { From fd59f98be0a7dcc668006e2d7efbf637c67f15fc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 17:39:22 +0900 Subject: [PATCH 334/813] genirq/msi: Remove unused MSI_FLAG_IDENTITY_MAP commit b6140914fd079e43ea75a53429b47128584f033a upstream. No user and we definitely don't want to grow one. Signed-off-by: Thomas Gleixner Reviewed-by: Bart Van Assche Cc: Christoph Hellwig Cc: linux-block@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: linux-nvme@lists.infradead.org Cc: axboe@fb.com Cc: agordeev@redhat.com Link: http://lkml.kernel.org/r/1467621574-8277-2-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- include/linux/msi.h | 6 ++---- kernel/irq/msi.c | 8 ++------ 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/include/linux/msi.h b/include/linux/msi.h index f71a25e5fd25..546c3d3faffb 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -254,12 +254,10 @@ enum { * callbacks. 
*/ MSI_FLAG_USE_DEF_CHIP_OPS = (1 << 1), - /* Build identity map between hwirq and irq */ - MSI_FLAG_IDENTITY_MAP = (1 << 2), /* Support multiple PCI MSI interrupts */ - MSI_FLAG_MULTI_PCI_MSI = (1 << 3), + MSI_FLAG_MULTI_PCI_MSI = (1 << 2), /* Support PCI MSIX interrupts */ - MSI_FLAG_PCI_MSIX = (1 << 4), + MSI_FLAG_PCI_MSIX = (1 << 3), }; int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask, diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 6b0c0b74a2a1..128c1e4e0e28 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -268,7 +268,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct msi_domain_ops *ops = info->ops; msi_alloc_info_t arg; struct msi_desc *desc; - int i, ret, virq = -1; + int i, ret, virq; ret = ops->msi_check(domain, info, dev); if (ret == 0) @@ -278,12 +278,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, for_each_msi_entry(desc, dev) { ops->set_desc(&arg, desc); - if (info->flags & MSI_FLAG_IDENTITY_MAP) - virq = (int)ops->get_hwirq(info, &arg); - else - virq = -1; - virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used, + virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, dev_to_node(dev), &arg, false); if (virq < 0) { ret = -ENOSPC; From 6722e247878e1a6ba99be420a062611d7b6361c5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 13 Jul 2016 17:18:33 +0100 Subject: [PATCH 335/813] genirq/msi: Make sure PCI MSIs are activated early commit f3b0946d629c8bfbd3e5f038e30cb9c711a35f10 upstream. Bharat Kumar Gogada reported issues with the generic MSI code, where the end-point ended up with garbage in its MSI configuration (both for the vector and the message). It turns out that the two MSI paths in the kernel are doing slightly different things: generic MSI: disable MSI -> allocate MSI -> enable MSI -> setup EP PCI MSI: disable MSI -> allocate MSI -> setup EP -> enable MSI And it turns out that end-points are allowed to latch the content of the MSI configuration registers as soon as MSIs are enabled. In Bharat's case, the end-point ends up using whatever was there already, which is not what you want. In order to make things converge, we introduce a new MSI domain flag (MSI_FLAG_ACTIVATE_EARLY) that is unconditionally set for PCI/MSI. When set, this flag forces the programming of the end-point as soon as the MSIs are allocated. A consequence of this is that we have an extra activate in irq_startup, but that should be without much consequence. tglx: - Several people reported a VMWare regression with PCI/MSI-X passthrough. It turns out that the patch also cures that issue. - We need to have a look at the MSI disable interrupt path, where we write the msg to all zeros without disabling MSI in the PCI device. Is that correct? 
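The ordering difference above is the whole bug; a compilable sketch of the two flows, where the msi_* helpers are hypothetical stand-ins rather than kernel functions:

#include <stdio.h>

static void msi_disable(void)          { puts("disable MSI"); }
static void msi_alloc_vectors(void)    { puts("allocate vectors"); }
static void msi_enable(void)           { puts("enable MSI"); }
static void msi_program_endpoint(void) { puts("write message to end-point"); }

/* Generic MSI before this patch: the end-point may latch garbage. */
static void flow_generic_before(void)
{
	msi_disable();
	msi_alloc_vectors();
	msi_enable();
	msi_program_endpoint();	/* too late: EP may already have latched */
}

/* With MSI_FLAG_ACTIVATE_EARLY: program first, as legacy PCI/MSI did. */
static void flow_activate_early(void)
{
	msi_disable();
	msi_alloc_vectors();
	msi_program_endpoint();	/* message is valid before enable */
	msi_enable();
}

int main(void)
{
	flow_generic_before();
	flow_activate_early();
	return 0;
}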
Fixes: 52f518a3a7c2 "x86/MSI: Use hierarchical irqdomains to manage MSI interrupts" Reported-and-tested-by: Bharat Kumar Gogada Reported-and-tested-by: Foster Snowhill Reported-by: Matthias Prager Reported-by: Jason Taylor Signed-off-by: Marc Zyngier Acked-by: Bjorn Helgaas Cc: linux-pci@vger.kernel.org Link: http://lkml.kernel.org/r/1468426713-31431-1-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- drivers/pci/msi.c | 2 ++ include/linux/msi.h | 2 ++ kernel/irq/msi.c | 11 +++++++++++ 3 files changed, 15 insertions(+) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 7eaa4c87fec7..10a6a8e5db88 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1278,6 +1278,8 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) pci_msi_domain_update_chip_ops(info); + info->flags |= MSI_FLAG_ACTIVATE_EARLY; + domain = msi_create_irq_domain(fwnode, info, parent); if (!domain) return NULL; diff --git a/include/linux/msi.h b/include/linux/msi.h index 546c3d3faffb..f0f43ec45ee7 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -258,6 +258,8 @@ enum { MSI_FLAG_MULTI_PCI_MSI = (1 << 2), /* Support PCI MSIX interrupts */ MSI_FLAG_PCI_MSIX = (1 << 3), + /* Needs early activate, required for PCI */ + MSI_FLAG_ACTIVATE_EARLY = (1 << 4), }; int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask, diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 128c1e4e0e28..4b21779d5163 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -303,6 +303,17 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, else dev_dbg(dev, "irq [%d-%d] for MSI\n", virq, virq + desc->nvec_used - 1); + /* + * This flag is set by the PCI layer as we need to activate + * the MSI entries before the PCI layer enables MSI in the + * card. Otherwise the card latches a random msi message. + */ + if (info->flags & MSI_FLAG_ACTIVATE_EARLY) { + struct irq_data *irq_data; + + irq_data = irq_domain_get_irq_data(domain, desc->irq); + irq_domain_activate_irq(irq_data); + } } return 0; From e0a9e843cad4aaacc1e806d0f57c69fae79c4197 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 9 Aug 2016 08:27:17 +0100 Subject: [PATCH 336/813] crypto: caam - fix non-hmac hashes commit a0118c8b2be9297aed8e915c60b4013326b256d4 upstream. Since 6de62f15b581 ("crypto: algif_hash - Require setkey before accept(2)"), the AF_ALG interface requires userspace to provide a key to any algorithm that has a setkey method. However, the non-HMAC algorithms are not keyed, so setting a key is unnecessary. Fix this by removing the setkey method from the non-keyed hash algorithms. 
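From user space the regression looked like this: with a setkey method present, algif_hash's accept(2) fails with ENOKEY until a key is set, even for a plain digest. A minimal user-space sketch, assuming a kernel built with AF_ALG hash support; "sha1" is the generic algorithm name behind which a driver such as caam registers its implementation:

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_alg.h>

int main(void)
{
	struct sockaddr_alg sa = {
		.salg_family = AF_ALG,
		.salg_type   = "hash",
		.salg_name   = "sha1",
	};
	unsigned char digest[20];	/* SHA-1 digest size */
	int tfm, op;

	tfm = socket(AF_ALG, SOCK_SEQPACKET, 0);
	if (bind(tfm, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
		perror("bind");
		return 1;
	}

	/*
	 * Since 6de62f15b581 this accept() fails with ENOKEY whenever the
	 * bound driver exposes a setkey method; removing .setkey from the
	 * unkeyed caam hashes lets it succeed again with no key set.
	 */
	op = accept(tfm, NULL, 0);
	if (op < 0) {
		perror("accept");
		return 1;
	}

	send(op, "abc", 3, 0);
	read(op, digest, sizeof(digest));
	for (int i = 0; i < 20; i++)
		printf("%02x", digest[i]);
	putchar('\n');
	return 0;
}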
Fixes: 6de62f15b581 ("crypto: algif_hash - Require setkey before accept(2)") Signed-off-by: Russell King Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/caam/caamhash.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/crypto/caam/caamhash.c b/drivers/crypto/caam/caamhash.c index 49106ea42887..99d5e11db194 100644 --- a/drivers/crypto/caam/caamhash.c +++ b/drivers/crypto/caam/caamhash.c @@ -1873,6 +1873,7 @@ caam_hash_alloc(struct caam_hash_template *template, template->name); snprintf(alg->cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s", template->driver_name); + t_alg->ahash_alg.setkey = NULL; } alg->cra_module = THIS_MODULE; alg->cra_init = caam_hash_cra_init; From 1595854f53a0aac0835ac7b1e3cfc6e0e0153441 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Horia=20Geant=C4=83?= Date: Thu, 4 Aug 2016 20:02:46 +0300 Subject: [PATCH 337/813] crypto: caam - fix echainiv(authenc) encrypt shared descriptor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 1d2d87e81ea21f64c19b95ef228b865a6880e17e upstream. There are a few things missed by the conversion to the new AEAD interface: 1 - echainiv(authenc) encrypt shared descriptor The shared descriptor is incorrect: due to the order of operations, at some point in time MATH3 register is being overwritten. 2 - buffer used for echainiv(authenc) encrypt shared descriptor Encrypt and givencrypt shared descriptors (for AEAD ops) are mutually exclusive and thus use the same buffer in context state: sh_desc_enc. However, there's one place missed by s/sh_desc_givenc/sh_desc_enc, leading to errors when echainiv(authenc(...)) algorithms are used: DECO: desc idx 14: Header Error. Invalid length or parity, or certain other problems. While here, also fix a typo: dma_mapping_error() is checking for validity of sh_desc_givenc_dma instead of sh_desc_enc_dma. Fixes: 479bcc7c5b9e ("crypto: caam - Convert authenc to new AEAD interface") Signed-off-by: Horia Geantă Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/caam/caamalg.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c index ea8189f4b021..e356005a7212 100644 --- a/drivers/crypto/caam/caamalg.c +++ b/drivers/crypto/caam/caamalg.c @@ -614,7 +614,7 @@ skip_enc: keys_fit_inline = true; /* aead_givencrypt shared descriptor */ - desc = ctx->sh_desc_givenc; + desc = ctx->sh_desc_enc; /* Note: Context registers are saved. 
*/ init_sh_desc_key_aead(desc, ctx, keys_fit_inline, is_rfc3686); @@ -645,13 +645,13 @@ copy_iv: append_operation(desc, ctx->class2_alg_type | OP_ALG_AS_INITFINAL | OP_ALG_ENCRYPT); - /* ivsize + cryptlen = seqoutlen - authsize */ - append_math_sub_imm_u32(desc, REG3, SEQOUTLEN, IMM, ctx->authsize); - /* Read and write assoclen bytes */ append_math_add(desc, VARSEQINLEN, ZERO, REG3, CAAM_CMD_SZ); append_math_add(desc, VARSEQOUTLEN, ZERO, REG3, CAAM_CMD_SZ); + /* ivsize + cryptlen = seqoutlen - authsize */ + append_math_sub_imm_u32(desc, REG3, SEQOUTLEN, IMM, ctx->authsize); + /* Skip assoc data */ append_seq_fifo_store(desc, 0, FIFOST_TYPE_SKIP | FIFOLDST_VLF); @@ -697,7 +697,7 @@ copy_iv: ctx->sh_desc_enc_dma = dma_map_single(jrdev, desc, desc_bytes(desc), DMA_TO_DEVICE); - if (dma_mapping_error(jrdev, ctx->sh_desc_givenc_dma)) { + if (dma_mapping_error(jrdev, ctx->sh_desc_enc_dma)) { dev_err(jrdev, "unable to map shared descriptor\n"); return -ENOMEM; } From 161427073a79db33d6a99223ce50a25386e27c21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Horia=20Geant=C4=83?= Date: Thu, 4 Aug 2016 20:02:47 +0300 Subject: [PATCH 338/813] crypto: caam - defer aead_set_sh_desc in case of zero authsize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 2fdea258fde036a87d3396ec9c0ef66f10768530 upstream. To be able to generate shared descriptors for AEAD, the authentication size needs to be known. However, there is no imposed order of calling the .setkey and .setauthsize callbacks. Thus, in case the authentication size is not known at .setkey time, defer shared descriptor generation until .setauthsize is called. The authsize != 0 check was incorrectly removed when converting the driver to the new AEAD interface. Fixes: 479bcc7c5b9e ("crypto: caam - Convert authenc to new AEAD interface") Signed-off-by: Horia Geantă Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/caam/caamalg.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c index e356005a7212..6dc597126b79 100644 --- a/drivers/crypto/caam/caamalg.c +++ b/drivers/crypto/caam/caamalg.c @@ -441,6 +441,9 @@ static int aead_set_sh_desc(struct crypto_aead *aead) OP_ALG_AAI_CTR_MOD128); const bool is_rfc3686 = alg->caam.rfc3686; + if (!ctx->authsize) + return 0; + /* NULL encryption / decryption */ if (!ctx->enckeylen) return aead_null_set_sh_desc(aead); From cb68ec1ce547aef16a9f257ce025b2489c59f227 Mon Sep 17 00:00:00 2001 From: Marc Ohlf Date: Wed, 3 Aug 2016 11:51:54 +0200 Subject: [PATCH 339/813] usb: ehci: change order of register cleanup during shutdown commit bc337b51508beb2d039aff5074a76cfe1c212030 upstream. In ehci_turn_off_all_ports() all EHCI port registers are cleared to zero. On some hardware, this can lead to a system hang when ehci_port_power() accesses the already cleared registers. This patch changes the order of cleanup. First call ehci_port_power(), which respects the current bits in the port status registers, and afterwards clean up the hard way by setting everything to zero.
Signed-off-by: Marc Ohlf Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/ehci-hcd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c index 48c92bf78bd0..f7661d9750fd 100644 --- a/drivers/usb/host/ehci-hcd.c +++ b/drivers/usb/host/ehci-hcd.c @@ -332,11 +332,11 @@ static void ehci_turn_off_all_ports(struct ehci_hcd *ehci) int port = HCS_N_PORTS(ehci->hcs_params); while (port--) { - ehci_writel(ehci, PORT_RWC_BITS, - &ehci->regs->port_status[port]); spin_unlock_irq(&ehci->lock); ehci_port_power(ehci, port, false); spin_lock_irq(&ehci->lock); + ehci_writel(ehci, PORT_RWC_BITS, + &ehci->regs->port_status[port]); } } From 97b23f9de12f36cd8f50201f4572bf80cdc6becf Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 11 Aug 2016 10:31:14 +0800 Subject: [PATCH 340/813] usb: misc: usbtest: add fix for driver hang commit 539587511835ea12d8daa444cbed766cf2bc3612 upstream. In sg_timeout(), req->status is set to "-ETIMEDOUT" before calling into usb_sg_cancel(). usb_sg_cancel() will do nothing and return directly if req->status has been set to a non-zero value. This will cause a driver hang whenever a transfer timeout is triggered. This patch fixes this issue. It can be backported to stable kernels later than v3.15. Cc: Alan Stern Signed-off-by: Lu Baolu Suggested-by: Alan Stern Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/usbtest.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/usb/misc/usbtest.c b/drivers/usb/misc/usbtest.c index 1a812eafe670..1624b09d9748 100644 --- a/drivers/usb/misc/usbtest.c +++ b/drivers/usb/misc/usbtest.c @@ -558,7 +558,6 @@ static void sg_timeout(unsigned long _req) { struct usb_sg_request *req = (struct usb_sg_request *) _req; - req->status = -ETIMEDOUT; usb_sg_cancel(req); } @@ -589,8 +588,10 @@ static int perform_sglist( mod_timer(&sg_timer, jiffies + msecs_to_jiffies(SIMPLE_IO_TIMEOUT)); usb_sg_wait(req); - del_timer_sync(&sg_timer); - retval = req->status; + if (!del_timer_sync(&sg_timer)) + retval = -ETIMEDOUT; + else + retval = req->status; /* FIXME check resulting data pattern */ From 7dc7ec87feb4b926f8f7fc0cf7f86455b124b017 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Fri, 1 Apr 2016 17:13:11 +0300 Subject: [PATCH 341/813] usb: dwc3: pci: add Intel Kabylake PCI ID commit 4491ed5042f0419b22a4b08331adb54af31e2caa upstream. Intel Kabylake PCH has the same DWC3 as Intel Sunrisepoint. Add the new ID to the supported devices.
Signed-off-by: Heikki Krogerus Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/dwc3/dwc3-pci.c b/drivers/usb/dwc3/dwc3-pci.c index 009d83048c8c..3d731d1b5c60 100644 --- a/drivers/usb/dwc3/dwc3-pci.c +++ b/drivers/usb/dwc3/dwc3-pci.c @@ -36,6 +36,7 @@ #define PCI_DEVICE_ID_INTEL_SPTH 0xa130 #define PCI_DEVICE_ID_INTEL_BXT 0x0aaa #define PCI_DEVICE_ID_INTEL_APL 0x5aaa +#define PCI_DEVICE_ID_INTEL_KBP 0xa2b0 static const struct acpi_gpio_params reset_gpios = { 0, 0, false }; static const struct acpi_gpio_params cs_gpios = { 1, 0, false }; @@ -214,6 +215,7 @@ static const struct pci_device_id dwc3_pci_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SPTH), }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BXT), }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_APL), }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBP), }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_NL_USB), }, { } /* Terminating Entry */ }; From 13af8c64ee5c05407d523fa79517a6b841f4fdc8 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Fri, 29 Jul 2016 03:17:58 +0300 Subject: [PATCH 342/813] usb: dwc3: gadget: increment request->actual once commit c7de573471832dff7d31f0c13b0f143d6f017799 upstream. When using SG lists, we would end up setting request->actual to: num_mapped_sgs * (request->length - count) Let's fix that up by incrementing request->actual only once. Reported-by: Brian E Rogers Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/gadget.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 70900e6ca9bc..fb79dca9484b 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -1892,14 +1892,6 @@ static int __dwc3_cleanup_done_trbs(struct dwc3 *dwc, struct dwc3_ep *dep, s_pkt = 1; } - /* - * We assume here we will always receive the entire data block - * which we should receive. Meaning, if we program RX to - * receive 4K but we receive only 2K, we assume that's all we - * should receive and we simply bounce the request back to the - * gadget driver for further processing. - */ - req->request.actual += req->request.length - count; if (s_pkt) return 1; if ((event->status & DEPEVT_STATUS_LST) && @@ -1919,6 +1911,7 @@ static int dwc3_cleanup_done_reqs(struct dwc3 *dwc, struct dwc3_ep *dep, struct dwc3_trb *trb; unsigned int slot; unsigned int i; + int count = 0; int ret; do { @@ -1935,6 +1928,8 @@ static int dwc3_cleanup_done_reqs(struct dwc3 *dwc, struct dwc3_ep *dep, slot++; slot %= DWC3_TRB_NUM; trb = &dep->trb_pool[slot]; + count += trb->size & DWC3_TRB_SIZE_MASK; + ret = __dwc3_cleanup_done_trbs(dwc, dep, req, trb, event, status); @@ -1942,6 +1937,14 @@ static int dwc3_cleanup_done_reqs(struct dwc3 *dwc, struct dwc3_ep *dep, break; } while (++i < req->request.num_mapped_sgs); + /* + * We assume here we will always receive the entire data block + * which we should receive. Meaning, if we program RX to + * receive 4K but we receive only 2K, we assume that's all we + * should receive and we simply bounce the request back to the + * gadget driver for further processing. 
+ */ + req->request.actual += req->request.length - count; dwc3_gadget_giveback(dep, req, status); if (ret) From dbb9fe1fc7a73bf99976e0cb12fdbd2a0106731c Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Thu, 10 Dec 2015 09:59:25 +0200 Subject: [PATCH 343/813] usb: define USB_SPEED_SUPER_PLUS speed for SuperSpeedPlus USB3.1 devices commit 8a1b2725a60d3267135c15e80984b4406054f650 upstream. Add a new USB_SPEED_SUPER_PLUS device speed, and make sure usb core can handle the new speed. In most cases the behaviour is the same as with USB_SPEED_SUPER SuperSpeed devices. In a few places we add a "Plus" string to inform the user of the new speed. Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/common/common.c | 1 + drivers/usb/core/config.c | 3 ++- drivers/usb/core/devices.c | 10 ++++++---- drivers/usb/core/hcd-pci.c | 2 +- drivers/usb/core/hcd.c | 6 +++--- drivers/usb/core/hub.c | 26 +++++++++++++++----------- drivers/usb/core/urb.c | 3 ++- drivers/usb/core/usb.h | 2 +- include/uapi/linux/usb/ch9.h | 1 + 9 files changed, 32 insertions(+), 22 deletions(-) diff --git a/drivers/usb/common/common.c b/drivers/usb/common/common.c index 673d53038ed2..a00bfb93acc3 100644 --- a/drivers/usb/common/common.c +++ b/drivers/usb/common/common.c @@ -50,6 +50,7 @@ static const char *const speed_names[] = { [USB_SPEED_HIGH] = "high-speed", [USB_SPEED_WIRELESS] = "wireless", [USB_SPEED_SUPER] = "super-speed", + [USB_SPEED_SUPER_PLUS] = "super-speed-plus", }; const char *usb_speed_string(enum usb_device_speed speed) diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index 5050760f5e17..bbcf4009f99e 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -191,6 +191,7 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum, if (usb_endpoint_xfer_int(d)) { i = 1; switch (to_usb_device(ddev)->speed) { + case USB_SPEED_SUPER_PLUS: case USB_SPEED_SUPER: case USB_SPEED_HIGH: /* Many device manufacturers are using full-speed @@ -274,7 +275,7 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum, } /* Parse a possible SuperSpeed endpoint companion descriptor */ - if (to_usb_device(ddev)->speed == USB_SPEED_SUPER) + if (to_usb_device(ddev)->speed >= USB_SPEED_SUPER) usb_parse_ss_endpoint_companion(ddev, cfgno, inum, asnum, endpoint, buffer, size); diff --git a/drivers/usb/core/devices.c b/drivers/usb/core/devices.c index 2a3bbdf7eb94..332ed277a06c 100644 --- a/drivers/usb/core/devices.c +++ b/drivers/usb/core/devices.c @@ -221,7 +221,7 @@ static char *usb_dump_endpoint_descriptor(int speed, char *start, char *end, break; case USB_ENDPOINT_XFER_INT: type = "Int."; - if (speed == USB_SPEED_HIGH || speed == USB_SPEED_SUPER) + if (speed == USB_SPEED_HIGH || speed >= USB_SPEED_SUPER) interval = 1 << (desc->bInterval - 1); else interval = desc->bInterval; @@ -230,7 +230,7 @@ static char *usb_dump_endpoint_descriptor(int speed, char *start, char *end, return start; } interval *= (speed == USB_SPEED_HIGH || - speed == USB_SPEED_SUPER) ? 125 : 1000; + speed >= USB_SPEED_SUPER) ? 
125 : 1000; if (interval % 1000) unit = 'u'; else { @@ -322,7 +322,7 @@ static char *usb_dump_config_descriptor(char *start, char *end, if (start > end) return start; - if (speed == USB_SPEED_SUPER) + if (speed >= USB_SPEED_SUPER) mul = 8; else mul = 2; @@ -534,6 +534,8 @@ static ssize_t usb_device_dump(char __user **buffer, size_t *nbytes, speed = "480"; break; case USB_SPEED_SUPER: speed = "5000"; break; + case USB_SPEED_SUPER_PLUS: + speed = "10000"; break; default: speed = "??"; } @@ -553,7 +555,7 @@ static ssize_t usb_device_dump(char __user **buffer, size_t *nbytes, /* super/high speed reserves 80%, full/low reserves 90% */ if (usbdev->speed == USB_SPEED_HIGH || - usbdev->speed == USB_SPEED_SUPER) + usbdev->speed >= USB_SPEED_SUPER) max = 800; else max = FRAME_TIME_MAX_USECS_ALLOC; diff --git a/drivers/usb/core/hcd-pci.c b/drivers/usb/core/hcd-pci.c index b8b580e5ae6e..40378487e023 100644 --- a/drivers/usb/core/hcd-pci.c +++ b/drivers/usb/core/hcd-pci.c @@ -206,7 +206,7 @@ int usb_hcd_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) * The xHCI driver has its own irq management * make sure irq setup is not touched for xhci in generic hcd code */ - if ((driver->flags & HCD_MASK) != HCD_USB3) { + if ((driver->flags & HCD_MASK) < HCD_USB3) { if (!dev->irq) { dev_err(&dev->dev, "Found HC with no IRQ. Check BIOS/PCI %s setup!\n", diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 1c102d60cd9f..f44ce09367bc 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1078,7 +1078,7 @@ static int register_root_hub(struct usb_hcd *hcd) retval = usb_get_bos_descriptor(usb_dev); if (!retval) { usb_dev->lpm_capable = usb_device_supports_lpm(usb_dev); - } else if (usb_dev->speed == USB_SPEED_SUPER) { + } else if (usb_dev->speed >= USB_SPEED_SUPER) { mutex_unlock(&usb_bus_list_lock); dev_dbg(parent_dev, "can't read %s bos descriptor %d\n", dev_name(&usb_dev->dev), retval); @@ -2112,7 +2112,7 @@ int usb_alloc_streams(struct usb_interface *interface, hcd = bus_to_hcd(dev->bus); if (!hcd->driver->alloc_streams || !hcd->driver->free_streams) return -EINVAL; - if (dev->speed != USB_SPEED_SUPER) + if (dev->speed < USB_SPEED_SUPER) return -EINVAL; if (dev->state < USB_STATE_CONFIGURED) return -ENODEV; @@ -2160,7 +2160,7 @@ int usb_free_streams(struct usb_interface *interface, dev = interface_to_usbdev(interface); hcd = bus_to_hcd(dev->bus); - if (dev->speed != USB_SPEED_SUPER) + if (dev->speed < USB_SPEED_SUPER) return -EINVAL; /* Double-free is not allowed */ diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 84df093639ac..34762c6e6150 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -298,7 +298,7 @@ static void usb_set_lpm_parameters(struct usb_device *udev) unsigned int hub_u1_del; unsigned int hub_u2_del; - if (!udev->lpm_capable || udev->speed != USB_SPEED_SUPER) + if (!udev->lpm_capable || udev->speed < USB_SPEED_SUPER) return; hub = usb_hub_to_struct_hub(udev->parent); @@ -2645,7 +2645,7 @@ static unsigned hub_is_wusb(struct usb_hub *hub) */ static bool use_new_scheme(struct usb_device *udev, int retry) { - if (udev->speed == USB_SPEED_SUPER) + if (udev->speed >= USB_SPEED_SUPER) return false; return USE_NEW_SCHEME(retry); @@ -3985,7 +3985,7 @@ int usb_disable_lpm(struct usb_device *udev) struct usb_hcd *hcd; if (!udev || !udev->parent || - udev->speed != USB_SPEED_SUPER || + udev->speed < USB_SPEED_SUPER || !udev->lpm_capable || udev->state < USB_STATE_DEFAULT) return 0; @@ -4042,7 +4042,7 @@ void usb_enable_lpm(struct 
usb_device *udev) struct usb_hcd *hcd; if (!udev || !udev->parent || - udev->speed != USB_SPEED_SUPER || + udev->speed < USB_SPEED_SUPER || !udev->lpm_capable || udev->state < USB_STATE_DEFAULT) return; @@ -4308,7 +4308,9 @@ hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1, retval = -ENODEV; - if (oldspeed != USB_SPEED_UNKNOWN && oldspeed != udev->speed) { + /* Don't allow speed changes at reset, except usb 3.0 to faster */ + if (oldspeed != USB_SPEED_UNKNOWN && oldspeed != udev->speed && + !(oldspeed == USB_SPEED_SUPER && udev->speed > oldspeed)) { dev_dbg(&udev->dev, "device reset changed speed!\n"); goto fail; } @@ -4320,6 +4322,7 @@ hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1, * reported as 0xff in the device descriptor). WUSB1.0[4.8.1]. */ switch (udev->speed) { + case USB_SPEED_SUPER_PLUS: case USB_SPEED_SUPER: case USB_SPEED_WIRELESS: /* fixed at 512 */ udev->ep0.desc.wMaxPacketSize = cpu_to_le16(512); @@ -4346,7 +4349,7 @@ hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1, else speed = usb_speed_string(udev->speed); - if (udev->speed != USB_SPEED_SUPER) + if (udev->speed < USB_SPEED_SUPER) dev_info(&udev->dev, "%s %s USB device number %d using %s\n", (udev->config) ? "reset" : "new", speed, @@ -4476,11 +4479,12 @@ hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1, devnum, retval); goto fail; } - if (udev->speed == USB_SPEED_SUPER) { + if (udev->speed >= USB_SPEED_SUPER) { devnum = udev->devnum; dev_info(&udev->dev, - "%s SuperSpeed USB device number %d using %s\n", + "%s SuperSpeed%s USB device number %d using %s\n", (udev->config) ? "reset" : "new", + (udev->speed == USB_SPEED_SUPER_PLUS) ? "Plus" : "", devnum, udev->bus->controller->driver->name); } @@ -4519,7 +4523,7 @@ hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1, * got from those devices show they aren't superspeed devices. Warm * reset the port attached by the devices can fix them. */ - if ((udev->speed == USB_SPEED_SUPER) && + if ((udev->speed >= USB_SPEED_SUPER) && (le16_to_cpu(udev->descriptor.bcdUSB) < 0x0300)) { dev_err(&udev->dev, "got a wrong device descriptor, " "warm reset device\n"); @@ -4530,7 +4534,7 @@ hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1, } if (udev->descriptor.bMaxPacketSize0 == 0xff || - udev->speed == USB_SPEED_SUPER) + udev->speed >= USB_SPEED_SUPER) i = 512; else i = udev->descriptor.bMaxPacketSize0; @@ -4740,7 +4744,7 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, udev->level = hdev->level + 1; udev->wusb = hub_is_wusb(hub); - /* Only USB 3.0 devices are connected to SuperSpeed hubs. */ + /* Devices connected to SuperSpeed hubs are USB 3.0 or later */ if (hub_is_superspeed(hub->hdev)) udev->speed = USB_SPEED_SUPER; else diff --git a/drivers/usb/core/urb.c b/drivers/usb/core/urb.c index 3d274778caaf..c601e25b609f 100644 --- a/drivers/usb/core/urb.c +++ b/drivers/usb/core/urb.c @@ -401,7 +401,7 @@ int usb_submit_urb(struct urb *urb, gfp_t mem_flags) /* SuperSpeed isoc endpoints have up to 16 bursts of up to * 3 packets each */ - if (dev->speed == USB_SPEED_SUPER) { + if (dev->speed >= USB_SPEED_SUPER) { int burst = 1 + ep->ss_ep_comp.bMaxBurst; int mult = USB_SS_MULT(ep->ss_ep_comp.bmAttributes); max *= burst; @@ -499,6 +499,7 @@ int usb_submit_urb(struct urb *urb, gfp_t mem_flags) } /* too big? 
*/ switch (dev->speed) { + case USB_SPEED_SUPER_PLUS: case USB_SPEED_SUPER: /* units are 125us */ /* Handle up to 2^(16-1) microframes */ if (urb->interval > (1 << 15)) diff --git a/drivers/usb/core/usb.h b/drivers/usb/core/usb.h index 05b5e17abf92..53318126ed91 100644 --- a/drivers/usb/core/usb.h +++ b/drivers/usb/core/usb.h @@ -45,7 +45,7 @@ static inline unsigned usb_get_max_power(struct usb_device *udev, struct usb_host_config *c) { /* SuperSpeed power is in 8 mA units; others are in 2 mA units */ - unsigned mul = (udev->speed == USB_SPEED_SUPER ? 8 : 2); + unsigned mul = (udev->speed >= USB_SPEED_SUPER ? 8 : 2); return c->desc.bMaxPower * mul; } diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index 4338eb7b09b3..779a62aafafe 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -954,6 +954,7 @@ enum usb_device_speed { USB_SPEED_HIGH, /* usb 2.0 */ USB_SPEED_WIRELESS, /* wireless (usb 2.5) */ USB_SPEED_SUPER, /* usb 3.0 */ + USB_SPEED_SUPER_PLUS, /* usb 3.1 */ }; From 7edabddaea5c46241aa76672fee0e45e9bf11b77 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 4 Aug 2016 13:32:22 -0700 Subject: [PATCH 344/813] usb: hub: Fix unbalanced reference count/memory leak/deadlocks commit 6bb47e8ab98accb1319bd43c64966340ba3bba9a upstream. Memory leak and unbalanced reference count: If the hub gets disconnected while the core is still activating it, this can result in leaking memory of few USB structures. This will happen if we have done a kref_get() from hub_activate() and scheduled a delayed work item for HUB_INIT2/3. Now if hub_disconnect() gets called before the delayed work expires, then we will cancel the work from hub_quiesce(), but wouldn't do a kref_put(). And so the unbalance. kmemleak reports this as (with the commit e50293ef9775 backported to 3.10 kernel with other changes, though the same is true for mainline as well): unreferenced object 0xffffffc08af5b800 (size 1024): comm "khubd", pid 73, jiffies 4295051211 (age 6482.350s) hex dump (first 32 bytes): 30 68 f3 8c c0 ff ff ff 00 a0 b2 2e c0 ff ff ff 0h.............. 01 00 00 00 00 00 00 00 00 94 7d 40 c0 ff ff ff ..........}@.... backtrace: [] create_object+0x148/0x2a0 [] kmemleak_alloc+0x80/0xbc [] kmem_cache_alloc_trace+0x120/0x1ac [] hub_probe+0x120/0xb84 [] usb_probe_interface+0x1ec/0x298 [] driver_probe_device+0x160/0x374 [] __device_attach+0x28/0x4c [] bus_for_each_drv+0x78/0xac [] device_attach+0x6c/0x9c [] bus_probe_device+0x28/0xa0 [] device_add+0x324/0x604 [] usb_set_configuration+0x660/0x6cc [] generic_probe+0x44/0x84 [] usb_probe_device+0x54/0x74 [] driver_probe_device+0x160/0x374 [] __device_attach+0x28/0x4c Deadlocks: If the hub gets disconnected early enough (i.e. before INIT2/INIT3 are finished and the init_work is still queued), the core may call hub_quiesce() after acquiring interface device locks and it will wait for the work to be cancelled synchronously. But if the work handler is already running in parallel, it may try to acquire the same interface device lock and this may result in deadlock. Fix both the issues by removing the call to cancel_delayed_work_sync(). 
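A toy user-space model of the reference-count imbalance described above; hub_get()/hub_put() are hypothetical stand-ins for kref_get()/kref_put(..., hub_release):

#include <stdio.h>

struct toy_hub { int refs; };

static void hub_get(struct toy_hub *h) { h->refs++; }
static void hub_put(struct toy_hub *h)
{
	if (--h->refs == 0)
		puts("hub freed");
}

int main(void)
{
	struct toy_hub hub = { .refs = 1 };	/* initial reference */

	hub_get(&hub);	/* taken when the INIT2/INIT3 work is queued */

	/*
	 * Buggy path: hub_quiesce() cancels the queued work but never does
	 * the matching hub_put(), so refs can never reach zero; that is the
	 * leak kmemleak reported. With the cancel removed, the work handler
	 * runs, bails out on hub->disconnected, and drops its reference:
	 */
	hub_put(&hub);	/* work handler's put */
	hub_put(&hub);	/* disconnect's final put, prints "hub freed" */
	return 0;
}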
Fixes: e50293ef9775 ("USB: fix invalid memory access in hub_activate()") Reported-by: Manu Gautam Signed-off-by: Viresh Kumar Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 34762c6e6150..fb5800359e08 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -1299,8 +1299,6 @@ static void hub_quiesce(struct usb_hub *hub, enum hub_quiescing_type type) struct usb_device *hdev = hub->hdev; int i; - cancel_delayed_work_sync(&hub->init_work); - /* hub_wq and related activity won't re-trigger */ hub->quiescing = 1; From 52217416177820e2c2a1350be31e071db4ccd305 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 5 Aug 2016 11:49:45 -0400 Subject: [PATCH 345/813] USB: hub: fix up early-exit pathway in hub_activate commit ca5cbc8b02f9b21cc8cd1ab36668763ec34f9ee8 upstream. The early-exit pathway in hub_activate, added by commit e50293ef9775 ("USB: fix invalid memory access in hub_activate()") needs improvement. It duplicates code that is already present at the end of the subroutine, and it neglects to undo the effect of a usb_autopm_get_interface_no_resume() call. This patch fixes both problems by making the early-exit pathway jump directly to the end of the subroutine. It simplifies the code at the end by merging two conditionals that actually test the same condition although they appear different: If type < HUB_INIT3 then type must be either HUB_INIT2 or HUB_INIT, and it can't be HUB_INIT because in that case the subroutine would have exited earlier. Signed-off-by: Alan Stern Reviewed-by: Viresh Kumar Signed-off-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index fb5800359e08..4f304ad715f1 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -1039,11 +1039,8 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type) device_lock(hub->intfdev); /* Was the hub disconnected while we were waiting? */ - if (hub->disconnected) { - device_unlock(hub->intfdev); - kref_put(&hub->kref, hub_release); - return; - } + if (hub->disconnected) + goto disconnected; if (type == HUB_INIT2) goto init2; goto init3; @@ -1265,12 +1262,12 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type) /* Scan all ports that need attention */ kick_hub_wq(hub); - /* Allow autosuspend if it was suppressed */ - if (type <= HUB_INIT3) + if (type == HUB_INIT2 || type == HUB_INIT3) { + /* Allow autosuspend if it was suppressed */ + disconnected: usb_autopm_put_interface_async(to_usb_interface(hub->intfdev)); - - if (type == HUB_INIT2 || type == HUB_INIT3) device_unlock(hub->intfdev); + } kref_put(&hub->kref, hub_release); } From 4a6eff809af0a1662917338c47bf9291f77d62cc Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 5 Aug 2016 11:51:30 -0400 Subject: [PATCH 346/813] USB: hub: change the locking in hub_activate commit 07d316a22e119fa301fd7dba7f1e1adfd4f72c05 upstream. The locking in hub_activate() is not adequate to provide full mutual exclusion with hub_quiesce(). The subroutine locks the hub's usb_interface, but the callers of hub_quiesce() (such as hub_pre_reset() and hub_event()) hold the lock to the hub's usb_device. This patch changes hub_activate() to make it acquire the same lock as those other routines. 
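A toy illustration of the problem being fixed: taking two different locks gives no mutual exclusion at all. The mutexes stand in for the interface and device locks, and all names are hypothetical:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t intf_lock = PTHREAD_MUTEX_INITIALIZER; /* hub->intfdev */
static pthread_mutex_t hdev_lock = PTHREAD_MUTEX_INITIALIZER; /* hdev->dev */
static int shared_state;

static void *activate(void *arg)	/* old hub_activate() behavior */
{
	(void)arg;
	pthread_mutex_lock(&intf_lock);	/* wrong lock ... */
	shared_state++;			/* ... so this may race */
	pthread_mutex_unlock(&intf_lock);
	return NULL;
}

static void *quiesce(void *arg)		/* hub_pre_reset()/hub_event() path */
{
	(void)arg;
	pthread_mutex_lock(&hdev_lock);	/* these hold the device lock */
	shared_state--;
	pthread_mutex_unlock(&hdev_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, activate, NULL);
	pthread_create(&b, NULL, quiesce, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* The fix is for both paths to take hdev_lock. */
	printf("state=%d\n", shared_state);
	return 0;
}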
Signed-off-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 4f304ad715f1..bcc1e1b729ad 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -1036,7 +1036,7 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type) /* Continue a partial initialization */ if (type == HUB_INIT2 || type == HUB_INIT3) { - device_lock(hub->intfdev); + device_lock(&hdev->dev); /* Was the hub disconnected while we were waiting? */ if (hub->disconnected) @@ -1243,7 +1243,7 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type) queue_delayed_work(system_power_efficient_wq, &hub->init_work, msecs_to_jiffies(delay)); - device_unlock(hub->intfdev); + device_unlock(&hdev->dev); return; /* Continues at init3: below */ } else { msleep(delay); @@ -1266,7 +1266,7 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type) /* Allow autosuspend if it was suppressed */ disconnected: usb_autopm_put_interface_async(to_usb_interface(hub->intfdev)); - device_unlock(hub->intfdev); + device_unlock(&hdev->dev); } kref_put(&hub->kref, hub_release); From b8032e6952f0d228b058c8fdb6e3392e19a61c27 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 8 Aug 2016 21:50:52 +0900 Subject: [PATCH 347/813] usb: renesas_usbhs: clear the BRDYSTS in usbhsg_ep_enable() commit 9ab967e6db7412b675ecbff80d5371d53c82cb2e upstream. This patch fixes an issue where an unexpected BRDY interrupt occurs when usb_ep_{enable,disable}() are called with different directions. In this case, the driver will emit the following message: renesas_usbhs e6590000.usb: irq_ready run_error 1 : -16 This issue occurs in the following sequence: 1) A pipe is enabled for transmission 2) The pipe sends data 3) The pipe is disabled and re-enabled for reception. 4) The pipe gets a queue Since the driver doesn't clear the BRDYSTS flags after 2) above, the issue happens. Adding such flag clearing throughout the driver would complicate the code, so this patch instead clears the reception BRDYSTS flag in usbhsg_ep_enable(). Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/renesas_usbhs/mod_gadget.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/usb/renesas_usbhs/mod_gadget.c b/drivers/usb/renesas_usbhs/mod_gadget.c index 5a3abf56d56b..67f7dbda9e79 100644 --- a/drivers/usb/renesas_usbhs/mod_gadget.c +++ b/drivers/usb/renesas_usbhs/mod_gadget.c @@ -618,10 +618,13 @@ static int usbhsg_ep_enable(struct usb_ep *ep, * use dmaengine if possible. * It will use pio handler if impossible. */ - if (usb_endpoint_dir_in(desc)) + if (usb_endpoint_dir_in(desc)) { pipe->handler = &usbhs_fifo_dma_push_handler; - else + } else { pipe->handler = &usbhs_fifo_dma_pop_handler; + usbhs_xxxsts_clear(priv, BRDYSTS, + usbhs_pipe_number(pipe)); + } ret = 0; } From 365a5f484c89142aceee3e9ff2968c8c017a3abf Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 8 Aug 2016 21:50:53 +0900 Subject: [PATCH 348/813] usb: renesas_usbhs: Use dmac only if the pipe type is bulk commit 700aa7ff8d2c2b9cc669c99375e2ccd06d3cd38d upstream. As a workaround, this patch fixes an issue where isochronous transfer data can be lost. Since this driver uses a workqueue to start the dmac, the transfer can be delayed when system load is high.
Fixes: 6e4b74e4690d ("usb: renesas: fix scheduling in atomic context bug") Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/renesas_usbhs/fifo.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/renesas_usbhs/fifo.c b/drivers/usb/renesas_usbhs/fifo.c index db565f620f82..36e5b5c530bd 100644 --- a/drivers/usb/renesas_usbhs/fifo.c +++ b/drivers/usb/renesas_usbhs/fifo.c @@ -869,7 +869,7 @@ static int usbhsf_dma_prepare_push(struct usbhs_pkt *pkt, int *is_done) /* use PIO if packet is less than pio_dma_border or pipe is DCP */ if ((len < usbhs_get_dparam(priv, pio_dma_border)) || - usbhs_pipe_is_dcp(pipe)) + usbhs_pipe_type_is(pipe, USB_ENDPOINT_XFER_ISOC)) goto usbhsf_pio_prepare_push; /* check data length if this driver don't use USB-DMAC */ @@ -974,7 +974,7 @@ static int usbhsf_dma_prepare_pop_with_usb_dmac(struct usbhs_pkt *pkt, /* use PIO if packet is less than pio_dma_border or pipe is DCP */ if ((pkt->length < usbhs_get_dparam(priv, pio_dma_border)) || - usbhs_pipe_is_dcp(pipe)) + usbhs_pipe_type_is(pipe, USB_ENDPOINT_XFER_ISOC)) goto usbhsf_pio_prepare_pop; fifo = usbhsf_get_dma_fifo(priv, pkt); From 391738b9a371412a3c394975f18696d908971b69 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 1 Aug 2016 15:25:56 -0400 Subject: [PATCH 349/813] USB: validate wMaxPacketValue entries in endpoint descriptors commit aed9d65ac3278d4febd8665bd7db59ef53e825fe upstream. Erroneous or malicious endpoint descriptors may have non-zero bits in reserved positions, or out-of-bounds values. This patch helps prevent these from causing problems by bounds-checking the wMaxPacketValue entries in endpoint descriptors and capping the values at the maximum allowed. This issue was first discovered and tests were conducted by Jake Lamberson , an intern working for Rosie Hall. 
Signed-off-by: Alan Stern Reported-by: roswest Tested-by: roswest Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 66 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 3 deletions(-) diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index bbcf4009f99e..67e192c1d37e 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -142,6 +142,31 @@ static void usb_parse_ss_endpoint_companion(struct device *ddev, int cfgno, } } +static const unsigned short low_speed_maxpacket_maxes[4] = { + [USB_ENDPOINT_XFER_CONTROL] = 8, + [USB_ENDPOINT_XFER_ISOC] = 0, + [USB_ENDPOINT_XFER_BULK] = 0, + [USB_ENDPOINT_XFER_INT] = 8, +}; +static const unsigned short full_speed_maxpacket_maxes[4] = { + [USB_ENDPOINT_XFER_CONTROL] = 64, + [USB_ENDPOINT_XFER_ISOC] = 1023, + [USB_ENDPOINT_XFER_BULK] = 64, + [USB_ENDPOINT_XFER_INT] = 64, +}; +static const unsigned short high_speed_maxpacket_maxes[4] = { + [USB_ENDPOINT_XFER_CONTROL] = 64, + [USB_ENDPOINT_XFER_ISOC] = 1024, + [USB_ENDPOINT_XFER_BULK] = 512, + [USB_ENDPOINT_XFER_INT] = 1023, +}; +static const unsigned short super_speed_maxpacket_maxes[4] = { + [USB_ENDPOINT_XFER_CONTROL] = 512, + [USB_ENDPOINT_XFER_ISOC] = 1024, + [USB_ENDPOINT_XFER_BULK] = 1024, + [USB_ENDPOINT_XFER_INT] = 1024, +}; + static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum, int asnum, struct usb_host_interface *ifp, int num_ep, unsigned char *buffer, int size) @@ -150,6 +175,8 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum, struct usb_endpoint_descriptor *d; struct usb_host_endpoint *endpoint; int n, i, j, retval; + unsigned int maxp; + const unsigned short *maxpacket_maxes; d = (struct usb_endpoint_descriptor *) buffer; buffer += d->bLength; @@ -257,6 +284,42 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum, endpoint->desc.wMaxPacketSize = cpu_to_le16(8); } + /* Validate the wMaxPacketSize field */ + maxp = usb_endpoint_maxp(&endpoint->desc); + + /* Find the highest legal maxpacket size for this endpoint */ + i = 0; /* additional transactions per microframe */ + switch (to_usb_device(ddev)->speed) { + case USB_SPEED_LOW: + maxpacket_maxes = low_speed_maxpacket_maxes; + break; + case USB_SPEED_FULL: + maxpacket_maxes = full_speed_maxpacket_maxes; + break; + case USB_SPEED_HIGH: + /* Bits 12..11 are allowed only for HS periodic endpoints */ + if (usb_endpoint_xfer_int(d) || usb_endpoint_xfer_isoc(d)) { + i = maxp & (BIT(12) | BIT(11)); + maxp &= ~i; + } + /* fallthrough */ + default: + maxpacket_maxes = high_speed_maxpacket_maxes; + break; + case USB_SPEED_SUPER: + case USB_SPEED_SUPER_PLUS: + maxpacket_maxes = super_speed_maxpacket_maxes; + break; + } + j = maxpacket_maxes[usb_endpoint_type(&endpoint->desc)]; + + if (maxp > j) { + dev_warn(ddev, "config %d interface %d altsetting %d endpoint 0x%X has invalid maxpacket %d, setting to %d\n", + cfgno, inum, asnum, d->bEndpointAddress, maxp, j); + maxp = j; + endpoint->desc.wMaxPacketSize = cpu_to_le16(i | maxp); + } + /* * Some buggy high speed devices have bulk endpoints using * maxpacket sizes other than 512. 
High speed HCDs may not @@ -264,9 +327,6 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum, */ if (to_usb_device(ddev)->speed == USB_SPEED_HIGH && usb_endpoint_xfer_bulk(d)) { - unsigned maxp; - - maxp = usb_endpoint_maxp(&endpoint->desc) & 0x07ff; if (maxp != 512) dev_warn(ddev, "config %d interface %d altsetting %d " "bulk endpoint 0x%X has invalid maxpacket %d\n", From e9caf24cdf4b505d1204e109c9d41defdf57e197 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 13 Jul 2016 13:14:33 +0300 Subject: [PATCH 350/813] usb: gadget: fsl_qe_udc: off by one in setup_received_handle() commit 7442e6db5bdd0dce4615205508301f9b22e502d6 upstream. The udc->eps[] array has USB_MAX_ENDPOINTS elements so > should be >=. Fixes: 3948f0e0c999 ('usb: add Freescale QE/CPM USB peripheral controller driver') Acked-by: Peter Chen Signed-off-by: Dan Carpenter Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/fsl_qe_udc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/fsl_qe_udc.c b/drivers/usb/gadget/udc/fsl_qe_udc.c index 5fb6f8b4f0b4..c73689b72f95 100644 --- a/drivers/usb/gadget/udc/fsl_qe_udc.c +++ b/drivers/usb/gadget/udc/fsl_qe_udc.c @@ -2053,7 +2053,7 @@ static void setup_received_handle(struct qe_udc *udc, struct qe_ep *ep; if (wValue != 0 || wLength != 0 - || pipe > USB_MAX_ENDPOINTS) + || pipe >= USB_MAX_ENDPOINTS) break; ep = &udc->eps[pipe]; From 8fd5243e5f01b6f5ad1457c348d0c9e3353b7745 Mon Sep 17 00:00:00 2001 From: Mathieu Laurendeau Date: Fri, 15 Jul 2016 14:58:41 +0200 Subject: [PATCH 351/813] usb/gadget: fix gadgetfs aio support. commit 327b21da884fe1a29f733e41792ddd53e4a30379 upstream. Fix io submissions failing with ENODEV. Signed-off-by: Mathieu Laurendeau Fixes: 7fe3976e0f3a ("gadget: switch ep_io_operations to ->read_iter/->write_iter") Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/legacy/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 55386619a0f1..e57f48f9528f 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -541,7 +541,7 @@ static ssize_t ep_aio(struct kiocb *iocb, */ spin_lock_irq(&epdata->dev->lock); value = -ENODEV; - if (unlikely(epdata->ep)) + if (unlikely(epdata->ep == NULL)) goto fail; req = usb_ep_alloc_request(epdata->ep, GFP_ATOMIC); From 76ba94f74c8e064afbd192646b5c198c7fa0379a Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Tue, 16 Aug 2016 10:18:03 +0300 Subject: [PATCH 352/813] xhci: always handle "Command Ring Stopped" events commit 33be126510974e2eb9679f1ca9bca4f67ee4c4c7 upstream. Fix "Command completion event does not match command" errors by always handling the command ring stopped events. The command ring stopped event is generated as a result of aborting or stopping the command ring with a register write. It is not caused by a command in the command queue, and thus won't have a matching command in the command list. Solve it by handling the command ring stopped event before checking for a matching command. In most command timeout cases we abort the command ring, and get a command ring stopped event. The event's command pointer will point at the current command ring dequeue, which in most cases matches the timed out command in the command list, and no error messages are seen.
If we instead get a command aborted event before the command ring stopped event, the abort event will increase the command ring dequeue pointer, and the following command ring stopped event's command pointer will point at the next, not yet queued command. This case triggered the error message. Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ring.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 34cd23724bed..bc4f16c8d11b 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -1331,12 +1331,6 @@ static void handle_cmd_completion(struct xhci_hcd *xhci, cmd = list_entry(xhci->cmd_list.next, struct xhci_command, cmd_list); - if (cmd->command_trb != xhci->cmd_ring->dequeue) { - xhci_err(xhci, - "Command completion event does not match command\n"); - return; - } - del_timer(&xhci->cmd_timer); trace_xhci_cmd_completion(cmd_trb, (struct xhci_generic_trb *) event); @@ -1348,6 +1342,13 @@ static void handle_cmd_completion(struct xhci_hcd *xhci, xhci_handle_stopped_cmd_ring(xhci, cmd); return; } + + if (cmd->command_trb != xhci->cmd_ring->dequeue) { + xhci_err(xhci, + "Command completion event does not match command\n"); + return; + } + /* * Host aborted the command ring, check if the current command was * supposed to be aborted, otherwise continue normally. From 262d059872a93c1bdaa3371602c921f1ce758f1d Mon Sep 17 00:00:00 2001 From: Jim Lin Date: Tue, 16 Aug 2016 10:18:05 +0300 Subject: [PATCH 353/813] usb: xhci: Fix panic if disconnect commit 88716a93766b8f095cdef37a8e8f2c93aa233b21 upstream. After a device is disconnected, xhci_stop_device() will be invoked in xhci_bus_suspend(). The "disconnect" IRQ's ISR will also invoke xhci_free_virt_device(), in this sequence: xhci_irq -> xhci_handle_event -> handle_cmd_completion -> xhci_handle_cmd_disable_slot -> xhci_free_virt_device If xhci->devs[slot_id] has been assigned to NULL in xhci_free_virt_device(), then virt_dev->eps[i].ring in xhci_stop_device() may point to an invalid address and cause a kernel panic. virt_dev = xhci->devs[slot_id]; : if (virt_dev->eps[i].ring && virt_dev->eps[i].ring->dequeue) [] Unable to handle kernel paging request at virtual address 00001a68 [] pgd=ffffffc001430000 [] [00001a68] *pgd=000000013c807003, *pud=000000013c807003, *pmd=000000013c808003, *pte=0000000000000000 [] Internal error: Oops: 96000006 [#1] PREEMPT SMP [] CPU: 0 PID: 39 Comm: kworker/0:1 Tainted: G U [] Workqueue: pm pm_runtime_work [] task: ffffffc0bc0e0bc0 ti: ffffffc0bc0ec000 task.ti: ffffffc0bc0ec000 [] PC is at xhci_stop_device.constprop.11+0xb4/0x1a4 This issue was found when running with a Realtek Ethernet device (0bda:8153).
Signed-off-by: Jim Lin Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-hub.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index f980c239eded..1da876605e4d 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -377,6 +377,9 @@ static int xhci_stop_device(struct xhci_hcd *xhci, int slot_id, int suspend) ret = 0; virt_dev = xhci->devs[slot_id]; + if (!virt_dev) + return -ENODEV; + cmd = xhci_alloc_command(xhci, false, true, GFP_NOIO); if (!cmd) { xhci_dbg(xhci, "Couldn't allocate command structure.\n"); From c5a6d60b93315ecf7d2d446045a7295f0825e6fc Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Tue, 16 Aug 2016 10:18:06 +0300 Subject: [PATCH 354/813] xhci: don't dereference a xhci member after removing xhci commit f1f6d9a8b540df22b87a5bf6bc104edaade81f47 upstream. Remove the hcd after checking the xhci quirks for the last time, not before. The old order caused a hang on an Alpine Ridge xHCI-based machine, which removes the whole xHCI controller when unplugging the last USB device. Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index de644e56aa3b..963867c2c1d5 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -311,11 +311,12 @@ static void xhci_pci_remove(struct pci_dev *dev) usb_remove_hcd(xhci->shared_hcd); usb_put_hcd(xhci->shared_hcd); } - usb_hcd_pci_remove(dev); /* Workaround for spurious wakeups at shutdown with HSW */ if (xhci->quirks & XHCI_SPURIOUS_WAKEUP) pci_set_power_state(dev, PCI_D3hot); + + usb_hcd_pci_remove(dev); } #ifdef CONFIG_PM From 697c84be4b37d85a244051beadfb6f0e0bbd54d3 Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Mon, 8 Aug 2016 02:34:46 +0100 Subject: [PATCH 355/813] USB: serial: fix memleak in driver-registration error path commit 647024a7df36014bbc4479d92d88e6b77c0afcf6 upstream. The udriver struct allocated by kzalloc() will not be freed if usb_register() or a subsequent call fails. This patch fixes this by adding a kfree(udriver) step to the error path. Signed-off-by: Alexey Klimov Acked-by: Alan Stern Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/usb-serial.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index 46f1f13b41f1..a0ca291bc07f 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -1432,7 +1432,7 @@ int usb_serial_register_drivers(struct usb_serial_driver *const serial_drivers[] rc = usb_register(udriver); if (rc) - return rc; + goto failed_usb_register; for (sd = serial_drivers; *sd; ++sd) { (*sd)->usb_driver = udriver; @@ -1450,6 +1450,8 @@ int usb_serial_register_drivers(struct usb_serial_driver *const serial_drivers[] while (sd-- > serial_drivers) usb_serial_deregister(*sd); usb_deregister(udriver); +failed_usb_register: + kfree(udriver); return rc; } EXPORT_SYMBOL_GPL(usb_serial_register_drivers); From 340391d39c01597e8986322bd471c5f9ad60506c Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Sun, 24 Jul 2016 13:53:30 +0200 Subject: [PATCH 356/813] USB: serial: option: add D-Link DWM-156/A3 commit cf1b18030de29e4e5b0a57695ae5db4a89da0ff7 upstream.
The device has four interfaces; the three serial ports ought to be handled by this driver: 00 Diagnostic interface serial port 01 NMEA device serial port 02 Mass storage (sd card) 03 Modem serial port The other product IDs listed in the Windows driver are present already. Signed-off-by: Lubomir Rintel Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 8e07536c233a..0338851e31a2 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1966,6 +1966,7 @@ static const struct usb_device_id option_ids[] = { .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, { USB_DEVICE_AND_INTERFACE_INFO(0x07d1, 0x3e01, 0xff, 0xff, 0xff) }, /* D-Link DWM-152/C1 */ { USB_DEVICE_AND_INTERFACE_INFO(0x07d1, 0x3e02, 0xff, 0xff, 0xff) }, /* D-Link DWM-156/C1 */ + { USB_DEVICE_AND_INTERFACE_INFO(0x07d1, 0x7e11, 0xff, 0xff, 0xff) }, /* D-Link DWM-156/A3 */ { USB_DEVICE_INTERFACE_CLASS(0x2020, 0x4000, 0xff) }, /* OLICARD300 - MT6225 */ { USB_DEVICE(INOVIA_VENDOR_ID, INOVIA_SEW858) }, { USB_DEVICE(VIATELECOM_VENDOR_ID, VIATELECOM_PRODUCT_CDS7) }, From 8ed7b7d02b707490242d55a5662b00e35b6991a3 Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Tue, 2 Aug 2016 11:29:25 +0200 Subject: [PATCH 357/813] USB: serial: option: add support for Telit LE920A4 commit 01d7956b58e644ea0d2e8d9340c5727a8fc39d70 upstream. This patch adds a set of compositions for the Telit LE920A4. In short, the compositions are: 0x1207: tty + tty 0x1208: tty + adb + tty + tty 0x1211: tty + adb + ecm 0x1212: tty + adb 0x1213: ecm + tty 0x1214: tty + adb + ecm + tty telit_le922_blacklist_usbcfg3 is reused for compositions 0x1211 and 0x1214 due to the same interface positions.
Signed-off-by: Daniele Palmas Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 0338851e31a2..bc472584a229 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -274,6 +274,12 @@ static void option_instat_callback(struct urb *urb); #define TELIT_PRODUCT_LE920 0x1200 #define TELIT_PRODUCT_LE910 0x1201 #define TELIT_PRODUCT_LE910_USBCFG4 0x1206 +#define TELIT_PRODUCT_LE920A4_1207 0x1207 +#define TELIT_PRODUCT_LE920A4_1208 0x1208 +#define TELIT_PRODUCT_LE920A4_1211 0x1211 +#define TELIT_PRODUCT_LE920A4_1212 0x1212 +#define TELIT_PRODUCT_LE920A4_1213 0x1213 +#define TELIT_PRODUCT_LE920A4_1214 0x1214 /* ZTE PRODUCTS */ #define ZTE_VENDOR_ID 0x19d2 @@ -628,6 +634,11 @@ static const struct option_blacklist_info telit_le920_blacklist = { .reserved = BIT(1) | BIT(5), }; +static const struct option_blacklist_info telit_le920a4_blacklist_1 = { + .sendsetup = BIT(0), + .reserved = BIT(1), +}; + static const struct option_blacklist_info telit_le922_blacklist_usbcfg0 = { .sendsetup = BIT(2), .reserved = BIT(0) | BIT(1) | BIT(3), @@ -1203,6 +1214,16 @@ static const struct usb_device_id option_ids[] = { .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg3 }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920), .driver_info = (kernel_ulong_t)&telit_le920_blacklist }, + { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1207) }, + { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1208), + .driver_info = (kernel_ulong_t)&telit_le920a4_blacklist_1 }, + { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1211), + .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg3 }, + { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1212), + .driver_info = (kernel_ulong_t)&telit_le920a4_blacklist_1 }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1213, 0xff) }, + { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1214), + .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg3 }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_MF622, 0xff, 0xff, 0xff) }, /* ZTE WCDMA products */ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0002, 0xff, 0xff, 0xff), .driver_info = (kernel_ulong_t)&net_intf1_blacklist }, From 6d77ca311e41b374835d9bca8444b854041488f1 Mon Sep 17 00:00:00 2001 From: "Sheng-Hui J. Chu" Date: Thu, 28 Jul 2016 17:01:45 -0400 Subject: [PATCH 358/813] USB: serial: ftdi_sio: add device ID for WICED USB UART dev board commit ae34d12cc1e212ffcd92e069030e54dae69c832f upstream. BCM20706V2_EVAL is a WICED dev board designed with the FT2232H USB 2.0 UART/FIFO IC. To support the BCM920706V2_EVAL dev board for WICED development on Linux, add the VID (0a5c) and PID (6422) to the ftdi_sio driver to allow loading ftdi_sio for this board. Signed-off-by: Sheng-Hui J.
Chu Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/ftdi_sio.c | 1 + drivers/usb/serial/ftdi_sio_ids.h | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index b61f12160d37..af12d67e6791 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -1008,6 +1008,7 @@ static const struct usb_device_id id_table_combined[] = { { USB_DEVICE(ICPDAS_VID, ICPDAS_I7560U_PID) }, { USB_DEVICE(ICPDAS_VID, ICPDAS_I7561U_PID) }, { USB_DEVICE(ICPDAS_VID, ICPDAS_I7563U_PID) }, + { USB_DEVICE(WICED_VID, WICED_USB20706V2_PID) }, { } /* Terminating entry */ }; diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h index c5d6c1e73e8e..1f44839de9f8 100644 --- a/drivers/usb/serial/ftdi_sio_ids.h +++ b/drivers/usb/serial/ftdi_sio_ids.h @@ -672,6 +672,12 @@ #define INTREPID_VALUECAN_PID 0x0601 #define INTREPID_NEOVI_PID 0x0701 +/* + * WICED USB UART + */ +#define WICED_VID 0x0A5C +#define WICED_USB20706V2_PID 0x6422 + /* * Definitions for ID TECH (www.idt-net.com) devices */ From 1d816d0bbf8f5d94ff08cdfd69b44ed242dcc3a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robert=20Deli=C3=ABn?= Date: Thu, 28 Jul 2016 18:52:55 +0000 Subject: [PATCH 359/813] USB: serial: ftdi_sio: add PIDs for Ivium Technologies devices commit 6977495c06f7f47636a076ee5a0ca571279d9697 upstream. Ivium Technologies uses the FTDI VID with custom PIDs for their line of electrochemical interfaces and the PalmSens they developed for PalmSens BV. Signed-off-by: Robert Delien Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/ftdi_sio.c | 2 ++ drivers/usb/serial/ftdi_sio_ids.h | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index af12d67e6791..8c48c9d83d48 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -648,6 +648,8 @@ static const struct usb_device_id id_table_combined[] = { { USB_DEVICE(FTDI_VID, FTDI_ELV_TFD128_PID) }, { USB_DEVICE(FTDI_VID, FTDI_ELV_FM3RX_PID) }, { USB_DEVICE(FTDI_VID, FTDI_ELV_WS777_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_PALMSENS_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_IVIUM_XSTAT_PID) }, { USB_DEVICE(FTDI_VID, LINX_SDMUSBQSS_PID) }, { USB_DEVICE(FTDI_VID, LINX_MASTERDEVEL2_PID) }, { USB_DEVICE(FTDI_VID, LINX_FUTURE_0_PID) }, diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h index 1f44839de9f8..f87a938cf005 100644 --- a/drivers/usb/serial/ftdi_sio_ids.h +++ b/drivers/usb/serial/ftdi_sio_ids.h @@ -405,6 +405,12 @@ #define FTDI_4N_GALAXY_DE_2_PID 0xF3C1 #define FTDI_4N_GALAXY_DE_3_PID 0xF3C2 +/* + * Ivium Technologies product IDs + */ +#define FTDI_PALMSENS_PID 0xf440 +#define FTDI_IVIUM_XSTAT_PID 0xf441 + /* * Linx Technologies product ids */ From 433ccf1fb43735e2f8d8cc10bdb78ead132edca0 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Mon, 25 Jan 2016 15:30:44 +0200 Subject: [PATCH 360/813] xhci: Make sure xhci handles USB_SPEED_SUPER_PLUS devices. commit 0caf6b33452112e5a1186c8c964e90310e49e6bd upstream. In most cases the devices with the speed set to USB_SPEED_SUPER_PLUS are handled like regular SuperSpeed devices. 
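For reference (not part of the patch), the device-speed ordering that the switch from == to >= in the hunks below relies on; this matches the enum in include/uapi/linux/usb/ch9.h, so speed >= USB_SPEED_SUPER covers both SuperSpeed and SuperSpeedPlus:

/* Ordered enum: each faster bus speed compares greater than the slower ones. */
enum usb_device_speed {
	USB_SPEED_UNKNOWN = 0,
	USB_SPEED_LOW, USB_SPEED_FULL,	/* USB 1.1 */
	USB_SPEED_HIGH,			/* USB 2.0 */
	USB_SPEED_WIRELESS,		/* Wireless USB */
	USB_SPEED_SUPER,		/* USB 3.0 */
	USB_SPEED_SUPER_PLUS,		/* USB 3.1 */
};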
Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-mem.c | 9 ++++++--- drivers/usb/host/xhci-ring.c | 3 ++- drivers/usb/host/xhci.c | 7 +++++-- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index d8dbd7e5194b..8ea2c05beca2 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -1072,7 +1072,7 @@ static u32 xhci_find_real_port_number(struct xhci_hcd *xhci, struct usb_device *top_dev; struct usb_hcd *hcd; - if (udev->speed == USB_SPEED_SUPER) + if (udev->speed >= USB_SPEED_SUPER) hcd = xhci->shared_hcd; else hcd = xhci->main_hcd; @@ -1107,6 +1107,7 @@ int xhci_setup_addressable_virt_dev(struct xhci_hcd *xhci, struct usb_device *ud /* 3) Only the control endpoint is valid - one endpoint context */ slot_ctx->dev_info |= cpu_to_le32(LAST_CTX(1) | udev->route); switch (udev->speed) { + case USB_SPEED_SUPER_PLUS: case USB_SPEED_SUPER: slot_ctx->dev_info |= cpu_to_le32(SLOT_SPEED_SS); max_packets = MAX_PACKET(512); @@ -1294,6 +1295,7 @@ static unsigned int xhci_get_endpoint_interval(struct usb_device *udev, } /* Fall through - SS and HS isoc/int have same decoding */ + case USB_SPEED_SUPER_PLUS: case USB_SPEED_SUPER: if (usb_endpoint_xfer_int(&ep->desc) || usb_endpoint_xfer_isoc(&ep->desc)) { @@ -1334,7 +1336,7 @@ static unsigned int xhci_get_endpoint_interval(struct usb_device *udev, static u32 xhci_get_endpoint_mult(struct usb_device *udev, struct usb_host_endpoint *ep) { - if (udev->speed != USB_SPEED_SUPER || + if (udev->speed < USB_SPEED_SUPER || !usb_endpoint_xfer_isoc(&ep->desc)) return 0; return ep->ss_ep_comp.bmAttributes; @@ -1384,7 +1386,7 @@ static u32 xhci_get_max_esit_payload(struct usb_device *udev, usb_endpoint_xfer_bulk(&ep->desc)) return 0; - if (udev->speed == USB_SPEED_SUPER) + if (udev->speed >= USB_SPEED_SUPER) return le16_to_cpu(ep->ss_ep_comp.wBytesPerInterval); max_packet = GET_MAX_PACKET(usb_endpoint_maxp(&ep->desc)); @@ -1455,6 +1457,7 @@ int xhci_endpoint_init(struct xhci_hcd *xhci, max_packet = GET_MAX_PACKET(usb_endpoint_maxp(&ep->desc)); max_burst = 0; switch (udev->speed) { + case USB_SPEED_SUPER_PLUS: case USB_SPEED_SUPER: /* dig out max burst from ep companion desc */ max_burst = ep->ss_ep_comp.bMaxBurst; diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index bc4f16c8d11b..1f37b89e7267 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -3576,7 +3576,7 @@ static unsigned int xhci_get_burst_count(struct xhci_hcd *xhci, { unsigned int max_burst; - if (xhci->hci_version < 0x100 || udev->speed != USB_SPEED_SUPER) + if (xhci->hci_version < 0x100 || udev->speed < USB_SPEED_SUPER) return 0; max_burst = urb->ep->ss_ep_comp.bMaxBurst; @@ -3602,6 +3602,7 @@ static unsigned int xhci_get_last_burst_packet_count(struct xhci_hcd *xhci, return 0; switch (udev->speed) { + case USB_SPEED_SUPER_PLUS: case USB_SPEED_SUPER: /* bMaxBurst is zero based: 0 means 1 packet per burst */ max_burst = urb->ep->ss_ep_comp.bMaxBurst; diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 6fe0174da226..adc169d2fd76 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -2073,6 +2073,7 @@ static unsigned int xhci_get_block_size(struct usb_device *udev) case USB_SPEED_HIGH: return HS_BLOCK; case USB_SPEED_SUPER: + case USB_SPEED_SUPER_PLUS: return SS_BLOCK; case USB_SPEED_UNKNOWN: case USB_SPEED_WIRELESS: @@ -2198,7 +2199,7 @@ static int xhci_check_bw_table(struct xhci_hcd *xhci, 
unsigned int packets_remaining = 0; unsigned int i; - if (virt_dev->udev->speed == USB_SPEED_SUPER) + if (virt_dev->udev->speed >= USB_SPEED_SUPER) return xhci_check_ss_bw(xhci, virt_dev); if (virt_dev->udev->speed == USB_SPEED_HIGH) { @@ -2399,7 +2400,7 @@ void xhci_drop_ep_from_interval_table(struct xhci_hcd *xhci, if (xhci_is_async_ep(ep_bw->type)) return; - if (udev->speed == USB_SPEED_SUPER) { + if (udev->speed >= USB_SPEED_SUPER) { if (xhci_is_sync_in_ep(ep_bw->type)) xhci->devs[udev->slot_id]->bw_table->ss_bw_in -= xhci_get_ss_bw_consumed(ep_bw); @@ -2437,6 +2438,7 @@ void xhci_drop_ep_from_interval_table(struct xhci_hcd *xhci, interval_bw->overhead[HS_OVERHEAD_TYPE] -= 1; break; case USB_SPEED_SUPER: + case USB_SPEED_SUPER_PLUS: case USB_SPEED_UNKNOWN: case USB_SPEED_WIRELESS: /* Should never happen because only LS/FS/HS endpoints will get @@ -2496,6 +2498,7 @@ static void xhci_add_ep_to_interval_table(struct xhci_hcd *xhci, interval_bw->overhead[HS_OVERHEAD_TYPE] += 1; break; case USB_SPEED_SUPER: + case USB_SPEED_SUPER_PLUS: case USB_SPEED_UNKNOWN: case USB_SPEED_WIRELESS: /* Should never happen because only LS/FS/HS endpoints will get From d360081c4bd4361d7df6f879c0f31262ba4f2525 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 9 Aug 2016 16:23:17 +0100 Subject: [PATCH 361/813] iommu/dma: Don't put uninitialised IOVA domains commit 3ec60043f7c02e1f79e4a90045ff2d2e80042941 upstream. Due to the limitations of having to wait until we see a device's DMA restrictions before we know how we want an IOVA domain initialised, there is a window for error if a DMA ops domain is allocated but later freed without ever being used. In that case, init_iova_domain() was never called, so calling put_iova_domain() from iommu_put_dma_cookie() ends up trying to take an uninitialised lock and crashing. Make things robust by skipping the call unless the IOVA domain actually has been initialised, as we probably should have done from the start. Fixes: 0db2e5d18f76 ("iommu: Implement common IOMMU ops for DMA mapping") Reported-by: Nate Watterson Reviewed-by: Nate Watterson Tested-by: Nate Watterson Reviewed-by: Eric Auger Tested-by: Eric Auger Signed-off-by: Robin Murphy Signed-off-by: Joerg Roedel Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/dma-iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 58f2fe687a24..347a3c17f73a 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -68,7 +68,8 @@ void iommu_put_dma_cookie(struct iommu_domain *domain) if (!iovad) return; - put_iova_domain(iovad); + if (iovad->granule) + put_iova_domain(iovad); kfree(iovad); domain->iova_cookie = NULL; } From aef62956c81d90456a9b3c051c21f679126a5d56 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 29 Jul 2016 11:15:37 +0100 Subject: [PATCH 362/813] iommu/arm-smmu: Fix CMDQ error handling commit aea2037e0d3e23c3be1498feae29f71ca997d9e6 upstream. In the unlikely event of a global command queue error, the ARM SMMUv3 driver attempts to convert the problematic command into a CMD_SYNC and resume the command queue. Unfortunately, this code is pretty badly broken: 1. It uses the index into the error string table as the CMDQ index, so we probably read the wrong entry out of the queue 2. The arguments to queue_write are the wrong way round, so we end up writing from the queue onto the stack. These happily cancel out, so the kernel is likely to stay alive, but the command queue will probably fault again when we resume. 
This patch fixes the error handling code to use the correct queue index and write back the CMD_SYNC to the faulting entry. Fixes: 48ec83bcbcf5 ("iommu/arm-smmu: Add initial driver support for ARM SMMUv3 devices") Reported-by: Diwakar Subraveti Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/arm-smmu-v3.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 8487987458a1..ebd5d045d53e 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -870,7 +870,7 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) * We may have concurrent producers, so we need to be careful * not to touch any of the shadow cmdq state. */ - queue_read(cmd, Q_ENT(q, idx), q->ent_dwords); + queue_read(cmd, Q_ENT(q, cons), q->ent_dwords); dev_err(smmu->dev, "skipping command in error state:\n"); for (i = 0; i < ARRAY_SIZE(cmd); ++i) dev_err(smmu->dev, "\t0x%016llx\n", (unsigned long long)cmd[i]); @@ -881,7 +881,7 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) return; } - queue_write(cmd, Q_ENT(q, idx), q->ent_dwords); + queue_write(Q_ENT(q, cons), cmd, q->ent_dwords); } static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, From d860213f8baf27ef0c014737aa4162b21ad734df Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 16 Aug 2016 14:29:16 +0100 Subject: [PATCH 363/813] iommu/arm-smmu: Don't BUG() if we find aborting STEs with disable_bypass commit 5bc0a11664e17e9f9551983f5b660bd48b57483c upstream. The disable_bypass cmdline option changes the SMMUv3 driver to put down faulting stream table entries by default, as opposed to bypassing transactions from unconfigured devices. In this mode of operation, it is entirely expected to see aborting entries in the stream table if and when we come to installing a valid translation, so don't trigger a BUG() as a result of misdiagnosing these entries as stream table corruption. Fixes: 48ec83bcbcf5 ("iommu/arm-smmu: Add initial driver support for ARM SMMUv3 devices") Tested-by: Robin Murphy Reported-by: Robin Murphy Reviewed-by: Robin Murphy Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/arm-smmu-v3.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index ebd5d045d53e..00df3832faab 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -1025,6 +1025,9 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_device *smmu, u32 sid, case STRTAB_STE_0_CFG_S2_TRANS: ste_live = true; break; + case STRTAB_STE_0_CFG_ABORT: + if (disable_bypass) + break; default: BUG(); /* STE corruption */ } From b518b0c8534fb09760605253b53b44c97f2963d9 Mon Sep 17 00:00:00 2001 From: "Agrawal, Nitesh-kumar" Date: Tue, 26 Jul 2016 08:28:19 +0000 Subject: [PATCH 364/813] pinctrl/amd: Remove the default de-bounce time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8cf4345575a416e6856a6856ac6eaa31ad883126 upstream. In the functions amd_gpio_irq_enable() and amd_gpio_direction_input(), remove the code that sets the default de-bounce time to 2.75 ms. The driver code shall use the same settings as specified in BIOS. Any default assignment impacts TouchPad behaviour when the LevelTrig is set to EDGE FALLING.
Reviewed-by: Ken Xue Signed-off-by: Nitesh Kumar Agrawal Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/pinctrl/pinctrl-amd.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index 3318f1d6193c..7340ff78839a 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -48,17 +48,6 @@ static int amd_gpio_direction_input(struct gpio_chip *gc, unsigned offset) spin_lock_irqsave(&gpio_dev->lock, flags); pin_reg = readl(gpio_dev->base + offset * 4); - /* - * Suppose BIOS or Bootloader sets specific debounce for the - * GPIO. if not, set debounce to be 2.75ms and remove glitch. - */ - if ((pin_reg & DB_TMR_OUT_MASK) == 0) { - pin_reg |= 0xf; - pin_reg |= BIT(DB_TMR_OUT_UNIT_OFF); - pin_reg |= DB_TYPE_REMOVE_GLITCH << DB_CNTRL_OFF; - pin_reg &= ~BIT(DB_TMR_LARGE_OFF); - } - pin_reg &= ~BIT(OUTPUT_ENABLE_OFF); writel(pin_reg, gpio_dev->base + offset * 4); spin_unlock_irqrestore(&gpio_dev->lock, flags); @@ -331,15 +320,6 @@ static void amd_gpio_irq_enable(struct irq_data *d) spin_lock_irqsave(&gpio_dev->lock, flags); pin_reg = readl(gpio_dev->base + (d->hwirq)*4); - /* - Suppose BIOS or Bootloader sets specific debounce for the - GPIO. if not, set debounce to be 2.75ms. - */ - if ((pin_reg & DB_TMR_OUT_MASK) == 0) { - pin_reg |= 0xf; - pin_reg |= BIT(DB_TMR_OUT_UNIT_OFF); - pin_reg &= ~BIT(DB_TMR_LARGE_OFF); - } pin_reg |= BIT(INTERRUPT_ENABLE_OFF); pin_reg |= BIT(INTERRUPT_MASK_OFF); writel(pin_reg, gpio_dev->base + (d->hwirq)*4); From 07adb640aa87c454cb0334a119111d5e5dfb13b7 Mon Sep 17 00:00:00 2001 From: Emmanouil Maroudas Date: Sat, 23 Apr 2016 18:33:00 +0300 Subject: [PATCH 365/813] EDAC: Increment correct counter in edac_inc_ue_error() commit 993f88f1cc7f0879047ff353e824e5cc8f10adfc upstream. Fix typo in edac_inc_ue_error() to increment ue_noinfo_count instead of ce_noinfo_count. Signed-off-by: Emmanouil Maroudas Cc: Mauro Carvalho Chehab Cc: linux-edac Fixes: 4275be635597 ("edac: Change internal representation to work with layers") Link: http://lkml.kernel.org/r/1461425580-5898-1-git-send-email-emmanouil.maroudas@gmail.com Signed-off-by: Borislav Petkov Signed-off-by: Greg Kroah-Hartman --- drivers/edac/edac_mc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 1b2c2187b347..dc68394da682 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -966,7 +966,7 @@ static void edac_inc_ue_error(struct mem_ctl_info *mci, mci->ue_mc += count; if (!enable_per_layer_report) { - mci->ce_noinfo_count += count; + mci->ue_noinfo_count += count; return; } From 1473e6afbb1a37adba91aed61a62bbc0dd78634b Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Mon, 8 Aug 2016 14:08:17 +0200 Subject: [PATCH 366/813] s390/dasd: fix hanging device after clear subchannel commit 9ba333dc55cbb9523553df973adb3024d223e905 upstream. When a device is in a state where CIO has killed all I/O by itself, the interrupt for a clear request may not contain an irb to determine the clear function. Instead it contains the error pointer -EIO. This was ignored by the DASD int_handler, leaving the device hanging while it waits for a clear interrupt. Handle the -EIO error pointer correctly for requests that are clear pending, and treat the clear as successful.
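As background (not part of the patch), the "error pointer" here is the kernel's usual ERR_PTR convention; a minimal sketch of how a handler tells a real irb from an encoded errno:

#include <linux/err.h>
#include <linux/errno.h>

struct irb;	/* opaque in this sketch */

/* Hypothetical handler showing the IS_ERR/PTR_ERR idiom: the irb argument
 * either points at a real irb or encodes a negative errno such as -EIO. */
static void example_int_handler(struct irb *irb)
{
	if (IS_ERR(irb)) {
		if (PTR_ERR(irb) == -EIO) {
			/* no irb available; treat a pending clear as done */
		}
		return;
	}
	/* irb is a normal pointer and may be dereferenced */
}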
Signed-off-by: Stefan Haberland Reviewed-by: Sebastian Ott Signed-off-by: Martin Schwidefsky Signed-off-by: Greg Kroah-Hartman --- drivers/s390/block/dasd.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 4abfbdb285ec..84c13dffa3a8 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1584,9 +1584,18 @@ void dasd_int_handler(struct ccw_device *cdev, unsigned long intparm, unsigned long long now; int expires; + cqr = (struct dasd_ccw_req *) intparm; if (IS_ERR(irb)) { switch (PTR_ERR(irb)) { case -EIO: + if (cqr && cqr->status == DASD_CQR_CLEAR_PENDING) { + device = (struct dasd_device *) cqr->startdev; + cqr->status = DASD_CQR_CLEARED; + dasd_device_clear_timer(device); + wake_up(&dasd_flush_wq); + dasd_schedule_device_bh(device); + return; + } break; case -ETIMEDOUT: DBF_EVENT_DEVID(DBF_WARNING, cdev, "%s: " @@ -1602,7 +1611,6 @@ void dasd_int_handler(struct ccw_device *cdev, unsigned long intparm, } now = get_tod_clock(); - cqr = (struct dasd_ccw_req *) intparm; /* check for conditions that should be handled immediately */ if (!cqr || !(scsw_dstat(&irb->scsw) == (DEV_STAT_CHN_END | DEV_STAT_DEV_END) && From ed6625cfdbe6bb9bc9561934361abdca43be551a Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 2 Aug 2016 11:13:41 +0200 Subject: [PATCH 367/813] mac80211: fix purging multicast PS buffer queue commit 6b07d9ca9b5363dda959b9582a3fc9c0b89ef3b5 upstream. The code currently assumes that buffered multicast PS frames don't have a pending ACK frame for tx status reporting. However, hostapd sends a broadcast deauth frame on teardown for which tx status is requested. This can lead to the "Have pending ack frames" warning on module reload. Fix this by using ieee80211_free_txskb/ieee80211_purge_tx_queue. 
Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- net/mac80211/cfg.c | 2 +- net/mac80211/tx.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index c12f348138ac..19322c047386 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -865,7 +865,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) /* free all potentially still buffered bcast frames */ local->total_ps_buffered -= skb_queue_len(&sdata->u.ap.ps.bc_buf); - skb_queue_purge(&sdata->u.ap.ps.bc_buf); + ieee80211_purge_tx_queue(&local->hw, &sdata->u.ap.ps.bc_buf); mutex_lock(&local->mtx); ieee80211_vif_copy_chanctx_to_vlans(sdata, true); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index bdc224d5053a..e1225b395415 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -365,7 +365,7 @@ static void purge_old_ps_buffers(struct ieee80211_local *local) skb = skb_dequeue(&ps->bc_buf); if (skb) { purged++; - dev_kfree_skb(skb); + ieee80211_free_txskb(&local->hw, skb); } total += skb_queue_len(&ps->bc_buf); } @@ -448,7 +448,7 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx) if (skb_queue_len(&ps->bc_buf) >= AP_MAX_BC_BUFFER) { ps_dbg(tx->sdata, "BC TX buffer full - dropping the oldest frame\n"); - dev_kfree_skb(skb_dequeue(&ps->bc_buf)); + ieee80211_free_txskb(&tx->local->hw, skb_dequeue(&ps->bc_buf)); } else tx->local->total_ps_buffered++; @@ -3781,7 +3781,7 @@ ieee80211_get_buffered_bc(struct ieee80211_hw *hw, sdata = IEEE80211_DEV_TO_SUB_IF(skb->dev); if (!ieee80211_tx_prepare(sdata, &tx, NULL, skb)) break; - dev_kfree_skb_any(skb); + ieee80211_free_txskb(hw, skb); } info = IEEE80211_SKB_CB(skb); From df4fe6f8c7b793adcab7b423a6efd989e195b4b3 Mon Sep 17 00:00:00 2001 From: Caesar Wang Date: Wed, 27 Jul 2016 22:24:06 +0800 Subject: [PATCH 368/813] arm64: dts: rockchip: add reset saradc node for rk3368 SoCs commit 78ec79bfd59e126e1cb394302bfa531a420b3ecd upstream. The SARADC controller needs to be reset before programming it, otherwise it will not function properly. Signed-off-by: Caesar Wang Acked-by: Heiko Stuebner Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- arch/arm64/boot/dts/rockchip/rk3368.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/boot/dts/rockchip/rk3368.dtsi b/arch/arm64/boot/dts/rockchip/rk3368.dtsi index 8fe39e1b680e..e0ee2b00d573 100644 --- a/arch/arm64/boot/dts/rockchip/rk3368.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3368.dtsi @@ -262,6 +262,8 @@ #io-channel-cells = <1>; clocks = <&cru SCLK_SARADC>, <&cru PCLK_SARADC>; clock-names = "saradc", "apb_pclk"; + resets = <&cru SRST_SARADC>; + reset-names = "saradc-apb"; status = "disabled"; }; From 51ed10c4ffd7742d160c02f7dd6bab634b2c03ef Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 15 Aug 2016 14:58:43 +0200 Subject: [PATCH 369/813] of: fix reference counting in of_graph_get_endpoint_by_regs commit 34276bb062b8449b3b0a208c9b848a1a27920075 upstream. The called of_graph_get_next_endpoint() already decrements the refcount of the prev node, so it is wrong to do it again in the calling function. Use the for_each_endpoint_of_node() helper to iterate through the endpoint OF nodes, which already does the right thing and simplifies the code a bit.
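For context (not part of the patch), a hedged sketch of the refcount contract a caller of this helper sees: for_each_endpoint_of_node() drops the reference on each previously visited endpoint itself, and only the returned match still holds one, which the caller must release:

#include <linux/errno.h>
#include <linux/of.h>
#include <linux/of_graph.h>

/* Hypothetical caller: the returned endpoint carries a reference that the
 * caller, not the iterator, must release. */
static int example_lookup(struct device_node *parent)
{
	struct device_node *ep;

	ep = of_graph_get_endpoint_by_regs(parent, 0 /* port */, -1 /* any id */);
	if (!ep)
		return -ENODEV;
	/* ... parse the endpoint ... */
	of_node_put(ep);	/* drop the reference the lookup returned */
	return 0;
}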
Fixes: 8ccd0d0ca041 (of: add helper for getting endpoint node of specific identifiers) Reported-by: David Jander Signed-off-by: Lucas Stach Acked-by: Philipp Zabel Signed-off-by: Rob Herring Signed-off-by: Greg Kroah-Hartman --- drivers/of/base.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/of/base.c b/drivers/of/base.c index 942461f36616..31341290cd91 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -2253,20 +2253,13 @@ struct device_node *of_graph_get_endpoint_by_regs( const struct device_node *parent, int port_reg, int reg) { struct of_endpoint endpoint; - struct device_node *node, *prev_node = NULL; - - while (1) { - node = of_graph_get_next_endpoint(parent, prev_node); - of_node_put(prev_node); - if (!node) - break; + struct device_node *node = NULL; + for_each_endpoint_of_node(parent, node) { of_graph_parse_endpoint(node, &endpoint); if (((port_reg == -1) || (endpoint.port == port_reg)) && ((reg == -1) || (endpoint.id == reg))) return node; - - prev_node = node; } return NULL; From c3cf68ec5595e30c28d44b0080f236af94e0e8da Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Aug 2016 18:38:42 +0200 Subject: [PATCH 370/813] sched/cputime: Fix NO_HZ_FULL getrusage() monotonicity regression commit 173be9a14f7b2e901cf77c18b1aafd4d672e9d9e upstream. Mike reports: Roughly 10% of the time, ltp testcase getrusage04 fails: getrusage04 0 TINFO : Expected timers granularity is 4000 us getrusage04 0 TINFO : Using 1 as multiply factor for max [us]time increment (1000+4000us)! getrusage04 0 TINFO : utime: 0us; stime: 179us getrusage04 0 TINFO : utime: 3751us; stime: 0us getrusage04 1 TFAIL : getrusage04.c:133: stime increased > 5000us: And tracked it down to the case where the task simply doesn't get _any_ [us]time ticks. Update the code to assume all rtime is utime when we lack information, thus ensuring a task that elides the tick gets time accounted. Reported-by: Mike Galbraith Tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Fredrik Markstrom Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Cc: Rik van Riel Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Wanpeng Li Fixes: 9d7fb0427648 ("sched/cputime: Guarantee stime + utime == rtime") Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched/cputime.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index f74ea89e77a8..a1aecbedf5b1 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -600,19 +600,25 @@ static void cputime_adjust(struct task_cputime *curr, stime = curr->stime; utime = curr->utime; - if (utime == 0) { - stime = rtime; + /* + * If either stime or both stime and utime are 0, assume all runtime is + * userspace. Once a task gets some ticks, the monotonicy code at + * 'update' will ensure things converge to the observed ratio. + */ + if (stime == 0) { + utime = rtime; goto update; } - if (stime == 0) { - utime = rtime; + if (utime == 0) { + stime = rtime; goto update; } stime = scale_stime((__force u64)stime, (__force u64)rtime, (__force u64)(stime + utime)); +update: /* * Make sure stime doesn't go backwards; this preserves monotonicity * for utime because rtime is monotonic. 
@@ -635,7 +641,6 @@ static void cputime_adjust(struct task_cputime *curr, stime = rtime - utime; } -update: prev->stime = stime; prev->utime = utime; out: From 15abaa07a2f0dabb66dfa637162fdaa66b839141 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 4 May 2016 14:45:34 +0800 Subject: [PATCH 371/813] sched/nohz: Fix affine unpinned timers mess commit 444969223c81c7d0a95136b7b4cfdcfbc96ac5bd upstream. The following commit: 9642d18eee2c ("nohz: Affine unpinned timers to housekeepers") intended to affine unpinned timers to housekeepers: unpinned timers(full dynticks, idle) => nearest busy housekeepers(otherwise, fallback to any housekeepers) unpinned timers(full dynticks, busy) => nearest busy housekeepers(otherwise, fallback to any housekeepers) unpinned timers(housekeepers, idle) => nearest busy housekeepers(otherwise, fallback to itself) However, the !idle_cpu(i) && is_housekeeping_cpu(cpu) check modified the intention to: unpinned timers(full dynticks, idle) => any housekeepers(no matter cpu topology) unpinned timers(full dynticks, busy) => any housekeepers(no matter cpu topology) unpinned timers(housekeepers, idle) => any busy cpus(otherwise, fallback to any housekeepers) This patch fixes it by checking if there are busy housekeepers nearby; otherwise it falls back to any housekeepers/itself. After the patch: unpinned timers(full dynticks, idle) => nearest busy housekeepers(otherwise, fallback to any housekeepers) unpinned timers(full dynticks, busy) => nearest busy housekeepers(otherwise, fallback to any housekeepers) unpinned timers(housekeepers, idle) => nearest busy housekeepers(otherwise, fallback to itself) Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) [ Fixed the changelog. ] Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Fixes: 'commit 9642d18eee2c ("nohz: Affine unpinned timers to housekeepers")' Link: http://lkml.kernel.org/r/1462344334-8303-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 67d1e1597d9c..ea863bc22caf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -627,7 +627,10 @@ int get_nohz_timer_target(void) rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { - if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) { + if (cpu == i) + continue; + + if (!idle_cpu(i) && is_housekeeping_cpu(i)) { cpu = i; goto unlock; } From 6cb6e951522723e17cad4419eef2cdc75c7ecc95 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Mon, 8 Aug 2016 17:19:38 -0700 Subject: [PATCH 372/813] iio: fix sched WARNING "do not call blocking ops when !TASK_RUNNING" commit fcf68f3c0bb2a541aa47a2a38b8939edf84fd529 upstream. When using CONFIG_DEBUG_ATOMIC_SLEEP, the scheduler nicely points out that we're calling sleeping primitives within the wait_event loop, which means we might clobber the task state: [ 10.831289] do not call blocking ops when !TASK_RUNNING; state=1 set at [] [ 10.845531] ------------[ cut here ]------------ [ 10.850161] WARNING: at kernel/sched/core.c:7630 ...
[ 12.164333] ---[ end trace 45409966a9a76438 ]--- [ 12.168942] Call trace: [ 12.171391] [] __might_sleep+0x64/0x90 [ 12.176699] [] mutex_lock_nested+0x50/0x3fc [ 12.182440] [] iio_kfifo_buf_data_available+0x28/0x4c [ 12.189043] [] iio_buffer_ready+0x60/0xe0 [ 12.194608] [] iio_buffer_read_first_n_outer+0x108/0x1a8 [ 12.201474] [] __vfs_read+0x58/0x114 [ 12.206606] [] vfs_read+0x94/0x118 [ 12.211564] [] SyS_read+0x64/0xb4 [ 12.216436] [] el0_svc_naked+0x24/0x28 To avoid this, we should (a la https://lwn.net/Articles/628628/) use the wait_woken() function, which avoids the nested sleeping while still handling races between waiting / wake-events. Signed-off-by: Brian Norris Reviewed-by: Lars-Peter Clausen Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/industrialio-buffer.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c index 0f6f63b20263..7afd226a3321 100644 --- a/drivers/iio/industrialio-buffer.c +++ b/drivers/iio/industrialio-buffer.c @@ -107,6 +107,7 @@ ssize_t iio_buffer_read_first_n_outer(struct file *filp, char __user *buf, { struct iio_dev *indio_dev = filp->private_data; struct iio_buffer *rb = indio_dev->buffer; + DEFINE_WAIT_FUNC(wait, woken_wake_function); size_t datum_size; size_t to_wait; int ret; @@ -131,19 +132,29 @@ ssize_t iio_buffer_read_first_n_outer(struct file *filp, char __user *buf, else to_wait = min_t(size_t, n / datum_size, rb->watermark); + add_wait_queue(&rb->pollq, &wait); do { - ret = wait_event_interruptible(rb->pollq, - iio_buffer_ready(indio_dev, rb, to_wait, n / datum_size)); - if (ret) - return ret; + if (!indio_dev->info) { + ret = -ENODEV; + break; + } - if (!indio_dev->info) - return -ENODEV; + if (!iio_buffer_ready(indio_dev, rb, to_wait, n / datum_size)) { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + wait_woken(&wait, TASK_INTERRUPTIBLE, + MAX_SCHEDULE_TIMEOUT); + continue; + } ret = rb->access->read_first_n(rb, n, buf); if (ret == 0 && (filp->f_flags & O_NONBLOCK)) ret = -EAGAIN; } while (ret == 0); + remove_wait_queue(&rb->pollq, &wait); return ret; } From 4e9db9e6bfdf926bb96780cbc4ae842ea1cc3edf Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Fri, 12 Aug 2016 19:25:21 -0400 Subject: [PATCH 373/813] drm/amdgpu: Change GART offset to 64-bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit cab0b8d50e9bbef62c04067072c953433a87a9ff upstream. The GART aperture size can be bigger than 4GB. Therefore the offset used in amdgpu_gart_bind and amdgpu_gart_unbind must be 64-bit. 
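A small illustration (not part of the patch) of the truncation being fixed, assuming 4 KiB pages, so page indices at or above 0x100000 correspond to GART offsets of 4 GiB and up:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical page index just past the 4 GiB mark. */
int main(void)
{
	uint64_t page = 0x100001;
	uint32_t bad  = (uint32_t)(page << 12);	/* wraps to 0x1000 */
	uint64_t good = page << 12;		/* 0x100001000 */

	printf("32-bit offset: 0x%x, 64-bit offset: 0x%llx\n",
	       bad, (unsigned long long)good);
	return 0;
}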
Reviewed-by: Christian König Signed-off-by: Felix Kuehling Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 053fc2f465df..ff5566c69f7d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -710,9 +710,9 @@ int amdgpu_gart_table_vram_pin(struct amdgpu_device *adev); void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev); int amdgpu_gart_init(struct amdgpu_device *adev); void amdgpu_gart_fini(struct amdgpu_device *adev); -void amdgpu_gart_unbind(struct amdgpu_device *adev, unsigned offset, +void amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset, int pages); -int amdgpu_gart_bind(struct amdgpu_device *adev, unsigned offset, +int amdgpu_gart_bind(struct amdgpu_device *adev, uint64_t offset, int pages, struct page **pagelist, dma_addr_t *dma_addr, uint32_t flags); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c index 7312d729d300..22a613a95bf0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c @@ -221,7 +221,7 @@ void amdgpu_gart_table_vram_free(struct amdgpu_device *adev) * Unbinds the requested pages from the gart page table and * replaces them with the dummy page (all asics). */ -void amdgpu_gart_unbind(struct amdgpu_device *adev, unsigned offset, +void amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset, int pages) { unsigned t; @@ -269,7 +269,7 @@ void amdgpu_gart_unbind(struct amdgpu_device *adev, unsigned offset, * (all asics). * Returns 0 for success, -EINVAL for failure. */ -int amdgpu_gart_bind(struct amdgpu_device *adev, unsigned offset, +int amdgpu_gart_bind(struct amdgpu_device *adev, uint64_t offset, int pages, struct page **pagelist, dma_addr_t *dma_addr, uint32_t flags) { From 65317dbcf37367350276967ff9e3b4647054cb64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 17 Aug 2016 09:45:25 +0200 Subject: [PATCH 374/813] drm/amdgpu: fix amdgpu_move_blit on 32bit systems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 815d27a46f3119f74fe01fe10bf683aa5bc55597 upstream. This bug seems to be present for a very long time. 
Signed-off-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 1cbb16e15307..475c38fe9245 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -233,8 +233,8 @@ static int amdgpu_move_blit(struct ttm_buffer_object *bo, adev = amdgpu_get_adev(bo->bdev); ring = adev->mman.buffer_funcs_ring; - old_start = old_mem->start << PAGE_SHIFT; - new_start = new_mem->start << PAGE_SHIFT; + old_start = (u64)old_mem->start << PAGE_SHIFT; + new_start = (u64)new_mem->start << PAGE_SHIFT; switch (old_mem->mem_type) { case TTM_PL_VRAM: From 9c22155c5a39dc452c59534195712f57f512b063 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 24 Aug 2016 12:31:36 -0400 Subject: [PATCH 375/813] drm/amdgpu: avoid a possible array overflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e1718d97aa88ea44a6a8f50ff464253dd0dacf01 upstream. When looking up the connector type make sure the index is valid. Avoids a later crash if we read past the end of the array. Workaround for bug: https://bugs.freedesktop.org/show_bug.cgi?id=97460 Reviewed-by: Christian König Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c index 0aaa457a1710..7c9848c9b71c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c @@ -331,6 +331,12 @@ bool amdgpu_atombios_get_connector_info_from_object_table(struct amdgpu_device * (le16_to_cpu(path->usConnObjectId) & OBJECT_TYPE_MASK) >> OBJECT_TYPE_SHIFT; + if (con_obj_id >= ARRAY_SIZE(object_connector_convert)) { + DRM_ERROR("invalid con_obj_id %d for device tag 0x%04x\n", + con_obj_id, le16_to_cpu(path->usDeviceTag)); + continue; + } + connector_type = object_connector_convert[con_obj_id]; connector_object_id = con_obj_id; From 2b2627113eee3050a0744a98940b987ad1341c0b Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 24 Aug 2016 13:04:15 -0400 Subject: [PATCH 376/813] drm/amdgpu: skip TV/CV in display parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 611a1507fe8569ce1adab3abc982ea58ab559fb9 upstream. No asics supported by amdgpu support analog TV. 
Workaround for bug: https://bugs.freedesktop.org/show_bug.cgi?id=97460 Reviewed-by: Christian König Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c index 7c9848c9b71c..51a9942cdb40 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c @@ -331,6 +331,13 @@ bool amdgpu_atombios_get_connector_info_from_object_table(struct amdgpu_device * (le16_to_cpu(path->usConnObjectId) & OBJECT_TYPE_MASK) >> OBJECT_TYPE_SHIFT; + /* Skip TV/CV support */ + if ((le16_to_cpu(path->usDeviceTag) == + ATOM_DEVICE_TV1_SUPPORT) || + (le16_to_cpu(path->usDeviceTag) == + ATOM_DEVICE_CV_SUPPORT)) + continue; + if (con_obj_id >= ARRAY_SIZE(object_connector_convert)) { DRM_ERROR("invalid con_obj_id %d for device tag 0x%04x\n", con_obj_id, le16_to_cpu(path->usDeviceTag)); From 2d4ab6c1368a8bdf2d4c8a2980f028f6f0fb142e Mon Sep 17 00:00:00 2001 From: jimqu Date: Tue, 30 Aug 2016 08:59:42 +0800 Subject: [PATCH 377/813] drm/amd/amdgpu: sdma resume fail during S4 on CI commit 10ea9434065e56fe14287f89258ecf2fb684ed1a upstream. SDMA could fail in the thaw() and restore() processes, so do a software reset if an SDMA engine is busy. Signed-off-by: JimQu Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/cik_sdma.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c index 5f712ceddf08..c568293cb6c1 100644 --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c @@ -52,6 +52,7 @@ static void cik_sdma_set_ring_funcs(struct amdgpu_device *adev); static void cik_sdma_set_irq_funcs(struct amdgpu_device *adev); static void cik_sdma_set_buffer_funcs(struct amdgpu_device *adev); static void cik_sdma_set_vm_pte_funcs(struct amdgpu_device *adev); +static int cik_sdma_soft_reset(void *handle); MODULE_FIRMWARE("radeon/bonaire_sdma.bin"); MODULE_FIRMWARE("radeon/bonaire_sdma1.bin"); @@ -1030,6 +1031,8 @@ static int cik_sdma_resume(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; + cik_sdma_soft_reset(handle); + return cik_sdma_hw_init(adev); } From d6af5abb8d5d0690149ff2dab62c830521c6b795 Mon Sep 17 00:00:00 2001 From: Chunming Zhou Date: Tue, 30 Aug 2016 17:59:11 +0800 Subject: [PATCH 378/813] drm/amdgpu: record error code when ring test failed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 1f703e6679f373f5bba4efe7093aa82e91af4037 upstream. Otherwise we may miss errors.
Signed-off-by: Chunming Zhou Reviewed-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c index 9e25edafa721..c77a1ebfc632 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c @@ -288,7 +288,7 @@ void amdgpu_ib_pool_fini(struct amdgpu_device *adev) int amdgpu_ib_ring_tests(struct amdgpu_device *adev) { unsigned i; - int r; + int r, ret = 0; for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = adev->rings[i]; @@ -309,10 +309,11 @@ int amdgpu_ib_ring_tests(struct amdgpu_device *adev) } else { /* still not good, but we can live with it */ DRM_ERROR("amdgpu: failed testing IB on ring %d (%d).\n", i, r); + ret = r; } } } - return 0; + return ret; } /* From 137f6bac13992163e26a682fe114878c4066fea4 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Fri, 5 Aug 2016 19:04:40 +0100 Subject: [PATCH 379/813] drm/i915: fix aliasing_ppgtt leak commit 3871f42a57efcdc6a9da751a8cb6fa196c212289 upstream. In i915_ggtt_cleanup_hw we need to remember to free aliasing_ppgtt. This fixes the following kmemleak message: unreferenced object 0xffff880213cca000 (size 8192): comm "modprobe", pid 1298, jiffies 4294745402 (age 703.930s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x4e/0xb0 [] kmem_cache_alloc_trace+0x142/0x1d0 [] i915_gem_init_ggtt+0x10f/0x210 [i915] [] i915_gem_init+0x5b/0xd0 [i915] [] i915_driver_load+0x97a/0x1460 [i915] [] i915_pci_probe+0x4f/0x70 [i915] [] local_pci_probe+0x45/0xa0 [] pci_device_probe+0x103/0x150 [] driver_probe_device+0x22c/0x440 [] __driver_attach+0xd1/0xf0 [] bus_for_each_dev+0x6c/0xc0 [] driver_attach+0x1e/0x20 [] bus_add_driver+0x1c3/0x280 [] driver_register+0x60/0xe0 [] __pci_register_driver+0x4c/0x50 [] 0xffffffffa013605b Signed-off-by: Matthew Auld Reviewed-by: Chris Wilson Fixes: b18b6bde300e ("drm/i915/bdw: Free PPGTT struct") Signed-off-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/1470420280-21417-1-git-send-email-matthew.auld@intel.com (cherry picked from commit cb7f27601c81a1e0454e9461e96f65b31fafbea0) Signed-off-by: Jani Nikula Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_gem_gtt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index 86c7500454b4..b37fe0df743e 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -2747,6 +2747,7 @@ void i915_global_gtt_cleanup(struct drm_device *dev) struct i915_hw_ppgtt *ppgtt = dev_priv->mm.aliasing_ppgtt; ppgtt->base.cleanup(&ppgtt->base); + kfree(ppgtt); } if (drm_mm_initialized(&vm->mm)) { From 413d5877ef6f634f4f0346d75ccfca537b04518a Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 25 Feb 2016 22:04:38 +0530 Subject: [PATCH 380/813] ARC: build: Better way to detect ISA compatible toolchain commit 20d780374c81cf237834af2202c26df2100ddd69 upstream. The ARC architecture has 2 instruction sets: ARCompact/ARCv2. While the same gcc supports compiling for either (using appropriate toggles), we can't use the same toolchain to build the kernel because libgcc needs to be unique and the toolchain (uClibc based) is not multilibbed.
The uClibc toolchain is convenient since it allows all of userspace and the kernel to be built with a single install for an ISA. This however means two gnu installs (with the same triplet prefix) are needed to build for the two ISAs, and both need to be in PATH. As developers we keep switching the builds, but would occasionally fail to update the PATH, leading to use of the wrong tools. And this would only show up at the end of the kernel build when linking an incompatible libgcc. So the initial solution was to have gcc define a special preprocessor macro DEFAULT_CPU_xxx which is unique to the default toolchain configuration. Claudiu proposed using grep for an existing preprocessor macro which is again uniquely defined per ISA. Cc: Michal Marek Suggested-by: Claudiu Zissulescu Signed-off-by: Vineet Gupta Signed-off-by: Greg Kroah-Hartman --- arch/arc/Makefile | 14 ++++++++++++++ arch/arc/include/asm/arcregs.h | 6 ------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/arc/Makefile b/arch/arc/Makefile index 209d8451e23d..c05ea2b54276 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -18,6 +18,20 @@ cflags-y += -fno-common -pipe -fno-builtin -D__linux__ cflags-$(CONFIG_ISA_ARCOMPACT) += -mA7 cflags-$(CONFIG_ISA_ARCV2) += -mcpu=archs +is_700 = $(shell $(CC) -dM -E - < /dev/null | grep -q "ARC700" && echo 1 || echo 0) + +ifdef CONFIG_ISA_ARCOMPACT +ifeq ($(is_700), 0) + $(error Toolchain not configured for ARCompact builds) +endif +endif + +ifdef CONFIG_ISA_ARCV2 +ifeq ($(is_700), 1) + $(error Toolchain not configured for ARCv2 builds) +endif +endif + ifdef CONFIG_ARC_CURR_IN_REG # For a global register defintion, make sure it gets passed to every file # We had a customer reported bug where some code built in kernel was NOT using diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index 7fac7d85ed6a..2c30a016cf15 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -374,12 +374,6 @@ static inline int is_isa_arcompact(void) return IS_ENABLED(CONFIG_ISA_ARCOMPACT); } -#if defined(CONFIG_ISA_ARCOMPACT) && !defined(_CPU_DEFAULT_A7) -#error "Toolchain not configured for ARCompact builds" -#elif defined(CONFIG_ISA_ARCV2) && !defined(_CPU_DEFAULT_HS) -#error "Toolchain not configured for ARCv2 builds" -#endif - #endif /* __ASEMBLY__ */ #endif /* _ASM_ARC_ARCREGS_H */ From 45a945050dfa9619c3223add776c1b9d595a0655 Mon Sep 17 00:00:00 2001 From: Liav Rehana Date: Tue, 16 Aug 2016 10:55:35 +0300 Subject: [PATCH 381/813] ARC: use correct offset in pt_regs for saving/restoring user mode r25 commit 86147e3cfa5e118b61e78f4f0bf29e920dcbd477 upstream. User mode callee regs are explicitly collected before signal delivery or breakpoint trap. r25 is special for the kernel as it serves as the task pointer, so the user mode value is clobbered very early. It is saved in pt_regs, where generally only scratch (aka caller saved) regs are saved. The code to access the corresponding pt_regs location had a subtle bug as it was using load/store with scaling of offset, whereas the offset was already byte-wise correct.
So fix this by replacing LD.AS with a standard LD (the .AS address-scaling variant multiplies the offset by the access size, so the already byte-accurate offset ended up scaled by the word size). Signed-off-by: Liav Rehana Reviewed-by: Alexey Brodkin [vgupta: rewrote title and commit log] Signed-off-by: Vineet Gupta Signed-off-by: Greg Kroah-Hartman --- arch/arc/include/asm/entry.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arc/include/asm/entry.h b/arch/arc/include/asm/entry.h index ad7860c5ce15..51597f344a62 100644 --- a/arch/arc/include/asm/entry.h +++ b/arch/arc/include/asm/entry.h @@ -142,7 +142,7 @@ #ifdef CONFIG_ARC_CURR_IN_REG ; Retrieve orig r25 and save it with rest of callee_regs - ld.as r12, [r12, PT_user_r25] + ld r12, [r12, PT_user_r25] PUSH r12 #else PUSH r25 @@ -198,7 +198,7 @@ ; SP is back to start of pt_regs #ifdef CONFIG_ARC_CURR_IN_REG - st.as r12, [sp, PT_user_r25] + st r12, [sp, PT_user_r25] #endif .endm From e44f5b5386f49f00d8d62880c28a814e8220c8ed Mon Sep 17 00:00:00 2001 From: Daniel Mentz Date: Thu, 4 Aug 2016 17:56:53 -0700 Subject: [PATCH 382/813] ARC: Call trace_hardirqs_on() before enabling irqs commit 18b43e89d295cc65151c505c643c98fb2c320e59 upstream. trace_hardirqs_on_caller() in lockdep.c expects to be called before, not after interrupts are actually enabled. The following comment in kernel/locking/lockdep.c substantiates this claim: " /* * We're enabling irqs and according to our state above irqs weren't * already enabled, yet we find the hardware thinks they are in fact * enabled.. someone messed up their IRQ state tracing. */ " An example can be found in include/linux/irqflags.h: do { trace_hardirqs_on(); raw_local_irq_enable(); } while (0) Without this change, we hit the following DEBUG_LOCKS_WARN_ON. [ 7.760000] ------------[ cut here ]------------ [ 7.760000] WARNING: CPU: 0 PID: 1 at kernel/locking/lockdep.c:2711 resume_user_mode_begin+0x48/0xf0 [ 7.770000] DEBUG_LOCKS_WARN_ON(!irqs_disabled()) [ 7.780000] Modules linked in: [ 7.780000] CPU: 0 PID: 1 Comm: init Not tainted 4.7.0-00003-gc668bb9-dirty #366 [ 7.790000] [ 7.790000] Stack Trace: [ 7.790000] arc_unwind_core.constprop.1+0xa4/0x118 [ 7.800000] warn_slowpath_fmt+0x72/0x158 [ 7.800000] resume_user_mode_begin+0x48/0xf0 [ 7.810000] ---[ end trace 6f6a7a8fae20d2f0 ]--- Signed-off-by: Daniel Mentz Signed-off-by: Vineet Gupta Signed-off-by: Greg Kroah-Hartman --- arch/arc/include/asm/irqflags-compact.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/include/asm/irqflags-compact.h b/arch/arc/include/asm/irqflags-compact.h index c1d36458bfb7..4c6eed80cd8b 100644 --- a/arch/arc/include/asm/irqflags-compact.h +++ b/arch/arc/include/asm/irqflags-compact.h @@ -188,10 +188,10 @@ static inline int arch_irqs_disabled(void) .endm .macro IRQ_ENABLE scratch + TRACE_ASM_IRQ_ENABLE lr \scratch, [status32] or \scratch, \scratch, (STATUS_E1_MASK | STATUS_E2_MASK) flag \scratch - TRACE_ASM_IRQ_ENABLE .endm #endif /* __ASSEMBLY__ */ From 03551c85c1c01bf7ec58218a2afb225d98da0b97 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Mon, 13 Jun 2016 16:38:27 +0200 Subject: [PATCH 383/813] ARC: Elide redundant setup of DMA callbacks commit 45c3b08a117e2232fc8d7b9e849ead36386f4f96 upstream. For resources shared by all cores such as SLC and IOC, only the master core needs to do any setup / enabling / disabling, etc.
Signed-off-by: Vineet Gupta Signed-off-by: Greg Kroah-Hartman --- arch/arc/mm/cache.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index ff7ff6cbb811..aaf1e2d1d900 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -914,6 +914,15 @@ void arc_cache_init(void) printk(arc_cache_mumbojumbo(0, str, sizeof(str))); + /* + * Only master CPU needs to execute rest of function: + * - Assume SMP so all cores will have same cache config so + * any geomtry checks will be same for all + * - IOC setup / dma callbacks only need to be setup once + */ + if (cpu) + return; + if (IS_ENABLED(CONFIG_ARC_HAS_ICACHE)) { struct cpuinfo_arc_cache *ic = &cpuinfo_arc700[cpu].icache; From e4878ef66e5b8d01d6734b1952f9abb3eeea454c Mon Sep 17 00:00:00 2001 From: Dave Carroll Date: Fri, 5 Aug 2016 13:44:10 -0600 Subject: [PATCH 384/813] aacraid: Check size values after double-fetch from user commit fa00c437eef8dc2e7b25f8cd868cfa405fcc2bb3 upstream. In aacraid's ioctl_send_fib() we do two fetches from userspace, one to get the fib header's size and one for the fib itself. Later we use the size field from the second fetch to further process the fib. If for some reason the size from the second fetch is different from the first fetch, we may encounter an out-of-bounds access in aac_fib_send(). We also check the sender size to ensure it is not out of bounds. This was reported in https://bugzilla.kernel.org/show_bug.cgi?id=116751 and was assigned CVE-2016-6480. Reported-by: Pengfei Wang Fixes: 7c00ffa31 '[SCSI] 2.6 aacraid: Variable FIB size (updated patch)' Signed-off-by: Dave Carroll Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/aacraid/commctrl.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/aacraid/commctrl.c b/drivers/scsi/aacraid/commctrl.c index 54195a117f72..f78cc943d230 100644 --- a/drivers/scsi/aacraid/commctrl.c +++ b/drivers/scsi/aacraid/commctrl.c @@ -63,7 +63,7 @@ static int ioctl_send_fib(struct aac_dev * dev, void __user *arg) struct fib *fibptr; struct hw_fib * hw_fib = (struct hw_fib *)0; dma_addr_t hw_fib_pa = (dma_addr_t)0LL; - unsigned size; + unsigned int size, osize; int retval; if (dev->in_reset) { @@ -87,7 +87,8 @@ static int ioctl_send_fib(struct aac_dev * dev, void __user *arg) * will not overrun the buffer when we copy the memory. Return * an error if we would. */ - size = le16_to_cpu(kfib->header.Size) + sizeof(struct aac_fibhdr); + osize = size = le16_to_cpu(kfib->header.Size) + + sizeof(struct aac_fibhdr); if (size < le16_to_cpu(kfib->header.SenderSize)) size = le16_to_cpu(kfib->header.SenderSize); if (size > dev->max_fib_size) { @@ -118,6 +119,14 @@ static int ioctl_send_fib(struct aac_dev * dev, void __user *arg) goto cleanup; } + /* Sanity check the second copy */ + if ((osize != le16_to_cpu(kfib->header.Size) + + sizeof(struct aac_fibhdr)) + || (size < le16_to_cpu(kfib->header.SenderSize))) { + retval = -EINVAL; + goto cleanup; + } + if (kfib->header.Command == cpu_to_le16(TakeABreakPt)) { aac_adapter_interrupt(dev); /* From d91c348e4c3a011849e309cb76a6fdc714935ea4 Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Fri, 15 Jul 2016 16:28:41 -0700 Subject: [PATCH 385/813] mfd: cros_ec: Add cros_ec_cmd_xfer_status() helper commit 9798ac6d32c1a32d6d92d853ff507d2d39c4300c upstream. So that callers of cros_ec_cmd_xfer() don't have to repeat boilerplate code when checking for errors from the EC side.
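As an illustration of that boilerplate, a typical caller today has to check both the transfer status and the EC's own result code; a sketch (assuming msg has already been filled in):

	ret = cros_ec_cmd_xfer(ec_dev, msg);
	if (ret < 0)
		return ret;		/* transport error */
	if (msg->result != EC_RES_SUCCESS)
		return -EPROTO;		/* EC rejected the command */

With the helper added below, this collapses to a single cros_ec_cmd_xfer_status() call.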
Signed-off-by: Tomeu Vizoso Reviewed-by: Benson Leung Signed-off-by: Brian Norris Acked-by: Lee Jones Tested-by: Enric Balletbo i Serra Signed-off-by: Thierry Reding Signed-off-by: Greg Kroah-Hartman --- drivers/platform/chrome/cros_ec_proto.c | 17 +++++++++++++++++ include/linux/mfd/cros_ec.h | 15 +++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c index 990308ca384f..92430f781eb7 100644 --- a/drivers/platform/chrome/cros_ec_proto.c +++ b/drivers/platform/chrome/cros_ec_proto.c @@ -380,3 +380,20 @@ int cros_ec_cmd_xfer(struct cros_ec_device *ec_dev, return ret; } EXPORT_SYMBOL(cros_ec_cmd_xfer); + +int cros_ec_cmd_xfer_status(struct cros_ec_device *ec_dev, + struct cros_ec_command *msg) +{ + int ret; + + ret = cros_ec_cmd_xfer(ec_dev, msg); + if (ret < 0) { + dev_err(ec_dev->dev, "Command xfer error (err:%d)\n", ret); + } else if (msg->result != EC_RES_SUCCESS) { + dev_dbg(ec_dev->dev, "Command result (err: %d)\n", msg->result); + return -EPROTO; + } + + return ret; +} +EXPORT_SYMBOL(cros_ec_cmd_xfer_status); diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h index 494682ce4bf3..3ab3cede28ea 100644 --- a/include/linux/mfd/cros_ec.h +++ b/include/linux/mfd/cros_ec.h @@ -223,6 +223,21 @@ int cros_ec_check_result(struct cros_ec_device *ec_dev, int cros_ec_cmd_xfer(struct cros_ec_device *ec_dev, struct cros_ec_command *msg); +/** + * cros_ec_cmd_xfer_status - Send a command to the ChromeOS EC + * + * This function is identical to cros_ec_cmd_xfer, except it returns success + * status only if both the command was transmitted successfully and the EC + * replied with success status. It's not necessary to check msg->result when + * using this function. + * + * @ec_dev: EC device + * @msg: Message to write + * @return: Num. of bytes transferred on success, <0 on failure + */ +int cros_ec_cmd_xfer_status(struct cros_ec_device *ec_dev, + struct cros_ec_command *msg); + /** * cros_ec_remove - Remove a ChromeOS EC * From d489412c69e3ff624413c516cfb7bea18eed6a6c Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Wed, 10 Aug 2016 13:37:18 -0700 Subject: [PATCH 386/813] i2c: cros-ec-tunnel: Fix usage of cros_ec_cmd_xfer() commit 4d01d88019261d05ec3bff5f1a6013393faa3b9e upstream. cros_ec_cmd_xfer returns success status if the command transport completes successfully, but the execution result is incorrectly ignored. In many cases, the execution result is assumed to be successful, leading to ignored errors and operating on uninitialized data. We've recently introduced the cros_ec_cmd_xfer_status() helper to avoid these problems. Let's use it. 
[Regarding the 'Fixes' tag; there is significant refactoring since the driver's introduction, but the underlying logical error exists throughout I believe] Fixes: 9d230c9e4f4e ("i2c: ChromeOS EC tunnel driver") Signed-off-by: Brian Norris Reviewed-by: Javier Martinez Canillas Reviewed-by: Guenter Roeck Signed-off-by: Wolfram Sang Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-cros-ec-tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-cros-ec-tunnel.c b/drivers/i2c/busses/i2c-cros-ec-tunnel.c index a0d95ff682ae..2d5ff86398d0 100644 --- a/drivers/i2c/busses/i2c-cros-ec-tunnel.c +++ b/drivers/i2c/busses/i2c-cros-ec-tunnel.c @@ -215,7 +215,7 @@ static int ec_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg i2c_msgs[], msg->outsize = request_len; msg->insize = response_len; - result = cros_ec_cmd_xfer(bus->ec, msg); + result = cros_ec_cmd_xfer_status(bus->ec, msg); if (result < 0) { dev_err(dev, "Error transferring EC i2c message %d\n", result); goto exit; From af889001ff22390aa739ee5f15a1ed563f75486f Mon Sep 17 00:00:00 2001 From: Gavin Li Date: Fri, 12 Aug 2016 00:52:56 -0700 Subject: [PATCH 387/813] cdc-acm: fix wrong pipe type on rx interrupt xfers commit add125054b8727103631dce116361668436ef6a7 upstream. This fixes the "BOGUS urb xfer" warning logged by usb_submit_urb(). Signed-off-by: Gavin Li Acked-by: Oliver Neukum Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-acm.c | 5 ++--- drivers/usb/class/cdc-acm.h | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index d37fdcc3143c..7f374369e539 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -1336,7 +1336,6 @@ made_compressed_probe: spin_lock_init(&acm->write_lock); spin_lock_init(&acm->read_lock); mutex_init(&acm->mutex); - acm->rx_endpoint = usb_rcvbulkpipe(usb_dev, epread->bEndpointAddress); acm->is_int_ep = usb_endpoint_xfer_int(epread); if (acm->is_int_ep) acm->bInterval = epread->bInterval; @@ -1376,14 +1375,14 @@ made_compressed_probe: urb->transfer_dma = rb->dma; if (acm->is_int_ep) { usb_fill_int_urb(urb, acm->dev, - acm->rx_endpoint, + usb_rcvintpipe(usb_dev, epread->bEndpointAddress), rb->base, acm->readsize, acm_read_bulk_callback, rb, acm->bInterval); } else { usb_fill_bulk_urb(urb, acm->dev, - acm->rx_endpoint, + usb_rcvbulkpipe(usb_dev, epread->bEndpointAddress), rb->base, acm->readsize, acm_read_bulk_callback, rb); diff --git a/drivers/usb/class/cdc-acm.h b/drivers/usb/class/cdc-acm.h index ccfaba9ab4e4..b30ac5fcde68 100644 --- a/drivers/usb/class/cdc-acm.h +++ b/drivers/usb/class/cdc-acm.h @@ -95,7 +95,6 @@ struct acm { struct urb *read_urbs[ACM_NR]; struct acm_rb read_buffers[ACM_NR]; int rx_buflimit; - int rx_endpoint; spinlock_t read_lock; int write_used; /* number of non-empty write buffers */ int transmitting; From 7386f927cf74aa8c829323706ff70c7a53b32619 Mon Sep 17 00:00:00 2001 From: Greg Edwards Date: Sat, 30 Jul 2016 10:06:26 -0600 Subject: [PATCH 388/813] mpt3sas: Fix resume on WarpDrive flash cards commit ce7c6c9e1d997a2670aead3a7b87f4df32c11118 upstream. mpt3sas crashes on resume after suspend with WarpDrive flash cards. The reply_post_host_index array is not set back up after the resume, and we dereference a stale pointer in _base_interrupt().
[ 47.309711] BUG: unable to handle kernel paging request at ffffc90001f8006c [ 47.318289] IP: [] _base_interrupt+0x49f/0xa30 [mpt3sas] [ 47.326749] PGD 41ccaa067 PUD 41ccab067 PMD 3466c067 PTE 0 [ 47.333848] Oops: 0002 [#1] SMP ... [ 47.452708] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.7.0 #6 [ 47.460506] Hardware name: Dell Inc. OptiPlex 990/06D7TR, BIOS A18 09/24/2013 [ 47.469629] task: ffffffff81c0d500 ti: ffffffff81c00000 task.ti: ffffffff81c00000 [ 47.479112] RIP: 0010:[] [] _base_interrupt+0x49f/0xa30 [mpt3sas] [ 47.490466] RSP: 0018:ffff88041d203e30 EFLAGS: 00010002 [ 47.497801] RAX: 0000000000000001 RBX: ffff880033f4c000 RCX: 0000000000000001 [ 47.506973] RDX: ffffc90001f8006c RSI: 0000000000000082 RDI: 0000000000000082 [ 47.516141] RBP: ffff88041d203eb0 R08: ffff8804118e2820 R09: 0000000000000001 [ 47.525300] R10: 0000000000000001 R11: 00000000100c0000 R12: 0000000000000000 [ 47.534457] R13: ffff880412c487e0 R14: ffff88041a8987d8 R15: 0000000000000001 [ 47.543632] FS: 0000000000000000(0000) GS:ffff88041d200000(0000) knlGS:0000000000000000 [ 47.553796] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 47.561632] CR2: ffffc90001f8006c CR3: 0000000001c06000 CR4: 00000000000406f0 [ 47.570883] Stack: [ 47.575015] 000000001d211228 ffff88041d2100c0 ffff8800c47d8130 0000000000000100 [ 47.584625] ffff8804100c0000 100c000000000000 ffff88041a8992a0 ffff88041a8987f8 [ 47.594230] ffff88041d203e00 ffffffff81111e55 000000000000038c ffff880414ad4280 [ 47.603862] Call Trace: [ 47.608474] [ 47.610413] [] ? call_timer_fn+0x35/0x120 [ 47.620539] [] handle_irq_event_percpu+0x7f/0x1c0 [ 47.629061] [] handle_irq_event+0x2c/0x50 [ 47.636859] [] handle_edge_irq+0x6f/0x130 [ 47.644654] [] handle_irq+0x73/0x120 [ 47.652011] [] ? atomic_notifier_call_chain+0x1a/0x20 [ 47.660854] [] do_IRQ+0x4b/0xd0 [ 47.667777] [] common_interrupt+0x8c/0x8c [ 47.675635] Move the reply_post_host_index array setup into mpt3sas_base_map_resources(), which is also in the resume path. Signed-off-by: Greg Edwards Acked-by: Chaitra P B Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/mpt3sas/mpt3sas_base.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index 11393ebf1a68..356233f86064 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -2155,6 +2155,17 @@ mpt3sas_base_map_resources(struct MPT3SAS_ADAPTER *ioc) } else ioc->msix96_vector = 0; + if (ioc->is_warpdrive) { + ioc->reply_post_host_index[0] = (resource_size_t __iomem *) + &ioc->chip->ReplyPostHostIndex; + + for (i = 1; i < ioc->cpu_msix_table_sz; i++) + ioc->reply_post_host_index[i] = + (resource_size_t __iomem *) + ((u8 __iomem *)&ioc->chip->Doorbell + (0x4000 + ((i - 1) + * 4))); + } + list_for_each_entry(reply_q, &ioc->reply_queue_list, list) pr_info(MPT3SAS_FMT "%s: IRQ %d\n", reply_q->name, ((ioc->msix_enable) ? 
"PCI-MSI-X enabled" : @@ -5201,17 +5212,6 @@ mpt3sas_base_attach(struct MPT3SAS_ADAPTER *ioc) if (r) goto out_free_resources; - if (ioc->is_warpdrive) { - ioc->reply_post_host_index[0] = (resource_size_t __iomem *) - &ioc->chip->ReplyPostHostIndex; - - for (i = 1; i < ioc->cpu_msix_table_sz; i++) - ioc->reply_post_host_index[i] = - (resource_size_t __iomem *) - ((u8 __iomem *)&ioc->chip->Doorbell + (0x4000 + ((i - 1) - * 4))); - } - pci_set_drvdata(ioc->pdev, ioc->shost); r = _base_get_ioc_facts(ioc, CAN_SLEEP); if (r) From bbaf7193761eb2e7d43a6e3e23e23b923dfee26e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 5 Aug 2016 23:37:34 -0700 Subject: [PATCH 389/813] megaraid_sas: Fix probing cards without io port commit e7f851684efb3377e9c93aca7fae6e76212e5680 upstream. Found one megaraid_sas HBA probe fails, [ 187.235190] scsi host2: Avago SAS based MegaRAID driver [ 191.112365] megaraid_sas 0000:89:00.0: BAR 0: can't reserve [io 0x0000-0x00ff] [ 191.120548] megaraid_sas 0000:89:00.0: IO memory region busy! and the card has resource like, [ 125.097714] pci 0000:89:00.0: [1000:005d] type 00 class 0x010400 [ 125.104446] pci 0000:89:00.0: reg 0x10: [io 0x0000-0x00ff] [ 125.110686] pci 0000:89:00.0: reg 0x14: [mem 0xce400000-0xce40ffff 64bit] [ 125.118286] pci 0000:89:00.0: reg 0x1c: [mem 0xce300000-0xce3fffff 64bit] [ 125.125891] pci 0000:89:00.0: reg 0x30: [mem 0xce200000-0xce2fffff pref] that does not io port resource allocated from BIOS, and kernel can not assign one as io port shortage. The driver is only looking for MEM, and should not fail. It turns out megasas_init_fw() etc are using bar index as mask. index 1 is used as mask 1, so that pci_request_selected_regions() is trying to request BAR0 instead of BAR1. Fix all related reference. Fixes: b6d5d8808b4c ("megaraid_sas: Use lowest memory bar for SR-IOV VF support") Signed-off-by: Yinghai Lu Acked-by: Kashyap Desai Signed-off-by: Martin K. 
Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/megaraid/megaraid_sas_base.c | 6 +++--- drivers/scsi/megaraid/megaraid_sas_fusion.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 00ce3e269a43..e994ff944091 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -4669,7 +4669,7 @@ static int megasas_init_fw(struct megasas_instance *instance) /* Find first memory bar */ bar_list = pci_select_bars(instance->pdev, IORESOURCE_MEM); instance->bar = find_first_bit(&bar_list, sizeof(unsigned long)); - if (pci_request_selected_regions(instance->pdev, instance->bar, + if (pci_request_selected_regions(instance->pdev, 1<<instance->bar, "megasas: LSI")) { dev_printk(KERN_DEBUG, &instance->pdev->dev, "IO memory region busy!\n"); return -EBUSY; @@ -4960,7 +4960,7 @@ fail_ready_state: iounmap(instance->reg_set); fail_ioremap: - pci_release_selected_regions(instance->pdev, instance->bar); + pci_release_selected_regions(instance->pdev, 1<<instance->bar); return -EINVAL; } @@ -4981,7 +4981,7 @@ static void megasas_release_mfi(struct megasas_instance *instance) iounmap(instance->reg_set); - pci_release_selected_regions(instance->pdev, instance->bar); + pci_release_selected_regions(instance->pdev, 1<<instance->bar); } /** diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c index 8d630a552b07..4f391e747be2 100644 --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c @@ -2437,7 +2437,7 @@ megasas_release_fusion(struct megasas_instance *instance) iounmap(instance->reg_set); - pci_release_selected_regions(instance->pdev, instance->bar); + pci_release_selected_regions(instance->pdev, 1<<instance->bar); } /** From 133716877986567ccdd3d3446458bc404e04a709 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sat, 13 Aug 2016 01:28:24 +0000 Subject: [PATCH 390/813] usb: renesas_usbhs: gadget: fix return value check in usbhs_mod_gadget_probe() commit 3295235fd70ed6d594aadee8c892a14f6a4b2d2e upstream. In case of error, the function usb_get_phy() returns ERR_PTR() and never returns NULL. The NULL test in the return value check should be replaced with IS_ERR(). Fixes: b5a2875605ca ("usb: renesas_usbhs: Allow an OTG PHY driver to provide VBUS") Acked-by: Yoshihiro Shimoda Signed-off-by: Wei Yongjun Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/renesas_usbhs/mod_gadget.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/renesas_usbhs/mod_gadget.c b/drivers/usb/renesas_usbhs/mod_gadget.c index 67f7dbda9e79..efc4fae123a4 100644 --- a/drivers/usb/renesas_usbhs/mod_gadget.c +++ b/drivers/usb/renesas_usbhs/mod_gadget.c @@ -1075,7 +1075,7 @@ int usbhs_mod_gadget_probe(struct usbhs_priv *priv) gpriv->transceiver = usb_get_phy(USB_PHY_TYPE_UNDEFINED); dev_info(dev, "%stransceiver found\n", - gpriv->transceiver ? "" : "no "); + !IS_ERR(gpriv->transceiver) ? "" : "no "); /* * CAUTION From 3e7c26db216b565040f2e100cbe4ef6a4af33026 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 16 Aug 2016 09:58:25 +0200 Subject: [PATCH 391/813] gpio: Fix OF build problem on UM commit 2527ecc9195e9c66252af24c4689e8a67cd4ccb9 upstream. The UserMode (UM) Linux build was failing in gpiolib-of as it requires ioremap()/iounmap() to exist, which are absent from UM.
The non-existence of IO memory is negatively defined as CONFIG_NO_IOMEM which means we need to depend on HAS_IOMEM. Cc: Geert Uytterhoeven Reported-by: kbuild test robot Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/gpio/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index b18bea08ff25..469dc378adeb 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -50,6 +50,7 @@ config GPIO_DEVRES config OF_GPIO def_bool y depends on OF + depends on HAS_IOMEM config GPIO_ACPI def_bool y From 104b0d196dc23023dfd8401574585f16ce7ca57a Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 25 Aug 2016 15:17:11 -0700 Subject: [PATCH 392/813] fs/seq_file: fix out-of-bounds read commit 088bf2ff5d12e2e32ee52a4024fec26e582f44d3 upstream. seq_read() is a nasty piece of work, not to mention buggy. It has (I think) an old bug which allows unprivileged userspace to read beyond the end of m->buf. I was getting these: BUG: KASAN: slab-out-of-bounds in seq_read+0xcd2/0x1480 at addr ffff880116889880 Read of size 2713 by task trinity-c2/1329 CPU: 2 PID: 1329 Comm: trinity-c2 Not tainted 4.8.0-rc1+ #96 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.3-0-ge2fc41e-prebuilt.qemu-project.org 04/01/2014 Call Trace: kasan_object_err+0x1c/0x80 kasan_report_error+0x2cb/0x7e0 kasan_report+0x4e/0x80 check_memory_region+0x13e/0x1a0 kasan_check_read+0x11/0x20 seq_read+0xcd2/0x1480 proc_reg_read+0x10b/0x260 do_loop_readv_writev.part.5+0x140/0x2c0 do_readv_writev+0x589/0x860 vfs_readv+0x7b/0xd0 do_readv+0xd8/0x2c0 SyS_readv+0xb/0x10 do_syscall_64+0x1b3/0x4b0 entry_SYSCALL64_slow_path+0x25/0x25 Object at ffff880116889100, in cache kmalloc-4096 size: 4096 Allocated: PID = 1329 save_stack_trace+0x26/0x80 save_stack+0x46/0xd0 kasan_kmalloc+0xad/0xe0 __kmalloc+0x1aa/0x4a0 seq_buf_alloc+0x35/0x40 seq_read+0x7d8/0x1480 proc_reg_read+0x10b/0x260 do_loop_readv_writev.part.5+0x140/0x2c0 do_readv_writev+0x589/0x860 vfs_readv+0x7b/0xd0 do_readv+0xd8/0x2c0 SyS_readv+0xb/0x10 do_syscall_64+0x1b3/0x4b0 return_from_SYSCALL_64+0x0/0x6a Freed: PID = 0 (stack is not available) Memory state around the buggy address: ffff88011688a000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff88011688a080: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >ffff88011688a100: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ^ ffff88011688a180: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88011688a200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Disabling lock debugging due to kernel taint This seems to be the same thing that Dave Jones was seeing here: https://lkml.org/lkml/2016/8/12/334 There are multiple issues here: 1) If we enter the function with a non-empty buffer, there is an attempt to flush it. But it was not clearing m->from after doing so, which means that if we try to do this flush twice in a row without any call to traverse() in between, we are going to be reading from the wrong place -- the splat above, fixed by this patch. 2) If there's a short write to userspace because of page faults, the buffer may already contain multiple lines (i.e. pos has advanced by more than 1), but we don't save the progress that was made so the next call will output what we've already returned previously. Since that is a much less serious issue (and I have a headache after staring at seq_read() for the past 8 hours), I'll leave that for now. 
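To make issue 1) concrete, the flush path at the top of seq_read() copies out of m->buf starting at m->from; a simplified sketch of the fixed logic (mirroring the hunk below):

	/* flush leftover data from a previous read */
	n = min(m->count, size);
	err = copy_to_user(buf, m->buf + m->from, n);
	m->count -= n;
	m->from += n;
	/* ... */
	if (!m->count) {
		m->from = 0;	/* buffer drained: reset the read offset */
		m->index++;
	}

Without that reset, a second flush in a row (with no traverse() call in between) copies from a stale m->from and can read past the end of the buffer.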
Link: http://lkml.kernel.org/r/1471447270-32093-1-git-send-email-vegard.nossum@oracle.com Signed-off-by: Vegard Nossum Reported-by: Dave Jones Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/seq_file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/seq_file.c b/fs/seq_file.c index e85664b7c7d9..d672e2fec459 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -222,8 +222,10 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) size -= n; buf += n; copied += n; - if (!m->count) + if (!m->count) { + m->from = 0; m->index++; + } if (!size) goto Done; } From f31d48a091877f7580c0c6aa70e3d35d9b6e8625 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 8 Aug 2016 22:08:06 -0400 Subject: [PATCH 393/813] btrfs: waiting on qgroup rescan should not always be interruptible commit d06f23d6a947c9abae41dc46be69a56baf36f436 upstream. We wait on qgroup rescan completion in three places: file system shutdown, the quota disable ioctl, and the rescan wait ioctl. If the user sends a signal while we're waiting, we continue happily along. This is expected behavior for the rescan wait ioctl. It's racy in the shutdown path but mostly works due to other unrelated synchronization points. In the quota disable path, it Oopses the kernel pretty much immediately. Signed-off-by: Jeff Mahoney Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Chris Mason Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/ioctl.c | 2 +- fs/btrfs/qgroup.c | 12 +++++++++--- fs/btrfs/qgroup.h | 3 ++- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 41fb43183406..10bc556ddb3d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3811,7 +3811,7 @@ void close_ctree(struct btrfs_root *root) smp_mb(); /* wait for the qgroup rescan worker to stop */ - btrfs_qgroup_wait_for_completion(fs_info); + btrfs_qgroup_wait_for_completion(fs_info, false); /* wait for the uuid_scan task to finish */ down(&fs_info->uuid_tree_rescan_sem); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bfcd87ee8ff5..65f30b3b04f9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5121,7 +5121,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - return btrfs_qgroup_wait_for_completion(root->fs_info); + return btrfs_qgroup_wait_for_completion(root->fs_info, true); } static long _btrfs_ioctl_set_received_subvol(struct file *file, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5279fdae7142..d7829f5ebaf5 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -995,7 +995,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, goto out; fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; - btrfs_qgroup_wait_for_completion(fs_info); + btrfs_qgroup_wait_for_completion(fs_info, false); spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; fs_info->quota_root = NULL; @@ -2467,7 +2467,8 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info) +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, + bool interruptible) { int running; int ret = 0; @@ -2478,9 +2479,14 @@ int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->qgroup_lock); mutex_unlock(&fs_info->qgroup_rescan_lock); - if (running) + if (!running) + 
return 0; + + if (interruptible) ret = wait_for_completion_interruptible( &fs_info->qgroup_rescan_completion); + else + wait_for_completion(&fs_info->qgroup_rescan_completion); return ret; } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index ecb2c143ef75..3d73e4c9c7df 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -46,7 +46,8 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); -int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, + bool interruptible); int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst); int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, From cc79d3982d9c77f36cebe8f1033e6715c55ac726 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 15 Aug 2016 12:10:33 -0400 Subject: [PATCH 394/813] btrfs: properly track when rescan worker is running commit d2c609b834d62f1e91f1635a27dca29f7806d3d6 upstream. The qgroup_flags field is overloaded such that it reflects the on-disk status of qgroups and the runtime state. The BTRFS_QGROUP_STATUS_FLAG_RESCAN flag is used to indicate that a rescan operation is in progress, but if the file system is unmounted while a rescan is running, the rescan operation is paused. If the file system is then mounted read-only, the flag will still be present but the rescan operation will not have been resumed. When we go to umount, btrfs_qgroup_wait_for_completion will see the flag and interpret it to mean that the rescan worker is still running and will wait for a completion that will never come. This patch uses a separate flag to indicate when the worker is running. The locking and state surrounding the qgroup rescan worker needs a lot of attention beyond this patch but this is enough to avoid a hung umount. 
Signed-off-by: Jeff Mahoney Reviewed-by: Qu Wenruo Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 1 + fs/btrfs/qgroup.c | 9 ++++++++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 385b449fd7ed..1391f72c28c3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1770,6 +1770,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *qgroup_rescan_workers; struct completion qgroup_rescan_completion; struct btrfs_work qgroup_rescan_work; + bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */ /* filesystem state */ unsigned long fs_state; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 10bc556ddb3d..85b207d19aa5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2276,6 +2276,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; fs_info->qgroup_ulist = NULL; + fs_info->qgroup_rescan_running = false; mutex_init(&fs_info->qgroup_rescan_lock); } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index d7829f5ebaf5..bcc965ed5fa1 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2283,6 +2283,10 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) int err = -ENOMEM; int ret = 0; + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_rescan_running = true; + mutex_unlock(&fs_info->qgroup_rescan_lock); + path = btrfs_alloc_path(); if (!path) goto out; @@ -2349,6 +2353,9 @@ out: } done: + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_rescan_running = false; + mutex_unlock(&fs_info->qgroup_rescan_lock); complete_all(&fs_info->qgroup_rescan_completion); } @@ -2475,7 +2482,7 @@ int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, mutex_lock(&fs_info->qgroup_rescan_lock); spin_lock(&fs_info->qgroup_lock); - running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN; + running = fs_info->qgroup_rescan_running; spin_unlock(&fs_info->qgroup_lock); mutex_unlock(&fs_info->qgroup_rescan_lock); From d6720176bc99535858a5dfffbea0046a89842262 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 22 Aug 2016 13:25:56 -0700 Subject: [PATCH 395/813] Input: tegra-kbc - fix inverted reset logic commit fae16989be77b09bab86c79233e4b511ea769cea upstream. Commit fe6b0dfaba68 ("Input: tegra-kbc - use reset framework") accidentally converted _deassert to _assert, so there is no code to wake up this hardware.
Fixes: fe6b0dfaba68 ("Input: tegra-kbc - use reset framework") Signed-off-by: Masahiro Yamada Acked-by: Thierry Reding Acked-by: Laxman Dewangan Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/keyboard/tegra-kbc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/keyboard/tegra-kbc.c b/drivers/input/keyboard/tegra-kbc.c index acc5394afb03..29485bc4221c 100644 --- a/drivers/input/keyboard/tegra-kbc.c +++ b/drivers/input/keyboard/tegra-kbc.c @@ -376,7 +376,7 @@ static int tegra_kbc_start(struct tegra_kbc *kbc) /* Reset the KBC controller to clear all previous status.*/ reset_control_assert(kbc->rst); udelay(100); - reset_control_assert(kbc->rst); + reset_control_deassert(kbc->rst); udelay(100); tegra_kbc_config_pins(kbc); From 11dd037e42590ee224658ddddfb715e5ce1d328a Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 25 Jul 2016 11:36:54 -0700 Subject: [PATCH 396/813] Input: i8042 - break load dependency between atkbd/psmouse and i8042 commit 4097461897df91041382ff6fcd2bfa7ee6b2448c upstream. As explained in 1407814240-4275-1-git-send-email-decui@microsoft.com we have a hard load dependency between i8042 and atkbd which prevents keyboard from working on Gen2 Hyper-V VMs. > hyperv_keyboard invokes serio_interrupt(), which needs a valid serio > driver like atkbd.c. atkbd.c depends on libps2.c because it invokes > ps2_command(). libps2.c depends on i8042.c because it invokes > i8042_check_port_owner(). As a result, hyperv_keyboard actually > depends on i8042.c. > > For a Generation 2 Hyper-V VM (meaning no i8042 device emulated), if a > Linux VM (like Arch Linux) happens to configure CONFIG_SERIO_I8042=m > rather than =y, atkbd.ko can't load because i8042.ko can't load(due to > no i8042 device emulated) and finally hyperv_keyboard can't work and > the user can't input: https://bugs.archlinux.org/task/39820 > (Ubuntu/RHEL/SUSE aren't affected since they use CONFIG_SERIO_I8042=y) To break the dependency we move away from using i8042_check_port_owner() and instead allow the serio port owner to specify a mutex that clients should use to serialize the PS/2 command stream. Reported-by: Mark Laws Tested-by: Mark Laws Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/serio/i8042.c | 16 +--------------- drivers/input/serio/libps2.c | 10 ++++------ include/linux/i8042.h | 6 ------ include/linux/serio.h | 24 +++++++++++++++++++----- 4 files changed, 24 insertions(+), 32 deletions(-) diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c index 454195709a82..b4d34086e73f 100644 --- a/drivers/input/serio/i8042.c +++ b/drivers/input/serio/i8042.c @@ -1277,6 +1277,7 @@ static int __init i8042_create_kbd_port(void) serio->start = i8042_start; serio->stop = i8042_stop; serio->close = i8042_port_close; + serio->ps2_cmd_mutex = &i8042_mutex; serio->port_data = port; serio->dev.parent = &i8042_platform_device->dev; strlcpy(serio->name, "i8042 KBD port", sizeof(serio->name)); @@ -1373,21 +1374,6 @@ static void i8042_unregister_ports(void) } } -/* - * Checks whether port belongs to i8042 controller.
- */ -bool i8042_check_port_owner(const struct serio *port) -{ - int i; - - for (i = 0; i < I8042_NUM_PORTS; i++) - if (i8042_ports[i].serio == port) - return true; - - return false; -} -EXPORT_SYMBOL(i8042_check_port_owner); - static void i8042_free_irqs(void) { if (i8042_aux_irq_registered) diff --git a/drivers/input/serio/libps2.c b/drivers/input/serio/libps2.c index 316f2c897101..83e9c663aa67 100644 --- a/drivers/input/serio/libps2.c +++ b/drivers/input/serio/libps2.c @@ -56,19 +56,17 @@ EXPORT_SYMBOL(ps2_sendbyte); void ps2_begin_command(struct ps2dev *ps2dev) { - mutex_lock(&ps2dev->cmd_mutex); + struct mutex *m = ps2dev->serio->ps2_cmd_mutex ?: &ps2dev->cmd_mutex; - if (i8042_check_port_owner(ps2dev->serio)) - i8042_lock_chip(); + mutex_lock(m); } EXPORT_SYMBOL(ps2_begin_command); void ps2_end_command(struct ps2dev *ps2dev) { - if (i8042_check_port_owner(ps2dev->serio)) - i8042_unlock_chip(); + struct mutex *m = ps2dev->serio->ps2_cmd_mutex ?: &ps2dev->cmd_mutex; - mutex_unlock(&ps2dev->cmd_mutex); + mutex_unlock(m); } EXPORT_SYMBOL(ps2_end_command); diff --git a/include/linux/i8042.h b/include/linux/i8042.h index 0f9bafa17a02..d98780ca9604 100644 --- a/include/linux/i8042.h +++ b/include/linux/i8042.h @@ -62,7 +62,6 @@ struct serio; void i8042_lock_chip(void); void i8042_unlock_chip(void); int i8042_command(unsigned char *param, int command); -bool i8042_check_port_owner(const struct serio *); int i8042_install_filter(bool (*filter)(unsigned char data, unsigned char str, struct serio *serio)); int i8042_remove_filter(bool (*filter)(unsigned char data, unsigned char str, @@ -83,11 +82,6 @@ static inline int i8042_command(unsigned char *param, int command) return -ENODEV; } -static inline bool i8042_check_port_owner(const struct serio *serio) -{ - return false; -} - static inline int i8042_install_filter(bool (*filter)(unsigned char data, unsigned char str, struct serio *serio)) { diff --git a/include/linux/serio.h b/include/linux/serio.h index df4ab5de1586..c733cff44e18 100644 --- a/include/linux/serio.h +++ b/include/linux/serio.h @@ -31,7 +31,8 @@ struct serio { struct serio_device_id id; - spinlock_t lock; /* protects critical sections from port's interrupt handler */ + /* Protects critical sections from port's interrupt handler */ + spinlock_t lock; int (*write)(struct serio *, unsigned char); int (*open)(struct serio *); @@ -40,16 +41,29 @@ struct serio { void (*stop)(struct serio *); struct serio *parent; - struct list_head child_node; /* Entry in parent->children list */ + /* Entry in parent->children list */ + struct list_head child_node; struct list_head children; - unsigned int depth; /* level of nesting in serio hierarchy */ + /* Level of nesting in serio hierarchy */ + unsigned int depth; - struct serio_driver *drv; /* accessed from interrupt, must be protected by serio->lock and serio->sem */ - struct mutex drv_mutex; /* protects serio->drv so attributes can pin driver */ + /* + * serio->drv is accessed from interrupt handlers; when modifying + * caller should acquire serio->drv_mutex and serio->lock. + */ + struct serio_driver *drv; + /* Protects serio->drv so attributes can pin current driver */ + struct mutex drv_mutex; struct device dev; struct list_head node; + + /* + * For use by PS/2 layer when several ports share hardware and + * may get indigestion when exposed to concurrent access (i8042). 
+ */ + struct mutex *ps2_cmd_mutex; }; #define to_serio_port(d) container_of(d, struct serio, dev) From 9781b971f456189f3664d6f5bd2d5643893c0408 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 16 Aug 2016 17:38:54 -0700 Subject: [PATCH 397/813] Input: i8042 - set up shared ps2_cmd_mutex for AUX ports commit 47af45d684b5f3ae000ad448db02ce4f13f73273 upstream. The commit 4097461897df ("Input: i8042 - break load dependency ...") correctly set up the ps2_cmd_mutex pointer for the KBD port but forgot to do the same for the AUX port(s), which results in communication on the KBD and AUX ports clashing with each other. Fixes: 4097461897df ("Input: i8042 - break load dependency ...") Reported-by: Bruno Wolff III Tested-by: Bruno Wolff III Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/serio/i8042.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c index b4d34086e73f..405252a884dd 100644 --- a/drivers/input/serio/i8042.c +++ b/drivers/input/serio/i8042.c @@ -1305,6 +1305,7 @@ static int __init i8042_create_aux_port(int idx) serio->write = i8042_aux_write; serio->start = i8042_start; serio->stop = i8042_stop; + serio->ps2_cmd_mutex = &i8042_mutex; serio->port_data = port; serio->dev.parent = &i8042_platform_device->dev; if (idx < 0) { From 71d27af29aa5ef06716a8aab15b3bb5144281dc4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 15 Jul 2016 14:09:13 +0300 Subject: [PATCH 398/813] crypto: nx - off by one bug in nx_of_update_msc() commit e514cc0a492a3f39ef71b31590a7ef67537ee04b upstream. The props->ap[] array is defined like this: struct alg_props ap[NX_MAX_FC][NX_MAX_MODE][3]; So we can see that if msc->fc and msc->mode are == to NX_MAX_FC or NX_MAX_MODE then we're off by one. Fixes: ae0222b7289d ('powerpc/crypto: nx driver code supporting nx encryption') Signed-off-by: Dan Carpenter Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/nx/nx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/nx/nx.c b/drivers/crypto/nx/nx.c index 0794f1cc0018..42f0f229f7f7 100644 --- a/drivers/crypto/nx/nx.c +++ b/drivers/crypto/nx/nx.c @@ -392,7 +392,7 @@ static void nx_of_update_msc(struct device *dev, ((bytes_so_far + sizeof(struct msc_triplet)) <= lenp) && i < msc->triplets; i++) { - if (msc->fc > NX_MAX_FC || msc->mode > NX_MAX_MODE) { + if (msc->fc >= NX_MAX_FC || msc->mode >= NX_MAX_MODE) { dev_err(dev, "unknown function code/mode " "combo: %d/%d (ignored)\n", msc->fc, msc->mode); From ac069deaaae26656a726af9c45a3dbeeea15c917 Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Thu, 18 Aug 2016 19:53:36 +0100 Subject: [PATCH 399/813] crypto: qat - fix aes-xts key sizes commit 10bb087ce381c812cd81a65ffd5e6f83e6399291 upstream. Increase the supported key sizes for qat_aes_xts. aes-xts keys consist of two keys of equal size concatenated.
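For background, an XTS key is the concatenation of two equally sized AES keys (one for the data, one for the tweak), so the valid lengths are twice the plain AES key sizes: 2 * 16 = 32 bytes for AES-128-XTS and 2 * 32 = 64 bytes for AES-256-XTS. A sketch of how a typical xts setkey splits the material (illustrative only, with hypothetical helper names, not the qat code):

	/* key = key1 || key2, both halves the same size */
	keylen /= 2;
	set_data_key(ctx, key, keylen);
	set_tweak_key(ctx, key + keylen, keylen);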
Fixes: def14bfaf30d ("crypto: qat - add support for ctr(aes) and xts(aes)") Reported-by: Wenqian Yu Signed-off-by: Giovanni Cabiddu Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/qat/qat_common/qat_algs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/qat/qat_common/qat_algs.c b/drivers/crypto/qat/qat_common/qat_algs.c index 59e4c3af15ed..367b6661ee04 100644 --- a/drivers/crypto/qat/qat_common/qat_algs.c +++ b/drivers/crypto/qat/qat_common/qat_algs.c @@ -1262,8 +1262,8 @@ static struct crypto_alg qat_algs[] = { { .setkey = qat_alg_ablkcipher_xts_setkey, .decrypt = qat_alg_ablkcipher_decrypt, .encrypt = qat_alg_ablkcipher_encrypt, - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, .ivsize = AES_BLOCK_SIZE, }, }, From e879dae59e057343a15091f55c54a354623d8142 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Thu, 4 Aug 2016 19:59:41 +0900 Subject: [PATCH 400/813] dmaengine: usb-dmac: check CHCR.DE bit in usb_dmac_isr_channel() commit 626d2f07de89bf6be3d7301524d0ab3375b81b9c upstream. The USB-DMAC's interrupt can happen even if CHCR.DE is not set to 1, because CHCR.NULLE is set to 1. So this driver should call usb_dmac_isr_transfer_end() only if the DE bit is set to 1. Otherwise, desc may be NULL in usb_dmac_isr_transfer_end(). Fixes: 0c1c8ff32fa2 ("dmaengine: usb-dmac: Add Renesas USB DMA Controller (USB-DMAC) driver) Signed-off-by: Yoshihiro Shimoda Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/sh/usb-dmac.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/dma/sh/usb-dmac.c b/drivers/dma/sh/usb-dmac.c index f1bcc2a163b3..b1bc945f008f 100644 --- a/drivers/dma/sh/usb-dmac.c +++ b/drivers/dma/sh/usb-dmac.c @@ -600,27 +600,30 @@ static irqreturn_t usb_dmac_isr_channel(int irq, void *dev) { struct usb_dmac_chan *chan = dev; irqreturn_t ret = IRQ_NONE; - u32 mask = USB_DMACHCR_TE; - u32 check_bits = USB_DMACHCR_TE | USB_DMACHCR_SP; + u32 mask = 0; u32 chcr; + bool xfer_end = false; spin_lock(&chan->vc.lock); chcr = usb_dmac_chan_read(chan, USB_DMACHCR); - if (chcr & check_bits) - mask |= USB_DMACHCR_DE | check_bits; + if (chcr & (USB_DMACHCR_TE | USB_DMACHCR_SP)) { + mask |= USB_DMACHCR_DE | USB_DMACHCR_TE | USB_DMACHCR_SP; + if (chcr & USB_DMACHCR_DE) + xfer_end = true; + ret |= IRQ_HANDLED; + } if (chcr & USB_DMACHCR_NULL) { /* An interruption of TE will happen after we set FTE */ mask |= USB_DMACHCR_NULL; chcr |= USB_DMACHCR_FTE; ret |= IRQ_HANDLED; } - usb_dmac_chan_write(chan, USB_DMACHCR, chcr & ~mask); + if (mask) + usb_dmac_chan_write(chan, USB_DMACHCR, chcr & ~mask); - if (chcr & check_bits) { + if (xfer_end) usb_dmac_isr_transfer_end(chan); - ret |= IRQ_HANDLED; - } spin_unlock(&chan->vc.lock); From 5ab968f27aee23f03ddd0401a77c2ba81dc73ad0 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 23 Aug 2016 15:32:51 -0400 Subject: [PATCH 401/813] USB: avoid left shift by -1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 53e5f36fbd2453ad69a3369a1db62dc06c30a4aa upstream. UBSAN complains about a left shift by -1 in proc_do_submiturb(). This can occur when an URB is submitted for a bulk or control endpoint on a high-speed device, since the code doesn't bother to check the endpoint type; normally only interrupt or isochronous endpoints have a nonzero bInterval value.
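The problematic expression has the form 1 << (bInterval - 1), the exponential interval encoding used by high-speed and faster devices. A worked illustration (the actual guarded fix is in the hunk below):

	/*
	 * interval = 2^(bInterval - 1) (micro)frames:
	 *   bInterval = 4  ->  1 << 3 = 8 microframes = 1 ms
	 * but a bulk/control endpoint reporting bInterval = 0 gives
	 *   1 << (0 - 1) = 1 << -1, which is undefined behavior
	 */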
Aside from the fact that the operation is illegal, it shouldn't matter because the result isn't used. Still, in theory it could cause a hardware exception or other problem, so we should work around it. This patch avoids doing the left shift unless the shift amount is >= 0. The same piece of code has another problem. When checking the device speed (the exponential encoding for interrupt endpoints is used only by high-speed or faster devices), we need to look for speed >= USB_SPEED_SUPER as well as speed == USB_SPEED_HIGH. The patch adds this check. Signed-off-by: Alan Stern Reported-by: Vittorio Zecca Tested-by: Vittorio Zecca Suggested-by: Bjørn Mork Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devio.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 3ffb01ff6549..f5c92d904ded 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -1530,11 +1530,17 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb as->urb->start_frame = uurb->start_frame; as->urb->number_of_packets = number_of_packets; as->urb->stream_id = stream_id; - if (uurb->type == USBDEVFS_URB_TYPE_ISO || - ps->dev->speed == USB_SPEED_HIGH) - as->urb->interval = 1 << min(15, ep->desc.bInterval - 1); - else - as->urb->interval = ep->desc.bInterval; + + if (ep->desc.bInterval) { + if (uurb->type == USBDEVFS_URB_TYPE_ISO || + ps->dev->speed == USB_SPEED_HIGH || + ps->dev->speed >= USB_SPEED_SUPER) + as->urb->interval = 1 << + min(15, ep->desc.bInterval - 1); + else + as->urb->interval = ep->desc.bInterval; + } + as->urb->context = as; as->urb->complete = async_completed; for (totlen = u = 0; u < number_of_packets; u++) { From 8136b595bfa36a09a6e1b357f280d368df3d1cc5 Mon Sep 17 00:00:00 2001 From: Li Jun Date: Tue, 16 Aug 2016 19:19:11 +0800 Subject: [PATCH 402/813] usb: chipidea: udc: don't touch DP when controller is in host mode commit c4e94174983a86c935be1537a73e496b778b0287 upstream. When the controller is configured to be dual role and it's in host mode, binding the udc and gadget drivers triggers gadget operations that do a gadget disconnect and finally pull down the DP (Data+) line, which will break the host function. Signed-off-by: Li Jun Signed-off-by: Peter Chen Signed-off-by: Greg Kroah-Hartman --- drivers/usb/chipidea/udc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/usb/chipidea/udc.c b/drivers/usb/chipidea/udc.c index 391a1225b0ba..ca367b05e440 100644 --- a/drivers/usb/chipidea/udc.c +++ b/drivers/usb/chipidea/udc.c @@ -1585,8 +1585,11 @@ static int ci_udc_pullup(struct usb_gadget *_gadget, int is_on) { struct ci_hdrc *ci = container_of(_gadget, struct ci_hdrc, gadget); - /* Data+ pullup controlled by OTG state machine in OTG fsm mode */ - if (ci_otg_is_fsm_mode(ci)) + /* + * Data+ pullup controlled by OTG state machine in OTG fsm mode; + * and don't touch Data+ in host mode for dual role config. + */ + if (ci_otg_is_fsm_mode(ci) || ci->role == CI_ROLE_HOST) return 0; pm_runtime_get_sync(&ci->gadget.dev); From 6e0b5f821c318f58e5b5c82f308adfd0b9ddb929 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 22 Aug 2016 16:58:53 -0400 Subject: [PATCH 403/813] USB: fix typo in wMaxPacketSize validation commit 6c73358c83ce870c0cf32413e5cadb3b9a39c606 upstream. The maximum value allowed for wMaxPacketSize of a high-speed interrupt endpoint is 1024 bytes, not 1023.
Signed-off-by: Alan Stern Fixes: aed9d65ac327 ("USB: validate wMaxPacketValue entries in endpoint descriptors") Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index 67e192c1d37e..80c8d90d8b75 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -158,7 +158,7 @@ static const unsigned short high_speed_maxpacket_maxes[4] = { [USB_ENDPOINT_XFER_CONTROL] = 64, [USB_ENDPOINT_XFER_ISOC] = 1024, [USB_ENDPOINT_XFER_BULK] = 512, - [USB_ENDPOINT_XFER_INT] = 1023, + [USB_ENDPOINT_XFER_INT] = 1024, }; static const unsigned short super_speed_maxpacket_maxes[4] = { [USB_ENDPOINT_XFER_CONTROL] = 512, From e5d33f1036d4d145495952079e9b023624d0b7c4 Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Fri, 12 Aug 2016 01:05:08 +0300 Subject: [PATCH 404/813] USB: serial: mos7720: fix non-atomic allocation in write path commit 5a5a1d614287a647b36dff3f40c2b0ceabbc83ec upstream. There is an allocation with GFP_KERNEL flag in mos7720_write(), while it may be called from interrupt context. Follow-up for commit 191252837626 ("USB: kobil_sct: fix non-atomic allocation in write path") Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/mos7720.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c index 06c7dbc1c802..63db004af21f 100644 --- a/drivers/usb/serial/mos7720.c +++ b/drivers/usb/serial/mos7720.c @@ -1252,7 +1252,7 @@ static int mos7720_write(struct tty_struct *tty, struct usb_serial_port *port, if (urb->transfer_buffer == NULL) { urb->transfer_buffer = kmalloc(URB_TRANSFER_BUFFER_SIZE, - GFP_KERNEL); + GFP_ATOMIC); if (!urb->transfer_buffer) goto exit; } From 201cb6d48be510dba6161d56c1179a91684c6c5d Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Fri, 12 Aug 2016 01:05:09 +0300 Subject: [PATCH 405/813] USB: serial: mos7840: fix non-atomic allocation in write path commit 3b7c7e52efda0d4640060de747768360ba70a7c0 upstream. There is an allocation with GFP_KERNEL flag in mos7840_write(), while it may be called from interrupt context. Follow-up for commit 191252837626 ("USB: kobil_sct: fix non-atomic allocation in write path") Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/mos7840.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c index 8ac9b55f05af..7f3ddd7ba2ce 100644 --- a/drivers/usb/serial/mos7840.c +++ b/drivers/usb/serial/mos7840.c @@ -1340,8 +1340,8 @@ static int mos7840_write(struct tty_struct *tty, struct usb_serial_port *port, } if (urb->transfer_buffer == NULL) { - urb->transfer_buffer = - kmalloc(URB_TRANSFER_BUFFER_SIZE, GFP_KERNEL); + urb->transfer_buffer = kmalloc(URB_TRANSFER_BUFFER_SIZE, + GFP_ATOMIC); if (!urb->transfer_buffer) goto exit; } From fed48a260b0cbc571d4ba50aeb1833d652cc677a Mon Sep 17 00:00:00 2001 From: Aleksandr Makarov Date: Sat, 20 Aug 2016 13:29:41 +0300 Subject: [PATCH 406/813] USB: serial: option: add WeTelecom WM-D200 commit 6695593e4a7659db49ac6eca98c164f7b5589f72 upstream. Add support for WeTelecom WM-D200. 
T: Bus=03 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 4 Spd=12 MxCh= 0 D: Ver= 1.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=22de ProdID=6801 Rev=00.00 S: Manufacturer=WeTelecom Incorporated S: Product=WeTelecom Mobile Products C: #Ifs= 4 Cfg#= 1 Atr=80 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) I: If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) I: If#= 3 Alt= 0 #EPs= 2 Cls=08(stor.) Sub=06 Prot=50 Driver=usb-storage Signed-off-by: Aleksandr Makarov Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index bc472584a229..bb6a71120c03 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -525,6 +525,10 @@ static void option_instat_callback(struct urb *urb); #define VIATELECOM_VENDOR_ID 0x15eb #define VIATELECOM_PRODUCT_CDS7 0x0001 +/* WeTelecom products */ +#define WETELECOM_VENDOR_ID 0x22de +#define WETELECOM_PRODUCT_WMD200 0x6801 + struct option_blacklist_info { /* bitmask of interface numbers blacklisted for send_setup */ const unsigned long sendsetup; @@ -1991,6 +1995,7 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_INTERFACE_CLASS(0x2020, 0x4000, 0xff) }, /* OLICARD300 - MT6225 */ { USB_DEVICE(INOVIA_VENDOR_ID, INOVIA_SEW858) }, { USB_DEVICE(VIATELECOM_VENDOR_ID, VIATELECOM_PRODUCT_CDS7) }, + { USB_DEVICE_AND_INTERFACE_INFO(WETELECOM_VENDOR_ID, WETELECOM_PRODUCT_WMD200, 0xff, 0xff, 0xff) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, option_ids); From 1b3b122f598ff1020447935b0c908626ea2b8782 Mon Sep 17 00:00:00 2001 From: Aleksandr Makarov Date: Wed, 24 Aug 2016 13:06:22 +0300 Subject: [PATCH 407/813] USB: serial: option: add WeTelecom 0x6802 and 0x6803 products commit 40d9c32525cba79130612650b1abc47c0c0f19a8 upstream. These product IDs are listed in the Windows driver. 0x6803 corresponds to the WeTelecom WM-D300; the product name for 0x6802 is unknown.
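For context, USB_DEVICE_AND_INTERFACE_INFO() keys on the interface class/subclass/protocol triple (0xff/0xff/0xff, i.e. vendor specific) in addition to the vendor and product IDs, so only the serial interfaces are claimed and a mass-storage interface such as If#= 3 in the WM-D200 listing above stays with usb-storage. Illustrative form of such an entry (numeric IDs written out instead of the #defines):

    /* claims only interfaces whose class/subclass/protocol are all 0xff */
    { USB_DEVICE_AND_INTERFACE_INFO(0x22de, 0x6801, 0xff, 0xff, 0xff) },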
Signed-off-by: Aleksandr Makarov Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index bb6a71120c03..9894e341c6ac 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -528,6 +528,8 @@ static void option_instat_callback(struct urb *urb); /* WeTelecom products */ #define WETELECOM_VENDOR_ID 0x22de #define WETELECOM_PRODUCT_WMD200 0x6801 +#define WETELECOM_PRODUCT_6802 0x6802 +#define WETELECOM_PRODUCT_WMD300 0x6803 struct option_blacklist_info { /* bitmask of interface numbers blacklisted for send_setup */ @@ -1996,6 +1998,8 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE(INOVIA_VENDOR_ID, INOVIA_SEW858) }, { USB_DEVICE(VIATELECOM_VENDOR_ID, VIATELECOM_PRODUCT_CDS7) }, { USB_DEVICE_AND_INTERFACE_INFO(WETELECOM_VENDOR_ID, WETELECOM_PRODUCT_WMD200, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(WETELECOM_VENDOR_ID, WETELECOM_PRODUCT_6802, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(WETELECOM_VENDOR_ID, WETELECOM_PRODUCT_WMD300, 0xff, 0xff, 0xff) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, option_ids); From bc93350cb49857d013c1011a56a0042274f82555 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Wed, 29 Jun 2016 20:27:44 +0100 Subject: [PATCH 408/813] staging: comedi: daqboard2000: bug fix board type matching code commit 80e162ee9b31d77d851b10f8c5299132be1e120f upstream. `daqboard2000_find_boardinfo()` is supposed to check if the DaqBoard/2000 series model is supported, based on the PCI subvendor and subdevice ID. The current code is wrong as it is comparing the PCI device's subdevice ID to an expected, fixed value for the subvendor ID. It should be comparing the PCI device's subvendor ID to this fixed value. Correct it. Fixes: 7e8401b23e7f ("staging: comedi: daqboard2000: add back subsystem_device check") Signed-off-by: Ian Abbott Signed-off-by: Greg Kroah-Hartman --- drivers/staging/comedi/drivers/daqboard2000.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/comedi/drivers/daqboard2000.c b/drivers/staging/comedi/drivers/daqboard2000.c index 57ab6680e3ae..e5fee6e0fb47 100644 --- a/drivers/staging/comedi/drivers/daqboard2000.c +++ b/drivers/staging/comedi/drivers/daqboard2000.c @@ -636,7 +636,7 @@ static const void *daqboard2000_find_boardinfo(struct comedi_device *dev, const struct daq200_boardtype *board; int i; - if (pcidev->subsystem_device != PCI_VENDOR_ID_IOTECH) + if (pcidev->subsystem_vendor != PCI_VENDOR_ID_IOTECH) return NULL; for (i = 0; i < ARRAY_SIZE(boardtypes); i++) { From fd514089133a2a6cb98d4a615237015cd8075e12 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Thu, 30 Jun 2016 19:58:32 +0100 Subject: [PATCH 409/813] staging: comedi: comedi_test: fix timer race conditions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 403fe7f34e3327ddac2e06a15e76a293d613381e upstream. Commit 73e0e4dfed4c ("staging: comedi: comedi_test: fix timer lock-up") fixed a lock-up in the timer routine `waveform_ai_timer()` (which was called `waveform_ai_interrupt()` at the time) caused by commit 240512474424 ("staging: comedi: comedi_test: use comedi_handle_events()"). However, it introduced a race condition that can result in the timer routine misbehaving, such as accessing freed memory or dereferencing a NULL pointer. 73e0... 
changed the timer routine to do nothing unless a `WAVEFORM_AI_RUNNING` flag was set, and changed `waveform_ai_cancel()` to clear the flag and replace a call to `del_timer_sync()` with a call to `del_timer()`. `waveform_ai_cancel()` may be called from the timer routine itself (via `comedi_handle_events()`), or from `do_cancel()`. (`do_cancel()` is called as a result of a file operation (usually a `COMEDI_CANCEL` ioctl command, or a release), or during device removal.) When called from `do_cancel()`, the call to `waveform_ai_cancel()` is followed by a call to `do_become_nonbusy()`, which frees up stuff for the current asynchronous command under the assumption that it is now safe to do so. The race condition occurs when the timer routine `waveform_ai_timer()` checks the `WAVEFORM_AI_RUNNING` flag just before it is cleared by `waveform_ai_cancel()`, and is still running during the call to `do_become_nonbusy()`. In particular, it can lead to a NULL pointer dereference: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] waveform_ai_timer+0x17d/0x290 [comedi_test] That corresponds to this line in `waveform_ai_timer()`: unsigned int chanspec = cmd->chanlist[async->cur_chan]; but `do_become_nonbusy()` frees `cmd->chanlist` and sets it to `NULL`. Fix the race by calling `del_timer_sync()` instead of `del_timer()` in `waveform_ai_cancel()` when not in an interrupt context. The only time `waveform_ai_cancel()` is called in an interrupt context is when it is called from the timer routine itself, via `comedi_handle_events()`. There is no longer any need for the `WAVEFORM_AI_RUNNING` flag, so get rid of it. The bug was copied from the AI subdevice to the AO when support for commands on the AO subdevice was added by commit 0cf55bbef2f9 ("staging: comedi: comedi_test: implement commands on AO subdevice"). That involves the timer routine `waveform_ao_timer()`, the comedi "cancel" routine `waveform_ao_cancel()`, and the flag `WAVEFORM_AO_RUNNING`. Fix it in the same way as for the AI subdevice. 
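Distilled, the cancel paths on both subdevices now follow this shape (a sketch only; `struct example_private` is hypothetical, while in_softirq(), del_timer() and del_timer_sync() are the 4.4-era kernel APIs the driver uses):

    static int example_cancel(struct example_private *devpriv)
    {
            if (in_softirq()) {
                    /* Invoked from the timer routine itself via
                     * comedi_handle_events(); del_timer_sync() would
                     * wait forever for the handler, i.e. ourselves. */
                    del_timer(&devpriv->timer);
            } else {
                    /* Process context (do_cancel() etc.): wait for the
                     * handler to finish, so that freeing the command's
                     * chanlist afterwards is safe. */
                    del_timer_sync(&devpriv->timer);
            }
            return 0;
    }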
Fixes: 73e0e4dfed4c ("staging: comedi: comedi_test: fix timer lock-up") Fixes: 0cf55bbef2f9 ("staging: comedi: comedi_test: implement commands on AO subdevice") Reported-by: Éric Piel Signed-off-by: Ian Abbott Cc: Éric Piel Signed-off-by: Greg Kroah-Hartman --- drivers/staging/comedi/drivers/comedi_test.c | 46 +++++--------------- 1 file changed, 12 insertions(+), 34 deletions(-) diff --git a/drivers/staging/comedi/drivers/comedi_test.c b/drivers/staging/comedi/drivers/comedi_test.c index 4ab186669f0c..ec5b9a23494d 100644 --- a/drivers/staging/comedi/drivers/comedi_test.c +++ b/drivers/staging/comedi/drivers/comedi_test.c @@ -56,11 +56,6 @@ #define N_CHANS 8 -enum waveform_state_bits { - WAVEFORM_AI_RUNNING, - WAVEFORM_AO_RUNNING -}; - /* Data unique to this driver */ struct waveform_private { struct timer_list ai_timer; /* timer for AI commands */ @@ -68,7 +63,6 @@ struct waveform_private { unsigned int wf_amplitude; /* waveform amplitude in microvolts */ unsigned int wf_period; /* waveform period in microseconds */ unsigned int wf_current; /* current time in waveform period */ - unsigned long state_bits; unsigned int ai_scan_period; /* AI scan period in usec */ unsigned int ai_convert_period; /* AI conversion period in usec */ struct timer_list ao_timer; /* timer for AO commands */ @@ -191,10 +185,6 @@ static void waveform_ai_timer(unsigned long arg) unsigned int nsamples; unsigned int time_increment; - /* check command is still active */ - if (!test_bit(WAVEFORM_AI_RUNNING, &devpriv->state_bits)) - return; - now = ktime_to_us(ktime_get()); nsamples = comedi_nsamples_left(s, UINT_MAX); @@ -386,11 +376,6 @@ static int waveform_ai_cmd(struct comedi_device *dev, */ devpriv->ai_timer.expires = jiffies + usecs_to_jiffies(devpriv->ai_convert_period) + 1; - - /* mark command as active */ - smp_mb__before_atomic(); - set_bit(WAVEFORM_AI_RUNNING, &devpriv->state_bits); - smp_mb__after_atomic(); add_timer(&devpriv->ai_timer); return 0; } @@ -400,11 +385,12 @@ static int waveform_ai_cancel(struct comedi_device *dev, { struct waveform_private *devpriv = dev->private; - /* mark command as no longer active */ - clear_bit(WAVEFORM_AI_RUNNING, &devpriv->state_bits); - smp_mb__after_atomic(); - /* cannot call del_timer_sync() as may be called from timer routine */ - del_timer(&devpriv->ai_timer); + if (in_softirq()) { + /* Assume we were called from the timer routine itself. 
*/ + del_timer(&devpriv->ai_timer); + } else { + del_timer_sync(&devpriv->ai_timer); + } return 0; } @@ -436,10 +422,6 @@ static void waveform_ao_timer(unsigned long arg) u64 scans_since; unsigned int scans_avail = 0; - /* check command is still active */ - if (!test_bit(WAVEFORM_AO_RUNNING, &devpriv->state_bits)) - return; - /* determine number of scan periods since last time */ now = ktime_to_us(ktime_get()); scans_since = now - devpriv->ao_last_scan_time; @@ -518,11 +500,6 @@ static int waveform_ao_inttrig_start(struct comedi_device *dev, devpriv->ao_last_scan_time = ktime_to_us(ktime_get()); devpriv->ao_timer.expires = jiffies + usecs_to_jiffies(devpriv->ao_scan_period); - - /* mark command as active */ - smp_mb__before_atomic(); - set_bit(WAVEFORM_AO_RUNNING, &devpriv->state_bits); - smp_mb__after_atomic(); add_timer(&devpriv->ao_timer); return 1; @@ -608,11 +585,12 @@ static int waveform_ao_cancel(struct comedi_device *dev, struct waveform_private *devpriv = dev->private; s->async->inttrig = NULL; - /* mark command as no longer active */ - clear_bit(WAVEFORM_AO_RUNNING, &devpriv->state_bits); - smp_mb__after_atomic(); - /* cannot call del_timer_sync() as may be called from timer routine */ - del_timer(&devpriv->ao_timer); + if (in_softirq()) { + /* Assume we were called from the timer routine itself. */ + del_timer(&devpriv->ao_timer); + } else { + del_timer_sync(&devpriv->ao_timer); + } return 0; } From b03ee3d047cafd407896b4a2ad018338387ec8e7 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Tue, 19 Jul 2016 12:17:39 +0100 Subject: [PATCH 410/813] staging: comedi: ni_mio_common: fix AO inttrig backwards compatibility commit f0f4b0cc3a8cffd983f5940d46cd0227f3f5710a upstream. Commit ebb657babfa9 ("staging: comedi: ni_mio_common: clarify the cmd->start_arg validation and use") introduced a backwards compatibility issue in the use of asynchronous commands on the AO subdevice when `start_src` is `TRIG_EXT`. Valid values for `start_src` are `TRIG_INT` (for internal, software trigger), and `TRIG_EXT` (for external trigger). In both cases, the driver relies on an internal, software trigger to set things up (allowing the user application to write sufficient samples to the data buffer before the trigger), so it acts as a software "pre-trigger" in the `TRIG_EXT` case. The software trigger is handled by `ni_ao_inttrig()`. Prior to the above change, when `start_src` was `TRIG_INT`, `start_arg` was required to be 0, and `ni_ao_inttrig()` checked that the software trigger number was also 0. After the above change, when `start_src` was `TRIG_INT`, any value was allowed for `start_arg`, and `ni_ao_inttrig()` checked that the software trigger number matched this `start_arg` value. The backwards compatibility issue is that the internal trigger number now has to match `start_arg` when `start_src` is `TRIG_EXT`, whereas it previously had to be 0. Fix the backwards compatibility issue in `ni_ao_inttrig()` by always allowing software trigger number 0 when `start_src` is something other than `TRIG_INT`. Thanks to Spencer Olson for reporting the issue.
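Distilled, the accepted trigger numbers reduce to this predicate (a restatement of the check added below, not new driver code; TRIG_INT is the comedi trigger-source constant):

    static bool ao_trig_num_ok(unsigned int trig_num,
                               unsigned int start_arg,
                               unsigned int start_src)
    {
            /* match start_arg, or accept the legacy value 0 used as a
             * software pre-trigger when start_src != TRIG_INT */
            return trig_num == start_arg ||
                   (trig_num == 0 && start_src != TRIG_INT);
    }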
Signed-off-by: Ian Abbott Reported-by: Spencer Olson Fixes: ebb657babfa9 ("staging: comedi: ni_mio_common: clarify the cmd->start_arg validation and use") Reviewed-by: H Hartley Sweeten Signed-off-by: Greg Kroah-Hartman --- drivers/staging/comedi/drivers/ni_mio_common.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c index 27fbf1a81097..955f802a1d0a 100644 --- a/drivers/staging/comedi/drivers/ni_mio_common.c +++ b/drivers/staging/comedi/drivers/ni_mio_common.c @@ -2823,7 +2823,15 @@ static int ni_ao_inttrig(struct comedi_device *dev, int i; static const int timeout = 1000; - if (trig_num != cmd->start_arg) + /* + * Require trig_num == cmd->start_arg when cmd->start_src == TRIG_INT. + * For backwards compatibility, also allow trig_num == 0 when + * cmd->start_src != TRIG_INT (i.e. when cmd->start_src == TRIG_EXT); + * in that case, the internal trigger is being used as a pre-trigger + * before the external trigger. + */ + if (!(trig_num == cmd->start_arg || + (trig_num == 0 && cmd->start_src != TRIG_INT))) return -EINVAL; /* Null trig at beginning prevent ao start trigger from executing more than From fbde41faa1c043f465fb3d435c2fd34e82e7b88a Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Wed, 20 Jul 2016 17:07:34 +0100 Subject: [PATCH 411/813] staging: comedi: ni_mio_common: fix wrong insn_write handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 5ca05345c56cb979e1a25ab6146437002f95cac8 upstream. For counter subdevices, the `s->insn_write` handler is being set to the wrong function, `ni_tio_insn_read()`. It should be `ni_tio_insn_write()`. Signed-off-by: Ian Abbott Reported-by: Éric Piel Fixes: 10f74377eec3 ("staging: comedi: ni_tio: make ni_tio_winsn() a proper comedi (*insn_write)") Signed-off-by: Greg Kroah-Hartman --- drivers/staging/comedi/drivers/ni_mio_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c index 955f802a1d0a..35ab4a9ef95d 100644 --- a/drivers/staging/comedi/drivers/ni_mio_common.c +++ b/drivers/staging/comedi/drivers/ni_mio_common.c @@ -5354,7 +5354,7 @@ static int ni_E_init(struct comedi_device *dev, s->maxdata = (devpriv->is_m_series) ? 0xffffffff : 0x00ffffff; s->insn_read = ni_tio_insn_read; - s->insn_write = ni_tio_insn_read; + s->insn_write = ni_tio_insn_write; s->insn_config = ni_tio_insn_config; #ifdef PCIDMA if (dev->irq && devpriv->mite) { From 0b21b21b58706dc35102b24a566bb578c32218df Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Tue, 16 Aug 2016 16:59:52 +0100 Subject: [PATCH 412/813] ACPI / drivers: fix typo in ACPI_DECLARE_PROBE_ENTRY macro commit 3feab13c919f99b0a17d0ca22ae00cf90f5d3fd1 upstream. When the ACPI_DECLARE_PROBE_ENTRY macro was added in commit e647b532275b ("ACPI: Add early device probing infrastructure"), a stub macro adding an unused entry was added for the !CONFIG_ACPI Kconfig option case to make sure kernel code making use of the macro did not need to be guarded within CONFIG_ACPI in order to be compiled. The stub macro was never used since all kernel code that defines ACPI_DECLARE_PROBE_ENTRY entries is currently guarded within CONFIG_ACPI; it contains a typo that should nonetheless be fixed. Fix the typo in the stub (ie !CONFIG_ACPI) ACPI_DECLARE_PROBE_ENTRY() macro so that it can actually be used if needed.
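The failure mode is the usual one for macros: a parameter spelled one way in the parameter list and another way in the body only breaks once the macro is expanded. A minimal illustration (hypothetical macro, not the ACPI one):

    /* Body refers to 'valid', but the parameter is named 'validate';
     * any expansion fails with "'valid' undeclared". */
    #define DECLARE_ENTRY(name, validate) \
            static const void *__entry_##name = (void *)(valid)

    /* DECLARE_ENTRY(foo, my_fn);  -> compile error until the parameter
     * is renamed to 'valid' (or the body changed to 'validate'). */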
Signed-off-by: Lorenzo Pieralisi Fixes: e647b532275b (ACPI: Add early device probing infrastructure) Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- include/linux/acpi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 1991aea2ec4c..3672893b275e 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -920,7 +920,7 @@ static inline struct fwnode_handle *acpi_get_next_subnode(struct device *dev, return NULL; } -#define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, validate, data, fn) \ +#define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, valid, data, fn) \ static const void * __acpi_table_##name[] \ __attribute__((unused)) \ = { (void *) table_id, \ From b0917f5dae67620aaab79e98cdb53a5620570e58 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Tue, 16 Aug 2016 16:59:53 +0100 Subject: [PATCH 413/813] ACPI / drivers: replace acpi_probe_lock spinlock with mutex commit 5331d9cab32ef640b4cd38a43b0858874fbb7168 upstream. Commit e647b532275b ("ACPI: Add early device probing infrastructure") introduced code that allows inserting driver specific struct acpi_probe_entry probe entries into ACPI linker sections (one per-subsystem, eg irqchip, clocksource) that are then walked to retrieve the data and function hooks required to probe the respective kernel components. Probing for all entries in a section is triggered through the __acpi_probe_device_table() function, which in turn, according to the table ID a given probe entry reports, parses the table with the function retrieved from the respective section structures (ie struct acpi_probe_entry). Owing to the current ACPI table parsing implementation, the __acpi_probe_device_table() function has to share global variables with the acpi_match_madt() function, so in order to guarantee mutual exclusion, locking is required between the two functions. Current kernel code implements the locking through the acpi_probe_lock spinlock; this has the side effect of requiring all code called within the lock (ie struct acpi_probe_entry.probe_{table/subtbl} hooks) not to sleep. However, kernel subsystems that make use of the early probing infrastructure are relying on kernel APIs that may sleep (eg irq_domain_alloc_fwnode(), among others) in the function calls pointed at by struct acpi_probe_entry.{probe_table/subtbl} entries (eg gic_v2_acpi_init()), which is a bug. Since __acpi_probe_device_table() is called from context that is allowed to sleep, the acpi_probe_lock spinlock can be replaced with a mutex; this fixes the issue whilst still guaranteeing mutual exclusion. Signed-off-by: Lorenzo Pieralisi Fixes: e647b532275b (ACPI: Add early device probing infrastructure) Signed-off-by: Rafael J.
Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/scan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 78d5f02a073b..dcb3d6245ca5 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1958,7 +1958,7 @@ int __init acpi_scan_init(void) static struct acpi_probe_entry *ape; static int acpi_probe_count; -static DEFINE_SPINLOCK(acpi_probe_lock); +static DEFINE_MUTEX(acpi_probe_mutex); static int __init acpi_match_madt(struct acpi_subtable_header *header, const unsigned long end) @@ -1977,7 +1977,7 @@ int __init __acpi_probe_device_table(struct acpi_probe_entry *ap_head, int nr) if (acpi_disabled) return 0; - spin_lock(&acpi_probe_lock); + mutex_lock(&acpi_probe_mutex); for (ape = ap_head; nr; ape++, nr--) { if (ACPI_COMPARE_NAME(ACPI_SIG_MADT, ape->id)) { acpi_probe_count = 0; @@ -1990,7 +1990,7 @@ int __init __acpi_probe_device_table(struct acpi_probe_entry *ap_head, int nr) count++; } } - spin_unlock(&acpi_probe_lock); + mutex_unlock(&acpi_probe_mutex); return count; } From a37b834402b27800d028b06b8e0c1275b607ff37 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 5 May 2016 16:23:04 +0300 Subject: [PATCH 414/813] ACPI / sysfs: fix error code in get_status() commit f18ebc211e259d4f591e39e74b2aa2de226c9a1d upstream. The problem with ornamental, do-nothing gotos is that they lead to "forgot to set the error code" bugs. We should be returning -EINVAL here but we don't. It leads to an uninitialized variable in counter_show(): drivers/acpi/sysfs.c:603 counter_show() error: uninitialized symbol 'status'. Fixes: 1c8fce27e275 (ACPI: introduce drivers/acpi/sysfs.c) Signed-off-by: Dan Carpenter Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/sysfs.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c index 0243d375c6fd..4b3a9e27f1b6 100644 --- a/drivers/acpi/sysfs.c +++ b/drivers/acpi/sysfs.c @@ -555,23 +555,22 @@ static void acpi_global_event_handler(u32 event_type, acpi_handle device, static int get_status(u32 index, acpi_event_status *status, acpi_handle *handle) { - int result = 0; + int result; if (index >= num_gpes + ACPI_NUM_FIXED_EVENTS) - goto end; + return -EINVAL; if (index < num_gpes) { result = acpi_get_gpe_device(index, handle); if (result) { ACPI_EXCEPTION((AE_INFO, AE_NOT_FOUND, "Invalid GPE 0x%x", index)); - goto end; + return result; } result = acpi_get_gpe_status(*handle, index, status); } else if (index < (num_gpes + ACPI_NUM_FIXED_EVENTS)) result = acpi_get_event_status(index - num_gpes, status); -end: return result; } From 37e16dc96556fbcba12b6bb1d151df5a769e7928 Mon Sep 17 00:00:00 2001 From: Lukasz Anaczkowski Date: Thu, 21 Apr 2016 11:29:00 +0200 Subject: [PATCH 415/813] ACPI / SRAT: fix SRAT parsing order with both LAPIC and X2APIC present commit 702b07fcc9b264c9afd372676bbdd50a762dcde0 upstream. SRAT maps APIC IDs to proximity domain IDs (PXM). Mapping from PXM to NUMA node IDs is based on the order of entries in the SRAT table. An SRAT table has either just LAPIC entries or a mix of LAPIC and X2APIC entries. As long as there are only LAPIC entries, the mapping from proximity domain ID to NUMA node ID is as assumed by the BIOS. However, once the APIC entries are mixed, the X2APIC entries would be mapped first, which causes unexpected NUMA node mapping. To fix that, change the parsing to check each entry against both LAPIC and X2APIC so the mapping is in the SRAT/PXM order.
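A concrete illustration with a hypothetical two-entry SRAT, where node IDs are handed out in the order proximity domains are first seen:

    /* Table order:  entry 0: LAPIC  -> PXM 0
     *               entry 1: X2APIC -> PXM 1
     *
     * Old parsing (all X2APIC entries first):
     *   PXM 1 -> node 0, PXM 0 -> node 1   (inverted vs. BIOS intent)
     * New parsing (one pass in table order):
     *   PXM 0 -> node 0, PXM 1 -> node 1
     */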
This is a supplemental change to the fix made by commit d81056b5278 (Handle apic/x2apic entries in MADT in correct order), using the mechanism introduced by 9b3fedd (ACPI / tables: Add acpi_subtable_proc to ACPI table parsers). Fixes: d81056b5278 (Handle apic/x2apic entries in MADT in correct order) Signed-off-by: Lukasz Anaczkowski [ rjw : Subject & changelog ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/numa.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 72b6e9ef0ae9..d176e0ece470 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -327,10 +327,18 @@ int __init acpi_numa_init(void) /* SRAT: Static Resource Affinity Table */ if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { - acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY, - acpi_parse_x2apic_affinity, 0); - acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, - acpi_parse_processor_affinity, 0); + struct acpi_subtable_proc srat_proc[2]; + + memset(srat_proc, 0, sizeof(srat_proc)); + srat_proc[0].id = ACPI_SRAT_TYPE_CPU_AFFINITY; + srat_proc[0].handler = acpi_parse_processor_affinity; + srat_proc[1].id = ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY; + srat_proc[1].handler = acpi_parse_x2apic_affinity; + + acpi_table_parse_entries_array(ACPI_SIG_SRAT, + sizeof(struct acpi_table_srat), + srat_proc, ARRAY_SIZE(srat_proc), 0); + cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, acpi_parse_memory_affinity, NR_NODE_MEMBLKS); From 85db22a1079b6e10d30b83aca749c98b612d2867 Mon Sep 17 00:00:00 2001 From: Andrej Krutak Date: Thu, 18 Aug 2016 23:52:10 +0200 Subject: [PATCH 416/813] ALSA: line6: Remove double line6_pcm_release() after failed acquire. commit 7e4379eae0e31994ea645db1d13006ea8e5ce539 upstream. If there's an error, the pcm is released in line6_pcm_acquire already. Fixes: 247d95ee6dd2 ('ALSA: line6: Handle error from line6_pcm_acquire()') Reviewed-by: Stefan Hajnoczi Signed-off-by: Andrej Krutak Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/line6/pcm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/usb/line6/pcm.c b/sound/usb/line6/pcm.c index 204cc074adb9..2bc8879757f9 100644 --- a/sound/usb/line6/pcm.c +++ b/sound/usb/line6/pcm.c @@ -55,7 +55,6 @@ static int snd_line6_impulse_volume_put(struct snd_kcontrol *kcontrol, err = line6_pcm_acquire(line6pcm, LINE6_STREAM_IMPULSE); if (err < 0) { line6pcm->impulse_volume = 0; - line6_pcm_release(line6pcm, LINE6_STREAM_IMPULSE); return err; } } else { From d21befeb0d9ad10a69850969963de00a46eb992a Mon Sep 17 00:00:00 2001 From: Andrej Krutak Date: Thu, 18 Aug 2016 23:52:11 +0200 Subject: [PATCH 417/813] ALSA: line6: Give up on the lock while URBs are released. commit adc8a43a6d6688272ebffa81789fa857e603dec6 upstream. This is done because line6_stream_stop() takes the lock and calls line6_unlink_audio_urbs(), which in turn invokes audio_out_callback(), which tries to take the lock a second time.
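The deadlock and its fix reduce to the following shape (an abbreviated sketch of the hunk below: drop the stream lock around the URB unlink so completion callbacks may retake it):

    spin_lock_irqsave(&pstr->lock, flags);
    clear_bit(type, &pstr->running);
    if (!pstr->running) {
            spin_unlock_irqrestore(&pstr->lock, flags);
            line6_unlink_audio_urbs(line6pcm, pstr); /* may re-enter the
                                                        completion callback */
            spin_lock_irqsave(&pstr->lock, flags);
            /* ... per-direction cleanup ... */
    }
    spin_unlock_irqrestore(&pstr->lock, flags);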
Fixes: ============================================= [ INFO: possible recursive locking detected ] 4.4.15+ #15 Not tainted --------------------------------------------- mplayer/3591 is trying to acquire lock: (&(&line6pcm->out.lock)->rlock){-.-...}, at: [] audio_out_callback+0x70/0x110 [snd_usb_line6] but task is already holding lock: (&(&line6pcm->out.lock)->rlock){-.-...}, at: [] line6_stream_stop+0x24/0x5c [snd_usb_line6] other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&(&line6pcm->out.lock)->rlock); lock(&(&line6pcm->out.lock)->rlock); *** DEADLOCK *** May be due to missing lock nesting notation 3 locks held by mplayer/3591: #0: (snd_pcm_link_rwlock){.-.-..}, at: [] snd_pcm_stream_lock+0x1e/0x40 [snd_pcm] #1: (&(&substream->self_group.lock)->rlock){-.-...}, at: [] snd_pcm_stream_lock+0x26/0x40 [snd_pcm] #2: (&(&line6pcm->out.lock)->rlock){-.-...}, at: [] line6_stream_stop+0x24/0x5c [snd_usb_line6] stack backtrace: CPU: 0 PID: 3591 Comm: mplayer Not tainted 4.4.15+ #15 Hardware name: Generic AM33XX (Flattened Device Tree) [] (unwind_backtrace) from [] (show_stack+0x11/0x14) [] (show_stack) from [] (dump_stack+0x8b/0xac) [] (dump_stack) from [] (__lock_acquire+0xc8b/0x1780) [] (__lock_acquire) from [] (lock_acquire+0x99/0x1c0) [] (lock_acquire) from [] (_raw_spin_lock_irqsave+0x3f/0x4c) [] (_raw_spin_lock_irqsave) from [] (audio_out_callback+0x70/0x110 [snd_usb_line6]) [] (audio_out_callback [snd_usb_line6]) from [] (__usb_hcd_giveback_urb+0x53/0xd0) [] (__usb_hcd_giveback_urb) from [] (musb_giveback+0x3d/0x98) [] (musb_giveback) from [] (musb_urb_dequeue+0x6d/0x114) [] (musb_urb_dequeue) from [] (usb_hcd_unlink_urb+0x39/0x98) [] (usb_hcd_unlink_urb) from [] (line6_unlink_audio_urbs+0x6a/0x6c [snd_usb_line6]) [] (line6_unlink_audio_urbs [snd_usb_line6]) from [] (line6_stream_stop+0x42/0x5c [snd_usb_line6]) [] (line6_stream_stop [snd_usb_line6]) from [] (snd_line6_trigger+0xb6/0xf4 [snd_usb_line6]) [] (snd_line6_trigger [snd_usb_line6]) from [] (snd_pcm_do_stop+0x36/0x38 [snd_pcm]) [] (snd_pcm_do_stop [snd_pcm]) from [] (snd_pcm_action_single+0x22/0x40 [snd_pcm]) [] (snd_pcm_action_single [snd_pcm]) from [] (snd_pcm_action+0xac/0xb0 [snd_pcm]) [] (snd_pcm_action [snd_pcm]) from [] (snd_pcm_drop+0x38/0x64 [snd_pcm]) [] (snd_pcm_drop [snd_pcm]) from [] (snd_pcm_common_ioctl1+0x7fe/0xbe8 [snd_pcm]) [] (snd_pcm_common_ioctl1 [snd_pcm]) from [] (snd_pcm_playback_ioctl1+0x15c/0x51c [snd_pcm]) [] (snd_pcm_playback_ioctl1 [snd_pcm]) from [] (snd_pcm_playback_ioctl+0x20/0x28 [snd_pcm]) [] (snd_pcm_playback_ioctl [snd_pcm]) from [] (do_vfs_ioctl+0x3af/0x5c8) Fixes: 63e20df1e5b2 ('ALSA: line6: Reorganize PCM stream handling') Reviewed-by: Stefan Hajnoczi Signed-off-by: Andrej Krutak Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/line6/pcm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/line6/pcm.c b/sound/usb/line6/pcm.c index 2bc8879757f9..41aa3355e920 100644 --- a/sound/usb/line6/pcm.c +++ b/sound/usb/line6/pcm.c @@ -210,7 +210,9 @@ static void line6_stream_stop(struct snd_line6_pcm *line6pcm, int direction, spin_lock_irqsave(&pstr->lock, flags); clear_bit(type, &pstr->running); if (!pstr->running) { + spin_unlock_irqrestore(&pstr->lock, flags); line6_unlink_audio_urbs(line6pcm, pstr); + spin_lock_irqsave(&pstr->lock, flags); if (direction == SNDRV_PCM_STREAM_CAPTURE) { line6pcm->prev_fbuf = NULL; line6pcm->prev_fsize = 0; From a2d9e40c8fdc8d206ebee3ec8066bf240a340c65 Mon Sep 17 00:00:00 2001 From: Andrej 
Krutak Date: Thu, 18 Aug 2016 23:52:12 +0200 Subject: [PATCH 418/813] ALSA: line6: Fix POD sysfs attributes segfault commit b027d11263836a0cd335520175257dcb99b43757 upstream. Commit 02fc76f6a changed the base of the sysfs attributes from device to card. The "show" callbacks dereferenced the wrong objects because of this. Fixes: 02fc76f6a7db ('ALSA: line6: Create sysfs via snd_card_add_dev_attr()') Reviewed-by: Stefan Hajnoczi Signed-off-by: Andrej Krutak Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/line6/pod.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sound/usb/line6/pod.c b/sound/usb/line6/pod.c index daf81d169a42..45dd34874f43 100644 --- a/sound/usb/line6/pod.c +++ b/sound/usb/line6/pod.c @@ -244,8 +244,8 @@ static int pod_set_system_param_int(struct usb_line6_pod *pod, int value, static ssize_t serial_number_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct usb_interface *interface = to_usb_interface(dev); - struct usb_line6_pod *pod = usb_get_intfdata(interface); + struct snd_card *card = dev_to_snd_card(dev); + struct usb_line6_pod *pod = card->private_data; return sprintf(buf, "%u\n", pod->serial_number); } @@ -256,8 +256,8 @@ static ssize_t serial_number_show(struct device *dev, static ssize_t firmware_version_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct usb_interface *interface = to_usb_interface(dev); - struct usb_line6_pod *pod = usb_get_intfdata(interface); + struct snd_card *card = dev_to_snd_card(dev); + struct usb_line6_pod *pod = card->private_data; return sprintf(buf, "%d.%02d\n", pod->firmware_version / 100, pod->firmware_version % 100); @@ -269,8 +269,8 @@ static ssize_t firmware_version_show(struct device *dev, static ssize_t device_id_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct usb_interface *interface = to_usb_interface(dev); - struct usb_line6_pod *pod = usb_get_intfdata(interface); + struct snd_card *card = dev_to_snd_card(dev); + struct usb_line6_pod *pod = card->private_data; return sprintf(buf, "%d\n", pod->device_id); } From dde898fba89069f78be1d0174f5184dc939cce5f Mon Sep 17 00:00:00 2001 From: Quentin Schulz Date: Tue, 26 Jul 2016 09:47:09 +0200 Subject: [PATCH 419/813] hwmon: (iio_hwmon) fix memory leak in name attribute commit 5d17d3b4bbf3becb89fd48b74340a50a39736f6d upstream. The "name" variable's memory is now freed when the device is destroyed, thanks to the devm function.
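The underlying pattern (a sketch; 'name' and 'index' are illustrative): devm_kasprintf() ties the string's lifetime to the device, so neither the error paths nor the remove path need an explicit kfree():

    name = devm_kasprintf(dev, GFP_KERNEL, "in%d_input", index);
    if (!name)
            return -ENOMEM; /* nothing allocated that must be freed by hand */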
Signed-off-by: Quentin Schulz Reported-by: Guenter Roeck Fixes: e0f8a24e0edfd ("staging:iio::hwmon interface client driver.") Fixes: 61bb53bcbdd86 ("hwmon: (iio_hwmon) Add support for humidity sensors") Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- drivers/hwmon/iio_hwmon.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/hwmon/iio_hwmon.c b/drivers/hwmon/iio_hwmon.c index 17ae2eb26ce2..d5c06f2764f4 100644 --- a/drivers/hwmon/iio_hwmon.c +++ b/drivers/hwmon/iio_hwmon.c @@ -109,24 +109,24 @@ static int iio_hwmon_probe(struct platform_device *pdev) switch (type) { case IIO_VOLTAGE: - a->dev_attr.attr.name = kasprintf(GFP_KERNEL, - "in%d_input", - in_i++); + a->dev_attr.attr.name = devm_kasprintf(dev, GFP_KERNEL, + "in%d_input", + in_i++); break; case IIO_TEMP: - a->dev_attr.attr.name = kasprintf(GFP_KERNEL, - "temp%d_input", - temp_i++); + a->dev_attr.attr.name = devm_kasprintf(dev, GFP_KERNEL, + "temp%d_input", + temp_i++); break; case IIO_CURRENT: - a->dev_attr.attr.name = kasprintf(GFP_KERNEL, - "curr%d_input", - curr_i++); + a->dev_attr.attr.name = devm_kasprintf(dev, GFP_KERNEL, + "curr%d_input", + curr_i++); break; case IIO_HUMIDITYRELATIVE: - a->dev_attr.attr.name = kasprintf(GFP_KERNEL, - "humidity%d_input", - humidity_i++); + a->dev_attr.attr.name = devm_kasprintf(dev, GFP_KERNEL, + "humidity%d_input", + humidity_i++); break; default: ret = -EINVAL; From 625ddb785d013221458ab851ac718495eba1dcbf Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 22 Jun 2016 21:42:16 +0300 Subject: [PATCH 420/813] sysfs: correctly handle read offset on PREALLOC attrs commit 17d0774f80681020eccc9638d925a23f1fc4f671 upstream. Attributes declared with __ATTR_PREALLOC use sysfs_kf_read() which returns zero bytes for non-zero offset. This breaks the checkarray script of the mdadm tool on Debian, where /bin/sh is 'dash', because dash's builtin 'read' reads only one byte at a time. The script gets 'i' instead of 'idle' when it reads the current action from /sys/block/$dev/md/sync_action and as a result does nothing. This patch adds a trivial implementation of partial reads: generate the whole string and move the required part to the head of the buffer.
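Distilled, the patched read path becomes (a restatement of the hunk below):

    len = ops->show(kobj, of->kn->priv, buf); /* whole string at offset 0 */
    if (pos) {
            if (len <= pos)
                    return 0;               /* offset past the end */
            len -= pos;
            memmove(buf, buf + pos, len);   /* shift the tail to the head */
    }
    return min(count, len);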
Signed-off-by: Konstantin Khlebnikov Fixes: 4ef67a8c95f3 ("sysfs/kernfs: make read requests on pre-alloc files use the buffer.") Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=787950 Acked-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index f35523d4fa3a..b803213d1307 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -114,9 +114,15 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, * If buf != of->prealloc_buf, we don't know how * large it is, so cannot safely pass it to ->show */ - if (pos || WARN_ON_ONCE(buf != of->prealloc_buf)) + if (WARN_ON_ONCE(buf != of->prealloc_buf)) return 0; len = ops->show(kobj, of->kn->priv, buf); + if (pos) { + if (len <= pos) + return 0; + len -= pos; + memmove(buf, buf + pos, len); + } return min(count, len); } From 2cb99ded2f551c78506e5f7bbf6c0d7613351ab1 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 7 Sep 2016 08:32:59 +0200 Subject: [PATCH 421/813] Linux 4.4.20 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 695c64ec160c..b74d60081a16 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 19 +SUBLEVEL = 20 EXTRAVERSION = NAME = Blurry Fish Butt From 573787341e57a73bf027c2ea3bdf18dc298ae0a0 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 13 Sep 2016 11:31:27 +0800 Subject: [PATCH 422/813] backporting: a brief introduction to backported features on 4.4 Add a file named backported-features to give a short introduction on backported features. Signed-off-by: Alex Shi --- backported-features | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 backported-features diff --git a/backported-features b/backported-features new file mode 100644 index 000000000000..b680ed49292f --- /dev/null +++ b/backported-features @@ -0,0 +1,14 @@ + LSK backported features +1, The kaslr and kaslr-pax_usercopy branches are based on LSK directly. + v4.4/topic/mm-kaslr + v4.4/topic/mm-kaslr-pax_usercopy +2, Coresight and openCSD are used for the Juno board 'perf' tool implementation. + origin/v4.4/topic/coresight + origin/v4.4/topic/perf-opencsd-4.4-github +3, OPTEE is based on LSK mainline, but isn't included in mainline. +Feature introduction: +https://wiki.linaro.org/lsk/features From eed022ee2f71ef37b65442f3f430a3c1822169b2 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 23 Aug 2016 11:32:37 -0700 Subject: [PATCH 423/813] ANDROID: dm: android-verity: Allow android-verity to be compiled as an independent module Exports the device mapper callbacks of linear and dm-verity-target methods. Signed-off-by: Badhri Jagan Sridharan Change-Id: I0358be0615c431dce3cc78575aaac4ccfe3aacd7 --- drivers/md/Kconfig | 3 ++- drivers/md/Makefile | 5 +---- drivers/md/dm-linear.c | 6 ++++++ drivers/md/dm-verity-target.c | 7 +++++++ 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 96b419b544ed..6035794bc1f2 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -501,7 +501,7 @@ config DM_LOG_WRITES If unsure, say N.
config DM_ANDROID_VERITY - bool "Android verity target support" + tristate "Android verity target support" depends on DM_VERITY depends on X509_CERTIFICATE_PARSER depends on SYSTEM_TRUSTED_KEYRING @@ -509,6 +509,7 @@ config DM_ANDROID_VERITY depends on KEYS depends on ASYMMETRIC_KEY_TYPE depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE + depends on MD_LINEAR ---help--- This device-mapper target is virtually a VERITY target. This target is setup by reading the metadata contents piggybacked diff --git a/drivers/md/Makefile b/drivers/md/Makefile index c8fb00d8cc36..32b5d0a90d60 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -60,6 +60,7 @@ obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o obj-$(CONFIG_DM_ERA) += dm-era.o obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o +obj-$(CONFIG_DM_ANDROID_VERITY) += dm-android-verity.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o @@ -68,7 +69,3 @@ endif ifeq ($(CONFIG_DM_VERITY_FEC),y) dm-verity-objs += dm-verity-fec.o endif - -ifeq ($(CONFIG_DM_ANDROID_VERITY),y) -dm-verity-objs += dm-android-verity.o -endif diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 8505a771de42..2ff5f32a4b99 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -66,6 +66,7 @@ int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) kfree(lc); return ret; } +EXPORT_SYMBOL_GPL(dm_linear_ctr); void dm_linear_dtr(struct dm_target *ti) { @@ -74,6 +75,7 @@ void dm_linear_dtr(struct dm_target *ti) dm_put_device(ti, lc->dev); kfree(lc); } +EXPORT_SYMBOL_GPL(dm_linear_dtr); static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector) { @@ -98,6 +100,7 @@ int dm_linear_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } +EXPORT_SYMBOL_GPL(dm_linear_map); void dm_linear_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) @@ -115,6 +118,7 @@ void dm_linear_status(struct dm_target *ti, status_type_t type, break; } } +EXPORT_SYMBOL_GPL(dm_linear_status); int dm_linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) @@ -132,6 +136,7 @@ int dm_linear_prepare_ioctl(struct dm_target *ti, return 1; return 0; } +EXPORT_SYMBOL_GPL(dm_linear_prepare_ioctl); int dm_linear_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) @@ -140,6 +145,7 @@ int dm_linear_iterate_devices(struct dm_target *ti, return fn(ti, lc->dev, lc->start, ti->len, data); } +EXPORT_SYMBOL_GPL(dm_linear_iterate_devices); static struct target_type linear_target = { .name = "linear", diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 5214ed2c7507..9d3d4b297201 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -592,6 +592,7 @@ int verity_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } +EXPORT_SYMBOL_GPL(verity_map); /* * Status: V (valid) or C (corruption found) @@ -655,6 +656,7 @@ void verity_status(struct dm_target *ti, status_type_t type, break; } } +EXPORT_SYMBOL_GPL(verity_status); int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) @@ -668,6 +670,7 @@ int verity_prepare_ioctl(struct dm_target *ti, return 1; return 0; } +EXPORT_SYMBOL_GPL(verity_prepare_ioctl); int verity_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) @@ -676,6 +679,7 @@ int verity_iterate_devices(struct dm_target *ti, return fn(ti, v->data_dev, 
v->data_start, ti->len, data); } +EXPORT_SYMBOL_GPL(verity_iterate_devices); void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) { @@ -689,6 +693,7 @@ void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, limits->logical_block_size); } +EXPORT_SYMBOL_GPL(verity_io_hints); void verity_dtr(struct dm_target *ti) { @@ -719,6 +724,7 @@ void verity_dtr(struct dm_target *ti) kfree(v); } +EXPORT_SYMBOL_GPL(verity_dtr); static int verity_alloc_zero_digest(struct dm_verity *v) { @@ -1053,6 +1059,7 @@ bad: return r; } +EXPORT_SYMBOL_GPL(verity_ctr); static struct target_type verity_target = { .name = "verity", From a270309c905338503f94a104178a3175f75f57c6 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sat, 11 Jun 2016 20:32:06 +0200 Subject: [PATCH 424/813] ipv6: fix endianness error in icmpv6_err The IPv6 ping socket error handler doesn't correctly convert the new 32-bit MTU to host endianness before using it. [Cherry-pick of net dcb94b88c09ce82a80e188d49bcffdc83ba215a6] Bug: 29370996 Change-Id: Iea0ca79f16c2a1366d82b3b0a3097093d18da8b7 Cc: Lorenzo Colitti Fixes: 6d0bfe22611602f ("net: ipv6: Add IPv6 support to the ping socket.") Signed-off-by: Hannes Frederic Sowa Acked-by: Lorenzo Colitti Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index a3ec7a77a1ee..41e5c9520c7d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -98,7 +98,7 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!(type & ICMPV6_INFOMSG_MASK)) if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST) - ping_err(skb, offset, info); + ping_err(skb, offset, ntohl(info)); } static int icmpv6_rcv(struct sk_buff *skb); From 0e806c83bca02211f6b2e3de31df085eb4cf3508 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Sat, 13 Aug 2016 01:13:38 +0900 Subject: [PATCH 425/813] net: ipv6: Fix ping to link-local addresses. ping_v6_sendmsg does not set flowi6_oif in response to sin6_scope_id or sk_bound_dev_if, so it is not possible to use these APIs to ping an IPv6 address on a different interface. Instead, it sets flowi6_iif, which is incorrect but harmless. Stop setting flowi6_iif, and support various ways of setting oif in the same priority order used by udpv6_sendmsg. [Backport of net 5e457896986e16c440c97bb94b9ccd95dd157292] Bug: 29370996 Change-Id: Ibe1b9434c00ed96f1e30acb110734c6570b087b8 Tested: https://android-review.googlesource.com/#/c/254470/ Signed-off-by: Lorenzo Colitti Signed-off-by: David S.
Miller --- net/ipv6/ping.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 2cfaedf03252..9411c8d770a5 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -84,7 +84,7 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct icmp6hdr user_icmph; int addr_type; struct in6_addr *daddr; - int iif = 0; + int oif = 0; struct flowi6 fl6; int err; int hlimit; @@ -106,25 +106,30 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (u->sin6_family != AF_INET6) { return -EAFNOSUPPORT; } - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != u->sin6_scope_id) { - return -EINVAL; - } daddr = &(u->sin6_addr); - iif = u->sin6_scope_id; + if (__ipv6_addr_needs_scope_id(ipv6_addr_type(daddr))) + oif = u->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = &sk->sk_v6_daddr; } - if (!iif) - iif = sk->sk_bound_dev_if; + if (!oif) + oif = sk->sk_bound_dev_if; + + if (!oif) + oif = np->sticky_pktinfo.ipi6_ifindex; + + if (!oif && ipv6_addr_is_multicast(daddr)) + oif = np->mcast_oif; + else if (!oif) + oif = np->ucast_oif; addr_type = ipv6_addr_type(daddr); - if (__ipv6_addr_needs_scope_id(addr_type) && !iif) - return -EINVAL; - if (addr_type & IPV6_ADDR_MAPPED) + if ((__ipv6_addr_needs_scope_id(addr_type) && !oif) || + (addr_type & IPV6_ADDR_MAPPED) || + (oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if)) return -EINVAL; /* TODO: use ip6_datagram_send_ctl to get options from cmsg */ @@ -134,17 +139,13 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.saddr = np->saddr; fl6.daddr = *daddr; + fl6.flowi6_oif = oif; fl6.flowi6_mark = sk->sk_mark; fl6.flowi6_uid = sock_i_uid(sk); fl6.fl6_icmp_type = user_icmph.icmp6_type; fl6.fl6_icmp_code = user_icmph.icmp6_code; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) - fl6.flowi6_oif = np->mcast_oif; - else if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->ucast_oif; - dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr); if (IS_ERR(dst)) return PTR_ERR(dst); @@ -154,11 +155,6 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!np) return -EBADF; - if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) - fl6.flowi6_oif = np->mcast_oif; - else if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->ucast_oif; - pfh.icmph.type = user_icmph.icmp6_type; pfh.icmph.code = user_icmph.icmp6_code; pfh.icmph.checksum = 0; From d4d74af4b871915fe926d2f267e311949e0bf4b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Aug 2016 08:44:12 -0700 Subject: [PATCH 426/813] RFC: FROMLIST: locking/percpu-rwsem: Optimize readers and reduce global impact Currently the percpu-rwsem switches to (global) atomic ops while a writer is waiting, which could be quite a while, and slows down releasing the readers. This patch cures this problem by ordering the reader-state vs reader-count (see the comments in __percpu_down_read() and percpu_down_write()). This changes a global atomic op into a full memory barrier, which doesn't have the global cacheline contention. This also enables using the percpu-rwsem with rcu_sync disabled in order to bias the implementation differently, reducing the writer latency by adding some cost to readers.
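The API usage is unchanged; only the implementation below is reworked. A usage sketch (assuming the interface declared in this patch):

    static struct percpu_rw_semaphore sem;

    /* one-time setup */
    percpu_init_rwsem(&sem);

    /* reader: a per-CPU increment plus an rcu_sync_is_idle() check,
     * no global atomic while no writer is pending */
    percpu_down_read(&sem);
    /* ... read-side critical section ... */
    percpu_up_read(&sem);

    /* writer: forces readers onto the slow path via rcu_sync, then
     * waits for the per-CPU read counts to drain */
    percpu_down_write(&sem);
    /* ... exclusive section ... */
    percpu_up_write(&sem);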
Mailing-list-URL: https://lkml.org/lkml/2016/8/9/181 Cc: Paul McKenney Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) [jstultz: Backported to 4.4] Change-Id: I8ea04b4dca2ec36f1c2469eccafde1423490572f Signed-off-by: John Stultz --- include/linux/percpu-rwsem.h | 84 +++++++++-- kernel/locking/percpu-rwsem.c | 265 +++++++++++++++++++--------------- 2 files changed, 225 insertions(+), 124 deletions(-) diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index c2fa3ecb0dce..146efefde2a1 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -10,30 +10,96 @@ struct percpu_rw_semaphore { struct rcu_sync rss; - unsigned int __percpu *fast_read_ctr; + unsigned int __percpu *read_count; struct rw_semaphore rw_sem; - atomic_t slow_read_ctr; - wait_queue_head_t write_waitq; + wait_queue_head_t writer; + int readers_block; }; -extern void percpu_down_read(struct percpu_rw_semaphore *); -extern int percpu_down_read_trylock(struct percpu_rw_semaphore *); -extern void percpu_up_read(struct percpu_rw_semaphore *); +extern int __percpu_down_read(struct percpu_rw_semaphore *, int); +extern void __percpu_up_read(struct percpu_rw_semaphore *); + +static inline void percpu_down_read(struct percpu_rw_semaphore *sem) +{ + might_sleep(); + + rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_); + + preempt_disable(); + /* + * We are in an RCU-sched read-side critical section, so the writer + * cannot both change sem->state from readers_fast and start checking + * counters while we are here. So if we see !sem->state, we know that + * the writer won't be checking until we're past the preempt_enable() + * and that once the synchronize_sched() is done, the writer will see + * anything we did within this RCU-sched read-side critical section. + */ + __this_cpu_inc(*sem->read_count); + if (unlikely(!rcu_sync_is_idle(&sem->rss))) + __percpu_down_read(sem, false); /* Unconditional memory barrier */ + preempt_enable(); + /* + * The barrier() from preempt_enable() prevents the compiler from + * bleeding the critical section out. + */ +} + +static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem) +{ + int ret = 1; + + preempt_disable(); + /* + * Same as in percpu_down_read(). + */ + __this_cpu_inc(*sem->read_count); + if (unlikely(!rcu_sync_is_idle(&sem->rss))) + ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */ + preempt_enable(); + /* + * The barrier() from preempt_enable() prevents the compiler from + * bleeding the critical section out. + */ + + if (ret) + rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_); + + return ret; +} + +static inline void percpu_up_read(struct percpu_rw_semaphore *sem) +{ + /* + * The barrier() in preempt_disable() prevents the compiler from + * bleeding the critical section out. + */ + preempt_disable(); + /* + * Same as in percpu_down_read(). 
+ */ + if (likely(rcu_sync_is_idle(&sem->rss))) + __this_cpu_dec(*sem->read_count); + else + __percpu_up_read(sem); /* Unconditional memory barrier */ + preempt_enable(); + + rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_); +} extern void percpu_down_write(struct percpu_rw_semaphore *); extern void percpu_up_write(struct percpu_rw_semaphore *); extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, const char *, struct lock_class_key *); + extern void percpu_free_rwsem(struct percpu_rw_semaphore *); -#define percpu_init_rwsem(brw) \ +#define percpu_init_rwsem(sem) \ ({ \ static struct lock_class_key rwsem_key; \ - __percpu_init_rwsem(brw, #brw, &rwsem_key); \ + __percpu_init_rwsem(sem, #sem, &rwsem_key); \ }) - #define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem) static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f231e0bb311c..ce182599cf2e 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -8,151 +8,186 @@ #include #include -int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, +int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *rwsem_key) { - brw->fast_read_ctr = alloc_percpu(int); - if (unlikely(!brw->fast_read_ctr)) + sem->read_count = alloc_percpu(int); + if (unlikely(!sem->read_count)) return -ENOMEM; /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ - __init_rwsem(&brw->rw_sem, name, rwsem_key); - rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); - atomic_set(&brw->slow_read_ctr, 0); - init_waitqueue_head(&brw->write_waitq); + rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); + __init_rwsem(&sem->rw_sem, name, rwsem_key); + init_waitqueue_head(&sem->writer); + sem->readers_block = 0; return 0; } EXPORT_SYMBOL_GPL(__percpu_init_rwsem); -void percpu_free_rwsem(struct percpu_rw_semaphore *brw) +void percpu_free_rwsem(struct percpu_rw_semaphore *sem) { /* * XXX: temporary kludge. The error path in alloc_super() * assumes that percpu_free_rwsem() is safe after kzalloc(). */ - if (!brw->fast_read_ctr) + if (!sem->read_count) return; - rcu_sync_dtor(&brw->rss); - free_percpu(brw->fast_read_ctr); - brw->fast_read_ctr = NULL; /* catch use after free bugs */ + rcu_sync_dtor(&sem->rss); + free_percpu(sem->read_count); + sem->read_count = NULL; /* catch use after free bugs */ } +EXPORT_SYMBOL_GPL(percpu_free_rwsem); -/* - * This is the fast-path for down_read/up_read. If it succeeds we rely - * on the barriers provided by rcu_sync_enter/exit; see the comments in - * percpu_down_write() and percpu_up_write(). - * - * If this helper fails the callers rely on the normal rw_semaphore and - * atomic_dec_and_test(), so in this case we have the necessary barriers. - */ -static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) -{ - bool success; - - preempt_disable(); - success = rcu_sync_is_idle(&brw->rss); - if (likely(success)) - __this_cpu_add(*brw->fast_read_ctr, val); - preempt_enable(); - - return success; -} - -/* - * Like the normal down_read() this is not recursive, the writer can - * come after the first percpu_down_read() and create the deadlock. - * - * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, - * percpu_up_read() does rwsem_release(). This pairs with the usage - * of ->rw_sem in percpu_down/up_write(). 
- */ -void percpu_down_read(struct percpu_rw_semaphore *brw) -{ - might_sleep(); - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); - - if (likely(update_fast_ctr(brw, +1))) - return; - - /* Avoid rwsem_acquire_read() and rwsem_release() */ - __down_read(&brw->rw_sem); - atomic_inc(&brw->slow_read_ctr); - __up_read(&brw->rw_sem); -} -EXPORT_SYMBOL_GPL(percpu_down_read); - -int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) -{ - if (unlikely(!update_fast_ctr(brw, +1))) { - if (!__down_read_trylock(&brw->rw_sem)) - return 0; - atomic_inc(&brw->slow_read_ctr); - __up_read(&brw->rw_sem); - } - - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); - return 1; -} - -void percpu_up_read(struct percpu_rw_semaphore *brw) -{ - rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); - - if (likely(update_fast_ctr(brw, -1))) - return; - - /* false-positive is possible but harmless */ - if (atomic_dec_and_test(&brw->slow_read_ctr)) - wake_up_all(&brw->write_waitq); -} -EXPORT_SYMBOL_GPL(percpu_up_read); - -static int clear_fast_ctr(struct percpu_rw_semaphore *brw) -{ - unsigned int sum = 0; - int cpu; - - for_each_possible_cpu(cpu) { - sum += per_cpu(*brw->fast_read_ctr, cpu); - per_cpu(*brw->fast_read_ctr, cpu) = 0; - } - - return sum; -} - -void percpu_down_write(struct percpu_rw_semaphore *brw) +int __percpu_down_read(struct percpu_rw_semaphore *sem, int try) { /* - * Make rcu_sync_is_idle() == F and thus disable the fast-path in - * percpu_down_read() and percpu_up_read(), and wait for gp pass. + * Due to having preemption disabled the decrement happens on + * the same CPU as the increment, avoiding the + * increment-on-one-CPU-and-decrement-on-another problem. * - * The latter synchronises us with the preceding readers which used - * the fast-past, so we can not miss the result of __this_cpu_add() - * or anything else inside their criticial sections. + * If the reader misses the writer's assignment of readers_block, then + * the writer is guaranteed to see the reader's increment. + * + * Conversely, any readers that increment their sem->read_count after + * the writer looks are guaranteed to see the readers_block value, + * which in turn means that they are guaranteed to immediately + * decrement their sem->read_count, so that it doesn't matter that the + * writer missed them. */ - rcu_sync_enter(&brw->rss); - /* exclude other writers, and block the new readers completely */ - down_write(&brw->rw_sem); + smp_mb(); /* A matches D */ - /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ - atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); + /* + * If !readers_block the critical section starts here, matched by the + * release in percpu_up_write(). + */ + if (likely(!smp_load_acquire(&sem->readers_block))) + return 1; - /* wait for all readers to complete their percpu_up_read() */ - wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); + /* + * Per the above comment; we still have preemption disabled and + * will thus decrement on the same CPU as we incremented. + */ + __percpu_up_read(sem); + + if (try) + return 0; + + /* + * We either call schedule() in the wait, or we'll fall through + * and reschedule on the preempt_enable() in percpu_down_read(). + */ + preempt_enable_no_resched(); + + /* + * Avoid lockdep for the down/up_read() we already have them. 
+ */ + __down_read(&sem->rw_sem); + this_cpu_inc(*sem->read_count); + __up_read(&sem->rw_sem); + + preempt_disable(); + return 1; +} +EXPORT_SYMBOL_GPL(__percpu_down_read); + +void __percpu_up_read(struct percpu_rw_semaphore *sem) +{ + smp_mb(); /* B matches C */ + /* + * In other words, if they see our decrement (presumably to aggregate + * zero, as that is the only time it matters) they will also see our + * critical section. + */ + __this_cpu_dec(*sem->read_count); + + /* Prod writer to recheck readers_active */ + wake_up(&sem->writer); +} +EXPORT_SYMBOL_GPL(__percpu_up_read); + +#define per_cpu_sum(var) \ +({ \ + typeof(var) __sum = 0; \ + int cpu; \ + compiletime_assert_atomic_type(__sum); \ + for_each_possible_cpu(cpu) \ + __sum += per_cpu(var, cpu); \ + __sum; \ +}) + +/* + * Return true if the modular sum of the sem->read_count per-CPU variable is + * zero. If this sum is zero, then it is stable due to the fact that if any + * newly arriving readers increment a given counter, they will immediately + * decrement that same counter. + */ +static bool readers_active_check(struct percpu_rw_semaphore *sem) +{ + if (per_cpu_sum(*sem->read_count) != 0) + return false; + + /* + * If we observed the decrement; ensure we see the entire critical + * section. + */ + + smp_mb(); /* C matches B */ + + return true; +} + +void percpu_down_write(struct percpu_rw_semaphore *sem) +{ + /* Notify readers to take the slow path. */ + rcu_sync_enter(&sem->rss); + + down_write(&sem->rw_sem); + + /* + * Notify new readers to block; up until now, and thus throughout the + * longish rcu_sync_enter() above, new readers could still come in. + */ + WRITE_ONCE(sem->readers_block, 1); + + smp_mb(); /* D matches A */ + + /* + * If they don't see our writer of readers_block, then we are + * guaranteed to see their sem->read_count increment, and therefore + * will wait for them. + */ + + /* Wait for all now active readers to complete. */ + wait_event(sem->writer, readers_active_check(sem)); } EXPORT_SYMBOL_GPL(percpu_down_write); -void percpu_up_write(struct percpu_rw_semaphore *brw) +void percpu_up_write(struct percpu_rw_semaphore *sem) { - /* release the lock, but the readers can't use the fast-path */ - up_write(&brw->rw_sem); /* - * Enable the fast-path in percpu_down_read() and percpu_up_read() - * but only after another gp pass; this adds the necessary barrier - * to ensure the reader can't miss the changes done by us. + * Signal the writer is done, no fast path yet. + * + * One reason that we cannot just immediately flip to readers_fast is + * that new readers might fail to see the results of this writer's + * critical section. + * + * Therefore we force it through the slow path which guarantees an + * acquire and thereby guarantees the critical section's consistency. */ - rcu_sync_exit(&brw->rss); + smp_store_release(&sem->readers_block, 0); + + /* + * Release the write lock, this will allow readers back in the game. + */ + up_write(&sem->rw_sem); + + /* + * Once this completes (at least one RCU-sched grace period hence) the + * reader fast path will be available again. Safe to use outside the + * exclusive write lock because its counting. 
+ */ + rcu_sync_exit(&sem->rss); } EXPORT_SYMBOL_GPL(percpu_up_write); From a81c69e149012f7437d8edd9e84ca141c2228cd6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Aug 2016 18:54:13 +0200 Subject: [PATCH 427/813] RFC: FROMLIST: cgroup: avoid synchronize_sched() in __cgroup_procs_write() The current percpu-rwsem read side is entirely free of serializing insns at the cost of having a synchronize_sched() in the write path. The latency of the synchronize_sched() is too high for cgroups. The commit 1ed1328792ff talks about the write path being a fairly cold path, but this is not the case for Android, which moves tasks to the foreground cgroup and back around binder IPC calls from foreground processes to background processes, so it is significantly hotter than human-initiated operations. Switch cgroup_threadgroup_rwsem into the slow mode for now to avoid the problem; it should hopefully not be that slow after commit 80127a39681b ("locking/percpu-rwsem: Optimize readers and reduce global impact"). We could just add rcu_sync_enter() into cgroup_init() but we do not want another synchronize_sched() at boot time, so this patch adds a new helper which doesn't block but currently can only be called before the first use. Cc: Tejun Heo Cc: Paul McKenney Reported-by: John Stultz Reported-by: Dmitry Shmidt Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Oleg Nesterov [jstultz: backported to 4.4] Change-Id: I34aa9c394d3052779b56976693e96d861bd255f2 Mailing-list-URL: https://lkml.org/lkml/2016/8/11/557 Signed-off-by: John Stultz --- include/linux/rcu_sync.h | 1 + kernel/cgroup.c | 6 ++++++ kernel/rcu/sync.c | 12 ++++++++++++ 3 files changed, 19 insertions(+) diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h index a63a33e6196e..ece7ed9a4a70 100644 --- a/include/linux/rcu_sync.h +++ b/include/linux/rcu_sync.h @@ -59,6 +59,7 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) } extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type); +extern void rcu_sync_enter_start(struct rcu_sync *); extern void rcu_sync_enter(struct rcu_sync *); extern void rcu_sync_exit(struct rcu_sync *); extern void rcu_sync_dtor(struct rcu_sync *); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8191a95f9952..f53e61f95b55 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5380,6 +5380,12 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + /* + * The latency of the synchronize_sched() is too high for cgroups, + * avoid it at the cost of forcing all readers into the slow path. + */ + rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); + mutex_lock(&cgroup_mutex); /* Add init_css_set to the hash table */ diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index be922c9f3d37..e358313a0d6c 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -82,6 +82,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) rsp->gp_type = type; } +/** + * Must be called after rcu_sync_init() and before first use. + * + * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() + * pairs turn into NO-OPs. 
+ */ +void rcu_sync_enter_start(struct rcu_sync *rsp) +{ + rsp->gp_count++; + rsp->gp_state = GP_PASSED; +} + /** * rcu_sync_enter() - Force readers onto slowpath * @rsp: Pointer to rcu_sync structure to use for synchronization From 48bb58c01270ba34509ccdfa08b10bf2afe8b41c Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 10 Aug 2016 15:43:06 -0400 Subject: [PATCH 428/813] RFC: FROMLIST: cgroup: reduce read locked section of cgroup_threadgroup_rwsem during fork cgroup_threadgroup_rwsem is acquired in read mode during process exit and fork. It is also grabbed in write mode during __cgroups_proc_write(). I've recently run into a scenario with lots of memory pressure and OOM and I am beginning to see systemd blocked with the following stack: __switch_to+0x1f8/0x350 __schedule+0x30c/0x990 schedule+0x48/0xc0 percpu_down_write+0x114/0x170 __cgroup_procs_write.isra.12+0xb8/0x3c0 cgroup_file_write+0x74/0x1a0 kernfs_fop_write+0x188/0x200 __vfs_write+0x6c/0xe0 vfs_write+0xc0/0x230 SyS_write+0x6c/0x110 system_call+0x38/0xb4 This thread is waiting on the reader of cgroup_threadgroup_rwsem to exit. The reader itself is under memory pressure and has gone into reclaim after fork. At times the reader also ends up waiting on oom_lock: __switch_to+0x1f8/0x350 __schedule+0x30c/0x990 schedule+0x48/0xc0 jbd2_log_wait_commit+0xd4/0x180 ext4_evict_inode+0x88/0x5c0 evict+0xf8/0x2a0 dispose_list+0x50/0x80 prune_icache_sb+0x6c/0x90 super_cache_scan+0x190/0x210 shrink_slab.part.15+0x22c/0x4c0 shrink_zone+0x288/0x3c0 do_try_to_free_pages+0x1dc/0x590 try_to_free_pages+0xdc/0x260 __alloc_pages_nodemask+0x72c/0xc90 alloc_pages_current+0xb4/0x1a0 page_table_alloc+0xc0/0x170 __pte_alloc+0x58/0x1f0 copy_page_range+0x4ec/0x950 copy_process.isra.5+0x15a0/0x1870 _do_fork+0xa8/0x4b0 ppc_clone+0x8/0xc In the meantime, all processes exiting/forking are blocked, almost stalling the system. This patch moves the threadgroup_change_begin from before cgroup_fork() to just before cgroup_canfork(). There is no need to worry about threadgroup changes until the task is actually added to the threadgroup. This avoids having to call reclaim with cgroup_threadgroup_rwsem held. tj: Subject and description edits. Signed-off-by: Balbir Singh Acked-by: Zefan Li Cc: Oleg Nesterov Cc: Andrew Morton Cc: stable@vger.kernel.org # v4.2+ Signed-off-by: Tejun Heo [jstultz: Cherry-picked from: git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 568ac888215c7f] Change-Id: Ie8ece84fb613cf6a7b08cea1468473a8df2b9661 Signed-off-by: John Stultz --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 7ec6e9939b2c..d6a6da547e41 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1370,7 +1370,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; - threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1522,6 +1521,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; + threadgroup_change_begin(current); /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. 
It should be noted that the new process's css_set can be changed @@ -1622,6 +1622,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, bad_fork_cancel_cgroup: cgroup_cancel_fork(p, cgrp_ss_priv); bad_fork_free_pid: + threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_io: @@ -1652,7 +1653,6 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: #endif - threadgroup_change_end(current); delayacct_tsk_free(p); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); From 8b94247342c5d606af162aa4da6618481d9c1ca0 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 31 Aug 2016 09:52:16 -0700 Subject: [PATCH 429/813] ANDROID: rcu_sync: Export rcu_sync_lockdep_assert x86_64:allmodconfig fails to build with the following error. ERROR: "rcu_sync_lockdep_assert" [kernel/locking/locktorture.ko] undefined! Introduced by commit 3228c5eb7af2 ("RFC: FROMLIST: locking/percpu-rwsem: Optimize readers and reduce global impact"). The applied upstream version exports the missing symbol, so let's do the same. Change-Id: If4e516715c3415fe8c82090f287174857561550d Fixes: 3228c5eb7af2 ("RFC: FROMLIST: locking/percpu-rwsem: Optimize ...") Signed-off-by: Guenter Roeck --- kernel/rcu/sync.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index e358313a0d6c..b49cf3ac2d47 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -68,6 +68,7 @@ void rcu_sync_lockdep_assert(struct rcu_sync *rsp) RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), "suspicious rcu_sync_is_idle() usage"); } +EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert); #endif /** From 7988ef0cccaef65a0f0d9f5dc28150d2347c1f47 Mon Sep 17 00:00:00 2001 From: Yongqin Liu Date: Thu, 1 Sep 2016 22:06:04 +0530 Subject: [PATCH 430/813] ANDROID: base-cfg: enable SECCOMP config Enable the following seccomp configs CONFIG_SECCOMP=y CONFIG_SECCOMP_FILTER=y Otherwise we will get a mediacodec error like this on Android N: E /system/bin/mediaextractor: libminijail: prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER): Invalid argument Change-Id: I2477b6a2cfdded5c0ebf6ffbb6150b0e5fe2ba12 Signed-off-by: Yongqin Liu Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 6496bb3961a2..288bab2d858d 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -141,6 +141,8 @@ CONFIG_PROFILING=y CONFIG_QUOTA=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y +CONFIG_SECCOMP=y +CONFIG_SECCOMP_FILTER=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y From c70373ec84c24c5f8b5a7393ed6efa25bab0592f Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 28 Jan 2016 09:22:44 -0200 Subject: [PATCH 431/813] UPSTREAM: [media] xc2028: avoid use after free (cherry picked from commit 8dfbcc4351a0b6d2f2d77f367552f48ffefafe18) If struct xc2028_config is passed without a firmware name, the following trouble may happen: [11009.907205] xc2028 5-0061: type set to XCeive xc2028/xc3028 tuner [11009.907491] ================================================================== [11009.907750] BUG: KASAN: use-after-free in strcmp+0x96/0xb0 at addr ffff8803bd78ab40 [11009.907992] Read of size 1 by task modprobe/28992 [11009.907994] ============================================================================= [11009.907997] BUG kmalloc-16 (Tainted: G W ): kasan: bad access detected [11009.907999] 
----------------------------------------------------------------------------- [11009.908008] INFO: Allocated in xhci_urb_enqueue+0x214/0x14c0 [xhci_hcd] age=0 cpu=3 pid=28992 [11009.908012] ___slab_alloc+0x581/0x5b0 [11009.908014] __slab_alloc+0x51/0x90 [11009.908017] __kmalloc+0x27b/0x350 [11009.908022] xhci_urb_enqueue+0x214/0x14c0 [xhci_hcd] [11009.908026] usb_hcd_submit_urb+0x1e8/0x1c60 [11009.908029] usb_submit_urb+0xb0e/0x1200 [11009.908032] usb_serial_generic_write_start+0xb6/0x4c0 [11009.908035] usb_serial_generic_write+0x92/0xc0 [11009.908039] usb_console_write+0x38a/0x560 [11009.908045] call_console_drivers.constprop.14+0x1ee/0x2c0 [11009.908051] console_unlock+0x40d/0x900 [11009.908056] vprintk_emit+0x4b4/0x830 [11009.908061] vprintk_default+0x1f/0x30 [11009.908064] printk+0x99/0xb5 [11009.908067] kasan_report_error+0x10a/0x550 [11009.908070] __asan_report_load1_noabort+0x43/0x50 [11009.908074] INFO: Freed in xc2028_set_config+0x90/0x630 [tuner_xc2028] age=1 cpu=3 pid=28992 [11009.908077] __slab_free+0x2ec/0x460 [11009.908080] kfree+0x266/0x280 [11009.908083] xc2028_set_config+0x90/0x630 [tuner_xc2028] [11009.908086] xc2028_attach+0x310/0x8a0 [tuner_xc2028] [11009.908090] em28xx_attach_xc3028.constprop.7+0x1f9/0x30d [em28xx_dvb] [11009.908094] em28xx_dvb_init.part.3+0x8e4/0x5cf4 [em28xx_dvb] [11009.908098] em28xx_dvb_init+0x81/0x8a [em28xx_dvb] [11009.908101] em28xx_register_extension+0xd9/0x190 [em28xx] [11009.908105] em28xx_dvb_register+0x10/0x1000 [em28xx_dvb] [11009.908108] do_one_initcall+0x141/0x300 [11009.908111] do_init_module+0x1d0/0x5ad [11009.908114] load_module+0x6666/0x9ba0 [11009.908117] SyS_finit_module+0x108/0x130 [11009.908120] entry_SYSCALL_64_fastpath+0x16/0x76 [11009.908123] INFO: Slab 0xffffea000ef5e280 objects=25 used=25 fp=0x (null) flags=0x2ffff8000004080 [11009.908126] INFO: Object 0xffff8803bd78ab40 @offset=2880 fp=0x0000000000000001 [11009.908130] Bytes b4 ffff8803bd78ab30: 01 00 00 00 2a 07 00 00 9d 28 00 00 01 00 00 00 ....*....(...... [11009.908133] Object ffff8803bd78ab40: 01 00 00 00 00 00 00 00 b0 1d c3 6a 00 88 ff ff ...........j.... [11009.908137] CPU: 3 PID: 28992 Comm: modprobe Tainted: G B W 4.5.0-rc1+ #43 [11009.908140] Hardware name: /NUC5i7RYB, BIOS RYBDWi35.86A.0350.2015.0812.1722 08/12/2015 [11009.908142] ffff8803bd78a000 ffff8802c273f1b8 ffffffff81932007 ffff8803c6407a80 [11009.908148] ffff8802c273f1e8 ffffffff81556759 ffff8803c6407a80 ffffea000ef5e280 [11009.908153] ffff8803bd78ab40 dffffc0000000000 ffff8802c273f210 ffffffff8155ccb4 [11009.908158] Call Trace: [11009.908162] [] dump_stack+0x4b/0x64 [11009.908165] [] print_trailer+0xf9/0x150 [11009.908168] [] object_err+0x34/0x40 [11009.908171] [] kasan_report_error+0x230/0x550 [11009.908175] [] ? trace_hardirqs_off_caller+0x21/0x290 [11009.908179] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908182] [] __asan_report_load1_noabort+0x43/0x50 [11009.908185] [] ? __asan_register_globals+0x50/0xa0 [11009.908189] [] ? strcmp+0x96/0xb0 [11009.908192] [] strcmp+0x96/0xb0 [11009.908196] [] xc2028_set_config+0x15c/0x630 [tuner_xc2028] [11009.908200] [] xc2028_attach+0x310/0x8a0 [tuner_xc2028] [11009.908203] [] ? memset+0x28/0x30 [11009.908206] [] ? xc2028_set_config+0x630/0x630 [tuner_xc2028] [11009.908211] [] em28xx_attach_xc3028.constprop.7+0x1f9/0x30d [em28xx_dvb] [11009.908215] [] ? em28xx_dvb_init.part.3+0x37c/0x5cf4 [em28xx_dvb] [11009.908219] [] ? hauppauge_hvr930c_init+0x487/0x487 [em28xx_dvb] [11009.908222] [] ? lgdt330x_attach+0x1cc/0x370 [lgdt330x] [11009.908226] [] ? 
i2c_read_demod_bytes.isra.2+0x210/0x210 [lgdt330x] [11009.908230] [] ? ref_module.part.15+0x10/0x10 [11009.908233] [] ? module_assert_mutex_or_preempt+0x80/0x80 [11009.908238] [] em28xx_dvb_init.part.3+0x8e4/0x5cf4 [em28xx_dvb] [11009.908242] [] ? em28xx_attach_xc3028.constprop.7+0x30d/0x30d [em28xx_dvb] [11009.908245] [] ? string+0x14d/0x1f0 [11009.908249] [] ? symbol_string+0xff/0x1a0 [11009.908253] [] ? uuid_string+0x6f0/0x6f0 [11009.908257] [] ? __kernel_text_address+0x7e/0xa0 [11009.908260] [] ? print_context_stack+0x7f/0xf0 [11009.908264] [] ? __module_address+0xb6/0x360 [11009.908268] [] ? is_ftrace_trampoline+0x99/0xe0 [11009.908271] [] ? __kernel_text_address+0x7e/0xa0 [11009.908275] [] ? debug_check_no_locks_freed+0x290/0x290 [11009.908278] [] ? dump_trace+0x11b/0x300 [11009.908282] [] ? em28xx_register_extension+0x23/0x190 [em28xx] [11009.908285] [] ? trace_hardirqs_off_caller+0x21/0x290 [11009.908289] [] ? trace_hardirqs_on_caller+0x16/0x590 [11009.908292] [] ? trace_hardirqs_on+0xd/0x10 [11009.908296] [] ? em28xx_register_extension+0x23/0x190 [em28xx] [11009.908299] [] ? mutex_trylock+0x400/0x400 [11009.908302] [] ? do_one_initcall+0x131/0x300 [11009.908306] [] ? call_rcu_sched+0x17/0x20 [11009.908309] [] ? put_object+0x48/0x70 [11009.908314] [] em28xx_dvb_init+0x81/0x8a [em28xx_dvb] [11009.908317] [] em28xx_register_extension+0xd9/0x190 [em28xx] [11009.908320] [] ? 0xffffffffa0150000 [11009.908324] [] em28xx_dvb_register+0x10/0x1000 [em28xx_dvb] [11009.908327] [] do_one_initcall+0x141/0x300 [11009.908330] [] ? try_to_run_init_process+0x40/0x40 [11009.908333] [] ? trace_hardirqs_on_caller+0x16/0x590 [11009.908337] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908340] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908343] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908346] [] ? __asan_register_globals+0x87/0xa0 [11009.908350] [] do_init_module+0x1d0/0x5ad [11009.908353] [] load_module+0x6666/0x9ba0 [11009.908356] [] ? symbol_put_addr+0x50/0x50 [11009.908361] [] ? em28xx_dvb_init.part.3+0x5989/0x5cf4 [em28xx_dvb] [11009.908366] [] ? module_frob_arch_sections+0x20/0x20 [11009.908369] [] ? open_exec+0x50/0x50 [11009.908374] [] ? ns_capable+0x5b/0xd0 [11009.908377] [] SyS_finit_module+0x108/0x130 [11009.908379] [] ? SyS_init_module+0x1f0/0x1f0 [11009.908383] [] ? lockdep_sys_exit_thunk+0x12/0x14 [11009.908394] [] entry_SYSCALL_64_fastpath+0x16/0x76 [11009.908396] Memory state around the buggy address: [11009.908398] ffff8803bd78aa00: 00 00 fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908401] ffff8803bd78aa80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908403] >ffff8803bd78ab00: fc fc fc fc fc fc fc fc 00 00 fc fc fc fc fc fc [11009.908405] ^ [11009.908407] ffff8803bd78ab80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908409] ffff8803bd78ac00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908411] ================================================================== In order to avoid it, let's set the cached value of the firmware name to NULL after freeing it. While here, return an error if the memory allocation fails. 
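For illustration, the pattern being fixed is the stale-cache variant of use after free: a kfree()'d pointer stays reachable from long-lived state and is later read (here by strcmp() on priv->ctrl.fname). A minimal sketch of the idiom the fix applies, with simplified stand-in names rather than the actual driver structures:

	/* assumes kernel context with <linux/slab.h>; ctrl mirrors the cached config */
	struct ctrl {
		char *fname;	/* cached firmware name, may go stale */
	};

	static int update_fname(struct ctrl *c, const char *new_fname)
	{
		kfree(c->fname);
		c->fname = NULL;	/* never leave a dangling pointer behind */

		if (new_fname) {
			c->fname = kstrdup(new_fname, GFP_KERNEL);
			if (!c->fname)
				return -ENOMEM;
		}
		return 0;
	}

Resetting the pointer immediately after kfree() means a later reader sees NULL instead of freed memory, turning a silent use after free into an easily detected condition.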
Signed-off-by: Mauro Carvalho Chehab Change-Id: I945c841dcfb45de2056267e4aa50bbe176b527cf Bug: 30946097 --- drivers/media/tuners/tuner-xc2028.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/tuners/tuner-xc2028.c b/drivers/media/tuners/tuner-xc2028.c index 4e941f00b600..082ff5608455 100644 --- a/drivers/media/tuners/tuner-xc2028.c +++ b/drivers/media/tuners/tuner-xc2028.c @@ -1403,11 +1403,12 @@ static int xc2028_set_config(struct dvb_frontend *fe, void *priv_cfg) * in order to avoid troubles during device release. */ kfree(priv->ctrl.fname); + priv->ctrl.fname = NULL; memcpy(&priv->ctrl, p, sizeof(priv->ctrl)); if (p->fname) { priv->ctrl.fname = kstrdup(p->fname, GFP_KERNEL); if (priv->ctrl.fname == NULL) - rc = -ENOMEM; + return -ENOMEM; } /* From 6bc91eb13fe93cfdc23bdf2188ce9b987a3d0712 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 3 Feb 2016 13:34:00 -0200 Subject: [PATCH 432/813] UPSTREAM: [media] xc2028: unlock on error in xc2028_set_config() (cherry picked from commit 210bd104c6acd31c3c6b8b075b3f12d4a9f6b60d) We have to unlock before returning -ENOMEM. Fixes: 8dfbcc4351a0 ('[media] xc2028: avoid use after free') Signed-off-by: Dan Carpenter Signed-off-by: Mauro Carvalho Chehab Change-Id: I7b6ba9fde5c6e29467e6de23d398af2fe56e2547 Bug: 30946097 --- drivers/media/tuners/tuner-xc2028.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/media/tuners/tuner-xc2028.c b/drivers/media/tuners/tuner-xc2028.c index 082ff5608455..317ef63ee789 100644 --- a/drivers/media/tuners/tuner-xc2028.c +++ b/drivers/media/tuners/tuner-xc2028.c @@ -1407,8 +1407,10 @@ static int xc2028_set_config(struct dvb_frontend *fe, void *priv_cfg) memcpy(&priv->ctrl, p, sizeof(priv->ctrl)); if (p->fname) { priv->ctrl.fname = kstrdup(p->fname, GFP_KERNEL); - if (priv->ctrl.fname == NULL) - return -ENOMEM; + if (priv->ctrl.fname == NULL) { + rc = -ENOMEM; + goto unlock; + } } /* @@ -1440,6 +1442,7 @@ static int xc2028_set_config(struct dvb_frontend *fe, void *priv_cfg) } else priv->state = XC2028_WAITING_FIRMWARE; } +unlock: mutex_unlock(&priv->lock); return rc; From bfdbb3be1eaaf13cfbb7ac421356369e53fa7d0f Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Fri, 2 Sep 2016 10:13:21 +0530 Subject: [PATCH 433/813] ANDROID: base-cfg: drop SECCOMP_FILTER config Don't need to set SECCOMP_FILTER explicitly since CONFIG_SECCOMP=y will select that config anyway. 
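For context, the reason the explicit entry is redundant (an excerpt of the upstream arch/Kconfig definition from this era; exact wording may differ between kernel versions): SECCOMP_FILTER is not a user-set option but a def_bool derived from SECCOMP:

	config SECCOMP_FILTER
		def_bool y
		depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET

so on architectures with filter support, CONFIG_SECCOMP=y together with CONFIG_NET=y already yields CONFIG_SECCOMP_FILTER=y without listing it in the fragment.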
Fixes: a49dcf2e745c ("ANDROID: base-cfg: enable SECCOMP config") Change-Id: Iff18ed4d2db5a55b9f9480d5ecbeef7b818b3837 Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 288bab2d858d..34fa64a669ac 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -142,7 +142,6 @@ CONFIG_QUOTA=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y CONFIG_SECCOMP=y -CONFIG_SECCOMP_FILTER=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y From 2a111a3c242aed0fbbe6024a65f652c7ba853c8a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Aug 2016 05:56:26 -0700 Subject: [PATCH 434/813] UPSTREAM: tcp: fix use after free in tcp_xmit_retransmit_queue() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry picked from commit bb1fceca22492109be12640d49f5ea5a544c6bb4) When tcp_sendmsg() allocates a fresh and empty skb, it puts it at the tail of the write queue using tcp_add_write_queue_tail() Then it attempts to copy user data into this fresh skb. If the copy fails, we undo the work and remove the fresh skb. Unfortunately, this undo lacks the change done to tp->highest_sack and we can leave a dangling pointer (to a freed skb) Later, tcp_xmit_retransmit_queue() can dereference this pointer and access freed memory. For regular kernels where memory is not unmapped, this might cause SACK bugs because tcp_highest_sack_seq() is buggy, returning garbage instead of tp->snd_nxt, but with various debug features like CONFIG_DEBUG_PAGEALLOC, this can crash the kernel. This bug was found by Marco Grassi thanks to syzkaller. Fixes: 6859d49475d4 ("[TCP]: Abstract tp->highest_sack accessing & point to next skb") Reported-by: Marco Grassi Signed-off-by: Eric Dumazet Cc: Ilpo Järvinen Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Reviewed-by: Cong Wang Signed-off-by: David S. 
Miller Change-Id: I58bb02d6e4e399612e8580b9e02d11e661df82f5 Bug: 31183296 --- include/net/tcp.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/net/tcp.h b/include/net/tcp.h index b36cebad6b2f..6c480679423e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1513,6 +1513,8 @@ static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unli { if (sk->sk_send_head == skb_unlinked) sk->sk_send_head = NULL; + if (tcp_sk(sk)->highest_sack == skb_unlinked) + tcp_sk(sk)->highest_sack = NULL; } static inline void tcp_init_send_head(struct sock *sk) From 0202669aab680f7a4868be1ea387925ee3199357 Mon Sep 17 00:00:00 2001 From: Mohamad Ayyash Date: Wed, 11 May 2016 13:18:35 -0700 Subject: [PATCH 435/813] BACKPORT: Don't show empty tag stats for unprivileged uids BUG: 27577101 BUG: 27532522 Signed-off-by: Mohamad Ayyash --- net/netfilter/xt_qtaguid.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index e2e7d54f9bb1..3bf0c59dab2f 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1946,7 +1946,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) ); f_count = atomic_long_read( &sock_tag_entry->socket->file->f_count); - seq_printf(m, "sock=%p tag=0x%llx (uid=%u) pid=%u " + seq_printf(m, "sock=%pK tag=0x%llx (uid=%u) pid=%u " "f_count=%lu\n", sock_tag_entry->sk, sock_tag_entry->tag, uid, @@ -2548,8 +2548,7 @@ static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, uid_t stat_uid = get_uid_from_tag(tag); struct proc_print_info *ppi = m->private; /* Detailed tags are not available to everybody */ - if (get_atag_from_tag(tag) && !can_read_other_uid_stats( - make_kuid(&init_user_ns,stat_uid))) { + if (!can_read_other_uid_stats(make_kuid(&init_user_ns,stat_uid))) { CT_DEBUG("qtaguid: stats line: " "%s 0x%llx %u: insufficient priv " "from pid=%u tgid=%u uid=%u stats.gid=%u\n", From de3e4231c642ecdb41522edbf4d74677b136c76a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 21:28:47 +0100 Subject: [PATCH 436/813] BACKPORT: ARM: 8583/1: mm: fix location of _etext The _etext position is defined to be the end of the kernel text code, and should not include any part of the data segments. This interferes with things that might check memory ranges and expect executable code up to _etext. Just to be conservative, leave the kernel resource as it was, using __init_begin instead of _etext as the end mark. 
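To make the invariant concrete: checks of the following shape are what break when _etext extends past the real end of code. A simplified sketch (the helper name is illustrative, not the exact kernel function):

	extern char _text[], _etext[];	/* bounds provided by the linker script */

	/* true if addr lies within the kernel's executable text */
	static inline bool in_kernel_text(unsigned long addr)
	{
		return addr >= (unsigned long)_text &&
		       addr <  (unsigned long)_etext;
	}

With data segments folded in below _etext, such a predicate misclassifies read-only data as executable code; bounding the kernel_code resource by __init_begin preserves the old resource layout while letting _etext mean strictly "end of text".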
Signed-off-by: Kees Cook Signed-off-by: Russell King Change-Id: Ida514d1359dbe6f782f562ce29b4ba09ae72bfc0 (cherry picked from commit 14c4a533e0996f95a0a64dfd0b6252d788cebc74) Signed-off-by: Sami Tolvanen --- arch/arm/kernel/setup.c | 2 +- arch/arm/kernel/vmlinux.lds.S | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 20edd349d379..bf63b4693457 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -772,7 +772,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) struct resource *res; kernel_code.start = virt_to_phys(_text); - kernel_code.end = virt_to_phys(_etext - 1); + kernel_code.end = virt_to_phys(__init_begin - 1); kernel_data.start = virt_to_phys(_sdata); kernel_data.end = virt_to_phys(_end - 1); diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 8b60fde5ce48..be2ab6d3b91f 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -120,6 +120,8 @@ SECTIONS #ifdef CONFIG_DEBUG_RODATA . = ALIGN(1<<SECTION_SHIFT); #endif + _etext = .; /* End of text section */ From: Ard Biesheuvel Date: Thu, 23 Jun 2016 15:53:17 +0200 Subject: [PATCH 437/813] BACKPORT: arm64: mm: fix location of _etext As Kees Cook notes in the ARM counterpart of this patch [0]: The _etext position is defined to be the end of the kernel text code, and should not include any part of the data segments. This interferes with things that might check memory ranges and expect executable code up to _etext. In particular, Kees is referring to the HARDENED_USERCOPY patch set [1], which rejects attempts to call copy_to_user() on kernel ranges containing executable code, but does allow access to the .rodata segment. Regardless of whether one may or may not agree with the distinction, it makes sense for _etext to have the same meaning across architectures. So let's put _etext where it belongs, between .text and .rodata, and fix up existing references to use __init_begin instead, which unlike _end_rodata includes the exception and notes sections as well. The _etext references in kaslr.c are left untouched, since its references to [_stext, _etext) are meant to capture potential jump instruction targets, and so disregarding .rodata is actually an improvement here. [0] http://article.gmane.org/gmane.linux.kernel/2245084 [1] http://thread.gmane.org/gmane.linux.kernel.hardened.devel/2502 Reported-by: Kees Cook Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Reviewed-by: Kees Cook Signed-off-by: Catalin Marinas (cherry picked from commit 9fdc14c55cd6579d619ccd9d40982e0805e62b6d) Signed-off-by: Amit Pundir --- arch/arm64/kernel/setup.c | 2 +- arch/arm64/kernel/vmlinux.lds.S | 7 ++++--- arch/arm64/mm/init.c | 4 ++-- arch/arm64/mm/mmu.c | 20 ++++++++++---------- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 42371f69def3..29b8c247d56f 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -202,7 +202,7 @@ static void __init request_standard_resources(void) struct resource *res; kernel_code.start = virt_to_phys(_text); - kernel_code.end = virt_to_phys(_etext - 1); + kernel_code.end = virt_to_phys(__init_begin - 1); kernel_data.start = virt_to_phys(_sdata); kernel_data.end = virt_to_phys(_end - 1); diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index ac925e54e7eb..3458db0b0eed 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -120,12 +120,13 @@ SECTIONS } . 
= ALIGN(SEGMENT_ALIGN); - RO_DATA(PAGE_SIZE) /* everything from this point to */ - EXCEPTION_TABLE(8) /* _etext will be marked RO NX */ + _etext = .; /* End of text section */ + + RO_DATA(PAGE_SIZE) /* everything from this point to */ + EXCEPTION_TABLE(8) /* __init_begin will be marked RO NX */ NOTES . = ALIGN(SEGMENT_ALIGN); - _etext = .; /* End of text and rodata section */ __init_begin = .; INIT_TEXT_SECTION(8) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index f001d40eaaa7..fe4fe832ee47 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -387,8 +387,8 @@ void __init mem_init(void) MLM(MODULES_VADDR, MODULES_END), MLG(VMALLOC_START, VMALLOC_END), MLK_ROUNDUP(__init_begin, __init_end), - MLK_ROUNDUP(_text, __start_rodata), - MLK_ROUNDUP(__start_rodata, _etext), + MLK_ROUNDUP(_text, _etext), + MLK_ROUNDUP(__start_rodata, __init_begin), MLK_ROUNDUP(_sdata, _edata), #ifdef CONFIG_SPARSEMEM_VMEMMAP MLG(VMEMMAP_START, diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index cd4177a1781d..d89ad9a992dc 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -386,14 +386,14 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) { unsigned long kernel_start = __pa(_text); - unsigned long kernel_end = __pa(_etext); + unsigned long kernel_end = __pa(__init_begin); /* * Take care not to create a writable alias for the * read-only text and rodata sections of the kernel image. */ - /* No overlap with the kernel text */ + /* No overlap with the kernel text/rodata */ if (end < kernel_start || start >= kernel_end) { __create_pgd_mapping(pgd, start, __phys_to_virt(start), end - start, PAGE_KERNEL, @@ -402,7 +402,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end } /* - * This block overlaps the kernel text mapping. + * This block overlaps the kernel text/rodata mappings. * Map the portion(s) which don't overlap. */ if (start < kernel_start) @@ -417,7 +417,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end early_pgtable_alloc); /* - * Map the linear alias of the [_text, _etext) interval as + * Map the linear alias of the [_text, __init_begin) interval as * read-only/non-executable. This makes the contents of the * region accessible to subsystems such as hibernate, but * protects it from inadvertent modification or execution. @@ -447,14 +447,14 @@ void mark_rodata_ro(void) { unsigned long section_size; - section_size = (unsigned long)__start_rodata - (unsigned long)_text; + section_size = (unsigned long)_etext - (unsigned long)_text; create_mapping_late(__pa(_text), (unsigned long)_text, section_size, PAGE_KERNEL_ROX); /* - * mark .rodata as read only. Use _etext rather than __end_rodata to - * cover NOTES and EXCEPTION_TABLE. + * mark .rodata as read only. Use __init_begin rather than __end_rodata + * to cover NOTES and EXCEPTION_TABLE. 
*/ - section_size = (unsigned long)_etext - (unsigned long)__start_rodata; + section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata; create_mapping_late(__pa(__start_rodata), (unsigned long)__start_rodata, section_size, PAGE_KERNEL_RO); } @@ -497,8 +497,8 @@ static void __init map_kernel(pgd_t *pgd) { static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_init, vmlinux_data; - map_kernel_segment(pgd, _text, __start_rodata, PAGE_KERNEL_EXEC, &vmlinux_text); - map_kernel_segment(pgd, __start_rodata, _etext, PAGE_KERNEL, &vmlinux_rodata); + map_kernel_segment(pgd, _text, _etext, PAGE_KERNEL_EXEC, &vmlinux_text); + map_kernel_segment(pgd, __start_rodata, __init_begin, PAGE_KERNEL, &vmlinux_rodata); map_kernel_segment(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC, &vmlinux_init); map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data); From 1cbefb3fb1c189e02e8329316fce1fbb2529badb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 8 Aug 2016 13:02:01 -0700 Subject: [PATCH 438/813] UPSTREAM: unsafe_[get|put]_user: change interface to use a error target label When I initially added the unsafe_[get|put]_user() helpers in commit 5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses"), I made the mistake of modeling the interface on our traditional __[get|put]_user() functions, which return zero on success, or -EFAULT on failure. That interface is fairly easy to use, but it's actually fairly nasty for good code generation, since it essentially forces the caller to check the error value for each access. In particular, since the error handling is already internally implemented with an exception handler, and we already use "asm goto" for various other things, we could fairly easily make the error cases just jump directly to an error label instead, and avoid the need for explicit checking after each operation. So switch the interface to pass in an error label, rather than checking the error value in the caller. Best do it now before we start growing more users (the signal handling code in particular would be a good place to use the new interface). So rather than if (unsafe_get_user(x, ptr)) ... handle error .. the interface is now unsafe_get_user(x, ptr, label); where an error during the user mode fetch will now just cause a jump to 'label' in the caller. Right now the actual _implementation_ of this all still ends up being a "if (err) goto label", and does not take advantage of any exception label tricks, but for "unsafe_put_user()" in particular it should be fairly straightforward to convert to using the exception table model. Note that "unsafe_get_user()" is much harder to convert to a clever exception table model, because current versions of gcc do not allow the use of "asm goto" (for the exception) with output values (for the actual value to be fetched). But that is hopefully not a limitation in the long term. 
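As an illustration of the caller-side change (a simplified sketch, not code from this patch): a fetch loop bracketed by user_access_begin()/user_access_end() no longer tests each access and instead supplies one error label:

	static long sum_user_words(const unsigned long __user *p, int n,
				   unsigned long *out)
	{
		unsigned long v, total = 0;
		int i;

		user_access_begin();
		for (i = 0; i < n; i++) {
			/* on fault, control jumps straight to efault */
			unsafe_get_user(v, p + i, efault);
			total += v;
		}
		user_access_end();
		*out = total;
		return 0;

	efault:
		user_access_end();
		return -EFAULT;
	}

The error path is emitted once, out of line, instead of a conditional branch after every access.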
[ Also note that it might be a good idea to switch unsafe_get_user() to actually _return_ the value it fetches from user space, but this commit only changes the error handling semantics ] Signed-off-by: Linus Torvalds Change-Id: Ib905a84a04d46984320f6fd1056da4d72f3d6b53 (cherry picked from commit 1bd4403d86a1c06cb6cc9ac87664a0c9d3413d51) Signed-off-by: Sami Tolvanen --- arch/x86/include/asm/uaccess.h | 16 ++++++++-------- include/linux/uaccess.h | 4 ++-- lib/strncpy_from_user.c | 8 ++++---- lib/strnlen_user.c | 7 +++---- 4 files changed, 17 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index dd73cf90fb18..be439e246d91 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -773,21 +773,21 @@ copy_to_user(void __user *to, const void *from, unsigned long n) #define user_access_begin() __uaccess_begin() #define user_access_end() __uaccess_end() -#define unsafe_put_user(x, ptr) \ -({ \ +#define unsafe_put_user(x, ptr, err_label) \ +do { \ int __pu_err; \ __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \ - __builtin_expect(__pu_err, 0); \ -}) + if (unlikely(__pu_err)) goto err_label; \ +} while (0) -#define unsafe_get_user(x, ptr) \ -({ \ +#define unsafe_get_user(x, ptr, err_label) \ +do { \ int __gu_err; \ unsigned long __gu_val; \ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - __builtin_expect(__gu_err, 0); \ -}) + if (unlikely(__gu_err)) goto err_label; \ +} while (0) #endif /* _ASM_X86_UACCESS_H */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 349557825428..f30c187ed785 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -114,8 +114,8 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); #ifndef user_access_begin #define user_access_begin() do { } while (0) #define user_access_end() do { } while (0) -#define unsafe_get_user(x, ptr) __get_user(x, ptr) -#define unsafe_put_user(x, ptr) __put_user(x, ptr) +#define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0) +#define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0) #endif #endif /* __LINUX_UACCESS_H__ */ diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 33840324138c..5a003a2ebd96 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -39,8 +39,8 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long unsigned long c, data; /* Fall back to byte-at-a-time if we get a page fault */ - if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) - break; + unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time); + *(unsigned long *)(dst+res) = c; if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); @@ -55,8 +55,7 @@ byte_at_a_time: while (max) { char c; - if (unlikely(unsafe_get_user(c,src+res))) - return -EFAULT; + unsafe_get_user(c,src+res, efault); dst[res] = c; if (!c) return res; @@ -75,6 +74,7 @@ byte_at_a_time: * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's an EFAULT. 
*/ +efault: return -EFAULT; } diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 2625943625d7..8e105ed4df12 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -45,8 +45,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, src -= align; max += align; - if (unlikely(unsafe_get_user(c,(unsigned long __user *)src))) - return 0; + unsafe_get_user(c, (unsigned long __user *)src, efault); c |= aligned_byte_mask(align); for (;;) { @@ -61,8 +60,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, if (unlikely(max <= sizeof(unsigned long))) break; max -= sizeof(unsigned long); - if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) - return 0; + unsafe_get_user(c, (unsigned long __user *)(src+res), efault); } res -= align; @@ -77,6 +75,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's 0. */ +efault: return 0; } From c3f4d074ed48e93586acb5b83c0f702094fa6276 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 19 Aug 2016 12:15:22 -0700 Subject: [PATCH 439/813] UPSTREAM: usercopy: avoid potentially undefined behavior in pointer math check_bogus_address() checked for pointer overflow using this expression, where 'ptr' has type 'const void *': ptr + n < ptr Since pointer wraparound is undefined behavior, gcc at -O2 by default treats it like the following, which would not behave as intended: (long)n < 0 Fortunately, this doesn't currently happen for kernel code because kernel code is compiled with -fno-strict-overflow. But the expression should be fixed anyway to use well-defined integer arithmetic, since it could be treated differently by different compilers in the future or could be reported by tools checking for undefined behavior. Signed-off-by: Eric Biggers Signed-off-by: Kees Cook Change-Id: I73b13be651cf35c03482f2014bf2c3dd291518ab (cherry picked from commit 7329a655875a2f4bd6984fe8a7e00a6981e802f3) Signed-off-by: Sami Tolvanen --- mm/usercopy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index 8ebae91a6b55..82f81df2edcf 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -124,7 +124,7 @@ static inline const char *check_kernel_text_object(const void *ptr, static inline const char *check_bogus_address(const void *ptr, unsigned long n) { /* Reject if object wraps past end of memory. */ - if (ptr + n < ptr) + if ((unsigned long)ptr + n < (unsigned long)ptr) return ""; /* Reject if NULL or ZERO-allocation. */ From 10bd621d383631bbbe10aeca21f95e764603f294 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 22 Aug 2016 11:53:59 -0500 Subject: [PATCH 440/813] UPSTREAM: usercopy: fix overlap check for kernel text When running with a local patch which moves the '_stext' symbol to the very beginning of the kernel text area, I got the following panic with CONFIG_HARDENED_USERCOPY: usercopy: kernel memory exposure attempt detected from ffff88103dfff000 () (4096 bytes) ------------[ cut here ]------------ kernel BUG at mm/usercopy.c:79! invalid opcode: 0000 [#1] SMP ... CPU: 0 PID: 4800 Comm: cp Not tainted 4.8.0-rc3.after+ #1 Hardware name: Dell Inc. 
PowerEdge R720/0X3D66, BIOS 2.5.4 01/22/2016 task: ffff880817444140 task.stack: ffff880816274000 RIP: 0010:[] __check_object_size+0x76/0x413 RSP: 0018:ffff880816277c40 EFLAGS: 00010246 RAX: 000000000000006b RBX: ffff88103dfff000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff88081f80dfa8 RDI: ffff88081f80dfa8 RBP: ffff880816277c90 R08: 000000000000054c R09: 0000000000000000 R10: 0000000000000005 R11: 0000000000000006 R12: 0000000000001000 R13: ffff88103e000000 R14: ffff88103dffffff R15: 0000000000000001 FS: 00007fb9d1750800(0000) GS:ffff88081f800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000021d2000 CR3: 000000081a08f000 CR4: 00000000001406f0 Stack: ffff880816277cc8 0000000000010000 000000043de07000 0000000000000000 0000000000001000 ffff880816277e60 0000000000001000 ffff880816277e28 000000000000c000 0000000000001000 ffff880816277ce8 ffffffff8136c3a6 Call Trace: [] copy_page_to_iter_iovec+0xa6/0x1c0 [] copy_page_to_iter+0x16/0x90 [] generic_file_read_iter+0x3e3/0x7c0 [] ? xfs_file_buffered_aio_write+0xad/0x260 [xfs] [] ? down_read+0x12/0x40 [] xfs_file_buffered_aio_read+0x51/0xc0 [xfs] [] xfs_file_read_iter+0x62/0xb0 [xfs] [] __vfs_read+0xdf/0x130 [] vfs_read+0x8e/0x140 [] SyS_read+0x55/0xc0 [] do_syscall_64+0x67/0x160 [] entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:[<00007fb9d0c33c00>] 0x7fb9d0c33c00 RSP: 002b:00007ffc9c262f28 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: fffffffffff8ffff RCX: 00007fb9d0c33c00 RDX: 0000000000010000 RSI: 00000000021c3000 RDI: 0000000000000004 RBP: 00000000021c3000 R08: 0000000000000000 R09: 00007ffc9c264d6c R10: 00007ffc9c262c50 R11: 0000000000000246 R12: 0000000000010000 R13: 00007ffc9c2630b0 R14: 0000000000000004 R15: 0000000000010000 Code: 81 48 0f 44 d0 48 c7 c6 90 4d a3 81 48 c7 c0 bb b3 a2 81 48 0f 44 f0 4d 89 e1 48 89 d9 48 c7 c7 68 16 a3 81 31 c0 e8 f4 57 f7 ff <0f> 0b 48 8d 90 00 40 00 00 48 39 d3 0f 83 22 01 00 00 48 39 c3 RIP [] __check_object_size+0x76/0x413 RSP The checked object's range [ffff88103dfff000, ffff88103e000000) is valid, so there shouldn't have been a BUG. The hardened usercopy code got confused because the range's ending address is the same as the kernel's text starting address at 0xffff88103e000000. The overlap check is slightly off. Fixes: f5509cc18daa ("mm: Hardened usercopy") Signed-off-by: Josh Poimboeuf Signed-off-by: Kees Cook Change-Id: I839dbf4ddbb4d9874026a42abed557eb9b3f8bef (cherry picked from commit 94cd97af690dd9537818dc9841d0ec68bb1dd877) Signed-off-by: Sami Tolvanen --- mm/usercopy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index 82f81df2edcf..a3cc3052f830 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -83,7 +83,7 @@ static bool overlaps(const void *ptr, unsigned long n, unsigned long low, unsigned long check_high = check_low + n; /* Does not overlap if entirely above or entirely below. */ - if (check_low >= high || check_high < low) + if (check_low >= high || check_high <= low) return false; return true; From 8e4b2f84a8926e0b49cfbcd14dd8e58b5af84791 Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Thu, 25 Aug 2016 18:31:01 -0700 Subject: [PATCH 441/813] Android: MMC/UFS IO Latency Histograms. This patch adds a new sysfs node (latency_hist) and reports IO (svc time) latency histograms. Disabled by default; it can be enabled by echoing 1 into latency_hist, and the stats can be cleared by writing 2 into latency_hist. This commit fixes the 32 bit build breakage in the previous commit. 
Tested on both 32 bit and 64 bit arm devices. Bug: 30677035 Change-Id: I9a615a16616d80f87e75676ac4d078a5c429dcf9 Signed-off-by: Mohan Srinivasan --- block/blk-core.c | 82 +++++++++++++++++++++++++++++++++++++++ drivers/mmc/core/core.c | 66 ++++++++++++++++++++++++++++++- drivers/mmc/core/host.c | 6 ++- drivers/mmc/core/host.h | 5 +++ drivers/scsi/ufs/ufshcd.c | 81 ++++++++++++++++++++++++++++++++++++++ drivers/scsi/ufs/ufshcd.h | 3 ++ include/linux/blkdev.h | 76 ++++++++++++++++++++++++++++++++++++ include/linux/mmc/core.h | 2 + include/linux/mmc/host.h | 4 ++ 9 files changed, 322 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index f8e64cac981a..b20ada4ad68e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -40,6 +40,8 @@ #include "blk.h" #include "blk-mq.h" +#include + EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); @@ -3539,3 +3541,83 @@ int __init blk_dev_init(void) return 0; } + +/* + * Blk IO latency support. We want this to be as cheap as possible, so doing + * this lockless (and avoiding atomics), a few off by a few errors in this + * code is not harmful, and we don't want to do anything that is + * perf-impactful. + * TODO : If necessary, we can make the histograms per-cpu and aggregate + * them when printing them out. + */ +void +blk_zero_latency_hist(struct io_latency_state *s) +{ + memset(s->latency_y_axis_read, 0, + sizeof(s->latency_y_axis_read)); + memset(s->latency_y_axis_write, 0, + sizeof(s->latency_y_axis_write)); + s->latency_reads_elems = 0; + s->latency_writes_elems = 0; +} + +ssize_t +blk_latency_hist_show(struct io_latency_state *s, char *buf) +{ + int i; + int bytes_written = 0; + u_int64_t num_elem, elem; + int pct; + + num_elem = s->latency_reads_elems; + if (num_elem > 0) { + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "IO svc_time Read Latency Histogram (n = %llu):\n", + num_elem); + for (i = 0; + i < ARRAY_SIZE(latency_x_axis_us); + i++) { + elem = s->latency_y_axis_read[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t< %5lluus%15llu%15d%%\n", + latency_x_axis_us[i], + elem, pct); + } + /* Last element in y-axis table is overflow */ + elem = s->latency_y_axis_read[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t> %5dms%15llu%15d%%\n", 10, + elem, pct); + } + num_elem = s->latency_writes_elems; + if (num_elem > 0) { + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "IO svc_time Write Latency Histogram (n = %llu):\n", + num_elem); + for (i = 0; + i < ARRAY_SIZE(latency_x_axis_us); + i++) { + elem = s->latency_y_axis_write[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t< %5lluus%15llu%15d%%\n", + latency_x_axis_us[i], + elem, pct); + } + /* Last element in y-axis table is overflow */ + elem = s->latency_y_axis_write[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t> %5dms%15llu%15d%%\n", 10, + elem, pct); + } + return bytes_written; +} diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 4df0c68e87e8..1689075e2229 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -183,6 +183,17 @@ void mmc_request_done(struct 
mmc_host *host, struct mmc_request *mrq) pr_debug("%s: %d bytes transferred: %d\n", mmc_hostname(host), mrq->data->bytes_xfered, mrq->data->error); + if (mrq->lat_hist_enabled) { + ktime_t completion; + u_int64_t delta_us; + + completion = ktime_get(); + delta_us = ktime_us_delta(completion, + mrq->io_start); + blk_update_latency_hist(&host->io_lat_s, + (mrq->data->flags & MMC_DATA_READ), + delta_us); + } trace_mmc_blk_rw_end(cmd->opcode, cmd->arg, mrq->data); } @@ -627,6 +638,11 @@ struct mmc_async_req *mmc_start_req(struct mmc_host *host, } if (!err && areq) { + if (host->latency_hist_enabled) { + areq->mrq->io_start = ktime_get(); + areq->mrq->lat_hist_enabled = 1; + } else + areq->mrq->lat_hist_enabled = 0; trace_mmc_blk_rw_start(areq->mrq->cmd->opcode, areq->mrq->cmd->arg, areq->mrq->data); @@ -1964,7 +1980,7 @@ void mmc_init_erase(struct mmc_card *card) } static unsigned int mmc_mmc_erase_timeout(struct mmc_card *card, - unsigned int arg, unsigned int qty) + unsigned int arg, unsigned int qty) { unsigned int erase_timeout; @@ -2907,6 +2923,54 @@ static void __exit mmc_exit(void) destroy_workqueue(workqueue); } +static ssize_t +latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct mmc_host *host = cls_dev_to_mmc_host(dev); + + return blk_latency_hist_show(&host->io_lat_s, buf); +} + +/* + * Values permitted 0, 1, 2. + * 0 -> Disable IO latency histograms (default) + * 1 -> Enable IO latency histograms + * 2 -> Zero out IO latency histograms + */ +static ssize_t +latency_hist_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_host *host = cls_dev_to_mmc_host(dev); + long value; + + if (kstrtol(buf, 0, &value)) + return -EINVAL; + if (value == BLK_IO_LAT_HIST_ZERO) + blk_zero_latency_hist(&host->io_lat_s); + else if (value == BLK_IO_LAT_HIST_ENABLE || + value == BLK_IO_LAT_HIST_DISABLE) + host->latency_hist_enabled = value; + return count; +} + +static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, + latency_hist_show, latency_hist_store); + +void +mmc_latency_hist_sysfs_init(struct mmc_host *host) +{ + if (device_create_file(&host->class_dev, &dev_attr_latency_hist)) + dev_err(&host->class_dev, + "Failed to create latency_hist sysfs entry\n"); +} + +void +mmc_latency_hist_sysfs_exit(struct mmc_host *host) +{ + device_remove_file(&host->class_dev, &dev_attr_latency_hist); +} + subsys_initcall(mmc_init); module_exit(mmc_exit); diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index fcf7829c759e..17068839c74b 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -32,8 +32,6 @@ #include "slot-gpio.h" #include "pwrseq.h" -#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) - static DEFINE_IDR(mmc_host_idr); static DEFINE_SPINLOCK(mmc_host_lock); @@ -394,6 +392,8 @@ int mmc_add_host(struct mmc_host *host) mmc_add_host_debugfs(host); #endif + mmc_latency_hist_sysfs_init(host); + mmc_start_host(host); if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY)) register_pm_notifier(&host->pm_notify); @@ -422,6 +422,8 @@ void mmc_remove_host(struct mmc_host *host) mmc_remove_host_debugfs(host); #endif + mmc_latency_hist_sysfs_exit(host); + device_del(&host->class_dev); led_trigger_unregister_simple(host->led); diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h index 992bf5397633..bf38533406fd 100644 --- a/drivers/mmc/core/host.h +++ b/drivers/mmc/core/host.h @@ -12,6 +12,8 @@ #define _MMC_CORE_HOST_H #include +#define cls_dev_to_mmc_host(d) 
container_of(d, struct mmc_host, class_dev) + int mmc_register_host_class(void); void mmc_unregister_host_class(void); @@ -21,5 +23,8 @@ void mmc_retune_hold(struct mmc_host *host); void mmc_retune_release(struct mmc_host *host); int mmc_retune(struct mmc_host *host); +void mmc_latency_hist_sysfs_init(struct mmc_host *host); +void mmc_latency_hist_sysfs_exit(struct mmc_host *host); + #endif diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 85cd2564c157..4167bdbf0ecf 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -39,6 +39,7 @@ #include #include +#include #include "ufshcd.h" #include "unipro.h" @@ -1332,6 +1333,17 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) clear_bit_unlock(tag, &hba->lrb_in_use); goto out; } + + /* IO svc time latency histogram */ + if (hba != NULL && cmd->request != NULL) { + if (hba->latency_hist_enabled && + (cmd->request->cmd_type == REQ_TYPE_FS)) { + cmd->request->lat_hist_io_start = ktime_get(); + cmd->request->lat_hist_enabled = 1; + } else + cmd->request->lat_hist_enabled = 0; + } + WARN_ON(hba->clk_gating.state != CLKS_ON); lrbp = &hba->lrb[tag]; @@ -3160,6 +3172,7 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) u32 tr_doorbell; int result; int index; + struct request *req; /* Resetting interrupt aggregation counters first and reading the * DOOR_BELL afterward allows us to handle all the completed requests. @@ -3184,6 +3197,22 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) /* Mark completed command as NULL in LRB */ lrbp->cmd = NULL; clear_bit_unlock(index, &hba->lrb_in_use); + req = cmd->request; + if (req) { + /* Update IO svc time latency histogram */ + if (req->lat_hist_enabled) { + ktime_t completion; + u_int64_t delta_us; + + completion = ktime_get(); + delta_us = ktime_us_delta(completion, + req->lat_hist_io_start); + /* rq_data_dir() => true if WRITE */ + blk_update_latency_hist(&hba->io_lat_s, + (rq_data_dir(req) == READ), + delta_us); + } + } /* Do not touch lrbp after scsi done */ cmd->scsi_done(cmd); __ufshcd_release(hba); @@ -5327,6 +5356,54 @@ out: } EXPORT_SYMBOL(ufshcd_shutdown); +/* + * Values permitted 0, 1, 2. 
+ * 0 -> Disable IO latency histograms (default) + * 1 -> Enable IO latency histograms + * 2 -> Zero out IO latency histograms + */ +static ssize_t +latency_hist_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + long value; + + if (kstrtol(buf, 0, &value)) + return -EINVAL; + if (value == BLK_IO_LAT_HIST_ZERO) + blk_zero_latency_hist(&hba->io_lat_s); + else if (value == BLK_IO_LAT_HIST_ENABLE || + value == BLK_IO_LAT_HIST_DISABLE) + hba->latency_hist_enabled = value; + return count; +} + +ssize_t +latency_hist_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + + return blk_latency_hist_show(&hba->io_lat_s, buf); +} + +static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, + latency_hist_show, latency_hist_store); + +static void +ufshcd_init_latency_hist(struct ufs_hba *hba) +{ + if (device_create_file(hba->dev, &dev_attr_latency_hist)) + dev_err(hba->dev, "Failed to create latency_hist sysfs entry\n"); +} + +static void +ufshcd_exit_latency_hist(struct ufs_hba *hba) +{ + device_remove_file(hba->dev, &dev_attr_latency_hist); +} + /** * ufshcd_remove - de-allocate SCSI host and host memory space * data structure memory @@ -5342,6 +5419,7 @@ void ufshcd_remove(struct ufs_hba *hba) scsi_host_put(hba->host); ufshcd_exit_clk_gating(hba); + ufshcd_exit_latency_hist(hba); if (ufshcd_is_clkscaling_enabled(hba)) devfreq_remove_device(hba->devfreq); ufshcd_hba_exit(hba); @@ -5639,6 +5717,8 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) /* Hold auto suspend until async scan completes */ pm_runtime_get_sync(dev); + ufshcd_init_latency_hist(hba); + /* * The device-initialize-sequence hasn't been invoked yet. * Set the device to power-off state @@ -5653,6 +5733,7 @@ out_remove_scsi_host: scsi_remove_host(hba->host); exit_gating: ufshcd_exit_clk_gating(hba); + ufshcd_exit_latency_hist(hba); out_disable: hba->is_irq_enabled = false; scsi_host_put(host); diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index 2570d9477b37..f3780cf7d895 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -532,6 +532,9 @@ struct ufs_hba { struct devfreq *devfreq; struct ufs_clk_scaling clk_scaling; bool is_sys_suspended; + + int latency_hist_enabled; + struct io_latency_state io_lat_s; }; /* Returns true if clocks can be gated. Otherwise false */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 168755791ec8..c98bae90624c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -197,6 +197,9 @@ struct request { /* for bidi */ struct request *next_rq; + + ktime_t lat_hist_io_start; + int lat_hist_enabled; }; static inline unsigned short req_get_ioprio(struct request *req) @@ -1656,6 +1659,79 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); extern long bdev_direct_access(struct block_device *, sector_t, void __pmem **addr, unsigned long *pfn, long size); + +/* + * X-axis for IO latency histogram support. 
+ */ +static const u_int64_t latency_x_axis_us[] = { + 100, + 200, + 300, + 400, + 500, + 600, + 700, + 800, + 900, + 1000, + 1200, + 1400, + 1600, + 1800, + 2000, + 2500, + 3000, + 4000, + 5000, + 6000, + 7000, + 9000, + 10000 +}; + +#define BLK_IO_LAT_HIST_DISABLE 0 +#define BLK_IO_LAT_HIST_ENABLE 1 +#define BLK_IO_LAT_HIST_ZERO 2 + +struct io_latency_state { + u_int64_t latency_y_axis_read[ARRAY_SIZE(latency_x_axis_us) + 1]; + u_int64_t latency_reads_elems; + u_int64_t latency_y_axis_write[ARRAY_SIZE(latency_x_axis_us) + 1]; + u_int64_t latency_writes_elems; +}; + +static inline void +blk_update_latency_hist(struct io_latency_state *s, + int read, + u_int64_t delta_us) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(latency_x_axis_us); i++) { + if (delta_us < (u_int64_t)latency_x_axis_us[i]) { + if (read) + s->latency_y_axis_read[i]++; + else + s->latency_y_axis_write[i]++; + break; + } + } + if (i == ARRAY_SIZE(latency_x_axis_us)) { + /* Overflowed the histogram */ + if (read) + s->latency_y_axis_read[i]++; + else + s->latency_y_axis_write[i]++; + } + if (read) + s->latency_reads_elems++; + else + s->latency_writes_elems++; +} + +void blk_zero_latency_hist(struct io_latency_state *s); +ssize_t blk_latency_hist_show(struct io_latency_state *s, char *buf); + #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index 37967b6da03c..3349f0676acb 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -136,6 +136,8 @@ struct mmc_request { struct completion completion; void (*done)(struct mmc_request *);/* completion function */ struct mmc_host *host; + ktime_t io_start; + int lat_hist_enabled; }; struct mmc_card; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 40025b28c1fb..e4862f7cdede 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -379,6 +380,9 @@ struct mmc_host { } embedded_sdio_data; #endif + int latency_hist_enabled; + struct io_latency_state io_lat_s; + unsigned long private[0] ____cacheline_aligned; }; From 03eb77747db507cf82f7bf2e60c9b613e88ed85d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 19 Aug 2016 12:47:01 -0700 Subject: [PATCH 442/813] UPSTREAM: Make the hardened user-copy code depend on having a hardened allocator The kernel test robot reported a usercopy failure in the new hardened sanity checks, due to a page-crossing copy of the FPU state into the task structure. This happened because the kernel test robot was testing with SLOB, which doesn't actually do the required book-keeping for slab allocations, and as a result the hardening code didn't realize that the task struct allocation was one single allocation - and the sanity checks fail. Since SLOB doesn't even claim to support hardening (and you really shouldn't use it), the straightforward solution is to just make the usercopy hardening code depend on the allocator supporting it. 
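To see how the new dependency is satisfied in practice (a sketch based on the upstream hardened-usercopy series; the exact Kconfig file hosting each symbol may differ): an allocator that implements the slab-object bounds check advertises it by selecting the symbol, e.g.

	config HAVE_HARDENED_USERCOPY_ALLOCATOR
		bool

	config SLUB
		...
		select HAVE_HARDENED_USERCOPY_ALLOCATOR

In that series SLAB gains the same select, while SLOB selects nothing, so under SLOB the HARDENED_USERCOPY option simply becomes unavailable rather than producing false positives.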
Reported-by: kernel test robot
Cc: Kees Cook
Signed-off-by: Linus Torvalds
Change-Id: I37d51f866f873341bf7d5297249899b852e1c6ce
(cherry picked from commit 6040e57658eee6eb1315a26119101ca832d1f854)
Signed-off-by: Sami Tolvanen
---
 security/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/security/Kconfig b/security/Kconfig
index aba6a8d4d1f4..2b42c225de28 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -145,6 +145,7 @@ config HAVE_ARCH_HARDENED_USERCOPY
 config HARDENED_USERCOPY
 	bool "Harden memory copies between kernel and userspace"
 	depends on HAVE_ARCH_HARDENED_USERCOPY
+	depends on HAVE_HARDENED_USERCOPY_ALLOCATOR
 	select BUG
 	help
 	  This option checks for obviously wrong memory regions when

From e08e07aec28acf28b60e572a623ce7339765b2e5 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim
Date: Tue, 15 Mar 2016 14:55:12 -0700
Subject: [PATCH 443/813] UPSTREAM: mm/slub: support left redzone

SLUB already has a redzone debugging feature. But it is only positioned
at the end of the object (aka the right redzone), so it cannot catch
left out-of-bounds (OOB) accesses. Although the current object's right
redzone acts as the left redzone of the next object, the first object
in a slab cannot take advantage of this effect. This patch explicitly
adds a left redzone to each object to detect left OOB more precisely.

Background:

Someone complained to me that a left OOB is not caught even when KASAN,
which does page allocation debugging, is enabled. That page is out of
our control, so it may happen to be allocated when the left OOB occurs
and, in this case, we can't find the OOB. Moreover, the SLUB debugging
feature can be enabled without page allocator debugging and, in this
case, we will miss that OOB as well.

Before trying to implement this, I expected the changes to be too
complex, but they don't look that complex to me now. Almost all of the
changes are applied to debug-specific functions, so I feel okay.

Signed-off-by: Joonsoo Kim
Cc: Christoph Lameter
Cc: Pekka Enberg
Cc: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
Change-Id: Ib893a17ecabd692e6c402e864196bf89cd6781a5
(cherry picked from commit d86bd1bece6fc41d59253002db5441fe960a37f6)
Signed-off-by: Sami Tolvanen
---
 include/linux/slub_def.h |   1 +
 mm/slub.c                | 100 +++++++++++++++++++++++++++------------
 2 files changed, 72 insertions(+), 29 deletions(-)

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 33885118523c..f4e857e920cd 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -81,6 +81,7 @@ struct kmem_cache {
 	int reserved;		/* Reserved bytes at the end of slabs */
 	const char *name;	/* Name (only for display!)
*/ struct list_head list; /* List of slab caches */ + int red_left_pad; /* Left redzone padding size */ #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ #endif diff --git a/mm/slub.c b/mm/slub.c index fbadb3753d4d..41f7cae64a49 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #endif } +static inline void *fixup_red_left(struct kmem_cache *s, void *p) +{ + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) + p += s->red_left_pad; + + return p; +} + static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) { #ifdef CONFIG_SLUB_CPU_PARTIAL @@ -224,24 +232,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -/* Verify that a pointer has an address that is valid within a slab page */ -static inline int check_valid_pointer(struct kmem_cache *s, - struct page *page, const void *object) -{ - void *base; - - if (!object) - return 1; - - base = page_address(page); - if (object < base || object >= base + page->objects * s->size || - (object - base) % s->size) { - return 0; - } - - return 1; -} - static inline void *get_freepointer(struct kmem_cache *s, void *object) { return *(void **)(object + s->offset); @@ -271,12 +261,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) /* Loop over all objects in a slab */ #define for_each_object(__p, __s, __addr, __objects) \ - for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ - __p += (__s)->size) + for (__p = fixup_red_left(__s, __addr); \ + __p < (__addr) + (__objects) * (__s)->size; \ + __p += (__s)->size) #define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ - for (__p = (__addr), __idx = 1; __idx <= __objects;\ - __p += (__s)->size, __idx++) + for (__p = fixup_red_left(__s, __addr), __idx = 1; \ + __idx <= __objects; \ + __p += (__s)->size, __idx++) /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) @@ -456,6 +448,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) set_bit(slab_index(p, s, addr), map); } +static inline int size_from_object(struct kmem_cache *s) +{ + if (s->flags & SLAB_RED_ZONE) + return s->size - s->red_left_pad; + + return s->size; +} + +static inline void *restore_red_left(struct kmem_cache *s, void *p) +{ + if (s->flags & SLAB_RED_ZONE) + p -= s->red_left_pad; + + return p; +} + /* * Debug settings: */ @@ -489,6 +497,26 @@ static inline void metadata_access_disable(void) /* * Object debugging */ + +/* Verify that a pointer has an address that is valid within a slab page */ +static inline int check_valid_pointer(struct kmem_cache *s, + struct page *page, void *object) +{ + void *base; + + if (!object) + return 1; + + base = page_address(page); + object = restore_red_left(s, object); + if (object < base || object >= base + page->objects * s->size || + (object - base) % s->size) { + return 0; + } + + return 1; +} + static void print_section(char *text, u8 *addr, unsigned int length) { metadata_access_enable(); @@ -628,7 +656,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); - if (p > addr + 16) + if (s->flags & SLAB_RED_ZONE) + print_section("Redzone ", p - s->red_left_pad, s->red_left_pad); + else if (p > addr + 16) print_section("Bytes b4 ", p - 16, 
			16);
 	print_section("Object ", p, min_t(unsigned long, s->object_size,
 				PAGE_SIZE));
@@ -645,9 +675,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
 	if (s->flags & SLAB_STORE_USER)
 		off += 2 * sizeof(struct track);
 
-	if (off != s->size)
+	if (off != size_from_object(s))
 		/* Beginning of the filler is the free pointer */
-		print_section("Padding ", p + off, s->size - off);
+		print_section("Padding ", p + off, size_from_object(s) - off);
 
 	dump_stack();
 }
@@ -677,6 +707,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
 {
 	u8 *p = object;
 
+	if (s->flags & SLAB_RED_ZONE)
+		memset(p - s->red_left_pad, val, s->red_left_pad);
+
 	if (s->flags & __OBJECT_POISON) {
 		memset(p, POISON_FREE, s->object_size - 1);
 		p[s->object_size - 1] = POISON_END;
@@ -769,11 +802,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
 		/* We also have user information there */
 		off += 2 * sizeof(struct track);
 
-	if (s->size == off)
+	if (size_from_object(s) == off)
 		return 1;
 
 	return check_bytes_and_report(s, page, p, "Object padding",
-			p + off, POISON_INUSE, s->size - off);
+			p + off, POISON_INUSE, size_from_object(s) - off);
 }
 
 /* Check the pad bytes at the end of a slab page */
@@ -817,6 +850,10 @@ static int check_object(struct kmem_cache *s, struct page *page,
 	u8 *endobject = object + s->object_size;
 
 	if (s->flags & SLAB_RED_ZONE) {
+		if (!check_bytes_and_report(s, page, object, "Redzone",
+			object - s->red_left_pad, val, s->red_left_pad))
+			return 0;
+
 		if (!check_bytes_and_report(s, page, object, "Redzone",
 			endobject, val, s->inuse - s->object_size))
 			return 0;
@@ -1468,7 +1505,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 			set_freepointer(s, p, NULL);
 	}
 
-	page->freelist = start;
+	page->freelist = fixup_red_left(s, start);
 	page->inuse = page->objects;
 	page->frozen = 1;
 
@@ -3283,7 +3320,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
 	 */
 	size += 2 * sizeof(struct track);
 
-	if (flags & SLAB_RED_ZONE)
+	if (flags & SLAB_RED_ZONE) {
 		/*
 		 * Add some empty padding so that we can catch
 		 * overwrites from earlier objects rather than let
@@ -3292,6 +3329,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
 		 * of the object.
 		 */
 		size += sizeof(void *);
+
+		s->red_left_pad = sizeof(void *);
+		s->red_left_pad = ALIGN(s->red_left_pad, s->align);
+		size += s->red_left_pad;
+	}
 #endif
 
 /*

From 9400d22ae17aaebf301d92adc8401ff346d5d5b4 Mon Sep 17 00:00:00 2001
From: Riley Andrews
Date: Tue, 6 Sep 2016 15:16:25 -0700
Subject: [PATCH 444/813] cpuset: Make cpusets restore on hotplug

This deliberately changes the behavior of the per-cpuset cpus file so
that it is not permanently affected by hotplug. When a cpu is offlined,
it is removed from the cpuset/cpus file. When a cpu is onlined, if the
cpuset originally requested that the cpu be part of the cpuset, the cpu
is restored to the cpuset. The cpus files still have to be
hierarchical, but the ranges no longer have to be a subset of the
currently online cpus, only of the physically present cpus.
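The resulting relationship between the two masks can be sketched as
follows (illustrative only; cpuset_recompute_cpus() is a hypothetical
helper, not a function added by this patch):

  /*
   * Sketch: cpus_requested changes only on writes to the cpuset.cpus
   * file, while cpus_allowed is recomputed on every hotplug event.
   */
  static void cpuset_recompute_cpus(struct cpuset *cs)
  {
          /* what tasks may run on right now: requested AND online */
          cpumask_and(cs->cpus_allowed, cs->cpus_requested,
                      cpu_active_mask);
  }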
Change-Id: I22cdf33e7d312117bcefba1aeb0125e1ada289a9 Signed-off-by: Dmitry Shmidt --- kernel/cpuset.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a65d63463420..e2e294d997e0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -98,6 +98,7 @@ struct cpuset { /* user-configured CPUs and Memory Nodes allow to tasks */ cpumask_var_t cpus_allowed; + cpumask_var_t cpus_requested; nodemask_t mems_allowed; /* effective CPUs and Memory Nodes allow to tasks */ @@ -386,7 +387,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) { - return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && + return cpumask_subset(p->cpus_requested, q->cpus_requested) && nodes_subset(p->mems_allowed, q->mems_allowed) && is_cpu_exclusive(p) <= is_cpu_exclusive(q) && is_mem_exclusive(p) <= is_mem_exclusive(q); @@ -486,7 +487,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) cpuset_for_each_child(c, css, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && - cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) + cpumask_intersects(trial->cpus_requested, c->cpus_requested)) goto out; if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && c != cur && @@ -945,17 +946,18 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (!*buf) { cpumask_clear(trialcs->cpus_allowed); } else { - retval = cpulist_parse(buf, trialcs->cpus_allowed); + retval = cpulist_parse(buf, trialcs->cpus_requested); if (retval < 0) return retval; - if (!cpumask_subset(trialcs->cpus_allowed, - top_cpuset.cpus_allowed)) + if (!cpumask_subset(trialcs->cpus_requested, cpu_present_mask)) return -EINVAL; + + cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, cpu_active_mask); } /* Nothing to do if the cpus didn't change */ - if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) + if (cpumask_equal(cs->cpus_requested, trialcs->cpus_requested)) return 0; retval = validate_change(cs, trialcs); @@ -964,6 +966,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); + cpumask_copy(cs->cpus_requested, trialcs->cpus_requested); spin_unlock_irq(&callback_lock); /* use trialcs->cpus_allowed as a temp variable */ @@ -1754,7 +1757,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) switch (type) { case FILE_CPULIST: - seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_requested)); break; case FILE_MEMLIST: seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); @@ -1943,11 +1946,14 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return ERR_PTR(-ENOMEM); if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) goto free_cs; + if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL)) + goto free_allowed; if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) - goto free_cpus; + goto free_requested; set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); + cpumask_clear(cs->cpus_requested); nodes_clear(cs->mems_allowed); cpumask_clear(cs->effective_cpus); nodes_clear(cs->effective_mems); @@ -1956,7 +1962,9 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return &cs->css; -free_cpus: +free_requested: + free_cpumask_var(cs->cpus_requested); +free_allowed: 
free_cpumask_var(cs->cpus_allowed); free_cs: kfree(cs); @@ -2019,6 +2027,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->mems_allowed = parent->mems_allowed; cs->effective_mems = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + cpumask_copy(cs->cpus_requested, parent->cpus_requested); cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: @@ -2053,6 +2062,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->cpus_allowed); + free_cpumask_var(cs->cpus_requested); kfree(cs); } @@ -2120,8 +2130,11 @@ int __init cpuset_init(void) BUG(); if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) BUG(); + if (!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL)) + BUG(); cpumask_setall(top_cpuset.cpus_allowed); + cpumask_setall(top_cpuset.cpus_requested); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); nodes_setall(top_cpuset.effective_mems); @@ -2255,7 +2268,7 @@ retry: goto retry; } - cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus); + cpumask_and(&new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus); nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems); cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); From d24bf3c8b21e84a59093c7b7d4ba330e6646ffea Mon Sep 17 00:00:00 2001 From: Phil Turnbull Date: Tue, 2 Feb 2016 13:36:45 -0500 Subject: [PATCH 445/813] UPSTREAM: netfilter: nfnetlink: correctly validate length of batch messages (cherry picked from commit c58d6c93680f28ac58984af61d0a7ebf4319c241) If nlh->nlmsg_len is zero then an infinite loop is triggered because 'skb_pull(skb, msglen);' pulls zero bytes. The calculation in nlmsg_len() underflows if 'nlh->nlmsg_len < NLMSG_HDRLEN' which bypasses the length validation and will later trigger an out-of-bound read. If the length validation does fail then the malformed batch message is copied back to userspace. However, we cannot do this because the nlh->nlmsg_len can be invalid. This leads to an out-of-bounds read in netlink_ack: [ 41.455421] ================================================================== [ 41.456431] BUG: KASAN: slab-out-of-bounds in memcpy+0x1d/0x40 at addr ffff880119e79340 [ 41.456431] Read of size 4294967280 by task a.out/987 [ 41.456431] ============================================================================= [ 41.456431] BUG kmalloc-512 (Not tainted): kasan: bad access detected [ 41.456431] ----------------------------------------------------------------------------- ... [ 41.456431] Bytes b4 ffff880119e79310: 00 00 00 00 d5 03 00 00 b0 fb fe ff 00 00 00 00 ................ [ 41.456431] Object ffff880119e79320: 20 00 00 00 10 00 05 00 00 00 00 00 00 00 00 00 ............... [ 41.456431] Object ffff880119e79330: 14 00 0a 00 01 03 fc 40 45 56 11 22 33 10 00 05 .......@EV."3... [ 41.456431] Object ffff880119e79340: f0 ff ff ff 88 99 aa bb 00 14 00 0a 00 06 fe fb ................ ^^ start of batch nlmsg with nlmsg_len=4294967280 ... 
[ 41.456431] Memory state around the buggy address:
[ 41.456431]  ffff880119e79400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 41.456431]  ffff880119e79480: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 41.456431] >ffff880119e79500: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc
[ 41.456431]                    ^
[ 41.456431]  ffff880119e79580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 41.456431]  ffff880119e79600: fc fc fc fc fc fc fc fc fc fc fb fb fb fb fb fb
[ 41.456431] ==================================================================

Fix this with better validation of nlh->nlmsg_len and by setting
NFNL_BATCH_FAILURE if any batch message fails length validation.

CAP_NET_ADMIN is required to trigger the bugs.

Fixes: 9ea2aa8b7dba ("netfilter: nfnetlink: validate nfnetlink header from batch")
Signed-off-by: Phil Turnbull
Signed-off-by: Pablo Neira Ayuso
Change-Id: Id3e15c40cb464bf2791af907c235d8a316b2449c
Bug: 30947055
---
 net/netfilter/nfnetlink.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 77afe913d03d..9adedba78eea 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -326,10 +326,12 @@ replay:
 		nlh = nlmsg_hdr(skb);
 		err = 0;
 
-		if (nlmsg_len(nlh) < sizeof(struct nfgenmsg) ||
-		    skb->len < nlh->nlmsg_len) {
-			err = -EINVAL;
-			goto ack;
+		if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+		    skb->len < nlh->nlmsg_len ||
+		    nlmsg_len(nlh) < sizeof(struct nfgenmsg)) {
+			nfnl_err_reset(&err_list);
+			status |= NFNL_BATCH_FAILURE;
+			goto done;
 		}
 
 		/* Only requests are handled by the kernel */

From f52aa97a9bd3ba520bf0e044f3f2fa2f7017ad65 Mon Sep 17 00:00:00 2001
From: Peter Hurley
Date: Fri, 27 Nov 2015 14:30:21 -0500
Subject: [PATCH 446/813] UPSTREAM: tty: Prevent ldisc drivers from re-using
 stale tty fields

(cherry picked from commit dd42bf1197144ede075a9d4793123f7689e164bc)

Line discipline drivers may mistakenly misuse ldisc-related fields
when initializing. For example, a failure to initialize tty->receive_room
in the N_GIGASET_M101 line discipline was recently found and fixed [1].
Now, the N_X25 line discipline has been discovered accessing the previous
line discipline's already-freed private data [2].

Harden the ldisc interface against misuse by initializing the relevant
tty fields before instantiating the new line discipline.

[1] commit fd98e9419d8d622a4de91f76b306af6aa627aa9c
    Author: Tilman Schmidt
    Date:   Tue Jul 14 00:37:13 2015 +0200

    isdn/gigaset: reset tty->receive_room when attaching ser_gigaset

[2] Report from Sasha Levin
    [ 634.336761] ==================================================================
    [ 634.338226] BUG: KASAN: use-after-free in x25_asy_open_tty+0x13d/0x490 at addr ffff8800a743efd0
    [ 634.339558] Read of size 4 by task syzkaller_execu/8981
    [ 634.340359] =============================================================================
    [ 634.341598] BUG kmalloc-512 (Not tainted): kasan: bad access detected
    ...
[ 634.405018] Call Trace: [ 634.405277] dump_stack (lib/dump_stack.c:52) [ 634.405775] print_trailer (mm/slub.c:655) [ 634.406361] object_err (mm/slub.c:662) [ 634.406824] kasan_report_error (mm/kasan/report.c:138 mm/kasan/report.c:236) [ 634.409581] __asan_report_load4_noabort (mm/kasan/report.c:279) [ 634.411355] x25_asy_open_tty (drivers/net/wan/x25_asy.c:559 (discriminator 1)) [ 634.413997] tty_ldisc_open.isra.2 (drivers/tty/tty_ldisc.c:447) [ 634.414549] tty_set_ldisc (drivers/tty/tty_ldisc.c:567) [ 634.415057] tty_ioctl (drivers/tty/tty_io.c:2646 drivers/tty/tty_io.c:2879) [ 634.423524] do_vfs_ioctl (fs/ioctl.c:43 fs/ioctl.c:607) [ 634.427491] SyS_ioctl (fs/ioctl.c:622 fs/ioctl.c:613) [ 634.427945] entry_SYSCALL_64_fastpath (arch/x86/entry/entry_64.S:188) Cc: Tilman Schmidt Cc: Sasha Levin Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman Change-Id: Ibed6feadfb9706d478f93feec3b240aecfc64af3 Bug: 30951112 --- drivers/tty/tty_ldisc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c index 629e3c865072..9bee25cfa0be 100644 --- a/drivers/tty/tty_ldisc.c +++ b/drivers/tty/tty_ldisc.c @@ -417,6 +417,10 @@ EXPORT_SYMBOL_GPL(tty_ldisc_flush); * they are not on hot paths so a little discipline won't do * any harm. * + * The line discipline-related tty_struct fields are reset to + * prevent the ldisc driver from re-using stale information for + * the new ldisc instance. + * * Locking: takes termios_rwsem */ @@ -425,6 +429,9 @@ static void tty_set_termios_ldisc(struct tty_struct *tty, int num) down_write(&tty->termios_rwsem); tty->termios.c_line = num; up_write(&tty->termios_rwsem); + + tty->disc_data = NULL; + tty->receive_room = 0; } /** From 34363937f74d63305d45f699c3bc1b7f4d48fbf4 Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Wed, 7 Sep 2016 17:39:42 -0700 Subject: [PATCH 447/813] Android: Fix build breakages. The IO latency histogram change broke allmodconfig and allnoconfig builds. This fixes those breakages. Change-Id: I9cdae655b40ed155468f3cef25cdb74bb56c4d3e Signed-off-by: Mohan Srinivasan --- block/blk-core.c | 2 ++ include/linux/mmc/host.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index b20ada4ad68e..32e5bd63f6ae 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3560,6 +3560,7 @@ blk_zero_latency_hist(struct io_latency_state *s) s->latency_reads_elems = 0; s->latency_writes_elems = 0; } +EXPORT_SYMBOL(blk_zero_latency_hist); ssize_t blk_latency_hist_show(struct io_latency_state *s, char *buf) @@ -3621,3 +3622,4 @@ blk_latency_hist_show(struct io_latency_state *s, char *buf) } return bytes_written; } +EXPORT_SYMBOL(blk_latency_hist_show); diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index e4862f7cdede..97b2b0b1f99d 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -380,8 +380,10 @@ struct mmc_host { } embedded_sdio_data; #endif +#ifdef CONFIG_BLOCK int latency_hist_enabled; struct io_latency_state io_lat_s; +#endif unsigned long private[0] ____cacheline_aligned; }; From 4774accdcc64b873ca0d6a1ec9c4d18dede368ee Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 19 Jan 2016 12:34:58 +0100 Subject: [PATCH 448/813] UPSTREAM: HID: core: prevent out-of-bound readings (cherry picked from commit 50220dead1650609206efe91f0cc116132d59b3f) Plugging a Logitech DJ receiver with KASAN activated raises a bunch of out-of-bound readings. 
The fields are allocated up to MAX_USAGE, meaning that potentially, we do not have enough fields to fit the incoming values. Add checks and silence KASAN. Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina Change-Id: Iaf25e882a6696884439d7091b5fbb0b350d893d3 Bug: 30951261 --- drivers/hid/hid-core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index ec791e169f8f..936960202cf4 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1251,6 +1251,7 @@ static void hid_input_field(struct hid_device *hid, struct hid_field *field, /* Ignore report if ErrorRollOver */ if (!(field->flags & HID_MAIN_ITEM_VARIABLE) && value[n] >= min && value[n] <= max && + value[n] - min < field->maxusage && field->usage[value[n] - min].hid == HID_UP_KEYBOARD + 1) goto exit; } @@ -1263,11 +1264,13 @@ static void hid_input_field(struct hid_device *hid, struct hid_field *field, } if (field->value[n] >= min && field->value[n] <= max + && field->value[n] - min < field->maxusage && field->usage[field->value[n] - min].hid && search(value, field->value[n], count)) hid_process_event(hid, field, &field->usage[field->value[n] - min], 0, interrupt); if (value[n] >= min && value[n] <= max + && value[n] - min < field->maxusage && field->usage[value[n] - min].hid && search(field->value, value[n], count)) hid_process_event(hid, field, &field->usage[value[n] - min], 1, interrupt); From b53d6e9ab1130a59da405825584d93d901e87864 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 6 Sep 2016 11:56:01 -0700 Subject: [PATCH 449/813] UPSTREAM: x86/uaccess: force copy_*_user() to be inlined As already done with __copy_*_user(), mark copy_*_user() as __always_inline. Without this, the checks for things like __builtin_const_p() won't work consistently in either hardened usercopy nor the recent adjustments for detecting usercopy overflows at compile time. The change in kernel text size is detectable, but very small: text data bss dec hex filename 12118735 5768608 14229504 32116847 1ea106f vmlinux.before 12120207 5768608 14229504 32118319 1ea162f vmlinux.after Signed-off-by: Kees Cook Change-Id: I284c85c2a782145f46655a91d4f83874c90eba61 (cherry picked from commit e6971009a95a74f28c58bbae415c40effad1226c) Signed-off-by: Sami Tolvanen --- arch/x86/include/asm/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index be439e246d91..dbe64f27280e 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -706,7 +706,7 @@ __copy_from_user_overflow(int size, unsigned long count) #endif -static inline unsigned long __must_check +static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { int sz = __compiletime_object_size(to); @@ -742,7 +742,7 @@ copy_from_user(void *to, const void __user *from, unsigned long n) return n; } -static inline unsigned long __must_check +static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { int sz = __compiletime_object_size(from); From ab8010a1aea63d162c912319f1b81f4e0aaebb4a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 31 Aug 2016 16:04:21 -0700 Subject: [PATCH 450/813] BACKPORT: usercopy: fold builtin_const check into inline function Instead of having each caller of check_object_size() need to remember to check for a const size parameter, move the check into check_object_size() itself. 
This actually matches the original implementation in PaX, though this commit cleans up the now-redundant builtin_const() calls in the various architectures. Signed-off-by: Kees Cook Change-Id: I348809399c10ffa051251866063be674d064b9ff (cherry picked from 81409e9e28058811c9ea865345e1753f8f677e44) Signed-off-by: Sami Tolvanen --- include/linux/thread_info.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 0ae29ff9ccfd..eded095fe81e 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -161,7 +161,8 @@ extern void __check_object_size(const void *ptr, unsigned long n, static inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { - __check_object_size(ptr, n, to_user); + if (!__builtin_constant_p(n)) + __check_object_size(ptr, n, to_user); } #else static inline void check_object_size(const void *ptr, unsigned long n, From 0c9f69c443b43eb8b545c605cf65f509a7ab64f7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Sep 2016 09:39:32 -0700 Subject: [PATCH 451/813] UPSTREAM: usercopy: force check_object_size() inline Just for good measure, make sure that check_object_size() is always inlined too, as already done for copy_*_user() and __copy_*_user(). Suggested-by: Linus Torvalds Signed-off-by: Kees Cook Change-Id: Ibfdf4790d03fe426e68d9a864c55a0d1bbfb7d61 (cherry picked from commit a85d6b8242dc78ef3f4542a0f979aebcbe77fc4e) Signed-off-by: Sami Tolvanen --- include/linux/thread_info.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index eded095fe81e..4cf89517783a 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -158,8 +158,8 @@ static inline int arch_within_stack_frames(const void * const stack, extern void __check_object_size(const void *ptr, unsigned long n, bool to_user); -static inline void check_object_size(const void *ptr, unsigned long n, - bool to_user) +static __always_inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) { if (!__builtin_constant_p(n)) __check_object_size(ptr, n, to_user); From f9ae153c0546111f4f11a6a63473802fcad345c2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Sep 2016 09:54:34 -0700 Subject: [PATCH 452/813] UPSTREAM: usercopy: remove page-spanning test for now A custom allocator without __GFP_COMP that copies to userspace has been found in vmw_execbuf_process[1], so this disables the page-span checker by placing it behind a CONFIG for future work where such things can be tracked down later. [1] https://bugzilla.redhat.com/show_bug.cgi?id=1373326 Reported-by: Vinson Lee Fixes: f5509cc18daa ("mm: Hardened usercopy") Signed-off-by: Kees Cook Change-Id: I4177c0fb943f14a5faf5c70f5e54bf782c316f43 (cherry picked from commit 8e1f74ea02cf4562404c48c6882214821552c13f) Signed-off-by: Sami Tolvanen --- mm/usercopy.c | 61 +++++++++++++++++++++++++++--------------------- security/Kconfig | 11 +++++++++ 2 files changed, 46 insertions(+), 26 deletions(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index a3cc3052f830..089328f2b920 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -134,30 +134,15 @@ static inline const char *check_bogus_address(const void *ptr, unsigned long n) return NULL; } -static inline const char *check_heap_object(const void *ptr, unsigned long n, - bool to_user) +/* Checks for allocs that are marked in some way as spanning multiple pages. 
*/ +static inline const char *check_page_span(const void *ptr, unsigned long n, + struct page *page, bool to_user) { - struct page *page, *endpage; +#ifdef CONFIG_HARDENED_USERCOPY_PAGESPAN const void *end = ptr + n - 1; + struct page *endpage; bool is_reserved, is_cma; - /* - * Some architectures (arm64) return true for virt_addr_valid() on - * vmalloced addresses. Work around this by checking for vmalloc - * first. - */ - if (is_vmalloc_addr(ptr)) - return NULL; - - if (!virt_addr_valid(ptr)) - return NULL; - - page = virt_to_head_page(ptr); - - /* Check slab allocator for flags and size. */ - if (PageSlab(page)) - return __check_heap_object(ptr, n, page); - /* * Sometimes the kernel data regions are not marked Reserved (see * check below). And sometimes [_sdata,_edata) does not cover @@ -186,7 +171,7 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n, ((unsigned long)end & (unsigned long)PAGE_MASK))) return NULL; - /* Allow if start and end are inside the same compound page. */ + /* Allow if fully inside the same compound (__GFP_COMP) page. */ endpage = virt_to_head_page(end); if (likely(endpage == page)) return NULL; @@ -199,20 +184,44 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n, is_reserved = PageReserved(page); is_cma = is_migrate_cma_page(page); if (!is_reserved && !is_cma) - goto reject; + return ""; for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) { page = virt_to_head_page(ptr); if (is_reserved && !PageReserved(page)) - goto reject; + return ""; if (is_cma && !is_migrate_cma_page(page)) - goto reject; + return ""; } +#endif return NULL; +} -reject: - return ""; +static inline const char *check_heap_object(const void *ptr, unsigned long n, + bool to_user) +{ + struct page *page; + + /* + * Some architectures (arm64) return true for virt_addr_valid() on + * vmalloced addresses. Work around this by checking for vmalloc + * first. + */ + if (is_vmalloc_addr(ptr)) + return NULL; + + if (!virt_addr_valid(ptr)) + return NULL; + + page = virt_to_head_page(ptr); + + /* Check slab allocator for flags and size. */ + if (PageSlab(page)) + return __check_heap_object(ptr, n, page); + + /* Verify object does not incorrectly span multiple pages. */ + return check_page_span(ptr, n, page, to_user); } /* diff --git a/security/Kconfig b/security/Kconfig index 2b42c225de28..3aa60791f84d 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -156,6 +156,17 @@ config HARDENED_USERCOPY or are part of the kernel text. This kills entire classes of heap overflow exploits and similar kernel memory exposures. +config HARDENED_USERCOPY_PAGESPAN + bool "Refuse to copy allocations that span multiple pages" + depends on HARDENED_USERCOPY + depends on !COMPILE_TEST + help + When a multi-page allocation is done without __GFP_COMP, + hardened usercopy will reject attempts to copy it. There are, + however, several cases of this in the kernel that have not all + been removed. This config is intended to be used only while + trying to find such users. + source security/selinux/Kconfig source security/smack/Kconfig source security/tomoyo/Kconfig From 872ffdd406dc2c2d218a43ce9f2e7f8d0aec4643 Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Wed, 31 Aug 2016 08:09:04 -0700 Subject: [PATCH 453/813] FROMLIST: pstore: drop pmsg bounce buffer (from https://lkml.org/lkml/2016/9/1/428) (cherry pick from android-3.10 commit b58133100b38f2bf83cad2d7097417a3a196ed0b) Removing a bounce buffer copy operation in the pmsg driver path is always better. 
We also gain in overall performance by not requesting a vmalloc on every write as this can cause precious RT tasks, such as user facing media operation, to stall while memory is being reclaimed. Added a write_buf_user to the pstore functions, a backup platform write_buf_user that uses the small buffer that is part of the instance, and implemented a ramoops write_buf_user that only supports PSTORE_TYPE_PMSG. Signed-off-by: Mark Salyzyn Bug: 31057326 Change-Id: I4cdee1cd31467aa3e6c605bce2fbd4de5b0f8caa --- fs/pstore/platform.c | 36 +++++++++++++++++++++++++++++ fs/pstore/pmsg.c | 35 +++++----------------------- fs/pstore/ram.c | 19 +++++++++++++++ fs/pstore/ram_core.c | 47 ++++++++++++++++++++++++++++++++++++-- include/linux/pstore.h | 11 ++++++--- include/linux/pstore_ram.h | 7 ++++-- 6 files changed, 119 insertions(+), 36 deletions(-) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 588461bb2dd4..40a0fe0a4e05 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -431,6 +431,40 @@ static int pstore_write_compat(enum pstore_type_id type, size, psi); } +static int pstore_write_buf_user_compat(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, + const char __user *buf, + bool compressed, size_t size, + struct pstore_info *psi) +{ + unsigned long flags = 0; + size_t i, bufsize = size; + long ret = 0; + + if (unlikely(!access_ok(VERIFY_READ, buf, size))) + return -EFAULT; + if (bufsize > psinfo->bufsize) + bufsize = psinfo->bufsize; + spin_lock_irqsave(&psinfo->buf_lock, flags); + for (i = 0; i < size; ) { + size_t c = min(size - i, bufsize); + + ret = __copy_from_user(psinfo->buf, buf + i, c); + if (unlikely(ret != 0)) { + ret = -EFAULT; + break; + } + ret = psi->write_buf(type, reason, id, part, psinfo->buf, + compressed, c, psi); + if (unlikely(ret < 0)) + break; + i += c; + } + spin_unlock_irqrestore(&psinfo->buf_lock, flags); + return unlikely(ret < 0) ? ret : size; +} + /* * platform specific persistent storage driver registers with * us here. If pstore is already mounted, call the platform @@ -453,6 +487,8 @@ int pstore_register(struct pstore_info *psi) if (!psi->write) psi->write = pstore_write_compat; + if (!psi->write_buf_user) + psi->write_buf_user = pstore_write_buf_user_compat; psinfo = psi; mutex_init(&psinfo->read_mutex); spin_unlock(&pstore_lock); diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c index 7de20cd3797f..78f6176c020f 100644 --- a/fs/pstore/pmsg.c +++ b/fs/pstore/pmsg.c @@ -19,48 +19,25 @@ #include "internal.h" static DEFINE_MUTEX(pmsg_lock); -#define PMSG_MAX_BOUNCE_BUFFER_SIZE (2*PAGE_SIZE) static ssize_t write_pmsg(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - size_t i, buffer_size; - char *buffer; + u64 id; + int ret; if (!count) return 0; + /* check outside lock, page in any data. 
write_buf_user also checks */ if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; - buffer_size = count; - if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE) - buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE; - buffer = vmalloc(buffer_size); - if (!buffer) - return -ENOMEM; - mutex_lock(&pmsg_lock); - for (i = 0; i < count; ) { - size_t c = min(count - i, buffer_size); - u64 id; - long ret; - - ret = __copy_from_user(buffer, buf + i, c); - if (unlikely(ret != 0)) { - mutex_unlock(&pmsg_lock); - vfree(buffer); - return -EFAULT; - } - psinfo->write_buf(PSTORE_TYPE_PMSG, 0, &id, 0, buffer, 0, c, - psinfo); - - i += c; - } - + ret = psinfo->write_buf_user(PSTORE_TYPE_PMSG, 0, &id, 0, buf, 0, count, + psinfo); mutex_unlock(&pmsg_lock); - vfree(buffer); - return count; + return ret ? ret : count; } static const struct file_operations pmsg_fops = { diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 414041342a99..5b10c2b4146c 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -331,6 +331,24 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, return 0; } +static int notrace ramoops_pstore_write_buf_user(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, + const char __user *buf, + bool compressed, size_t size, + struct pstore_info *psi) +{ + if (type == PSTORE_TYPE_PMSG) { + struct ramoops_context *cxt = psi->data; + + if (!cxt->mprz) + return -ENOMEM; + return persistent_ram_write_user(cxt->mprz, buf, size); + } + + return -EINVAL; +} + static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count, struct timespec time, struct pstore_info *psi) { @@ -369,6 +387,7 @@ static struct ramoops_context oops_cxt = { .open = ramoops_pstore_open, .read = ramoops_pstore_read, .write_buf = ramoops_pstore_write_buf, + .write_buf_user = ramoops_pstore_write_buf_user, .erase = ramoops_pstore_erase, }, }; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 76c3f80efdfa..aa9afe573155 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -17,15 +17,16 @@ #include #include #include -#include #include #include +#include #include #include +#include #include #include +#include #include -#include #include struct persistent_ram_buffer { @@ -303,6 +304,16 @@ static void notrace persistent_ram_update(struct persistent_ram_zone *prz, persistent_ram_update_ecc(prz, start, count); } +static int notrace persistent_ram_update_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int start, unsigned int count) +{ + struct persistent_ram_buffer *buffer = prz->buffer; + int ret = unlikely(__copy_from_user(buffer->data + start, s, count)) ? 
+ -EFAULT : 0; + persistent_ram_update_ecc(prz, start, count); + return ret; +} + void persistent_ram_save_old(struct persistent_ram_zone *prz) { struct persistent_ram_buffer *buffer = prz->buffer; @@ -356,6 +367,38 @@ int notrace persistent_ram_write(struct persistent_ram_zone *prz, return count; } +int notrace persistent_ram_write_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int count) +{ + int rem, ret = 0, c = count; + size_t start; + + if (unlikely(!access_ok(VERIFY_READ, s, count))) + return -EFAULT; + if (unlikely(c > prz->buffer_size)) { + s += c - prz->buffer_size; + c = prz->buffer_size; + } + + buffer_size_add(prz, c); + + start = buffer_start_add(prz, c); + + rem = prz->buffer_size - start; + if (unlikely(rem < c)) { + ret = persistent_ram_update_user(prz, s, start, rem); + s += rem; + c -= rem; + start = 0; + } + if (likely(!ret)) + ret = persistent_ram_update_user(prz, s, start, c); + + persistent_ram_update_header_ecc(prz); + + return unlikely(ret) ? ret : count; +} + size_t persistent_ram_old_size(struct persistent_ram_zone *prz) { return prz->old_log_size; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 831479f8df8f..5cae2c6c90ad 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -22,12 +22,13 @@ #ifndef _LINUX_PSTORE_H #define _LINUX_PSTORE_H -#include +#include +#include #include #include -#include #include -#include +#include +#include /* types */ enum pstore_type_id { @@ -67,6 +68,10 @@ struct pstore_info { enum kmsg_dump_reason reason, u64 *id, unsigned int part, const char *buf, bool compressed, size_t size, struct pstore_info *psi); + int (*write_buf_user)(enum pstore_type_id type, + enum kmsg_dump_reason reason, u64 *id, + unsigned int part, const char __user *buf, + bool compressed, size_t size, struct pstore_info *psi); int (*erase)(enum pstore_type_id type, u64 id, int count, struct timespec time, struct pstore_info *psi); diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h index 712757f320a4..45ac5a0d29ee 100644 --- a/include/linux/pstore_ram.h +++ b/include/linux/pstore_ram.h @@ -17,11 +17,12 @@ #ifndef __LINUX_PSTORE_RAM_H__ #define __LINUX_PSTORE_RAM_H__ +#include #include +#include #include #include #include -#include struct persistent_ram_buffer; struct rs_control; @@ -59,7 +60,9 @@ void persistent_ram_free(struct persistent_ram_zone *prz); void persistent_ram_zap(struct persistent_ram_zone *prz); int persistent_ram_write(struct persistent_ram_zone *prz, const void *s, - unsigned int count); + unsigned int count); +int persistent_ram_write_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int count); void persistent_ram_save_old(struct persistent_ram_zone *prz); size_t persistent_ram_old_size(struct persistent_ram_zone *prz); From 8e211455958b9dd3266e0ae53e324e0d4231721d Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Thu, 17 Dec 2015 07:34:27 +0800 Subject: [PATCH 454/813] sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task() If a newly created task is selected to go to a different CPU in fork balance when it wakes up the first time, its load averages should not be removed from the source CPU since they are never added to it before. The same is also applicable to a never used group entity. Fix it in remove_entity_load_avg(): when entity's last_update_time is 0, simply return. This should precisely identify the case in question, because in other migrations, the last_update_time is set to 0 after remove_entity_load_avg(). 
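The first-wakeup path in question looks roughly like this (call chain
sketched from this kernel's fair.c, simplified for illustration):

  /*
   *   wake_up_new_task(p)
   *     select_task_rq()               -> may pick a different CPU
   *     set_task_cpu(p, new_cpu)
   *       migrate_task_rq_fair(p, new_cpu)
   *         remove_entity_load_avg(&p->se)
   *
   * Nothing has attached p's load averages to the source cfs_rq yet,
   * so se->avg.last_update_time is still 0, which is exactly the
   * condition the new early return tests.
   */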
Reported-by: Steve Muckle
Signed-off-by: Yuyang Du
[peterz: cfs_rq_last_update_time]
Signed-off-by: Peter Zijlstra (Intel)
Cc: Dietmar Eggemann
Cc: Juri Lelli
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Morten Rasmussen
Cc: Patrick Bellasi
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Vincent Guittot
Link: http://lkml.kernel.org/r/20151216233427.GJ28098@intel.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 44a6f3e539f2..30c98e9c60f0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2824,6 +2824,27 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
 }
 
+#ifndef CONFIG_64BIT
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+	u64 last_update_time_copy;
+	u64 last_update_time;
+
+	do {
+		last_update_time_copy = cfs_rq->load_last_update_time_copy;
+		smp_rmb();
+		last_update_time = cfs_rq->avg.last_update_time;
+	} while (last_update_time != last_update_time_copy);
+
+	return last_update_time;
+}
+#else
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->avg.last_update_time;
+}
+#endif
+
 /*
  * Task first catches up with cfs_rq, and then subtract
  * itself from the cfs_rq (task must be off the queue now).
@@ -2833,17 +2854,14 @@ void remove_entity_load_avg(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	u64 last_update_time;
 
-#ifndef CONFIG_64BIT
-	u64 last_update_time_copy;
+	/*
+	 * Newly created task or never used group entity should not be removed
+	 * from its (source) cfs_rq
+	 */
+	if (se->avg.last_update_time == 0)
+		return;
 
-	do {
-		last_update_time_copy = cfs_rq->load_last_update_time_copy;
-		smp_rmb();
-		last_update_time = cfs_rq->avg.last_update_time;
-	} while (last_update_time != last_update_time_copy);
-#else
-	last_update_time = cfs_rq->avg.last_update_time;
-#endif
+	last_update_time = cfs_rq_last_update_time(cfs_rq);
 
 	__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
 	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);

From e3c1175d916b271863b47bfa4984864e1eaba11b Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann
Date: Thu, 17 Sep 2015 16:10:56 +0100
Subject: [PATCH 455/813] cpufreq: Frequency invariant scheduler load-tracking
 support

Implements cpufreq_scale_freq_capacity() to provide the scheduler with a
frequency scaling correction factor for more accurate load-tracking.
The factor is:

  (current_freq(cpu) << SCHED_CAPACITY_SHIFT) / max_freq(cpu)

In fact, freq_scale should be a struct cpufreq_policy data member. But
this would require the scheduler hot path (__update_load_avg()) to grab
the cpufreq lock. This can be avoided by using per-cpu data initialized
to SCHED_CAPACITY_SCALE for freq_scale.
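As a worked example with made-up numbers: with policy->max = 2000000 kHz
and a current frequency of 1000000 kHz,

  scale = (1000000 << SCHED_CAPACITY_SHIFT) / 2000000 = 512

so time run at half the maximum frequency contributes half of
SCHED_CAPACITY_SCALE (1024) to the load-tracking sums. The scheduler
side consumes the factor roughly as follows (a sketch of the
__update_load_avg() usage, not part of this patch):

  unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);

  /* cap_scale(): (delta * scale_freq) >> SCHED_CAPACITY_SHIFT */
  scaled_delta = cap_scale(delta, scale_freq);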
Signed-off-by: Dietmar Eggemann --- drivers/cpufreq/cpufreq.c | 29 +++++++++++++++++++++++++++++ include/linux/cpufreq.h | 3 +++ 2 files changed, 32 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index e49512718325..0d494c937920 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -347,6 +347,31 @@ static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) #endif } +/********************************************************************* + * FREQUENCY INVARIANT CPU CAPACITY * + *********************************************************************/ + +static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; + +static void +scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) +{ + unsigned long cur = freqs ? freqs->new : policy->cur; + unsigned long scale = (cur << SCHED_CAPACITY_SHIFT) / policy->max; + int cpu; + + pr_debug("cpus %*pbl cur/cur max freq %lu/%u kHz freq scale %lu\n", + cpumask_pr_args(policy->cpus), cur, policy->max, scale); + + for_each_cpu(cpu, policy->cpus) + per_cpu(freq_scale, cpu) = scale; +} + +unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu) +{ + return per_cpu(freq_scale, cpu); +} + static void __cpufreq_notify_transition(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, unsigned int state) { @@ -450,6 +475,8 @@ wait: spin_unlock(&policy->transition_lock); + scale_freq_capacity(policy, freqs); + cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE); } EXPORT_SYMBOL_GPL(cpufreq_freq_transition_begin); @@ -2126,6 +2153,8 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_NOTIFY, new_policy); + scale_freq_capacity(new_policy, NULL); + policy->min = new_policy->min; policy->max = new_policy->max; trace_cpu_frequency_limits(policy->max, policy->min, policy->cpu); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 3ac01621cd1f..5f1e66e544f5 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -619,4 +619,7 @@ unsigned int cpufreq_generic_get(unsigned int cpu); int cpufreq_generic_init(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table, unsigned int transition_latency); + +struct sched_domain; +unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); #endif /* _LINUX_CPUFREQ_H */ From ec2b699198fc7567c6319b0cffe4c1a0c051f2c0 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 23 Sep 2015 12:47:48 +0100 Subject: [PATCH 456/813] arm: Enable frequency invariant scheduler load-tracking support Defines arch_scale_freq_capacity() to use cpufreq implementation. 
Signed-off-by: Dietmar Eggemann
---
 arch/arm/include/asm/topology.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 370f7a732900..a69917b7d2c9 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -24,6 +24,11 @@ void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
 const struct cpumask *cpu_coregroup_mask(int cpu);
 
+#ifdef CONFIG_CPU_FREQ
+#include <linux/cpufreq.h>
+#define arch_scale_freq_capacity cpufreq_scale_freq_capacity
+#endif
+
 #else
 
 static inline void init_cpu_topology(void) { }

From 777273b78ea8e63ce5bf2f90ffbc8247cba8c15c Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann
Date: Fri, 25 Sep 2015 17:15:11 +0100
Subject: [PATCH 457/813] arm64: Enable frequency invariant scheduler
 load-tracking support

Defines arch_scale_freq_capacity() to use cpufreq implementation.

Including <linux/cpufreq.h> in topology.h like for the arm arch doesn't
work because of CONFIG_COMPAT=y (Kernel support for 32-bit EL0). That's
why cpufreq_scale_freq_capacity() has to be declared extern in
topology.h.

Signed-off-by: Dietmar Eggemann
---
 arch/arm64/include/asm/topology.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index a3e9d6fdbf21..72c47c3fa7b3 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -22,6 +22,12 @@ void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
 const struct cpumask *cpu_coregroup_mask(int cpu);
 
+#ifdef CONFIG_CPU_FREQ
+#define arch_scale_freq_capacity cpufreq_scale_freq_capacity
+struct sched_domain;
+extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu);
+#endif
+
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_ARM_TOPOLOGY_H */

From 49e029d7a7355ee5d44bbb906606c08cb4133ba3 Mon Sep 17 00:00:00 2001
From: Morten Rasmussen
Date: Tue, 14 Apr 2015 16:25:31 +0100
Subject: [PATCH 458/813] arm: Update arch_scale_cpu_capacity() to reflect
 change to define

arch_scale_cpu_capacity() is no longer a weak function but a #define
instead. Include the #define in topology.h.
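The #define convention relies on a scheduler-side fallback along these
lines (a sketch; the default's exact location varies across this patch
series):

  #ifndef arch_scale_cpu_capacity
  static __always_inline
  unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
  {
          return SCHED_CAPACITY_SCALE;    /* 1024, i.e. no scaling */
  }
  #endif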
cc: Russell King
Signed-off-by: Morten Rasmussen
---
 arch/arm/include/asm/topology.h | 2 ++
 arch/arm/kernel/topology.c      | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index a69917b7d2c9..e3e596cbb1a7 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -28,6 +28,8 @@ const struct cpumask *cpu_coregroup_mask(int cpu);
 #include <linux/cpufreq.h>
 #define arch_scale_freq_capacity cpufreq_scale_freq_capacity
 #endif
+#define arch_scale_cpu_capacity scale_cpu_capacity
+extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu);
 
 #else
 
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 08b7847bf912..614554765e44 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -42,7 +42,7 @@
  */
 static DEFINE_PER_CPU(unsigned long, cpu_scale);
 
-unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
 {
 	return per_cpu(cpu_scale, cpu);
 }

From 50df3f37c6642460c4625ccd2fffff17ec534c5b Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann
Date: Thu, 7 May 2015 18:46:15 +0100
Subject: [PATCH 459/813] sched: Store system-wide maximum cpu capacity in
 root domain

To be able to compare the capacity of the target cpu with the highest
cpu capacity of the system in the wakeup path, store the system-wide
maximum cpu capacity in the root domain.

cc: Ingo Molnar
cc: Peter Zijlstra
Signed-off-by: Dietmar Eggemann
---
 kernel/sched/core.c  | 8 ++++++++
 kernel/sched/sched.h | 3 +++
 2 files changed, 11 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6f6c7cc13f33..8bf755697230 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6975,6 +6975,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 	enum s_alloc alloc_state;
 	struct sched_domain *sd;
 	struct s_data d;
+	struct rq *rq = NULL;
 	int i, ret = -ENOMEM;
 
 	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@@ -7025,11 +7026,18 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {
+		rq = cpu_rq(i);
 		sd = *per_cpu_ptr(d.sd, i);
 		cpu_attach_domain(sd, d.rd, i);
+
+		if (rq->cpu_capacity_orig > rq->rd->max_cpu_capacity)
+			rq->rd->max_cpu_capacity = rq->cpu_capacity_orig;
 	}
 	rcu_read_unlock();
 
+	if (rq)
+		pr_info("max cpu_capacity %lu\n", rq->rd->max_cpu_capacity);
+
 	ret = 0;
error:
 	__free_domain_allocs(&d, alloc_state, cpu_map);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0517abd7dd73..412a072da775 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -543,6 +543,9 @@ struct root_domain {
 	 */
 	cpumask_var_t rto_mask;
 	struct cpupri cpupri;
+
+	/* Maximum cpu capacity in the system. */
+	unsigned long max_cpu_capacity;
 };
 
 extern struct root_domain def_root_domain;

From 91b2b633145344b23bd960802a0dbb76a61afef7 Mon Sep 17 00:00:00 2001
From: Morten Rasmussen
Date: Sat, 9 May 2015 19:53:49 +0100
Subject: [PATCH 460/813] sched: Add cpu capacity awareness to wakeup
 balancing

Wakeup balancing is completely unaware of cpu capacity, cpu utilization
and task utilization. The task is preferably placed on a cpu which is
idle in the instant the wakeup happens. New tasks
(SD_BALANCE_{FORK,EXEC}) are placed on an idle cpu in the idlest group
if such can be found, otherwise they go on the least loaded one.
Existing tasks (SD_BALANCE_WAKE) are placed on the previous cpu or an
idle cpu sharing the same last level cache unless the wakee_flips
heuristic in wake_wide() decides to fall back to considering cpus
outside SD_LLC.

Hence existing tasks are not guaranteed to get a chance to migrate to a
different group at wakeup in case the current one has reduced cpu
capacity (due to RT/IRQ pressure or a different uarch, e.g. ARM
big.LITTLE). They may eventually get pulled by other cpus doing
periodic/idle/nohz_idle balance, but it may take quite a while before
it happens.

This patch adds capacity awareness to find_idlest_{group,queue} (used by
SD_BALANCE_{FORK,EXEC} and SD_BALANCE_WAKE under certain circumstances)
such that groups/cpus that can accommodate the waking task based on task
utilization are preferred. In addition, wakeup of existing tasks
(SD_BALANCE_WAKE) is sent through find_idlest_{group,queue} also if the
task doesn't fit the capacity of the previous cpu to allow it to escape
(override wake_affine) when necessary instead of relying on
periodic/idle/nohz_idle balance to eventually sort it out.

cc: Ingo Molnar
cc: Peter Zijlstra
Signed-off-by: Morten Rasmussen
---
 kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 63 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 30c98e9c60f0..3e8c5a16e79e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4763,6 +4763,43 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	return 1;
 }
 
+static inline unsigned long task_util(struct task_struct *p)
+{
+	return p->se.avg.util_avg;
+}
+
+static unsigned int capacity_margin = 1280; /* ~20% margin */
+
+static inline bool __task_fits(struct task_struct *p, int cpu, int util)
+{
+	unsigned long capacity = capacity_of(cpu);
+
+	util += task_util(p);
+
+	return (capacity * 1024) > (util * capacity_margin);
+}
+
+static inline bool task_fits_max(struct task_struct *p, int cpu)
+{
+	unsigned long capacity = capacity_of(cpu);
+	unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity;
+
+	if (capacity == max_capacity)
+		return true;
+
+	if (capacity * capacity_margin > max_capacity * 1024)
+		return true;
+
+	return __task_fits(p, cpu, 0);
+}
+
+static int cpu_util(int cpu);
+
+static inline bool task_fits_spare(struct task_struct *p, int cpu)
+{
+	return __task_fits(p, cpu, cpu_util(cpu));
+}
+
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
@@ -4772,7 +4809,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		  int this_cpu, int sd_flag)
 {
 	struct sched_group *idlest = NULL, *group = sd->groups;
+	struct sched_group *fit_group = NULL;
 	unsigned long min_load = ULONG_MAX, this_load = 0;
+	unsigned long fit_capacity = ULONG_MAX;
 	int load_idx = sd->forkexec_idx;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
@@ -4803,6 +4842,15 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
 			load = target_load(i, load_idx);
 			avg_load += load;
+
+			/*
+			 * Look for the most energy-efficient group
+			 * that can fit the task.
+ */ + if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) { + fit_capacity = capacity_of(i); + fit_group = group; + } } /* Adjust by relative CPU capacity of the group */ @@ -4816,6 +4864,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } } while (group = group->next, group != sd->groups); + if (fit_group) + return fit_group; + if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; @@ -4836,7 +4887,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { - if (idle_cpu(i)) { + if (task_fits_spare(p, i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { @@ -4848,7 +4899,8 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) min_exit_latency = idle->exit_latency; latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; - } else if ((!idle || idle->exit_latency == min_exit_latency) && + } else if (idle_cpu(i) && + (!idle || idle->exit_latency == min_exit_latency) && rq->idle_stamp > latest_idle_timestamp) { /* * If equal or no active idle state, then @@ -4857,6 +4909,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) */ latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; + } else if (shallowest_idle_cpu == -1) { + /* + * If we haven't found an idle CPU yet + * pick a non-idle one that can fit the task as + * fallback. + */ + shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(i); @@ -4971,7 +5030,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = !wake_wide(p) && task_fits_max(p, cpu) && + cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); rcu_read_lock(); for_each_domain(cpu, tmp) { From d72801bf86b42594ba6e89b809718d2d401b1660 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 6 Jul 2015 15:01:10 +0100 Subject: [PATCH 461/813] sched: Consider spare cpu capacity at task wake-up find_idlest_group() selects the wake-up target group purely based on group load which leads to suboptimal choices in low load scenarios. An idle group with reduced capacity (due to RT tasks or different cpu type) isn't necessarily a better target than a lightly loaded group with higher capacity. The patch adds spare capacity as an additional group selection parameter. The target group is now selected based on the following criteria: 1. Return the group with the cpu with most spare capacity and this capacity is significant if such group exists. Significant spare capacity is currently at least 20% to spare. 2. Return the group with the lowest load, unless it is the local group in which case NULL is returned and the search is continued at the next (lower) level. 
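With the default capacity_margin of 1280, the numbers work out as in
this illustrative sketch (values made up; the snippet mirrors the code
added below):

  /*
   * A cpu with capacity_of(i) == 1024 and cpu_util(i) == 600 offers
   *
   *   spare_capacity = 1024 - 600 = 424
   *
   * which exceeds the initial "significant" threshold of
   * capacity_margin - SCHED_LOAD_SCALE = 1280 - 1024 = 256, so its
   * group becomes the spare-capacity candidate.
   */
  spare_capacity = capacity_of(i) - cpu_util(i);
  if (spare_capacity > max_spare_capacity) {
          max_spare_capacity = spare_capacity;
          spare_group = group;
  }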
cc: Ingo Molnar
cc: Peter Zijlstra

Signed-off-by: Morten Rasmussen
---
 kernel/sched/fair.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3e8c5a16e79e..60f2e982c6bf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4809,9 +4809,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		  int this_cpu, int sd_flag)
 {
 	struct sched_group *idlest = NULL, *group = sd->groups;
-	struct sched_group *fit_group = NULL;
+	struct sched_group *fit_group = NULL, *spare_group = NULL;
 	unsigned long min_load = ULONG_MAX, this_load = 0;
 	unsigned long fit_capacity = ULONG_MAX;
+	unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE;
 	int load_idx = sd->forkexec_idx;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
@@ -4819,7 +4820,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		load_idx = sd->wake_idx;
 
 	do {
-		unsigned long load, avg_load;
+		unsigned long load, avg_load, spare_capacity;
 		int local_group;
 		int i;
 
@@ -4851,6 +4852,16 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 				fit_capacity = capacity_of(i);
 				fit_group = group;
 			}
+
+			/*
+			 * Look for group which has most spare capacity on a
+			 * single cpu.
+			 */
+			spare_capacity = capacity_of(i) - cpu_util(i);
+			if (spare_capacity > max_spare_capacity) {
+				max_spare_capacity = spare_capacity;
+				spare_group = group;
+			}
 		}
 
 		/* Adjust by relative CPU capacity of the group */
@@ -4867,6 +4878,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 	if (fit_group)
 		return fit_group;
 
+	if (spare_group)
+		return spare_group;
+
 	if (!idlest || 100*this_load < imbalance*min_load)
 		return NULL;
 	return idlest;

From 11d962803d25d080f08f3f3c448fa9e5727694b7 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann
Date: Mon, 26 Jan 2015 19:47:28 +0000
Subject: [PATCH 462/813] sched: Enable idle balance to pull single task towards cpu with higher capacity

We do not want to miss out on the ability to pull a single remaining task
from a potential source cpu towards an idle destination cpu. Add an extra
criterion to need_active_balance() to kick off active load balance if the
source cpu is over-utilized and has lower capacity than the destination cpu.

cc: Ingo Molnar
cc: Peter Zijlstra

Signed-off-by: Morten Rasmussen
Signed-off-by: Dietmar Eggemann
---
 kernel/sched/fair.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 60f2e982c6bf..56362ac036b2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4800,6 +4800,11 @@ static inline bool task_fits_spare(struct task_struct *p, int cpu)
 	return __task_fits(p, cpu, cpu_util(cpu));
 }
 
+static bool cpu_overutilized(int cpu)
+{
+	return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+}
+
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
@@ -7016,6 +7021,13 @@ static int need_active_balance(struct lb_env *env)
 		return 1;
 	}
 
+	if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
+	    env->src_rq->cfs.h_nr_running == 1 &&
+	    cpu_overutilized(env->src_cpu) &&
+	    !cpu_overutilized(env->dst_cpu)) {
+		return 1;
+	}
+
 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }

From 1e960320c965579bd611a6786d1fdc982394c0ce Mon Sep 17 00:00:00 2001
From: Morten Rasmussen
Date: Thu, 2 Jul 2015 17:16:34 +0100
Subject: [PATCH 463/813] sched: Prevent unnecessary active balance of single task in sched group

Scenarios where the busiest group has just one task and the local group is
idle, on topologies with sched groups of different sizes, manage to dodge all
load-balance bailout conditions, resulting in the nr_balance_failed counter
being incremented. This eventually causes a pointless active migration of the
task. This patch prevents this by not incrementing the counter when the
busiest group only has one task. ASYM_PACKING migrations and migrations due
to reduced capacity should still take place as these are explicitly captured
by need_active_balance().

A better solution would be to not attempt the load-balance in the first
place, but that requires significant changes to the order of bailout
conditions and statistics gathering.

cc: Ingo Molnar
cc: Peter Zijlstra

Signed-off-by: Morten Rasmussen
---
 kernel/sched/fair.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 56362ac036b2..2dc28766cf9a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5650,6 +5650,7 @@ struct lb_env {
 	int			new_dst_cpu;
 	enum cpu_idle_type	idle;
 	long			imbalance;
+	unsigned int		src_grp_nr_running;
 	/* The set of CPUs under consideration for load-balancing */
 	struct cpumask		*cpus;
 
@@ -6612,6 +6613,8 @@ next_group:
 	if (env->sd->flags & SD_NUMA)
 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 
+	env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
+
 	if (!env->sd->parent) {
 		/* update overload indicator if we are at root domain */
 		if (env->dst_rq->rd->overload != overload)
@@ -7240,7 +7243,8 @@ more_balance:
 		 * excessive cache_hot migrations and active balances.
 		 */
 		if (idle != CPU_NEWLY_IDLE)
-			sd->nr_balance_failed++;
+			if (env.src_grp_nr_running > 1)
+				sd->nr_balance_failed++;
 
 		if (need_active_balance(&env)) {
 			raw_spin_lock_irqsave(&busiest->lock, flags);

From 2fcba55ed9652ab9106fb92801583b890d3e49d3 Mon Sep 17 00:00:00 2001
From: Morten Rasmussen
Date: Tue, 13 Jan 2015 13:43:28 +0000
Subject: [PATCH 464/813] sched: Documentation for scheduler energy cost model

This documentation patch provides an overview of the experimental scheduler
energy costing model, associated data structures, and a reference recipe on
how platforms can be characterized to derive energy models.

Signed-off-by: Morten Rasmussen
---
 Documentation/scheduler/sched-energy.txt | 362 +++++++++++++++++++++++
 1 file changed, 362 insertions(+)
 create mode 100644 Documentation/scheduler/sched-energy.txt

diff --git a/Documentation/scheduler/sched-energy.txt b/Documentation/scheduler/sched-energy.txt
new file mode 100644
index 000000000000..dab2f9088b33
--- /dev/null
+++ b/Documentation/scheduler/sched-energy.txt
@@ -0,0 +1,362 @@
+Energy cost model for energy-aware scheduling (EXPERIMENTAL)
+
+Introduction
+=============
+
+The basic energy model uses platform energy data stored in sched_group_energy
+data structures attached to the sched_groups in the sched_domain hierarchy.
+The energy cost model offers two functions that can be used to guide
+scheduling decisions:
+
+1. static unsigned int sched_group_energy(struct energy_env *eenv)
+2. static int energy_diff(struct energy_env *eenv)
+
+sched_group_energy() estimates the energy consumed by all cpus in a specific
+sched_group including any shared resources owned exclusively by this group of
+cpus. Resources shared with other cpus are excluded (e.g. later level caches).
+
+energy_diff() estimates the total energy impact of a utilization change. That
+is, adding, removing, or migrating utilization (tasks).
+
+Both functions use a struct energy_env to specify the scenario to be
+evaluated:
+
+	struct energy_env {
+		struct sched_group	*sg_top;
+		struct sched_group	*sg_cap;
+		int			cap_idx;
+		int			util_delta;
+		int			src_cpu;
+		int			dst_cpu;
+		int			energy;
+	};
+
+sg_top: sched_group to be evaluated. Not used by energy_diff().
+
+sg_cap: sched_group covering the cpus in the same frequency domain. Set by
+sched_group_energy().
+
+cap_idx: Capacity state to be used for energy calculations. Set by
+find_new_capacity().
+
+util_delta: Amount of utilization to be added, removed, or migrated.
+
+src_cpu: Source cpu from where 'util_delta' utilization is removed. Should be
+-1 if no source (e.g. task wake-up).
+
+dst_cpu: Destination cpu where 'util_delta' utilization is added. Should be -1
+if utilization is removed (e.g. terminating tasks).
+
+energy: Result of sched_group_energy().
+
+The metric used to represent utilization is the actual per-entity running
+time averaged over time using a geometric series. It is very similar to the
+existing per-entity load-tracking, but _not_ scaled by task priority and
+capped by the capacity of the cpu. The latter property does mean that
+utilization may underestimate the compute requirements for tasks on fully/over
+utilized cpus. The greatest potential for energy savings without affecting
+performance too much is in scenarios where the system isn't fully utilized. If
+the system is deemed fully utilized, load-balancing should instead be done
+with task load (which includes task priority) in the interest of fairness and
+performance.
+
+
+Background and Terminology
+===========================
+
+To make it clear from the start:
+
+energy = [joule] (resource like a battery on powered devices)
+power = energy/time = [joule/second] = [watt]
+
+The goal of energy-aware scheduling is to minimize energy, while still getting
+the job done. That is, we want to maximize:
+
+	performance [inst/s]
+	--------------------
+	    power [W]
+
+which is equivalent to minimizing:
+
+	  energy [J]
+	-----------
+	instruction
+
+while still getting 'good' performance. It is essentially an alternative
+optimization objective to the current performance-only objective for the
+scheduler. This alternative considers two objectives: energy-efficiency and
+performance. Hence, there needs to be a user controllable knob to switch the
+objective. Since it is early days, this is currently a sched_feature
+(ENERGY_AWARE).
+
+The idea behind introducing an energy cost model is to allow the scheduler to
+evaluate the implications of its decisions rather than blindly applying
+energy-saving techniques that may only have positive effects on some
+platforms. At the same time, the energy cost model must be as simple as
+possible to minimize the scheduler latency impact.
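As a toy illustration of this objective (all numbers invented for the
example), energy per unit of work can favour a slower, lower-power capacity
state over a faster one:

	#include <stdio.h>

	int main(void)
	{
		/* Two made-up capacity states: {capacity, busy power} */
		const unsigned long cap[2]   = { 512, 1024 };
		const unsigned long power[2] = { 200, 600 };
		const unsigned long work = 10240;	/* arbitrary units */

		for (int i = 0; i < 2; i++) {
			double time = (double)work / cap[i];
			double energy = power[i] * time;
			printf("state %d: time %5.1f  energy %6.0f\n",
			       i, time, energy);
		}
		/* State 0 finishes later (20.0 vs 10.0) but spends 4000
		 * vs 6000 energy units: lower performance, but better
		 * energy per instruction. */
		return 0;
	}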
+
+Platform topology
+------------------
+
+The system topology (cpus, caches, and NUMA information, not peripherals) is
+represented in the scheduler by the sched_domain hierarchy which has
+sched_groups attached at each level that cover one or more cpus (see
+sched-domains.txt for more details). To add energy awareness to the scheduler
+we need to consider power and frequency domains.
+
+Power domain:
+
+A power domain is a part of the system that can be powered on/off
+independently. Power domains are typically organized in a hierarchy where you
+may be able to power down just a cpu or a group of cpus along with any
+associated resources (e.g. shared caches). Powering up a cpu means that all
+power domains it is a part of in the hierarchy must be powered up. Hence, it
+is more expensive to power up the first cpu that belongs to a higher level
+power domain than powering up additional cpus in the same high level domain.
+Two level power domain hierarchy example:
+
+		Power source
+		         +-------------------------------+----...
+per group PD		 G                               G
+		         |           +----------+        |
+		    +--------+-------| Shared   |  (other groups)
+per-cpu PD	    G        G       | resource |
+		    |        |       +----------+
+		+-------+ +-------+
+		| CPU 0 | | CPU 1 |
+		+-------+ +-------+
+
+Frequency domain:
+
+Frequency domains (P-states) typically cover the same group of cpus as one of
+the power domain levels. That is, there might be several smaller power domains
+sharing the same frequency (P-state) or there might be a power domain spanning
+multiple frequency domains.
+
+From a scheduling point of view there is no need to know the actual
+frequencies [Hz]. All the scheduler cares about is the compute capacity
+available at the current state (P-state) the cpu is in and any other available
+states. For that reason, and to also factor in any cpu micro-architecture
+differences, compute capacity scaling states are called 'capacity states' in
+this document. For SMP systems this is equivalent to P-states. For mixed
+micro-architecture systems (like ARM big.LITTLE) it is P-states scaled
+according to the micro-architecture performance relative to the other cpus in
+the system.
+
+Energy modelling:
+------------------
+
+Due to the hierarchical nature of the power domains, the most obvious way to
+model energy costs is to associate power and energy costs with domains (groups
+of cpus). Energy costs of shared resources are associated with the group of
+cpus that share the resources; only the cost of powering the cpu itself and
+any private resources (e.g. private L1 caches) is associated with the per-cpu
+groups (lowest level).
+
+For example, for an SMP system with per-cpu power domains and a cluster level
+(group of cpus) power domain we get the overall energy costs to be:
+
+	energy = energy_cluster + n * energy_cpu
+
+where 'n' is the number of cpus powered up and energy_cluster is the cost paid
+as soon as any cpu in the cluster is powered up.
+
+The power and frequency domains can naturally be mapped onto the existing
+sched_domain hierarchy and sched_groups by adding the necessary data to the
+existing data structures.
+
+The energy model considers energy consumption from two contributors (shown in
+the illustration below):
+
+1. Busy energy: Energy consumed while a cpu and the higher level groups that
+it belongs to are busy running tasks. Busy energy is associated with the state
+of the cpu, not an event. The time the cpu spends in this state varies. Thus,
+the most obvious platform parameter for this contribution is busy power
+(energy/time).
+
+2. Idle energy: Energy consumed while a cpu and higher level groups that it
+belongs to are idle (in a C-state). Like busy energy, idle energy is
+associated with the state of the cpu. Thus, the platform parameter for this
+contribution is idle power (energy/time).
+
+Energy consumed during transitions from an idle-state (C-state) to a busy
+state (P-state) or going the other way is ignored by the model to simplify the
+energy model calculations.
+
+
+	Power
+	^
+	|            busy->idle             idle->busy
+	|            transition             transition
+	|
+	|              _                        __
+	|             / \                      /  \__________________
+	|____________/   \                    /
+	|                 \                  /
+	|      Busy        \      Idle      /        Busy
+	|   low P-state     \______________/      high P-state
+	|
+	+------------------------------------------------------------> time
+
+Busy	|--------------|                    |-----------------|
+
+Wakeup			|------|     |------|
+
+Idle			       |-----|
+
+
+The basic algorithm
+====================
+
+The basic idea is to determine the total energy impact when utilization is
+added or removed by estimating the impact at each level in the sched_domain
+hierarchy starting from the bottom (sched_group contains just a single cpu).
+The energy cost comes from busy time (sched_group is awake because one or more
+cpus are busy) and idle time (in an idle-state). Energy model numbers account
+for energy costs associated with all cpus in the sched_group as a group.
+
+	for_each_domain(cpu, sd) {
+		sg = sched_group_of(cpu)
+		energy_before = curr_util(sg) * busy_power(sg)
+				+ (1-curr_util(sg)) * idle_power(sg)
+		energy_after = new_util(sg) * busy_power(sg)
+				+ (1-new_util(sg)) * idle_power(sg)
+		energy_diff += energy_before - energy_after
+
+	}
+
+	return energy_diff
+
+{curr, new}_util: The cpu utilization at the lowest level and the overall
+non-idle time for the entire group for higher levels. Utilization is in the
+range 0.0 to 1.0 in the pseudo-code.
+
+busy_power: The power consumption of the sched_group.
+
+idle_power: The power consumption of the sched_group when idle.
+
+Note: It is a fundamental assumption that the utilization is (roughly) scale
+invariant. Task utilization tracking factors in any frequency scaling and
+performance scaling differences due to different cpu microarchitectures such
+that task utilization can be used across the entire system.
+
+
+Platform energy data
+=====================
+
+struct sched_group_energy can be attached to sched_groups in the sched_domain
+hierarchy and has the following members:
+
+cap_states:
+	List of struct capacity_state representing the supported capacity
+	states (P-states). struct capacity_state has two members: cap and
+	power, which represents the compute capacity and the busy_power of
+	the state. The list must be ordered by capacity low->high.
+
+nr_cap_states:
+	Number of capacity states in cap_states list.
+
+idle_states:
+	List of struct idle_state containing idle_state power cost for each
+	idle-state supported by the system, ordered by shallowest state first.
+	All states must be included at all levels in the hierarchy, i.e. a
+	sched_group spanning just a single cpu must also include coupled
+	idle-states (cluster states). In addition to the cpuidle idle-states,
+	the list must also contain an entry representing idling using the
+	arch default idle (arch_idle_cpu()). Although this state may not be a
+	true hardware idle-state, it is considered the shallowest idle-state
+	in the energy model and must be the first entry. cpus may enter this
+	state (possibly 'active idling') if cpuidle decides not to enter a
+	cpuidle idle-state.
	Default idle may not be used when cpuidle is enabled. In this case,
	it should just be a copy of the first cpuidle idle-state.
+
+nr_idle_states:
+	Number of idle states in idle_states list.
+
+There are no unit requirements for the energy cost data. Data can be
+normalized with any reference, however, the normalization must be consistent
+across all energy cost data. That is, one bogo-joule/watt must be the same
+quantity for all data, but we don't care what it is.
+
+A recipe for platform characterization
+=======================================
+
+Obtaining the actual model data for a particular platform requires some way
+of measuring power/energy. There isn't a tool to help with this (yet). This
+section provides a recipe for use as reference. It covers the steps used to
+characterize the ARM TC2 development platform. This sort of measurement is
+expected to be done anyway when tuning cpuidle and cpufreq for a given
+platform.
+
+The energy model needs two types of data (struct sched_group_energy holds
+these) for each sched_group where energy costs should be taken into account:
+
+1. Capacity state information
+
+A list containing the compute capacity and power consumption when fully
+utilized attributed to the group as a whole for each available capacity
+state. At the lowest level (group contains just a single cpu) this is the
+power of the cpu alone without including power consumed by resources shared
+with other cpus. It basically needs to fit the basic modelling approach
+described in the "Background and Terminology" section:
+
+	energy_system = energy_shared + n * energy_cpu
+
+for a system containing 'n' busy cpus. Only 'energy_cpu' should be included
+at the lowest level. 'energy_shared' is included at the next level which
+represents the group of cpus among which the resources are shared.
+
+This model is, of course, a simplification of reality. Thus, power/energy
+attributions might not always exactly represent how the hardware is designed.
+Also, busy power is likely to depend on the workload. It is therefore
+recommended to use a representative mix of workloads when characterizing the
+capacity states.
+
+If the group has no capacity scaling support, the list will contain a single
+state where power is the busy power attributed to the group. The capacity
+should be set to a default value (1024).
+
+When frequency domains include multiple power domains, the group representing
+the frequency domain and all child groups share capacity states. This must be
+indicated by setting the SD_SHARE_CAP_STATES sched_domain flag. All groups at
+all levels that share the capacity state must have the list of capacity
+states with the power set to the contribution of the individual group.
+
+2. Idle power information
+
+Stored in the idle_states list. The power number is the group idle power
+consumption in each idle state, as well as when the group is idle but has not
+entered an idle-state ('active idle' as mentioned earlier). Due to the way
+the energy model is defined, the idle power of the deepest group idle state
+can alternatively be accounted for in the parent group busy power. In that
+case the group idle state power values are offset such that the idle power of
+the deepest state is zero. It is less intuitive, but it is easier to measure,
+as idle power consumed by the group and the busy/idle power of the parent
+group cannot be distinguished without per group measurement points.
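As a sketch of the 'energy_system = energy_shared + n * energy_cpu'
decomposition this relies on, the shared contribution can be separated from
the per-cpu one with two measurements (the numbers below are invented, not
TC2 data):

	#include <stdio.h>

	int main(void)
	{
		/* Invented measurements at one fixed capacity state */
		double p_one_busy = 850.0;	/* one cpu of the cluster busy */
		double p_two_busy = 1350.0;	/* two cpus busy */

		/* The second busy cpu only adds per-cpu power ... */
		double p_cpu = p_two_busy - p_one_busy;
		/* ... so the remainder of the first measurement is the
		 * shared (cluster) contribution. */
		double p_cluster = p_one_busy - p_cpu;

		printf("per-cpu busy power: %.0f\n", p_cpu);	/* 500 */
		printf("cluster busy power: %.0f\n", p_cluster);/* 350 */
		return 0;
	}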
+
+Measuring capacity states and idle power:
+
+The capacity states' capacity and power can be estimated by running a
+benchmark workload at each available capacity state. By restricting the
+benchmark to run on subsets of cpus it is possible to extrapolate the power
+consumption of shared resources.
+
+ARM TC2 has two clusters of two and three cpus respectively. Each cluster has
+a shared L2 cache. TC2 has on-chip energy counters per cluster. Running a
+benchmark workload on just one cpu in a cluster means that power is consumed
+in the cluster (higher level group) and a single cpu (lowest level group).
+Adding another benchmark task to another cpu increases the power consumption
+by the amount consumed by the additional cpu. Hence, it is possible to
+extrapolate the cluster busy power.
+
+For platforms that don't have energy counters or equivalent instrumentation
+built-in, it may be possible to use an external DAQ to acquire similar data.
+
+If the benchmark includes some performance score (for example sysbench cpu
+benchmark), this can be used to record the compute capacity.
+
+Measuring idle power requires insight into the idle state implementation on
+the particular platform, specifically whether the platform has coupled
+idle-states (or package states). To measure non-coupled per-cpu idle-states
+it is necessary to keep one cpu busy to keep any shared resources alive, in
+order to isolate the idle power of the cpu from the idle/busy power of the
+shared resources. The cpu can be tricked into different per-cpu idle states
+by disabling the other states. Based on various combinations of measurements
+with specific cpus busy and disabling idle-states it is possible to
+extrapolate the idle-state power.

From b043eb7320a9304f093224fdbb64419ed052b512 Mon Sep 17 00:00:00 2001
From: Morten Rasmussen
Date: Tue, 13 Jan 2015 13:45:51 +0000
Subject: [PATCH 465/813] sched: Make energy awareness a sched feature

This patch introduces the ENERGY_AWARE sched feature, which is implemented
using jump labels when SCHED_DEBUG is defined. It is statically set false
when SCHED_DEBUG is not defined. Hence this doesn't allow energy awareness
to be enabled without SCHED_DEBUG. This sched_feature knob will be replaced
later with a more appropriate control knob when things have matured a bit.

ENERGY_AWARE is based on per-entity load-tracking, hence FAIR_GROUP_SCHED
must be enabled. This dependency isn't checked at compile time yet.

cc: Ingo Molnar
cc: Peter Zijlstra

Signed-off-by: Morten Rasmussen
---
 kernel/sched/fair.c     | 5 +++++
 kernel/sched/features.h | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2dc28766cf9a..3fa6b311f93a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4672,6 +4672,11 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 
 #endif
 
+static inline bool energy_aware(void)
+{
+	return sched_feat(ENERGY_AWARE);
+}
+
 /*
  * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
  * A waker of many should wake a different task than the one last awakened

diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 69631fa46c2f..b634151ce286 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -69,3 +69,8 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+/*
+ * Energy aware scheduling. Use platform energy model to guide scheduling
+ * decisions optimizing for energy efficiency.
+ */
+SCHED_FEAT(ENERGY_AWARE, false)

From 029e7b086ea8ebe5e52a46fd74fb9cef7f17135f Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann
Date: Fri, 14 Nov 2014 16:08:45 +0000
Subject: [PATCH 466/813] sched: Introduce energy data structures

The struct sched_group_energy represents the per sched_group related data
which is needed for energy aware scheduling. It contains:

  (1) number of elements of the idle state array
  (2) pointer to the idle state array which comprises 'power consumption'
      for each idle state
  (3) number of elements of the capacity state array
  (4) pointer to the capacity state array which comprises 'compute capacity
      and power consumption' tuples for each capacity state

The struct sched_group obtains a pointer to a struct sched_group_energy.

The function pointer sched_domain_energy_f is introduced into struct
sched_domain_topology_level which will allow the arch to pass a particular
struct sched_group_energy from the topology shim layer into the scheduler
core.

The function pointer sched_domain_energy_f has an 'int cpu' parameter since
the folding of two adjacent sd levels via sd degenerate doesn't work for all
sd levels. For example, it is not possible to use this feature to provide
per-cpu energy in sd level DIE on ARM's TC2 platform.

It was discussed that the folding of sd levels approach is preferable over
the cpu parameter approach, simply because the user (the arch specifying the
sd topology table) can introduce fewer errors. But since it is not working,
the 'int cpu' parameter is the only way out. It's possible to use the folding
of sd levels approach for sched_domain_flags_f and the cpu parameter approach
for the sched_domain_energy_f at the same time though. With the use of the
'int cpu' parameter, an extra check function has to be provided to make sure
that all cpus spanned by a sched group are provisioned with the same energy
data.
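For illustration, the energy tables carried by these structures could look as
follows for a single hypothetical LITTLE core; the struct layout matches
items (1)-(4) above, while all capacity and power values are invented:

	#include <stdio.h>

	struct capacity_state {
		unsigned long cap;	/* compute capacity */
		unsigned long power;	/* busy power at this capacity */
	};

	struct idle_state {
		unsigned long power;	/* power in this idle state */
	};

	struct sched_group_energy {
		unsigned int nr_idle_states;
		struct idle_state *idle_states;
		unsigned int nr_cap_states;
		struct capacity_state *cap_states;
	};

	/* Invented per-cpu (lowest level) energy data, low->high */
	static struct capacity_state cap_states_cpu[] = {
		{ .cap = 150, .power = 187 },
		{ .cap = 300, .power = 275 },
		{ .cap = 430, .power = 334 },	/* highest capacity state */
	};

	/* Shallowest state (arch default idle) first, deepest offset to
	 * zero as described in the documentation patch above */
	static struct idle_state idle_states_cpu[] = {
		{ .power = 6 },
		{ .power = 0 },
	};

	static struct sched_group_energy cpu_energy = {
		.nr_idle_states	= 2,
		.idle_states	= idle_states_cpu,
		.nr_cap_states	= 3,
		.cap_states	= cap_states_cpu,
	};

	/* Lowest capacity state that can serve 'util' */
	static unsigned int pick_cap_state(unsigned long util)
	{
		unsigned int i;

		for (i = 0; i < cpu_energy.nr_cap_states; i++)
			if (cpu_energy.cap_states[i].cap >= util)
				break;
		/* clamp if util exceeds the maximum capacity */
		if (i == cpu_energy.nr_cap_states)
			i--;
		return i;
	}

	int main(void)
	{
		unsigned int idx = pick_cap_state(200);

		printf("util 200 -> state %u (cap %lu)\n",
		       idx, cpu_energy.cap_states[idx].cap); /* state 1 */
		return 0;
	}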
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Dietmar Eggemann --- include/linux/sched.h | 19 +++++++++++++++++++ kernel/sched/sched.h | 1 + 2 files changed, 20 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 145c34cb106e..5749286f7b66 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1023,6 +1023,22 @@ struct sched_domain_attr { extern int sched_domain_level_max; +struct capacity_state { + unsigned long cap; /* compute capacity */ + unsigned long power; /* power consumption at this compute capacity */ +}; + +struct idle_state { + unsigned long power; /* power consumption in this idle state */ +}; + +struct sched_group_energy { + unsigned int nr_idle_states; /* number of idle states */ + struct idle_state *idle_states; /* ptr to idle state array */ + unsigned int nr_cap_states; /* number of capacity states */ + struct capacity_state *cap_states; /* ptr to capacity state array */ +}; + struct sched_group; struct sched_domain { @@ -1121,6 +1137,8 @@ bool cpus_share_cache(int this_cpu, int that_cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); +typedef +const struct sched_group_energy * const(*sched_domain_energy_f)(int cpu); #define SDTL_OVERLAP 0x01 @@ -1133,6 +1151,7 @@ struct sd_data { struct sched_domain_topology_level { sched_domain_mask_f mask; sched_domain_flags_f sd_flags; + sched_domain_energy_f energy; int flags; int numa_level; struct sd_data data; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 412a072da775..db1cd295258e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -863,6 +863,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; + const struct sched_group_energy const *sge; /* * The CPUs this group covers. From 0605f407b95f3811588e4532a6dae272c7fa1d8d Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 16:20:20 +0000 Subject: [PATCH 467/813] sched: Initialize energy data structures The sched_group_energy (sge) pointer of the first sched_group (sg) in the sched_domain (sd) is initialized to point to the appropriate (in terms of sd level and cpu) sge data defined in the arch and so to the correct part of the Energy Model (EM). Energy-aware scheduling allows that a system has only EM data up to a certain sd level (so called highest energy aware balancing sd level). A check in init_sched_energy() enforces that all sd's below this sd level contain EM data. The 'int cpu' parameter of sched_domain_energy_f requires that check_sched_energy_data() makes sure that all cpus spanned by a sg are provisioned with the same EM data. This patch has also been tested with feature FORCE_SD_OVERLAP enabled. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 65 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8bf755697230..e4be7f09d46b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6308,6 +6308,66 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); } +/* + * Check that the per-cpu provided sd energy data is consistent for all cpus + * within the mask. 
+ */ +static inline void check_sched_energy_data(int cpu, sched_domain_energy_f fn, + const struct cpumask *cpumask) +{ + const struct sched_group_energy * const sge = fn(cpu); + struct cpumask mask; + int i; + + if (cpumask_weight(cpumask) <= 1) + return; + + cpumask_xor(&mask, cpumask, get_cpu_mask(cpu)); + + for_each_cpu(i, &mask) { + const struct sched_group_energy * const e = fn(i); + int y; + + BUG_ON(e->nr_idle_states != sge->nr_idle_states); + + for (y = 0; y < (e->nr_idle_states); y++) { + BUG_ON(e->idle_states[y].power != + sge->idle_states[y].power); + } + + BUG_ON(e->nr_cap_states != sge->nr_cap_states); + + for (y = 0; y < (e->nr_cap_states); y++) { + BUG_ON(e->cap_states[y].cap != sge->cap_states[y].cap); + BUG_ON(e->cap_states[y].power != + sge->cap_states[y].power); + } + } +} + +static void init_sched_energy(int cpu, struct sched_domain *sd, + sched_domain_energy_f fn) +{ + if (!(fn && fn(cpu))) + return; + + if (cpu != group_balance_cpu(sd->groups)) + return; + + if (sd->child && !sd->child->groups->sge) { + pr_err("BUG: EAS setup broken for CPU%d\n", cpu); +#ifdef CONFIG_SCHED_DEBUG + pr_err(" energy data on %s but not on %s domain\n", + sd->name, sd->child->name); +#endif + return; + } + + check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups)); + + sd->groups->sge = fn(cpu); +} + /* * Initializers for schedule domains * Non-inlined to reduce accumulated stack pressure in build_sched_domains() @@ -7014,10 +7074,13 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { + struct sched_domain_topology_level *tl = sched_domain_topology; + if (!cpumask_test_cpu(i, cpu_map)) continue; - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) { + init_sched_energy(i, sd, tl->energy); claim_allocations(i, sd); init_sched_groups_capacity(i, sd); } From e2cee0c2f0656074fbda9c2f0fe013419fde47e5 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 13:50:46 +0000 Subject: [PATCH 468/813] sched: Introduce SD_SHARE_CAP_STATES sched_domain flag cpufreq is currently keeping it a secret which cpus are sharing clock source. The scheduler needs to know about clock domains as well to become more energy aware. The SD_SHARE_CAP_STATES domain flag indicates whether cpus belonging to the sched_domain share capacity states (P-states). There is no connection with cpufreq (yet). The flag must be set by the arch specific topology code. 
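The practical consequence of shared capacity states can be sketched with
invented utilization numbers: a frequency domain must run fast enough for its
busiest cpu, so the P-state request is driven by the maximum utilization in
the domain (this anticipates group_max_util() introduced later in the
series):

	#include <stdio.h>

	int main(void)
	{
		/* Invented utilizations of two cpus sharing one clock */
		unsigned long util[2] = { 700, 150 };
		unsigned long max_util = 0;

		for (int i = 0; i < 2; i++)
			if (util[i] > max_util)
				max_util = util[i];

		/* Both cpus end up at a capacity state fitting 700, even
		 * though cpu1 alone would only need a low one. */
		printf("domain-wide capacity request: %lu\n", max_util);
		return 0;
	}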
cc: Russell King
cc: Ingo Molnar
cc: Peter Zijlstra

Signed-off-by: Morten Rasmussen
---
 arch/arm/kernel/topology.c | 3 ++-
 include/linux/sched.h      | 1 +
 kernel/sched/core.c        | 10 +++++++---
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 614554765e44..38e7be162b79 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -277,7 +277,8 @@ void store_cpu_topology(unsigned int cpuid)
 
 static inline int cpu_corepower_flags(void)
 {
-	return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
+	return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \
+	       SD_SHARE_CAP_STATES;
 }
 
 static struct sched_domain_topology_level arm_topology[] = {

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5749286f7b66..4478d3921714 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -991,6 +991,7 @@ extern void wake_up_q(struct wake_q_head *head);
 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
 #define SD_OVERLAP		0x2000	/* sched_domains of this level overlap */
 #define SD_NUMA			0x4000	/* cross-node balancing */
+#define SD_SHARE_CAP_STATES	0x8000	/* Domain members share capacity state */
 
 #ifdef CONFIG_SCHED_SMT
 static inline int cpu_smt_flags(void)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e4be7f09d46b..b4a6fbd7518e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5779,7 +5779,8 @@ static int sd_degenerate(struct sched_domain *sd)
 			 SD_BALANCE_EXEC |
 			 SD_SHARE_CPUCAPACITY |
 			 SD_SHARE_PKG_RESOURCES |
-			 SD_SHARE_POWERDOMAIN)) {
+			 SD_SHARE_POWERDOMAIN |
+			 SD_SHARE_CAP_STATES)) {
 		if (sd->groups != sd->groups->next)
 			return 0;
 	}
@@ -5811,7 +5812,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 				SD_SHARE_CPUCAPACITY |
 				SD_SHARE_PKG_RESOURCES |
 				SD_PREFER_SIBLING |
-				SD_SHARE_POWERDOMAIN);
+				SD_SHARE_POWERDOMAIN |
+				SD_SHARE_CAP_STATES);
 		if (nr_node_ids == 1)
 			pflags &= ~SD_SERIALIZE;
 	}
@@ -6476,6 +6478,7 @@ static int sched_domains_curr_level;
  * SD_SHARE_PKG_RESOURCES - describes shared caches
  * SD_NUMA                - describes NUMA topologies
  * SD_SHARE_POWERDOMAIN   - describes shared power domain
+ * SD_SHARE_CAP_STATES    - describes shared capacity states
  *
  * Odd one out:
  * SD_ASYM_PACKING        - describes SMT quirks
@@ -6485,7 +6488,8 @@ static int sched_domains_curr_level;
	 SD_SHARE_PKG_RESOURCES |	\
	 SD_NUMA |			\
	 SD_ASYM_PACKING |		\
-	 SD_SHARE_POWERDOMAIN)
+	 SD_SHARE_POWERDOMAIN |		\
+	 SD_SHARE_CAP_STATES)
 
 static struct sched_domain *
 sd_init(struct sched_domain_topology_level *tl, int cpu)

From 58342bc498f2450e02a2e1aafc28bb8cdf59b5cf Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann
Date: Fri, 10 Jul 2015 13:57:19 +0100
Subject: [PATCH 469/813] arm: Cpu invariant scheduler load-tracking and capacity support

Provides the scheduler with a cpu scaling correction factor for more accurate
load-tracking and cpu capacity handling.

The Energy Model (EM) (in fact the capacity value of the last element of the
capacity states vector of the core (MC) level sched_group_energy structure)
is used instead of the arm arch specific cpu_efficiency and dtb property
'clock-frequency' values as the source for this cpu scaling factor.

The cpu capacity value depends on the micro-architecture and the maximum
frequency of the cpu. The maximum frequency part should not be confused with
the frequency invariant scheduler load-tracking support which deals with
frequency related scaling due to DVFS functionality.
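A minimal sketch of the derivation (the capacity vectors are invented, not
real TC2 data): the cpu scale factor is simply the 'cap' of the last, i.e.
highest, capacity state of the core-level table.

	#include <stdio.h>

	struct capacity_state { unsigned long cap, power; };

	/* Invented MC-level vectors for a LITTLE and a big core */
	static struct capacity_state little_cs[] = {
		{ 150, 187 }, { 300, 275 }, { 430, 334 }
	};
	static struct capacity_state big_cs[] = {
		{ 426, 2021 }, { 742, 4153 }, { 1024, 7516 }
	};

	int main(void)
	{
		printf("LITTLE cpu_scale: %lu\n", little_cs[2].cap); /* 430 */
		printf("big cpu_scale:    %lu\n", big_cs[2].cap);   /* 1024 */
		return 0;
	}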
Signed-off-by: Juri Lelli
Signed-off-by: Dietmar Eggemann
---
 arch/arm/kernel/topology.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 38e7be162b79..da1c611a3b5e 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -153,6 +153,8 @@ static void __init parse_dt_topology(void)
 
 }
 
+static const struct sched_group_energy * const cpu_core_energy(int cpu);
+
 /*
  * Look for a customed capacity of a CPU in the cpu_capacity table during the
  * boot. The update of all CPUs is in O(n^2) for heteregeneous system but the
@@ -160,10 +162,14 @@ static void __init parse_dt_topology(void)
  */
 static void update_cpu_capacity(unsigned int cpu)
 {
-	if (!cpu_capacity(cpu))
-		return;
+	unsigned long capacity = SCHED_CAPACITY_SCALE;
 
-	set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity);
+	if (cpu_core_energy(cpu)) {
+		int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1;
+		capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap;
+	}
+
+	set_capacity_scale(cpu, capacity);
 
 	pr_info("CPU%u: update cpu_capacity %lu\n",
 		cpu, arch_scale_cpu_capacity(NULL, cpu));

From 70a528b9ada148dacc782e22fe7c0aaed9b6cfb7 Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Thu, 30 Apr 2015 11:53:48 +0100
Subject: [PATCH 470/813] arm64: Cpu invariant scheduler load-tracking and capacity support

Provides the scheduler with a cpu scaling correction factor for more accurate
load-tracking and cpu capacity handling.

The Energy Model (EM) (in fact the capacity value of the last element of the
capacity states vector of the core (MC) level sched_group_energy structure)
is used as the source for this cpu scaling factor.

The cpu capacity value depends on the micro-architecture and the maximum
frequency of the cpu. The maximum frequency part should not be confused with
the frequency invariant scheduler load-tracking support which deals with
frequency related scaling due to DVFS functionality.
Signed-off-by: Juri Lelli Signed-off-by: Dietmar Eggemann --- arch/arm64/include/asm/topology.h | 4 ++- arch/arm64/kernel/topology.c | 42 +++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 72c47c3fa7b3..9370a336c934 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -22,11 +22,13 @@ void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +struct sched_domain; #ifdef CONFIG_CPU_FREQ #define arch_scale_freq_capacity cpufreq_scale_freq_capacity -struct sched_domain; extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); #endif +#define arch_scale_cpu_capacity scale_cpu_capacity +extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu); #include diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 694f6deedbab..fb99a6735fd4 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -23,6 +23,18 @@ #include #include +static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; + +unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) +{ + return per_cpu(cpu_scale, cpu); +} + +static void set_capacity_scale(unsigned int cpu, unsigned long capacity) +{ + per_cpu(cpu_scale, cpu) = capacity; +} + static int __init get_cpu_for_node(struct device_node *node) { struct device_node *cpu_node; @@ -211,6 +223,35 @@ const struct cpumask *cpu_coregroup_mask(int cpu) return &cpu_topology[cpu].core_sibling; } +static inline int cpu_corepower_flags(void) +{ + return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \ + SD_SHARE_CAP_STATES; +} + +static struct sched_domain_topology_level arm64_topology[] = { +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + +static void update_cpu_capacity(unsigned int cpu) +{ + unsigned long capacity = SCHED_CAPACITY_SCALE; + + if (cpu_core_energy(cpu)) { + int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1; + capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap; + } + + set_capacity_scale(cpu, capacity); + + pr_info("CPU%d: update cpu_capacity %lu\n", + cpu, arch_scale_cpu_capacity(NULL, cpu)); +} + static void update_siblings_masks(unsigned int cpuid) { struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; @@ -272,6 +313,7 @@ void store_cpu_topology(unsigned int cpuid) topology_populated: update_siblings_masks(cpuid); + update_cpu_capacity(cpuid); } static void __init reset_cpu_topology(void) From ccfcc4ed08c51b800a829053567af1ab661f4a07 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 14:11:28 +0000 Subject: [PATCH 471/813] sched: Compute cpu capacity available at current frequency capacity_orig_of() returns the max available compute capacity of a cpu. For scale-invariant utilization tracking and energy-aware scheduling decisions it is useful to know the compute capacity available at the current OPP of a cpu. 
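A worked example of the arithmetic the following hunk performs, with
illustrative numbers (capacity_orig 1024, cpu running at half its maximum
frequency):

	#include <stdio.h>

	#define SCHED_CAPACITY_SHIFT	10

	int main(void)
	{
		unsigned long capacity_orig = 1024;	/* max capacity */
		/* arch_scale_freq_capacity() result at half speed,
		 * where 1024 means full speed */
		unsigned long freq_scale = 512;

		unsigned long capacity_curr =
			capacity_orig * freq_scale >> SCHED_CAPACITY_SHIFT;

		printf("capacity_curr = %lu\n", capacity_curr);	/* 512 */
		return 0;
	}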
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3fa6b311f93a..0650d998d30f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4672,6 +4672,17 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +static unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); From 0660e45c2f66775fa0ebfa35dd73a070f8542568 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 11 Dec 2014 15:25:29 +0000 Subject: [PATCH 472/813] sched: Relocated cpu_util() and change return type Move cpu_util() to an earlier position in fair.c and change return type to unsigned long as negative usage doesn't make much sense. All other load and capacity related functions use unsigned long including the caller of cpu_util(). cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 70 ++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0650d998d30f..76edc5be3412 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4683,6 +4683,40 @@ static unsigned long capacity_curr_of(int cpu) >> SCHED_CAPACITY_SHIFT; } +/* + * cpu_util returns the amount of capacity of a CPU that is used by CFS + * tasks. The unit of the return value must be the one of capacity so we can + * compare the utilization with the capacity of the CPU that is available for + * CFS task (ie cpu_capacity). + * + * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on a CPU. It represents + * the amount of utilization of a CPU in the range [0..capacity_orig] where + * capacity_orig is the cpu_capacity available at the highest frequency + * (arch_scale_freq_capacity()). + * The utilization of a CPU converges towards a sum equal to or less than the + * current capacity (capacity_curr <= capacity_orig) of the CPU because it is + * the running time on this CPU scaled by capacity_curr. + * + * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even + * higher than capacity_orig because of unfortunate rounding in + * cfs.avg.util_avg or just after migrating tasks and new task wakeups until + * the average stabilizes with the new running time. We need to check that the + * utilization stays within the range of [0..capacity_orig] and cap it if + * necessary. Without utilization capping, a group could be seen as overloaded + * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. We allow utilization to overshoot capacity_curr (but not + * capacity_orig) as it useful for predicting the capacity required after task + * migrations (scheduler-driven DVFS). + */ +static unsigned long cpu_util(int cpu) +{ + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long capacity = capacity_orig_of(cpu); + + return (util >= capacity) ? 
capacity : util; +} + static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); @@ -4809,8 +4843,6 @@ static inline bool task_fits_max(struct task_struct *p, int cpu) return __task_fits(p, cpu, 0); } -static int cpu_util(int cpu); - static inline bool task_fits_spare(struct task_struct *p, int cpu) { return __task_fits(p, cpu, cpu_util(cpu)); @@ -5009,40 +5041,6 @@ done: return target; } -/* - * cpu_util returns the amount of capacity of a CPU that is used by CFS - * tasks. The unit of the return value must be the one of capacity so we can - * compare the utilization with the capacity of the CPU that is available for - * CFS task (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). - */ -static int cpu_util(int cpu) -{ - unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; - unsigned long capacity = capacity_orig_of(cpu); - - return (util >= capacity) ? capacity : util; -} - /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, From 6b8bde20df2dd087dd8e3554a9a8c9c511a3b123 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 2 Jan 2015 17:08:52 +0000 Subject: [PATCH 473/813] sched: Highest energy aware balancing sched_domain level pointer Add another member to the family of per-cpu sched_domain shortcut pointers. This one, sd_ea, points to the highest level at which energy model is provided. At this level and all levels below all sched_groups have energy model data attached. Partial energy model information is possible but restricted to providing energy model data for lower level sched_domains (sd_ea and below) and leaving load-balancing on levels above to non-energy-aware load-balancing. For example, it is possible to apply energy-aware scheduling within each socket on a multi-socket system and let normal scheduling handle load-balancing between sockets. 
cc: Ingo Molnar
cc: Peter Zijlstra

Signed-off-by: Morten Rasmussen
---
 kernel/sched/core.c  | 11 ++++++++++-
 kernel/sched/sched.h | 1 +
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b4a6fbd7518e..42dfdd567179 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5997,11 +5997,12 @@ DEFINE_PER_CPU(int, sd_llc_id);
 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain *, sd_busy);
 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+DEFINE_PER_CPU(struct sched_domain *, sd_ea);
 
 static void update_top_cache_domain(int cpu)
 {
 	struct sched_domain *sd;
-	struct sched_domain *busy_sd = NULL;
+	struct sched_domain *busy_sd = NULL, *ea_sd = NULL;
 	int id = cpu;
 	int size = 1;
 
@@ -6022,6 +6023,14 @@ static void update_top_cache_domain(int cpu)
 
 	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
 	rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
+
+	for_each_domain(cpu, sd) {
+		if (sd->groups->sge)
+			ea_sd = sd;
+		else
+			break;
+	}
+	rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
 }
 
 /*

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index db1cd295258e..2d5280253033 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -839,6 +839,7 @@ DECLARE_PER_CPU(int, sd_llc_id);
 DECLARE_PER_CPU(struct sched_domain *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain *, sd_busy);
 DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+DECLARE_PER_CPU(struct sched_domain *, sd_ea);
 
 struct sched_group_capacity {
 	atomic_t ref;

From 074bcc729d948432568446cbc5598ecabb80bbb9 Mon Sep 17 00:00:00 2001
From: Morten Rasmussen
Date: Thu, 18 Dec 2014 14:47:18 +0000
Subject: [PATCH 474/813] sched: Calculate energy consumption of sched_group

For energy-aware load-balancing decisions it is necessary to know the energy
consumption estimates of groups of cpus. This patch introduces a basic
function, sched_group_energy(), which estimates the energy consumption of the
cpus in the group and any resources shared by the members of the group.

NOTE: The function has five levels of indentation and breaks the 80 character
limit. Refactoring is necessary.

cc: Ingo Molnar
cc: Peter Zijlstra

Signed-off-by: Morten Rasmussen
---
 kernel/sched/core.c  | 4 ++
 kernel/sched/fair.c  | 156 +++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h | 1 +
 3 files changed, 161 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 42dfdd567179..1e0b273c20d5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5998,6 +5998,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain *, sd_busy);
 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
 DEFINE_PER_CPU(struct sched_domain *, sd_ea);
+DEFINE_PER_CPU(struct sched_domain *, sd_scs);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -6031,6 +6032,9 @@ static void update_top_cache_domain(int cpu)
 			break;
 	}
 	rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
+
+	sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
+	rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
 }
 
 /*

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 76edc5be3412..3987cf828f02 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4722,6 +4722,162 @@ static inline bool energy_aware(void)
 	return sched_feat(ENERGY_AWARE);
 }
 
+/*
+ * cpu_norm_util() returns the cpu util relative to a specific capacity,
+ * i.e. its busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for
+ * energy calculations.
Using the scale-invariant util returned by + * cpu_util() and approximating scale-invariant util by: + * + * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time + * + * the normalized util can be found using the specific capacity. + * + * capacity = capacity_orig * curr_freq/max_freq + * + * norm_util = running_time/time ~ util/capacity + */ +static unsigned long cpu_norm_util(int cpu, unsigned long capacity) +{ + int util = cpu_util(cpu); + + if (util >= capacity) + return SCHED_CAPACITY_SCALE; + + return (util << SCHED_CAPACITY_SHIFT)/capacity; +} + +static unsigned long group_max_util(struct sched_group *sg) +{ + int i; + unsigned long max_util = 0; + + for_each_cpu(i, sched_group_cpus(sg)) + max_util = max(max_util, cpu_util(i)); + + return max_util; +} + +/* + * group_norm_util() returns the approximated group util relative to it's + * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in + * energy calculations. Since task executions may or may not overlap in time in + * the group the true normalized util is between max(cpu_norm_util(i)) and + * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The + * latter is used as the estimate as it leads to a more pessimistic energy + * estimate (more busy). + */ +static unsigned long group_norm_util(struct sched_group *sg, int cap_idx) +{ + int i; + unsigned long util_sum = 0; + unsigned long capacity = sg->sge->cap_states[cap_idx].cap; + + for_each_cpu(i, sched_group_cpus(sg)) + util_sum += cpu_norm_util(i, capacity); + + if (util_sum > SCHED_CAPACITY_SCALE) + return SCHED_CAPACITY_SCALE; + return util_sum; +} + +static int find_new_capacity(struct sched_group *sg, + const struct sched_group_energy const *sge) +{ + int idx; + unsigned long util = group_max_util(sg); + + for (idx = 0; idx < sge->nr_cap_states; idx++) { + if (sge->cap_states[idx].cap >= util) + return idx; + } + + return idx; +} + +/* + * sched_group_energy(): Computes the absolute energy consumption of cpus + * belonging to the sched_group including shared resources shared only by + * members of the group. Iterates over all cpus in the hierarchy below the + * sched_group starting from the bottom working it's way up before going to + * the next cpu until all cpus are covered at all levels. The current + * implementation is likely to gather the same util statistics multiple times. + * This can probably be done in a faster but more complex way. + * Note: sched_group_energy() may fail when racing with sched_domain updates. + */ +static int sched_group_energy(struct sched_group *sg_top) +{ + struct sched_domain *sd; + int cpu, total_energy = 0; + struct cpumask visit_cpus; + struct sched_group *sg; + + WARN_ON(!sg_top->sge); + + cpumask_copy(&visit_cpus, sched_group_cpus(sg_top)); + + while (!cpumask_empty(&visit_cpus)) { + struct sched_group *sg_shared_cap = NULL; + + cpu = cpumask_first(&visit_cpus); + + /* + * Is the group utilization affected by cpus outside this + * sched_group? + */ + sd = rcu_dereference(per_cpu(sd_scs, cpu)); + + if (!sd) + /* + * We most probably raced with hotplug; returning a + * wrong energy estimation is better than entering an + * infinite loop. + */ + return -EINVAL; + + if (sd->parent) + sg_shared_cap = sd->parent->groups; + + for_each_domain(cpu, sd) { + sg = sd->groups; + + /* Has this sched_domain already been visited? 
*/ + if (sd->child && group_first_cpu(sg) != cpu) + break; + + do { + struct sched_group *sg_cap_util; + unsigned long group_util; + int sg_busy_energy, sg_idle_energy, cap_idx; + + if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) + sg_cap_util = sg_shared_cap; + else + sg_cap_util = sg; + + cap_idx = find_new_capacity(sg_cap_util, sg->sge); + group_util = group_norm_util(sg, cap_idx); + sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) + >> SCHED_CAPACITY_SHIFT; + sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power) + >> SCHED_CAPACITY_SHIFT; + + total_energy += sg_busy_energy + sg_idle_energy; + + if (!sd->child) + cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg)); + + if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(sg_top))) + goto next_cpu; + + } while (sg = sg->next, sg != sd->groups); + } +next_cpu: + continue; + } + + return total_energy; +} + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * A waker of many should wake a different task than the one last awakened diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2d5280253033..f4b2bcb017a3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -840,6 +840,7 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_busy); DECLARE_PER_CPU(struct sched_domain *, sd_asym); DECLARE_PER_CPU(struct sched_domain *, sd_ea); +DECLARE_PER_CPU(struct sched_domain *, sd_scs); struct sched_group_capacity { atomic_t ref; From b7a0598b071c5e2f269d0afbbbd1b6d36b83bb3f Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 2 Jan 2015 14:21:56 +0000 Subject: [PATCH 475/813] sched: Extend sched_group_energy to test load-balancing decisions Extended sched_group_energy() to support energy prediction with usage (tasks) added/removed from a specific cpu or migrated between a pair of cpus. Useful for load-balancing decision making. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 90 +++++++++++++++++++++++++++++++-------------- 1 file changed, 63 insertions(+), 27 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3987cf828f02..8a912fdc526d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4709,12 +4709,21 @@ static unsigned long capacity_curr_of(int cpu) * capacity_orig) as it useful for predicting the capacity required after task * migrations (scheduler-driven DVFS). */ -static unsigned long cpu_util(int cpu) +static unsigned long __cpu_util(int cpu, int delta) { unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); - return (util >= capacity) ? capacity : util; + delta += util; + if (delta < 0) + return 0; + + return (delta >= capacity) ? capacity : delta; +} + +static unsigned long cpu_util(int cpu) +{ + return __cpu_util(cpu, 0); } static inline bool energy_aware(void) @@ -4722,8 +4731,18 @@ static inline bool energy_aware(void) return sched_feat(ENERGY_AWARE); } +struct energy_env { + struct sched_group *sg_top; + struct sched_group *sg_cap; + int cap_idx; + int util_delta; + int src_cpu; + int dst_cpu; + int energy; +}; + /* - * cpu_norm_util() returns the cpu util relative to a specific capacity, + * __cpu_norm_util() returns the cpu util relative to a specific capacity, * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for * energy calculations. 
Using the scale-invariant util returned by * cpu_util() and approximating scale-invariant util by: @@ -4736,9 +4755,9 @@ static inline bool energy_aware(void) * * norm_util = running_time/time ~ util/capacity */ -static unsigned long cpu_norm_util(int cpu, unsigned long capacity) +static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta) { - int util = cpu_util(cpu); + int util = __cpu_util(cpu, delta); if (util >= capacity) return SCHED_CAPACITY_SCALE; @@ -4746,13 +4765,25 @@ static unsigned long cpu_norm_util(int cpu, unsigned long capacity) return (util << SCHED_CAPACITY_SHIFT)/capacity; } -static unsigned long group_max_util(struct sched_group *sg) +static int calc_util_delta(struct energy_env *eenv, int cpu) { - int i; + if (cpu == eenv->src_cpu) + return -eenv->util_delta; + if (cpu == eenv->dst_cpu) + return eenv->util_delta; + return 0; +} + +static +unsigned long group_max_util(struct energy_env *eenv) +{ + int i, delta; unsigned long max_util = 0; - for_each_cpu(i, sched_group_cpus(sg)) - max_util = max(max_util, cpu_util(i)); + for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) { + delta = calc_util_delta(eenv, i); + max_util = max(max_util, __cpu_util(i, delta)); + } return max_util; } @@ -4766,31 +4797,36 @@ static unsigned long group_max_util(struct sched_group *sg) * latter is used as the estimate as it leads to a more pessimistic energy * estimate (more busy). */ -static unsigned long group_norm_util(struct sched_group *sg, int cap_idx) +static unsigned +long group_norm_util(struct energy_env *eenv, struct sched_group *sg) { - int i; + int i, delta; unsigned long util_sum = 0; - unsigned long capacity = sg->sge->cap_states[cap_idx].cap; + unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap; - for_each_cpu(i, sched_group_cpus(sg)) - util_sum += cpu_norm_util(i, capacity); + for_each_cpu(i, sched_group_cpus(sg)) { + delta = calc_util_delta(eenv, i); + util_sum += __cpu_norm_util(i, capacity, delta); + } if (util_sum > SCHED_CAPACITY_SCALE) return SCHED_CAPACITY_SCALE; return util_sum; } -static int find_new_capacity(struct sched_group *sg, +static int find_new_capacity(struct energy_env *eenv, const struct sched_group_energy const *sge) { int idx; - unsigned long util = group_max_util(sg); + unsigned long util = group_max_util(eenv); for (idx = 0; idx < sge->nr_cap_states; idx++) { if (sge->cap_states[idx].cap >= util) - return idx; + break; } + eenv->cap_idx = idx; + return idx; } @@ -4804,16 +4840,16 @@ static int find_new_capacity(struct sched_group *sg, * This can probably be done in a faster but more complex way. * Note: sched_group_energy() may fail when racing with sched_domain updates. 
*/ -static int sched_group_energy(struct sched_group *sg_top) +static int sched_group_energy(struct energy_env *eenv) { struct sched_domain *sd; int cpu, total_energy = 0; struct cpumask visit_cpus; struct sched_group *sg; - WARN_ON(!sg_top->sge); + WARN_ON(!eenv->sg_top->sge); - cpumask_copy(&visit_cpus, sched_group_cpus(sg_top)); + cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top)); while (!cpumask_empty(&visit_cpus)) { struct sched_group *sg_shared_cap = NULL; @@ -4845,17 +4881,16 @@ static int sched_group_energy(struct sched_group *sg_top) break; do { - struct sched_group *sg_cap_util; unsigned long group_util; int sg_busy_energy, sg_idle_energy, cap_idx; if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) - sg_cap_util = sg_shared_cap; + eenv->sg_cap = sg_shared_cap; else - sg_cap_util = sg; + eenv->sg_cap = sg; - cap_idx = find_new_capacity(sg_cap_util, sg->sge); - group_util = group_norm_util(sg, cap_idx); + cap_idx = find_new_capacity(eenv, sg->sge); + group_util = group_norm_util(eenv, sg); sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) >> SCHED_CAPACITY_SHIFT; sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power) @@ -4866,7 +4901,7 @@ static int sched_group_energy(struct sched_group *sg_top) if (!sd->child) cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg)); - if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(sg_top))) + if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top))) goto next_cpu; } while (sg = sg->next, sg != sd->groups); @@ -4875,7 +4910,8 @@ next_cpu: continue; } - return total_energy; + eenv->energy = total_energy; + return 0; } /* From c77a2807a8d9bdca803255ba189650431ffd83d0 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 6 Jan 2015 17:34:05 +0000 Subject: [PATCH 476/813] sched: Estimate energy impact of scheduling decisions Adds a generic energy-aware helper function, energy_diff(), that calculates energy impact of adding, removing, and migrating utilization in the system. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8a912fdc526d..21a4cda8c5ff 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4914,6 +4914,58 @@ next_cpu: return 0; } +static inline bool cpu_in_sg(struct sched_group *sg, int cpu) +{ + return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); +} + +/* + * energy_diff(): Estimate the energy impact of changing the utilization + * distribution. eenv specifies the change: utilisation amount, source, and + * destination cpu. Source or destination cpu may be -1 in which case the + * utilization is removed from or added to the system (e.g. task wake-up). If + * both are specified, the utilization is migrated. + */ +static int energy_diff(struct energy_env *eenv) +{ + struct sched_domain *sd; + struct sched_group *sg; + int sd_cpu = -1, energy_before = 0, energy_after = 0; + + struct energy_env eenv_before = { + .util_delta = 0, + .src_cpu = eenv->src_cpu, + .dst_cpu = eenv->dst_cpu, + }; + + if (eenv->src_cpu == eenv->dst_cpu) + return 0; + + sd_cpu = (eenv->src_cpu != -1) ? 
			eenv->src_cpu : eenv->dst_cpu;
+	sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
+
+	if (!sd)
+		return 0; /* Error */
+
+	sg = sd->groups;
+
+	do {
+		if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
+			eenv_before.sg_top = eenv->sg_top = sg;
+
+			if (sched_group_energy(&eenv_before))
+				return 0; /* Invalid result, abort */
+			energy_before += eenv_before.energy;
+
+			if (sched_group_energy(eenv))
+				return 0; /* Invalid result, abort */
+			energy_after += eenv->energy;
+		}
+	} while (sg = sg->next, sg != sd->groups);
+
+	return energy_after-energy_before;
+}
+
 /*
  * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
  * A waker of many should wake a different task than the one last awakened

From ce646469d55c29913446a4694d1cba2819c3d950 Mon Sep 17 00:00:00 2001
From: Morten Rasmussen
Date: Sat, 9 May 2015 16:49:57 +0100
Subject: [PATCH 477/813] sched: Add over-utilization/tipping point indicator

Energy-aware scheduling is only meant to be active while the system is
_not_ over-utilized. That is, there are spare cycles available to shift
tasks around based on their actual utilization to get a more
energy-efficient task distribution without depriving any tasks. When
above the tipping point, task placement is done the traditional way
based on load_avg, spreading the tasks across as many cpus as possible
based on priority scaled load to preserve smp_nice. Below the tipping
point we want to use util_avg instead. We need to define a criterion
for when we make the switch.

The util_avg for each cpu converges towards 100% (1024) regardless of
how many additional tasks we may put on it. If we define over-utilized
as:

	sum_{cpus}(rq.cfs.avg.util_avg) + margin > sum_{cpus}(rq.capacity)

some individual cpus may be over-utilized running multiple tasks even
when the above condition is false. That should be okay as long as we
try to spread the tasks out to avoid per-cpu over-utilization as much
as possible and if all tasks have the _same_ priority. If the latter
isn't true, we have to consider priority to preserve smp_nice.

For example, we could have n_cpus nice=-10 util_avg=55% tasks and
n_cpus/2 nice=0 util_avg=60% tasks. Balancing based on util_avg we are
likely to end up with nice=-10 tasks sharing cpus and nice=0 tasks
getting their own as we have 1.5*n_cpus tasks in total and 55%+55% is
less over-utilized than 55%+60% for those cpus that have to be shared.
The system utilization is only 85% of the system capacity, but we are
breaking smp_nice.

To be sure not to break smp_nice, we have defined over-utilization
conservatively as when any cpu in the system is fully utilized at its
highest frequency instead:

	cpu_rq(any).cfs.avg.util_avg + margin > cpu_rq(any).capacity

IOW, as soon as one cpu is (nearly) 100% utilized, we switch to
load_avg to factor in priority to preserve smp_nice.

With this definition, we can skip periodic load-balance as no cpu has
an always-running task when the system is not over-utilized. All tasks
will be periodic and we can balance them at wake-up. This conservative
condition does however mean that some scenarios that could benefit from
energy-aware decisions even if one cpu is fully utilized would not get
those benefits.

For systems where some cpus might have reduced capacity (RT-pressure
and/or big.LITTLE), we want periodic load-balance checks as soon as
just a single cpu is fully utilized, as it might be one of those with
reduced capacity and in that case we want to migrate it.
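For illustration, the tipping-point test described above reduces to a
single capacity/utilization comparison per cpu. A minimal sketch,
assuming a headroom constant of 1280 (~25% above SCHED_CAPACITY_SCALE;
the value is an illustrative assumption, not taken from this patch):

	/*
	 * Sketch only: a cpu is treated as over-utilized once its
	 * utilization plus a ~25% margin no longer fits its capacity.
	 * The margin constant is an assumption for this example.
	 */
	static bool cpu_overutilized(int cpu)
	{
		unsigned long capacity_margin = 1280; /* ~1.25 * 1024, assumed */

		return (capacity_of(cpu) * SCHED_CAPACITY_SCALE) <
		       (cpu_util(cpu) * capacity_margin);
	}

With a capacity of 1024 this trips once util_avg exceeds roughly 80% of
the cpu's capacity, i.e. within the margin of being fully utilized.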
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 31 +++++++++++++++++++++++++------ kernel/sched/sched.h | 3 +++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 21a4cda8c5ff..d6b6e59e63eb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4160,6 +4160,8 @@ static inline void hrtick_update(struct rq *rq) } #endif +static bool cpu_overutilized(int cpu); + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -4170,6 +4172,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int task_new = !(flags & ENQUEUE_WAKEUP); for_each_sched_entity(se) { if (se->on_rq) @@ -4201,9 +4204,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) + if (!se) { add_nr_running(rq, 1); - + if (!task_new && !rq->rd->overutilized && + cpu_overutilized(rq->cpu)) + rq->rd->overutilized = true; + } hrtick_update(rq); } @@ -6672,11 +6678,12 @@ group_type group_classify(struct sched_group *group, * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. * @overload: Indicate more than one runnable task for any CPU. + * @overutilized: Indicate overutilization for any CPU. */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs, - bool *overload) + bool *overload, bool *overutilized) { unsigned long load; int i; @@ -6706,6 +6713,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->sum_weighted_load += weighted_cpuload(i); if (idle_cpu(i)) sgs->idle_cpus++; + + if (cpu_overutilized(i)) + *overutilized = true; } /* Adjust by relative CPU capacity of the group */ @@ -6811,7 +6821,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; - bool overload = false; + bool overload = false, overutilized = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; @@ -6833,7 +6843,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd } update_sg_lb_stats(env, sg, load_idx, local_group, sgs, - &overload); + &overload, &overutilized); if (local_group) goto next_group; @@ -6877,8 +6887,14 @@ next_group: /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; - } + /* Update over-utilization (tipping point, U >= 0) indicator */ + if (env->dst_rq->rd->overutilized != overutilized) + env->dst_rq->rd->overutilized = overutilized; + } else { + if (!env->dst_rq->rd->overutilized && overutilized) + env->dst_rq->rd->overutilized = true; + } } /** @@ -8271,6 +8287,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + + if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) + rq->rd->overutilized = true; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f4b2bcb017a3..60a6cbc1eb7a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -528,6 +528,9 @@ struct root_domain { /* Indicate more than one runnable task for any CPU */ bool overload; + /* Indicate one or more cpus over-utilized 
+	   (tipping point) */
+	bool overutilized;
+
 	/*
 	 * The bit corresponding to a CPU gets set here if such CPU has more
 	 * than one runnable -deadline task (as it is below for RT tasks).

From df05d846a435268b454eb03fee3b7859dfc94471 Mon Sep 17 00:00:00 2001
From: Morten Rasmussen
Date: Tue, 27 Jan 2015 13:48:07 +0000
Subject: [PATCH 478/813] sched, cpuidle: Track cpuidle state index in the scheduler

The idle-state of each cpu is currently pointed to by rq->idle_state
but there isn't any information in the struct cpuidle_state that can be
used to look up the idle-state energy model data stored in struct
sched_group_energy. For this purpose it is necessary to store the idle
state index as well. Ideally, the idle-state data should be unified.

cc: Ingo Molnar
cc: Peter Zijlstra

Signed-off-by: Morten Rasmussen
---
 drivers/cpuidle/cpuidle.c |  4 ++--
 include/linux/cpuidle.h   |  2 +-
 kernel/sched/idle.c       |  3 ++-
 kernel/sched/sched.h      | 21 +++++++++++++++++++++
 4 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index d40b2c077746..151971627757 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -192,7 +192,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 	}
 
 	/* Take note of the planned idle state. */
-	sched_idle_set_state(target_state);
+	sched_idle_set_state(target_state, index);
 
 	trace_cpu_idle_rcuidle(index, dev->cpu);
 	time_start = ktime_get();
@@ -205,7 +205,7 @@
 	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
 
 	/* The cpu is no longer idle or about to enter idle. */
-	sched_idle_set_state(NULL);
+	sched_idle_set_state(NULL, -1);
 
 	if (broadcast) {
 		if (WARN_ON_ONCE(!irqs_disabled()))
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 786ad32631a6..6eae1576499e 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -204,7 +204,7 @@ static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv,
 #endif
 
 /* kernel/sched/idle.c */
-extern void sched_idle_set_state(struct cpuidle_state *idle_state);
+extern void sched_idle_set_state(struct cpuidle_state *idle_state, int index);
 extern void default_idle_call(void);
 
 #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 4a2ef5a02fd3..cbc130efbc5b 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -19,9 +19,10 @@
  * sched_idle_set_state - Record idle state for the current CPU.
  * @idle_state: State to record.
*/ -void sched_idle_set_state(struct cpuidle_state *idle_state) +void sched_idle_set_state(struct cpuidle_state *idle_state, int index) { idle_set_state(this_rq(), idle_state); + idle_set_state_idx(this_rq(), index); } static int __read_mostly cpu_idle_force_poll; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 60a6cbc1eb7a..fd2407b0d58b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -693,6 +693,7 @@ struct rq { #ifdef CONFIG_CPU_IDLE /* Must be inspected within a rcu lock section */ struct cpuidle_state *idle_state; + int idle_state_idx; #endif }; @@ -1285,6 +1286,17 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) WARN_ON(!rcu_read_lock_held()); return rq->idle_state; } + +static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx) +{ + rq->idle_state_idx = idle_state_idx; +} + +static inline int idle_get_state_idx(struct rq *rq) +{ + WARN_ON(!rcu_read_lock_held()); + return rq->idle_state_idx; +} #else static inline void idle_set_state(struct rq *rq, struct cpuidle_state *idle_state) @@ -1295,6 +1307,15 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) { return NULL; } + +static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx) +{ +} + +static inline int idle_get_state_idx(struct rq *rq) +{ + return -1; +} #endif extern void sysrq_sched_debug_show(void); From 60591a4a82053d6e53f499318a6f76c917b5e41e Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 27 Jan 2015 14:04:17 +0000 Subject: [PATCH 479/813] sched: Determine the current sched_group idle-state To estimate the energy consumption of a sched_group in sched_group_energy() it is necessary to know which idle-state the group is in when it is idle. For now, it is assumed that this is the current idle-state (though it might be wrong). Based on the individual cpu idle-states group_idle_state() finds the group idle-state. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d6b6e59e63eb..42a743dd6eef 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4836,6 +4836,20 @@ static int find_new_capacity(struct energy_env *eenv, return idx; } +static int group_idle_state(struct sched_group *sg) +{ + int i, state = INT_MAX; + + /* Find the shallowest idle state in the sched group. 
 */
+	for_each_cpu(i, sched_group_cpus(sg))
+		state = min(state, idle_get_state_idx(cpu_rq(i)));
+
+	/* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
+	state++;
+
+	return state;
+}
+
 /*
  * sched_group_energy(): Computes the absolute energy consumption of cpus
  * belonging to the sched_group including shared resources shared only by
@@ -4888,7 +4902,8 @@ static int sched_group_energy(struct energy_env *eenv)
 
 		do {
 			unsigned long group_util;
-			int sg_busy_energy, sg_idle_energy, cap_idx;
+			int sg_busy_energy, sg_idle_energy;
+			int cap_idx, idle_idx;
 
 			if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
 				eenv->sg_cap = sg_shared_cap;
@@ -4896,11 +4911,13 @@ static int sched_group_energy(struct energy_env *eenv)
 				eenv->sg_cap = sg;
 
 			cap_idx = find_new_capacity(eenv, sg->sge);
+			idle_idx = group_idle_state(sg);
 			group_util = group_norm_util(eenv, sg);
 			sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power)
-								>> SCHED_CAPACITY_SHIFT;
+						>> SCHED_CAPACITY_SHIFT;
-			sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power)
-								>> SCHED_CAPACITY_SHIFT;
+			sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
+						* sg->sge->idle_states[idle_idx].power)
+						>> SCHED_CAPACITY_SHIFT;
 
 			total_energy += sg_busy_energy + sg_idle_energy;

From aed8d22cc84fc6086dc7cd259f6d1cf744500a07 Mon Sep 17 00:00:00 2001
From: Morten Rasmussen
Date: Sat, 9 May 2015 20:03:19 +0100
Subject: [PATCH 480/813] sched: Energy-aware wake-up task placement

Let available compute capacity and estimated energy impact select the
wake-up target cpu when energy-aware scheduling is enabled and the
system is not over-utilized (above the tipping point).

energy_aware_wake_cpu() attempts to find a group of cpus with
sufficient compute capacity to accommodate the task and a cpu with
enough spare capacity to handle the task within that group. Preference
is given to cpus with enough spare capacity at the current OPP.
Finally, the energy impact of the new target and the previous task cpu
is compared to select the wake-up target cpu.

cc: Ingo Molnar
cc: Peter Zijlstra

Signed-off-by: Morten Rasmussen
---
 kernel/sched/fair.c | 89 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 86 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 42a743dd6eef..5daa05eb5741 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5308,6 +5308,86 @@ done:
 	return target;
 }
 
+static int energy_aware_wake_cpu(struct task_struct *p, int target)
+{
+	struct sched_domain *sd;
+	struct sched_group *sg, *sg_target;
+	int target_max_cap = INT_MAX;
+	int target_cpu = task_cpu(p);
+	int i;
+
+	sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
+
+	if (!sd)
+		return target;
+
+	sg = sd->groups;
+	sg_target = sg;
+
+	/*
+	 * Find group with sufficient capacity. We only get here if no cpu is
+	 * overutilized. We may end up overutilizing a cpu by adding the task,
+	 * but that should not be any worse than select_idle_sibling().
+	 * load_balance() should sort it out later as we get above the tipping
+	 * point.
+	 */
+	do {
+		/* Assuming all cpus are the same in group */
+		int max_cap_cpu = group_first_cpu(sg);
+
+		/*
+		 * Assume smaller max capacity means more energy-efficient.
+		 * Ideally we should query the energy model for the right
+		 * answer but it easily ends up in an exhaustive search.
+ */ + if (capacity_of(max_cap_cpu) < target_max_cap && + task_fits_max(p, max_cap_cpu)) { + sg_target = sg; + target_max_cap = capacity_of(max_cap_cpu); + } + } while (sg = sg->next, sg != sd->groups); + + /* Find cpu with sufficient capacity */ + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + int new_util = cpu_util(i) + task_util(p); + + if (new_util > capacity_orig_of(i)) + continue; + + if (new_util < capacity_curr_of(i)) { + target_cpu = i; + if (cpu_rq(i)->nr_running) + break; + } + + /* cpu has capacity at higher OPP, keep it as fallback */ + if (target_cpu == task_cpu(p)) + target_cpu = i; + } + + if (target_cpu != task_cpu(p)) { + struct energy_env eenv = { + .util_delta = task_util(p), + .src_cpu = task_cpu(p), + .dst_cpu = target_cpu, + }; + + /* Not enough spare capacity on previous cpu */ + if (cpu_overutilized(task_cpu(p))) + return target_cpu; + + if (energy_diff(&eenv) >= 0) + return task_cpu(p); + } + + return target_cpu; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -5330,8 +5410,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = !wake_wide(p) && task_fits_max(p, cpu) && - cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = (!wake_wide(p) && task_fits_max(p, cpu) && + cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || + energy_aware(); rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -5361,7 +5442,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } if (!sd) { - if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ + if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) + new_cpu = energy_aware_wake_cpu(p, prev_cpu); + else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, new_cpu); } else while (sd) { From 8ca5b3ac6be0f13128aeabb81cc7dddd86da1f07 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sun, 10 May 2015 15:17:32 +0100 Subject: [PATCH 481/813] sched: Consider a not over-utilized energy-aware system as balanced In case the system operates below the tipping point indicator, introduced in ("sched: Add over-utilization/tipping point indicator"), bail out in find_busiest_group after the dst and src group statistics have been checked. There is simply no need to move usage around because all involved cpus still have spare cycles available. For an energy-aware system below its tipping point, we rely on the task placement of the wakeup path. This works well for short running tasks. The existence of long running tasks on one of the involved cpus lets the system operate over its tipping point. To be able to move such a task (whose load can't be used to average the load among the cpus) from a src cpu with lower capacity than the dst_cpu, an additional rule has to be implemented in need_active_balance. Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5daa05eb5741..a82cf23f8917 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7215,6 +7215,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * this level. 
 */
 	update_sd_lb_stats(env, &sds);
+
+	if (energy_aware() && !env->dst_rq->rd->overutilized)
+		goto out_balanced;
+
 	local = &sds.local_stat;
 	busiest = &sds.busiest_stat;

From 687f1b3502b038d9bb892fee4895880540ce6292 Mon Sep 17 00:00:00 2001
From: Morten Rasmussen
Date: Tue, 3 Feb 2015 13:54:11 +0000
Subject: [PATCH 482/813] sched: Disable energy-unfriendly nohz kicks

With energy-aware scheduling enabled, nohz_kick_needed() generates many
nohz idle-balance kicks which lead to nothing when multiple tasks get
packed on a single cpu to save energy. This causes unnecessary wake-ups
and hence wastes energy. Make these conditions depend on
!energy_aware() for now until the energy-aware nohz story gets sorted
out.

cc: Ingo Molnar
cc: Peter Zijlstra

Signed-off-by: Morten Rasmussen
---
 kernel/sched/fair.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a82cf23f8917..72651c54128c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8280,12 +8280,13 @@ static inline bool nohz_kick_needed(struct rq *rq)
 	if (time_before(now, nohz.next_balance))
 		return false;
 
-	if (rq->nr_running >= 2)
+	if (rq->nr_running >= 2 &&
+	    (!energy_aware() || cpu_overutilized(cpu)))
 		return true;
 
 	rcu_read_lock();
 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
-	if (sd) {
+	if (sd && !energy_aware()) {
 		sgc = sd->groups->sgc;
 		nr_busy = atomic_read(&sgc->nr_busy_cpus);

From 245dcdb6ab85e738660d87fafce73da66ecec280 Mon Sep 17 00:00:00 2001
From: Robin Randhawa
Date: Mon, 29 Jun 2015 17:56:20 +0100
Subject: [PATCH 483/813] Documentation: DT bindings for energy model cost data required by EAS

EAS (energy aware scheduling) provides the scheduler with an
alternative objective - energy efficiency - as opposed to its current
performance-oriented objectives. EAS relies on a simple platform energy
cost model to guide scheduling decisions. The model only considers the
CPU subsystem.

This patch adds documentation describing DT bindings that should be
used to supply the scheduler with an energy cost model.

Signed-off-by: Robin Randhawa
---
 .../bindings/scheduler/sched-energy-costs.txt | 360 ++++++++++++++++++
 1 file changed, 360 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt

diff --git a/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt b/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt
new file mode 100644
index 000000000000..11216f09e596
--- /dev/null
+++ b/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt
@@ -0,0 +1,360 @@
+===========================================================
+Energy cost bindings for Energy Aware Scheduling
+===========================================================
+
+===========================================================
+1 - Introduction
+===========================================================
+
+This note specifies bindings required for energy-aware scheduling
+(EAS)[1]. Historically, the scheduler's primary objective has been
+performance. EAS aims to provide an alternative objective - energy
+efficiency. EAS relies on a simple platform energy cost model to
+guide scheduling decisions. The model only considers the CPU
+subsystem.
+
+This note is aligned with the definition of the layout of physical
+CPUs in the system as described in the ARM topology binding
+description [2].
+The concept is applicable to any system so long as
+the cost model data is provided for those processing elements in
+that system's topology that EAS is required to service.
+
+Processing elements refer to hardware threads, CPUs and clusters of
+related CPUs in increasing order of hierarchy.
+
+EAS requires two key cost metrics - busy costs and idle costs. Busy
+costs comprise a list of compute capacities for the processing
+element in question and the corresponding power consumption at that
+capacity. Idle costs comprise a list of power consumption values
+for each idle state [C-state] that the processing element supports.
+For a detailed description of these metrics, their derivation and
+their use see [3].
+
+These cost metrics are required for processing elements in all
+scheduling domain levels that EAS is required to service.
+
+===========================================================
+2 - energy-costs node
+===========================================================
+
+Energy costs for the processing elements in scheduling domains that
+EAS is required to service are defined in the energy-costs node
+which acts as a container for the actual per processing element cost
+nodes. A single energy-costs node is required for a given system.
+
+- energy-costs node
+
+	Usage: Required
+
+	Description: The energy-costs node is a container node and
+	its sub-nodes describe costs for each processing element at
+	all scheduling domain levels that EAS is required to
+	service.
+
+	Node name must be "energy-costs".
+
+	The energy-costs node's parent node must be the cpus node.
+
+	The energy-costs node's child nodes can be:
+
+	- one or more cost nodes.
+
+	Any other configuration is considered invalid.
+
+The energy-costs node can only contain a single type of child node
+whose bindings are described in paragraph 4.
+
+===========================================================
+3 - energy-costs node child nodes naming convention
+===========================================================
+
+energy-costs child nodes must follow a naming convention where the
+node name must be "thread-costN", "core-costN", "cluster-costN"
+depending on whether the costs in the node are for a thread, core or
+cluster. N (where N = {0, 1, ...}) is the node number and has no
+bearing on the OS's logical thread, core or cluster index.
+
+===========================================================
+4 - cost node bindings
+===========================================================
+
+Bindings for cost nodes are defined as follows:
+
+- cluster-cost node
+
+	Description: must be declared within an energy-costs node. A
+	system can contain multiple clusters and each cluster
+	serviced by EAS must have a corresponding cluster-cost
+	node.
+
+	The cluster-cost node name must be "cluster-costN" as
+	described in 3 above.
+
+	A cluster-cost node must be a leaf node with no children.
+
+	Properties for cluster-cost nodes are described in paragraph
+	5 below.
+
+	Any other configuration is considered invalid.
+
+- core-cost node
+
+	Description: must be declared within an energy-costs node. A
+	system can contain multiple cores and each core serviced by
+	EAS must have a corresponding core-cost node.
+
+	The core-cost node name must be "core-costN" as described in
+	3 above.
+
+	A core-cost node must be a leaf node with no children.
+
+	Properties for core-cost nodes are described in paragraph
+	5 below.
+
+	Any other configuration is considered invalid.
+
+- thread-cost node
+
+	Description: must be declared within an energy-costs node. A
+	system can contain cores with multiple hardware threads and
+	each thread serviced by EAS must have a corresponding
+	thread-cost node.
+
+	The thread-cost node name must be "thread-costN" as
+	described in 3 above.
+
+	A thread-cost node must be a leaf node with no children.
+
+	Properties for thread-cost nodes are described in paragraph
+	5 below.
+
+	Any other configuration is considered invalid.
+
+===========================================================
+5 - Cost node properties
+===========================================================
+
+All cost node types must have only the following properties:
+
+- busy-cost-data
+
+	Usage: required
+	Value type: An array of 2-item tuples. Each item is of type
+	u32.
+	Definition: The first item in the tuple is the capacity
+	value as described in [3]. The second item in the tuple is
+	the energy cost value as described in [3].
+
+- idle-cost-data
+
+	Usage: required
+	Value type: An array of 1-item tuples. The item is of type
+	u32.
+	Definition: The item in the tuple is the energy cost value
+	as described in [3].
+
+===========================================================
+6 - Extensions to the cpu node
+===========================================================
+
+The cpu node is extended with a property that establishes the
+connection between the processing element represented by the cpu
+node and the cost-nodes associated with this processing element.
+
+The connection is expressed in line with the topological hierarchy
+that this processing element belongs to starting with the level in
+the hierarchy that this processing element itself belongs to through
+to the highest level that EAS is required to service. The
+connection cannot be sparse and must be contiguous from the
+processing element's level through to the highest desired level. The
+highest desired level must be the same for all processing elements.
+
+Example: Given that a cpu node may represent a thread that is a part
+of a core, this property may contain multiple elements which
+associate the thread with cost nodes describing the costs for the
+thread itself, the core the thread belongs to, the cluster the core
+belongs to and so on. The elements must be ordered from the lowest
+level nodes to the highest desired level that EAS must service. The
+highest desired level must be the same for all cpu nodes. The
+elements must not be sparse: there must be elements for the current
+thread, the next level of hierarchy (core) and so on without any
+'holes'.
+
+Example: Given that a cpu node may represent a core that is a part
+of a cluster of related cpus this property may contain multiple
+elements which associate the core with cost nodes describing the
+costs for the core itself, the cluster the core belongs to and so
+on. The elements must be ordered from the lowest level nodes to the
+highest desired level that EAS must service. The highest desired
+level must be the same for all cpu nodes. The elements must not be
+sparse: there must be elements for the current core, the next
+level of hierarchy (cluster) and so on without any 'holes'.
+
+If the system comprises hierarchical clusters of clusters, this
+property will contain multiple associations with the relevant number
+of cluster elements in hierarchical order.
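As a hypothetical illustration of the contiguity rule above, a core in
a system with two levels of clusters would carry three phandles,
ordered from the core's own level upwards (the labels here are invented
for this example and do not appear in the bindings below):

	sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0 &SUPERCLUSTER_COST_0>;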
+
+Property added to the cpu node:
+
+- sched-energy-costs
+
+	Usage: required
+	Value type: List of phandles
+	Definition: a list of phandles to specific cost nodes in the
+	energy-costs parent node that correspond to the processing
+	element represented by this cpu node in hierarchical order
+	of topology.
+
+	The order of phandles in the list is significant. The first
+	phandle is to the current processing element's own cost
+	node. Subsequent phandles are to higher hierarchical level
+	cost nodes up until the maximum level that EAS is to
+	service.
+
+	All cpu nodes must have the same highest level cost node.
+
+	The phandle list must not be sparsely populated with handles
+	to non-contiguous hierarchical levels. See commentary above
+	for clarity.
+
+	Any other configuration is invalid.
+
+===========================================================
+7 - Example dts
+===========================================================
+
+Example 1 (ARM 64-bit, 6-cpu system, two clusters of cpus, one
+cluster of 2 Cortex-A57 cpus, one cluster of 4 Cortex-A53 cpus):
+
+cpus {
+	#address-cells = <2>;
+	#size-cells = <0>;
+	.
+	.
+	.
+	A57_0: cpu@0 {
+		compatible = "arm,cortex-a57","arm,armv8";
+		reg = <0x0 0x0>;
+		device_type = "cpu";
+		enable-method = "psci";
+		next-level-cache = <&A57_L2>;
+		clocks = <&scpi_dvfs 0>;
+		cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+		sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>;
+	};
+
+	A57_1: cpu@1 {
+		compatible = "arm,cortex-a57","arm,armv8";
+		reg = <0x0 0x1>;
+		device_type = "cpu";
+		enable-method = "psci";
+		next-level-cache = <&A57_L2>;
+		clocks = <&scpi_dvfs 0>;
+		cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+		sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>;
+	};
+
+	A53_0: cpu@100 {
+		compatible = "arm,cortex-a53","arm,armv8";
+		reg = <0x0 0x100>;
+		device_type = "cpu";
+		enable-method = "psci";
+		next-level-cache = <&A53_L2>;
+		clocks = <&scpi_dvfs 1>;
+		cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+		sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
+	};
+
+	A53_1: cpu@101 {
+		compatible = "arm,cortex-a53","arm,armv8";
+		reg = <0x0 0x101>;
+		device_type = "cpu";
+		enable-method = "psci";
+		next-level-cache = <&A53_L2>;
+		clocks = <&scpi_dvfs 1>;
+		cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+		sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
+	};
+
+	A53_2: cpu@102 {
+		compatible = "arm,cortex-a53","arm,armv8";
+		reg = <0x0 0x102>;
+		device_type = "cpu";
+		enable-method = "psci";
+		next-level-cache = <&A53_L2>;
+		clocks = <&scpi_dvfs 1>;
+		cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+		sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
+	};
+
+	A53_3: cpu@103 {
+		compatible = "arm,cortex-a53","arm,armv8";
+		reg = <0x0 0x103>;
+		device_type = "cpu";
+		enable-method = "psci";
+		next-level-cache = <&A53_L2>;
+		clocks = <&scpi_dvfs 1>;
+		cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+		sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
+	};
+
+	energy-costs {
+		CPU_COST_0: core-cost0 {
+			busy-cost-data = <
+				417   168
+				579   251
+				744   359
+				883   479
+				1024  616
+			>;
+			idle-cost-data = <
+				15
+				0
+			>;
+		};
+		CPU_COST_1: core-cost1 {
+			busy-cost-data = <
+				235 33
+				302 46
+				368 61
+				406 76
+				447 93
+			>;
+			idle-cost-data = <
+				6
+				0
+			>;
+		};
+		CLUSTER_COST_0: cluster-cost0 {
+			busy-cost-data = <
+				417   24
+				579   32
+				744   43
+				883   49
+				1024  64
+			>;
+			idle-cost-data = <
+				65
+				24
+			>;
+		};
+		CLUSTER_COST_1: cluster-cost1 {
+			busy-cost-data = <
+				235 26
+				303 30
+				368 39
+				406 47
+				447 57
+			>;
+			idle-cost-data = <
+				56
+				17
+			>;
+		};
+	};
+};
+
+===============================================================================
+[1] https://lkml.org/lkml/2015/5/12/728
+[2] Documentation/devicetree/bindings/topology.txt
+[3] Documentation/scheduler/sched-energy.txt

From 907f4d5e6ad2fbe29722be3620d4083d8c6488c9 Mon Sep 17 00:00:00 2001
From: Robin Randhawa
Date: Mon, 29 Jun 2015 18:01:58 +0100
Subject: [PATCH 484/813] sched: Support for extracting EAS energy costs from DT

This patch implements support for extracting energy cost data from DT.
The data should conform to the DT bindings for energy cost data needed
by EAS (energy aware scheduling).

Signed-off-by: Robin Randhawa
---
 include/linux/sched_energy.h |  36 ++++++++++
 kernel/sched/Makefile        |   2 +-
 kernel/sched/energy.c        | 124 +++++++++++++++++++++++++++++++++++
 3 files changed, 161 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sched_energy.h
 create mode 100644 kernel/sched/energy.c

diff --git a/include/linux/sched_energy.h b/include/linux/sched_energy.h
new file mode 100644
index 000000000000..a3f1627ac609
--- /dev/null
+++ b/include/linux/sched_energy.h
@@ -0,0 +1,36 @@
+#ifndef _LINUX_SCHED_ENERGY_H
+#define _LINUX_SCHED_ENERGY_H
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+/*
+ * There doesn't seem to be an NR_CPUS style max number of sched domain
+ * levels so here's an arbitrary constant one for the moment.
+ *
+ * The levels alluded to here correspond to entries in struct
+ * sched_domain_topology_level that are meant to be populated by arch
+ * specific code (topology.c).
+ */
+#define NR_SD_LEVELS 8
+
+#define SD_LEVEL0 0
+#define SD_LEVEL1 1
+#define SD_LEVEL2 2
+#define SD_LEVEL3 3
+#define SD_LEVEL4 4
+#define SD_LEVEL5 5
+#define SD_LEVEL6 6
+#define SD_LEVEL7 7
+
+/*
+ * Convenience macro for iterating through said sd levels.
+ */
+#define for_each_possible_sd_level(level) \
+	for (level = 0; level < NR_SD_LEVELS; level++)
+
+extern struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
+
+void init_sched_energy_costs(void);
+
+#endif
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 67687973ce80..a541b5ce1dcc 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -12,7 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
 obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
+obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o energy.o
 obj-y += wait.o completion.o idle.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c
new file mode 100644
index 000000000000..b0656b7a93e3
--- /dev/null
+++ b/kernel/sched/energy.c
@@ -0,0 +1,124 @@
+/*
+ * Obtain energy cost data from DT and populate relevant scheduler data
+ * structures.
+ *
+ * Copyright (C) 2015 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#define pr_fmt(fmt) "sched-energy: " fmt
+
+#define DEBUG
+
+#include <linux/gfp.h>
+#include <linux/of.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/sched_energy.h>
+#include <linux/stddef.h>
+
+struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
+
+static void free_resources(void)
+{
+	int cpu, sd_level;
+	struct sched_group_energy *sge;
+
+	for_each_possible_cpu(cpu) {
+		for_each_possible_sd_level(sd_level) {
+			sge = sge_array[cpu][sd_level];
+			if (sge) {
+				kfree(sge->cap_states);
+				kfree(sge->idle_states);
+				kfree(sge);
+			}
+		}
+	}
+}
+
+void init_sched_energy_costs(void)
+{
+	struct device_node *cn, *cp;
+	struct capacity_state *cap_states;
+	struct idle_state *idle_states;
+	struct sched_group_energy *sge;
+	const struct property *prop;
+	int sd_level, i, nstates, cpu;
+	const __be32 *val;
+
+	for_each_possible_cpu(cpu) {
+		cn = of_get_cpu_node(cpu, NULL);
+		if (!cn) {
+			pr_warn("CPU device node missing for CPU %d\n", cpu);
+			return;
+		}
+
+		if (!of_find_property(cn, "sched-energy-costs", NULL)) {
+			pr_warn("CPU device node has no sched-energy-costs\n");
+			return;
+		}
+
+		for_each_possible_sd_level(sd_level) {
+			cp = of_parse_phandle(cn, "sched-energy-costs", sd_level);
+			if (!cp)
+				break;
+
+			prop = of_find_property(cp, "busy-cost-data", NULL);
+			if (!prop || !prop->value) {
+				pr_warn("No busy-cost data, skipping sched_energy init\n");
+				goto out;
+			}
+
+			sge = kcalloc(1, sizeof(struct sched_group_energy),
+				      GFP_NOWAIT);
+
+			nstates = (prop->length / sizeof(u32)) / 2;
+			cap_states = kcalloc(nstates,
+					     sizeof(struct capacity_state),
+					     GFP_NOWAIT);
+
+			for (i = 0, val = prop->value; i < nstates; i++) {
+				cap_states[i].cap = be32_to_cpup(val++);
+				cap_states[i].power = be32_to_cpup(val++);
+			}
+
+			sge->nr_cap_states = nstates;
+			sge->cap_states = cap_states;
+
+			prop = of_find_property(cp, "idle-cost-data", NULL);
+			if (!prop || !prop->value) {
+				pr_warn("No idle-cost data, skipping sched_energy init\n");
+				goto out;
+			}
+
+			nstates = (prop->length / sizeof(u32));
+			idle_states = kcalloc(nstates,
+					      sizeof(struct idle_state),
+					      GFP_NOWAIT);
+
+			for (i = 0, val = prop->value; i < nstates; i++)
+				idle_states[i].power = be32_to_cpup(val++);
+
+			sge->nr_idle_states = nstates;
+			sge->idle_states = idle_states;
+
+			sge_array[cpu][sd_level] = sge;
+		}
+	}
+
+	pr_info("Sched-energy-costs installed from DT\n");
+	return;
+
+out:
+	free_resources();
+}

From 10c505ffd2f0bf6b23e2c02f1180534487f74d6f Mon Sep 17 00:00:00 2001
From: Robin Randhawa
Date: Tue, 9 Jun 2015 15:10:00 +0100
Subject: [PATCH 485/813] arm64, topology: Updates to use DT bindings for EAS costing data

With the bindings and the associated accessors to extract data from the
bindings in place, remove the static hard-coded data from topology.c
and use the accessors instead.
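The topology.c diff below adds per-level accessors (cpu_core_energy(),
cpu_cluster_energy()) and installs an arm64_topology table that is not
itself shown in this excerpt. Following the series' convention of an
energy callback per sched_domain_topology_level, that table plausibly
looks like the sketch below; the flags callbacks are assumptions, not
part of this patch:

	static struct sched_domain_topology_level arm64_topology[] = {
	#ifdef CONFIG_SCHED_MC
		/* Core level: per-cpu cost data (SD_LEVEL0) */
		{ cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy,
		  SD_INIT_NAME(MC) },
	#endif
		/* Cluster level: cluster cost data (SD_LEVEL1) */
		{ cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) },
		{ NULL, },
	};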
Signed-off-by: Robin Randhawa --- arch/arm64/kernel/topology.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index fb99a6735fd4..b5b43af6a7dc 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include @@ -218,6 +220,33 @@ out: struct cpu_topology cpu_topology[NR_CPUS]; EXPORT_SYMBOL_GPL(cpu_topology); +/* sd energy functions */ +static inline +const struct sched_group_energy * const cpu_cluster_energy(int cpu) +{ + struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL1]; + + if (!sge) { + pr_warn("Invalid sched_group_energy for Cluster%d\n", cpu); + return NULL; + } + + return sge; +} + +static inline +const struct sched_group_energy * const cpu_core_energy(int cpu) +{ + struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL0]; + + if (!sge) { + pr_warn("Invalid sched_group_energy for CPU%d\n", cpu); + return NULL; + } + + return sge; +} + const struct cpumask *cpu_coregroup_mask(int cpu) { return &cpu_topology[cpu].core_sibling; @@ -344,4 +373,8 @@ void __init init_cpu_topology(void) */ if (of_have_populated_dt() && parse_dt_topology()) reset_cpu_topology(); + else + set_sched_topology(arm64_topology); + + init_sched_energy_costs(); } From 74a07a6950cc5b1cf12a8e602d8e5b572906376b Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 22 Sep 2015 16:47:48 +0100 Subject: [PATCH 486/813] cpufreq: Max freq invariant scheduler load-tracking and cpu capacity support Implements cpufreq_scale_max_freq_capacity() to provide the scheduler with a maximum frequency scaling correction factor for more accurate load-tracking and cpu capacity handling by being able to deal with frequency capping. This scaling factor describes the influence of running a cpu with a current maximum frequency lower than the absolute possible maximum frequency on load tracking and cpu capacity. The factor is: current_max_freq(cpu) << SCHED_CAPACITY_SHIFT / max_freq(cpu) In fact, max_freq_scale should be a struct cpufreq_policy data member. But this would require that the scheduler hot path (__update_load_avg()) would have to grab the cpufreq lock. This can be avoided by using per-cpu data initialized to SCHED_CAPACITY_SCALE for max_freq_scale. Signed-off-by: Dietmar Eggemann --- drivers/cpufreq/cpufreq.c | 19 +++++++++++++++++++ include/linux/cpufreq.h | 1 + 2 files changed, 20 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 0d494c937920..e5aa57b27d06 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -352,12 +352,14 @@ static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) *********************************************************************/ static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; +static DEFINE_PER_CPU(unsigned long, max_freq_scale) = SCHED_CAPACITY_SCALE; static void scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) { unsigned long cur = freqs ? 
freqs->new : policy->cur; unsigned long scale = (cur << SCHED_CAPACITY_SHIFT) / policy->max; + struct cpufreq_cpuinfo *cpuinfo = &policy->cpuinfo; int cpu; pr_debug("cpus %*pbl cur/cur max freq %lu/%u kHz freq scale %lu\n", @@ -365,6 +367,18 @@ scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) for_each_cpu(cpu, policy->cpus) per_cpu(freq_scale, cpu) = scale; + + if (freqs) + return; + + scale = (policy->max << SCHED_CAPACITY_SHIFT) / cpuinfo->max_freq; + + pr_debug("cpus %*pbl cur max/max freq %u/%u kHz max freq scale %lu\n", + cpumask_pr_args(policy->cpus), policy->max, cpuinfo->max_freq, + scale); + + for_each_cpu(cpu, policy->cpus) + per_cpu(max_freq_scale, cpu) = scale; } unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu) @@ -372,6 +386,11 @@ unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu) return per_cpu(freq_scale, cpu); } +unsigned long cpufreq_scale_max_freq_capacity(int cpu) +{ + return per_cpu(max_freq_scale, cpu); +} + static void __cpufreq_notify_transition(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, unsigned int state) { diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 5f1e66e544f5..89e8e04aa73b 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -622,4 +622,5 @@ int cpufreq_generic_init(struct cpufreq_policy *policy, struct sched_domain; unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); +unsigned long cpufreq_scale_max_freq_capacity(int cpu); #endif /* _LINUX_CPUFREQ_H */ From 386e4fc1659381ce5b8a9eabe6c9ab4cf0530596 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sat, 26 Sep 2015 18:19:54 +0100 Subject: [PATCH 487/813] sched: Update max cpu capacity in case of max frequency constraints Wakeup balancing uses cpu capacity awareness and needs to know the system-wide maximum cpu capacity. Patch "sched: Store system-wide maximum cpu capacity in root domain" finds the system-wide maximum cpu capacity during scheduler domain hierarchy setup. This is sufficient as long as maximum frequency invariance is not enabled. If it is enabled, the system-wide maximum cpu capacity can change between scheduler domain hierarchy setups due to frequency capping. The cpu capacity is changed in update_cpu_capacity() which is called in load balance on the lowest scheduler domain hierarchy level. To be able to know if a change in cpu capacity for a certain cpu also has an effect on the system-wide maximum cpu capacity it is normally necessary to iterate over all cpus. This would be way too costly. That's why this patch follows a different approach. The unsigned long max_cpu_capacity value in struct root_domain is replaced with a struct max_cpu_capacity, containing value (the max_cpu_capacity) and cpu (the cpu index of the cpu providing the maximum cpu_capacity). Changes to the system-wide maximum cpu capacity and the cpu index are made if: 1 System-wide maximum cpu capacity < cpu capacity 2 System-wide maximum cpu capacity > cpu capacity and cpu index == cpu There are no changes to the system-wide maximum cpu capacity in all other cases. Atomic read and write access to the pair (max_cpu_capacity.val, max_cpu_capacity.cpu) is enforced by max_cpu_capacity.lock. The access to max_cpu_capacity.val in task_fits_max() is still performed without taking the max_cpu_capacity.lock. The code to set max cpu capacity in build_sched_domains() has been removed because the whole functionality is now provided by update_cpu_capacity() instead. 
This approach can introduce errors temporarily, e.g. in case the cpu currently providing the max cpu capacity has its cpu capacity lowered due to frequency capping and calls update_cpu_capacity() before any cpu which might provide the max cpu now. There is also an outstanding question: Should the cpu capacity of a cpu going idle be set to a very small value? Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 8 ++------ kernel/sched/fair.c | 32 +++++++++++++++++++++++++++++++- kernel/sched/sched.h | 10 +++++++++- 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1e0b273c20d5..ab6215a9d0db 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5892,6 +5892,8 @@ static int init_rootdomain(struct root_domain *rd) if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; + + init_max_cpu_capacity(&rd->max_cpu_capacity); return 0; free_rto_mask: @@ -7109,15 +7111,9 @@ static int build_sched_domains(const struct cpumask *cpu_map, rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); - - if (rq->cpu_capacity_orig > rq->rd->max_cpu_capacity) - rq->rd->max_cpu_capacity = rq->cpu_capacity_orig; } rcu_read_unlock(); - if (rq) - pr_info("max cpu_capacity %lu\n", rq->rd->max_cpu_capacity); - ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 72651c54128c..08e2ef16d600 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5099,7 +5099,7 @@ static inline bool __task_fits(struct task_struct *p, int cpu, int util) static inline bool task_fits_max(struct task_struct *p, int cpu) { unsigned long capacity = capacity_of(cpu); - unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity; + unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val; if (capacity == max_capacity) return true; @@ -6585,13 +6585,43 @@ static unsigned long scale_rt_capacity(int cpu) return 1; } +void init_max_cpu_capacity(struct max_cpu_capacity *mcc) +{ + raw_spin_lock_init(&mcc->lock); + mcc->val = 0; + mcc->cpu = -1; +} + static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); struct sched_group *sdg = sd->groups; + struct max_cpu_capacity *mcc; + unsigned long max_capacity; + int max_cap_cpu; + unsigned long flags; cpu_rq(cpu)->cpu_capacity_orig = capacity; + mcc = &cpu_rq(cpu)->rd->max_cpu_capacity; + + raw_spin_lock_irqsave(&mcc->lock, flags); + max_capacity = mcc->val; + max_cap_cpu = mcc->cpu; + + if ((max_capacity > capacity && max_cap_cpu == cpu) || + (max_capacity < capacity)) { + mcc->val = capacity; + mcc->cpu = cpu; +#ifdef CONFIG_SCHED_DEBUG + raw_spin_unlock_irqrestore(&mcc->lock, flags); + pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity); + goto skip_unlock; +#endif + } + raw_spin_unlock_irqrestore(&mcc->lock, flags); + +skip_unlock: __attribute__ ((unused)); capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fd2407b0d58b..80d66655a5a3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -510,6 +510,12 @@ struct dl_rq { #ifdef CONFIG_SMP +struct max_cpu_capacity { + raw_spinlock_t lock; + unsigned long val; + int cpu; +}; + /* * We add the notion of a root-domain which will be used to define per-domain * variables. 
Each exclusive cpuset essentially defines an island domain by
@@ -548,7 +554,7 @@ struct root_domain {
 	struct cpupri cpupri;
 
 	/* Maximum cpu capacity in the system. */
-	unsigned long max_cpu_capacity;
+	struct max_cpu_capacity max_cpu_capacity;
 };
 
 extern struct root_domain def_root_domain;
@@ -1340,6 +1346,8 @@ unsigned long to_ratio(u64 period, u64 runtime);
 
 extern void init_entity_runnable_average(struct sched_entity *se);
 
+extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc);
+
 static inline void add_nr_running(struct rq *rq, unsigned count)
 {
 	unsigned prev_nr = rq->nr_running;

From f610f202d3e4f326a357c1796954bf7a0cbfbe7e Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann
Date: Wed, 23 Sep 2015 17:59:55 +0100
Subject: [PATCH 488/813] arm: Enable max freq invariant scheduler load-tracking and capacity support

Maximum Frequency Invariance has to be part of Cpu Invariance because
Frequency Invariance deals only with differences in load-tracking
introduced by Dynamic Frequency Scaling and not with limiting the
possible range of cpu frequency.

By placing Maximum Frequency Invariance into Cpu Invariance,
load-tracking is scaled via arch_scale_cpu_capacity() in
__update_load_avg() and cpu capacity is scaled via
arch_scale_cpu_capacity() in update_cpu_capacity().

To be able to save the extra multiplication in the scheduler hotpath
(__update_load_avg()) we could:

1 Inform cpufreq about base cpu capacity at boot and let it handle
  scale_cpu_capacity() as well.

2 Use the cpufreq policy callback which would update a per-cpu current
  cpu_scale and this value would be returned in scale_cpu_capacity().

3 Use per-cpu current max_freq_scale and current cpu_scale with the
  current patch.

Signed-off-by: Dietmar Eggemann
---
 arch/arm/kernel/topology.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index da1c611a3b5e..0308342def8c 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -44,7 +44,13 @@ static DEFINE_PER_CPU(unsigned long, cpu_scale);
 
 unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
 {
+#ifdef CONFIG_CPU_FREQ
+	unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu);
+
+	return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT;
+#else
 	return per_cpu(cpu_scale, cpu);
+#endif
 }
 
 static void set_capacity_scale(unsigned int cpu, unsigned long capacity)

From 81ca8262ce7ba30a17f1843acff934a1a380a1ca Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann
Date: Fri, 25 Sep 2015 17:34:15 +0100
Subject: [PATCH 489/813] arm64: Enable max freq invariant scheduler load-tracking and capacity support

Maximum Frequency Invariance has to be part of Cpu Invariance because
Frequency Invariance deals only with differences in load-tracking
introduced by Dynamic Frequency Scaling and not with limiting the
possible range of cpu frequency.

By placing Maximum Frequency Invariance into Cpu Invariance,
load-tracking is scaled via arch_scale_cpu_capacity() in
__update_load_avg() and cpu capacity is scaled via
arch_scale_cpu_capacity() in update_cpu_capacity().

To be able to save the extra multiplication in the scheduler hotpath
(__update_load_avg()) we could:

1 Inform cpufreq about base cpu capacity at boot and let it handle
  scale_cpu_capacity() as well.

2 Use the cpufreq policy callback which would update a per-cpu current
  cpu_scale and this value would be returned in scale_cpu_capacity().

3 Use per-cpu current max_freq_scale and current cpu_scale with the
  current patch.
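As a worked numeric example of option 3 above (values assumed purely
for illustration): take cpu_scale = 1024 and a policy capped at half
the hardware maximum, so max_freq_scale = (policy->max <<
SCHED_CAPACITY_SHIFT) / cpuinfo->max_freq = 512. Then:

	/* Illustrative numbers, not taken from the patch. */
	unsigned long cpu_scale = 1024;      /* uncapped cpu capacity */
	unsigned long max_freq_scale = 512;  /* policy->max == cpuinfo->max_freq / 2 */

	/* scale_cpu_capacity() then reports (1024 * 512) >> 10 = 512 */
	unsigned long capacity = cpu_scale * max_freq_scale >> SCHED_CAPACITY_SHIFT;

This extra multiply-and-shift is exactly what the three options try to
keep out of __update_load_avg().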
Including <linux/cpufreq.h> in topology.h like for the arm arch doesn't work because of CONFIG_COMPAT=y (Kernel support for 32-bit EL0). That's why cpufreq_scale_max_freq_capacity() has to be declared extern in topology.h. Signed-off-by: Dietmar Eggemann --- arch/arm64/include/asm/topology.h | 1 + arch/arm64/kernel/topology.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 9370a336c934..bbd362cd1ed1 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -26,6 +26,7 @@ struct sched_domain; #ifdef CONFIG_CPU_FREQ #define arch_scale_freq_capacity cpufreq_scale_freq_capacity extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); +extern unsigned long cpufreq_scale_max_freq_capacity(int cpu); #endif #define arch_scale_cpu_capacity scale_cpu_capacity extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu); diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index b5b43af6a7dc..5b2c67a510d8 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -29,7 +29,13 @@ static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) { +#ifdef CONFIG_CPU_FREQ + unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu); + + return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT; +#else return per_cpu(cpu_scale, cpu); +#endif } static void set_capacity_scale(unsigned int cpu, unsigned long capacity) From 20424b480cedaaea10f5c56c6140692c97b9837e Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 13 Jan 2016 15:49:44 +0000 Subject: [PATCH 490/813] sched: Do eas idle balance regardless of the rq avg idle value EAS relies on idle balance to migrate a misfit task towards a cpu with higher capacity. When such a cpu becomes idle, idle balance should happen even if the rq avg idle is smaller than the sched migration cost (default 500us). The rq avg idle is updated during the wakeup of a task in case the rq has a non-null idle_stamp. This value stays unchanged and valid until the next task wakes up on this cpu after an idle period. So rq avg idle could be smaller than sched migration cost, preventing the idle balance from happening. In this case we would be at the mercy of wakeup, periodic or nohz-idle load balancing to put another task on this cpu. To break this dependency on rq avg idle, make the EAS idle balance independent of the requirement that rq avg idle be larger than the sched migration cost.
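For reference, the gate this patch relaxes is shown in full in the diff below; condensed, it becomes the following, so that with EAS enabled the early bail-out is skipped entirely and a newly idle cpu always runs the balance pass and can pull a misfit task:

	if (!energy_aware() &&
	    (this_rq->avg_idle < sysctl_sched_migration_cost ||
	     !this_rq->rd->overload)) {
		/* update next_balance and return without load balancing */
	}

energy_aware() here is the existing test of the ENERGY_AWARE scheduler feature used elsewhere in this series.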
Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 08e2ef16d600..3c2ddfc8cee8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7793,8 +7793,9 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost || - !this_rq->rd->overload) { + if (!energy_aware() && + (this_rq->avg_idle < sysctl_sched_migration_cost || + !this_rq->rd->overload)) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) From f61bd44d08c1e7ea97e389ff59167a86d90977f4 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 25 Feb 2016 12:43:49 +0000 Subject: [PATCH 491/813] sched: Add per-cpu max capacity to sched_group_capacity struct sched_group_capacity currently represents the compute capacity sum of all cpus in the sched_group. Unless it is divided by the group_weight to get the average capacity per cpu it hides differences in cpu capacity for mixed capacity systems (e.g. high RT/IRQ utilization or ARM big.LITTLE). But even the average may not be sufficient if the group covers cpus of different capacities. Instead, by extending struct sched_group_capacity to indicate max per-cpu capacity in the group a suitable group for a given task utilization can easily be found such that cpus with reduced capacity can be avoided for tasks with high utilization (not implemented by this patch). Signed-off-by: Morten Rasmussen --- kernel/sched/core.c | 3 ++- kernel/sched/fair.c | 17 ++++++++++++----- kernel/sched/sched.h | 3 ++- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ab6215a9d0db..69a36dd165e2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5718,7 +5718,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_CONT " %*pbl", cpumask_pr_args(sched_group_cpus(group))); if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity = %d)", + printk(KERN_CONT " (cpu_capacity = %lu)", group->sgc->capacity); } @@ -6197,6 +6197,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * die on a /0 trap. 
*/ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; /* * Make sure the first group of this domain contains the diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3c2ddfc8cee8..618e94ef803b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6630,13 +6630,14 @@ skip_unlock: __attribute__ ((unused)); cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; + sdg->sgc->max_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity; + unsigned long capacity, max_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -6649,6 +6650,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } capacity = 0; + max_capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -6673,11 +6675,12 @@ void update_group_capacity(struct sched_domain *sd, int cpu) */ if (unlikely(!rq->sd)) { capacity += capacity_of(cpu); - continue; + } else { + sgc = rq->sd->groups->sgc; + capacity += sgc->capacity; } - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; + max_capacity = max(capacity, max_capacity); } } else { /* @@ -6687,12 +6690,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity += group->sgc->capacity; + struct sched_group_capacity *sgc = group->sgc; + + capacity += sgc->capacity; + max_capacity = max(sgc->max_capacity, max_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; + sdg->sgc->max_capacity = max_capacity; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 80d66655a5a3..33583123792e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -858,7 +858,8 @@ struct sched_group_capacity { * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity * for a single CPU. */ - unsigned int capacity; + unsigned long capacity; + unsigned long max_capacity; /* Max per-cpu capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ /* From edb839a29c08b4313c4cbecfae666677ce1815b9 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 25 Feb 2016 12:47:54 +0000 Subject: [PATCH 492/813] sched: Add group_misfit_task load-balance type To maximize throughput in systems with reduced capacity cpus (e.g. high RT/IRQ load and/or ARM big.LITTLE), load-balancing has to consider task and cpu utilization as well as per-cpu compute capacity, in addition to the current average-load-based load-balancing policy. Tasks that are scheduled on a reduced capacity cpu need to be identified and migrated to a higher capacity cpu if possible. To implement this policy an additional group_type (load-balance scenario) is added: group_misfit_task. This represents scenarios where a sched_group has tasks that are not suitable for its per-cpu capacity. group_misfit_task is only considered if the system is not overloaded in any other way (group_imbalanced or group_overloaded). Identifying misfit tasks requires the rq lock to be held. To avoid taking remote rq locks to examine source sched_groups for misfit tasks, each cpu is responsible for tracking misfit tasks itself and updating the rq->misfit_task flag. This means checking task utilization when tasks are scheduled and on sched_tick.
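As a rough sketch, the per-cpu test this tracking boils down to looks like the following. It uses the series' capacity_of(), task_util() and capacity_margin; the helper name task_misfits is invented for illustration, the real code simply records !task_fits_max():

	static bool task_misfits(struct task_struct *p, int cpu)
	{
		unsigned long capacity = capacity_of(cpu);

		/* On a cpu of the highest capacity there is nowhere better to go. */
		if (capacity == cpu_rq(cpu)->rd->max_cpu_capacity.val)
			return false;

		/* Misfit: utilization plus ~20% margin exceeds the cpu's capacity. */
		return task_util(p) * capacity_margin >
		       capacity * SCHED_CAPACITY_SCALE;
	}

The diff then records this on the rq in pick_next_task_fair() and task_tick_fair() as rq->misfit_task = !task_fits_max(p, rq->cpu).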
Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 29 ++++++++++++++++++++++------- kernel/sched/sched.h | 1 + 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 618e94ef803b..6c791547225c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5760,6 +5760,8 @@ again: if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + rq->misfit_task = !task_fits_max(p, rq->cpu); + return p; simple: cfs_rq = &rq->cfs; @@ -5781,9 +5783,12 @@ simple: if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + rq->misfit_task = !task_fits_max(p, rq->cpu); + return p; idle: + rq->misfit_task = 0; /* * This is OK, because current is on_cpu, which avoids it being picked * for load-balance and preemption/IRQs are still disabled avoiding @@ -5996,6 +6001,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +enum group_type { + group_other = 0, + group_misfit_task, + group_imbalanced, + group_overloaded, +}; + #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 @@ -6467,12 +6479,6 @@ static unsigned long task_h_load(struct task_struct *p) /********** Helpers for find_busiest_group ************************/ -enum group_type { - group_other = 0, - group_imbalanced, - group_overloaded, -}; - /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -6488,6 +6494,7 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; int group_no_capacity; + int group_misfit_task; /* A cpu has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -6804,6 +6811,9 @@ group_type group_classify(struct sched_group *group, if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_misfit_task) + return group_misfit_task; + return group_other; } @@ -6851,8 +6861,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (idle_cpu(i)) sgs->idle_cpus++; - if (cpu_overutilized(i)) + if (cpu_overutilized(i)) { *overutilized = true; + if (!sgs->group_misfit_task && rq->misfit_task) + sgs->group_misfit_task = capacity_of(i); + } } /* Adjust by relative CPU capacity of the group */ @@ -8433,6 +8446,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) rq->rd->overutilized = true; + + rq->misfit_task = !task_fits_max(curr, rq->cpu); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 33583123792e..d4f9ddfbff73 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -584,6 +584,7 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned long last_load_update_tick; + unsigned int misfit_task; #ifdef CONFIG_NO_HZ_COMMON u64 nohz_stamp; unsigned long nohz_flags; From 22a9676637381b670d038f7da1fca82448de033e Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 25 Feb 2016 12:51:35 +0000 Subject: [PATCH 493/813] sched: Consider misfit tasks when load-balancing With the new group_misfit_task load-balancing scenario additional policy conditions are needed when load-balancing. Misfit task balancing only makes sense when the source group has lower per-cpu capacity than the target group. If capacities are the same, fall back to normal group_other balancing. The aim is to balance tasks such that no task has its throughput hindered by compute capacity if a cpu with more capacity is available.
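The capacity comparison used for this is group_smaller_cpu_capacity(), added in the diff below; it includes the ~20% capacity_margin so that near-equal groups are not treated as asymmetric. With hypothetical big.LITTLE values (SCHED_LOAD_SCALE = 1024, capacity_margin = 1280, little max_capacity = 430, big max_capacity = 1024) the check works out as:

	/* little vs big:  430 + 1280 - 1024 =  686 < 1024 -> true  (smaller) */
	/* big vs big:    1024 + 1280 - 1024 = 1280 < 1024 -> false (equal)   */
	bool smaller = sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE <
		       ref->sgc->max_capacity;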
Load-balancing is generally based on average load in the sched_groups, but for misfitting tasks it is necessary to introduce exceptions to migrate tasks against the usual metrics and optimize throughput. This patch ensures the following load-balance for mixed capacity systems (e.g. ARM big.LITTLE) for always-running tasks: 1. Place a task on each cpu starting in order from cpus with highest capacity to lowest until all cpus are in use (i.e. one task on each cpu). 2. Once all cpus are in use balance according to compute capacity such that load per capacity is approximately the same regardless of the compute capacity (i.e. big cpus get more tasks than little cpus). Necessary changes are introduced in find_busiest_group(), calculate_imbalance(), and find_busiest_queue(). This includes passing the group_type on to find_busiest_queue() through struct lb_env, which currently only considers the imbalance and not the imbalance situation (group_type). To avoid taking remote rq locks to examine source sched_groups for misfit tasks, each cpu is responsible for tracking misfit tasks itself and updating the rq->misfit_task flag. This means checking task utilization when tasks are scheduled and on sched_tick. Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 71 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6c791547225c..42492ee17793 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6037,6 +6037,7 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + enum group_type busiest_group_type; struct list_head tasks; }; @@ -6801,6 +6802,18 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) return false; } + +/* + * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * per-cpu capacity than sched_group ref. + */ +static inline bool +group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE < + ref->sgc->max_capacity; +} + static inline enum group_type group_classify(struct sched_group *group, struct sg_lb_stats *sgs) @@ -6907,9 +6920,25 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->group_type < busiest->group_type) return false; + /* + * Candidate sg doesn't face any serious load-balance problems + * so don't pick it if the local sg is already filled up. + */ + if (sgs->group_type == group_other && + !group_has_capacity(env, &sds->local_stat)) + return false; + if (sgs->avg_load <= busiest->avg_load) return false; + /* + * Candidate sg has no more than one task per cpu and has higher + * per-cpu capacity. No reason to pull tasks to less capable cpus. + */ + if (sgs->sum_nr_running <= sgs->group_weight && + group_smaller_cpu_capacity(sds->local, sg)) + return false; + /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; @@ -7015,6 +7044,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd sgs->group_type = group_classify(sg, sgs); } + /* + * Ignore task groups with misfit tasks if local group has no + * capacity or if per-cpu capacity isn't higher.
+ */ + if (sgs->group_type == group_misfit_task && + (!group_has_capacity(env, &sds->local_stat) || + !group_smaller_cpu_capacity(sg, sds->local))) + sgs->group_type = group_other; + if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; @@ -7191,6 +7229,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { + /* Misfitting tasks should be migrated in any case */ + if (busiest->group_type == group_misfit_task) { + env->imbalance = busiest->group_misfit_task; + return; + } + + /* + * Busiest group is overloaded, local is not, use the spare + * cycles to maximize throughput + */ + if (busiest->group_type == group_overloaded && + local->group_type <= group_misfit_task) { + env->imbalance = busiest->load_per_task; + return; + } + env->imbalance = 0; return fix_small_imbalance(env, sds); } @@ -7224,6 +7278,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; + /* Boost imbalance to allow misfit task to be balanced. */ + if (busiest->group_type == group_misfit_task) + env->imbalance = max_t(long, env->imbalance, + busiest->group_misfit_task); + /* * if *imbalance is less than the average load per runnable task * there is no guarantee that any tasks will be moved so we'll have @@ -7297,6 +7356,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest->group_no_capacity) goto force_balance; + /* Misfitting tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) { + goto force_balance; + } + /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -7320,7 +7384,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * might end up to just move the imbalance on another group */ if ((busiest->group_type != group_overloaded) && - (local->idle_cpus <= (busiest->idle_cpus + 1))) + (local->idle_cpus <= (busiest->idle_cpus + 1)) && + !group_smaller_cpu_capacity(sds.busiest, sds.local)) goto out_balanced; } else { /* @@ -7333,6 +7398,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) } force_balance: + env->busiest_group_type = busiest->group_type; /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds); return sds.busiest; @@ -7391,7 +7457,8 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ if (rq->nr_running == 1 && wl > env->imbalance && - !check_cpu_capacity(rq, env->sd)) + !check_cpu_capacity(rq, env->sd) && + env->busiest_group_type != group_misfit_task) continue; /* From 10cbfd68e2ff5bb562f57df2895400d7534a1a10 Mon Sep 17 00:00:00 2001 From: Michael Turquette Date: Tue, 30 Jun 2015 12:45:27 +0100 Subject: [PATCH 494/813] cpufreq: introduce cpufreq_driver_is_slow Some architectures and platforms perform CPU frequency transitions through a non-blocking method, while some might block or sleep. Even when frequency transitions do not block or sleep they may be very slow. This distinction is important when trying to change frequency from a non-interruptible context in a scheduler hot path. Describe this distinction with a cpufreq driver flag, CPUFREQ_DRIVER_FAST. The default is to not have this flag set, thus erring on the side of caution. cpufreq_driver_is_slow() is also introduced in this patch. 
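As a usage sketch, a driver whose transitions are non-blocking and fast would opt in as below; the driver and callback names are invented for illustration and are not part of this patch:

	static int example_set_target(struct cpufreq_policy *policy,
				      unsigned int index)
	{
		/* assumed to be a fast, non-sleeping register write */
		return 0;
	}

	static struct cpufreq_driver example_driver = {
		.name		= "example-fast",
		/* opt in: without CPUFREQ_DRIVER_FAST the safe default is "slow" */
		.flags		= CPUFREQ_NEED_INITIAL_FREQ_CHECK | CPUFREQ_DRIVER_FAST,
		.target_index	= example_set_target,
	};

cpufreq_driver_is_slow() then returns false for this driver, which later patches in this series use to decide whether frequency requests can be issued directly from scheduler context or must be handed off to a kthread.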
Setting the above flag will allow this function to return false. [smuckle@linaro.org: change flag/API to include drivers that are too slow for scheduler hot paths, in addition to those that block/sleep] Cc: Rafael J. Wysocki Cc: Viresh Kumar Signed-off-by: Michael Turquette Signed-off-by: Steve Muckle --- drivers/cpufreq/cpufreq.c | 6 ++++++ include/linux/cpufreq.h | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index e5aa57b27d06..2b99bc305040 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -154,6 +154,12 @@ bool have_governor_per_policy(void) } EXPORT_SYMBOL_GPL(have_governor_per_policy); +bool cpufreq_driver_is_slow(void) +{ + return !(cpufreq_driver->flags & CPUFREQ_DRIVER_FAST); +} +EXPORT_SYMBOL_GPL(cpufreq_driver_is_slow); + struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy) { if (have_governor_per_policy()) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 89e8e04aa73b..f9bb7039740c 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -160,6 +160,7 @@ u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy); int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); int cpufreq_update_policy(unsigned int cpu); bool have_governor_per_policy(void); +bool cpufreq_driver_is_slow(void); struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); #else static inline unsigned int cpufreq_get(unsigned int cpu) @@ -317,6 +318,14 @@ struct cpufreq_driver { */ #define CPUFREQ_NEED_INITIAL_FREQ_CHECK (1 << 5) +/* + * Indicates that it is safe to call cpufreq_driver_target from + * non-interruptable context in scheduler hot paths. Drivers must + * opt-in to this flag, as the safe default is that they might sleep + * or be too slow for hot path use. + */ +#define CPUFREQ_DRIVER_FAST (1 << 6) + int cpufreq_register_driver(struct cpufreq_driver *driver_data); int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); From 5c905a0861295cb172b8b7c9138ed595e23a116f Mon Sep 17 00:00:00 2001 From: Michael Turquette Date: Tue, 30 Jun 2015 12:45:48 +0100 Subject: [PATCH 495/813] sched: scheduler-driven cpu frequency selection Scheduler-driven CPU frequency selection hopes to exploit both per-task and global information in the scheduler to improve frequency selection policy, achieving lower power consumption, improved responsiveness/performance, and less reliance on heuristics and tunables. For further discussion on the motivation of this integration see [0]. This patch implements a shim layer between the Linux scheduler and the cpufreq subsystem. The interface accepts capacity requests from the CFS, RT and deadline sched classes. The requests from each sched class are summed on each CPU with a margin applied to the CFS and RT capacity requests to provide some headroom. Deadline requests are expected to be precise enough given their nature to not require headroom. The maximum total capacity request for a CPU in a frequency domain drives the requested frequency for that domain. Policy is determined by both the sched classes and this shim layer. Note that this algorithm is event-driven. There is no polling loop to check cpu idle time nor any other method which is unsynchronized with the scheduler, aside from a throttling mechanism to ensure frequency changes are not attempted faster than the hardware can accommodate them. 
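To make the conversion concrete: the governor turns the largest capacity request in a frequency domain into a frequency as freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT (see update_fdomain_capacity_request() in the diff below), then rounds up to a real OPP with CPUFREQ_RELATION_L. With hypothetical numbers:

	/* largest request in the domain: 614/1024 (~60%), policy->max = 2000000 kHz */
	unsigned int freq_new = (614U * 2000000U) >> 10;	/* ~1199218 kHz */
	/* cpufreq_frequency_table_target(..., CPUFREQ_RELATION_L) then picks
	 * the lowest table frequency at or above this, e.g. 1200000 kHz on a
	 * hypothetical OPP table. */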
Thanks to Juri Lelli for contributing design ideas, code and test results, and to Ricky Liang for initialization and static key inc/dec fixes. [0] http://article.gmane.org/gmane.linux.kernel/1499836 [smuckle@linaro.org: various additions and fixes, revised commit text] CC: Ricky Liang Signed-off-by: Michael Turquette Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- drivers/cpufreq/Kconfig | 20 ++ include/linux/cpufreq.h | 3 + include/linux/sched.h | 8 + kernel/sched/Makefile | 1 + kernel/sched/cpufreq_sched.c | 358 +++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 51 +++++ 7 files changed, 442 insertions(+), 1 deletion(-) create mode 100644 kernel/sched/cpufreq_sched.c diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 75f63efd7b43..298509ff9c34 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -112,6 +112,14 @@ config CPU_FREQ_DEFAULT_GOV_INTERACTIVE loading your cpufreq low-level hardware driver, using the 'interactive' governor for latency-sensitive workloads. +config CPU_FREQ_DEFAULT_GOV_SCHED + bool "sched" + select CPU_FREQ_GOV_SCHED + help + Use the CPUfreq governor 'sched' as default. This scales + cpu frequency using CPU utilization estimates from the + scheduler. + endchoice config CPU_FREQ_GOV_PERFORMANCE @@ -207,6 +215,18 @@ config CPU_FREQ_GOV_CONSERVATIVE If in doubt, say N. +config CPU_FREQ_GOV_SCHED + bool "'sched' cpufreq governor" + depends on CPU_FREQ + select CPU_FREQ_GOV_COMMON + help + 'sched' - this governor scales cpu frequency from the + scheduler as a function of cpu capacity utilization. It does + not evaluate utilization on a periodic basis (as ondemand + does) but instead is event-driven by the scheduler. + + If in doubt, say N. + comment "CPU frequency scaling drivers" config CPUFREQ_DT diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index f9bb7039740c..60571292a802 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -499,6 +499,9 @@ extern struct cpufreq_governor cpufreq_gov_conservative; #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE) extern struct cpufreq_governor cpufreq_gov_interactive; #define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_interactive) +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED) +extern struct cpufreq_governor cpufreq_gov_sched; +#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_sched) #endif /********************************************************************* diff --git a/include/linux/sched.h b/include/linux/sched.h index 4478d3921714..c707c613664f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -929,6 +929,14 @@ enum cpu_idle_type { #define SCHED_CAPACITY_SHIFT 10 #define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) +struct sched_capacity_reqs { + unsigned long cfs; + unsigned long rt; + unsigned long dl; + + unsigned long total; +}; + /* * Wake-queues are lists of tasks with a pending wakeup, whose * callers have already marked the task as woken internally, diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index a541b5ce1dcc..0eabc9db4c3d 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c new file mode 100644 index 000000000000..58bca8d2ca65 --- /dev/null +++ 
b/kernel/sched/cpufreq_sched.c @@ -0,0 +1,358 @@ +/* + * Copyright (C) 2015 Michael Turquette + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sched.h" + +#define THROTTLE_NSEC 50000000 /* 50ms default */ + +struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE; +static bool __read_mostly cpufreq_driver_slow; + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED +static struct cpufreq_governor cpufreq_gov_sched; +#endif + +static DEFINE_PER_CPU(unsigned long, enabled); +DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); + +/** + * gov_data - per-policy data internal to the governor + * @throttle: next throttling period expiry. Derived from throttle_nsec + * @throttle_nsec: throttle period length in nanoseconds + * @task: worker thread for dvfs transition that may block/sleep + * @irq_work: callback used to wake up worker thread + * @requested_freq: last frequency requested by the sched governor + * + * struct gov_data is the per-policy cpufreq_sched-specific data structure. A + * per-policy instance of it is created when the cpufreq_sched governor receives + * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data + * member of struct cpufreq_policy. + * + * Readers of this data must call down_read(policy->rwsem). Writers must + * call down_write(policy->rwsem). + */ +struct gov_data { + ktime_t throttle; + unsigned int throttle_nsec; + struct task_struct *task; + struct irq_work irq_work; + unsigned int requested_freq; +}; + +static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, + unsigned int freq) +{ + struct gov_data *gd = policy->governor_data; + + /* avoid race with cpufreq_sched_stop */ + if (!down_write_trylock(&policy->rwsem)) + return; + + __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L); + + gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec); + up_write(&policy->rwsem); +} + +static bool finish_last_request(struct gov_data *gd) +{ + ktime_t now = ktime_get(); + + if (ktime_after(now, gd->throttle)) + return false; + + while (1) { + int usec_left = ktime_to_ns(ktime_sub(gd->throttle, now)); + + usec_left /= NSEC_PER_USEC; + usleep_range(usec_left, usec_left + 100); + now = ktime_get(); + if (ktime_after(now, gd->throttle)) + return true; + } +} + +/* + * we pass in struct cpufreq_policy. 
This is safe because changing out the + * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP), + * which tears down all of the data structures and __cpufreq_governor(policy, + * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the + * new policy pointer + */ +static int cpufreq_sched_thread(void *data) +{ + struct sched_param param; + struct cpufreq_policy *policy; + struct gov_data *gd; + unsigned int new_request = 0; + unsigned int last_request = 0; + int ret; + + policy = (struct cpufreq_policy *) data; + gd = policy->governor_data; + + param.sched_priority = 50; + ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, ¶m); + if (ret) { + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + do_exit(-EINVAL); + } else { + pr_debug("%s: kthread (%d) set to SCHED_FIFO\n", + __func__, gd->task->pid); + } + + do { + set_current_state(TASK_INTERRUPTIBLE); + new_request = gd->requested_freq; + if (new_request == last_request) { + schedule(); + } else { + /* + * if the frequency thread sleeps while waiting to be + * unthrottled, start over to check for a newer request + */ + if (finish_last_request(gd)) + continue; + last_request = new_request; + cpufreq_sched_try_driver_target(policy, new_request); + } + } while (!kthread_should_stop()); + + return 0; +} + +static void cpufreq_sched_irq_work(struct irq_work *irq_work) +{ + struct gov_data *gd; + + gd = container_of(irq_work, struct gov_data, irq_work); + if (!gd) + return; + + wake_up_process(gd->task); +} + +static void update_fdomain_capacity_request(int cpu) +{ + unsigned int freq_new, index_new, cpu_tmp; + struct cpufreq_policy *policy; + struct gov_data *gd; + unsigned long capacity = 0; + + /* + * Avoid grabbing the policy if possible. A test is still + * required after locking the CPU's policy to avoid racing + * with the governor changing. + */ + if (!per_cpu(enabled, cpu)) + return; + + policy = cpufreq_cpu_get(cpu); + if (IS_ERR_OR_NULL(policy)) + return; + + if (policy->governor != &cpufreq_gov_sched || + !policy->governor_data) + goto out; + + gd = policy->governor_data; + + /* find max capacity requested by cpus in this policy */ + for_each_cpu(cpu_tmp, policy->cpus) { + struct sched_capacity_reqs *scr; + + scr = &per_cpu(cpu_sched_capacity_reqs, cpu_tmp); + capacity = max(capacity, scr->total); + } + + /* Convert the new maximum capacity request into a cpu frequency */ + freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT; + if (cpufreq_frequency_table_target(policy, policy->freq_table, + freq_new, CPUFREQ_RELATION_L, + &index_new)) + goto out; + freq_new = policy->freq_table[index_new].frequency; + + if (freq_new == gd->requested_freq) + goto out; + + gd->requested_freq = freq_new; + + /* + * Throttling is not yet supported on platforms with fast cpufreq + * drivers. + */ + if (cpufreq_driver_slow) + irq_work_queue_on(&gd->irq_work, cpu); + else + cpufreq_sched_try_driver_target(policy, freq_new); + +out: + cpufreq_cpu_put(policy); +} + +void update_cpu_capacity_request(int cpu, bool request) +{ + unsigned long new_capacity; + struct sched_capacity_reqs *scr; + + /* The rq lock serializes access to the CPU's sched_capacity_reqs. 
*/ + lockdep_assert_held(&cpu_rq(cpu)->lock); + + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + + new_capacity = scr->cfs + scr->rt; + new_capacity = new_capacity * capacity_margin + / SCHED_CAPACITY_SCALE; + new_capacity += scr->dl; + + if (new_capacity == scr->total) + return; + + scr->total = new_capacity; + if (request) + update_fdomain_capacity_request(cpu); +} + +static inline void set_sched_freq(void) +{ + static_key_slow_inc(&__sched_freq); +} + +static inline void clear_sched_freq(void) +{ + static_key_slow_dec(&__sched_freq); +} + +static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) +{ + struct gov_data *gd; + int cpu; + + for_each_cpu(cpu, policy->cpus) + memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0, + sizeof(struct sched_capacity_reqs)); + + gd = kzalloc(sizeof(*gd), GFP_KERNEL); + if (!gd) + return -ENOMEM; + + gd->throttle_nsec = policy->cpuinfo.transition_latency ? + policy->cpuinfo.transition_latency : + THROTTLE_NSEC; + pr_debug("%s: throttle threshold = %u [ns]\n", + __func__, gd->throttle_nsec); + + if (cpufreq_driver_is_slow()) { + cpufreq_driver_slow = true; + gd->task = kthread_create(cpufreq_sched_thread, policy, + "kschedfreq:%d", + cpumask_first(policy->related_cpus)); + if (IS_ERR_OR_NULL(gd->task)) { + pr_err("%s: failed to create kschedfreq thread\n", + __func__); + goto err; + } + get_task_struct(gd->task); + kthread_bind_mask(gd->task, policy->related_cpus); + wake_up_process(gd->task); + init_irq_work(&gd->irq_work, cpufreq_sched_irq_work); + } + + policy->governor_data = gd; + set_sched_freq(); + + return 0; + +err: + kfree(gd); + return -ENOMEM; +} + +static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) +{ + struct gov_data *gd = policy->governor_data; + + clear_sched_freq(); + if (cpufreq_driver_slow) { + kthread_stop(gd->task); + put_task_struct(gd->task); + } + + policy->governor_data = NULL; + + kfree(gd); + return 0; +} + +static int cpufreq_sched_start(struct cpufreq_policy *policy) +{ + int cpu; + + for_each_cpu(cpu, policy->cpus) + per_cpu(enabled, cpu) = 1; + + return 0; +} + +static int cpufreq_sched_stop(struct cpufreq_policy *policy) +{ + int cpu; + + for_each_cpu(cpu, policy->cpus) + per_cpu(enabled, cpu) = 0; + + return 0; +} + +static int cpufreq_sched_setup(struct cpufreq_policy *policy, + unsigned int event) +{ + switch (event) { + case CPUFREQ_GOV_POLICY_INIT: + return cpufreq_sched_policy_init(policy); + case CPUFREQ_GOV_POLICY_EXIT: + return cpufreq_sched_policy_exit(policy); + case CPUFREQ_GOV_START: + return cpufreq_sched_start(policy); + case CPUFREQ_GOV_STOP: + return cpufreq_sched_stop(policy); + case CPUFREQ_GOV_LIMITS: + break; + } + return 0; +} + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED +static +#endif +struct cpufreq_governor cpufreq_gov_sched = { + .name = "sched", + .governor = cpufreq_sched_setup, + .owner = THIS_MODULE, +}; + +static int __init cpufreq_sched_init(void) +{ + int cpu; + + for_each_cpu(cpu, cpu_possible_mask) + per_cpu(enabled, cpu) = 0; + return cpufreq_register_governor(&cpufreq_gov_sched); +} + +/* Try to make this the default governor */ +fs_initcall(cpufreq_sched_init); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 42492ee17793..9eb335d977fe 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5085,7 +5085,7 @@ static inline unsigned long task_util(struct task_struct *p) return p->se.avg.util_avg; } -static unsigned int capacity_margin = 1280; /* ~20% margin */ +unsigned int capacity_margin = 1280; /* ~20% margin */ static inline bool 
__task_fits(struct task_struct *p, int cpu, int util) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d4f9ddfbff73..a96fcea5d98b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1455,6 +1455,57 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) } #endif +#ifdef CONFIG_CPU_FREQ_GOV_SCHED +extern unsigned int capacity_margin; +extern struct static_key __sched_freq; + +static inline bool sched_freq(void) +{ + return static_key_false(&__sched_freq); +} + +DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); +void update_cpu_capacity_request(int cpu, bool request); + +static inline void set_cfs_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + if (per_cpu(cpu_sched_capacity_reqs, cpu).cfs != capacity) { + per_cpu(cpu_sched_capacity_reqs, cpu).cfs = capacity; + update_cpu_capacity_request(cpu, request); + } +} + +static inline void set_rt_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + if (per_cpu(cpu_sched_capacity_reqs, cpu).rt != capacity) { + per_cpu(cpu_sched_capacity_reqs, cpu).rt = capacity; + update_cpu_capacity_request(cpu, request); + } +} + +static inline void set_dl_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + if (per_cpu(cpu_sched_capacity_reqs, cpu).dl != capacity) { + per_cpu(cpu_sched_capacity_reqs, cpu).dl = capacity; + update_cpu_capacity_request(cpu, request); + } +} +#else +static inline bool sched_freq(void) { return false; } +static inline void set_cfs_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +static inline void set_rt_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +static inline void set_dl_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +#endif + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); From d2bf66aac444a8d0ea95e5ff17a0e8563766c594 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 19 Aug 2015 19:47:12 +0100 Subject: [PATCH 496/813] sched/fair: add triggers for OPP change requests Each time a task is {en,de}queued we might need to adapt the current frequency to the new usage. Add triggers on {en,de}queue_task_fair() for this purpose. Only trigger a freq request if we are effectively waking up or going to sleep. Filter out load balancing related calls to reduce the number of triggers. [smuckle@linaro.org: resolve merge conflicts, define task_new, use renamed static key sched_freq] cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- kernel/sched/fair.c | 46 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9eb335d977fe..eba576fe527a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4160,6 +4160,21 @@ static inline void hrtick_update(struct rq *rq) } #endif +static unsigned long capacity_orig_of(int cpu); +static int cpu_util(int cpu); + +static void update_capacity_of(int cpu) +{ + unsigned long req_cap; + + if (!sched_freq()) + return; + + /* Convert scale-invariant capacity to cpu. 
*/ + req_cap = cpu_util(cpu) * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); + set_cfs_cpu_capacity(cpu, true, req_cap); +} + static bool cpu_overutilized(int cpu); /* @@ -4209,6 +4224,20 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; + + /* + * We want to potentially trigger a freq switch + * request only for tasks that are waking up; this is + * because we get here also during load balancing, but + * in these cases it seems wise to trigger as single + * request after load balancing is done. + * + * XXX: how about fork()? Do we need a special + * flag/something to tell if we are here after a + * fork() (wakeup_task_new)? + */ + if (!task_new) + update_capacity_of(cpu_of(rq)); } hrtick_update(rq); } @@ -4267,9 +4296,24 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) + if (!se) { sub_nr_running(rq, 1); + /* + * We want to potentially trigger a freq switch + * request only for tasks that are going to sleep; + * this is because we get here also during load + * balancing, but in these cases it seems wise to + * trigger as single request after load balancing is + * done. + */ + if (task_sleep) { + if (rq->cfs.nr_running) + update_capacity_of(cpu_of(rq)); + else if (sched_freq()) + set_cfs_cpu_capacity(cpu_of(rq), false, 0); + } + } hrtick_update(rq); } From a0933a372d84e341911f2a09cdb9980584315f03 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 26 Jun 2015 12:14:23 +0100 Subject: [PATCH 497/813] sched/{core,fair}: trigger OPP change request on fork() Patch "sched/fair: add triggers for OPP change requests" introduced OPP change triggers for enqueue_task_fair(), but the trigger was operating only for wakeups. Fact is that it makes sense to consider wakeup_new also (i.e., fork()), as we don't know anything about a newly created task and thus we most certainly want to jump to max OPP to not harm performance too much. However, it is not currently possible (or at least it wasn't evident to me how to do so :/) to tell new wakeups from other (non wakeup) operations. This patch introduces an additional flag in sched.h that is only set at fork() time and it is then consumed in enqueue_task_fair() for our purpose. 
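The consumption in enqueue_task_fair() (full diff below) reduces to the following; load-balance enqueues pass neither flag, so only genuine wakeups and newly forked tasks trigger an OPP re-evaluation:

	int task_new    = flags & ENQUEUE_WAKEUP_NEW;	/* set only from fork */
	int task_wakeup = flags & ENQUEUE_WAKEUP;	/* genuine wakeup */

	if (task_new || task_wakeup)
		update_capacity_of(cpu_of(rq));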
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 9 +++------ kernel/sched/sched.h | 1 + 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 69a36dd165e2..569a2f0f01e4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2387,7 +2387,7 @@ void wake_up_new_task(struct task_struct *p) #endif rq = __task_rq_lock(p); - activate_task(rq, p, 0); + activate_task(rq, p, ENQUEUE_WAKEUP_NEW); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eba576fe527a..4558c6ac2a66 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4187,7 +4187,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; - int task_new = !(flags & ENQUEUE_WAKEUP); + int task_new = flags & ENQUEUE_WAKEUP_NEW; + int task_wakeup = flags & ENQUEUE_WAKEUP; for_each_sched_entity(se) { if (se->on_rq) @@ -4231,12 +4232,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * because we get here also during load balancing, but * in these cases it seems wise to trigger as single * request after load balancing is done. - * - * XXX: how about fork()? Do we need a special - * flag/something to tell if we are here after a - * fork() (wakeup_task_new)? */ - if (!task_new) + if (task_new || task_wakeup) update_capacity_of(cpu_of(rq)); } hrtick_update(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a96fcea5d98b..c91e85c90d5d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1181,6 +1181,7 @@ static const u32 prio_to_wmult[40] = { #endif #define ENQUEUE_REPLENISH 0x08 #define ENQUEUE_RESTORE 0x10 +#define ENQUEUE_WAKEUP_NEW 0x20 #define DEQUEUE_SLEEP 0x01 #define DEQUEUE_SAVE 0x02 From c25759ccb88d84b91af1657eb8fafd9dda9c8a94 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 25 Jun 2015 14:37:27 +0100 Subject: [PATCH 498/813] sched/fair: cpufreq_sched triggers for load balancing As we don't trigger freq changes from {en,de}queue_task_fair() during load balancing, we need to do explicitly so on load balancing paths. [smuckle@linaro.org: move update_capacity_of calls so rq lock is held] cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- kernel/sched/fair.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4558c6ac2a66..8f93b23596e5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6405,6 +6405,10 @@ static void attach_one_task(struct rq *rq, struct task_struct *p) { raw_spin_lock(&rq->lock); attach_task(rq, p); + /* + * We want to potentially raise target_cpu's OPP. + */ + update_capacity_of(cpu_of(rq)); raw_spin_unlock(&rq->lock); } @@ -6426,6 +6430,11 @@ static void attach_tasks(struct lb_env *env) attach_task(env->dst_rq, p); } + /* + * We want to potentially raise env.dst_cpu's OPP. + */ + update_capacity_of(env->dst_cpu); + raw_spin_unlock(&env->dst_rq->lock); } @@ -7688,6 +7697,11 @@ more_balance: * ld_moved - cumulative load moved across iterations */ cur_ld_moved = detach_tasks(&env); + /* + * We want to potentially lower env.src_cpu's OPP. + */ + if (cur_ld_moved) + update_capacity_of(env.src_cpu); /* * We've detached some tasks from busiest_rq. 
Every @@ -8058,8 +8072,13 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); p = detach_one_task(&env); - if (p) + if (p) { schedstat_inc(sd, alb_pushed); + /* + * We want to potentially lower env.src_cpu's OPP. + */ + update_capacity_of(env.src_cpu); + } else schedstat_inc(sd, alb_failed); } From 1ba27d06088692495c5f087ff93ca91d6c99406c Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Thu, 25 Jun 2015 14:12:33 +0100 Subject: [PATCH 499/813] sched/fair: jump to max OPP when crossing UP threshold Since the true utilization of a long running task is not detectable while it is running and might be bigger than the current cpu capacity, create the maximum cpu capacity head room by requesting the maximum cpu capacity once the cpu usage plus the capacity margin exceeds the current capacity. This is also done to try to harm the performance of a task the least. Original fair-class only version authored by Juri Lelli . cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- kernel/sched/core.c | 41 ++++++++++++++++++++++++++ kernel/sched/fair.c | 66 ------------------------------------------ kernel/sched/sched.h | 68 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 66 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 569a2f0f01e4..573dcb62b3b3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2854,6 +2854,45 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } +#ifdef CONFIG_CPU_FREQ_GOV_SCHED +static unsigned long sum_capacity_reqs(unsigned long cfs_cap, + struct sched_capacity_reqs *scr) +{ + unsigned long total = cfs_cap + scr->rt; + + total = total * capacity_margin; + total /= SCHED_CAPACITY_SCALE; + total += scr->dl; + return total; +} + +static void sched_freq_tick(int cpu) +{ + struct sched_capacity_reqs *scr; + unsigned long capacity_orig, capacity_curr; + + if (!sched_freq()) + return; + + capacity_orig = capacity_orig_of(cpu); + capacity_curr = capacity_curr_of(cpu); + if (capacity_curr == capacity_orig) + return; + + /* + * To make free room for a task that is building up its "real" + * utilization and to harm its performance the least, request + * a jump to max OPP as soon as the margin of free capacity is + * impacted (specified by capacity_margin). + */ + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + if (capacity_curr < sum_capacity_reqs(cpu_util(cpu), scr)) + set_cfs_cpu_capacity(cpu, true, capacity_max); +} +#else +static inline void sched_freq_tick(int cpu) { } +#endif + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. 
@@ -2880,6 +2919,8 @@ void scheduler_tick(void) trigger_load_balance(rq); #endif rq_last_tick_reset(rq); + + sched_freq_tick(cpu); } #ifdef CONFIG_NO_HZ_FULL diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8f93b23596e5..126a0116162d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4160,9 +4160,6 @@ static inline void hrtick_update(struct rq *rq) } #endif -static unsigned long capacity_orig_of(int cpu); -static int cpu_util(int cpu); - static void update_capacity_of(int cpu) { unsigned long req_cap; @@ -4537,15 +4534,6 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static unsigned long capacity_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity; -} - -static unsigned long capacity_orig_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig; -} static unsigned long cpu_avg_load_per_task(int cpu) { @@ -4719,60 +4707,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif -/* - * Returns the current capacity of cpu after applying both - * cpu and freq scaling. - */ -static unsigned long capacity_curr_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig * - arch_scale_freq_capacity(NULL, cpu) - >> SCHED_CAPACITY_SHIFT; -} - -/* - * cpu_util returns the amount of capacity of a CPU that is used by CFS - * tasks. The unit of the return value must be the one of capacity so we can - * compare the utilization with the capacity of the CPU that is available for - * CFS task (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). - */ -static unsigned long __cpu_util(int cpu, int delta) -{ - unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; - unsigned long capacity = capacity_orig_of(cpu); - - delta += util; - if (delta < 0) - return 0; - - return (delta >= capacity) ? 
capacity : delta; -} - -static unsigned long cpu_util(int cpu) -{ - return __cpu_util(cpu, 0); -} - static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c91e85c90d5d..2f38f4978df8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1456,7 +1456,75 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) } #endif +#ifdef CONFIG_SMP +static inline unsigned long capacity_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity; +} + +static inline unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} + +/* + * cpu_util returns the amount of capacity of a CPU that is used by CFS + * tasks. The unit of the return value must be the one of capacity so we can + * compare the utilization with the capacity of the CPU that is available for + * CFS task (ie cpu_capacity). + * + * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on a CPU. It represents + * the amount of utilization of a CPU in the range [0..capacity_orig] where + * capacity_orig is the cpu_capacity available at the highest frequency + * (arch_scale_freq_capacity()). + * The utilization of a CPU converges towards a sum equal to or less than the + * current capacity (capacity_curr <= capacity_orig) of the CPU because it is + * the running time on this CPU scaled by capacity_curr. + * + * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even + * higher than capacity_orig because of unfortunate rounding in + * cfs.avg.util_avg or just after migrating tasks and new task wakeups until + * the average stabilizes with the new running time. We need to check that the + * utilization stays within the range of [0..capacity_orig] and cap it if + * necessary. Without utilization capping, a group could be seen as overloaded + * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. We allow utilization to overshoot capacity_curr (but not + * capacity_orig) as it useful for predicting the capacity required after task + * migrations (scheduler-driven DVFS). + */ +static inline unsigned long __cpu_util(int cpu, int delta) +{ + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long capacity = capacity_orig_of(cpu); + + delta += util; + if (delta < 0) + return 0; + + return (delta >= capacity) ? capacity : delta; +} + +static inline unsigned long cpu_util(int cpu) +{ + return __cpu_util(cpu, 0); +} + +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +static inline unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + +#endif + #ifdef CONFIG_CPU_FREQ_GOV_SCHED +#define capacity_max SCHED_CAPACITY_SCALE extern unsigned int capacity_margin; extern struct static_key __sched_freq; From 6e4178facd5f6079a384be7c2071a276096afd9b Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Wed, 25 Nov 2015 15:59:25 -0800 Subject: [PATCH 500/813] sched/cpufreq_sched: add trace events Trace events will aid in debugging, profiling and tuning. 
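Once applied, the three events defined below (cpufreq_sched_throttled, cpufreq_sched_request_opp and cpufreq_sched_update_capacity) appear under the cpufreq_sched trace system and can be enabled through the usual tracefs interface, e.g. echo 1 > /sys/kernel/debug/tracing/events/cpufreq_sched/enable (the mount point may differ by configuration).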
Signed-off-by: Steve Muckle --- include/trace/events/cpufreq_sched.h | 87 ++++++++++++++++++++++++++++ kernel/sched/cpufreq_sched.c | 9 +++ 2 files changed, 96 insertions(+) create mode 100644 include/trace/events/cpufreq_sched.h diff --git a/include/trace/events/cpufreq_sched.h b/include/trace/events/cpufreq_sched.h new file mode 100644 index 000000000000..a46cd088e969 --- /dev/null +++ b/include/trace/events/cpufreq_sched.h @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2015 Steve Muckle + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cpufreq_sched + +#if !defined(_TRACE_CPUFREQ_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_CPUFREQ_SCHED_H + +#include +#include + +TRACE_EVENT(cpufreq_sched_throttled, + TP_PROTO(unsigned int rem), + TP_ARGS(rem), + TP_STRUCT__entry( + __field( unsigned int, rem) + ), + TP_fast_assign( + __entry->rem = rem; + ), + TP_printk("throttled - %d usec remaining", __entry->rem) +); + +TRACE_EVENT(cpufreq_sched_request_opp, + TP_PROTO(int cpu, + unsigned long capacity, + unsigned int freq_new, + unsigned int requested_freq), + TP_ARGS(cpu, capacity, freq_new, requested_freq), + TP_STRUCT__entry( + __field( int, cpu) + __field( unsigned long, capacity) + __field( unsigned int, freq_new) + __field( unsigned int, requested_freq) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->capacity = capacity; + __entry->freq_new = freq_new; + __entry->requested_freq = requested_freq; + ), + TP_printk("cpu %d cap change, cluster cap request %ld => OPP %d " + "(cur %d)", + __entry->cpu, __entry->capacity, __entry->freq_new, + __entry->requested_freq) +); + +TRACE_EVENT(cpufreq_sched_update_capacity, + TP_PROTO(int cpu, + bool request, + struct sched_capacity_reqs *scr, + unsigned long new_capacity), + TP_ARGS(cpu, request, scr, new_capacity), + TP_STRUCT__entry( + __field( int, cpu) + __field( bool, request) + __field( unsigned long, cfs) + __field( unsigned long, rt) + __field( unsigned long, dl) + __field( unsigned long, total) + __field( unsigned long, new_total) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->request = request; + __entry->cfs = scr->cfs; + __entry->rt = scr->rt; + __entry->dl = scr->dl; + __entry->total = scr->total; + __entry->new_total = new_capacity; + ), + TP_printk("cpu=%d set_cap=%d cfs=%ld rt=%ld dl=%ld old_tot=%ld " + "new_tot=%ld", + __entry->cpu, __entry->request, __entry->cfs, __entry->rt, + __entry->dl, __entry->total, __entry->new_total) +); + +#endif /* _TRACE_CPUFREQ_SCHED_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index 58bca8d2ca65..5afe56a82491 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -14,6 +14,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + #include "sched.h" #define THROTTLE_NSEC 50000000 /* 50ms default */ @@ -78,6 +81,7 @@ static bool finish_last_request(struct gov_data *gd) int usec_left = ktime_to_ns(ktime_sub(gd->throttle, now)); usec_left /= NSEC_PER_USEC; + trace_cpufreq_sched_throttled(usec_left); usleep_range(usec_left, usec_left + 100); now = ktime_get(); if (ktime_after(now, gd->throttle)) @@ -186,6 +190,9 @@ static void update_fdomain_capacity_request(int cpu) goto out; freq_new = policy->freq_table[index_new].frequency; + trace_cpufreq_sched_request_opp(cpu, capacity, 
freq_new, + gd->requested_freq); + if (freq_new == gd->requested_freq) goto out; @@ -222,6 +229,8 @@ void update_cpu_capacity_request(int cpu, bool request) if (new_capacity == scr->total) return; + trace_cpufreq_sched_update_capacity(cpu, request, scr, new_capacity); + scr->total = new_capacity; if (request) update_fdomain_capacity_request(cpu); From 39daf1cf5ae48b11441d3084bc2908bdaffc3a5d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 20 Oct 2015 10:46:26 +0200 Subject: [PATCH 501/813] sched: remove call of sched_avg_update from sched_rt_avg_update rt_avg is only used to scale the available CPU's capacity for CFS tasks. As the update of this scaling is done during periodic load balance, we only have to ensure that sched_avg_update has been called before any periodic load balancing. This requirement is already fulfilled by __update_cpu_load so the call in sched_rt_avg_update, which is part of the hotpath, is useless. Signed-off-by: Vincent Guittot Signed-off-by: Steve Muckle --- kernel/sched/sched.h | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2f38f4978df8..bee28211a73b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1578,7 +1578,6 @@ static inline void set_dl_cpu_capacity(int cpu, bool request, static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); - sched_avg_update(rq); } #else static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } From 9a379e95dbeefbb8b891aae0192b899c6d617d94 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 3 Nov 2015 10:39:01 +0100 Subject: [PATCH 502/813] sched: deadline: use deadline bandwidth in scale_rt_capacity Instead of monitoring the exec time of deadline tasks to evaluate the CPU capacity consumed by deadline scheduler class, we can directly calculate it thanks to the sum of utilization of deadline tasks on the CPU. We can remove deadline tasks from rt_avg metric and directly use the average bandwidth of deadline scheduler in scale_rt_capacity. Based in part on a similar patch from Luca Abeni . Signed-off-by: Vincent Guittot Signed-off-by: Steve Muckle --- kernel/sched/deadline.c | 33 +++++++++++++++++++++++++++++++-- kernel/sched/fair.c | 8 ++++++++ kernel/sched/sched.h | 2 ++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 8b0a15e285f9..9d9eb50d4059 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,6 +43,24 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +static void add_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + u64 se_bw = dl_se->dl_bw; + + dl_rq->avg_bw += se_bw; +} + +static void clear_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + u64 se_bw = dl_se->dl_bw; + + dl_rq->avg_bw -= se_bw; + if (dl_rq->avg_bw < 0) { + WARN_ON(1); + dl_rq->avg_bw = 0; + } +} + static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; @@ -494,6 +512,9 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); + if (dl_se->dl_new) + add_average_bw(dl_se, dl_rq); + /* * The arrival of a new instance needs special treatment, i.e., * the actual scheduling parameters have to be "renewed". 
@@ -741,8 +762,6 @@ static void update_curr_dl(struct rq *rq) curr->se.exec_start = rq_clock_task(rq); cpuacct_charge(curr, delta_exec); - sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; if (dl_runtime_exceeded(dl_se)) { dl_se->dl_throttled = 1; @@ -1241,6 +1260,8 @@ static void task_fork_dl(struct task_struct *p) static void task_dead_dl(struct task_struct *p) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + struct dl_rq *dl_rq = dl_rq_of_se(&p->dl); + struct rq *rq = rq_of_dl_rq(dl_rq); /* * Since we are TASK_DEAD we won't slip out of the domain! @@ -1249,6 +1270,8 @@ static void task_dead_dl(struct task_struct *p) /* XXX we should retain the bw until 0-lag */ dl_b->total_bw -= p->dl.dl_bw; raw_spin_unlock_irq(&dl_b->lock); + + clear_average_bw(&p->dl, &rq->dl); } static void set_curr_task_dl(struct rq *rq) @@ -1556,7 +1579,9 @@ retry: } deactivate_task(rq, next_task, 0); + clear_average_bw(&next_task->dl, &rq->dl); set_task_cpu(next_task, later_rq->cpu); + add_average_bw(&next_task->dl, &later_rq->dl); activate_task(later_rq, next_task, 0); ret = 1; @@ -1644,7 +1669,9 @@ static void pull_dl_task(struct rq *this_rq) resched = true; deactivate_task(src_rq, p, 0); + clear_average_bw(&p->dl, &src_rq->dl); set_task_cpu(p, this_cpu); + add_average_bw(&p->dl, &this_rq->dl); activate_task(this_rq, p, 0); dmin = p->dl.deadline; @@ -1750,6 +1777,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (!start_dl_timer(p)) __dl_clear_params(p); + clear_average_bw(&p->dl, &rq->dl); + /* * Since this might be the only -deadline task on the rq, * this is the right place to try to pull some other one
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 126a0116162d..0b39158dadc1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6571,6 +6571,14 @@ static unsigned long scale_rt_capacity(int cpu) used = div_u64(avg, total); + /* + * Deadline bandwidth is defined at system level, so we must + * weight this bandwidth with the max capacity of the system. + * As a reminder, avg_bw is 20 bits wide and + * scale_cpu_capacity is 10 bits wide. + */ + used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu)); + if (likely(used < SCHED_CAPACITY_SCALE)) return SCHED_CAPACITY_SCALE - used;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bee28211a73b..3f52226bb6f3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -506,6 +506,8 @@ struct dl_rq { #else struct dl_bw dl_bw; #endif + /* This is the "average utilization" for this runqueue */ + s64 avg_bw; }; #ifdef CONFIG_SMP
From a4ec0d1620b9e63e935744602da953feaaaac02f Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 26 Oct 2015 18:14:50 +0100 Subject: [PATCH 503/813] sched: rt scheduler sets capacity requirement RT tasks don't provide any running constraints like deadline tasks do, except their running priority. The only currently usable input for estimating the capacity needed by RT tasks is the rt_avg metric. We use it to estimate the CPU capacity needed for the RT scheduler class. In order to monitor the evolution of RT task load, we must periodically check it during the tick. Then, we use the estimated capacity of the last activity to estimate the next one; this estimate cannot be very accurate, but it is a good starting point and has no impact on the wake-up path of RT tasks.
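A rough sketch of the estimate, assuming (as elsewhere in the scheduler) that rq->rt_avg decays over sched_avg_period(); the helper actually added below, sched_rt_update_capacity_req(), implements this with additional sanity checks:

	/*
	 * Window over which rt_avg accumulates: one averaging period plus
	 * the time elapsed since the last decay (rq->age_stamp).
	 */
	u64 total = sched_avg_period() + (rq_clock(rq) - rq->age_stamp);
	/* Fraction of CPU capacity used by RT, in 0..SCHED_CAPACITY_SCALE. */
	u64 used = div_u64(rq->rt_avg, total);

	set_rt_cpu_capacity(rq->cpu, 1, min_t(u64, used, SCHED_CAPACITY_SCALE));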
Signed-off-by: Vincent Guittot Signed-off-by: Steve Muckle --- kernel/sched/rt.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8ec86abe0ea1..9694204660b7 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1426,6 +1426,41 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag #endif } +#ifdef CONFIG_SMP +static void sched_rt_update_capacity_req(struct rq *rq) +{ + u64 total, used, age_stamp, avg; + s64 delta; + + if (!sched_freq()) + return; + + sched_avg_update(rq); + /* + * Since we're reading these variables without serialization make sure + * we read them once before doing sanity checks on them. + */ + age_stamp = READ_ONCE(rq->age_stamp); + avg = READ_ONCE(rq->rt_avg); + delta = rq_clock(rq) - age_stamp; + + if (unlikely(delta < 0)) + delta = 0; + + total = sched_avg_period() + delta; + + used = div_u64(avg, total); + if (unlikely(used > SCHED_CAPACITY_SCALE)) + used = SCHED_CAPACITY_SCALE; + + set_rt_cpu_capacity(rq->cpu, 1, (unsigned long)(used)); +} +#else +static inline void sched_rt_update_capacity_req(struct rq *rq) +{ } + +#endif + static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, struct rt_rq *rt_rq) { @@ -1494,8 +1529,17 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) if (prev->sched_class == &rt_sched_class) update_curr_rt(rq); - if (!rt_rq->rt_queued) + if (!rt_rq->rt_queued) { + /* + * The next task to be picked on this rq will have a lower + * priority than rt tasks so we can spend some time updating + * the capacity used by rt tasks based on the last activity. + * This value will then be used as an estimate of the next + * activity. + */ + sched_rt_update_capacity_req(rq); return NULL; + } put_prev_task(rq, prev); @@ -2212,6 +2256,9 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) update_curr_rt(rq); + if (rq->rt.rt_nr_running) + sched_rt_update_capacity_req(rq); + watchdog(rq, p); /*
From d983b1c50beeb12157152da9cd79ed5d200fba63 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 11 Dec 2015 11:55:51 +0000 Subject: [PATCH 504/813] fixup! sched: scheduler-driven cpu frequency selection Signed-off-by: Juri Lelli --- kernel/sched/cpufreq_sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index 5afe56a82491..e1d208e101ed 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -119,9 +119,9 @@ static int cpufreq_sched_thread(void *data) } do { - set_current_state(TASK_INTERRUPTIBLE); new_request = gd->requested_freq; if (new_request == last_request) { + set_current_state(TASK_INTERRUPTIBLE); schedule(); } else { /*
From 111a0376ad413bfe37ce2d090d1ba81b0daf468e Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 11 Dec 2015 11:58:05 +0000 Subject: [PATCH 505/813] fixup!
sched/fair: jump to max OPP when crossing UP threshold Signed-off-by: Juri Lelli --- kernel/sched/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 573dcb62b3b3..4d388d070a2e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2910,6 +2910,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); calc_global_load_tick(rq); + sched_freq_tick(cpu); raw_spin_unlock(&rq->lock); perf_event_task_tick(); @@ -2919,8 +2920,6 @@ void scheduler_tick(void) trigger_load_balance(rq); #endif rq_last_tick_reset(rq); - - sched_freq_tick(cpu); } #ifdef CONFIG_NO_HZ_FULL
From f6201d94b085121184e694d00768a79bb5a13e9d Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 30 Jun 2015 12:03:26 +0100 Subject: [PATCH 506/813] sched/tune: add detailed documentation The topic of a single simple power-performance tunable, that is wholly scheduler centric, and has well defined and predictable properties has come up on several occasions in the past. With techniques such as a scheduler driven DVFS, we now have a good framework for implementing such a tunable. This patch provides a detailed description of the motivations and design decisions behind the implementation of SchedTune. cc: Jonathan Corbet cc: linux-doc@vger.kernel.org Signed-off-by: Patrick Bellasi --- Documentation/scheduler/sched-tune.txt | 366 +++++++++++++++++++++++++ 1 file changed, 366 insertions(+) create mode 100644 Documentation/scheduler/sched-tune.txt
diff --git a/Documentation/scheduler/sched-tune.txt b/Documentation/scheduler/sched-tune.txt new file mode 100644 index 000000000000..9bd2231c01b1 --- /dev/null +++ b/Documentation/scheduler/sched-tune.txt @@ -0,0 +1,366 @@ + Central, scheduler-driven, power-performance control + (EXPERIMENTAL) + +Abstract +======== + +The topic of a single simple power-performance tunable, that is wholly +scheduler centric, and has well defined and predictable properties has come up +on several occasions in the past [1,2]. With techniques such as a scheduler +driven DVFS [3], we now have a good framework for implementing such a tunable. +This document describes the overall ideas behind its design and implementation. + + +Table of Contents +================= + +1. Motivation +2. Introduction +3. Signal Boosting Strategy +4. OPP selection using boosted CPU utilization +5. Per task group boosting +6. Questions and Answers + - What about "auto" mode? + - What about boosting on a congested system? + - How are CPUs boosted when we have tasks with multiple boost values? +7. References + + +1. Motivation +============= + +Sched-DVFS [3] is a new event-driven cpufreq governor which allows the +scheduler to select the optimal DVFS operating point (OPP) for running a task +allocated to a CPU. The introduction of sched-DVFS enables running workloads at +the most energy efficient OPPs. + +However, sometimes it may be desired to intentionally boost the performance of +a workload even if that could imply a reasonable increase in energy +consumption. For example, in order to reduce the response time of a task, we +may want to run the task at a higher OPP than the one that is actually required +by its CPU bandwidth demand. + +This last requirement is especially important if we consider that one of the +main goals of the sched-DVFS component is to replace all currently available +CPUFreq policies.
Since sched-DVFS is event based, as opposed to the sampling +driven governors we currently have, it is already more responsive at selecting +the optimal OPP to run tasks allocated to a CPU. However, just tracking the +actual task load demand may not be enough from a performance standpoint. For +example, it is not possible to get behaviors similar to those provided by the +"performance" and "interactive" CPUFreq governors. + +This document describes an implementation of a tunable, stacked on top of +sched-DVFS, which extends its functionality to support task performance +boosting. + +By "performance boosting" we mean the reduction of the time required to +complete a task activation, i.e. the time elapsed from a task wakeup to its +next deactivation (e.g. because it goes back to sleep or it terminates). For +example, if we consider a simple periodic task which executes the same workload +for 5[s] every 20[s] while running at a certain OPP, a boosted execution of +that task must complete each of its activations in less than 5[s]. + +A previous attempt [5] to introduce such a boosting feature has not been +successful, mainly because of the complexity of the proposed solution. The +approach described in this document exposes a single simple interface to +user-space. This single tunable knob allows the tuning of system-wide +scheduler behaviours, ranging from energy efficiency at one end through to +incremental performance boosting at the other end. This first tunable affects +all tasks. However, a more advanced extension of the concept is also provided, +which uses CGroups to boost the performance of only selected tasks while using +the energy efficient default for all others. + +The rest of this document introduces the proposed solution, which has been +named SchedTune, in more detail. + + +2. Introduction +=============== + +SchedTune exposes a simple user-space interface with a single power-performance +tunable: + + /proc/sys/kernel/sched_cfs_boost + +This permits expressing a boost value as an integer in the range [0..100]. + +A value of 0 (default) configures the CFS scheduler for maximum energy +efficiency. This means that sched-DVFS runs the tasks at the minimum OPP +required to satisfy their workload demand. +A value of 100 configures the scheduler for maximum performance, which +translates to the selection of the maximum OPP on that CPU. + +Values between 0 and 100 can be set to suit other scenarios, for example to +favour interactive response, or depending on other system events (battery +level, etc.). + +A CGroup based extension is also provided, which permits further user-space +defined task classification to tune the scheduler for different goals depending +on the specific nature of the task, e.g. background vs interactive vs +low-priority. + +The overall design of the SchedTune module is built on top of "Per-Entity Load +Tracking" (PELT) signals and sched-DVFS by introducing a bias on the Operating +Performance Point (OPP) selection. +Each time a task is allocated on a CPU, sched-DVFS has the opportunity to tune +the operating frequency of that CPU to better match the workload demand. The +selection of the actual OPP being activated is influenced by the global boost +value, or the boost value for the task CGroup when in use. + +This simple biasing approach leverages existing frameworks, which means minimal +modifications to the scheduler, and yet it makes it possible to achieve a range +of different behaviours, all from a single simple tunable knob.
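+For example, assuming a root shell, the knob can be driven directly
+(illustrative values):
+
+    root@linaro-nano:~# echo 0 > /proc/sys/kernel/sched_cfs_boost
+    root@linaro-nano:~# echo 50 > /proc/sys/kernel/sched_cfs_boost
+
+where 0 keeps the energy-efficient default and 50 biases OPP selection
+half-way towards the maximum OPP.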
+The only new concept introduced is that of signal boosting. + + +3. Signal Boosting Strategy +=========================== + +The whole PELT machinery works based on the value of a few load tracking +signals which basically track the CPU bandwidth requirements for tasks and the +capacity of CPUs. The basic idea behind the SchedTune knob is to artificially +inflate some of these load tracking signals to make a task or RQ appear more +demanding than it actually is. + +Which signals have to be inflated depends on the specific "consumer". However, +independently of the specific (signal, consumer) pair, it is important to +define a simple and possibly consistent strategy for the concept of boosting a +signal. + +A boosting strategy defines how the "abstract" user-space defined +sched_cfs_boost value is translated into an internal "margin" value to be added +to a signal to get its inflated value: + + margin := boosting_strategy(sched_cfs_boost, signal) + boosted_signal := signal + margin + +Different boosting strategies were identified and analyzed before selecting the +one found to be most effective. + +Signal Proportional Compensation (SPC) +-------------------------------------- + +In this boosting strategy the sched_cfs_boost value is used to compute a +margin which is proportional to the complement of the original signal. +When a signal has a maximum possible value, its complement is defined as +the delta between the actual value and its possible maximum. + +Since the tunable implementation uses signals which have SCHED_LOAD_SCALE as +the maximum possible value, the margin becomes: + + margin := sched_cfs_boost * (SCHED_LOAD_SCALE - signal) + +Using this boosting strategy: +- a 100% sched_cfs_boost means that the signal is scaled to the maximum value +- each intermediate sched_cfs_boost value inflates the signal in question by a + quantity proportional to both the boost value and the signal's headroom + below the maximum value. + +For example, by applying the SPC boosting strategy to the selection of the OPP +to run a task it is possible to achieve these behaviors: + +- 0% boosting: run the task at the minimum OPP required by its workload +- 100% boosting: run the task at the maximum OPP available for the CPU +- 50% boosting: run at the half-way OPP between minimum and maximum + +Which means that, at 50% boosting, a task will be scheduled to run at half of +the maximum theoretically achievable performance on the specific target +platform. + +An SPC boosted signal is represented in the following figure, where: + a) "-" represents the original signal + b) "b" represents a 50% boosted signal + c) "p" represents a 100% boosted signal + + + ^ + | SCHED_LOAD_SCALE + +-----------------------------------------------------------------+ + |pppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp + | + | boosted_signal + | bbbbbbbbbbbbbbbbbbbbbbbb + | + | original signal + | bbbbbbbbbbbbbbbbbbbbbbbb+----------------------+ + | | + |bbbbbbbbbbbbbbbbbb | + | | + | | + | | + | +-----------------------+ + | | + | | + | | + |------------------+ + | + | + +-----------------------------------------------------------------------> + +The plot above shows a ramped load signal (titled 'original_signal') and its +boosted equivalent. For each step of the original signal the boosted signal +corresponding to a 50% boost is midway between the original signal and the +upper bound. Boosting by 100% generates a boosted signal which is always +saturated to the upper bound. + + +4.
OPP selection using boosted CPU utilization +============================================== + +It is worth calling out that the implementation does not introduce any new load +signals. Instead, it provides an API to tune existing signals. This tuning is +done on demand and only in scheduler code paths where it is sensible to do so. +The new API calls are defined to return either the default signal or a boosted +one, depending on the value of sched_cfs_boost. This is a clean and +non-invasive modification of the existing code paths. + +The signal representing a CPU's utilization is boosted according to the +previously described SPC boosting strategy. To sched-DVFS, this allows a CPU +(i.e. a CFS run-queue) to appear more used than it actually is. + +Thus, with sched_cfs_boost enabled we have the following main functions to +get the current utilization of a CPU: + + cpu_util() + boosted_cpu_util() + +The new boosted_cpu_util() is similar to cpu_util() but returns a boosted +utilization signal which is a function of the sched_cfs_boost value. + +This function is used in the CFS scheduler code paths where sched-DVFS needs to +decide the OPP to run a CPU at. +For example, this allows selecting the highest OPP for a CPU which has +the boost value set to 100%. + + +5. Per task group boosting +========================== + +The availability of a single knob which is used to boost all tasks in the +system is certainly a simple solution, but it quite likely doesn't fit many +utilization scenarios, especially in the mobile device space. + +For example, on battery powered devices there usually are many background +services which are long running and need energy efficient scheduling. On the +other hand, some applications are more performance sensitive and require an +interactive response and/or maximum performance, regardless of the energy cost. +To better service such scenarios, the SchedTune implementation has an extension +that provides a more fine grained boosting interface. + +A new CGroup controller, namely "schedtune", can be enabled, which allows task +groups with different boost values to be defined and configured. +Tasks that require special performance can be put into separate CGroups. +The value of the boost associated with the tasks in this group can be specified +using a single knob exposed by the CGroup controller: + + schedtune.boost + +This knob allows the definition of a boost value that is to be used for +SPC boosting of all tasks attached to this group. + +The current schedtune controller implementation is really simple and has these +main characteristics: + + 1) It is only possible to create one-level-deep hierarchies + + The root control group defines the system-wide boost value to be applied + by default to all tasks. Its direct subgroups are named "boost groups" and + they define the boost value for a specific set of tasks. + Further nested subgroups are not allowed since they do not have a sensible + meaning from a user-space standpoint. + + 2) It is possible to define only a limited number of "boost groups" + + This number is defined at compile time and by default configured to 16. + This is a design decision motivated by two main reasons: + a) In a real system we do not expect utilization scenarios with more than a + few boost groups. For example, a reasonable collection of groups could be + just "background", "interactive" and "performance".
+ b) It simplifies the implementation considerably, especially for the code + which has to compute the per-CPU boost value when there are multiple + RUNNABLE tasks with different boost values. + +Such a simple design should allow servicing the main utilization scenarios +identified so far. It provides a simple interface which can be used to manage +the power-performance of all tasks or only selected tasks. +Moreover, this interface can be easily integrated by user-space run-times (e.g. +Android, ChromeOS) to implement a QoS solution for task boosting based on task +classification, which has been a long standing requirement. + +Setup and usage +--------------- + +0. Use a kernel with CGROUP_SCHEDTUNE support enabled + +1. Check that the "schedtune" CGroup controller is available: + + root@linaro-nano:~# cat /proc/cgroups + #subsys_name hierarchy num_cgroups enabled + cpuset 0 1 1 + cpu 0 1 1 + schedtune 0 1 1 + +2. Mount a tmpfs to create the CGroups mount point (Optional) + + root@linaro-nano:~# sudo mount -t tmpfs cgroups /sys/fs/cgroup + +3. Mount the "schedtune" controller + + root@linaro-nano:~# mkdir /sys/fs/cgroup/stune + root@linaro-nano:~# sudo mount -t cgroup -o schedtune stune /sys/fs/cgroup/stune + +4. Setup the system-wide boost value (Optional) + + If not configured, the root control group has a 0% boost value, which + basically disables boosting for all tasks in the system, thus running in + an energy-efficient mode. + + root@linaro-nano:~# echo $SYSBOOST > /sys/fs/cgroup/stune/schedtune.boost + +5. Create task groups and configure their specific boost value (Optional) + + For example, here we create a "performance" boost group configured to boost + all its tasks to 100% + + root@linaro-nano:~# mkdir /sys/fs/cgroup/stune/performance + root@linaro-nano:~# echo 100 > /sys/fs/cgroup/stune/performance/schedtune.boost + +6. Move tasks into the boost group + + For example, the following moves the tasks with PID $TASKPID (and all its + threads) into the "performance" boost group. + + root@linaro-nano:~# echo $TASKPID > /sys/fs/cgroup/stune/performance/cgroup.procs + +This simple configuration allows only the threads of the $TASKPID task to run, +when needed, at the highest OPP on the most capable CPU of the system. + + +6. Questions and Answers +======================== + +What about "auto" mode? +----------------------- + +The 'auto' mode as described in [5] can be implemented by interfacing SchedTune +with some suitable user-space element. This element could use the exposed +system-wide or cgroup based interface. + +How are multiple groups of tasks with different boost values managed? +--------------------------------------------------------------------- + +The current SchedTune implementation keeps track of the boosted RUNNABLE tasks +on a CPU. Once sched-DVFS selects the OPP to run a CPU at, the CPU utilization +is boosted with a value which is the maximum of the boost values of the +currently RUNNABLE tasks in its RQ. + +This allows sched-DVFS to boost a CPU only while there are boosted tasks ready +to run, and to switch back to the energy efficient mode as soon as the last +boosted task is dequeued. + + +7.
References +============= +[1] http://lwn.net/Articles/552889 +[2] http://lkml.org/lkml/2012/5/18/91 +[3] http://lkml.org/lkml/2015/6/26/620
From 724d562ae08905854c2940994950b68c2939db78 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 22 Jun 2015 18:11:44 +0100 Subject: [PATCH 507/813] sched/tune: add sysctl interface to define a boost value The current (CFS) scheduler implementation does not allow task performance to be "boosted" by running tasks at a higher OPP than the minimum required to meet their workload demands. To support task performance boosting, the scheduler should provide a "knob" which allows tuning how much the system is optimised for energy efficiency vs performance. This patch is the first of a series which provides a simple interface to define a tuning knob. One system-wide "boost" tunable is exposed via: /proc/sys/kernel/sched_cfs_boost which can be configured in the range [0..100], to define a percentage where: - 0% boost requires operating in "standard" mode, scheduling tasks at the minimum capacities required by the workload demand - 100% boost requires pushing task performance to the maximum, "regardless" of the incurred energy consumption A boost value in between these two boundaries is used to bias the power/performance trade-off; the higher the boost value, the more the scheduler is biased toward performance boosting instead of energy efficiency. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- include/linux/sched/sysctl.h | 16 ++++++++++++++++ init/Kconfig | 26 ++++++++++++++++++++++++++ kernel/sched/Makefile | 1 + kernel/sched/tune.c | 16 ++++++++++++++++ kernel/sysctl.c | 11 +++++++++++ 5 files changed, 70 insertions(+) create mode 100644 kernel/sched/tune.c
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index c9e4731cf10b..4479e48c7712 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -77,6 +77,22 @@ extern int sysctl_sched_rt_runtime; extern unsigned int sysctl_sched_cfs_bandwidth_slice; #endif +#ifdef CONFIG_SCHED_TUNE +extern unsigned int sysctl_sched_cfs_boost; +int sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, + loff_t *ppos); +static inline unsigned int get_sysctl_sched_cfs_boost(void) +{ + return sysctl_sched_cfs_boost; +} +#else +static inline unsigned int get_sysctl_sched_cfs_boost(void) +{ + return 0; +} +#endif + #ifdef CONFIG_SCHED_AUTOGROUP extern unsigned int sysctl_sched_autogroup_enabled; #endif
diff --git a/init/Kconfig b/init/Kconfig index e1d1d6936f92..ee1dd90254b7 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1237,6 +1237,32 @@ config SCHED_AUTOGROUP desktop applications. Task group autogeneration is currently based upon task session. +config SCHED_TUNE + bool "Boosting for CFS tasks (EXPERIMENTAL)" + help + This option enables the system-wide support for task boosting. + When this support is enabled a new sysctl interface is exposed to + userspace via: + /proc/sys/kernel/sched_cfs_boost + which allows setting a system-wide boost value in the range [0..100].
+ + The current boosting strategy is implemented in such a way that: + - a 0% boost value requires operating in "standard" mode by + scheduling all tasks at the minimum capacities required by their + workload demand + - a 100% boost value requires pushing the task + performance to the maximum, "regardless" of the incurred energy consumption + + A boost value in between these two boundaries is used to bias the + power/performance trade-off; the higher the boost value, the more the + scheduler is biased toward performance boosting instead of energy + efficiency. + + Since this support exposes a single system-wide knob, the specified + boost value is applied to all (CFS) tasks in the system. + + If unsure, say N. + config SYSFS_DEPRECATED bool "Enable deprecated sysfs features to support old userspace tools" depends on SYSFS
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 0eabc9db4c3d..c6a85f813dfd 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -18,5 +18,6 @@ obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o +obj-$(CONFIG_SCHED_TUNE) += tune.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c new file mode 100644 index 000000000000..a93af9c2f267 --- /dev/null +++ b/kernel/sched/tune.c @@ -0,0 +1,16 @@ +#include "sched.h" + +unsigned int sysctl_sched_cfs_boost __read_mostly; + +int +sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + return 0; +}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 11783ed47dd3..2cf1f8610ded 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -435,6 +435,17 @@ static struct ctl_table kern_table[] = { .extra1 = &one, }, #endif +#ifdef CONFIG_SCHED_TUNE + { + .procname = "sched_cfs_boost", + .data = &sysctl_sched_cfs_boost, + .maxlen = sizeof(sysctl_sched_cfs_boost), + .mode = 0644, + .proc_handler = &sysctl_sched_cfs_boost_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking",
From f7853a888fb81901c92c22d8c47a67d7b86949c0 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 22 Jun 2015 18:32:36 +0100 Subject: [PATCH 508/813] sched/fair: add function to convert boost value into "margin" The basic idea of the boost knob is to "artificially inflate" a signal to make a task or logical CPU appear more demanding than it actually is. Independently of the specific signal, a consistent and possibly simple semantic for the concept of "signal boosting" must define: 1. how we translate the boost percentage into a "margin" value to be added to the original signal in order to inflate it 2.
what is the meaning of a boost value from a user-space perspective. This patch provides the implementation of a possible boost semantic, named "Signal Proportional Compensation" (SPC), where the boost percentage (BP) is used to compute a margin (M) which is proportional to the complement of the original signal (OS): M = BP * (SCHED_LOAD_SCALE - OS) The computed margin is then added to the OS to obtain the Boosted Signal (BS): BS = OS + M The proposed boost semantic has these main features: - each signal gets a boost which is proportional to its delta with respect to the maximum available capacity in the system (i.e. SCHED_LOAD_SCALE) - a 100% boost has a clear meaning from a user-space perspective, since it simply means running (possibly) "all" tasks at the max OPP - each boost value improves task performance by a quantity which is proportional to the maximum achievable performance on that system This semantic thus effectively enforces the following behaviour: a 50% boost means running half-way between the current and the maximum performance which a task could achieve on that system. This patch provides the code to implement a fast integer division to convert a boost percentage (BP) value into a margin (M). NOTE: this code is suitable for all signals operating in the range [0..SCHED_LOAD_SCALE] cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0b39158dadc1..c0c9510d871f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5095,6 +5095,44 @@ static bool cpu_overutilized(int cpu) return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); } +#ifdef CONFIG_SCHED_TUNE + +static unsigned long +schedtune_margin(unsigned long signal, unsigned long boost) +{ + unsigned long long margin = 0; + + /* + * Signal proportional compensation (SPC) + * + * The Boost (B) value is used to compute a Margin (M) which is + * proportional to the complement of the original Signal (S): + * M = B * (SCHED_LOAD_SCALE - S) + * The obtained M could be used by the caller to "boost" S. + */ + margin = SCHED_LOAD_SCALE - signal; + margin *= boost; + + /* + * Fast integer division by constant: + * Constant : (C) = 100 + * Precision : 0.1% (P) = 0.1 + * Reference : C * 100 / P (R) = 100000 + * + * Thus: + * Shift bits : ceil(log(R,2)) (S) = 17 + * Mult const : round(2^S/C) (M) = 1311 + * + * + */ + margin *= 1311; + margin >>= 17; + + return margin; +} + +#endif /* CONFIG_SCHED_TUNE */ + /* * find_idlest_group finds and returns the least busy CPU group within the * domain.
From e6fccee85823905e567c519e3edd297cd8595b35 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 26 Jun 2015 09:55:06 +0100 Subject: [PATCH 509/813] sched/fair: add boosted CPU usage The CPU usage signal is used by the scheduler as an estimate of the overall bandwidth currently allocated on a CPU. When SchedDVFS is in use, this signal affects the selection of the operating points (OPP) required to accommodate all the workload allocated on a CPU. A convenient way to boost the performance of tasks running on a CPU, which is also minimally intrusive, is to boost the CPU usage signal each time it is used to select an OPP. This patch introduces a new function: get_boosted_cpu_usage(cpu) to return a boosted value for the usage of a specified CPU. The margin added to the original usage is: 1. computed based on the "boosting strategy" in use 2.
proportional to the system-wide boost value defined via the provided user-space interface The boosted signal is used by SchedDVFS (transparently) each time it needs an estimate of the capacity required for a CPU. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c0c9510d871f..5ad30f4362e2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4160,6 +4160,8 @@ static inline void hrtick_update(struct rq *rq) } #endif +static inline unsigned long boosted_cpu_util(int cpu); + static void update_capacity_of(int cpu) { unsigned long req_cap; @@ -4168,7 +4170,8 @@ static void update_capacity_of(int cpu) return; /* Convert scale-invariant capacity to cpu. */ - req_cap = cpu_util(cpu) * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); + req_cap = boosted_cpu_util(cpu); + req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, req_cap); } @@ -5131,8 +5134,36 @@ schedtune_margin(unsigned long signal, unsigned long boost) return margin; } +static inline unsigned int +schedtune_cpu_margin(unsigned long util) +{ + unsigned int boost = get_sysctl_sched_cfs_boost(); + + if (boost == 0) + return 0; + + return schedtune_margin(util, boost); +} + +#else /* CONFIG_SCHED_TUNE */ + +static inline unsigned int +schedtune_cpu_margin(unsigned long util) +{ + return 0; +} + #endif /* CONFIG_SCHED_TUNE */ +static inline unsigned long +boosted_cpu_util(int cpu) +{ + unsigned long util = cpu_util(cpu); + unsigned long margin = schedtune_cpu_margin(util); + + return util + margin; +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain.
From 92757bdea5b275042305a11e95376c1ce05e9aef Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 23 Jun 2015 09:17:54 +0100 Subject: [PATCH 510/813] sched/tune: add initial support for CGroups based boosting To support task performance boosting, the usage of a single knob has the advantage of being a simple solution, both from the implementation and the usability standpoints. However, on a real system it can be difficult to identify a single value for the knob which fits the needs of multiple different tasks. For example, some kernel threads and/or user-space background services are better managed the "standard" way, while we still want to be able to boost the performance of specific workloads. In order to improve the flexibility of the task boosting mechanism, this patch is the first of a small series which extends the previous implementation to introduce "per task group" support. This first patch introduces just the basic CGroups support: a new "schedtune" CGroups controller is added which allows different boost values to be configured for different groups of tasks. To keep the implementation simple but still effective for a boosting strategy, the new controller: 1. allows only a two-layer hierarchy 2. supports only a limited number of boost groups A two-layer hierarchy allows each task to be placed either: a) in the root control group thus being subject to a system-wide boosting value b) in a child of the root group thus being subject to the specific boost value defined by that "boost group" The limited number of "boost groups" supported is mainly motivated by the observation that in a real system it could be useful to have only a few classes of tasks which deserve different treatment.
For example, background vs foreground or interactive vs low-priority. As an additional benefit, a limited number of boost groups also allows for a simpler implementation, especially for the code required to compute the boost value for CPUs which have runnable tasks belonging to different boost groups. cc: Tejun Heo cc: Li Zefan cc: Johannes Weiner cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- include/linux/cgroup_subsys.h | 4 + init/Kconfig | 17 +++ kernel/sched/tune.c | 223 ++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 4 + 4 files changed, 248 insertions(+)
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 1a96fdaa33d5..e133705d794a 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -26,6 +26,10 @@ SUBSYS(cpu) SUBSYS(cpuacct) #endif +#if IS_ENABLED(CONFIG_CGROUP_SCHEDTUNE) +SUBSYS(schedtune) +#endif + #if IS_ENABLED(CONFIG_BLK_CGROUP) SUBSYS(io) #endif
diff --git a/init/Kconfig b/init/Kconfig index ee1dd90254b7..944bff00c170 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -999,6 +999,23 @@ config CGROUP_CPUACCT Provides a simple Resource Controller for monitoring the total CPU consumed by the tasks in a cgroup. +config CGROUP_SCHEDTUNE + bool "CFS tasks boosting cgroup subsystem (EXPERIMENTAL)" + depends on SCHED_TUNE + help + This option provides the "schedtune" controller which improves the + flexibility of the task boosting mechanism by introducing support + for "per task" boost values. + + This new controller: + 1. allows only a two-layer hierarchy, where the root defines the + system-wide boost value and each of its direct children defines a + different "class of tasks" to be boosted with a different value + 2. supports up to 16 different task classes, each of which can be + configured with a different boost value + + Say N if unsure. + config PAGE_COUNTER bool
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index a93af9c2f267..95bc8b87c6d4 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -1,7 +1,230 @@ +#include +#include +#include +#include +#include + #include "sched.h" unsigned int sysctl_sched_cfs_boost __read_mostly; +#ifdef CONFIG_CGROUP_SCHEDTUNE + +/* + * EAS scheduler tunables for task groups. + */ + +/* SchedTune tunables for a group of tasks */ +struct schedtune { + /* SchedTune CGroup subsystem */ + struct cgroup_subsys_state css; + + /* Boost group allocated ID */ + int idx; + + /* Boost value for tasks on that SchedTune CGroup */ + int boost; + +}; + +static inline struct schedtune *css_st(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct schedtune, css) : NULL; +} + +static inline struct schedtune *task_schedtune(struct task_struct *tsk) +{ + return css_st(task_css(tsk, schedtune_cgrp_id)); +} + +static inline struct schedtune *parent_st(struct schedtune *st) +{ + return css_st(st->css.parent); +} + +/* + * SchedTune root control group + * The root control group is used to define system-wide boost tuning, + * which is applied to all tasks in the system. + * Task specific boost tuning could be specified by creating and + * configuring a child control group under the root one. + * By default, system-wide boosting is disabled, i.e. no boosting is applied + * to tasks which are not in a child control group.
+ */ +static struct schedtune +root_schedtune = { + .boost = 0, +}; + +/* + * Maximum number of boost groups to support + * When per-task boosting is used we still allow only a limited number of + * boost groups for two main reasons: + * 1. on a real system we usually have only a few classes of workloads which + * it makes sense to boost with different values (e.g. background vs foreground + * tasks, interactive vs low-priority tasks) + * 2. a limited number allows for a simpler and more memory/time efficient + * implementation especially for the computation of the per-CPU boost + * value + */ +#define BOOSTGROUPS_COUNT 4 + +/* Array of configured boostgroups */ +static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = { + &root_schedtune, + NULL, +}; + +/* SchedTune boost groups + * Keep track of all the boost groups which impact a CPU, for example when a + * CPU has two RUNNABLE tasks belonging to two different boost groups and thus + * likely with different boost values. + * Since on each system we expect only a limited number of boost groups, here + * we use a simple array to keep track of the metrics required to compute the + * maximum per-CPU boosting value. + */ +struct boost_groups { + /* Maximum boost value for all RUNNABLE tasks on a CPU */ + unsigned boost_max; + struct { + /* The boost for tasks on that boost group */ + unsigned boost; + /* Count of RUNNABLE tasks on that boost group */ + unsigned tasks; + } group[BOOSTGROUPS_COUNT]; +}; + +/* Boost groups affecting each CPU in the system */ +DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); + +static u64 +boost_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->boost; +} + +static int +boost_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 boost) +{ + struct schedtune *st = css_st(css); + + if (boost < 0 || boost > 100) + return -EINVAL; + + st->boost = boost; + if (css == &root_schedtune.css) + sysctl_sched_cfs_boost = boost; + + return 0; +} + +static struct cftype files[] = { + { + .name = "boost", + .read_u64 = boost_read, + .write_u64 = boost_write, + }, + { } /* terminate */ +}; + +static int +schedtune_boostgroup_init(struct schedtune *st) +{ + /* Keep track of allocated boost groups */ + allocated_group[st->idx] = st; + + return 0; +} + +static int +schedtune_init(void) +{ + struct boost_groups *bg; + int cpu; + + /* Initialize the per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + memset(bg, 0, sizeof(struct boost_groups)); + } + + pr_info(" schedtune configured to support %d boost groups\n", + BOOSTGROUPS_COUNT); + return 0; +} + +static struct cgroup_subsys_state * +schedtune_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct schedtune *st; + int idx; + + if (!parent_css) { + schedtune_init(); + return &root_schedtune.css; + } + + /* Allow only single-level hierarchies */ + if (parent_css != &root_schedtune.css) { + pr_err("Nested SchedTune boosting groups not allowed\n"); + return ERR_PTR(-ENOMEM); + } + + /* Allow only a limited number of boosting groups */ + for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) + if (!allocated_group[idx]) + break; + if (idx == BOOSTGROUPS_COUNT) { + pr_err("Trying to create more than %d SchedTune boosting groups\n", + BOOSTGROUPS_COUNT); + return ERR_PTR(-ENOSPC); + } + + st = kzalloc(sizeof(*st), GFP_KERNEL); + if (!st) + goto out; + + /* Initialize per-CPU boost group support */ + st->idx = idx; + if (schedtune_boostgroup_init(st)) + goto release;
+ + return &st->css; + +release: + kfree(st); +out: + return ERR_PTR(-ENOMEM); +} + +static void +schedtune_boostgroup_release(struct schedtune *st) +{ + /* Keep track of allocated boost groups */ + allocated_group[st->idx] = NULL; +} + +static void +schedtune_css_free(struct cgroup_subsys_state *css) +{ + struct schedtune *st = css_st(css); + + schedtune_boostgroup_release(st); + kfree(st); +} + +struct cgroup_subsys schedtune_cgrp_subsys = { + .css_alloc = schedtune_css_alloc, + .css_free = schedtune_css_free, + .legacy_cftypes = files, + .early_init = 1, +}; + +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + int sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2cf1f8610ded..46822df92c50 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -440,7 +440,11 @@ static struct ctl_table kern_table[] = { .procname = "sched_cfs_boost", .data = &sysctl_sched_cfs_boost, .maxlen = sizeof(sysctl_sched_cfs_boost), +#ifdef CONFIG_CGROUP_SCHEDTUNE + .mode = 0444, +#else .mode = 0644, +#endif .proc_handler = &sysctl_sched_cfs_boost_handler, .extra1 = &zero, .extra2 = &one_hundred,
From 591f354dce7484ff70aaeaf5306734a8b9587c35 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 12:31:35 +0000 Subject: [PATCH 511/813] sched/tune: compute and keep track of per CPU boost value When per task boosting is enabled, we could have multiple RUNNABLE tasks which are concurrently scheduled on the same CPU but each one with a different boost value. For example, we could have a scenario like this: Task SchedTune CGroup Boost Value T1 root 0 T2 low-priority 10 T3 interactive 90 In these conditions we expect a CPU to be configured according to a proper "aggregation" of the required boost values for all the tasks currently scheduled on this CPU. A suitable aggregation function is the one which tracks the MAX boost value for all the tasks RUNNABLE on a CPU. This approach makes it possible to always satisfy the most boost-demanding task while at the same time: a) boosting all the concurrently scheduled tasks, thus reducing potential co-scheduling side-effects on demanding tasks b) reducing the number of frequency switches requested of SchedDVFS, thus being friendlier to architectures with slow frequency switching times Every time a task enters/exits the RQ of a CPU the max boost value should be updated considering all the boost groups currently "affecting" that CPU, i.e. which have at least one RUNNABLE task currently allocated on that CPU. This patch introduces the required support to keep track of the boost groups currently affecting CPUs. Thanks to the limited number of boost groups, a small and memory-efficient per-CPU array of boost group values (cpu_boost_groups) is used, which is updated for each CPU entry by schedtune_boostgroup_update(), but only when a schedtune CGroup boost value is updated. However, this is expected to be a rare operation, perhaps done just once at boot time.
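As a worked example (illustrative numbers, reusing the scenario above): if a CPU currently has RUNNABLE tasks only from the root group (boost 0) and the "interactive" group (boost 90), while the "low-priority" group (boost 10) has no RUNNABLE task on that CPU, the aggregated value is max(0, 90) = 90; as soon as the last interactive task is dequeued, the aggregation falls back to 0.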
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 77 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+)
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 95bc8b87c6d4..f62386893725 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -97,6 +97,67 @@ struct boost_groups { /* Boost groups affecting each CPU in the system */ DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); +static void +schedtune_cpu_update(int cpu) +{ + struct boost_groups *bg; + unsigned boost_max; + int idx; + + bg = &per_cpu(cpu_boost_groups, cpu); + + /* The root boost group is always active */ + boost_max = bg->group[0].boost; + for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) { + /* + * A boost group affects a CPU only if it has + * RUNNABLE tasks on that CPU + */ + if (bg->group[idx].tasks == 0) + continue; + boost_max = max(boost_max, bg->group[idx].boost); + } + + bg->boost_max = boost_max; +} + +static int +schedtune_boostgroup_update(int idx, int boost) +{ + struct boost_groups *bg; + int cur_boost_max; + int old_boost; + int cpu; + + /* Update per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + + /* + * Keep track of current boost values to compute the per CPU + * maximum only when it has been affected by the new value of + * the updated boost group + */ + cur_boost_max = bg->boost_max; + old_boost = bg->group[idx].boost; + + /* Update the boost value of this boost group */ + bg->group[idx].boost = boost; + + /* Check if this update increases the current max */ + if (boost > cur_boost_max && bg->group[idx].tasks) { + bg->boost_max = boost; + continue; + } + + /* Check if this update has decreased the current max */ + if (cur_boost_max == old_boost && old_boost > boost) + schedtune_cpu_update(cpu); + } + + return 0; +} + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -118,6 +179,9 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, if (css == &root_schedtune.css) sysctl_sched_cfs_boost = boost; + /* Update CPU boost */ + schedtune_boostgroup_update(st->idx, st->boost); + return 0; } @@ -133,9 +197,19 @@ static struct cftype files[] = { static int schedtune_boostgroup_init(struct schedtune *st) { + struct boost_groups *bg; + int cpu; + /* Keep track of allocated boost groups */ allocated_group[st->idx] = st; + /* Initialize the per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + bg->group[st->idx].boost = 0; + bg->group[st->idx].tasks = 0; + } + return 0; } @@ -203,6 +277,9 @@ out: static void schedtune_boostgroup_release(struct schedtune *st) { + /* Reset this boost group */ + schedtune_boostgroup_update(st->idx, 0); + /* Keep track of allocated boost groups */ allocated_group[st->idx] = NULL; }
From a9624233c76d852303f0c65bab9701404f5f0504 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 7 Jul 2015 15:33:20 +0100 Subject: [PATCH 512/813] sched/{fair,tune}: track RUNNABLE tasks impact on per CPU boost value When per-task boosting is enabled, every time a task enters/exits a CPU its boost value could impact the currently selected OPP for that CPU. Thus, the "aggregated" boost value for that CPU potentially needs to be updated to match the current maximum boost value among all the tasks currently RUNNABLE on that CPU. This patch introduces the required support to keep track of which boost groups are impacting a CPU.
Each time a task is enqueued to or dequeued from a CPU, its boost group is used to update a per-CPU counter of RUNNABLE tasks on that CPU. Only when the number of runnable tasks for a specific boost group becomes 1 or 0 does the corresponding boost group change its effect on that CPU, specifically: a) boost_group::tasks == 1: this boost group starts impacting the CPU b) boost_group::tasks == 0: this boost group stops impacting the CPU In each of these two conditions the aggregation function: schedtune_cpu_update(cpu) could be required to run in order to identify the new maximum boost value required for the CPU. The proposed patch minimizes the number of times the aggregation function is executed while still providing the required support to always boost a CPU to the maximum boost value required by all its currently RUNNABLE tasks. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 17 +++++++--- kernel/sched/tune.c | 82 +++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/tune.h | 23 +++++++++++++ 3 files changed, 118 insertions(+), 4 deletions(-) create mode 100644 kernel/sched/tune.h
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5ad30f4362e2..abda1d50aa40 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -34,6 +34,7 @@ #include #include "sched.h" +#include "tune.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -4226,6 +4227,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; + schedtune_enqueue_task(p, cpu_of(rq)); + /* * We want to potentially trigger a freq switch * request only for tasks that are waking up; this is @@ -4295,6 +4298,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) { sub_nr_running(rq, 1); + schedtune_dequeue_task(p, cpu_of(rq)); /* * We want to potentially trigger a freq switch @@ -5135,10 +5139,15 @@ schedtune_margin(unsigned long signal, unsigned long boost) } static inline unsigned int -schedtune_cpu_margin(unsigned long util) +schedtune_cpu_margin(unsigned long util, int cpu) { - unsigned int boost = get_sysctl_sched_cfs_boost(); + unsigned int boost; +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_cpu_boost(cpu); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif if (boost == 0) return 0; @@ -5148,7 +5157,7 @@ schedtune_cpu_margin(unsigned long util) #else /* CONFIG_SCHED_TUNE */ static inline unsigned int -schedtune_cpu_margin(unsigned long util) +schedtune_cpu_margin(unsigned long util, int cpu) { return 0; } @@ -5159,7 +5168,7 @@ static inline unsigned long boosted_cpu_util(int cpu) { unsigned long util = cpu_util(cpu); - unsigned long margin = schedtune_cpu_margin(util); + unsigned long margin = schedtune_cpu_margin(util, cpu); return util + margin; }
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index f62386893725..540b945a01ce 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include "sched.h" @@ -158,6 +159,87 @@ schedtune_boostgroup_update(int idx, int boost) return 0; } +static inline void +schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) +{ + struct boost_groups *bg; + int tasks; + + bg = &per_cpu(cpu_boost_groups, cpu); + + /* Update boosted tasks count while avoiding making it negative */ + if (task_count < 0 && bg->group[idx].tasks <= -task_count) + bg->group[idx].tasks = 0; + else + bg->group[idx].tasks += task_count; + + /* Boost group activation or deactivation on that RQ */ + tasks = bg->group[idx].tasks; + if (tasks == 1 || tasks == 0) + schedtune_cpu_update(cpu); +} + +/* + * NOTE: This function must be called while holding the lock on the CPU RQ + */ +void schedtune_enqueue_task(struct task_struct *p, int cpu) +{ + struct schedtune *st; + int idx; + + /* + * When a task is marked PF_EXITING by do_exit() it's going to be + * dequeued and enqueued multiple times in the exit path. + * Thus we avoid any further update, since we do not want to change + * CPU boosting while the task is exiting. + */ + if (p->flags & PF_EXITING) + return; + + /* Get task boost group */ + rcu_read_lock(); + st = task_schedtune(p); + idx = st->idx; + rcu_read_unlock(); + + schedtune_tasks_update(p, cpu, idx, 1); +} + +/* + * NOTE: This function must be called while holding the lock on the CPU RQ + */ +void schedtune_dequeue_task(struct task_struct *p, int cpu) +{ + struct schedtune *st; + int idx; + + /* + * When a task is marked PF_EXITING by do_exit() it's going to be + * dequeued and enqueued multiple times in the exit path. + * Thus we avoid any further update, since we do not want to change + * CPU boosting while the task is exiting. + * The last dequeue will be done by cgroup exit() callback. + */ + if (p->flags & PF_EXITING) + return; + + /* Get task boost group */ + rcu_read_lock(); + st = task_schedtune(p); + idx = st->idx; + rcu_read_unlock(); + + schedtune_tasks_update(p, cpu, idx, -1); +} + +int schedtune_cpu_boost(int cpu) +{ + struct boost_groups *bg; + + bg = &per_cpu(cpu_boost_groups, cpu); + return bg->boost_max; +} + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) {
diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h new file mode 100644 index 000000000000..561b5171a19b --- /dev/null +++ b/kernel/sched/tune.h @@ -0,0 +1,23 @@ + +#ifdef CONFIG_SCHED_TUNE + +#ifdef CONFIG_CGROUP_SCHEDTUNE + +int schedtune_cpu_boost(int cpu); + +void schedtune_enqueue_task(struct task_struct *p, int cpu); +void schedtune_dequeue_task(struct task_struct *p, int cpu); + +#else /* CONFIG_CGROUP_SCHEDTUNE */ + +#define schedtune_enqueue_task(task, cpu) do { } while (0) +#define schedtune_dequeue_task(task, cpu) do { } while (0) + +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + +#else /* CONFIG_SCHED_TUNE */ + +#define schedtune_enqueue_task(task, cpu) do { } while (0) +#define schedtune_dequeue_task(task, cpu) do { } while (0) + +#endif /* CONFIG_SCHED_TUNE */
From 31bdec0b7a32969d76ce711c0d9bfc1296f83ac5 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:31:53 +0000 Subject: [PATCH 513/813] sched/fair: add boosted task utilization The task utilization signal, which is derived from PELT signals and properly scaled to be architecture and frequency invariant, is used by EAS as an estimate of the task requirements in terms of CPU bandwidth. When the energy aware scheduler is in use, this signal affects the CPU selection. Thus, a convenient way to bias that decision, which is also minimally intrusive, is to boost the task utilization signal each time boosting is required. This patch introduces the new function: boosted_task_util(task) which returns a boosted value for the utilization of the specified task. The margin added to the original utilization is: 1. computed based on the "boosting strategy" in use 2. proportional to the boost value defined either by the sysctl interface, when global boosting is in use, or by the "taskgroup" value, when per-task boosting is enabled.
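As a worked example of the resulting margin (illustrative numbers, with SCHED_LOAD_SCALE == 1024): a task with task_util() == 256 and a 50% boost gets a margin of 0.50 * (1024 - 256) = 384, so boosted_task_util() returns 256 + 384 = 640.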
The boosted signal is used by EAS a. transparently, via its integration into the task_fits() function b. explicitly, in the energy-aware wakeup path Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++-- kernel/sched/tune.c | 14 ++++++++++++++ kernel/sched/tune.h | 1 + 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index abda1d50aa40..3a8d3229e29d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5069,11 +5069,13 @@ static inline unsigned long task_util(struct task_struct *p) unsigned int capacity_margin = 1280; /* ~20% margin */ +static inline unsigned long boosted_task_util(struct task_struct *task); + static inline bool __task_fits(struct task_struct *p, int cpu, int util) { unsigned long capacity = capacity_of(cpu); - util += task_util(p); + util += boosted_task_util(p); return (capacity * 1024) > (util * capacity_margin); } @@ -5154,6 +5156,27 @@ schedtune_cpu_margin(unsigned long util, int cpu) return schedtune_margin(util, boost); } +static inline unsigned long +schedtune_task_margin(struct task_struct *task) +{ + unsigned int boost; + unsigned long util; + unsigned long margin; + +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_task_boost(task); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif + if (boost == 0) + return 0; + + util = task_util(task); + margin = schedtune_margin(util, boost); + + return margin; +} + #else /* CONFIG_SCHED_TUNE */ static inline unsigned int @@ -5162,6 +5185,12 @@ schedtune_cpu_margin(unsigned long util, int cpu) return 0; } +static inline unsigned int +schedtune_task_margin(struct task_struct *task) +{ + return 0; +} + #endif /* CONFIG_SCHED_TUNE */ static inline unsigned long @@ -5173,6 +5202,15 @@ boosted_cpu_util(int cpu) return util + margin; } +static inline unsigned long +boosted_task_util(struct task_struct *task) +{ + unsigned long util = task_util(task); + unsigned long margin = schedtune_task_margin(task); + + return util + margin; +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -5407,7 +5445,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) * so prev_cpu will receive a negative bias due to the double * accounting. However, the blocked utilization may be zero. 
*/ - int new_util = cpu_util(i) + task_util(p); + int new_util = cpu_util(i) + boosted_task_util(p); if (new_util > capacity_orig_of(i)) continue; diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 540b945a01ce..87213861bde5 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -240,6 +240,20 @@ int schedtune_cpu_boost(int cpu) return bg->boost_max; } +int schedtune_task_boost(struct task_struct *p) +{ + struct schedtune *st; + int task_boost; + + /* Get task boost value */ + rcu_read_lock(); + st = task_schedtune(p); + task_boost = st->boost; + rcu_read_unlock(); + + return task_boost; +} + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index 561b5171a19b..d756ce7b06e0 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -4,6 +4,7 @@ #ifdef CONFIG_CGROUP_SCHEDTUNE int schedtune_cpu_boost(int cpu); +int schedtune_task_boost(struct task_struct *tsk); void schedtune_enqueue_task(struct task_struct *p, int cpu); void schedtune_dequeue_task(struct task_struct *p, int cpu); From 6b529bb329d8cc179524d4747ebb72b5a4b0353e Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:35:13 +0000 Subject: [PATCH 514/813] sched/fair: keep track of energy/capacity variations

The current EAS implementation does not allow boosting task performance, for
example by running a task at a higher OPP (or on a more capable CPU), even
when that would cost only a "reasonable" increase in energy consumption.

To define how reasonable an energy increase is with respect to a required
boost value, we must define and compute a trade-off between the expected
energy and performance variations. However, the current EAS implementation
considers only energy variations and completely disregards the impact on
performance when selecting a schedule candidate.

This patch extends the eenv energy environment to keep track of both the
energy and the performance deltas implied by the activation of a schedule
candidate. The performance variation is estimated from the different
capacities of the CPUs on which the task could be scheduled: while running
on a CPU with higher capacity (e.g. a higher operating point) the task could
(potentially) complete faster and thus achieve better performance.
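As a worked example (the capacity figures are borrowed from the TC2 energy
model added later in this series, and are illustrative only):

    /*
     * Migrating a task from an A7 at its top OPP to an A15 at its top OPP:
     *
     *   eenv->cap.before = 430                A7 capacity at the source OPP
     *   eenv->cap.after  = 1024               A15 capacity at the dest OPP
     *   eenv->cap.delta  = 1024 - 430 = +594  potential performance gain
     *
     * which later patches weigh against the energy delta (eenv->nrg.diff).
     */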
Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3a8d3229e29d..f8e97f845e72 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4727,6 +4727,16 @@ struct energy_env { int src_cpu; int dst_cpu; int energy; + struct { + int before; + int after; + int diff; + } nrg; + struct { + int before; + int after; + int delta; + } cap; }; /* @@ -4893,6 +4903,22 @@ static int sched_group_energy(struct energy_env *eenv) eenv->sg_cap = sg; cap_idx = find_new_capacity(eenv, sg->sge); + + if (sg->group_weight == 1) { + /* Remove capacity of src CPU (before task move) */ + if (eenv->util_delta == 0 && + cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) { + eenv->cap.before = sg->sge->cap_states[cap_idx].cap; + eenv->cap.delta -= eenv->cap.before; + } + /* Add capacity of dst CPU (after task move) */ + if (eenv->util_delta != 0 && + cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) { + eenv->cap.after = sg->sge->cap_states[cap_idx].cap; + eenv->cap.delta += eenv->cap.after; + } + } + idle_idx = group_idle_state(sg); group_util = group_norm_util(eenv, sg); sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) @@ -4941,6 +4967,8 @@ static int energy_diff(struct energy_env *eenv) .util_delta = 0, .src_cpu = eenv->src_cpu, .dst_cpu = eenv->dst_cpu, + .nrg = { 0, 0, 0 }, + .cap = { 0, 0, 0 }, }; if (eenv->src_cpu == eenv->dst_cpu) @@ -4962,13 +4990,21 @@ static int energy_diff(struct energy_env *eenv) return 0; /* Invalid result abort */ energy_before += eenv_before.energy; + /* Keep track of SRC cpu (before) capacity */ + eenv->cap.before = eenv_before.cap.before; + eenv->cap.delta = eenv_before.cap.delta; + if (sched_group_energy(eenv)) return 0; /* Invalid result abort */ energy_after += eenv->energy; } } while (sg = sg->next, sg != sd->groups); - return energy_after-energy_before; + eenv->nrg.before = energy_before; + eenv->nrg.after = energy_after; + eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; + + return eenv->nrg.diff; } /* From 641e534d8a189177f76e6d72a72009929840b04a Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 12 Jan 2016 18:12:13 +0000 Subject: [PATCH 515/813] sched/tune: add support to compute normalized energy The current EAS implementation considers only energy variations, while it disregards completely the impact on performance for the selection of a certain schedule candidate. Moreover, it also makes its decision based on the "absolute" value of expected energy variations. In order to properly define a trade-off strategy between increased energy consumption and performances benefits it is required to compare energy variations with performance variations. Thus, both performance and energy metrics must be expressed in comparable units. While the performance variations are expressed in terms of capacity deltas, which are defined in the range [0..SCHED_LOAD_SCALE], the same scale is not used for energy variations. This patch introduces the function: schedtune_normalize_energy(energy_diff) which returns a normalized value in the same range of capacity variations, i.e. [0..SCHED_LOAD_SCALE]. A proper set of energy normalization constants are required to provide a fast division by a constant during the normalziation of the energy_diff. The value of these constants depends on the specific energy model and topology of a target device. 
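For reference, the fast division relies on the kernel's
<linux/reciprocal_div.h> helpers; a minimal sketch, where max_power,
min_power and abs_delta stand for the values computed by the code below:

    #include <linux/reciprocal_div.h>

    /* Boot time, once: pre-compute constants for dividing by the range */
    struct reciprocal_value rdiv = reciprocal_value(max_power - min_power);

    /* Hot path: scale and divide without a hardware division */
    u32 norm = reciprocal_divide(abs_delta << SCHED_LOAD_SHIFT, rdiv);
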
Thus, this patch provides also the required support for the computation at boot time of this set of variables. Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 321 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/tune.h | 7 + 2 files changed, 328 insertions(+) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 87213861bde5..1a8ba5a6d99b 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -1,7 +1,9 @@ #include #include +#include #include #include +#include #include #include @@ -9,6 +11,84 @@ unsigned int sysctl_sched_cfs_boost __read_mostly; +/* + * System energy normalization constants + */ +static struct target_nrg { + unsigned long min_power; + unsigned long max_power; + struct reciprocal_value rdiv; +} schedtune_target_nrg; + +/* Performance Boost region (B) threshold params */ +static int perf_boost_idx; + +/* Performance Constraint region (C) threshold params */ +static int perf_constrain_idx; + +/** + * Performance-Energy (P-E) Space thresholds constants + */ +struct threshold_params { + int nrg_gain; + int cap_gain; +}; + +/* + * System specific P-E space thresholds constants + */ +static struct threshold_params +threshold_gains[] = { + { 0, 4 }, /* >= 0% */ + { 0, 4 }, /* >= 10% */ + { 1, 4 }, /* >= 20% */ + { 2, 4 }, /* >= 30% */ + { 3, 4 }, /* >= 40% */ + { 4, 3 }, /* >= 50% */ + { 4, 2 }, /* >= 60% */ + { 4, 1 }, /* >= 70% */ + { 4, 0 }, /* >= 80% */ + { 4, 0 } /* >= 90% */ +}; + +static int +__schedtune_accept_deltas(int nrg_delta, int cap_delta, + int perf_boost_idx, int perf_constrain_idx) +{ + int payoff = -INT_MAX; + + /* Performance Boost (B) region */ + if (nrg_delta > 0 && cap_delta > 0) { + /* + * Evaluate "Performance Boost" vs "Energy Increase" + * payoff criteria: + * cap_delta / nrg_delta < cap_gain / nrg_gain + * which is: + * nrg_delta * cap_gain > cap_delta * nrg_gain + */ + payoff = nrg_delta * threshold_gains[perf_boost_idx].cap_gain; + payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain; + return payoff; + } + + /* Performance Constraint (C) region */ + if (nrg_delta < 0 && cap_delta < 0) { + /* + * Evaluate "Performance Boost" vs "Energy Increase" + * payoff criteria: + * cap_delta / nrg_delta > cap_gain / nrg_gain + * which is: + * cap_delta * nrg_gain > nrg_delta * cap_gain + */ + payoff = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain; + payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain; + return payoff; + } + + /* Default: reject schedule candidate */ + return payoff; +} + #ifdef CONFIG_CGROUP_SCHEDTUNE /* @@ -26,6 +106,11 @@ struct schedtune { /* Boost value for tasks on that SchedTune CGroup */ int boost; + /* Performance Boost (B) region threshold params */ + int perf_boost_idx; + + /* Performance Constraint (C) region threshold params */ + int perf_constrain_idx; }; static inline struct schedtune *css_st(struct cgroup_subsys_state *css) @@ -55,8 +140,37 @@ static inline struct schedtune *parent_st(struct schedtune *st) static struct schedtune root_schedtune = { .boost = 0, + .perf_boost_idx = 0, + .perf_constrain_idx = 0, }; +int +schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task) +{ + struct schedtune *ct; + int perf_boost_idx; + int perf_constrain_idx; + + /* Optimal (O) region */ + if (nrg_delta < 0 && cap_delta > 0) + return INT_MAX; + + /* Suboptimal (S) region */ + if (nrg_delta > 0 && cap_delta < 0) + return -INT_MAX; + + /* Get task specific perf Boost/Constraints indexes */ + rcu_read_lock(); + ct = task_schedtune(task); + perf_boost_idx 
= ct->perf_boost_idx; + perf_constrain_idx = ct->perf_constrain_idx; + rcu_read_unlock(); + + return __schedtune_accept_deltas(nrg_delta, cap_delta, + perf_boost_idx, perf_constrain_idx); +} + /* * Maximum number of boost groups to support * When per-task boosting is used we still allow only limited number of @@ -396,6 +510,24 @@ struct cgroup_subsys schedtune_cgrp_subsys = { .early_init = 1, }; +#else /* CONFIG_CGROUP_SCHEDTUNE */ + +int +schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task) +{ + /* Optimal (O) region */ + if (nrg_delta < 0 && cap_delta > 0) + return INT_MAX; + + /* Suboptimal (S) region */ + if (nrg_delta > 0 && cap_delta < 0) + return -INT_MAX; + + return __schedtune_accept_deltas(nrg_delta, cap_delta, + perf_boost_idx, perf_constrain_idx); +} + #endif /* CONFIG_CGROUP_SCHEDTUNE */ int @@ -408,5 +540,194 @@ sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, if (ret || !write) return ret; + /* Performance Boost (B) region threshold params */ + perf_boost_idx = sysctl_sched_cfs_boost; + perf_boost_idx /= 10; + + /* Performance Constraint (C) region threshold params */ + perf_constrain_idx = 100 - sysctl_sched_cfs_boost; + perf_constrain_idx /= 10; + return 0; } + +/* + * System energy normalization + * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], + * corresponding to the specified energy variation. + */ +int +schedtune_normalize_energy(int energy_diff) +{ + u32 normalized_nrg; + int max_delta; + +#ifdef CONFIG_SCHED_DEBUG + /* Check for boundaries */ + max_delta = schedtune_target_nrg.max_power; + max_delta -= schedtune_target_nrg.min_power; + WARN_ON(abs(energy_diff) >= max_delta); +#endif + + /* Do scaling using positive numbers to increase the range */ + normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; + + /* Scale by energy magnitude */ + normalized_nrg <<= SCHED_LOAD_SHIFT; + + /* Normalize on max energy for target platform */ + normalized_nrg = reciprocal_divide( + normalized_nrg, schedtune_target_nrg.rdiv); + + return (energy_diff < 0) ? 
-normalized_nrg : normalized_nrg; +} + +#ifdef CONFIG_SCHED_DEBUG +static void +schedtune_test_nrg(unsigned long delta_pwr) +{ + unsigned long test_delta_pwr; + unsigned long test_norm_pwr; + int idx; + + /* + * Check normalization constants using some constant system + * energy values + */ + pr_info("schedtune: verify normalization constants...\n"); + for (idx = 0; idx < 6; ++idx) { + test_delta_pwr = delta_pwr >> idx; + + /* Normalize on max energy for target platform */ + test_norm_pwr = reciprocal_divide( + test_delta_pwr << SCHED_LOAD_SHIFT, + schedtune_target_nrg.rdiv); + + pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n", + idx, test_delta_pwr, test_norm_pwr); + } +} +#else +#define schedtune_test_nrg(delta_pwr) +#endif + +/* + * Compute the min/max power consumption of a cluster and all its CPUs + */ +static void +schedtune_add_cluster_nrg( + struct sched_domain *sd, + struct sched_group *sg, + struct target_nrg *ste) +{ + struct sched_domain *sd2; + struct sched_group *sg2; + + struct cpumask *cluster_cpus; + char str[32]; + + unsigned long min_pwr; + unsigned long max_pwr; + int cpu; + + /* Get Cluster energy using EM data for the first CPU */ + cluster_cpus = sched_group_cpus(sg); + snprintf(str, 32, "CLUSTER[%*pbl]", + cpumask_pr_args(cluster_cpus)); + + min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power; + max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power; + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + str, min_pwr, max_pwr); + + /* + * Keep track of this cluster's energy in the computation of the + * overall system energy + */ + ste->min_power += min_pwr; + ste->max_power += max_pwr; + + /* Get CPU energy using EM data for each CPU in the group */ + for_each_cpu(cpu, cluster_cpus) { + /* Get a SD view for the specific CPU */ + for_each_domain(cpu, sd2) { + /* Get the CPU group */ + sg2 = sd2->groups; + min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power; + max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power; + + ste->min_power += min_pwr; + ste->max_power += max_pwr; + + snprintf(str, 32, "CPU[%d]", cpu); + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + str, min_pwr, max_pwr); + + /* + * Assume we have EM data only at the CPU and + * the upper CLUSTER level + */ + BUG_ON(!cpumask_equal( + sched_group_cpus(sg), + sched_group_cpus(sd2->parent->groups) + )); + break; + } + } +} + +/* + * Initialize the constants required to compute normalized energy. + * The values of these constants depends on the EM data for the specific + * target system and topology. + * Thus, this function is expected to be called by the code + * that bind the EM to the topology information. + */ +static int +schedtune_init_late(void) +{ + struct target_nrg *ste = &schedtune_target_nrg; + unsigned long delta_pwr = 0; + struct sched_domain *sd; + struct sched_group *sg; + + pr_info("schedtune: init normalization constants...\n"); + ste->max_power = 0; + ste->min_power = 0; + + rcu_read_lock(); + + /* + * When EAS is in use, we always have a pointer to the highest SD + * which provides EM data. 
*/ + sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask))); + if (!sd) { + pr_info("schedtune: no energy model data\n"); + goto nodata; + } + + sg = sd->groups; + do { + schedtune_add_cluster_nrg(sd, sg, ste); + } while (sg = sg->next, sg != sd->groups); + + rcu_read_unlock(); + + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + "SYSTEM", ste->min_power, ste->max_power); + + /* Compute normalization constants */ + delta_pwr = ste->max_power - ste->min_power; + ste->rdiv = reciprocal_value(delta_pwr); + pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n", + ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2); + + schedtune_test_nrg(delta_pwr); + return 0; + +nodata: + rcu_read_unlock(); + return -EINVAL; +} +late_initcall(schedtune_init_late); diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index d756ce7b06e0..f7273a5d994a 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -16,9 +16,16 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu); #endif /* CONFIG_CGROUP_SCHEDTUNE */ +int schedtune_normalize_energy(int energy); +int schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task); + #else /* CONFIG_SCHED_TUNE */ #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) +#define schedtune_normalize_energy(energy) energy +#define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta + #endif /* CONFIG_SCHED_TUNE */ From 45668ef621bd3c3635865d92807b6578582246e7 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 15 Jan 2016 15:48:03 +0000 Subject: [PATCH 516/813] sched/fair: filter energy_diff() based on energy_payoff value

Once SchedTune support is enabled and the CPU bandwidth demand of a task is
boosted, we can expect an increased energy consumption, balanced by a
corresponding increase in task performance. However, the current
implementation of the energy_diff() function accepts all and _only_ the
schedule candidates which result in a reduced expected system energy, which
works against the boosting strategy.

This patch links the energy_diff() function with the "energy payoff" engine
provided by SchedTune. The energy variation computed by the energy_diff()
function is now filtered using the SchedTune support to evaluate the energy
payoff for a boosted task.

With this patch, the energy_diff() function reports as an "acceptable
schedule candidate" only those candidates which correspond to a positive
energy payoff.
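A worked example of the filtering arithmetic, using the threshold_gains
table from the previous patch (the deltas are made-up, already-normalized
values): with a global boost of 60%, perf_boost_idx = 60 / 10 = 6 selects
{ .nrg_gain = 4, .cap_gain = 2 }, so in the Performance Boost region:

    payoff = nrg_delta * cap_gain - cap_delta * nrg_gain

    nrg_delta = +100, cap_delta = +300:  payoff = 200 - 1200 = -1000
        -> energy_diff() returns +1000, candidate rejected
    nrg_delta = +100, cap_delta = +30:   payoff = 200 -  120 =   +80
        -> energy_diff() returns -80, candidate accepted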
Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 47 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f8e97f845e72..5ccc9f13a295 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4727,9 +4727,12 @@ struct energy_env { int src_cpu; int dst_cpu; int energy; + int payoff; + struct task_struct *task; struct { int before; int after; + int delta; int diff; } nrg; struct { @@ -4950,6 +4953,44 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu) return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); } +#ifdef CONFIG_SCHED_TUNE +static int energy_diff_evaluate(struct energy_env *eenv) +{ + unsigned int boost; + int nrg_delta; + + /* Return energy diff when boost margin is 0 */ +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_task_boost(eenv->task); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif + if (boost == 0) + return eenv->nrg.diff; + + /* Compute normalized energy diff */ + nrg_delta = schedtune_normalize_energy(eenv->nrg.diff); + eenv->nrg.delta = nrg_delta; + + eenv->payoff = schedtune_accept_deltas( + eenv->nrg.delta, + eenv->cap.delta, + eenv->task); + + /* + * When SchedTune is enabled, the energy_diff() function will return + * the computed energy payoff value. Since the energy_diff() return + * value is expected to be negative by its callers, this evaluation + * function return a negative value each time the evaluation return a + * positive payoff, which is the condition for the acceptance of + * a scheduling decision + */ + return -eenv->payoff; +} +#else /* CONFIG_SCHED_TUNE */ +#define energy_diff_evaluate(eenv) eenv->nrg.diff +#endif + /* * energy_diff(): Estimate the energy impact of changing the utilization * distribution. 
eenv specifies the change: utilisation amount, source, and @@ -4967,7 +5008,7 @@ static int energy_diff(struct energy_env *eenv) .util_delta = 0, .src_cpu = eenv->src_cpu, .dst_cpu = eenv->dst_cpu, - .nrg = { 0, 0, 0 }, + .nrg = { 0, 0, 0, 0}, .cap = { 0, 0, 0 }, }; @@ -5003,8 +5044,9 @@ static int energy_diff(struct energy_env *eenv) eenv->nrg.before = energy_before; eenv->nrg.after = energy_after; eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; + eenv->payoff = 0; - return eenv->nrg.diff; + return energy_diff_evaluate(eenv); } /* @@ -5502,6 +5544,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) .util_delta = task_util(p), .src_cpu = task_cpu(p), .dst_cpu = target_cpu, + .task = p, }; /* Not enough spare capacity on previous cpu */ From f4886c38ff4052b20b0bb22210308c8d07a048a9 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 9 Nov 2015 12:06:24 +0000 Subject: [PATCH 517/813] DEBUG: sched: add tracepoint for cpu/freq scale invariance Signed-off-by: Juri Lelli --- include/trace/events/sched.h | 24 ++++++++++++++++++++++++ kernel/sched/fair.c | 1 + 2 files changed, 25 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index d34eba74af27..67e465f8b159 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -611,6 +611,30 @@ TRACE_EVENT(sched_wake_idle_without_ipi, TP_printk("cpu=%d", __entry->cpu) ); + +TRACE_EVENT(sched_contrib_scale_f, + + TP_PROTO(int cpu, unsigned long freq_scale_factor, + unsigned long cpu_scale_factor), + + TP_ARGS(cpu, freq_scale_factor, cpu_scale_factor), + + TP_STRUCT__entry( + __field(int, cpu) + __field(unsigned long, freq_scale_factor) + __field(unsigned long, cpu_scale_factor) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->freq_scale_factor = freq_scale_factor; + __entry->cpu_scale_factor = cpu_scale_factor; + ), + + TP_printk("cpu=%d freq_scale_factor=%lu cpu_scale_factor=%lu", + __entry->cpu, __entry->freq_scale_factor, + __entry->cpu_scale_factor) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5ccc9f13a295..7444383c032f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2585,6 +2585,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, scale_freq = arch_scale_freq_capacity(NULL, cpu); scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu); /* delta_w is the amount already accumulated against our next period */ delta_w = sa->period_contrib; From c604a5fa23600c15999d0a8e75669bb58f906fe7 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 9 Nov 2015 12:07:27 +0000 Subject: [PATCH 518/813] DEBUG: sched: add tracepoint for task load/util signals Signed-off-by: Juri Lelli --- include/trace/events/sched.h | 43 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 3 +++ 2 files changed, 46 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 67e465f8b159..cef05f0abea2 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -635,6 +635,49 @@ TRACE_EVENT(sched_contrib_scale_f, __entry->cpu, __entry->freq_scale_factor, __entry->cpu_scale_factor) ); + +/* + * Tracepoint for accounting sched averages for tasks. 
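+ * Emitted from update_load_avg() for task entities; the payload mirrors
+ * struct sched_avg: the load/util averages plus the raw sums and the
+ * period contribution.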
+ */ +TRACE_EVENT(sched_load_avg_task, + + TP_PROTO(struct task_struct *tsk, struct sched_avg *avg), + + TP_ARGS(tsk, avg), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, cpu ) + __field( unsigned long, load_avg ) + __field( unsigned long, util_avg ) + __field( u64, load_sum ) + __field( u32, util_sum ) + __field( u32, period_contrib ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->cpu = task_cpu(tsk); + __entry->load_avg = avg->load_avg; + __entry->util_avg = avg->util_avg; + __entry->load_sum = avg->load_sum; + __entry->util_sum = avg->util_sum; + __entry->period_contrib = avg->period_contrib; + ), + + TP_printk("comm=%s pid=%d cpu=%d load_avg=%lu util_avg=%lu load_sum=%llu" + " util_sum=%u period_contrib=%u", + __entry->comm, + __entry->pid, + __entry->cpu, + __entry->load_avg, + __entry->util_avg, + (u64)__entry->load_sum, + (u32)__entry->util_sum, + (u32)__entry->period_contrib) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7444383c032f..7b55cc057288 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2746,6 +2746,9 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) update_tg_load_avg(cfs_rq, 0); + + if (entity_is_task(se)) + trace_sched_load_avg_task(task_of(se), &se->avg); } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) From 69852bd9a380fdd066e156d3511db645340b9e8c Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 9 Nov 2015 12:07:48 +0000 Subject: [PATCH 519/813] DEBUG: sched: add tracepoint for CPU load/util signals Signed-off-by: Juri Lelli --- include/trace/events/sched.h | 25 +++++++++++++++++++++++++ kernel/sched/fair.c | 1 + 2 files changed, 26 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index cef05f0abea2..5a8951cb4a10 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -678,6 +678,31 @@ TRACE_EVENT(sched_load_avg_task, (u32)__entry->util_sum, (u32)__entry->period_contrib) ); + +/* + * Tracepoint for accounting sched averages for cpus. 
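+ * Fires on every entity update, next to sched_load_avg_task, but reports
+ * the cfs_rq aggregate signals rather than a single task's.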
+ */ +TRACE_EVENT(sched_load_avg_cpu, + + TP_PROTO(int cpu, struct cfs_rq *cfs_rq), + + TP_ARGS(cpu, cfs_rq), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( unsigned long, load_avg ) + __field( unsigned long, util_avg ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->load_avg = cfs_rq->avg.load_avg; + __entry->util_avg = cfs_rq->avg.util_avg; + ), + + TP_printk("cpu=%d load_avg=%lu util_avg=%lu", + __entry->cpu, __entry->load_avg, __entry->util_avg) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7b55cc057288..7ca5003005d8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2749,6 +2749,7 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (entity_is_task(se)) trace_sched_load_avg_task(task_of(se), &se->avg); + trace_sched_load_avg_cpu(cpu, cfs_rq); } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) From bd818ccdeef84bef9fed1cdbd143018a89b63454 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 30 Apr 2015 17:35:23 +0100 Subject: [PATCH 520/813] DEBUG: sched,cpufreq: add cpu_capacity change tracepoint This is useful when we want to compare cpu utilization and cpu curr capacity side by side. Signed-off-by: Juri Lelli --- drivers/cpufreq/cpufreq.c | 4 ++++ include/linux/sched.h | 2 ++ include/trace/events/power.h | 7 +++++++ kernel/sched/fair.c | 11 +++++++++++ kernel/sched/sched.h | 11 ----------- 5 files changed, 24 insertions(+), 11 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 2b99bc305040..7264820e6443 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -29,6 +29,7 @@ #include #include #include +#include #include static LIST_HEAD(cpufreq_policy_list); @@ -473,6 +474,7 @@ static void cpufreq_notify_post_transition(struct cpufreq_policy *policy, void cpufreq_freq_transition_begin(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) { + int cpu; /* * Catch double invocations of _begin() which lead to self-deadlock. @@ -501,6 +503,8 @@ wait: spin_unlock(&policy->transition_lock); scale_freq_capacity(policy, freqs); + for_each_cpu(cpu, policy->cpus) + trace_cpu_capacity(capacity_curr_of(cpu), cpu); cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE); } diff --git a/include/linux/sched.h b/include/linux/sched.h index c707c613664f..951422587dd9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1048,6 +1048,8 @@ struct sched_group_energy { struct capacity_state *cap_states; /* ptr to capacity state array */ }; +unsigned long capacity_curr_of(int cpu); + struct sched_group; struct sched_domain { diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 9af0d898016a..8924cc2b4ca8 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -145,6 +145,13 @@ TRACE_EVENT(cpu_frequency_limits, (unsigned long)__entry->cpu_id) ); +DEFINE_EVENT(cpu, cpu_capacity, + + TP_PROTO(unsigned int capacity, unsigned int cpu_id), + + TP_ARGS(capacity, cpu_id) +); + TRACE_EVENT(device_pm_callback_start, TP_PROTO(struct device *dev, const char *pm_ops, int event), diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7ca5003005d8..7d1302d85818 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4719,6 +4719,17 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. 
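+ * Example: cpu_capacity_orig = 430 with the frequency scale factor at
+ * 512/1024 (half the maximum OPP) yields 430 * 512 >> 10 = 215.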
+ */ +unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3f52226bb6f3..a537f1864dd0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1512,17 +1512,6 @@ static inline unsigned long cpu_util(int cpu) return __cpu_util(cpu, 0); } -/* - * Returns the current capacity of cpu after applying both - * cpu and freq scaling. - */ -static inline unsigned long capacity_curr_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig * - arch_scale_freq_capacity(NULL, cpu) - >> SCHED_CAPACITY_SHIFT; -} - #endif #ifdef CONFIG_CPU_FREQ_GOV_SCHED From 8321f3874e0fea10cc8dd9ed46b05d9f00eee3a6 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 16:25:50 +0000 Subject: [PATCH 521/813] DEBUG: sched: add energy procfs interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch makes the energy data available via procfs. The related files are placed as sub-directory named 'energy' inside the /proc/sys/kernel/sched_domain/cpuX/domainY/groupZ directory for those cpu/domain/group tuples which have energy information. The following example depicts the contents of /proc/sys/kernel/sched_domain/cpu0/domain0/group[01] for a system which has energy information attached to domain level 0. ├── cpu0 │ ├── domain0 │ │ ├── busy_factor │ │ ├── busy_idx │ │ ├── cache_nice_tries │ │ ├── flags │ │ ├── forkexec_idx │ │ ├── group0 │ │ │ └── energy │ │ │ ├── cap_states │ │ │ ├── idle_states │ │ │ ├── nr_cap_states │ │ │ └── nr_idle_states │ │ ├── group1 │ │ │ └── energy │ │ │ ├── cap_states │ │ │ ├── idle_states │ │ │ ├── nr_cap_states │ │ │ └── nr_idle_states │ │ ├── idle_idx │ │ ├── imbalance_pct │ │ ├── max_interval │ │ ├── max_newidle_lb_cost │ │ ├── min_interval │ │ ├── name │ │ ├── newidle_idx │ │ └── wake_idx │ └── domain1 │ ├── busy_factor │ ├── busy_idx │ ├── cache_nice_tries │ ├── flags │ ├── forkexec_idx │ ├── idle_idx │ ├── imbalance_pct │ ├── max_interval │ ├── max_newidle_lb_cost │ ├── min_interval │ ├── name │ ├── newidle_idx │ └── wake_idx The files 'nr_idle_states' and 'nr_cap_states' contain a scalar value whereas 'idle_states' and 'cap_states' contain a vector of power consumption at this idle state respectively (compute capacity, power consumption) at this capacity state. 
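For example, assuming the TC2 cluster data added later in this series (and
eliding the exact formatting produced by proc_doulongvec_minmax), the A7
cluster's files would read roughly:

    .../groupZ/energy/nr_cap_states
        -> 8
    .../groupZ/energy/cap_states
        -> 150 2967 172 2792 ...   (eight {capacity, power} pairs, flattened)
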
Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 67 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d388d070a2e..1f007c545c8c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5415,10 +5415,61 @@ set_table_entry(struct ctl_table *entry, } } +static struct ctl_table * +sd_alloc_ctl_energy_table(struct sched_group_energy *sge) +{ + struct ctl_table *table = sd_alloc_ctl_entry(5); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "nr_idle_states", &sge->nr_idle_states, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[1], "idle_states", &sge->idle_states[0].power, + sge->nr_idle_states*sizeof(struct idle_state), 0644, + proc_doulongvec_minmax, false); + set_table_entry(&table[2], "nr_cap_states", &sge->nr_cap_states, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[3], "cap_states", &sge->cap_states[0].cap, + sge->nr_cap_states*sizeof(struct capacity_state), 0644, + proc_doulongvec_minmax, false); + + return table; +} + +static struct ctl_table * +sd_alloc_ctl_group_table(struct sched_group *sg) +{ + struct ctl_table *table = sd_alloc_ctl_entry(2); + + if (table == NULL) + return NULL; + + table->procname = kstrdup("energy", GFP_KERNEL); + table->mode = 0555; + table->child = sd_alloc_ctl_energy_table((struct sched_group_energy *)sg->sge); + + return table; +} + static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(14); + struct ctl_table *table; + unsigned int nr_entries = 14; + + int i = 0; + struct sched_group *sg = sd->groups; + + if (sg->sge) { + int nr_sgs = 0; + + do {} while (nr_sgs++, sg = sg->next, sg != sd->groups); + + nr_entries += nr_sgs; + } + + table = sd_alloc_ctl_entry(nr_entries); if (table == NULL) return NULL; @@ -5451,7 +5502,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(long), 0644, proc_doulongvec_minmax, false); set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[13] is terminator */ + sg = sd->groups; + if (sg->sge) { + char buf[32]; + struct ctl_table *entry = &table[13]; + + do { + snprintf(buf, 32, "group%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_group_table(sg); + } while (entry++, i++, sg = sg->next, sg != sd->groups); + } + /* &table[nr_entries-1] is terminator */ return table; } From 9b8dc08c8883d423a97a591cee139937b6f5b050 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 22 Jun 2015 13:49:07 +0100 Subject: [PATCH 522/813] DEBUG: schedtune: add tracepoint for SchedTune configuration update Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 21 +++++++++++++++++++++ kernel/sched/tune.c | 4 ++++ 2 files changed, 25 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 5a8951cb4a10..f4ce46f0a4c4 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -703,6 +703,27 @@ TRACE_EVENT(sched_load_avg_cpu, TP_printk("cpu=%d load_avg=%lu util_avg=%lu", __entry->cpu, __entry->load_avg, __entry->util_avg) ); + +/* + * Tracepoint for sched_tune_config settings + */ +TRACE_EVENT(sched_tune_config, + + TP_PROTO(int boost), + + TP_ARGS(boost), + + TP_STRUCT__entry( + __field( int, boost ) + ), + + TP_fast_assign( + __entry->boost = boost; + ), + + TP_printk("boost=%d ", __entry->boost) +); 
+ #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 1a8ba5a6d99b..f5f4c57efb9e 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -7,6 +7,8 @@ #include #include +#include + #include "sched.h" unsigned int sysctl_sched_cfs_boost __read_mostly; @@ -392,6 +394,8 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, /* Update CPU boost */ schedtune_boostgroup_update(st->idx, st->boost); + trace_sched_tune_config(st->boost); + return 0; } From 37437e7367e443498a23f49144471a4f3594121d Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 22 Jun 2015 13:51:07 +0100 Subject: [PATCH 523/813] DEBUG: schedtune: add tracepoint for CPU boost signal Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 27 +++++++++++++++++++++++++++ kernel/sched/fair.c | 2 ++ 2 files changed, 29 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index f4ce46f0a4c4..d5563004033f 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -724,6 +724,33 @@ TRACE_EVENT(sched_tune_config, TP_printk("boost=%d ", __entry->boost) ); +/* + * Tracepoint for accounting CPU boosted utilization + */ +TRACE_EVENT(sched_boost_cpu, + + TP_PROTO(int cpu, unsigned long util, unsigned long margin), + + TP_ARGS(cpu, util, margin), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( unsigned long, util ) + __field( unsigned long, margin ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->util = util; + __entry->margin = margin; + ), + + TP_printk("cpu=%d util=%lu margin=%lu", + __entry->cpu, + __entry->util, + __entry->margin) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7d1302d85818..4fd9ddad3f69 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5293,6 +5293,8 @@ boosted_cpu_util(int cpu) unsigned long util = cpu_util(cpu); unsigned long margin = schedtune_cpu_margin(util, cpu); + trace_sched_boost_cpu(cpu, util, margin); + return util + margin; } From a727f6b626be0e22bfed24ab17180fde3dcbe2f2 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 24 Jun 2015 15:36:08 +0100 Subject: [PATCH 524/813] DEBUG: schedtune: add tracepoint for schedtune_tasks_update() values Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 62 ++++++++++++++++++++++++++++++++++++ kernel/sched/tune.c | 12 ++++++- 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index d5563004033f..9bb0a264ad7d 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -751,6 +751,68 @@ TRACE_EVENT(sched_boost_cpu, __entry->margin) ); +/* + * Tracepoint for schedtune_tasks_update + */ +TRACE_EVENT(sched_tune_tasks_update, + + TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx, + unsigned int boost, unsigned int max_boost), + + TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, cpu ) + __field( int, tasks ) + __field( int, idx ) + __field( unsigned int, boost ) + __field( unsigned int, max_boost ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->cpu = cpu; + __entry->tasks = tasks; + __entry->idx = idx; + __entry->boost = boost; + __entry->max_boost = max_boost; + ), + + TP_printk("pid=%d comm=%s " + "cpu=%d 
tasks=%d idx=%d boost=%u max_boost=%u", + __entry->pid, __entry->comm, + __entry->cpu, __entry->tasks, __entry->idx, + __entry->boost, __entry->max_boost) +); + +/* + * Tracepoint for schedtune_boostgroup_update + */ +TRACE_EVENT(sched_tune_boostgroup_update, + + TP_PROTO(int cpu, int variation, int max_boost), + + TP_ARGS(cpu, variation, max_boost), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( int, variation ) + __field( int, max_boost ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->variation = variation; + __entry->max_boost = max_boost; + ), + + TP_printk("cpu=%d variation=%d max_boost=%d", + __entry->cpu, __entry->variation, __entry->max_boost) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index f5f4c57efb9e..7a434f2394e7 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -264,12 +264,18 @@ schedtune_boostgroup_update(int idx, int boost) /* Check if this update increase current max */ if (boost > cur_boost_max && bg->group[idx].tasks) { bg->boost_max = boost; + trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max); continue; } /* Check if this update has decreased current max */ - if (cur_boost_max == old_boost && old_boost > boost) + if (cur_boost_max == old_boost && old_boost > boost) { schedtune_cpu_update(cpu); + trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max); + continue; + } + + trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max); } return 0; @@ -293,6 +299,10 @@ schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) tasks = bg->group[idx].tasks; if (tasks == 1 || tasks == 0) schedtune_cpu_update(cpu); + + trace_sched_tune_tasks_update(p, cpu, tasks, idx, + bg->group[idx].boost, bg->boost_max); + } /* From 75f2b9bac833f006ec434e1ec1346909e9b13bb4 Mon Sep 17 00:00:00 2001 From: Joseph Lo Date: Mon, 22 Apr 2013 14:39:18 +0800 Subject: [PATCH 525/813] CHROMIUM: sched: update the average of nr_running Doing a Exponential moving average per nr_running++/-- does not guarantee a fixed sample rate which induces errors if there are lots of threads being enqueued/dequeued from the rq (Linpack mt). Instead of keeping track of the avg, the scheduler now keeps track of the integral of nr_running and allows the readers to perform filtering on top. 
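A reader-side sketch of such a filter (hypothetical, not part of this patch;
dt is the caller's own measure of elapsed task-clock nanoseconds over the
sampling window):

    u64 i0, i1, avg;

    i0 = nr_running_integral(cpu);
    /* ... wait one sampling period (dt ns of task clock) ... */
    i1 = nr_running_integral(cpu);

    /* undo the NR_AVE_SCALE (FSHIFT) fixed-point scaling */
    avg = div64_u64(i1 - i0, dt) >> FSHIFT;
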
Original-author: Sai Charan Gurrappadi Change-Id: Id946654f32fa8be0eaf9d8fa7c9a8039b5ef9fab Signed-off-by: Joseph Lo Signed-off-by: Andrew Bresticker Reviewed-on: https://chromium-review.googlesource.com/174694 Reviewed-on: https://chromium-review.googlesource.com/272853 [jstultz: fwdported to 4.4] Signed-off-by: John Stultz --- include/linux/sched.h | 3 +++ kernel/sched/core.c | 30 ++++++++++++++++++++++++++ kernel/sched/sched.h | 49 +++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 80 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 951422587dd9..f1a28bafe7ea 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -173,6 +173,9 @@ extern bool single_task_running(void); extern unsigned long nr_iowait(void); extern unsigned long nr_iowait_cpu(int cpu); extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); +#ifdef CONFIG_CPU_QUIET +extern u64 nr_running_integral(unsigned int cpu); +#endif extern void calc_global_load(unsigned long ticks); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1f007c545c8c..07f389a0f22f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2768,6 +2768,36 @@ unsigned long nr_iowait_cpu(int cpu) return atomic_read(&this->nr_iowait); } +#ifdef CONFIG_CPU_QUIET +u64 nr_running_integral(unsigned int cpu) +{ + unsigned int seqcnt; + u64 integral; + struct rq *q; + + if (cpu >= nr_cpu_ids) + return 0; + + q = cpu_rq(cpu); + + /* + * Update average to avoid reading stalled value if there were + * no run-queue changes for a long time. On the other hand if + * the changes are happening right now, just read current value + * directly. + */ + + seqcnt = read_seqcount_begin(&q->ave_seqcnt); + integral = do_nr_running_integral(q); + if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) { + read_seqcount_begin(&q->ave_seqcnt); + integral = q->nr_running_integral; + } + + return integral; +} +#endif + void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) { struct rq *rq = this_rq(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a537f1864dd0..1a605bbec684 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -594,6 +594,14 @@ struct rq { #ifdef CONFIG_NO_HZ_FULL unsigned long last_sched_tick; #endif + +#ifdef CONFIG_CPU_QUIET + /* time-based average load */ + u64 nr_last_stamp; + u64 nr_running_integral; + seqcount_t ave_seqcnt; +#endif + /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; @@ -1353,7 +1361,7 @@ extern void init_entity_runnable_average(struct sched_entity *se); extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc); -static inline void add_nr_running(struct rq *rq, unsigned count) +static inline void __add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; @@ -1381,11 +1389,48 @@ static inline void add_nr_running(struct rq *rq, unsigned count) } } -static inline void sub_nr_running(struct rq *rq, unsigned count) +static inline void __sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; } +#ifdef CONFIG_CPU_QUIET +#define NR_AVE_SCALE(x) ((x) << FSHIFT) +static inline u64 do_nr_running_integral(struct rq *rq) +{ + s64 nr, deltax; + u64 nr_running_integral = rq->nr_running_integral; + + deltax = rq->clock_task - rq->nr_last_stamp; + nr = NR_AVE_SCALE(rq->nr_running); + + nr_running_integral += nr * deltax; + + return nr_running_integral; +} + +static inline void add_nr_running(struct rq *rq, unsigned count) +{ + 
write_seqcount_begin(&rq->ave_seqcnt); + rq->nr_running_integral = do_nr_running_integral(rq); + rq->nr_last_stamp = rq->clock_task; + __add_nr_running(rq, count); + write_seqcount_end(&rq->ave_seqcnt); +} + +static inline void sub_nr_running(struct rq *rq, unsigned count) +{ + write_seqcount_begin(&rq->ave_seqcnt); + rq->nr_running_integral = do_nr_running_integral(rq); + rq->nr_last_stamp = rq->clock_task; + __sub_nr_running(rq, count); + write_seqcount_end(&rq->ave_seqcnt); +} +#else +#define add_nr_running __add_nr_running +#define sub_nr_running __sub_nr_running +#endif + static inline void rq_last_tick_reset(struct rq *rq) { #ifdef CONFIG_NO_HZ_FULL From 2585150a548a272fe87f32cc3c7a536863ef27b7 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 17:16:41 +0000 Subject: [PATCH 526/813] arm: topology: Define TC2 energy and provide it to the scheduler This patch is only here to be able to test provisioning of energy related data from an arch topology shim layer to the scheduler. Since there is no code today which deals with extracting energy related data from the dtb or acpi, and process it in the topology shim layer, the content of the sched_group_energy structures as well as the idle_state and capacity_state arrays are hard-coded here. This patch defines the sched_group_energy structure as well as the idle_state and capacity_state array for the cluster (relates to sched groups (sgs) in DIE sched domain level) and for the core (relates to sgs in MC sd level) for a Cortex A7 as well as for a Cortex A15. It further provides related implementations of the sched_domain_energy_f functions (cpu_cluster_energy() and cpu_core_energy()). To be able to propagate this information from the topology shim layer to the scheduler, the elements of the arm_topology[] table have been provisioned with the appropriate sched_domain_energy_f functions. Change-Id: I8c014bbd04f6a1d57892be9bfa16affe07948dcf cc: Russell King Signed-off-by: Dietmar Eggemann --- arch/arm/kernel/topology.c | 126 ++++++++++++++++++++++++++++++++++++- 1 file changed, 123 insertions(+), 3 deletions(-) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 0308342def8c..f5941004efba 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -287,6 +287,127 @@ void store_cpu_topology(unsigned int cpuid) cpu_topology[cpuid].socket_id, mpidr); } +/* + * ARM TC2 specific energy cost model data. There are no unit requirements for + * the data. Data can be normalized to any reference point, but the + * normalization must be consistent. That is, one bogo-joule/watt must be the + * same quantity for all data, but we don't care what it is. 
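+ * In the tables below capacities are scaled so that the A15 at its top
+ * OPP (1.2 GHz) is 1024, i.e. SCHED_CAPACITY_SCALE.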
+ */ +static struct idle_state idle_states_cluster_a7[] = { + { .power = 25 }, /* arch_cpu_idle() (active idle) = WFI */ + { .power = 25 }, /* WFI */ + { .power = 10 }, /* cluster-sleep-l */ + }; + +static struct idle_state idle_states_cluster_a15[] = { + { .power = 70 }, /* arch_cpu_idle() (active idle) = WFI */ + { .power = 70 }, /* WFI */ + { .power = 25 }, /* cluster-sleep-b */ + }; + +static struct capacity_state cap_states_cluster_a7[] = { + /* Cluster only power */ + { .cap = 150, .power = 2967, }, /* 350 MHz */ + { .cap = 172, .power = 2792, }, /* 400 MHz */ + { .cap = 215, .power = 2810, }, /* 500 MHz */ + { .cap = 258, .power = 2815, }, /* 600 MHz */ + { .cap = 301, .power = 2919, }, /* 700 MHz */ + { .cap = 344, .power = 2847, }, /* 800 MHz */ + { .cap = 387, .power = 3917, }, /* 900 MHz */ + { .cap = 430, .power = 4905, }, /* 1000 MHz */ + }; + +static struct capacity_state cap_states_cluster_a15[] = { + /* Cluster only power */ + { .cap = 426, .power = 7920, }, /* 500 MHz */ + { .cap = 512, .power = 8165, }, /* 600 MHz */ + { .cap = 597, .power = 8172, }, /* 700 MHz */ + { .cap = 682, .power = 8195, }, /* 800 MHz */ + { .cap = 768, .power = 8265, }, /* 900 MHz */ + { .cap = 853, .power = 8446, }, /* 1000 MHz */ + { .cap = 938, .power = 11426, }, /* 1100 MHz */ + { .cap = 1024, .power = 15200, }, /* 1200 MHz */ + }; + +static struct sched_group_energy energy_cluster_a7 = { + .nr_idle_states = ARRAY_SIZE(idle_states_cluster_a7), + .idle_states = idle_states_cluster_a7, + .nr_cap_states = ARRAY_SIZE(cap_states_cluster_a7), + .cap_states = cap_states_cluster_a7, +}; + +static struct sched_group_energy energy_cluster_a15 = { + .nr_idle_states = ARRAY_SIZE(idle_states_cluster_a15), + .idle_states = idle_states_cluster_a15, + .nr_cap_states = ARRAY_SIZE(cap_states_cluster_a15), + .cap_states = cap_states_cluster_a15, +}; + +static struct idle_state idle_states_core_a7[] = { + { .power = 0 }, /* arch_cpu_idle (active idle) = WFI */ + { .power = 0 }, /* WFI */ + { .power = 0 }, /* cluster-sleep-l */ + }; + +static struct idle_state idle_states_core_a15[] = { + { .power = 0 }, /* arch_cpu_idle (active idle) = WFI */ + { .power = 0 }, /* WFI */ + { .power = 0 }, /* cluster-sleep-b */ + }; + +static struct capacity_state cap_states_core_a7[] = { + /* Power per cpu */ + { .cap = 150, .power = 187, }, /* 350 MHz */ + { .cap = 172, .power = 275, }, /* 400 MHz */ + { .cap = 215, .power = 334, }, /* 500 MHz */ + { .cap = 258, .power = 407, }, /* 600 MHz */ + { .cap = 301, .power = 447, }, /* 700 MHz */ + { .cap = 344, .power = 549, }, /* 800 MHz */ + { .cap = 387, .power = 761, }, /* 900 MHz */ + { .cap = 430, .power = 1024, }, /* 1000 MHz */ + }; + +static struct capacity_state cap_states_core_a15[] = { + /* Power per cpu */ + { .cap = 426, .power = 2021, }, /* 500 MHz */ + { .cap = 512, .power = 2312, }, /* 600 MHz */ + { .cap = 597, .power = 2756, }, /* 700 MHz */ + { .cap = 682, .power = 3125, }, /* 800 MHz */ + { .cap = 768, .power = 3524, }, /* 900 MHz */ + { .cap = 853, .power = 3846, }, /* 1000 MHz */ + { .cap = 938, .power = 5177, }, /* 1100 MHz */ + { .cap = 1024, .power = 6997, }, /* 1200 MHz */ + }; + +static struct sched_group_energy energy_core_a7 = { + .nr_idle_states = ARRAY_SIZE(idle_states_core_a7), + .idle_states = idle_states_core_a7, + .nr_cap_states = ARRAY_SIZE(cap_states_core_a7), + .cap_states = cap_states_core_a7, +}; + +static struct sched_group_energy energy_core_a15 = { + .nr_idle_states = ARRAY_SIZE(idle_states_core_a15), + .idle_states = idle_states_core_a15, + 
.nr_cap_states = ARRAY_SIZE(cap_states_core_a15), + .cap_states = cap_states_core_a15, +}; + +/* sd energy functions */ +static inline +const struct sched_group_energy * const cpu_cluster_energy(int cpu) +{ + return cpu_topology[cpu].socket_id ? &energy_cluster_a7 : + &energy_cluster_a15; +} + +static inline +const struct sched_group_energy * const cpu_core_energy(int cpu) +{ + return cpu_topology[cpu].socket_id ? &energy_core_a7 : + &energy_core_a15; +} + static inline int cpu_corepower_flags(void) { return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \ @@ -295,10 +416,9 @@ static inline int cpu_corepower_flags(void) static struct sched_domain_topology_level arm_topology[] = { #ifdef CONFIG_SCHED_MC - { cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) }, - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) }, #endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) }, { NULL, }, }; From 88c457224370ea2047597fdfaf93329c21c12cf9 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:43:37 +0000 Subject: [PATCH 527/813] DEBUG: sched/tune: add tracepoint for task boost signal Change-Id: I545d3bf5569fc41c0fa70f51dff9a19c11d532ee Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 30 ++++++++++++++++++++++++++++++ kernel/sched/fair.c | 2 ++ 2 files changed, 32 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 9bb0a264ad7d..564e090fc005 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -813,6 +813,36 @@ TRACE_EVENT(sched_tune_boostgroup_update, __entry->cpu, __entry->variation, __entry->max_boost) ); +/* + * Tracepoint for accounting task boosted utilization + */ +TRACE_EVENT(sched_boost_task, + + TP_PROTO(struct task_struct *tsk, unsigned long util, unsigned long margin), + + TP_ARGS(tsk, util, margin), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( unsigned long, util ) + __field( unsigned long, margin ) + + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->util = util; + __entry->margin = margin; + ), + + TP_printk("comm=%s pid=%d util=%lu margin=%lu", + __entry->comm, __entry->pid, + __entry->util, + __entry->margin) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4fd9ddad3f69..1781c634e215 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5304,6 +5304,8 @@ boosted_task_util(struct task_struct *task) unsigned long util = task_util(task); unsigned long margin = schedtune_task_margin(task); + trace_sched_boost_task(task, util, margin); + return util + margin; } From 486c7afa76ec1b1249b1c8f9d387561bc86c268f Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:47:21 +0000 Subject: [PATCH 528/813] DEBUG: sched/tune: add tracepoint for energy_diff() values Change-Id: Id8fafbd85f6d81248f322e073ee790a7ceec0bf7 Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 57 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 11 ++++++- 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 564e090fc005..5dcbc803e233 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -843,6 +843,63 @@ 
TRACE_EVENT(sched_boost_task, __entry->margin) ); +/* + * Tracepoint for accounting sched group energy + */ +TRACE_EVENT(sched_energy_diff, + + TP_PROTO(struct task_struct *tsk, int scpu, int dcpu, int udelta, + int nrgb, int nrga, int nrgd, int capb, int capa, int capd, + int nrgn, int nrgp), + + TP_ARGS(tsk, scpu, dcpu, udelta, + nrgb, nrga, nrgd, capb, capa, capd, + nrgn, nrgp), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, scpu ) + __field( int, dcpu ) + __field( int, udelta ) + __field( int, nrgb ) + __field( int, nrga ) + __field( int, nrgd ) + __field( int, capb ) + __field( int, capa ) + __field( int, capd ) + __field( int, nrgn ) + __field( int, nrgp ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->scpu = scpu; + __entry->dcpu = dcpu; + __entry->udelta = udelta; + __entry->nrgb = nrgb; + __entry->nrga = nrga; + __entry->nrgd = nrgd; + __entry->capb = capb; + __entry->capa = capa; + __entry->capd = capd; + __entry->nrgn = nrgn; + __entry->nrgp = nrgp; + ), + + TP_printk("pid=%d comm=%s " + "src_cpu=%d dst_cpu=%d usage_delta=%d " + "nrg_before=%d nrg_after=%d nrg_diff=%d " + "cap_before=%d cap_after=%d cap_delta=%d " + "nrg_delta=%d nrg_payoff=%d", + __entry->pid, __entry->comm, + __entry->scpu, __entry->dcpu, __entry->udelta, + __entry->nrgb, __entry->nrga, __entry->nrgd, + __entry->capb, __entry->capa, __entry->capd, + __entry->nrgn, __entry->nrgp) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1781c634e215..2f0546ef0af1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5019,6 +5019,7 @@ static int energy_diff(struct energy_env *eenv) struct sched_domain *sd; struct sched_group *sg; int sd_cpu = -1, energy_before = 0, energy_after = 0; + int result; struct energy_env eenv_before = { .util_delta = 0, @@ -5062,7 +5063,15 @@ static int energy_diff(struct energy_env *eenv) eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; eenv->payoff = 0; - return energy_diff_evaluate(eenv); + result = energy_diff_evaluate(eenv); + + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + eenv->nrg.delta, eenv->payoff); + + return result; } /* From 13a60dc148bd6646e09349a2d424e65c8c30c195 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 20 Jan 2016 14:06:05 +0000 Subject: [PATCH 529/813] DEBUG: sched/tune: add tracepoint on P-E space filtering Change-Id: I31dfed67c0486713b88efb75df767329f2802e06 Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 35 +++++++++++++++++++++++++++++++++++ kernel/sched/tune.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 5dcbc803e233..abbfaeedd3ae 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -900,6 +900,41 @@ TRACE_EVENT(sched_energy_diff, __entry->nrgn, __entry->nrgp) ); +/* + * Tracepoint for schedtune_tasks_update + */ +TRACE_EVENT(sched_tune_filter, + + TP_PROTO(int nrg_delta, int cap_delta, + int nrg_gain, int cap_gain, + int payoff, int region), + + TP_ARGS(nrg_delta, cap_delta, nrg_gain, cap_gain, payoff, region), + + TP_STRUCT__entry( + __field( int, nrg_delta ) + __field( int, cap_delta ) + __field( int, nrg_gain ) + __field( 
int,	cap_gain	)
		__field( int,		payoff		)
		__field( int,		region		)
	),

	TP_fast_assign(
		__entry->nrg_delta	= nrg_delta;
		__entry->cap_delta	= cap_delta;
		__entry->nrg_gain	= nrg_gain;
		__entry->cap_gain	= cap_gain;
		__entry->payoff		= payoff;
		__entry->region		= region;
	),

	TP_printk("nrg_delta=%d cap_delta=%d nrg_gain=%d cap_gain=%d payoff=%d region=%d",
		__entry->nrg_delta, __entry->cap_delta,
		__entry->nrg_gain, __entry->cap_gain,
		__entry->payoff, __entry->region)
);

 #endif /* _TRACE_SCHED_H */

 /* This part must be outside protection */
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index 7a434f2394e7..b40d40dc3c49 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -70,6 +70,13 @@ __schedtune_accept_deltas(int nrg_delta, int cap_delta,
 	 */
 	payoff = nrg_delta * threshold_gains[perf_boost_idx].cap_gain;
 	payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain;
+
+	trace_sched_tune_filter(
+		nrg_delta, cap_delta,
+		threshold_gains[perf_boost_idx].nrg_gain,
+		threshold_gains[perf_boost_idx].cap_gain,
+		payoff, 8);
+
 	return payoff;
 }
@@ -84,6 +91,13 @@ __schedtune_accept_deltas(int nrg_delta, int cap_delta,
 	 */
 	payoff = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain;
 	payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain;
+
+	trace_sched_tune_filter(
+		nrg_delta, cap_delta,
+		threshold_gains[perf_constrain_idx].nrg_gain,
+		threshold_gains[perf_constrain_idx].cap_gain,
+		payoff, 6);
+
 	return payoff;
 }
@@ -155,12 +169,16 @@ schedtune_accept_deltas(int nrg_delta, int cap_delta,
 	int perf_constrain_idx;

 	/* Optimal (O) region */
-	if (nrg_delta < 0 && cap_delta > 0)
+	if (nrg_delta < 0 && cap_delta > 0) {
+		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
 		return INT_MAX;
+	}

 	/* Suboptimal (S) region */
-	if (nrg_delta > 0 && cap_delta < 0)
+	if (nrg_delta > 0 && cap_delta < 0) {
+		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
 		return -INT_MAX;
+	}

 	/* Get task specific perf Boost/Constraints indexes */
 	rcu_read_lock();
@@ -531,12 +549,16 @@ schedtune_accept_deltas(int nrg_delta, int cap_delta,
 			struct task_struct *task)
 {
 	/* Optimal (O) region */
-	if (nrg_delta < 0 && cap_delta > 0)
+	if (nrg_delta < 0 && cap_delta > 0) {
+		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
 		return INT_MAX;
+	}

 	/* Suboptimal (S) region */
-	if (nrg_delta > 0 && cap_delta < 0)
+	if (nrg_delta > 0 && cap_delta < 0) {
+		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
 		return -INT_MAX;
+	}

 	return __schedtune_accept_deltas(nrg_delta, cap_delta,
 			perf_boost_idx, perf_constrain_idx);

From f0ba6a5d0c42e689bab7ed76738ac13046e7bd1a Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Fri, 22 Jul 2016 11:35:59 +0100
Subject: [PATCH 530/813] FIXUP: sched: fix build for non-SMP target

Currently the build for a single-core (e.g. user-mode) Linux is broken
and this configuration is required (at least) to run some network tests.

The main issues with the current code on single-core systems are:

1. {se,rq}::sched_avg is not available nor maintained for !SMP systems.
   This means that load and utilisation signals are NOT available in
   single-core systems. All the EAS code depends on these signals.
2. sched_group_energy is also SMP dependent. Again, this means that all
   the EAS setup and preparation code (energy model initialization) has
   to be properly guarded/disabled for !SMP systems.
3. SchedFreq depends on the utilization signal, which is not available
   on !SMP systems.
4. SchedTune is useless on single-core systems if SchedFreq is not
   available.
5. WALT machinery is not required on single-core systems.

This patch addresses all these issues by enforcing some constraints for
single-core systems:

a) WALT, SchedTune and SchedFreq now depend on SMP.
b) The default governor for !SMP systems is INTERACTIVE.
c) The energy model initialisation/build functions are compiled only
   for SMP systems.
d) Other minor code re-arrangements and CONFIG_SMP guarding to enable
   single-core builds.

Signed-off-by: Patrick Bellasi
---
 drivers/cpufreq/Kconfig      |  1 +
 include/linux/sched_energy.h |  8 ++++++++
 include/trace/events/sched.h |  4 ++++
 init/Kconfig                 |  1 +
 kernel/sched/Makefile        |  4 ++--
 kernel/sched/fair.c          | 33 +++++++++++++++++++++++++++++----
 kernel/sched/sched.h         |  3 +--
 7 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 298509ff9c34..d43c401ff190 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -218,6 +218,7 @@ config CPU_FREQ_GOV_CONSERVATIVE
 config CPU_FREQ_GOV_SCHED
 	bool "'sched' cpufreq governor"
 	depends on CPU_FREQ
+	depends on SMP
 	select CPU_FREQ_GOV_COMMON
 	help
 	  'sched' - this governor scales cpu frequency from the
diff --git a/include/linux/sched_energy.h b/include/linux/sched_energy.h
index a3f1627ac609..1daf3e1f98a7 100644
--- a/include/linux/sched_energy.h
+++ b/include/linux/sched_energy.h
@@ -29,8 +29,16 @@
 #define for_each_possible_sd_level(level)		    \
 	for (level = 0; level < NR_SD_LEVELS; level++)

+#ifdef CONFIG_SMP
+
 extern struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];

 void init_sched_energy_costs(void);

+#else
+
+#define init_sched_energy_costs() do { } while (0)
+
+#endif /* CONFIG_SMP */
+
 #endif
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index abbfaeedd3ae..11898fb48c01 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -636,6 +636,8 @@ TRACE_EVENT(sched_contrib_scale_f,
 		  __entry->cpu_scale_factor)
 );

+#ifdef CONFIG_SMP
+
 /*
  * Tracepoint for accounting sched averages for tasks.
  */
@@ -935,6 +937,8 @@ TRACE_EVENT(sched_tune_filter,
 		__entry->payoff, __entry->region)
 );

+#endif /* CONFIG_SMP */
+
 #endif /* _TRACE_SCHED_H */

 /* This part must be outside protection */
diff --git a/init/Kconfig b/init/Kconfig
index 944bff00c170..71f3ce810734 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1256,6 +1256,7 @@ config SCHED_AUTOGROUP

 config SCHED_TUNE
 	bool "Boosting for CFS tasks (EXPERIMENTAL)"
+	depends on SMP
 	help
 	  This option enables the system-wide support for task boosting.
When this support is enabled a new sysctl interface is exposed to diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index c6a85f813dfd..174762d8695b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -12,9 +12,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o energy.o +obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2f0546ef0af1..95f2ba4859d1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4166,8 +4166,14 @@ static inline void hrtick_update(struct rq *rq) } #endif +#ifdef CONFIG_SMP +static bool cpu_overutilized(int cpu); static inline unsigned long boosted_cpu_util(int cpu); +#else +#define boosted_cpu_util(cpu) cpu_util(cpu) +#endif +#ifdef CONFIG_SMP static void update_capacity_of(int cpu) { unsigned long req_cap; @@ -4180,8 +4186,7 @@ static void update_capacity_of(int cpu) req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, req_cap); } - -static bool cpu_overutilized(int cpu); +#endif /* * The enqueue_task method is called before nr_running is @@ -4193,8 +4198,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; +#ifdef CONFIG_SMP int task_new = flags & ENQUEUE_WAKEUP_NEW; int task_wakeup = flags & ENQUEUE_WAKEUP; +#endif for_each_sched_entity(se) { if (se->on_rq) @@ -4226,8 +4233,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) { + if (!se) add_nr_running(rq, 1); + +#ifdef CONFIG_SMP + + if (!se) { if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; @@ -4244,6 +4255,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (task_new || task_wakeup) update_capacity_of(cpu_of(rq)); } +#endif /* CONFIG_SMP */ + hrtick_update(rq); } @@ -4301,8 +4314,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) { + if (!se) sub_nr_running(rq, 1); + +#ifdef CONFIG_SMP + + if (!se) { schedtune_dequeue_task(p, cpu_of(rq)); /* @@ -4320,6 +4337,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) set_cfs_cpu_capacity(cpu_of(rq), false, 0); } } + +#endif /* CONFIG_SMP */ + hrtick_update(rq); } @@ -5713,6 +5733,8 @@ static void task_dead_fair(struct task_struct *p) { remove_entity_load_avg(&p->se); } +#else +#define task_fits_max(p, cpu) true #endif /* CONFIG_SMP */ static unsigned long @@ -8737,10 +8759,13 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); +#ifdef CONFIG_SMP if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) rq->rd->overutilized = true; rq->misfit_task = !task_fits_max(curr, rq->cpu); +#endif + } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1a605bbec684..5cd947923e11 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1277,6 +1277,7 @@ extern const struct sched_class idle_sched_class; #ifdef CONFIG_SMP +extern void 
init_max_cpu_capacity(struct max_cpu_capacity *mcc); extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); @@ -1359,8 +1360,6 @@ unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); -extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc); - static inline void __add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; From 24884e54340e35d43bd09af0b12caef57a63458f Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Mon, 1 Aug 2016 11:34:05 +0100 Subject: [PATCH 531/813] sched/cpufreq_sched: Consolidated update Contains: sched/cpufreq_sched: use shorter throttle for raising OPP Avoid cases where a brief drop in load causes a change to a low OPP for the full throttle period. Use a shorter throttle period for raising OPP than for lowering OPP. sched-freq: Fix handling of max/min frequency This reverts commit 9726142608f5b3bf5df4280243c9d324e692a510. Change-Id: Ia78095354f7ad9492f00deb509a2b45112361eda sched/cpufreq: Increasing throttle_down_nsec to 50ms Change-Id: I2d8969cf2a64fa719b9dd86f43f9dd14b1ff84fe sched-freq: make throttle times tunable Change-Id: I127879645367425b273441d7f0306bb15d5633cb Signed-off-by: Srinath Sridharan Signed-off-by: Todd Kjos Signed-off-by: Juri Lelli [jstultz: Fwdported to 4.4] Signed-off-by: John Stultz --- drivers/cpufreq/Kconfig | 2 +- kernel/sched/cpufreq_sched.c | 175 +++++++++++++++++++++++++++++++---- 2 files changed, 160 insertions(+), 17 deletions(-) diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index d43c401ff190..e93877f38cae 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -114,7 +114,7 @@ config CPU_FREQ_DEFAULT_GOV_INTERACTIVE config CPU_FREQ_DEFAULT_GOV_SCHED bool "sched" - select CPU_FREQ_GOV_SCHED + select CPU_FREQ_GOV_INTERACTIVE help Use the CPUfreq governor 'sched' as default. This scales cpu frequency using CPU utilization estimates from the diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index e1d208e101ed..3f8c67a3ea0f 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -19,7 +19,8 @@ #include "sched.h" -#define THROTTLE_NSEC 50000000 /* 50ms default */ +#define THROTTLE_DOWN_NSEC 50000000 /* 50ms default */ +#define THROTTLE_UP_NSEC 500000 /* 500us default */ struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE; static bool __read_mostly cpufreq_driver_slow; @@ -33,8 +34,10 @@ DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); /** * gov_data - per-policy data internal to the governor - * @throttle: next throttling period expiry. Derived from throttle_nsec - * @throttle_nsec: throttle period length in nanoseconds + * @up_throttle: next throttling period expiry if increasing OPP + * @down_throttle: next throttling period expiry if decreasing OPP + * @up_throttle_nsec: throttle period length in nanoseconds if increasing OPP + * @down_throttle_nsec: throttle period length in nanoseconds if decreasing OPP * @task: worker thread for dvfs transition that may block/sleep * @irq_work: callback used to wake up worker thread * @requested_freq: last frequency requested by the sched governor @@ -48,11 +51,14 @@ DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); * call down_write(policy->rwsem). 
*/ struct gov_data { - ktime_t throttle; - unsigned int throttle_nsec; + ktime_t up_throttle; + ktime_t down_throttle; + unsigned int up_throttle_nsec; + unsigned int down_throttle_nsec; struct task_struct *task; struct irq_work irq_work; unsigned int requested_freq; + int max; }; static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, @@ -66,25 +72,29 @@ static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L); - gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec); + gd->up_throttle = ktime_add_ns(ktime_get(), gd->up_throttle_nsec); + gd->down_throttle = ktime_add_ns(ktime_get(), gd->down_throttle_nsec); up_write(&policy->rwsem); } -static bool finish_last_request(struct gov_data *gd) +static bool finish_last_request(struct gov_data *gd, unsigned int cur_freq) { ktime_t now = ktime_get(); - if (ktime_after(now, gd->throttle)) + ktime_t throttle = gd->requested_freq < cur_freq ? + gd->down_throttle : gd->up_throttle; + + if (ktime_after(now, throttle)) return false; while (1) { - int usec_left = ktime_to_ns(ktime_sub(gd->throttle, now)); + int usec_left = ktime_to_ns(ktime_sub(throttle, now)); usec_left /= NSEC_PER_USEC; trace_cpufreq_sched_throttled(usec_left); usleep_range(usec_left, usec_left + 100); now = ktime_get(); - if (ktime_after(now, gd->throttle)) + if (ktime_after(now, throttle)) return true; } } @@ -128,7 +138,7 @@ static int cpufreq_sched_thread(void *data) * if the frequency thread sleeps while waiting to be * unthrottled, start over to check for a newer request */ - if (finish_last_request(gd)) + if (finish_last_request(gd, policy->cur)) continue; last_request = new_request; cpufreq_sched_try_driver_target(policy, new_request); @@ -183,16 +193,21 @@ static void update_fdomain_capacity_request(int cpu) } /* Convert the new maximum capacity request into a cpu frequency */ - freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT; + freq_new = capacity * gd->max >> SCHED_CAPACITY_SHIFT; if (cpufreq_frequency_table_target(policy, policy->freq_table, freq_new, CPUFREQ_RELATION_L, &index_new)) goto out; freq_new = policy->freq_table[index_new].frequency; + if (freq_new > policy->max) + freq_new = policy->max; + + if (freq_new < policy->min) + freq_new = policy->min; + trace_cpufreq_sched_request_opp(cpu, capacity, freq_new, gd->requested_freq); - if (freq_new == gd->requested_freq) goto out; @@ -246,10 +261,17 @@ static inline void clear_sched_freq(void) static_key_slow_dec(&__sched_freq); } +static struct attribute_group sched_attr_group_gov_pol; +static struct attribute_group *get_sysfs_attr(void) +{ + return &sched_attr_group_gov_pol; +} + static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) { struct gov_data *gd; int cpu; + int rc; for_each_cpu(cpu, policy->cpus) memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0, @@ -259,11 +281,20 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) if (!gd) return -ENOMEM; - gd->throttle_nsec = policy->cpuinfo.transition_latency ? + gd->up_throttle_nsec = policy->cpuinfo.transition_latency ? 
policy->cpuinfo.transition_latency : - THROTTLE_NSEC; + THROTTLE_UP_NSEC; + gd->down_throttle_nsec = THROTTLE_DOWN_NSEC; pr_debug("%s: throttle threshold = %u [ns]\n", - __func__, gd->throttle_nsec); + __func__, gd->up_throttle_nsec); + + gd->max = policy->max; + + rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + if (rc) { + pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc); + goto err; + } if (cpufreq_driver_is_slow()) { cpufreq_driver_slow = true; @@ -301,6 +332,8 @@ static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) put_task_struct(gd->task); } + sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + policy->governor_data = NULL; kfree(gd); @@ -317,6 +350,32 @@ static int cpufreq_sched_start(struct cpufreq_policy *policy) return 0; } +static void cpufreq_sched_limits(struct cpufreq_policy *policy) +{ + struct gov_data *gd; + + pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n", + policy->cpu, policy->min, policy->max, + policy->cur); + + if (!down_write_trylock(&policy->rwsem)) + return; + /* + * Need to keep track of highest max frequency for + * capacity calculations + */ + gd = policy->governor_data; + if (gd->max < policy->max) + gd->max = policy->max; + + if (policy->max < policy->cur) + __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); + else if (policy->min > policy->cur) + __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); + + up_write(&policy->rwsem); +} + static int cpufreq_sched_stop(struct cpufreq_policy *policy) { int cpu; @@ -340,11 +399,95 @@ static int cpufreq_sched_setup(struct cpufreq_policy *policy, case CPUFREQ_GOV_STOP: return cpufreq_sched_stop(policy); case CPUFREQ_GOV_LIMITS: + cpufreq_sched_limits(policy); break; } return 0; } +/* Tunables */ +static ssize_t show_up_throttle_nsec(struct gov_data *gd, char *buf) +{ + return sprintf(buf, "%u\n", gd->up_throttle_nsec); +} + +static ssize_t store_up_throttle_nsec(struct gov_data *gd, + const char *buf, size_t count) +{ + int ret; + long unsigned int val; + + ret = kstrtoul(buf, 0, &val); + if (ret < 0) + return ret; + gd->up_throttle_nsec = val; + return count; +} + +static ssize_t show_down_throttle_nsec(struct gov_data *gd, char *buf) +{ + return sprintf(buf, "%u\n", gd->down_throttle_nsec); +} + +static ssize_t store_down_throttle_nsec(struct gov_data *gd, + const char *buf, size_t count) +{ + int ret; + long unsigned int val; + + ret = kstrtoul(buf, 0, &val); + if (ret < 0) + return ret; + gd->down_throttle_nsec = val; + return count; +} + +/* + * Create show/store routines + * - sys: One governor instance for complete SYSTEM + * - pol: One governor instance per struct cpufreq_policy + */ +#define show_gov_pol_sys(file_name) \ +static ssize_t show_##file_name##_gov_pol \ +(struct cpufreq_policy *policy, char *buf) \ +{ \ + return show_##file_name(policy->governor_data, buf); \ +} + +#define store_gov_pol_sys(file_name) \ +static ssize_t store_##file_name##_gov_pol \ +(struct cpufreq_policy *policy, const char *buf, size_t count) \ +{ \ + return store_##file_name(policy->governor_data, buf, count); \ +} + +#define gov_pol_attr_rw(_name) \ + static struct freq_attr _name##_gov_pol = \ + __ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol) + +#define show_store_gov_pol_sys(file_name) \ + show_gov_pol_sys(file_name); \ + store_gov_pol_sys(file_name) +#define tunable_handlers(file_name) \ + show_gov_pol_sys(file_name); \ + store_gov_pol_sys(file_name); \ + 
gov_pol_attr_rw(file_name) + +tunable_handlers(down_throttle_nsec); +tunable_handlers(up_throttle_nsec); + +/* Per policy governor instance */ +static struct attribute *sched_attributes_gov_pol[] = { + &up_throttle_nsec_gov_pol.attr, + &down_throttle_nsec_gov_pol.attr, + NULL, +}; + +static struct attribute_group sched_attr_group_gov_pol = { + .attrs = sched_attributes_gov_pol, + .name = "sched", +}; + #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED static #endif From 7169e3a0733b59fc82debcd0f1da5ac7b8ecdfdb Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Thu, 14 Jul 2016 09:57:29 +0100 Subject: [PATCH 532/813] sched: EAS: take cstate into account when selecting idle core Introduce a new sysctl for this option, 'sched_cstate_aware'. When this is enabled, select_idle_sibling in CFS is modified to choose the idle CPU in the sibling group which has the lowest idle state index - idle state indexes are assumed to increase as sleep depth and hence wakeup latency increase. In this way, we attempt to minimise wakeup latency when an idle CPU is required. Signed-off-by: Srinath Sridharan Includes: sched: EAS: fix select_idle_sibling when sysctl_sched_cstate_aware is enabled, best_idle cpu will not be chosen in the original flow because it will goto done directly Bug: 30107557 Change-Id: Ie09c2e3960cafbb976f8d472747faefab3b4d6ac Signed-off-by: martin_liu --- include/linux/sched/sysctl.h | 1 + kernel/sched/fair.c | 55 +++++++++++++++++++++++++++--------- kernel/sysctl.c | 7 +++++ 3 files changed, 50 insertions(+), 13 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 4479e48c7712..7d021393b0da 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -39,6 +39,7 @@ extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; +extern unsigned int sysctl_sched_cstate_aware; enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 95f2ba4859d1..4742a17c7d53 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -51,6 +51,7 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; +unsigned int sysctl_sched_cstate_aware = 1; /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -5489,15 +5490,20 @@ static int select_idle_sibling(struct task_struct *p, int target) struct sched_domain *sd; struct sched_group *sg; int i = task_cpu(p); + int best_idle = -1; + int best_idle_cstate = -1; + int best_idle_capacity = INT_MAX; - if (idle_cpu(target)) - return target; + if (!sysctl_sched_cstate_aware) { + if (idle_cpu(target)) + return target; - /* - * If the prevous cpu is cache affine and idle, don't be stupid. - */ - if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) - return i; + /* + * If the prevous cpu is cache affine and idle, don't be stupid. + */ + if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) + return i; + } /* * Otherwise, iterate the domains and find an elegible idle cpu. 
@@ -5510,18 +5516,41 @@ static int select_idle_sibling(struct task_struct *p, int target) tsk_cpus_allowed(p))) goto next; - for_each_cpu(i, sched_group_cpus(sg)) { - if (i == target || !idle_cpu(i)) - goto next; - } + if (sysctl_sched_cstate_aware) { + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { + struct rq *rq = cpu_rq(i); + int idle_idx = idle_get_state_idx(rq); + unsigned long new_usage = boosted_task_util(p); + unsigned long capacity_orig = capacity_orig_of(i); + if (new_usage > capacity_orig || !idle_cpu(i)) + goto next; - target = cpumask_first_and(sched_group_cpus(sg), + if (i == target && new_usage <= capacity_curr_of(target)) + return target; + + if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) { + best_idle = i; + best_idle_cstate = idle_idx; + best_idle_capacity = capacity_orig; + } + } + } else { + for_each_cpu(i, sched_group_cpus(sg)) { + if (i == target || !idle_cpu(i)) + goto next; + } + + target = cpumask_first_and(sched_group_cpus(sg), tsk_cpus_allowed(p)); - goto done; + goto done; + } next: sg = sg->next; } while (sg != sd->groups); } + if (best_idle > 0) + target = best_idle; + done: return target; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 46822df92c50..fc204ae8487d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -304,6 +304,13 @@ static struct ctl_table kern_table[] = { .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, + { + .procname = "sched_cstate_aware", + .data = &sysctl_sched_cstate_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_wakeup_granularity_ns", .data = &sysctl_sched_wakeup_granularity, From d42fb8f959562bc34f7f2b17ca1e370f93a306a9 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 29 Jul 2016 14:04:11 +0100 Subject: [PATCH 533/813] sched/fair: add tunable to force selection at cpu granularity EAS assumes that clusters with smaller capacity cores are more energy-efficient. This may not be true on non-big-little devices, so EAS can make incorrect cluster selections when finding a CPU to wake. The "sched_is_big_little" hint can be used to cause a cpu-based selection instead of cluster-based selection. This change incorporates the addition of the sync hint enable patch EAS did not honour synchronous wakeup hints, a new sysctl is created to ask EAS to use this information when selecting a CPU. The control is called "sched_sync_hint_enable". 
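In short, when the hint is enabled a synchronous wakeup short-circuits CPU
selection to the waker's own CPU, provided that CPU is allowed for the task
and online. A minimal sketch of that fast path (a hypothetical standalone
helper, mirroring the hunk added to energy_aware_wake_cpu() below):

/*
 * Sketch only: return the waker's CPU for a sync wakeup, or -1 to fall
 * back to the normal energy-aware selection path.
 */
static int sync_hint_cpu(struct task_struct *p, int sync)
{
	cpumask_t search_cpus;
	int cpu = smp_processor_id();

	if (!sysctl_sched_sync_hint_enable || !sync)
		return -1;
	cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask);
	return cpumask_test_cpu(cpu, &search_cpus) ? cpu : -1;
}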
Also contains: EAS: sched/fair: for SMP bias toward idle core with capacity For SMP devices, on wakeup bias towards idle cores that have capacity vs busy devices that need a higher OPP eas: favor idle cpus for boosted tasks BUG: 29533997 BUG: 29512132 Change-Id: I0cc9a1b1b88fb52916f18bf2d25715bdc3634f9c Signed-off-by: Juri Lelli Signed-off-by: Srinath Sridharan eas/sched/fair: Favoring busy cpus with low OPPs BUG: 29533997 BUG: 29512132 Change-Id: I9305b3239698d64278db715a2e277ea0bb4ece79 Signed-off-by: Juri Lelli --- include/linux/sched/sysctl.h | 2 + kernel/sched/fair.c | 191 +++++++++++++++++++++++++++-------- kernel/sysctl.c | 14 +++ 3 files changed, 167 insertions(+), 40 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 7d021393b0da..4883dcf3e1a9 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -39,6 +39,8 @@ extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; +extern unsigned int sysctl_sched_is_big_little; +extern unsigned int sysctl_sched_sync_hint_enable; extern unsigned int sysctl_sched_cstate_aware; enum sched_tunable_scaling { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4742a17c7d53..e2b6174db07d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -51,7 +51,10 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; +unsigned int sysctl_sched_is_big_little = 0; +unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_cstate_aware = 1; + /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -5555,7 +5558,97 @@ done: return target; } -static int energy_aware_wake_cpu(struct task_struct *p, int target) +static inline int find_best_target(struct task_struct *p) +{ + int i, boosted; + int target_cpu = -1; + int target_capacity = 0; + int backup_capacity = 0; + int idle_cpu = -1; + int best_idle_cstate = INT_MAX; + int backup_cpu = -1; + unsigned long task_util_boosted, new_util; + + /* + * Favor 1) busy cpu with most capacity at current OPP + * 2) idle_cpu with capacity at current OPP + * 3) busy cpu with capacity at higher OPP + */ +#ifdef CONFIG_CGROUP_SCHEDTUNE + boosted = schedtune_task_boost(p); +#else + boosted = 0; +#endif + task_util_boosted = boosted_task_util(p); + for_each_cpu(i, tsk_cpus_allowed(p)) { + int cur_capacity = capacity_curr_of(i); + struct rq *rq = cpu_rq(i); + int idle_idx = idle_get_state_idx(rq); + + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + new_util = cpu_util(i) + task_util_boosted; + + /* + * Ensure minimum capacity to grant the required boost. + * The target CPU can be already at a capacity level higher + * than the one required to boost the task. + */ + + if (new_util > capacity_orig_of(i)) + continue; + + /* + * For boosted tasks we favor idle cpus unconditionally to + * improve latency. 
+ */ + if (idle_idx >= 0 && boosted) { + if (idle_cpu < 0 || + (sysctl_sched_cstate_aware && + best_idle_cstate > idle_idx)) { + best_idle_cstate = idle_idx; + idle_cpu = i; + } + continue; + } + + if (new_util < cur_capacity) { + if (cpu_rq(i)->nr_running) { + if (target_capacity == 0 || + target_capacity > cur_capacity) { + /* busy CPU with most capacity at current OPP */ + target_cpu = i; + target_capacity = cur_capacity; + } + } else if (!boosted) { + if (idle_cpu < 0 || + (sysctl_sched_cstate_aware && + best_idle_cstate > idle_idx)) { + best_idle_cstate = idle_idx; + idle_cpu = i; + } + } + } else if (backup_capacity == 0 || + backup_capacity > cur_capacity) { + /* first busy CPU with capacity at higher OPP */ + backup_capacity = cur_capacity; + backup_cpu = i; + } + } + + if (!boosted && target_cpu < 0) { + target_cpu = idle_cpu >= 0 ? idle_cpu : backup_cpu; + } + + if (boosted && idle_cpu >= 0) + target_cpu = idle_cpu; + return target_cpu; +} + +static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) { struct sched_domain *sd; struct sched_group *sg, *sg_target; @@ -5563,6 +5656,14 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) int target_cpu = task_cpu(p); int i; + if (sysctl_sched_sync_hint_enable && sync) { + int cpu = smp_processor_id(); + cpumask_t search_cpus; + cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask); + if (cpumask_test_cpu(cpu, &search_cpus)) + return cpu; + } + sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p))); if (!sd) @@ -5571,50 +5672,60 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) sg = sd->groups; sg_target = sg; - /* - * Find group with sufficient capacity. We only get here if no cpu is - * overutilized. We may end up overutilizing a cpu by adding the task, - * but that should not be any worse than select_idle_sibling(). - * load_balance() should sort it out later as we get above the tipping - * point. - */ - do { - /* Assuming all cpus are the same in group */ - int max_cap_cpu = group_first_cpu(sg); + if (sysctl_sched_is_big_little) { /* - * Assume smaller max capacity means more energy-efficient. - * Ideally we should query the energy model for the right - * answer but it easily ends up in an exhaustive search. + * Find group with sufficient capacity. We only get here if no cpu is + * overutilized. We may end up overutilizing a cpu by adding the task, + * but that should not be any worse than select_idle_sibling(). + * load_balance() should sort it out later as we get above the tipping + * point. */ - if (capacity_of(max_cap_cpu) < target_max_cap && - task_fits_max(p, max_cap_cpu)) { - sg_target = sg; - target_max_cap = capacity_of(max_cap_cpu); - } - } while (sg = sg->next, sg != sd->groups); + do { + /* Assuming all cpus are the same in group */ + int max_cap_cpu = group_first_cpu(sg); - /* Find cpu with sufficient capacity */ - for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { + /* + * Assume smaller max capacity means more energy-efficient. + * Ideally we should query the energy model for the right + * answer but it easily ends up in an exhaustive search. 
+ */ + if (capacity_of(max_cap_cpu) < target_max_cap && + task_fits_max(p, max_cap_cpu)) { + sg_target = sg; + target_max_cap = capacity_of(max_cap_cpu); + } + } while (sg = sg->next, sg != sd->groups); + + /* Find cpu with sufficient capacity */ + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + int new_util = cpu_util(i) + boosted_task_util(p); + + if (new_util > capacity_orig_of(i)) + continue; + + if (new_util < capacity_curr_of(i)) { + target_cpu = i; + if (cpu_rq(i)->nr_running) + break; + } + + /* cpu has capacity at higher OPP, keep it as fallback */ + if (target_cpu == task_cpu(p)) + target_cpu = i; + } + } else { /* - * p's blocked utilization is still accounted for on prev_cpu - * so prev_cpu will receive a negative bias due to the double - * accounting. However, the blocked utilization may be zero. + * Find a cpu with sufficient capacity */ - int new_util = cpu_util(i) + boosted_task_util(p); - - if (new_util > capacity_orig_of(i)) - continue; - - if (new_util < capacity_curr_of(i)) { - target_cpu = i; - if (cpu_rq(i)->nr_running) - break; - } - - /* cpu has capacity at higher OPP, keep it as fallback */ - if (target_cpu == task_cpu(p)) - target_cpu = i; + int tmp_target = find_best_target(p); + if (tmp_target >= 0) + target_cpu = tmp_target; } if (target_cpu != task_cpu(p)) { @@ -5691,7 +5802,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (!sd) { if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) - new_cpu = energy_aware_wake_cpu(p, prev_cpu); + new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync); else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, new_cpu); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index fc204ae8487d..831d674a5566 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -304,6 +304,20 @@ static struct ctl_table kern_table[] = { .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, + { + .procname = "sched_is_big_little", + .data = &sysctl_sched_is_big_little, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_sync_hint_enable", + .data = &sysctl_sched_sync_hint_enable, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_cstate_aware", .data = &sysctl_sched_cstate_aware, From b312c991e9055198e96571feaf73df26e647df56 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 11 Mar 2016 16:44:16 -0800 Subject: [PATCH 534/813] sched/fair: add tunable to set initial task load The choice of initial task load upon fork has a large influence on CPU and OPP selection when scheduler-driven DVFS is in use. Make this tuneable by adding a new sysctl "sched_initial_task_util". If the sched governor is not used, the default remains at SCHED_LOAD_SCALE Otherwise, the value from the sysctl is used. This defaults to 0. 
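As an illustration of why the seed matters, assume SCHED_CAPACITY_SHIFT == 10
and the capacity-to-frequency conversion already used by the sched governor
(see update_fdomain_capacity_request() earlier in this series); the helper
and numbers below are only an example, not part of the patch:

/*
 * Example only: a fork-time utilization seed of SCHED_LOAD_SCALE (1024)
 * requests the maximum OPP for every new task, while a seed of 0 lets
 * the OPP follow the task's measured utilization as it ramps up.
 */
static unsigned int seed_to_freq(unsigned long seed_util,
				 unsigned int max_freq_khz)
{
	return seed_util * max_freq_khz >> SCHED_CAPACITY_SHIFT;
}

With max_freq_khz = 2000000, a seed of 1024 maps to 2000000 kHz while a seed
of 0 maps to 0 kHz, i.e. the new task itself requests no frequency bump.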
Signed-off-by: "Todd Kjos " --- include/linux/sched/sysctl.h | 1 + kernel/sched/fair.c | 5 ++++- kernel/sysctl.c | 7 +++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 4883dcf3e1a9..2834841c507e 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -41,6 +41,7 @@ extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_is_big_little; extern unsigned int sysctl_sched_sync_hint_enable; +extern unsigned int sysctl_sched_initial_task_util; extern unsigned int sysctl_sched_cstate_aware; enum sched_tunable_scaling { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e2b6174db07d..c60fd6685a05 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -53,6 +53,7 @@ unsigned int normalized_sysctl_sched_latency = 6000000ULL; unsigned int sysctl_sched_is_big_little = 0; unsigned int sysctl_sched_sync_hint_enable = 1; +unsigned int sysctl_sched_initial_task_util = 0; unsigned int sysctl_sched_cstate_aware = 1; /* @@ -687,7 +688,9 @@ void init_entity_runnable_average(struct sched_entity *se) sa->period_contrib = 1023; sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); + sa->util_avg = sched_freq() ? + sysctl_sched_initial_task_util : + scale_load_down(SCHED_LOAD_SCALE); sa->util_sum = sa->util_avg * LOAD_AVG_MAX; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 831d674a5566..dd46f370b73a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -318,6 +318,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "sched_initial_task_util", + .data = &sysctl_sched_initial_task_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_cstate_aware", .data = &sysctl_sched_cstate_aware, From d3dd88bcc7a0b4ece9e52a8983c0025fec9d2f8a Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 28 Jul 2016 16:39:27 +0100 Subject: [PATCH 535/813] FIX: sched/tune: update usage of boosted task utilisation on CPU selection A boosted task needs to be scheduled on a CPU which can grant a minimum capacity which is higher than its utilization. However, a task can be allocated on a CPU which already provides an utilization which is higher than the task boosted utilization itself. Moreover, with the previous approach a task 100% boosted is not fitting any CPU. This patch makes use of the boosted task utilization just as a threashold which defines the minimum capacity should be available on a CPU to host that task. 
Signed-off-by: Patrick Bellasi
---
 kernel/sched/fair.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c60fd6685a05..08dad006801c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5657,6 +5657,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
 	struct sched_group *sg, *sg_target;
 	int target_max_cap = INT_MAX;
 	int target_cpu = task_cpu(p);
+	unsigned long task_util_boosted, new_util;
 	int i;

 	if (sysctl_sched_sync_hint_enable && sync) {
@@ -5700,6 +5701,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
 		}
 	} while (sg = sg->next, sg != sd->groups);

+	task_util_boosted = boosted_task_util(p);
 	/* Find cpu with sufficient capacity */
 	for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
 		/*
@@ -5707,8 +5709,13 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
 		 * so prev_cpu will receive a negative bias due to the double
 		 * accounting. However, the blocked utilization may be zero.
 		 */
-		int new_util = cpu_util(i) + boosted_task_util(p);
+		new_util = cpu_util(i) + task_util_boosted;

+		/*
+		 * Ensure minimum capacity to grant the required boost.
+		 * The target CPU can be already at a capacity level higher
+		 * than the one required to boost the task.
+		 */
 		if (new_util > capacity_orig_of(i))
 			continue;

From 9064187216fa872e72548618c5cced462e5bee24 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Fri, 29 Jul 2016 15:45:57 +0100
Subject: [PATCH 536/813] FIX: sched/tune: move schedtune_normalize_energy
 into fair.c

The energy normalization function is required to get the proper values
for the P-E space filtering function to work.
That normalization is part of the hot wakeup path and is currently
implemented as a function call.

Moving the normalization function into fair.c allows the compiler to
further optimize that code by reducing overheads in the wakeup hot path.

Signed-off-by: Patrick Bellasi
[jstultz: fwdported to 4.4]
Signed-off-by: John Stultz
---
 kernel/sched/fair.c | 121 ++++++++++++++++++++++++++++----------------
 kernel/sched/tune.c |  42 +--------------
 kernel/sched/tune.h |  12 ++++-
 3 files changed, 91 insertions(+), 84 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 08dad006801c..f8f5529dcae8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4996,44 +4996,6 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
 	return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
 }

-#ifdef CONFIG_SCHED_TUNE
-static int energy_diff_evaluate(struct energy_env *eenv)
-{
-	unsigned int boost;
-	int nrg_delta;
-
-	/* Return energy diff when boost margin is 0 */
-#ifdef CONFIG_CGROUP_SCHEDTUNE
-	boost = schedtune_task_boost(eenv->task);
-#else
-	boost = get_sysctl_sched_cfs_boost();
-#endif
-	if (boost == 0)
-		return eenv->nrg.diff;
-
-	/* Compute normalized energy diff */
-	nrg_delta = schedtune_normalize_energy(eenv->nrg.diff);
-	eenv->nrg.delta = nrg_delta;
-
-	eenv->payoff = schedtune_accept_deltas(
-			eenv->nrg.delta,
-			eenv->cap.delta,
-			eenv->task);
-
-	/*
-	 * When SchedTune is enabled, the energy_diff() function will return
-	 * the computed energy payoff value.
Since the energy_diff() return
-	 * value is expected to be negative by its callers, this evaluation
-	 * function return a negative value each time the evaluation return a
-	 * positive payoff, which is the condition for the acceptance of
-	 * a scheduling decision
-	 */
-	return -eenv->payoff;
-}
-#else /* CONFIG_SCHED_TUNE */
-#define energy_diff_evaluate(eenv) eenv->nrg.diff
-#endif
-
 /*
  * energy_diff(): Estimate the energy impact of changing the utilization
  * distribution. eenv specifies the change: utilisation amount, source, and
@@ -5041,12 +5003,11 @@ static int energy_diff_evaluate(struct energy_env *eenv)
  * utilization is removed from or added to the system (e.g. task wake-up). If
  * both are specified, the utilization is migrated.
  */
-static int energy_diff(struct energy_env *eenv)
+static inline int __energy_diff(struct energy_env *eenv)
 {
 	struct sched_domain *sd;
 	struct sched_group *sg;
 	int sd_cpu = -1, energy_before = 0, energy_after = 0;
-	int result;

 	struct energy_env eenv_before = {
 		.util_delta	= 0,
@@ -5090,17 +5051,91 @@ static int energy_diff(struct energy_env *eenv)
 	eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
 	eenv->payoff = 0;

-	result = energy_diff_evaluate(eenv);
-
 	trace_sched_energy_diff(eenv->task,
 			eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
 			eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
 			eenv->cap.before, eenv->cap.after, eenv->cap.delta,
 			eenv->nrg.delta, eenv->payoff);

-	return result;
+	return eenv->nrg.diff;
 }

+#ifdef CONFIG_SCHED_TUNE
+
+struct target_nrg schedtune_target_nrg;
+
+/*
+ * System energy normalization
+ * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE],
+ * corresponding to the specified energy variation.
+ */
+static inline int
+normalize_energy(int energy_diff)
+{
+	u32 normalized_nrg;
+#ifdef CONFIG_SCHED_DEBUG
+	int max_delta;
+
+	/* Check for boundaries */
+	max_delta  = schedtune_target_nrg.max_power;
+	max_delta -= schedtune_target_nrg.min_power;
+	WARN_ON(abs(energy_diff) >= max_delta);
+#endif
+
+	/* Do scaling using positive numbers to increase the range */
+	normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
+
+	/* Scale by energy magnitude */
+	normalized_nrg <<= SCHED_LOAD_SHIFT;
+
+	/* Normalize on max energy for target platform */
+	normalized_nrg = reciprocal_divide(
+			normalized_nrg, schedtune_target_nrg.rdiv);
+
+	return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
+}
+
+static inline int
+energy_diff(struct energy_env *eenv)
+{
+	unsigned int boost;
+	int nrg_delta;
+
+	/* Compute "absolute" energy diff */
+	__energy_diff(eenv);
+
+	/* Return energy diff when boost margin is 0 */
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+	boost = schedtune_task_boost(eenv->task);
+#else
+	boost = get_sysctl_sched_cfs_boost();
+#endif
+	if (boost == 0)
+		return eenv->nrg.diff;
+
+	/* Compute normalized energy diff */
+	nrg_delta = normalize_energy(eenv->nrg.diff);
+	eenv->nrg.delta = nrg_delta;
+
+	eenv->payoff = schedtune_accept_deltas(
+			eenv->nrg.delta,
+			eenv->cap.delta,
+			eenv->task);
+
+	/*
+	 * When SchedTune is enabled, the energy_diff() function will return
+	 * the computed energy payoff value. Since the energy_diff() return
Since the energy_diff() return + * value is expected to be negative by its callers, this evaluation + * function return a negative value each time the evaluation return a + * positive payoff, which is the condition for the acceptance of + * a scheduling decision + */ + return -eenv->payoff; +} +#else /* CONFIG_SCHED_TUNE */ +#define energy_diff(eenv) __energy_diff(eenv) +#endif + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * A waker of many should wake a different task than the one last awakened diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index b40d40dc3c49..8ca8db2de818 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -3,24 +3,17 @@ #include #include #include -#include #include #include #include #include "sched.h" +#include "tune.h" unsigned int sysctl_sched_cfs_boost __read_mostly; -/* - * System energy normalization constants - */ -static struct target_nrg { - unsigned long min_power; - unsigned long max_power; - struct reciprocal_value rdiv; -} schedtune_target_nrg; +extern struct target_nrg schedtune_target_nrg; /* Performance Boost region (B) threshold params */ static int perf_boost_idx; @@ -587,37 +580,6 @@ sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, return 0; } -/* - * System energy normalization - * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], - * corresponding to the specified energy variation. - */ -int -schedtune_normalize_energy(int energy_diff) -{ - u32 normalized_nrg; - int max_delta; - -#ifdef CONFIG_SCHED_DEBUG - /* Check for boundaries */ - max_delta = schedtune_target_nrg.max_power; - max_delta -= schedtune_target_nrg.min_power; - WARN_ON(abs(energy_diff) >= max_delta); -#endif - - /* Do scaling using positive numbers to increase the range */ - normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; - - /* Scale by energy magnitude */ - normalized_nrg <<= SCHED_LOAD_SHIFT; - - /* Normalize on max energy for target platform */ - normalized_nrg = reciprocal_divide( - normalized_nrg, schedtune_target_nrg.rdiv); - - return (energy_diff < 0) ? -normalized_nrg : normalized_nrg; -} - #ifdef CONFIG_SCHED_DEBUG static void schedtune_test_nrg(unsigned long delta_pwr) diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index f7273a5d994a..7d2aa7951554 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -1,6 +1,17 @@ #ifdef CONFIG_SCHED_TUNE +#include + +/* + * System energy normalization constants + */ +struct target_nrg { + unsigned long min_power; + unsigned long max_power; + struct reciprocal_value rdiv; +}; + #ifdef CONFIG_CGROUP_SCHEDTUNE int schedtune_cpu_boost(int cpu); @@ -25,7 +36,6 @@ int schedtune_accept_deltas(int nrg_delta, int cap_delta, #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) -#define schedtune_normalize_energy(energy) energy #define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta #endif /* CONFIG_SCHED_TUNE */ From 3fc52a99e795d0086f402c36c45bb64e66e7b126 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Thu, 28 Jul 2016 17:28:55 +0100 Subject: [PATCH 537/813] sched/tune: Add support for negative boost values Change-Id: I164ee04ba98c3a776605f18cb65ee61b3e917939 Contains also: eas/stune: schedtune cpu boost_max must be non-negative. This is to avoid under-accounting cpu capacity which may cause task stacking and frequency spikes. 
Change-Id: Ie1c1cbd52a6edb77b4c15a830030aa748dff6f29 --- include/trace/events/sched.h | 20 +++++++++---------- kernel/sched/fair.c | 37 ++++++++++++++++++++---------------- kernel/sched/tune.c | 25 +++++++++++++++--------- 3 files changed, 47 insertions(+), 35 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 11898fb48c01..debcf417c535 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -731,14 +731,14 @@ TRACE_EVENT(sched_tune_config, */ TRACE_EVENT(sched_boost_cpu, - TP_PROTO(int cpu, unsigned long util, unsigned long margin), + TP_PROTO(int cpu, unsigned long util, long margin), TP_ARGS(cpu, util, margin), TP_STRUCT__entry( __field( int, cpu ) __field( unsigned long, util ) - __field( unsigned long, margin ) + __field(long, margin ) ), TP_fast_assign( @@ -747,7 +747,7 @@ TRACE_EVENT(sched_boost_cpu, __entry->margin = margin; ), - TP_printk("cpu=%d util=%lu margin=%lu", + TP_printk("cpu=%d util=%lu margin=%ld", __entry->cpu, __entry->util, __entry->margin) @@ -759,7 +759,7 @@ TRACE_EVENT(sched_boost_cpu, TRACE_EVENT(sched_tune_tasks_update, TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx, - unsigned int boost, unsigned int max_boost), + int boost, int max_boost), TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost), @@ -769,8 +769,8 @@ TRACE_EVENT(sched_tune_tasks_update, __field( int, cpu ) __field( int, tasks ) __field( int, idx ) - __field( unsigned int, boost ) - __field( unsigned int, max_boost ) + __field( int, boost ) + __field( int, max_boost ) ), TP_fast_assign( @@ -784,7 +784,7 @@ TRACE_EVENT(sched_tune_tasks_update, ), TP_printk("pid=%d comm=%s " - "cpu=%d tasks=%d idx=%d boost=%u max_boost=%u", + "cpu=%d tasks=%d idx=%d boost=%d max_boost=%d", __entry->pid, __entry->comm, __entry->cpu, __entry->tasks, __entry->idx, __entry->boost, __entry->max_boost) @@ -820,7 +820,7 @@ TRACE_EVENT(sched_tune_boostgroup_update, */ TRACE_EVENT(sched_boost_task, - TP_PROTO(struct task_struct *tsk, unsigned long util, unsigned long margin), + TP_PROTO(struct task_struct *tsk, unsigned long util, long margin), TP_ARGS(tsk, util, margin), @@ -828,7 +828,7 @@ TRACE_EVENT(sched_boost_task, __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( unsigned long, util ) - __field( unsigned long, margin ) + __field( long, margin ) ), @@ -839,7 +839,7 @@ TRACE_EVENT(sched_boost_task, __entry->margin = margin; ), - TP_printk("comm=%s pid=%d util=%lu margin=%lu", + TP_printk("comm=%s pid=%d util=%lu margin=%ld", __entry->comm, __entry->pid, __entry->util, __entry->margin) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f8f5529dcae8..a3e6fa63be4d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5271,22 +5271,25 @@ static bool cpu_overutilized(int cpu) #ifdef CONFIG_SCHED_TUNE -static unsigned long -schedtune_margin(unsigned long signal, unsigned long boost) +static long +schedtune_margin(unsigned long signal, long boost) { - unsigned long long margin = 0; + long long margin = 0; /* * Signal proportional compensation (SPC) * * The Boost (B) value is used to compute a Margin (M) which is * proportional to the complement of the original Signal (S): - * M = B * (SCHED_LOAD_SCALE - S) + * M = B * (SCHED_LOAD_SCALE - S), if B is positive + * M = B * S, if B is negative * The obtained M could be used by the caller to "boost" S. 
 */
-	margin  = SCHED_LOAD_SCALE - signal;
-	margin *= boost;
-
+	if (boost >= 0) {
+		margin  = SCHED_LOAD_SCALE - signal;
+		margin *= boost;
+	} else
+		margin = -signal * boost;
 	/*
 	 * Fast integer division by constant:
 	 *  Constant : (C) = 100
@@ -5302,13 +5305,15 @@ schedtune_margin(unsigned long signal, unsigned long boost)
 	margin  *= 1311;
 	margin >>= 17;

+	if (boost < 0)
+		margin *= -1;
 	return margin;
 }

-static inline unsigned int
+static inline int
 schedtune_cpu_margin(unsigned long util, int cpu)
 {
-	unsigned int boost;
+	int boost;

 #ifdef CONFIG_CGROUP_SCHEDTUNE
 	boost = schedtune_cpu_boost(cpu);
@@ -5321,12 +5326,12 @@ schedtune_cpu_margin(unsigned long util, int cpu)
 	return schedtune_margin(util, boost);
 }

-static inline unsigned long
+static inline long
 schedtune_task_margin(struct task_struct *task)
 {
-	unsigned int boost;
+	int boost;
 	unsigned long util;
-	unsigned long margin;
+	long margin;

 #ifdef CONFIG_CGROUP_SCHEDTUNE
 	boost = schedtune_task_boost(task);
@@ -5344,13 +5349,13 @@ schedtune_task_margin(struct task_struct *task)

 #else /* CONFIG_SCHED_TUNE */

-static inline unsigned int
+static inline int
 schedtune_cpu_margin(unsigned long util, int cpu)
 {
 	return 0;
 }

-static inline unsigned int
+static inline int
 schedtune_task_margin(struct task_struct *task)
 {
 	return 0;
@@ -5362,7 +5367,7 @@ static inline unsigned long
 boosted_cpu_util(int cpu)
 {
 	unsigned long util = cpu_util(cpu);
-	unsigned long margin = schedtune_cpu_margin(util, cpu);
+	long margin = schedtune_cpu_margin(util, cpu);

 	trace_sched_boost_cpu(cpu, util, margin);

@@ -5373,7 +5378,7 @@ static inline unsigned long
 boosted_task_util(struct task_struct *task)
 {
 	unsigned long util = task_util(task);
-	unsigned long margin = schedtune_task_margin(task);
+	long margin = schedtune_task_margin(task);

 	trace_sched_boost_task(task, util, margin);

diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index 8ca8db2de818..afc4a7747161 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -213,10 +213,11 @@ static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
  */
 struct boost_groups {
 	/* Maximum boost value for all RUNNABLE tasks on a CPU */
-	unsigned boost_max;
+	bool idle;
+	int boost_max;
 	struct {
 		/* The boost for tasks on that boost group */
-		unsigned boost;
+		int boost;
 		/* Count of RUNNABLE tasks on that boost group */
 		unsigned tasks;
 	} group[BOOSTGROUPS_COUNT];
@@ -229,7 +230,7 @@ static void
 schedtune_cpu_update(int cpu)
 {
 	struct boost_groups *bg;
-	unsigned boost_max;
+	int boost_max;
 	int idx;

 	bg = &per_cpu(cpu_boost_groups, cpu);
@@ -243,9 +244,13 @@ schedtune_cpu_update(int cpu)
 		 */
 		if (bg->group[idx].tasks == 0)
 			continue;
+
 		boost_max = max(boost_max, bg->group[idx].boost);
 	}
-
+	/* Ensures boost_max is non-negative when all cgroup boost values
+	 * are negative. Avoids under-accounting of cpu capacity which may
+	 * cause task stacking and frequency spikes. */
+	boost_max = max(boost_max, 0);
 	bg->boost_max = boost_max;
 }

@@ -391,7 +396,7 @@ int schedtune_task_boost(struct task_struct *p)
 	return task_boost;
 }

-static u64
+static s64
 boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
 {
 	struct schedtune *st = css_st(css);
@@ -401,11 +406,13 @@ boost_read(struct cgroup_subsys_state *css, struct cftype *cft)

 static int
 boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
-	    u64 boost)
+	    s64 boost)
 {
 	struct schedtune *st = css_st(css);
+	unsigned threshold_idx;
+	int boost_pct;

-	if (boost < 0 || boost > 100)
+	if (boost < -100 || boost > 100)
 		return -EINVAL;

 	st->boost = boost;
@@ -423,8 +430,8 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
 static struct cftype files[] = {
 	{
 		.name = "boost",
-		.read_u64 = boost_read,
-		.write_u64 = boost_write,
+		.read_s64 = boost_read,
+		.write_s64 = boost_write,
 	},
 	{ }	/* terminate */
 };

From 6102e30f9d42c7827bb5f46c863d247bb4ae4038 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Thu, 28 Jul 2016 17:38:25 +0100
Subject: [PATCH 538/813] FIXUP: sched/tune: fix payoff calculation for boost
 region

The definition of the acceptance regions as well as the translation of
these regions into a payoff value were both wrong, which resulted in:
a) a wrong definition of payoff for the performance boost region
b) a correct "by chance" definition of the payoff for the performance
   constraint region (i.e. two sign errors together fixing the formula)

This patch provides a better description of the cut regions as well as
a fixed version of the payoff computations, which are now reduced to a
single formula usable for both cases.

Reported-by: Leo Yan
Reviewed-by: Leo Yan
Signed-off-by: Leo Yan
Signed-off-by: Patrick Bellasi
---
 kernel/sched/tune.c | 79 +++++++++++++++++++++++----------------------
 1 file changed, 40 insertions(+), 39 deletions(-)

diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index afc4a7747161..6d5fbde9c70e 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -51,50 +51,51 @@ __schedtune_accept_deltas(int nrg_delta, int cap_delta,
 		int perf_boost_idx, int perf_constrain_idx)
 {
 	int payoff = -INT_MAX;
+	int gain_idx = -1;

 	/* Performance Boost (B) region */
-	if (nrg_delta > 0 && cap_delta > 0) {
-		/*
-		 * Evaluate "Performance Boost" vs "Energy Increase"
-		 * payoff criteria:
-		 *    cap_delta / nrg_delta < cap_gain / nrg_gain
-		 * which is:
-		 *    nrg_delta * cap_gain > cap_delta * nrg_gain
-		 */
-		payoff  = nrg_delta * threshold_gains[perf_boost_idx].cap_gain;
-		payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain;
-
-		trace_sched_tune_filter(
-				nrg_delta, cap_delta,
-				threshold_gains[perf_boost_idx].nrg_gain,
-				threshold_gains[perf_boost_idx].cap_gain,
-				payoff, 8);
-
-		return payoff;
-	}
-
+	if (nrg_delta >= 0 && cap_delta > 0)
+		gain_idx = perf_boost_idx;
 	/* Performance Constraint (C) region */
-	if (nrg_delta < 0 && cap_delta < 0) {
-		/*
-		 * Evaluate "Performance Boost" vs "Energy Increase"
-		 * payoff criteria:
-		 *    cap_delta / nrg_delta > cap_gain / nrg_gain
-		 * which is:
-		 *    cap_delta * nrg_gain > nrg_delta * cap_gain
-		 */
-		payoff  = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain;
-		payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain;
-
-		trace_sched_tune_filter(
-				nrg_delta, cap_delta,
-				threshold_gains[perf_constrain_idx].nrg_gain,
-				threshold_gains[perf_constrain_idx].cap_gain,
-				payoff, 6);
-
-		return payoff;
-	}
+	else if
+		gain_idx = perf_constrain_idx;
 
 	/* Default: reject schedule candidate */
+	if (gain_idx == -1)
+		return payoff;
+
+	/*
+	 * Evaluate "Performance Boost" vs "Energy Increase"
+	 *
+	 * - Performance Boost (B) region
+	 *
+	 *   Condition: nrg_delta > 0 && cap_delta > 0
+	 *   Payoff criteria:
+	 *     cap_gain / nrg_gain  < cap_delta / nrg_delta, i.e.:
+	 *     cap_gain * nrg_delta < cap_delta * nrg_gain
+	 *   Note that since both nrg_gain and nrg_delta are positive, the
+	 *   inequality does not change. Thus:
+	 *
+	 *     payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
+	 *
+	 * - Performance Constraint (C) region
+	 *
+	 *   Condition: nrg_delta < 0 && cap_delta < 0
+	 *   Payoff criteria:
+	 *     cap_gain / nrg_gain  > cap_delta / nrg_delta, i.e.:
+	 *     cap_gain * nrg_delta < cap_delta * nrg_gain
+	 *   Note that since nrg_gain > 0 while nrg_delta < 0, the
+	 *   inequality changes direction. Thus:
+	 *
+	 *     payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
+	 *
+	 * This means that, in case of the same positive defined {cap,nrg}_gain
+	 * for both the B and C regions, we can use the same payoff formula
+	 * where a positive value represents the accept condition.
+	 */
+	payoff  = cap_delta * threshold_gains[gain_idx].nrg_gain;
+	payoff -= nrg_delta * threshold_gains[gain_idx].cap_gain;
+
 	return payoff;
 }

From 877bc05081eb9925290721d32fa0bedcc2e932a1 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Thu, 28 Jul 2016 17:42:36 +0100
Subject: [PATCH 539/813] sched/{fair,tune}: simplify fair.c code

The usage of conditionally compiled code is discouraged in fair.c.

This patch cleans up fair.c a bit by moving the
schedtune_{cpu,task}_boost definitions into tune.h.

Signed-off-by: Patrick Bellasi
---
 kernel/sched/fair.c | 21 +++------------------
 kernel/sched/tune.h |  6 ++++++
 2 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a3e6fa63be4d..9c717c3be75d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5098,18 +5098,13 @@ normalize_energy(int energy_diff)
 static inline int
 energy_diff(struct energy_env *eenv)
 {
-	unsigned int boost;
+	int boost = schedtune_task_boost(eenv->task);
 	int nrg_delta;
 
 	/* Conpute "absolute" energy diff */
 	__energy_diff(eenv);
 
 	/* Return energy diff when boost margin is 0 */
-#ifdef CONFIG_CGROUP_SCHEDTUNE
-	boost = schedtune_task_boost(eenv->task);
-#else
-	boost = get_sysctl_sched_cfs_boost();
-#endif
 	if (boost == 0)
 		return eenv->nrg.diff;
 
@@ -5313,13 +5308,8 @@ schedtune_margin(unsigned long signal, long boost)
 static inline int
 schedtune_cpu_margin(unsigned long util, int cpu)
 {
-	int boost;
+	int boost = schedtune_cpu_boost(cpu);
 
-#ifdef CONFIG_CGROUP_SCHEDTUNE
-	boost = schedtune_cpu_boost(cpu);
-#else
-	boost = get_sysctl_sched_cfs_boost();
-#endif
 	if (boost == 0)
 		return 0;
 
@@ -5329,15 +5319,10 @@ schedtune_cpu_margin(unsigned long util, int cpu)
 static inline long
 schedtune_task_margin(struct task_struct *task)
 {
-	int boost;
+	int boost = schedtune_task_boost(task);
 	unsigned long util;
 	long margin;
 
-#ifdef CONFIG_CGROUP_SCHEDTUNE
-	boost = schedtune_task_boost(task);
-#else
-	boost = get_sysctl_sched_cfs_boost();
-#endif
 	if (boost == 0)
 		return 0;
 
diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h
index 7d2aa7951554..99637758a8af 100644
--- a/kernel/sched/tune.h
+++ b/kernel/sched/tune.h
@@ -22,6 +22,9 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu);
 
 #else /* CONFIG_CGROUP_SCHEDTUNE */
 
+#define schedtune_cpu_boost(cpu)  get_sysctl_sched_cfs_boost()
+#define schedtune_task_boost(tsk) get_sysctl_sched_cfs_boost()
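The single payoff formula introduced above can be checked in isolation. A minimal user-space sketch (illustration only, not kernel code; the gain values are hypothetical stand-ins for threshold_gains entries):

	#include <stdio.h>

	struct gains { int nrg_gain; int cap_gain; };

	/* A positive payoff means the schedule candidate is accepted. */
	static int payoff(int nrg_delta, int cap_delta, struct gains g)
	{
		return cap_delta * g.nrg_gain - nrg_delta * g.cap_gain;
	}

	int main(void)
	{
		struct gains g = { 4, 4 };	/* hypothetical mid-range cut */

		printf("B: %d\n", payoff(10, 20, g));	/*  40: boost accepted */
		printf("C: %d\n", payoff(-20, -10, g));	/*  40: constraint accepted */
		printf("C: %d\n", payoff(-10, -20, g));	/* -40: rejected */
		return 0;
	}

With equal gains, a candidate is accepted exactly when its capacity delta exceeds its energy delta, in both the B and C regions.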
+
 #define schedtune_enqueue_task(task, cpu) do { } while (0)
 #define schedtune_dequeue_task(task, cpu) do { } while (0)
 
@@ -33,6 +36,9 @@ int schedtune_accept_deltas(int nrg_delta, int cap_delta,
 
 #else /* CONFIG_SCHED_TUNE */
 
+#define schedtune_cpu_boost(cpu)  0
+#define schedtune_task_boost(tsk) 0
+
 #define schedtune_enqueue_task(task, cpu) do { } while (0)
 #define schedtune_dequeue_task(task, cpu) do { } while (0)

From 6effe93fe5df6555ebe2f24390a8402cdc1ab962 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Fri, 29 Jul 2016 15:19:41 +0100
Subject: [PATCH 540/813] sched/tune: use a single initialisation function

With the introduction of the initialization function required to compute
the energy normalization constants from the DTB at boot time, we now
have a late_initcall which is already used by SchedTune.

This patch consolidates within that function the other initialization
bits which were previously deferred to the first CGroup creation.

Signed-off-by: Patrick Bellasi
[jstultz: fwdported to 4.4]
Signed-off-by: John Stultz
---
 kernel/sched/tune.c | 50 +++++++++++++++++++++++----------------------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index 6d5fbde9c70e..a691b8db2888 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -410,8 +410,6 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
 	    s64 boost)
 {
 	struct schedtune *st = css_st(css);
-	unsigned threshold_idx;
-	int boost_pct;
 
 	if (boost < -100 || boost > 100)
 		return -EINVAL;
@@ -456,33 +454,14 @@ schedtune_boostgroup_init(struct schedtune *st)
 	return 0;
 }
 
-static int
-schedtune_init(void)
-{
-	struct boost_groups *bg;
-	int cpu;
-
-	/* Initialize the per CPU boost groups */
-	for_each_possible_cpu(cpu) {
-		bg = &per_cpu(cpu_boost_groups, cpu);
-		memset(bg, 0, sizeof(struct boost_groups));
-	}
-
-	pr_info("  schedtune configured to support %d boost groups\n",
-		BOOSTGROUPS_COUNT);
-	return 0;
-}
-
 static struct cgroup_subsys_state *
 schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
 {
 	struct schedtune *st;
 	int idx;
 
-	if (!parent_css) {
-		schedtune_init();
+	if (!parent_css)
 		return &root_schedtune.css;
-	}
 
 	/* Allow only single level hierachies */
 	if (parent_css != &root_schedtune.css) {
@@ -543,6 +522,22 @@ struct cgroup_subsys schedtune_cgrp_subsys = {
 	.early_init	= 1,
 };
 
+static inline void
+schedtune_init_cgroups(void)
+{
+	struct boost_groups *bg;
+	int cpu;
+
+	/* Initialize the per CPU boost groups */
+	for_each_possible_cpu(cpu) {
+		bg = &per_cpu(cpu_boost_groups, cpu);
+		memset(bg, 0, sizeof(struct boost_groups));
+	}
+
+	pr_info("schedtune: configured to support %d boost groups\n",
+		BOOSTGROUPS_COUNT);
+}
+
 #else /* CONFIG_CGROUP_SCHEDTUNE */
 
 int
@@ -690,7 +685,7 @@ schedtune_add_cluster_nrg(
 * that bind the EM to the topology information.
 */
static int
-schedtune_init_late(void)
+schedtune_init(void)
{
	struct target_nrg *ste = &schedtune_target_nrg;
	unsigned long delta_pwr = 0;
@@ -730,10 +725,17 @@ schedtune_init_late(void)
		ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2);
 
	schedtune_test_nrg(delta_pwr);
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+	schedtune_init_cgroups();
+#else
+	pr_info("schedtune: configured to support global boosting only\n");
+#endif
+
	return 0;
 
nodata:
	rcu_read_unlock();
	return -EINVAL;
}
-late_initcall(schedtune_init_late);
+late_initcall(schedtune_init);

From 414e73880e4a57b75b177f1f7aa6980c87150a80 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Thu, 28 Jul 2016 18:44:40 +0100
Subject: [PATCH 541/813] FIXUP: sched/tune: fix accounting for runnable tasks

Contains:

sched/tune: fix accounting for runnable tasks (1/5)

The accounting for tasks into boost groups of different CPUs is
currently broken mainly because:
a) we do not properly track the change of boost group of a RUNNABLE
   task
b) there are race conditions between migration code and accounting code

This patch provides a fix to ensure enqueue/dequeue accounting also for
throttled tasks.

Without this patch it can happen that a task is enqueued into a
throttled RQ thus not being accounted for the boosting of the
corresponding RQ.
We could argue that a throttled task should not boost a CPU, however:
a) properly implementing CPU boosting considering throttled tasks will
   greatly increase the complexity of the solution
b) it's not easy to quantify the benefits introduced by such a more
   complex solution

Since task throttling requires the usage of the CFS bandwidth
controller, which is not widely used on mobile systems (at least not by
Android kernels so far), for the time being we go for the simple
solution and boost also for throttled RQs.

sched/tune: fix accounting for runnable tasks (2/5)

This patch provides the code required to enforce proper locking.
A per boost group spinlock has been added to grant atomic accounting of
tasks as well as to serialise enqueue/dequeue operations, triggered by
tasks migrations, with cgroups's attach/detach operations.

sched/tune: fix accounting for runnable tasks (3/5)

This patch adds cgroups {allow,can,cancel}_attach callbacks.

Since a task can be migrated between boost groups while it's running,
the CGroups's attach callbacks have been added to properly migrate
boost contributions of RUNNABLE tasks.

The RQ's lock is used to serialise enqueue/dequeue operations, triggered
by tasks migrations, with cgroups's attach/detach operations. While the
SchedTune's CPU lock is used to guarantee atomicity of the accounting
within the CPU.

NOTE: the current implementation does not allow a concurrent CPU
migration and CGroups change.

sched/tune: fix accounting for runnable tasks (4/5)

This fixes accounting for exiting tasks by adding a dedicated call
early in the do_exit() syscall, which disables SchedTune accounting
as soon as a task is flagged PF_EXITING.

This flag is set before the multiple dequeue/enqueue dance triggered by
cgroup_exit() which is useful only to inject useless tasks movements
thus increasing possibilities for race conditions with the migration
code.

The schedtune_exit_task() call does the last dequeue of a task from its
current boost group. This is a solution more aligned with what happens
in mainline kernels (>v4.4) where the exit_cgroup no longer moves a
dying task to the root control group.
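The enqueue/dequeue contract for exiting tasks can be modelled in miniature. A toy user-space sketch (all types and counters here are invented for illustration) of why the final dequeue must happen once, from the exit hook, while the regular paths ignore PF_EXITING tasks:

	#include <stdio.h>

	#define PF_EXITING 0x4

	struct task { unsigned int flags; int boost_group; };

	static int runnable[4];	/* RUNNABLE tasks per boost group, one CPU */

	static void st_enqueue(struct task *p)
	{
		if (p->flags & PF_EXITING)
			return;	/* the exit hook owns the final update */
		runnable[p->boost_group]++;
	}

	static void st_dequeue(struct task *p)
	{
		if (p->flags & PF_EXITING)
			return;	/* ditto: ignore the exit-path churn */
		runnable[p->boost_group]--;
	}

	static void st_exit(struct task *p)
	{
		/* called once, right after PF_EXITING is set */
		runnable[p->boost_group]--;
	}

	int main(void)
	{
		struct task t = { 0, 1 };

		st_enqueue(&t);		/* group 1 -> 1 */
		t.flags |= PF_EXITING;
		st_dequeue(&t);		/* ignored */
		st_enqueue(&t);		/* ignored */
		st_exit(&t);		/* group 1 -> 0 */
		printf("group 1 runnable: %d\n", runnable[1]);
		return 0;
	}

The counter ends balanced at zero even though the exit path re-enqueues and re-dequeues the dying task several times.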
sched/tune: fix accounting for runnable tasks (5/5)

To avoid accounting issues at startup, this patch disables the
SchedTune accounting until the required data structures have been
properly initialized.

Signed-off-by: Patrick Bellasi
[jstultz: fwdported to 4.4]
Signed-off-by: John Stultz
---
 kernel/exit.c        |   5 ++
 kernel/sched/core.c  |  12 +++
 kernel/sched/fair.c  |  11 ++-
 kernel/sched/sched.h |   3 +
 kernel/sched/tune.c  | 182 ++++++++++++++++++++++++++++++++++++++-----
 kernel/sched/tune.h  |   6 ++
 6 files changed, 195 insertions(+), 24 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index ffba5df4abd5..62c4bd4abd3a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -54,6 +54,8 @@
 #include
 #include
 
+#include "sched/tune.h"
+
 #include
 #include
 #include
@@ -699,6 +701,9 @@ void do_exit(long code)
 	}
 
 	exit_signals(tsk);  /* sets PF_EXITING */
+
+	schedtune_exit_task(tsk);
+
 	/*
 	 * tsk->flags are checked in the futex code to protect against
 	 * an exiting task cleaning up the robust pi futexes.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 07f389a0f22f..b814c13f850f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -287,6 +287,18 @@ int sysctl_sched_rt_runtime = 950000;
 /* cpus with isolated domains */
 cpumask_var_t cpu_isolated_map;
 
+struct rq *
+lock_rq_of(struct task_struct *p, unsigned long *flags)
+{
+	return task_rq_lock(p, flags);
+}
+
+void
+unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags)
+{
+	task_rq_unlock(rq, p, flags);
+}
+
 /*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9c717c3be75d..736adab1a503 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4250,8 +4250,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		    cpu_overutilized(rq->cpu))
 			rq->rd->overutilized = true;
 
-		schedtune_enqueue_task(p, cpu_of(rq));
-
 		/*
 		 * We want to potentially trigger a freq switch
 		 * request only for tasks that are waking up; this is
@@ -4262,6 +4260,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (task_new || task_wakeup)
 			update_capacity_of(cpu_of(rq));
 	}
+
+	/* Update SchedTune accounting */
+	schedtune_enqueue_task(p, cpu_of(rq));
+
 #endif /* CONFIG_SMP */
 
 	hrtick_update(rq);
@@ -4327,7 +4329,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 	if (!se) {
-		schedtune_dequeue_task(p, cpu_of(rq));
 
 		/*
 		 * We want to potentially trigger a freq switch
@@ -4345,6 +4346,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		}
 	}
+
+	/* Update SchedTune accounting */
+	schedtune_dequeue_task(p, cpu_of(rq));
+
 #endif /* CONFIG_SMP */
 
 	hrtick_update(rq);
@@ -5625,7 +5629,6 @@ static inline int find_best_target(struct task_struct *p)
 		 * The target CPU can be already at a capacity level higher
 		 * than the one required to boost the task.
*/ - if (new_util > capacity_orig_of(i)) continue; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5cd947923e11..1b838cff2f20 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1701,6 +1701,9 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } +extern struct rq *lock_rq_of(struct task_struct *p, unsigned long *flags); +extern void unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags); + #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index a691b8db2888..4c77cc23e65b 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -11,6 +11,10 @@ #include "sched.h" #include "tune.h" +#ifdef CONFIG_CGROUP_SCHEDTUNE +static bool schedtune_initialized = false; +#endif + unsigned int sysctl_sched_cfs_boost __read_mostly; extern struct target_nrg schedtune_target_nrg; @@ -222,6 +226,8 @@ struct boost_groups { /* Count of RUNNABLE tasks on that boost group */ unsigned tasks; } group[BOOSTGROUPS_COUNT]; + /* CPU's boost group locking */ + raw_spinlock_t lock; }; /* Boost groups affecting each CPU in the system */ @@ -298,28 +304,24 @@ schedtune_boostgroup_update(int idx, int boost) return 0; } +#define ENQUEUE_TASK 1 +#define DEQUEUE_TASK -1 + static inline void schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) { - struct boost_groups *bg; - int tasks; - - bg = &per_cpu(cpu_boost_groups, cpu); + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + int tasks = bg->group[idx].tasks + task_count; /* Update boosted tasks count while avoiding to make it negative */ - if (task_count < 0 && bg->group[idx].tasks <= -task_count) - bg->group[idx].tasks = 0; - else - bg->group[idx].tasks += task_count; - - /* Boost group activation or deactivation on that RQ */ - tasks = bg->group[idx].tasks; - if (tasks == 1 || tasks == 0) - schedtune_cpu_update(cpu); + bg->group[idx].tasks = max(0, tasks); trace_sched_tune_tasks_update(p, cpu, tasks, idx, bg->group[idx].boost, bg->boost_max); + /* Boost group activation or deactivation on that RQ */ + if (tasks == 1 || tasks == 0) + schedtune_cpu_update(cpu); } /* @@ -327,9 +329,14 @@ schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) */ void schedtune_enqueue_task(struct task_struct *p, int cpu) { + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + unsigned long irq_flags; struct schedtune *st; int idx; + if (!unlikely(schedtune_initialized)) + return; + /* * When a task is marked PF_EXITING by do_exit() it's going to be * dequeued and enqueued multiple times in the exit path. @@ -339,13 +346,109 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu) if (p->flags & PF_EXITING) return; - /* Get task boost group */ + /* + * Boost group accouting is protected by a per-cpu lock and requires + * interrupt to be disabled to avoid race conditions for example on + * do_exit()::cgroup_exit() and task migration. 
+	 */
+	raw_spin_lock_irqsave(&bg->lock, irq_flags);
 	rcu_read_lock();
+
 	st = task_schedtune(p);
 	idx = st->idx;
-	rcu_read_unlock();
 
-	schedtune_tasks_update(p, cpu, idx, 1);
+	schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);
+
+	rcu_read_unlock();
+	raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+int schedtune_allow_attach(struct cgroup_taskset *tset)
+{
+	/* We always allow tasks to be moved between existing CGroups */
+	return 0;
+}
+
+int schedtune_can_attach(struct cgroup_taskset *tset)
+{
+	struct task_struct *task;
+	struct cgroup_subsys_state *css;
+	struct boost_groups *bg;
+	unsigned long irq_flags;
+	unsigned int cpu;
+	struct rq *rq;
+	int src_bg; /* Source boost group index */
+	int dst_bg; /* Destination boost group index */
+	int tasks;
+
+	if (!unlikely(schedtune_initialized))
+		return 0;
+
+	cgroup_taskset_for_each(task, css, tset) {
+
+		/*
+		 * Lock the CPU's RQ the task is enqueued to avoid race
+		 * conditions with migration code while the task is being
+		 * accounted
+		 */
+		rq = lock_rq_of(task, &irq_flags);
+
+		if (!task->on_rq) {
+			unlock_rq_of(rq, task, &irq_flags);
+			continue;
+		}
+
+		/*
+		 * Boost group accounting is protected by a per-cpu lock and requires
+		 * interrupt to be disabled to avoid race conditions on...
+		 */
+		cpu = cpu_of(rq);
+		bg = &per_cpu(cpu_boost_groups, cpu);
+		raw_spin_lock(&bg->lock);
+
+		dst_bg = css_st(css)->idx;
+		src_bg = task_schedtune(task)->idx;
+
+		/*
+		 * Current task is not changing boostgroup, which can
+		 * happen when the new hierarchy is in use.
+		 */
+		if (unlikely(dst_bg == src_bg)) {
+			raw_spin_unlock(&bg->lock);
+			unlock_rq_of(rq, task, &irq_flags);
+			continue;
+		}
+
+		/*
+		 * This is the case of a RUNNABLE task which is switching its
+		 * current boost group.
+		 */
+
+		/* Move task from src to dst boost group */
+		tasks = bg->group[src_bg].tasks - 1;
+		bg->group[src_bg].tasks = max(0, tasks);
+		bg->group[dst_bg].tasks += 1;
+
+		raw_spin_unlock(&bg->lock);
+		unlock_rq_of(rq, task, &irq_flags);
+
+		/* Update CPU boost group */
+		if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
+			schedtune_cpu_update(task_cpu(task));
+
+	}
+
+	return 0;
+}
+
+void schedtune_cancel_attach(struct cgroup_taskset *tset)
+{
+	/* This can happen only if SchedTune controller is mounted with
+	 * other hierarchies and one of them fails. Since usually SchedTune is
+	 * mounted on its own hierarchy, for the time being we do not implement
+	 * a proper rollback mechanism */
+	WARN(1, "SchedTune cancel attach not implemented");
+}
 
 /*
@@ -353,26 +456,62 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu)
  */
 void schedtune_dequeue_task(struct task_struct *p, int cpu)
 {
+	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+	unsigned long irq_flags;
 	struct schedtune *st;
 	int idx;
 
+	if (!unlikely(schedtune_initialized))
+		return;
+
 	/*
 	 * When a task is marked PF_EXITING by do_exit() it's going to be
 	 * dequeued and enqueued multiple times in the exit path.
 	 * Thus we avoid any further update, since we do not want to change
 	 * CPU boosting while the task is exiting.
-	 * The last dequeue will be done by cgroup exit() callback.
+	 * The last dequeue is already enforced by the do_exit() code path
+	 * via schedtune_exit_task().
 	 */
 	if (p->flags & PF_EXITING)
 		return;
 
-	/* Get task boost group */
+	/*
+	 * Boost group accounting is protected by a per-cpu lock and requires
+	 * interrupt to be disabled to avoid race conditions on...
+ */ + raw_spin_lock_irqsave(&bg->lock, irq_flags); rcu_read_lock(); + st = task_schedtune(p); idx = st->idx; - rcu_read_unlock(); - schedtune_tasks_update(p, cpu, idx, -1); + schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK); + + rcu_read_unlock(); + raw_spin_unlock_irqrestore(&bg->lock, irq_flags); +} + +void schedtune_exit_task(struct task_struct *tsk) +{ + struct schedtune *st; + unsigned long irq_flags; + unsigned int cpu; + struct rq *rq; + int idx; + + if (!unlikely(schedtune_initialized)) + return; + + rq = lock_rq_of(tsk, &irq_flags); + rcu_read_lock(); + + cpu = cpu_of(rq); + st = task_schedtune(tsk); + idx = st->idx; + schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK); + + rcu_read_unlock(); + unlock_rq_of(rq, tsk, &irq_flags); } int schedtune_cpu_boost(int cpu) @@ -518,6 +657,9 @@ schedtune_css_free(struct cgroup_subsys_state *css) struct cgroup_subsys schedtune_cgrp_subsys = { .css_alloc = schedtune_css_alloc, .css_free = schedtune_css_free, +// .allow_attach = schedtune_allow_attach, + .can_attach = schedtune_can_attach, + .cancel_attach = schedtune_cancel_attach, .legacy_cftypes = files, .early_init = 1, }; diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index 99637758a8af..be1785eb1c5b 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -17,6 +17,8 @@ struct target_nrg { int schedtune_cpu_boost(int cpu); int schedtune_task_boost(struct task_struct *tsk); +void schedtune_exit_task(struct task_struct *tsk); + void schedtune_enqueue_task(struct task_struct *p, int cpu); void schedtune_dequeue_task(struct task_struct *p, int cpu); @@ -25,6 +27,8 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu); #define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost() #define schedtune_task_boost(tsk) get_sysctl_sched_cfs_boost() +#define schedtune_exit_task(task) do { } while (0) + #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) @@ -39,6 +43,8 @@ int schedtune_accept_deltas(int nrg_delta, int cap_delta, #define schedtune_cpu_boost(cpu) 0 #define schedtune_task_boost(tsk) 0 +#define schedtune_exit_task(task) do { } while (0) + #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) From 554ece8cfaa52de25955794f2c1a5467209c591c Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 29 Jul 2016 14:41:25 +0100 Subject: [PATCH 542/813] sched/fair: optimize idle cpu selection for boosted tasks find_best_target CPU selection is biased towards lower CPU IDs. Bias towards higher CPUs for boosted tasks. For boosted tasks unconditionally use the idle CPU returned by find_best_target. 
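The bias is implemented purely by the scan order. A self-contained sketch of the index arithmetic, with NR_CPUS shrunk to 4 for illustration:

	#include <stdio.h>

	#define NR_CPUS 4

	int main(void)
	{
		for (int boosted = 0; boosted <= 1; boosted++) {
			printf("boosted=%d:", boosted);
			for (int iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
				/* favor higher cpus for boosted tasks */
				int i = boosted ? NR_CPUS - iter_cpu - 1 : iter_cpu;
				printf(" cpu%d", i);
			}
			printf("\n");
		}
		return 0;
	}

A non-boosted task scans cpu0..cpu3 while a boosted one scans cpu3..cpu0, so the higher-numbered (typically bigger) CPUs are tried first when latency matters.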
BUG: 29512132 Change-Id: I3d650051752163fcf3dc7909751d1fde3f9d17c0 Conflicts: kernel/sched/fair.c --- kernel/sched/fair.c | 70 +++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 736adab1a503..84f5e12c8e12 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5590,32 +5590,30 @@ done: return target; } -static inline int find_best_target(struct task_struct *p) +static inline int find_best_target(struct task_struct *p, bool boosted) { - int i, boosted; + int iter_cpu; int target_cpu = -1; int target_capacity = 0; int backup_capacity = 0; - int idle_cpu = -1; + int best_idle_cpu = -1; int best_idle_cstate = INT_MAX; int backup_cpu = -1; unsigned long task_util_boosted, new_util; - /* - * Favor 1) busy cpu with most capacity at current OPP - * 2) idle_cpu with capacity at current OPP - * 3) busy cpu with capacity at higher OPP - */ -#ifdef CONFIG_CGROUP_SCHEDTUNE - boosted = schedtune_task_boost(p); -#else - boosted = 0; -#endif task_util_boosted = boosted_task_util(p); - for_each_cpu(i, tsk_cpus_allowed(p)) { - int cur_capacity = capacity_curr_of(i); - struct rq *rq = cpu_rq(i); - int idle_idx = idle_get_state_idx(rq); + for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) { + int cur_capacity; + struct rq *rq; + int idle_idx; + + /* + * favor higher cpus for boosted tasks + */ + int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu; + + if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p))) + continue; /* * p's blocked utilization is still accounted for on prev_cpu @@ -5636,46 +5634,43 @@ static inline int find_best_target(struct task_struct *p) * For boosted tasks we favor idle cpus unconditionally to * improve latency. */ - if (idle_idx >= 0 && boosted) { - if (idle_cpu < 0 || - (sysctl_sched_cstate_aware && - best_idle_cstate > idle_idx)) { - best_idle_cstate = idle_idx; - idle_cpu = i; - } + if (idle_cpu(i) && boosted) { + if (best_idle_cpu < 0) + best_idle_cpu = i; continue; } + cur_capacity = capacity_curr_of(i); + rq = cpu_rq(i); + idle_idx = idle_get_state_idx(rq); + if (new_util < cur_capacity) { if (cpu_rq(i)->nr_running) { if (target_capacity == 0 || target_capacity > cur_capacity) { - /* busy CPU with most capacity at current OPP */ target_cpu = i; target_capacity = cur_capacity; } } else if (!boosted) { - if (idle_cpu < 0 || + if (best_idle_cpu < 0 || (sysctl_sched_cstate_aware && best_idle_cstate > idle_idx)) { best_idle_cstate = idle_idx; - idle_cpu = i; + best_idle_cpu = i; } } } else if (backup_capacity == 0 || backup_capacity > cur_capacity) { - /* first busy CPU with capacity at higher OPP */ backup_capacity = cur_capacity; backup_cpu = i; } } - if (!boosted && target_cpu < 0) { - target_cpu = idle_cpu >= 0 ? idle_cpu : backup_cpu; - } + if (boosted && best_idle_cpu >= 0) + target_cpu = best_idle_cpu; + else if (target_cpu < 0) + target_cpu = best_idle_cpu >= 0 ? 
best_idle_cpu : backup_cpu;
 
-	if (boosted && idle_cpu >= 0)
-		target_cpu = idle_cpu;
 	return target_cpu;
 }
 
@@ -5761,9 +5756,16 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
 		/*
 		 * Find a cpu with sufficient capacity
 		 */
-		int tmp_target = find_best_target(p);
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+		bool boosted = schedtune_task_boost(p) > 0;
+#else
+		bool boosted = 0;
+#endif
+		int tmp_target = find_best_target(p, boosted);
 
 		if (tmp_target >= 0)
 			target_cpu = tmp_target;
+		if (boosted && idle_cpu(target_cpu))
+			return target_cpu;
 	}
 
 	if (target_cpu != task_cpu(p)) {

From 3a7e62318271ddee0ba042becec9e2312ecd4927 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Fri, 29 Jul 2016 15:32:26 +0100
Subject: [PATCH 543/813] sched/tune: fix PB and PC cuts indexes definition

The current definition of the Performance Boost (PB) and Performance
Constraint (PC) regions has two main issues:
1) in the computation of the boost index we overflow the
   thresholds_gains table for boost=100
2) the two cuts did _NOT_ have the same ratio

The last point means that when boost=0 we do _not_ have a "standard"
EAS behaviour, i.e. accepting all candidates which decrease energy
regardless of their impact on performances. Instead, we accept only
schedule candidates which are in the Optimal region, i.e. decrease
energy while increasing performances.

This behaviour can have a negative impact also on CPU selection
policies which try to spread tasks to reduce latencies. Indeed, for
example we could end up rejecting a schedule candidate which wants to
move a task from a congested CPU to an idle one, specifically in the
case where the target CPU would be running at a lower OPP.

This patch fixes these two issues by properly clamping the boost value
in the appropriate range to compute the threshold indexes as well as by
using the same threshold index for both cuts.

Signed-off-by: Patrick Bellasi
Signed-off-by: Srinath Sridharan

sched/tune: fix update of threshold index for boost groups

When SchedTune is configured to work with CGroup mode, each time we
update the boost value of a group we do not update the threshold
indexes for the definition of the Performance Boost (PB) and
Performance Constraint (PC) regions. This means that while the OPP
boosting and CPU biasing selection is working as expected, the
__schedtune_accept_deltas function is always using the initial values
for these cuts.

This patch ensures that each time a new boost value is configured for a
boost group, the cuts for the PB and PC regions are properly updated
too.

Signed-off-by: Patrick Bellasi
Signed-off-by: Srinath Sridharan

sched/tune: update PC and PB cuts definition

The current definition of the Performance Boost (PB) and Performance
Constraint (PC) cuts defines two "dead regions":
- up to 20% boost: we are in energy-reduction only mode, i.e. accept
  all candidates which reduce energy
- over 70% boost: we are in performance-increase only mode, i.e. accept
  only sched candidates which do not reduce performances

This patch uses a finer grained configuration where these two "dead
regions" are reduced to: up to 10% and over 90%.

This should allow some boosting benefits starting from 10% boost
values, while not being too permissive starting from boost values of
80%.
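The clamping is easy to verify in isolation. A small user-space sketch of the index computation (clamp() replaced by a local helper; the sample boost values are invented):

	#include <stdio.h>

	static int clampi(int v, int lo, int hi)
	{
		return v < lo ? lo : (v > hi ? hi : v);
	}

	int main(void)
	{
		int samples[] = { -100, 0, 9, 10, 99, 100 };

		for (unsigned int i = 0; i < sizeof(samples) / sizeof(*samples); i++)
			printf("boost %4d%% -> threshold_idx %d\n",
			       samples[i], clampi(samples[i], 0, 99) / 10);
		return 0;
	}

boost=100 now maps to index 9, the last threshold_gains entry, instead of overflowing to index 10; negative boosts share index 0.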
Suggested-by: Leo Yan
Signed-off-by: Patrick Bellasi
Signed-off-by: Srinath Sridharan

bug: 28312446
Change-Id: Ia326c66521e38c98e7a7eddbbb7c437875efa1ba
Signed-off-by: Patrick Bellasi
---
 kernel/sched/tune.c | 58 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 41 insertions(+), 17 deletions(-)

diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index 4c77cc23e65b..d24f365b0c90 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -38,16 +38,16 @@ struct threshold_params {
 */
static struct threshold_params
threshold_gains[] = {
-	{ 0, 4 }, /* >=  0% */
-	{ 0, 4 }, /* >= 10% */
-	{ 1, 4 }, /* >= 20% */
-	{ 2, 4 }, /* >= 30% */
-	{ 3, 4 }, /* >= 40% */
-	{ 4, 3 }, /* >= 50% */
-	{ 4, 2 }, /* >= 60% */
-	{ 4, 1 }, /* >= 70% */
-	{ 4, 0 }, /* >= 80% */
-	{ 4, 0 }  /* >= 90% */
+	{ 0, 5 }, /*   < 10% */
+	{ 1, 5 }, /*   < 20% */
+	{ 2, 5 }, /*   < 30% */
+	{ 3, 5 }, /*   < 40% */
+	{ 4, 5 }, /*   < 50% */
+	{ 5, 4 }, /*   < 60% */
+	{ 5, 3 }, /*   < 70% */
+	{ 5, 2 }, /*   < 80% */
+	{ 5, 1 }, /*   < 90% */
+	{ 5, 0 }  /* <= 100% */
};
 
static int
@@ -549,13 +549,29 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
	    s64 boost)
{
	struct schedtune *st = css_st(css);
+	unsigned threshold_idx;
+	int boost_pct;
 
	if (boost < -100 || boost > 100)
		return -EINVAL;
+	boost_pct = boost;
+
+	/*
+	 * Update threshold params for Performance Boost (B)
+	 * and Performance Constraint (C) regions.
+	 * The current implementation uses the same cuts for both
+	 * B and C regions.
+	 */
+	threshold_idx = clamp(boost_pct, 0, 99) / 10;
+	st->perf_boost_idx = threshold_idx;
+	st->perf_constrain_idx = threshold_idx;
 
	st->boost = boost;
-	if (css == &root_schedtune.css)
+	if (css == &root_schedtune.css) {
		sysctl_sched_cfs_boost = boost;
+		perf_boost_idx = threshold_idx;
+		perf_constrain_idx = threshold_idx;
+	}
 
	/* Update CPU boost */
	schedtune_boostgroup_update(st->idx, st->boost);
@@ -710,17 +726,25 @@ sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
			       loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	unsigned threshold_idx;
+	int boost_pct;
 
	if (ret || !write)
		return ret;
 
-	/* Performance Boost (B) region threshold params */
-	perf_boost_idx  = sysctl_sched_cfs_boost;
-	perf_boost_idx /= 10;
+	if (sysctl_sched_cfs_boost < -100 || sysctl_sched_cfs_boost > 100)
+		return -EINVAL;
+	boost_pct = sysctl_sched_cfs_boost;
 
-	/* Performance Constraint (C) region threshold params */
-	perf_constrain_idx  = 100 - sysctl_sched_cfs_boost;
-	perf_constrain_idx /= 10;
+	/*
+	 * Update threshold params for Performance Boost (B)
+	 * and Performance Constraint (C) regions.
+	 * The current implementation uses the same cuts for both
+	 * B and C regions.
+	 */
+	threshold_idx = clamp(boost_pct, 0, 99) / 10;
+	perf_boost_idx = threshold_idx;
+	perf_constrain_idx = threshold_idx;
 
	return 0;
}

From b41fa2aec51a031e8b53486966e885116c314579 Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri
Date: Tue, 31 May 2016 09:08:38 -0700
Subject: [PATCH 544/813] sched: Introduce Window Assisted Load Tracking (WALT)

Use a window based view of time in order to track task demand and CPU
utilization in the scheduler.
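A window based view of time means each CPU's window start is periodically rolled forward by whole window lengths. A user-space sketch of that bookkeeping, using the 20ms default window and invented timestamps:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		const uint64_t window = 20000000ULL;	/* 20ms, in ns */
		uint64_t window_start = 0;
		uint64_t wallclock = 73000000ULL;	/* invented timestamp */

		uint64_t delta = wallclock - window_start;
		if (delta >= window) {
			uint64_t nr_windows = delta / window;	/* 3 full windows */
			window_start += nr_windows * window;	/* -> 60000000 */
		}
		printf("window_start = %llu ns\n",
		       (unsigned long long)window_start);
		return 0;
	}

Task runtime is then accumulated between mark_start and wallclock and attributed to the current and previous windows, as the patch below does in update_cpu_busy_time().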
Window Assisted Load Tracking (WALT) implementation credits: Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park, Pavan Kumar Kondeti, Olav Haugan 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla and Todd Kjos Change-Id: I21408236836625d4e7d7de1843d20ed5ff36c708 Includes fixes for issues: eas/walt: Use walt_ktime_clock() instead of ktime_get_ns() to avoid a race resulting in watchdog resets BUG: 29353986 Change-Id: Ic1820e22a136f7c7ebd6f42e15f14d470f6bbbdb Handle walt accounting anomoly during resume During resume, there is a corner case where on wakeup, a task's prev_runnable_sum can go negative. This is a workaround that fixes the condition and warns (instead of crashing). BUG: 29464099 Change-Id: I173e7874324b31a3584435530281708145773508 Signed-off-by: Todd Kjos Signed-off-by: Srinath Sridharan Signed-off-by: Juri Lelli [jstultz: fwdported to 4.4] Signed-off-by: John Stultz --- include/linux/sched.h | 53 ++ include/linux/sched/sysctl.h | 5 + include/trace/events/sched.h | 149 +++++ init/Kconfig | 9 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 43 +- kernel/sched/fair.c | 20 + kernel/sched/rt.c | 4 + kernel/sched/sched.h | 34 ++ kernel/sched/stop_task.c | 3 + kernel/sched/walt.c | 1098 ++++++++++++++++++++++++++++++++++ kernel/sched/walt.h | 57 ++ kernel/sysctl.c | 23 + 13 files changed, 1498 insertions(+), 1 deletion(-) create mode 100644 kernel/sched/walt.c create mode 100644 kernel/sched/walt.h diff --git a/include/linux/sched.h b/include/linux/sched.h index f1a28bafe7ea..ede29e8db82d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -317,6 +317,15 @@ extern char ___assert_task_state[1 - 2*!!( /* Task command name length */ #define TASK_COMM_LEN 16 +enum task_event { + PUT_PREV_TASK = 0, + PICK_NEXT_TASK = 1, + TASK_WAKE = 2, + TASK_MIGRATE = 3, + TASK_UPDATE = 4, + IRQ_UPDATE = 5, +}; + #include /* @@ -1276,6 +1285,41 @@ struct sched_statistics { }; #endif +#ifdef CONFIG_SCHED_WALT +#define RAVG_HIST_SIZE_MAX 5 + +/* ravg represents frequency scaled cpu-demand of tasks */ +struct ravg { + /* + * 'mark_start' marks the beginning of an event (task waking up, task + * starting to execute, task being preempted) within a window + * + * 'sum' represents how runnable a task has been within current + * window. It incorporates both running time and wait time and is + * frequency scaled. + * + * 'sum_history' keeps track of history of 'sum' seen over previous + * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are + * ignored. + * + * 'demand' represents maximum sum seen over previous + * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency + * demand for tasks. 
+ * + * 'curr_window' represents task's contribution to cpu busy time + * statistics (rq->curr_runnable_sum) in current window + * + * 'prev_window' represents task's contribution to cpu busy time + * statistics (rq->prev_runnable_sum) in previous window + */ + u64 mark_start; + u32 sum, demand; + u32 sum_history[RAVG_HIST_SIZE_MAX]; + u32 curr_window, prev_window; + u16 active_windows; +}; +#endif + struct sched_entity { struct load_weight load; /* for load-balancing */ struct rb_node run_node; @@ -1433,6 +1477,15 @@ struct task_struct { const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; +#ifdef CONFIG_SCHED_WALT + struct ravg ravg; + /* + * 'init_load_pct' represents the initial task load assigned to children + * of this task + */ + u32 init_load_pct; +#endif + #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 2834841c507e..710f58a28d63 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -43,6 +43,11 @@ extern unsigned int sysctl_sched_is_big_little; extern unsigned int sysctl_sched_sync_hint_enable; extern unsigned int sysctl_sched_initial_task_util; extern unsigned int sysctl_sched_cstate_aware; +#ifdef CONFIG_SCHED_WALT +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int sysctl_sched_use_walt_task_util; +extern unsigned int sysctl_sched_walt_init_task_load_pct; +#endif enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index debcf417c535..fa1b3df836bc 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -937,6 +937,155 @@ TRACE_EVENT(sched_tune_filter, __entry->payoff, __entry->region) ); +#ifdef CONFIG_SCHED_WALT +struct rq; + +TRACE_EVENT(walt_update_task_ravg, + + TP_PROTO(struct task_struct *p, struct rq *rq, int evt, + u64 wallclock, u64 irqtime), + + TP_ARGS(p, rq, evt, wallclock, irqtime), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( pid_t, cur_pid ) + __field(unsigned int, cur_freq ) + __field( u64, wallclock ) + __field( u64, mark_start ) + __field( u64, delta_m ) + __field( u64, win_start ) + __field( u64, delta ) + __field( u64, irqtime ) + __field( int, evt ) + __field(unsigned int, demand ) + __field(unsigned int, sum ) + __field( int, cpu ) + __field( u64, cs ) + __field( u64, ps ) + __field( u32, curr_window ) + __field( u32, prev_window ) + __field( u64, nt_cs ) + __field( u64, nt_ps ) + __field( u32, active_windows ) + ), + + TP_fast_assign( + __entry->wallclock = wallclock; + __entry->win_start = rq->window_start; + __entry->delta = (wallclock - rq->window_start); + __entry->evt = evt; + __entry->cpu = rq->cpu; + __entry->cur_pid = rq->curr->pid; + __entry->cur_freq = rq->cur_freq; + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->mark_start = p->ravg.mark_start; + __entry->delta_m = (wallclock - p->ravg.mark_start); + __entry->demand = p->ravg.demand; + __entry->sum = p->ravg.sum; + __entry->irqtime = irqtime; + __entry->cs = rq->curr_runnable_sum; + __entry->ps = rq->prev_runnable_sum; + __entry->curr_window = p->ravg.curr_window; + __entry->prev_window = p->ravg.prev_window; + __entry->nt_cs = rq->nt_curr_runnable_sum; + __entry->nt_ps = rq->nt_prev_runnable_sum; + __entry->active_windows = p->ravg.active_windows; + ), + + TP_printk("wc %llu ws %llu delta %llu event %d cpu %d cur_freq %u 
cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu" + " cs %llu ps %llu cur_window %u prev_window %u nt_cs %llu nt_ps %llu active_wins %u" + , __entry->wallclock, __entry->win_start, __entry->delta, + __entry->evt, __entry->cpu, + __entry->cur_freq, __entry->cur_pid, + __entry->pid, __entry->comm, __entry->mark_start, + __entry->delta_m, __entry->demand, + __entry->sum, __entry->irqtime, + __entry->cs, __entry->ps, + __entry->curr_window, __entry->prev_window, + __entry->nt_cs, __entry->nt_ps, + __entry->active_windows + ) +); + +TRACE_EVENT(walt_update_history, + + TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples, + int evt), + + TP_ARGS(rq, p, runtime, samples, evt), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field(unsigned int, runtime ) + __field( int, samples ) + __field( int, evt ) + __field( u64, demand ) + __field(unsigned int, walt_avg ) + __field(unsigned int, pelt_avg ) + __array( u32, hist, RAVG_HIST_SIZE_MAX) + __field( int, cpu ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->runtime = runtime; + __entry->samples = samples; + __entry->evt = evt; + __entry->demand = p->ravg.demand; + __entry->walt_avg = (__entry->demand << 10) / walt_ravg_window, + __entry->pelt_avg = p->se.avg.util_avg; + memcpy(__entry->hist, p->ravg.sum_history, + RAVG_HIST_SIZE_MAX * sizeof(u32)); + __entry->cpu = rq->cpu; + ), + + TP_printk("%d (%s): runtime %u samples %d event %d demand %llu" + " walt %u pelt %u (hist: %u %u %u %u %u) cpu %d", + __entry->pid, __entry->comm, + __entry->runtime, __entry->samples, __entry->evt, + __entry->demand, + __entry->walt_avg, + __entry->pelt_avg, + __entry->hist[0], __entry->hist[1], + __entry->hist[2], __entry->hist[3], + __entry->hist[4], __entry->cpu) +); + +TRACE_EVENT(walt_migration_update_sum, + + TP_PROTO(struct rq *rq, struct task_struct *p), + + TP_ARGS(rq, p), + + TP_STRUCT__entry( + __field(int, cpu ) + __field(int, pid ) + __field( u64, cs ) + __field( u64, ps ) + __field( s64, nt_cs ) + __field( s64, nt_ps ) + ), + + TP_fast_assign( + __entry->cpu = cpu_of(rq); + __entry->cs = rq->curr_runnable_sum; + __entry->ps = rq->prev_runnable_sum; + __entry->nt_cs = (s64)rq->nt_curr_runnable_sum; + __entry->nt_ps = (s64)rq->nt_prev_runnable_sum; + __entry->pid = p->pid; + ), + + TP_printk("cpu %d: cs %llu ps %llu nt_cs %lld nt_ps %lld pid %d", + __entry->cpu, __entry->cs, __entry->ps, + __entry->nt_cs, __entry->nt_ps, __entry->pid) +); +#endif /* CONFIG_SCHED_WALT */ + #endif /* CONFIG_SMP */ #endif /* _TRACE_SCHED_H */ diff --git a/init/Kconfig b/init/Kconfig index 71f3ce810734..e71e35cf723c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -392,6 +392,15 @@ config IRQ_TIME_ACCOUNTING endchoice +config SCHED_WALT + bool "Support window based load tracking" + depends on SMP + help + This feature will allow the scheduler to maintain a tunable window + based set of metrics for tasks and runqueues. These metrics can be + used to guide task placement as well as task frequency requirements + for cpufreq governors. 
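Consumers of these metrics select between PELT and the window statistics at runtime. A simplified user-space model of that selection (the names mirror the patch; the 1024-based capacity scale and the sample numbers are assumptions for illustration):

	#include <stdio.h>

	#define SCHED_LOAD_SHIFT 10	/* assumed 1024-based capacity scale */

	static unsigned int walt_disabled;
	static unsigned int sysctl_sched_use_walt_cpu_util = 1;
	static unsigned long long walt_ravg_window = 20000000ULL;

	static unsigned long cpu_util_model(unsigned long pelt_util,
					    unsigned long long prev_runnable_sum)
	{
		if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
			return (unsigned long)((prev_runnable_sum << SCHED_LOAD_SHIFT) /
					       walt_ravg_window);
		return pelt_util;
	}

	int main(void)
	{
		/* half of the last window busy -> ~512 of 1024 capacity units */
		printf("util = %lu\n", cpu_util_model(300, 10000000ULL));
		return 0;
	}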
+ config BSD_PROCESS_ACCT bool "BSD Process Accounting" depends on MULTIUSER diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 174762d8695b..623ce4bde0d5 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -15,6 +15,7 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o +obj-$(CONFIG_SCHED_WALT) += walt.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b814c13f850f..4c981dfc34ee 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -89,6 +89,7 @@ #define CREATE_TRACE_POINTS #include +#include "walt.h" DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -1085,7 +1086,9 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new dequeue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + double_lock_balance(rq, cpu_rq(new_cpu)); set_task_cpu(p, new_cpu); + double_unlock_balance(rq, cpu_rq(new_cpu)); raw_spin_unlock(&rq->lock); rq = cpu_rq(new_cpu); @@ -1309,6 +1312,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->sched_class->migrate_task_rq(p); p->se.nr_migrations++; perf_event_task_migrate(p); + + walt_fixup_busy_time(p, new_cpu); } __set_task_cpu(p, new_cpu); @@ -1937,6 +1942,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { unsigned long flags; int cpu, success = 0; +#ifdef CONFIG_SMP + struct rq *rq; + u64 wallclock; +#endif /* * If we are going to wake up a thread waiting for CONDITION we @@ -1994,6 +2003,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_rmb(); + rq = cpu_rq(task_cpu(p)); + + raw_spin_lock(&rq->lock); + wallclock = walt_ktime_clock(); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + raw_spin_unlock(&rq->lock); + p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; @@ -2001,10 +2018,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_class->task_waking(p); cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); + if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); } + #endif /* CONFIG_SMP */ ttwu_queue(p, cpu); @@ -2053,8 +2072,13 @@ static void try_to_wake_up_local(struct task_struct *p) trace_sched_waking(p); - if (!task_on_rq_queued(p)) + if (!task_on_rq_queued(p)) { + u64 wallclock = walt_ktime_clock(); + + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); ttwu_activate(rq, p, ENQUEUE_WAKEUP); + } ttwu_do_wakeup(rq, p, 0); ttwu_stat(p, smp_processor_id(), 0); @@ -2120,6 +2144,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.nr_migrations = 0; p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); + walt_init_new_task_load(p); #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); @@ -2387,6 +2412,9 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, flags); + + walt_init_new_task_load(p); + /* Initialize new task's runnable average */ init_entity_runnable_average(&p->se); #ifdef CONFIG_SMP @@ -2399,6 +2427,7 @@ void wake_up_new_task(struct task_struct *p) 
#endif rq = __task_rq_lock(p); + walt_mark_task_starting(p); activate_task(rq, p, ENQUEUE_WAKEUP_NEW); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); @@ -2948,9 +2977,12 @@ void scheduler_tick(void) sched_clock_tick(); raw_spin_lock(&rq->lock); + walt_set_window_start(rq); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, + walt_ktime_clock(), 0); calc_global_load_tick(rq); sched_freq_tick(cpu); raw_spin_unlock(&rq->lock); @@ -3189,6 +3221,7 @@ static void __sched notrace __schedule(bool preempt) unsigned long *switch_count; struct rq *rq; int cpu; + u64 wallclock; cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -3250,6 +3283,9 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); next = pick_next_task(rq, prev); + wallclock = walt_ktime_clock(); + walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); + walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->clock_skip_update = 0; @@ -5672,6 +5708,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: + raw_spin_lock_irqsave(&rq->lock, flags); + walt_set_window_start(rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); rq->calc_load_update = calc_load_update; account_reset_rq(rq); break; @@ -5692,6 +5731,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); + walt_migrate_sync_cpu(cpu); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); @@ -7536,6 +7576,7 @@ void __init sched_init_smp(void) { cpumask_var_t non_isolated_cpus; + walt_init_cpu_efficiency(); alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 84f5e12c8e12..15b8a8f34bd9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -30,11 +30,13 @@ #include #include #include +#include #include #include "sched.h" #include "tune.h" +#include "walt.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -56,6 +58,10 @@ unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_initial_task_util = 0; unsigned int sysctl_sched_cstate_aware = 1; +#ifdef CONFIG_SCHED_WALT +unsigned int sysctl_sched_use_walt_cpu_util = 1; +unsigned int sysctl_sched_use_walt_task_util = 1; +#endif /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -4225,6 +4231,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); flags = ENQUEUE_WAKEUP; } @@ -4232,6 +4239,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); if (cfs_rq_throttled(cfs_rq)) break; @@ -4246,6 +4254,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP if (!se) { + walt_inc_cumulative_runnable_avg(rq, p); if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; @@ -4295,6 +4304,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if 
(cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -4315,6 +4325,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); if (cfs_rq_throttled(cfs_rq)) break; @@ -4329,6 +4340,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP if (!se) { + walt_dec_cumulative_runnable_avg(rq, p); /* * We want to potentially trigger a freq switch @@ -5228,6 +5240,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) static inline unsigned long task_util(struct task_struct *p) { +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_task_util) { + unsigned long demand = p->ravg.demand; + return (demand << 10) / walt_ravg_window; + } +#endif return p->se.avg.util_avg; } @@ -6620,7 +6638,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env) deactivate_task(env->src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + double_lock_balance(env->src_rq, env->dst_rq); set_task_cpu(p, env->dst_cpu); + double_unlock_balance(env->src_rq, env->dst_rq); } /* diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 9694204660b7..be700bfa1ae4 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -8,6 +8,8 @@ #include #include +#include "walt.h" + int sched_rr_timeslice = RR_TIMESLICE; static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); @@ -1261,6 +1263,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) rt_se->timeout = 0; enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + walt_inc_cumulative_runnable_avg(rq, p); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -1272,6 +1275,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) update_curr_rt(rq); dequeue_rt_entity(rt_se); + walt_dec_cumulative_runnable_avg(rq, p); dequeue_pushable_task(rq, p); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1b838cff2f20..f48fb371913a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -410,6 +410,10 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ +#ifdef CONFIG_SCHED_WALT + u64 cumulative_runnable_avg; +#endif + #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; u64 runtime_expires; @@ -663,6 +667,27 @@ struct rq { u64 max_idle_balance_cost; #endif +#ifdef CONFIG_SCHED_WALT + /* + * max_freq = user or thermal defined maximum + * max_possible_freq = maximum supported by hardware + */ + unsigned int cur_freq, max_freq, min_freq, max_possible_freq; + struct cpumask freq_domain_cpumask; + + u64 cumulative_runnable_avg; + int efficiency; /* Differentiate cpus with different IPC capability */ + int load_scale_factor; + int capacity; + int max_possible_capacity; + u64 window_start; + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; +#endif /* CONFIG_SCHED_WALT */ + + #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; #endif @@ -1513,6 +1538,10 @@ static inline unsigned long capacity_orig_of(int cpu) return cpu_rq(cpu)->cpu_capacity_orig; } +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int walt_ravg_window; +extern unsigned int walt_disabled; + /* * 
cpu_util returns the amount of capacity of a CPU that is used by CFS * tasks. The unit of the return value must be the one of capacity so we can @@ -1544,6 +1573,11 @@ static inline unsigned long __cpu_util(int cpu, int delta) unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + util = (cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT) / + walt_ravg_window; +#endif delta += util; if (delta < 0) return 0; diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index cbc67da10954..61f852d46858 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -1,4 +1,5 @@ #include "sched.h" +#include "walt.h" /* * stop-task scheduling class. @@ -42,12 +43,14 @@ static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { add_nr_running(rq, 1); + walt_inc_cumulative_runnable_avg(rq, p); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { sub_nr_running(rq, 1); + walt_dec_cumulative_runnable_avg(rq, p); } static void yield_task_stop(struct rq *rq) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c new file mode 100644 index 000000000000..1dff3d2e2358 --- /dev/null +++ b/kernel/sched/walt.c @@ -0,0 +1,1098 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * Window Assisted Load Tracking (WALT) implementation credits: + * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park, + * Pavan Kumar Kondeti, Olav Haugan + * + * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla + * and Todd Kjos + */ + +#include +#include +#include +#include "sched.h" +#include "walt.h" + +#define WINDOW_STATS_RECENT 0 +#define WINDOW_STATS_MAX 1 +#define WINDOW_STATS_MAX_RECENT_AVG 2 +#define WINDOW_STATS_AVG 3 +#define WINDOW_STATS_INVALID_POLICY 4 + +#define EXITING_TASK_MARKER 0xdeaddead + +static __read_mostly unsigned int walt_ravg_hist_size = 5; +static __read_mostly unsigned int walt_window_stats_policy = + WINDOW_STATS_MAX_RECENT_AVG; +static __read_mostly unsigned int walt_account_wait_time = 1; +static __read_mostly unsigned int walt_freq_account_wait_time = 0; +static __read_mostly unsigned int walt_io_is_busy = 0; + +unsigned int sysctl_sched_walt_init_task_load_pct = 15; + +/* 1 -> use PELT based load stats, 0 -> use window-based load stats */ +unsigned int __read_mostly walt_disabled = 0; + +static unsigned int max_possible_efficiency = 1024; +static unsigned int min_possible_efficiency = 1024; + +/* + * Maximum possible frequency across all cpus. Task demand and cpu + * capacity (cpu_power) metrics are scaled in reference to it. + */ +static unsigned int max_possible_freq = 1; + +/* + * Minimum possible max_freq across all cpus. This will be same as + * max_possible_freq on homogeneous systems and could be different from + * max_possible_freq on heterogenous systems. min_max_freq is used to derive + * capacity (cpu_power) of cpus. 
+ */ +static unsigned int min_max_freq = 1; + +static unsigned int max_capacity = 1024; +static unsigned int min_capacity = 1024; +static unsigned int max_load_scale_factor = 1024; +static unsigned int max_possible_capacity = 1024; + +/* Mask of all CPUs that have max_possible_capacity */ +static cpumask_t mpc_mask = CPU_MASK_ALL; + +/* Window size (in ns) */ +__read_mostly unsigned int walt_ravg_window = 20000000; + +/* Min window size (in ns) = 10ms */ +#define MIN_SCHED_RAVG_WINDOW 10000000 + +/* Max window size (in ns) = 1s */ +#define MAX_SCHED_RAVG_WINDOW 1000000000 + +static unsigned int sync_cpu; +static ktime_t ktime_last; +static bool walt_ktime_suspended; + +static unsigned int task_load(struct task_struct *p) +{ + return p->ravg.demand; +} + +void +walt_inc_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) +{ + rq->cumulative_runnable_avg += p->ravg.demand; +} + +void +walt_dec_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) +{ + rq->cumulative_runnable_avg -= p->ravg.demand; + BUG_ON((s64)rq->cumulative_runnable_avg < 0); +} + +static void +fixup_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p, s64 task_load_delta) +{ + rq->cumulative_runnable_avg += task_load_delta; + if ((s64)rq->cumulative_runnable_avg < 0) + panic("cra less than zero: tld: %lld, task_load(p) = %u\n", + task_load_delta, task_load(p)); +} + +u64 walt_ktime_clock(void) +{ + if (unlikely(walt_ktime_suspended)) + return ktime_to_ns(ktime_last); + return ktime_get_ns(); +} + +static void walt_resume(void) +{ + walt_ktime_suspended = false; +} + +static int walt_suspend(void) +{ + ktime_last = ktime_get(); + walt_ktime_suspended = true; + return 0; +} + +static struct syscore_ops walt_syscore_ops = { + .resume = walt_resume, + .suspend = walt_suspend +}; + +static int __init walt_init_ops(void) +{ + register_syscore_ops(&walt_syscore_ops); + return 0; +} +late_initcall(walt_init_ops); + +void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + cfs_rq->cumulative_runnable_avg += p->ravg.demand; +} + +void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + cfs_rq->cumulative_runnable_avg -= p->ravg.demand; +} + +static int exiting_task(struct task_struct *p) +{ + if (p->flags & PF_EXITING) { + if (p->ravg.sum_history[0] != EXITING_TASK_MARKER) { + p->ravg.sum_history[0] = EXITING_TASK_MARKER; + } + return 1; + } + return 0; +} + +static int __init set_walt_ravg_window(char *str) +{ + get_option(&str, &walt_ravg_window); + + walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW || + walt_ravg_window > MAX_SCHED_RAVG_WINDOW); + return 0; +} + +early_param("walt_ravg_window", set_walt_ravg_window); + +static void +update_window_start(struct rq *rq, u64 wallclock) +{ + s64 delta; + int nr_windows; + + delta = wallclock - rq->window_start; + BUG_ON(delta < 0); + if (delta < walt_ravg_window) + return; + + nr_windows = div64_u64(delta, walt_ravg_window); + rq->window_start += (u64)nr_windows * (u64)walt_ravg_window; +} + +static u64 scale_exec_time(u64 delta, struct rq *rq) +{ + unsigned int cur_freq = rq->cur_freq; + int sf; + + if (unlikely(cur_freq > max_possible_freq)) + cur_freq = rq->max_possible_freq; + + /* round up div64 */ + delta = div64_u64(delta * cur_freq + max_possible_freq - 1, + max_possible_freq); + + sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency); + + delta *= sf; + delta >>= 10; + + return delta; +} + +static int cpu_is_waiting_on_io(struct rq *rq) +{ + if 
(!walt_io_is_busy) + return 0; + + return atomic_read(&rq->nr_iowait); +} + +static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, + u64 irqtime, int event) +{ + if (is_idle_task(p)) { + /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */ + if (event == PICK_NEXT_TASK) + return 0; + + /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ + return irqtime || cpu_is_waiting_on_io(rq); + } + + if (event == TASK_WAKE) + return 0; + + if (event == PUT_PREV_TASK || event == IRQ_UPDATE || + event == TASK_UPDATE) + return 1; + + /* Only TASK_MIGRATE && PICK_NEXT_TASK left */ + return walt_freq_account_wait_time; +} + +/* + * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) + */ +static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + int new_window, nr_full_windows = 0; + int p_is_curr_task = (p == rq->curr); + u64 mark_start = p->ravg.mark_start; + u64 window_start = rq->window_start; + u32 window_size = walt_ravg_window; + u64 delta; + + new_window = mark_start < window_start; + if (new_window) { + nr_full_windows = div64_u64((window_start - mark_start), + window_size); + if (p->ravg.active_windows < USHRT_MAX) + p->ravg.active_windows++; + } + + /* Handle per-task window rollover. We don't care about the idle + * task or exiting tasks. */ + if (new_window && !is_idle_task(p) && !exiting_task(p)) { + u32 curr_window = 0; + + if (!nr_full_windows) + curr_window = p->ravg.curr_window; + + p->ravg.prev_window = curr_window; + p->ravg.curr_window = 0; + } + + if (!account_busy_for_cpu_time(rq, p, irqtime, event)) { + /* account_busy_for_cpu_time() = 0, so no update to the + * task's current window needs to be made. This could be + * for example + * + * - a wakeup event on a task within the current + * window (!new_window below, no action required), + * - switching to a new task from idle (PICK_NEXT_TASK) + * in a new window where irqtime is 0 and we aren't + * waiting on IO */ + + if (!new_window) + return; + + /* A new window has started. The RQ demand must be rolled + * over if p is the current task. */ + if (p_is_curr_task) { + u64 prev_sum = 0; + + /* p is either idle task or an exiting task */ + if (!nr_full_windows) { + prev_sum = rq->curr_runnable_sum; + } + + rq->prev_runnable_sum = prev_sum; + rq->curr_runnable_sum = 0; + } + + return; + } + + if (!new_window) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. No rollover + * since we didn't start a new window. An example of this is + * when a task starts execution and then sleeps within the + * same window. */ + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) + delta = wallclock - mark_start; + else + delta = irqtime; + delta = scale_exec_time(delta, rq); + rq->curr_runnable_sum += delta; + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.curr_window += delta; + + return; + } + + if (!p_is_curr_task) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has also started, but p is not the current task, so the + * window is not rolled over - just split up and account + * as necessary into curr and prev. The window is only + * rolled over when a new window is processed for the current + * task. + * + * Irqtime can't be accounted by a task that isn't the + * currently running task. 
*/ + + if (!nr_full_windows) { + /* A full window hasn't elapsed, account partial + * contribution to previous completed window. */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!exiting_task(p)) + p->ravg.prev_window += delta; + } else { + /* Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). */ + delta = scale_exec_time(window_size, rq); + if (!exiting_task(p)) + p->ravg.prev_window = delta; + } + rq->prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + rq->curr_runnable_sum += delta; + if (!exiting_task(p)) + p->ravg.curr_window = delta; + + return; + } + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. If any of these three above conditions are true + * then this busy time can't be accounted as irqtime. + * + * Busy time for the idle task or exiting tasks need not + * be accounted. + * + * An example of this would be a task that starts execution + * and then sleeps once a new window has begun. */ + + if (!nr_full_windows) { + /* A full window hasn't elapsed, account partial + * contribution to previous completed window. */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.prev_window += delta; + + delta += rq->curr_runnable_sum; + } else { + /* Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). */ + delta = scale_exec_time(window_size, rq); + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.prev_window = delta; + + } + /* + * Rollover for normal runnable sum is done here by overwriting + * the values in prev_runnable_sum and curr_runnable_sum. + * Rollover for new task runnable sum has completed by previous + * if-else statement. + */ + rq->prev_runnable_sum = delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + rq->curr_runnable_sum = delta; + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.curr_window = delta; + + return; + } + + if (irqtime) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. The current task must be the idle task because + * irqtime is not accounted for any other task. + * + * Irqtime will be accounted each time we process IRQ activity + * after a period of idleness, so we know the IRQ busy time + * started at wallclock - irqtime. */ + + BUG_ON(!is_idle_task(p)); + mark_start = wallclock - irqtime; + + /* Roll window over. If IRQ busy time was just in the current + * window then that is all that need be accounted. */ + rq->prev_runnable_sum = rq->curr_runnable_sum; + if (mark_start > window_start) { + rq->curr_runnable_sum = scale_exec_time(irqtime, rq); + return; + } + + /* The IRQ busy time spanned multiple windows. Process the + * busy time preceding the current window start first. */ + delta = window_start - mark_start; + if (delta > window_size) + delta = window_size; + delta = scale_exec_time(delta, rq); + rq->prev_runnable_sum += delta; + + /* Process the remaining IRQ busy time in the current window. 
*/ + delta = wallclock - window_start; + rq->curr_runnable_sum = scale_exec_time(delta, rq); + + return; + } + + BUG(); +} + +static int account_busy_for_task_demand(struct task_struct *p, int event) +{ + /* No need to bother updating task demand for exiting tasks + * or the idle task. */ + if (exiting_task(p) || is_idle_task(p)) + return 0; + + /* When a task is waking up it is completing a segment of non-busy + * time. Likewise, if wait time is not treated as busy time, then + * when a task begins to run or is migrated, it is not running and + * is completing a segment of non-busy time. */ + if (event == TASK_WAKE || (!walt_account_wait_time && + (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) + return 0; + + return 1; +} + +/* + * Called when new window is starting for a task, to record cpu usage over + * recently concluded window(s). Normally 'samples' should be 1. It can be > 1 + * when, say, a real-time task runs without preemption for several windows at a + * stretch. + */ +static void update_history(struct rq *rq, struct task_struct *p, + u32 runtime, int samples, int event) +{ + u32 *hist = &p->ravg.sum_history[0]; + int ridx, widx; + u32 max = 0, avg, demand; + u64 sum = 0; + + /* Ignore windows where task had no activity */ + if (!runtime || is_idle_task(p) || exiting_task(p) || !samples) + goto done; + + /* Push new 'runtime' value onto stack */ + widx = walt_ravg_hist_size - 1; + ridx = widx - samples; + for (; ridx >= 0; --widx, --ridx) { + hist[widx] = hist[ridx]; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) { + hist[widx] = runtime; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + p->ravg.sum = 0; + + if (walt_window_stats_policy == WINDOW_STATS_RECENT) { + demand = runtime; + } else if (walt_window_stats_policy == WINDOW_STATS_MAX) { + demand = max; + } else { + avg = div64_u64(sum, walt_ravg_hist_size); + if (walt_window_stats_policy == WINDOW_STATS_AVG) + demand = avg; + else + demand = max(avg, runtime); + } + + /* + * A throttled deadline sched class task gets dequeued without + * changing p->on_rq. Since the dequeue decrements hmp stats + * avoid decrementing it here again. + */ + if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || + !p->dl.dl_throttled)) + fixup_cumulative_runnable_avg(rq, p, demand); + + p->ravg.demand = demand; + +done: + trace_walt_update_history(rq, p, runtime, samples, event); + return; +} + +static void add_to_task_demand(struct rq *rq, struct task_struct *p, + u64 delta) +{ + delta = scale_exec_time(delta, rq); + p->ravg.sum += delta; + if (unlikely(p->ravg.sum > walt_ravg_window)) + p->ravg.sum = walt_ravg_window; +} + +/* + * Account cpu demand of task and/or update task's cpu demand history + * + * ms = p->ravg.mark_start; + * wc = wallclock + * ws = rq->window_start + * + * Three possibilities: + * + * a) Task event is contained within one window. + * window_start < mark_start < wallclock + * + * ws ms wc + * | | | + * V V V + * |---------------| + * + * In this case, p->ravg.sum is updated *iff* event is appropriate + * (ex: event == PUT_PREV_TASK) + * + * b) Task event spans two windows. + * mark_start < window_start < wallclock + * + * ms ws wc + * | | | + * V V V + * -----|------------------- + * + * In this case, p->ravg.sum is updated with (ws - ms) *iff* event + * is appropriate, then a new window sample is recorded followed + * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate. 
+ * + * c) Task event spans more than two windows. + * + * ms ws_tmp ws wc + * | | | | + * V V V V + * ---|-------|-------|-------|-------|------ + * | | + * |<------ nr_full_windows ------>| + * + * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff* + * event is appropriate, window sample of p->ravg.sum is recorded, + * 'nr_full_window' samples of window_size is also recorded *iff* + * event is appropriate and finally p->ravg.sum is set to (wc - ws) + * *iff* event is appropriate. + * + * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time() + * depends on it! + */ +static void update_task_demand(struct task_struct *p, struct rq *rq, + int event, u64 wallclock) +{ + u64 mark_start = p->ravg.mark_start; + u64 delta, window_start = rq->window_start; + int new_window, nr_full_windows; + u32 window_size = walt_ravg_window; + + new_window = mark_start < window_start; + if (!account_busy_for_task_demand(p, event)) { + if (new_window) + /* If the time accounted isn't being accounted as + * busy time, and a new window started, only the + * previous window need be closed out with the + * pre-existing demand. Multiple windows may have + * elapsed, but since empty windows are dropped, + * it is not necessary to account those. */ + update_history(rq, p, p->ravg.sum, 1, event); + return; + } + + if (!new_window) { + /* The simple case - busy time contained within the existing + * window. */ + add_to_task_demand(rq, p, wallclock - mark_start); + return; + } + + /* Busy time spans at least two windows. Temporarily rewind + * window_start to first window boundary after mark_start. */ + delta = window_start - mark_start; + nr_full_windows = div64_u64(delta, window_size); + window_start -= (u64)nr_full_windows * (u64)window_size; + + /* Process (window_start - mark_start) first */ + add_to_task_demand(rq, p, window_start - mark_start); + + /* Push new sample(s) into task's demand history */ + update_history(rq, p, p->ravg.sum, 1, event); + if (nr_full_windows) + update_history(rq, p, scale_exec_time(window_size, rq), + nr_full_windows, event); + + /* Roll window_start back to current to process any remainder + * in current window. 
*/ + window_start += (u64)nr_full_windows * (u64)window_size; + + /* Process (wallclock - window_start) next */ + mark_start = window_start; + add_to_task_demand(rq, p, wallclock - mark_start); +} + +/* Reflect task activity on its demand and cpu's busy time statistics */ +void walt_update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + if (walt_disabled || !rq->window_start) + return; + + lockdep_assert_held(&rq->lock); + + update_window_start(rq, wallclock); + + if (!p->ravg.mark_start) + goto done; + + update_task_demand(p, rq, event, wallclock); + update_cpu_busy_time(p, rq, event, wallclock, irqtime); + +done: + trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime); + + p->ravg.mark_start = wallclock; +} + +unsigned long __weak arch_get_cpu_efficiency(int cpu) +{ + return SCHED_LOAD_SCALE; +} + +void walt_init_cpu_efficiency(void) +{ + int i, efficiency; + unsigned int max = 0, min = UINT_MAX; + + for_each_possible_cpu(i) { + efficiency = arch_get_cpu_efficiency(i); + cpu_rq(i)->efficiency = efficiency; + + if (efficiency > max) + max = efficiency; + if (efficiency < min) + min = efficiency; + } + + if (max) + max_possible_efficiency = max; + + if (min) + min_possible_efficiency = min; +} + +static void reset_task_stats(struct task_struct *p) +{ + u32 sum = 0; + + if (exiting_task(p)) + sum = EXITING_TASK_MARKER; + + memset(&p->ravg, 0, sizeof(struct ravg)); + /* Retain EXITING_TASK marker */ + p->ravg.sum_history[0] = sum; +} + +void walt_mark_task_starting(struct task_struct *p) +{ + u64 wallclock; + struct rq *rq = task_rq(p); + + if (!rq->window_start) { + reset_task_stats(p); + return; + } + + wallclock = walt_ktime_clock(); + p->ravg.mark_start = wallclock; +} + +void walt_set_window_start(struct rq *rq) +{ + int cpu = cpu_of(rq); + struct rq *sync_rq = cpu_rq(sync_cpu); + + if (rq->window_start) + return; + + if (cpu == sync_cpu) { + rq->window_start = walt_ktime_clock(); + } else { + raw_spin_unlock(&rq->lock); + double_rq_lock(rq, sync_rq); + rq->window_start = cpu_rq(sync_cpu)->window_start; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + raw_spin_unlock(&sync_rq->lock); + } + + rq->curr->ravg.mark_start = rq->window_start; +} + +void walt_migrate_sync_cpu(int cpu) +{ + if (cpu == sync_cpu) + sync_cpu = smp_processor_id(); +} + +void walt_fixup_busy_time(struct task_struct *p, int new_cpu) +{ + struct rq *src_rq = task_rq(p); + struct rq *dest_rq = cpu_rq(new_cpu); + u64 wallclock; + + if (!p->on_rq && p->state != TASK_WAKING) + return; + + if (exiting_task(p)) { + return; + } + + if (p->state == TASK_WAKING) + double_rq_lock(src_rq, dest_rq); + + wallclock = walt_ktime_clock(); + + walt_update_task_ravg(task_rq(p)->curr, task_rq(p), + TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(dest_rq->curr, dest_rq, + TASK_UPDATE, wallclock, 0); + + walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0); + + if (p->ravg.curr_window) { + src_rq->curr_runnable_sum -= p->ravg.curr_window; + dest_rq->curr_runnable_sum += p->ravg.curr_window; + } + + if (p->ravg.prev_window) { + src_rq->prev_runnable_sum -= p->ravg.prev_window; + dest_rq->prev_runnable_sum += p->ravg.prev_window; + } + + if ((s64)src_rq->prev_runnable_sum < 0) { + src_rq->prev_runnable_sum = 0; + WARN_ON(1); + } + if ((s64)src_rq->curr_runnable_sum < 0) { + src_rq->curr_runnable_sum = 0; + WARN_ON(1); + } + + trace_walt_migration_update_sum(src_rq, p); + trace_walt_migration_update_sum(dest_rq, p); + + if (p->state == TASK_WAKING) + 
double_rq_unlock(src_rq, dest_rq); +} + +/* Keep track of max/min capacity possible across CPUs "currently" */ +static void __update_min_max_capacity(void) +{ + int i; + int max = 0, min = INT_MAX; + + for_each_online_cpu(i) { + if (cpu_rq(i)->capacity > max) + max = cpu_rq(i)->capacity; + if (cpu_rq(i)->capacity < min) + min = cpu_rq(i)->capacity; + } + + max_capacity = max; + min_capacity = min; +} + +static void update_min_max_capacity(void) +{ + unsigned long flags; + int i; + + local_irq_save(flags); + for_each_possible_cpu(i) + raw_spin_lock(&cpu_rq(i)->lock); + + __update_min_max_capacity(); + + for_each_possible_cpu(i) + raw_spin_unlock(&cpu_rq(i)->lock); + local_irq_restore(flags); +} + +/* + * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that + * least efficient cpu gets capacity of 1024 + */ +static unsigned long capacity_scale_cpu_efficiency(int cpu) +{ + return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency; +} + +/* + * Return 'capacity' of a cpu in reference to cpu with lowest max_freq + * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. + */ +static unsigned long capacity_scale_cpu_freq(int cpu) +{ + return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq; +} + +/* + * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so + * that "most" efficient cpu gets a load_scale_factor of 1 + */ +static unsigned long load_scale_cpu_efficiency(int cpu) +{ + return DIV_ROUND_UP(1024 * max_possible_efficiency, + cpu_rq(cpu)->efficiency); +} + +/* + * Return load_scale_factor of a cpu in reference to cpu with best max_freq + * (max_possible_freq), so that one with best max_freq gets a load_scale_factor + * of 1. + */ +static unsigned long load_scale_cpu_freq(int cpu) +{ + return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq); +} + +static int compute_capacity(int cpu) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cpu); + capacity >>= 10; + + capacity *= capacity_scale_cpu_freq(cpu); + capacity >>= 10; + + return capacity; +} + +static int compute_load_scale_factor(int cpu) +{ + int load_scale = 1024; + + /* + * load_scale_factor accounts for the fact that task load + * is in reference to "best" performing cpu. Task's load will need to be + * scaled (up) by a factor to determine suitability to be placed on a + * (little) cpu. + */ + load_scale *= load_scale_cpu_efficiency(cpu); + load_scale >>= 10; + + load_scale *= load_scale_cpu_freq(cpu); + load_scale >>= 10; + + return load_scale; +} + +static int cpufreq_notifier_policy(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_policy *policy = (struct cpufreq_policy *)data; + int i, update_max = 0; + u64 highest_mpc = 0, highest_mplsf = 0; + const struct cpumask *cpus = policy->related_cpus; + unsigned int orig_min_max_freq = min_max_freq; + unsigned int orig_max_possible_freq = max_possible_freq; + /* Initialized to policy->max in case policy->related_cpus is empty! 
*/ + unsigned int orig_max_freq = policy->max; + + if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY && + val != CPUFREQ_CREATE_POLICY) + return 0; + + if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) { + update_min_max_capacity(); + return 0; + } + + for_each_cpu(i, policy->related_cpus) { + cpumask_copy(&cpu_rq(i)->freq_domain_cpumask, + policy->related_cpus); + orig_max_freq = cpu_rq(i)->max_freq; + cpu_rq(i)->min_freq = policy->min; + cpu_rq(i)->max_freq = policy->max; + cpu_rq(i)->cur_freq = policy->cur; + cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq; + } + + max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); + if (min_max_freq == 1) + min_max_freq = UINT_MAX; + min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); + BUG_ON(!min_max_freq); + BUG_ON(!policy->max); + + /* Changes to policy other than max_freq don't require any updates */ + if (orig_max_freq == policy->max) + return 0; + + /* + * A changed min_max_freq or max_possible_freq (possible during bootup) + * needs to trigger re-computation of load_scale_factor and capacity for + * all possible cpus (even those offline). It also needs to trigger + * re-computation of nr_big_task count on all online cpus. + * + * A changed rq->max_freq otoh needs to trigger re-computation of + * load_scale_factor and capacity for just the cluster of cpus involved. + * Since small task definition depends on max_load_scale_factor, a + * changed load_scale_factor of one cluster could influence + * classification of tasks in another cluster. Hence a changed + * rq->max_freq will need to trigger re-computation of nr_big_task + * count on all online cpus. + * + * While it should be sufficient for nr_big_tasks to be + * re-computed for only online cpus, we have inadequate context + * information here (in policy notifier) with regard to hotplug-safety + * context in which notification is issued. As a result, we can't use + * get_online_cpus() here, as it can lead to deadlock. Until cpufreq is + * fixed up to issue notification always in hotplug-safe context, + * re-compute nr_big_task for all possible cpus. + */ + + if (orig_min_max_freq != min_max_freq || + orig_max_possible_freq != max_possible_freq) { + cpus = cpu_possible_mask; + update_max = 1; + } + + /* + * Changed load_scale_factor can trigger reclassification of tasks as + * big or small. 
Make this change "atomic" so that tasks are accounted + * properly due to changed load_scale_factor + */ + for_each_cpu(i, cpus) { + struct rq *rq = cpu_rq(i); + + rq->capacity = compute_capacity(i); + rq->load_scale_factor = compute_load_scale_factor(i); + + if (update_max) { + u64 mpc, mplsf; + + mpc = div_u64(((u64) rq->capacity) * + rq->max_possible_freq, rq->max_freq); + rq->max_possible_capacity = (int) mpc; + + mplsf = div_u64(((u64) rq->load_scale_factor) * + rq->max_possible_freq, rq->max_freq); + + if (mpc > highest_mpc) { + highest_mpc = mpc; + cpumask_clear(&mpc_mask); + cpumask_set_cpu(i, &mpc_mask); + } else if (mpc == highest_mpc) { + cpumask_set_cpu(i, &mpc_mask); + } + + if (mplsf > highest_mplsf) + highest_mplsf = mplsf; + } + } + + if (update_max) { + max_possible_capacity = highest_mpc; + max_load_scale_factor = highest_mplsf; + } + + __update_min_max_capacity(); + + return 0; +} + +static int cpufreq_notifier_trans(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; + unsigned int cpu = freq->cpu, new_freq = freq->new; + unsigned long flags; + int i; + + if (val != CPUFREQ_POSTCHANGE) + return 0; + + BUG_ON(!new_freq); + + if (cpu_rq(cpu)->cur_freq == new_freq) + return 0; + + for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) { + struct rq *rq = cpu_rq(i); + + raw_spin_lock_irqsave(&rq->lock, flags); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, + walt_ktime_clock(), 0); + rq->cur_freq = new_freq; + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + return 0; +} + +static struct notifier_block notifier_policy_block = { + .notifier_call = cpufreq_notifier_policy +}; + +static struct notifier_block notifier_trans_block = { + .notifier_call = cpufreq_notifier_trans +}; + +static int register_sched_callback(void) +{ + int ret; + + ret = cpufreq_register_notifier(¬ifier_policy_block, + CPUFREQ_POLICY_NOTIFIER); + + if (!ret) + ret = cpufreq_register_notifier(¬ifier_trans_block, + CPUFREQ_TRANSITION_NOTIFIER); + + return 0; +} + +/* + * cpufreq callbacks can be registered at core_initcall or later time. + * Any registration done prior to that is "forgotten" by cpufreq. See + * initialization of variable init_cpufreq_transition_notifier_list_called + * for further information. + */ +core_initcall(register_sched_callback); + +void walt_init_new_task_load(struct task_struct *p) +{ + int i; + u32 init_load_windows = + div64_u64((u64)sysctl_sched_walt_init_task_load_pct * + (u64)walt_ravg_window, 100); + u32 init_load_pct = current->init_load_pct; + + p->init_load_pct = 0; + memset(&p->ravg, 0, sizeof(struct ravg)); + + if (init_load_pct) { + init_load_windows = div64_u64((u64)init_load_pct * + (u64)walt_ravg_window, 100); + } + + p->ravg.demand = init_load_windows; + for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) + p->ravg.sum_history[i] = init_load_windows; +} diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h new file mode 100644 index 000000000000..cabc193a683d --- /dev/null +++ b/kernel/sched/walt.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + */ + +#ifndef __WALT_H +#define __WALT_H + +#ifdef CONFIG_SCHED_WALT + +void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime); +void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p); +void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p); +void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p); +void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p); +void walt_fixup_busy_time(struct task_struct *p, int new_cpu); +void walt_init_new_task_load(struct task_struct *p); +void walt_mark_task_starting(struct task_struct *p); +void walt_set_window_start(struct rq *rq); +void walt_migrate_sync_cpu(int cpu); +void walt_init_cpu_efficiency(void); +u64 walt_ktime_clock(void); + +#else /* CONFIG_SCHED_WALT */ + +static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) { } +static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { } +static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { } +static inline void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p) { } +static inline void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p) { } +static inline void walt_fixup_busy_time(struct task_struct *p, int new_cpu) { } +static inline void walt_init_new_task_load(struct task_struct *p) { } +static inline void walt_mark_task_starting(struct task_struct *p) { } +static inline void walt_set_window_start(struct rq *rq) { } +static inline void walt_migrate_sync_cpu(int cpu) { } +static inline void walt_init_cpu_efficiency(void) { } +static inline u64 walt_ktime_clock(void) { return 0; } + +#endif /* CONFIG_SCHED_WALT */ + +extern unsigned int walt_disabled; + +#endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index dd46f370b73a..e2d9953822be 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -311,6 +311,29 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHED_WALT + { + .procname = "sched_use_walt_cpu_util", + .data = &sysctl_sched_use_walt_cpu_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_use_walt_task_util", + .data = &sysctl_sched_use_walt_task_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_walt_init_task_load_pct", + .data = &sysctl_sched_walt_init_task_load_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif { .procname = "sched_sync_hint_enable", .data = &sysctl_sched_sync_hint_enable, From cf8449f421c99c6482c5b8ef26858dc5aa206628 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Fri, 22 Jul 2016 13:21:15 +0100 Subject: [PATCH 545/813] sched/walt: Accounting for number of irqs pending on each core Schedules on a core whose irq count is less than a threshold. Improves I/O performance of EAS. 
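As context for the decay arithmetic in the walt_account_irqtime() hunk below: once at least one jiffy window has elapsed, the stored average is scaled by (3 * nr_windows) / (4 * nr_windows), which reduces to a single 3/4 factor since the nr_windows terms cancel, and is dropped entirely once ten or more windows have passed. A minimal stand-alone sketch of that update (the function name, harness and numbers are hypothetical; only the 3/4 factor and the 10-window cutoff mirror the patch):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical user-space model of the rq->avg_irqload update in
 * walt_account_irqtime(): decay the old average, then fold in the
 * irq time accumulated in the window that just closed. */
static uint64_t update_irqload(uint64_t avg, uint64_t cur, uint64_t nr_windows)
{
	if (nr_windows) {
		if (nr_windows < 10)
			avg = avg * (3 * nr_windows) / (4 * nr_windows);
		else
			avg = 0;	/* history too stale to matter */
		avg += cur;
	}
	return avg;
}

int main(void)
{
	uint64_t avg = 0;
	int i;

	/* 5 ms of irq time per window converges towards the fixed point
	 * of avg = 3 * avg / 4 + 5 ms, i.e. ~20 ms. */
	for (i = 0; i < 16; i++)
		avg = update_irqload(avg, 5000000ULL, 1);
	printf("avg_irqload ~= %llu ns\n", (unsigned long long)avg);
	return 0;
}

With the 10 ms sched_walt_cpu_high_irqload default introduced below, such a CPU is flagged as high-irqload and skipped by find_best_target() long before the average saturates.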
Change-Id: I08ff7dd0d22502a0106fc636b1af2e6fe9e758b5 --- include/linux/sched/sysctl.h | 1 + kernel/sched/core.c | 5 +++ kernel/sched/cputime.c | 16 +++++++++ kernel/sched/fair.c | 7 +++- kernel/sched/sched.h | 3 ++ kernel/sched/walt.c | 65 ++++++++++++++++++++++++++++++++++++ kernel/sched/walt.h | 5 +++ kernel/sysctl.c | 7 ++++ 8 files changed, 108 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 710f58a28d63..d68e88c9d4d7 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -47,6 +47,7 @@ extern unsigned int sysctl_sched_cstate_aware; extern unsigned int sysctl_sched_use_walt_cpu_util; extern unsigned int sysctl_sched_use_walt_task_util; extern unsigned int sysctl_sched_walt_init_task_load_pct; +extern unsigned int sysctl_sched_walt_cpu_high_irqload; #endif enum sched_tunable_scaling { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4c981dfc34ee..67abbbd3965b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7754,6 +7754,11 @@ void __init sched_init(void) rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; rq->max_idle_balance_cost = sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_WALT + rq->cur_irqload = 0; + rq->avg_irqload = 0; + rq->irqload_ts = 0; +#endif INIT_LIST_HEAD(&rq->cfs_tasks); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index f74ea89e77a8..3f232c8b2bdd 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -5,6 +5,7 @@ #include #include #include "sched.h" +#include "walt.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -49,6 +50,10 @@ void irqtime_account_irq(struct task_struct *curr) unsigned long flags; s64 delta; int cpu; +#ifdef CONFIG_SCHED_WALT + u64 wallclock; + bool account = true; +#endif if (!sched_clock_irqtime) return; @@ -56,6 +61,9 @@ void irqtime_account_irq(struct task_struct *curr) local_irq_save(flags); cpu = smp_processor_id(); +#ifdef CONFIG_SCHED_WALT + wallclock = sched_clock_cpu(cpu); +#endif delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); __this_cpu_add(irq_start_time, delta); @@ -70,8 +78,16 @@ void irqtime_account_irq(struct task_struct *curr) __this_cpu_add(cpu_hardirq_time, delta); else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) __this_cpu_add(cpu_softirq_time, delta); +#ifdef CONFIG_SCHED_WALT + else + account = false; +#endif irq_time_write_end(); +#ifdef CONFIG_SCHED_WALT + if (account) + walt_account_irqtime(cpu, curr, delta, wallclock); +#endif local_irq_restore(flags); } EXPORT_SYMBOL_GPL(irqtime_account_irq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 15b8a8f34bd9..8560a5530035 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -61,6 +61,8 @@ unsigned int sysctl_sched_cstate_aware = 1; #ifdef CONFIG_SCHED_WALT unsigned int sysctl_sched_use_walt_cpu_util = 1; unsigned int sysctl_sched_use_walt_task_util = 1; +__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = + (10 * NSEC_PER_MSEC); #endif /* * The initial- and re-scaling of tunables is configurable @@ -4274,7 +4276,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) schedtune_enqueue_task(p, cpu_of(rq)); #endif /* CONFIG_SMP */ - hrtick_update(rq); } @@ -5648,6 +5649,10 @@ static inline int find_best_target(struct task_struct *p, bool boosted) if (new_util > capacity_orig_of(i)) continue; +#ifdef CONFIG_SCHED_WALT + if (walt_cpu_high_irqload(i)) + continue; +#endif /* * For boosted tasks we favor idle cpus unconditionally to * improve latency. 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f48fb371913a..51c632bc94b6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -685,6 +685,9 @@ struct rq { u64 prev_runnable_sum; u64 nt_curr_runnable_sum; u64 nt_prev_runnable_sum; + u64 cur_irqload; + u64 avg_irqload; + u64 irqload_ts; #endif /* CONFIG_SCHED_WALT */ diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 1dff3d2e2358..b9ae8d5c4393 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -221,6 +221,71 @@ static int cpu_is_waiting_on_io(struct rq *rq) return atomic_read(&rq->nr_iowait); } +void walt_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags, nr_windows; + u64 cur_jiffies_ts; + + raw_spin_lock_irqsave(&rq->lock, flags); + + /* + * cputime (wallclock) uses sched_clock so use the same here for + * consistency. + */ + delta += sched_clock() - wallclock; + cur_jiffies_ts = get_jiffies_64(); + + if (is_idle_task(curr)) + walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(), + delta); + + nr_windows = cur_jiffies_ts - rq->irqload_ts; + + if (nr_windows) { + if (nr_windows < 10) { + /* Decay CPU's irqload by 3/4 for each window. */ + rq->avg_irqload *= (3 * nr_windows); + rq->avg_irqload = div64_u64(rq->avg_irqload, + 4 * nr_windows); + } else { + rq->avg_irqload = 0; + } + rq->avg_irqload += rq->cur_irqload; + rq->cur_irqload = 0; + } + + rq->cur_irqload += delta; + rq->irqload_ts = cur_jiffies_ts; + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + + +#define WALT_HIGH_IRQ_TIMEOUT 3 + +u64 walt_irqload(int cpu) { + struct rq *rq = cpu_rq(cpu); + s64 delta; + delta = get_jiffies_64() - rq->irqload_ts; + + /* + * Current context can be preempted by irq and rq->irqload_ts can be + * updated by irq context so that delta can be negative. + * But this is okay and we can safely return as this means there + * was recent irq occurrence. 
+ */ + + if (delta < WALT_HIGH_IRQ_TIMEOUT) + return rq->avg_irqload; + else + return 0; +} + +int walt_cpu_high_irqload(int cpu) { + return walt_irqload(cpu) >= sysctl_sched_walt_cpu_high_irqload; +} + static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, u64 irqtime, int event) { diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h index cabc193a683d..e181c87a928d 100644 --- a/kernel/sched/walt.h +++ b/kernel/sched/walt.h @@ -31,6 +31,11 @@ void walt_set_window_start(struct rq *rq); void walt_migrate_sync_cpu(int cpu); void walt_init_cpu_efficiency(void); u64 walt_ktime_clock(void); +void walt_account_irqtime(int cpu, struct task_struct *curr, u64 delta, + u64 wallclock); + +u64 walt_irqload(int cpu); +int walt_cpu_high_irqload(int cpu); #else /* CONFIG_SCHED_WALT */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e2d9953822be..d964422eb601 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -333,6 +333,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "sched_walt_cpu_high_irqload", + .data = &sysctl_sched_walt_cpu_high_irqload, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif { .procname = "sched_sync_hint_enable", From 3dfa385fc028080c7952b7226c72f4424d221d2c Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 30 Jun 2016 15:00:41 +0100 Subject: [PATCH 546/813] FIXUP: sched: fix set_cfs_cpu_capacity when WALT is in use The CPU utilization reported when WALT is in use already tracks the contributions due to RT and DL workloads. However, SchedFreq exposes different capacity update functions, one for each class, and does classes utilization internally at update_cpu_capacity_request() call time. This patch ensures that when WALT is in use, the: cpu_sched_capacity_reqs::cfs value is tracking just the load generated by SCHED_OTHER tasks. Change-Id: Ibd9c9a10874a1d91f62477034548f7664e57cd6a Signed-off-by: Patrick Bellasi --- kernel/sched/sched.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 51c632bc94b6..0b1bd6e8e1c8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1611,8 +1611,27 @@ void update_cpu_capacity_request(int cpu, bool request); static inline void set_cfs_cpu_capacity(int cpu, bool request, unsigned long capacity) { - if (per_cpu(cpu_sched_capacity_reqs, cpu).cfs != capacity) { - per_cpu(cpu_sched_capacity_reqs, cpu).cfs = capacity; + struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { + int rtdl = scr->rt + scr->dl; + /* + * WALT tracks the utilization of a CPU considering the load + * generated by all the scheduling classes. + * Since the following call to: + * update_cpu_capacity + * is already adding the RT and DL utilizations let's remove + * these contributions from the WALT signal. + */ + if (capacity > rtdl) + capacity -= rtdl; + else + capacity = 0; + } +#endif + if (scr->cfs != capacity) { + scr->cfs = capacity; update_cpu_capacity_request(cpu, request); } } From 0b20f616bbf6f23c3dd745d8660a64c04b98d3d5 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 29 Jun 2016 11:30:07 -0700 Subject: [PATCH 547/813] sched: EAS: Avoid causing spikes to max-freq unnecessarily During scheduler tick handling, the frequency was being set to max-freq if the current frequency is less than the current utilization. 
Change to just request "right" frequency instead of max. BUG: 29871410 Change-Id: I6fe65b14413da44b1520ba116f72320083eb92f8 --- kernel/sched/core.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 67abbbd3965b..19decf8c07d5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2940,7 +2940,7 @@ static unsigned long sum_capacity_reqs(unsigned long cfs_cap, static void sched_freq_tick(int cpu) { struct sched_capacity_reqs *scr; - unsigned long capacity_orig, capacity_curr; + unsigned long capacity_orig, capacity_curr, capacity_sum; if (!sched_freq()) return; @@ -2953,12 +2953,15 @@ static void sched_freq_tick(int cpu) /* * To make free room for a task that is building up its "real" * utilization and to harm its performance the least, request - * a jump to max OPP as soon as the margin of free capacity is - * impacted (specified by capacity_margin). + * a jump to a higher OPP as soon as the margin of free capacity + * is impacted (specified by capacity_margin). */ + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); - if (capacity_curr < sum_capacity_reqs(cpu_util(cpu), scr)) - set_cfs_cpu_capacity(cpu, true, capacity_max); + capacity_sum = sum_capacity_reqs(cpu_util(cpu), scr); + if (capacity_curr < capacity_sum) { + set_cfs_cpu_capacity(cpu, true, capacity_sum); + } } #else static inline void sched_freq_tick(int cpu) { } From f1bfd7f09d63fefa0859a20e91279eb66dbf7a6f Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 30 Jun 2016 15:09:24 +0100 Subject: [PATCH 548/813] FIXUP: sched: fix SchedFreq integration for both PELT and WALT The current kernel allows the use of either PELT or WALT to track CPU utilization. One of the main differences between the two approaches is that PELT tracks only the utilization of the SCHED_OTHER classes while WALT tracks all tasks with a single signal. The current sched_freq_tick does not make this distinction and, when WALT is in use, we end up adding the contribution related to the RT and DL classes multiple times. This patch fixes this issue by: 1. providing two different code paths for PELT and WALT, thus guaranteeing that when we switch to PELT we get the original behaviour, based on the assumption that class aggregation is done underneath by SchedFreq. 2. avoiding the double accounting of DL and RT workloads, when WALT is in use, by just adding a margin to the original WALT signal when we need to check if the CFS capacity has to be increased.
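The margin added in the WALT path is a fixed-point multiply by capacity_margin / SCHED_CAPACITY_SCALE. A stand-alone sketch of the arithmetic in add_capacity_margin() below (the capacity_margin = 1280 default is an assumption taken from the EAS fair.c code, not part of this patch):

#define SCHED_CAPACITY_SCALE	1024UL

/* Assumed EAS default: 1280/1024 is a ~25% uplift, i.e. the request
 * wants ~20% of free headroom above the tracked utilization. */
static unsigned long capacity_margin = 1280;

static unsigned long add_capacity_margin(unsigned long cpu_capacity)
{
	cpu_capacity = cpu_capacity * capacity_margin;
	cpu_capacity /= SCHED_CAPACITY_SCALE;
	return cpu_capacity;
}

/*
 * e.g. a WALT utilization of 400 becomes 400 * 1280 / 1024 = 500, so a
 * higher OPP is requested as soon as the current capacity comes within
 * 25% of the tracked utilization.
 */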
Change-Id: I7326fd50e868e97fb5e12351917e9d2969bfdae7 Signed-off-by: Patrick Bellasi --- kernel/sched/core.c | 91 +++++++++++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 23 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 19decf8c07d5..e001ee1e3175 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2926,21 +2926,77 @@ unsigned long long task_sched_runtime(struct task_struct *p) } #ifdef CONFIG_CPU_FREQ_GOV_SCHED -static unsigned long sum_capacity_reqs(unsigned long cfs_cap, - struct sched_capacity_reqs *scr) -{ - unsigned long total = cfs_cap + scr->rt; - total = total * capacity_margin; - total /= SCHED_CAPACITY_SCALE; - total += scr->dl; - return total; +static inline +unsigned long add_capacity_margin(unsigned long cpu_capacity) +{ + cpu_capacity = cpu_capacity * capacity_margin; + cpu_capacity /= SCHED_CAPACITY_SCALE; + return cpu_capacity; } +static inline +unsigned long sum_capacity_reqs(unsigned long cfs_cap, + struct sched_capacity_reqs *scr) +{ + unsigned long total = add_capacity_margin(cfs_cap + scr->rt); + return total += scr->dl; +} + +static void sched_freq_tick_pelt(int cpu) +{ + unsigned long cpu_utilization = capacity_max; + unsigned long capacity_curr = capacity_curr_of(cpu); + struct sched_capacity_reqs *scr; + + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + if (sum_capacity_reqs(cpu_utilization, scr) < capacity_curr) + return; + + /* + * To make free room for a task that is building up its "real" + * utilization and to harm its performance the least, request + * a jump to a higher OPP as soon as the margin of free capacity + * is impacted (specified by capacity_margin). + */ + set_cfs_cpu_capacity(cpu, true, cpu_utilization); +} + +#ifdef CONFIG_SCHED_WALT +static void sched_freq_tick_walt(int cpu) +{ + unsigned long cpu_utilization = cpu_util(cpu); + unsigned long capacity_curr = capacity_curr_of(cpu); + + if (walt_disabled || !sysctl_sched_use_walt_cpu_util) + return sched_freq_tick_pelt(cpu); + + /* + * Add a margin to the WALT utilization. + * NOTE: WALT tracks a single CPU signal for all the scheduling + * classes, thus this margin is going to be added to the DL class as + * well, which is something we do not do in sched_freq_tick_pelt case. + */ + cpu_utilization = add_capacity_margin(cpu_utilization); + if (cpu_utilization <= capacity_curr) + return; + + /* + * It is likely that the load is growing so we + * keep the added margin in our request as an + * extra boost. + */ + set_cfs_cpu_capacity(cpu, true, cpu_utilization); + +} +#define _sched_freq_tick(cpu) sched_freq_tick_walt(cpu) +#else +#define _sched_freq_tick(cpu) sched_freq_tick_pelt(cpu) +#endif /* CONFIG_SCHED_WALT */ + static void sched_freq_tick(int cpu) { - struct sched_capacity_reqs *scr; - unsigned long capacity_orig, capacity_curr, capacity_sum; + unsigned long capacity_orig, capacity_curr; if (!sched_freq()) return; @@ -2950,22 +3006,11 @@ static void sched_freq_tick(int cpu) if (capacity_curr == capacity_orig) return; - /* - * To make free room for a task that is building up its "real" - * utilization and to harm its performance the least, request - * a jump to a higher OPP as soon as the margin of free capacity - * is impacted (specified by capacity_margin). 
- */ - - scr = &per_cpu(cpu_sched_capacity_reqs, cpu); - capacity_sum = sum_capacity_reqs(cpu_util(cpu), scr); - if (capacity_curr < capacity_sum) { - set_cfs_cpu_capacity(cpu, true, capacity_sum); - } + _sched_freq_tick(cpu); } #else static inline void sched_freq_tick(int cpu) { } -#endif +#endif /* CONFIG_CPU_FREQ_GOV_SCHED */ /* * This function gets called by the timer code, with HZ frequency. From abdb60d816bfd20b8d4f61c3e7c95a61ad12212a Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 16 Jun 2016 16:33:54 -0700 Subject: [PATCH 549/813] FIXUP: sched/fair: Fix hang during suspend in sched_group_energy BUG: 29353986 Change-Id: I0d0d8d5c107a2e0bd219819e036091106bb40e11 --- kernel/sched/fair.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8560a5530035..a45a6e1a692c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5001,6 +5001,7 @@ static int sched_group_energy(struct energy_env *eenv) } while (sg = sg->next, sg != sd->groups); } next_cpu: + cpumask_clear_cpu(cpu, &visit_cpus); continue; } From 923b7aa7f0e6e83a26be3d54db12060dc7651c09 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 4 Jul 2016 15:04:45 +0100 Subject: [PATCH 550/813] FIXUP: sched: Fix double-release of spinlock in move_queued_task BUG: 29519455 Change-Id: I4d1c27a1b4bcbba03d4b175d170cfe1701a90ffd --- kernel/sched/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0b1bd6e8e1c8..b2d8ad59f41f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1832,7 +1832,8 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { - raw_spin_unlock(&busiest->lock); + if (this_rq != busiest) + raw_spin_unlock(&busiest->lock); lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); } From 7bba794daaa85dce327e52b201439bb2c148b6c9 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Mon, 25 Jul 2016 15:13:58 +0100 Subject: [PATCH 551/813] arch_timer: add error handling when the MPM global timer is cleared Bug: 29000863 Signed-off-by: albert.zl_huang Change-Id: I2b5a28b0a9edb31bdaa1ca2310397dd2f36f6c23 Updated to use arch_timer_read_counter() as arch_counter_get_cntvct doesn't exist in this kernel. Signed-off-by: Chris Redpath --- kernel/sched/walt.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index b9ae8d5c4393..d9d09914ce30 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -185,7 +185,14 @@ update_window_start(struct rq *rq, u64 wallclock) int nr_windows; delta = wallclock - rq->window_start; - BUG_ON(delta < 0); + /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */ + if (delta < 0) { + if (arch_timer_read_counter() == 0) + delta = 0; + else + BUG_ON(1); + } + if (delta < walt_ravg_window) return; From 831623e6fe5ea7a8dc858b10f369882b6d4fa39f Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 13 Jul 2016 16:13:47 -0700 Subject: [PATCH 552/813] sched: use util instead of capacity to select busy cpu If cpus are busy, the cpu selection algorithm was favoring cpus with lower capacity. This can result in uneven packing since there will be a bias toward the same cpu until there is a capacity change. 
Instead use the utilization so there is immediate feedback as tasks are assigned. BUG: 30115868 Change-Id: I0ac7ae3ab5d8f2f5a5838c29bb6da2c3e8ef44e8 --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a45a6e1a692c..26c7e3fd332f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5614,7 +5614,7 @@ static inline int find_best_target(struct task_struct *p, bool boosted) { int iter_cpu; int target_cpu = -1; - int target_capacity = 0; + int target_util = 0; int backup_capacity = 0; int best_idle_cpu = -1; int best_idle_cstate = INT_MAX; @@ -5670,10 +5670,10 @@ static inline int find_best_target(struct task_struct *p, bool boosted) if (new_util < cur_capacity) { if (cpu_rq(i)->nr_running) { - if (target_capacity == 0 || - target_capacity > cur_capacity) { + if (target_util == 0 || + target_util > new_util) { target_cpu = i; - target_capacity = cur_capacity; + target_util = new_util; } } else if (!boosted) { if (best_idle_cpu < 0 || From b57cebe41f2805a2cc541d4bfcdf6cac0e12c0d4 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Thu, 14 Jul 2016 13:09:03 -0700 Subject: [PATCH 553/813] sched/tune: Introducing a new schedtune attribute prefer_idle Hint to enable biasing of tasks towards idle cpus, even when a given task is negatively boosted. The mechanism allows up to 20% reduction in camera power without hurting performance. bug: 28312446 Change-Id: I97ea5671aa1e6bcb165408b41e17bc82e41c2c9e --- kernel/sched/fair.c | 23 +++++++++++++---------- kernel/sched/tune.c | 42 ++++++++++++++++++++++++++++++++++++++++++ kernel/sched/tune.h | 2 ++ 3 files changed, 57 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 26c7e3fd332f..781e7676df89 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5610,7 +5610,7 @@ done: return target; } -static inline int find_best_target(struct task_struct *p, bool boosted) +static inline int find_best_target(struct task_struct *p, bool prefer_idle) { int iter_cpu; int target_cpu = -1; @@ -5628,9 +5628,9 @@ static inline int find_best_target(struct task_struct *p, bool boosted) int idle_idx; /* - * favor higher cpus for boosted tasks + * favor higher cpus for tasks that prefer idle cores */ - int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu; + int i = prefer_idle ? NR_CPUS-iter_cpu-1 : iter_cpu; if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p))) continue; @@ -5655,10 +5655,10 @@ static inline int find_best_target(struct task_struct *p, bool boosted) continue; #endif /* - * For boosted tasks we favor idle cpus unconditionally to + * Unconditionally favoring tasks that prefer idle cpus to * improve latency. */ - if (idle_cpu(i) && boosted) { + if (idle_cpu(i) && prefer_idle) { if (best_idle_cpu < 0) best_idle_cpu = i; continue; @@ -5675,7 +5675,7 @@ static inline int find_best_target(struct task_struct *p, bool boosted) target_cpu = i; target_util = new_util; } - } else if (!boosted) { + } else if (!prefer_idle) { if (best_idle_cpu < 0 || (sysctl_sched_cstate_aware && best_idle_cstate > idle_idx)) { @@ -5690,7 +5690,7 @@ static inline int find_best_target(struct task_struct *p, bool boosted) } } - if (boosted && best_idle_cpu >= 0) + if (prefer_idle && best_idle_cpu >= 0) target_cpu = best_idle_cpu; else if (target_cpu < 0) target_cpu = best_idle_cpu >= 0 ?
best_idle_cpu : backup_cpu; @@ -5782,14 +5782,17 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) */ #ifdef CONFIG_CGROUP_SCHEDTUNE bool boosted = schedtune_task_boost(p) > 0; + bool prefer_idle = schedtune_prefer_idle(p) > 0; #else bool boosted = 0; + bool prefer_idle = 0; #endif - int tmp_target = find_best_target(p, boosted); - if (tmp_target >= 0) + int tmp_target = find_best_target(p, boosted || prefer_idle); + if (tmp_target >= 0) { target_cpu = tmp_target; - if (boosted && idle_cpu(target_cpu)) + if ((boosted || prefer_idle) && idle_cpu(target_cpu)) return target_cpu; + } } if (target_cpu != task_cpu(p)) { diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index d24f365b0c90..644f8e9ee96f 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -125,6 +125,10 @@ struct schedtune { /* Performance Constraint (C) region threshold params */ int perf_constrain_idx; + + /* Hint to bias scheduling of tasks on that SchedTune CGroup + * towards idle CPUs */ + int prefer_idle; }; static inline struct schedtune *css_st(struct cgroup_subsys_state *css) @@ -156,6 +160,7 @@ root_schedtune = { .boost = 0, .perf_boost_idx = 0, .perf_constrain_idx = 0, + .prefer_idle = 0, }; int @@ -536,6 +541,38 @@ int schedtune_task_boost(struct task_struct *p) return task_boost; } +int schedtune_prefer_idle(struct task_struct *p) +{ + struct schedtune *st; + int prefer_idle; + + /* Get prefer_idle value */ + rcu_read_lock(); + st = task_schedtune(p); + prefer_idle = st->prefer_idle; + rcu_read_unlock(); + + return prefer_idle; +} + +static u64 +prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->prefer_idle; +} + +static int +prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 prefer_idle) +{ + struct schedtune *st = css_st(css); + st->prefer_idle = prefer_idle; + + return 0; +} + static s64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -587,6 +624,11 @@ static struct cftype files[] = { .read_s64 = boost_read, .write_s64 = boost_write, }, + { + .name = "prefer_idle", + .read_u64 = prefer_idle_read, + .write_u64 = prefer_idle_write, + }, { } /* terminate */ }; diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index be1785eb1c5b..4f6441771e4c 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -17,6 +17,8 @@ struct target_nrg { int schedtune_cpu_boost(int cpu); int schedtune_task_boost(struct task_struct *tsk); +int schedtune_prefer_idle(struct task_struct *tsk); + void schedtune_exit_task(struct task_struct *tsk); void schedtune_enqueue_task(struct task_struct *p, int cpu); From 49e4bbf3cc1e3d468f463a46643a574cf0fe419a Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 10 Feb 2016 09:24:36 +0000 Subject: [PATCH 554/813] DEBUG: sched: add tracepoint for RD overutilized Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 20 ++++++++++++++++++++ kernel/sched/fair.c | 17 +++++++++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index fa1b3df836bc..c50310a7fd6d 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -937,6 +937,26 @@ TRACE_EVENT(sched_tune_filter, __entry->payoff, __entry->region) ); +/* + * Tracepoint for system overutilized flag + */ +TRACE_EVENT(sched_overutilized, + + TP_PROTO(bool overutilized), + + TP_ARGS(overutilized), + + TP_STRUCT__entry( + __field( bool, overutilized ) + ), + + 
TP_fast_assign( + __entry->overutilized = overutilized; + ), + + TP_printk("overutilized=%d", + __entry->overutilized ? 1 : 0) +); #ifdef CONFIG_SCHED_WALT struct rq; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 781e7676df89..9139e153671a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4258,8 +4258,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) { walt_inc_cumulative_runnable_avg(rq, p); if (!task_new && !rq->rd->overutilized && - cpu_overutilized(rq->cpu)) + cpu_overutilized(rq->cpu)) { rq->rd->overutilized = true; + trace_sched_overutilized(true); + } /* * We want to potentially trigger a freq switch @@ -7524,12 +7526,17 @@ next_group: env->dst_rq->rd->overload = overload; /* Update over-utilization (tipping point, U >= 0) indicator */ - if (env->dst_rq->rd->overutilized != overutilized) + if (env->dst_rq->rd->overutilized != overutilized) { env->dst_rq->rd->overutilized = overutilized; + trace_sched_overutilized(overutilized); + } } else { - if (!env->dst_rq->rd->overutilized && overutilized) + if (!env->dst_rq->rd->overutilized && overutilized) { env->dst_rq->rd->overutilized = true; + trace_sched_overutilized(true); + } } + } /** @@ -8969,8 +8976,10 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr); #ifdef CONFIG_SMP - if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) + if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) { rq->rd->overutilized = true; + trace_sched_overutilized(true); + } rq->misfit_task = !task_fits_max(curr, rq->cpu); #endif From 4d8776f3cc03da1bab0ffa507675ef1e184990c8 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 29 Jul 2016 16:09:03 +0100 Subject: [PATCH 555/813] FIXUP: sched/tune: do initialization as a postcore_initcall SchedTune needs to walk the scheduling domains to compute the energy normalization constants used for PE space filtering. To build such constants we need the energy model data for each CPU in the system. However, by walking the SDs at a late initcall stage, userspace has already been initialized and it could happen that some CPUs are hotplugged out. For example, this could happen if a user-space thermal manager daemon detects that CPUs are too hot during the boot process. To avoid such a race condition we can anticipate the SchedTune initialization code to be a postcore_initcall. This allows us to keep the SchedTune initialization code as simple as an initcall while still safely relying on SD-provided data. Such calls are executed before user-space is initialized and thus, apart from the case of unlucky early-init kernel space generated hotplugs, this solution should be safe enough to get all the data we need.
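For reference, the initcall ordering this relies on, abridged from include/linux/init.h in kernels of this vintage (a lower level runs earlier in do_initcalls()), shows how far forward the move from late_initcall pulls schedtune_init():

/* Abridged from include/linux/init.h; level 2 runs well before level 7. */
#define core_initcall(fn)		__define_initcall(fn, 1)
#define postcore_initcall(fn)		__define_initcall(fn, 2)
#define arch_initcall(fn)		__define_initcall(fn, 3)
#define subsys_initcall(fn)		__define_initcall(fn, 4)
#define fs_initcall(fn)			__define_initcall(fn, 5)
#define device_initcall(fn)		__define_initcall(fn, 6)
#define late_initcall(fn)		__define_initcall(fn, 7)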
Signed-off-by: Patrick Bellasi [jstultz: fwdported to 4.4] Signed-off-by: John Stultz --- kernel/sched/tune.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 644f8e9ee96f..bd7f319ce53e 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -946,4 +946,4 @@ nodata: rcu_read_unlock(); return -EINVAL; } -late_initcall(schedtune_init); +postcore_initcall(schedtune_init); From 08786c13717cfc228a84cf3928d91708df7caf0c Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Tue, 2 Aug 2016 14:05:46 -0700 Subject: [PATCH 556/813] sched/fair: Picking cpus with low OPPs for tasks that prefer idle CPUs When idle cpus cannot be found for Top-app/FG tasks, the cpu selection algorithm picks a cpu with lowest OPP amongst the busy cpus as a second choice. Mitigates the "runnable" time for ui and render threads. bug: 30481949 bug: 30342017 bug: 30508678 Change-Id: I5a97e31d33284895c0fa6f6942102713ee576d77 --- kernel/sched/fair.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9139e153671a..2e08b88cce12 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5672,10 +5672,22 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle) if (new_util < cur_capacity) { if (cpu_rq(i)->nr_running) { - if (target_util == 0 || - target_util > new_util) { - target_cpu = i; - target_util = new_util; + if (prefer_idle) { + // Find a target cpu with highest + // utilization. + if (target_util == 0 || + target_util < new_util) { + target_cpu = i; + target_util = new_util; + } + } else { + // Find a target cpu with lowest + // utilization. + if (target_util == 0 || + target_util > new_util) { + target_cpu = i; + target_util = new_util; + } } } else if (!prefer_idle) { if (best_idle_cpu < 0 || @@ -5687,6 +5699,7 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle) } } else if (backup_capacity == 0 || backup_capacity > cur_capacity) { + // Find a backup cpu with least capacity. backup_capacity = cur_capacity; backup_cpu = i; } From 09eb72059996637e0cc6db43ed76098d29f02eff Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 4 Aug 2016 12:20:04 +0100 Subject: [PATCH 557/813] sched/cpufreq_sched: fix thermal capping events cpufreq_sched_limits (called when CPUFREQ_GOV_LIMITS event happens) bails out if policy->rwsem is already locked. However, that rwsem is always guaranteed to be locked when we get here after a thermal throttling event happens: th_throttling -> cpufreq_update_policy() ... down_write(&policy->rwsem); ... cpufreq_set_policy() -> ... __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); -> cpufreq_sched_limits() ... if (!down_write_trylock(&policy->rwsem)) return; <-- BAIL OUT! So, we don't currently react immediately to thermal capping events (even if the reaction is still quick in practice, ~1ms, as lots of events are likely to trigger a frequency selection on a highly loaded system). Fix this bug by removing the bail out condition. While we are at it we also slightly change handling of the new limits by clamping the last requested_freq between policy's max and min. Doing so gives us the opportunity to correctly restore the last requested frequency as soon as a thermal unthrottling event happens.
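The essential property of the fix is that gd->requested_freq itself is left untouched; only the value handed to the driver is clamped, so relaxing policy->max later restores the original request automatically. A minimal sketch of that behaviour (the helper and the kHz values are illustrative, not the patch's code):

/* Hypothetical stand-alone model of the clamp in cpufreq_sched_limits(). */
static unsigned int clamp_freq(unsigned int req,
			       unsigned int min, unsigned int max)
{
	return req < min ? min : req > max ? max : req;
}

/*
 * Stored request: 1800000 kHz.
 * Throttled, max drops to 1000000:   clamp_freq(1800000, 300000, 1000000) == 1000000
 * Unthrottled, max back to 2000000:  clamp_freq(1800000, 300000, 2000000) == 1800000
 */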
bug: 30481949
Change-Id: I3c13e818f238c1ffa66b34e419e8b87314b57427
Suggested-by: Javi Merino
Signed-off-by: Juri Lelli
Signed-off-by: Srinath Sridharan
[jstultz: fwdported to 4.4]
Signed-off-by: John Stultz
---
 kernel/sched/cpufreq_sched.c | 26 ++++++--------------------
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
index 3f8c67a3ea0f..4fea269a6598 100644
--- a/kernel/sched/cpufreq_sched.c
+++ b/kernel/sched/cpufreq_sched.c
@@ -58,7 +58,6 @@ struct gov_data {
 	struct task_struct *task;
 	struct irq_work irq_work;
 	unsigned int requested_freq;
-	int max;
 };
 
 static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy,
@@ -193,7 +192,7 @@ static void update_fdomain_capacity_request(int cpu)
 	}
 
 	/* Convert the new maximum capacity request into a cpu frequency */
-	freq_new = capacity * gd->max >> SCHED_CAPACITY_SHIFT;
+	freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
 	if (cpufreq_frequency_table_target(policy, policy->freq_table,
 					   freq_new, CPUFREQ_RELATION_L,
 					   &index_new))
@@ -288,8 +287,6 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
 	pr_debug("%s: throttle threshold = %u [ns]\n",
 		  __func__, gd->up_throttle_nsec);
 
-	gd->max = policy->max;
-
 	rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr());
 	if (rc) {
 		pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc);
@@ -352,28 +349,17 @@ static int cpufreq_sched_start(struct cpufreq_policy *policy)
 
 static void cpufreq_sched_limits(struct cpufreq_policy *policy)
 {
-	struct gov_data *gd;
+	unsigned int clamp_freq;
+	struct gov_data *gd = policy->governor_data;
 
 	pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n",
 		policy->cpu, policy->min, policy->max,
 		policy->cur);
 
-	if (!down_write_trylock(&policy->rwsem))
-		return;
-	/*
-	 * Need to keep track of highest max frequency for
-	 * capacity calculations
-	 */
-	gd = policy->governor_data;
-	if (gd->max < policy->max)
-		gd->max = policy->max;
+	clamp_freq = clamp(gd->requested_freq, policy->min, policy->max);
 
-	if (policy->max < policy->cur)
-		__cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H);
-	else if (policy->min > policy->cur)
-		__cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L);
-
-	up_write(&policy->rwsem);
+	if (policy->cur != clamp_freq)
+		__cpufreq_driver_target(policy, clamp_freq, CPUFREQ_RELATION_L);
 }
 
 static int cpufreq_sched_stop(struct cpufreq_policy *policy)

From f6ea8bd5bd48bcd62258f855a88f80e6e112cee8 Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Fri, 13 May 2016 11:54:04 +0100
Subject: [PATCH 558/813] sched/fair: call OPP update when going idle after
 migration

When a task leaves a rq because it is migrated away it carries its
utilization with it. In this case an OPP update on the src rq might be
needed. The corresponding update at dst rq will happen at enqueue time.

Change-Id: I22754a43760fc8d22a488fe15044af93787ea7a8

sched/fair: Fix uninitialised variable in idle_balance

compiler warned, looks legit.
Signed-off-by: Chris Redpath
---
 kernel/sched/fair.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2e08b88cce12..88eae79d1a3c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8343,6 +8343,7 @@ static int idle_balance(struct rq *this_rq)
 	struct sched_domain *sd;
 	int pulled_task = 0;
 	u64 curr_cost = 0;
+	long removed_util = 0;
 
 	idle_enter_fair(this_rq);
 
@@ -8366,6 +8367,17 @@ static int idle_balance(struct rq *this_rq)
 
 	raw_spin_unlock(&this_rq->lock);
 
+	/*
+	 * If removed_util_avg is !0 we most probably migrated some task away
+	 * from this_cpu. In this case we might be willing to trigger an OPP
+	 * update, but we want to do so if we don't find anybody else to pull
+	 * here (we will trigger an OPP update with the pulled task's enqueue
+	 * anyway).
+	 *
+	 * Record removed_util before calling update_blocked_averages, and use
+	 * it below (before returning) to see if an OPP update is required.
+	 */
+	removed_util = atomic_long_read(&(this_rq->cfs).removed_util_avg);
 	update_blocked_averages(this_cpu);
 	rcu_read_lock();
 	for_each_domain(this_cpu, sd) {
@@ -8430,6 +8442,12 @@ out:
 	if (pulled_task) {
 		idle_exit_fair(this_rq);
 		this_rq->idle_stamp = 0;
+	} else if (removed_util) {
+		/*
+		 * No task pulled and someone has been migrated away.
+		 * Good case to trigger an OPP update.
+		 */
+		update_capacity_of(this_cpu);
 	}
 
 	return pulled_task;

From e86992e170b0abb448b1b612fc7a1f08f2809bed Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Thu, 14 Jan 2016 15:21:40 -0800
Subject: [PATCH 559/813] vmstat: make vmstat_updater deferrable again and
 shut down on idle

Currently the vmstat updater is not deferrable as a result of commit
ba4877b9ca51 ("vmstat: do not use deferrable delayed work for
vmstat_update"). This in turn can cause multiple interruptions of the
applications because the vmstat updater may run at any time.

Make vmstat_update deferrable again and provide a function that folds
the differentials when the processor is going to idle mode thus
addressing the issue of the above commit in a clean way.

Note that the shepherd thread will continue scanning the differentials
from another processor and will reenable the vmstat workers if it
detects any changes.
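For context on "deferrable": a deferrable delayed work item does not
wake an idle CPU just to service its timer. A minimal sketch of the two
declarations (hypothetical names):

#include <linux/workqueue.h>

static void fold_stats(struct work_struct *w);

/* Timer fires even on an idle CPU -- the behaviour this patch moves
 * away from, at the cost of extra wakeups. */
static DECLARE_DELAYED_WORK(strict_stats_work, fold_stats);

/* Deferrable: on an idle CPU the timer waits for the next
 * non-deferrable wakeup, leaving a quiesced CPU undisturbed. */
static DECLARE_DEFERRABLE_WORK(lazy_stats_work, fold_stats);

static void fold_stats(struct work_struct *w)
{
	/* fold per-cpu counter differentials here, re-arm if needed */
}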
Change-Id: Idf256cfacb40b4dc8dbb6795cf06b34e8fec7a06 Fixes: ba4877b9ca51 ("vmstat: do not use deferrable delayed work for vmstat_update") Signed-off-by: Christoph Lameter Cc: Michal Hocko Cc: Johannes Weiner Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Git-repo: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git Git-commit: 0eb77e9880321915322d42913c3b53241739c8aa [shashim@codeaurora.org: resolve minor merge conflicts] Signed-off-by: Shiraz Hashim [jstultz: fwdport to 4.4] Signed-off-by: John Stultz --- include/linux/vmstat.h | 2 ++ kernel/sched/idle.c | 1 + mm/vmstat.c | 69 +++++++++++++++++++++++++++--------------- 3 files changed, 47 insertions(+), 25 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 3e5d9075960f..73fae8c4a5fb 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -189,6 +189,7 @@ extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); +void quiet_vmstat(void); void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); @@ -249,6 +250,7 @@ static inline void __dec_zone_page_state(struct page *page, static inline void refresh_zone_stat_thresholds(void) { } static inline void cpu_vm_stats_fold(int cpu) { } +static inline void quiet_vmstat(void) { } static inline void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) { } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cbc130efbc5b..917c94abf5bb 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -220,6 +220,7 @@ static void cpu_idle_loop(void) */ __current_set_polling(); + quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { diff --git a/mm/vmstat.c b/mm/vmstat.c index c54fd2924f25..83a003bc3cae 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -460,7 +460,7 @@ static int fold_diff(int *diff) * * The function returns the number of global counters updated. */ -static int refresh_cpu_vm_stats(void) +static int refresh_cpu_vm_stats(bool do_pagesets) { struct zone *zone; int i; @@ -484,33 +484,35 @@ static int refresh_cpu_vm_stats(void) #endif } } - cond_resched(); #ifdef CONFIG_NUMA - /* - * Deal with draining the remote pageset of this - * processor - * - * Check if there are pages remaining in this pageset - * if not then there is nothing to expire. - */ - if (!__this_cpu_read(p->expire) || + if (do_pagesets) { + cond_resched(); + /* + * Deal with draining the remote pageset of this + * processor + * + * Check if there are pages remaining in this pageset + * if not then there is nothing to expire. + */ + if (!__this_cpu_read(p->expire) || !__this_cpu_read(p->pcp.count)) - continue; + continue; - /* - * We never drain zones local to this processor. - */ - if (zone_to_nid(zone) == numa_node_id()) { - __this_cpu_write(p->expire, 0); - continue; - } + /* + * We never drain zones local to this processor. 
+ */ + if (zone_to_nid(zone) == numa_node_id()) { + __this_cpu_write(p->expire, 0); + continue; + } - if (__this_cpu_dec_return(p->expire)) - continue; + if (__this_cpu_dec_return(p->expire)) + continue; - if (__this_cpu_read(p->pcp.count)) { - drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); - changes++; + if (__this_cpu_read(p->pcp.count)) { + drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); + changes++; + } } #endif } @@ -1386,7 +1388,7 @@ static cpumask_var_t cpu_stat_off; static void vmstat_update(struct work_struct *w) { - if (refresh_cpu_vm_stats()) { + if (refresh_cpu_vm_stats(true)) { /* * Counters were updated so we expect more updates * to occur in the future. Keep on running the @@ -1417,6 +1419,23 @@ static void vmstat_update(struct work_struct *w) } } +/* + * Switch off vmstat processing and then fold all the remaining differentials + * until the diffs stay at zero. The function is used by NOHZ and can only be + * invoked when tick processing is not active. + */ +void quiet_vmstat(void) +{ + if (system_state != SYSTEM_RUNNING) + return; + + do { + if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off)) + cancel_delayed_work(this_cpu_ptr(&vmstat_work)); + + } while (refresh_cpu_vm_stats(false)); +} + /* * Check if the diffs for a certain cpu indicate that * an update is needed. @@ -1449,7 +1468,7 @@ static bool need_update(int cpu) */ static void vmstat_shepherd(struct work_struct *w); -static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd); +static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd); static void vmstat_shepherd(struct work_struct *w) { From bfa1dd2bfda910b60e425974fd5dc94a6bdaccc4 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Fri, 29 Jul 2016 17:50:11 +0100 Subject: [PATCH 560/813] sched/fair: Favor higher cpus only for boosted tasks This CL separates the notion of boost and prefer_idle schedtune attributes in cpu selection. Today only top-app tasks are boosted. The CPU selection is slightly tweaked such that higher order cpus are preferred only for boosted tasks (top-app) and the rest would be skewed towards lower order cpus. This avoids starvation issues for fg tasks when interacting with high priority top-app tasks (a problem often seen in the case of system_server). bug: 30245369 bug: 30292998 Change-Id: I0377e00893b9f6586eec55632a265518fd2fa8a1 Conflicts: kernel/sched/fair.c --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 88eae79d1a3c..e7f9afe29c61 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5612,7 +5612,7 @@ done: return target; } -static inline int find_best_target(struct task_struct *p, bool prefer_idle) +static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle) { int iter_cpu; int target_cpu = -1; @@ -5630,9 +5630,9 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle) int idle_idx; /* - * favor higher cpus for tasks that prefer idle cores + * Iterate from higher cpus for boosted tasks. */ - int i = prefer_idle ? NR_CPUS-iter_cpu-1 : iter_cpu; + int i = boosted ? 
NR_CPUS-iter_cpu-1 : iter_cpu; if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p))) continue; @@ -5802,7 +5802,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) bool boosted = 0; bool prefer_idle = 0; #endif - int tmp_target = find_best_target(p, boosted || prefer_idle); + int tmp_target = find_best_target(p, boosted, prefer_idle); if (tmp_target >= 0) { target_cpu = tmp_target; if ((boosted || prefer_idle) && idle_cpu(target_cpu)) From 2748928f860f549290b2c7299dc16ce8855019cf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2015 17:43:34 +0200 Subject: [PATCH 561/813] UPSTREAM: sched: Fix a race between __kthread_bind() and sched_setaffinity() Because sched_setscheduler() checks p->flags & PF_NO_SETAFFINITY without locks, a caller might observe an old value and race with the set_cpus_allowed_ptr() call from __kthread_bind() and effectively undo it: __kthread_bind() do_set_cpus_allowed() sched_setaffinity() if (p->flags & PF_NO_SETAFFINITIY) set_cpus_allowed_ptr() p->flags |= PF_NO_SETAFFINITY Fix the bug by putting everything under the regular scheduler locks. This also closes a hole in the serialization of task_struct::{nr_,}cpus_allowed. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dedekind1@gmail.com Cc: juri.lelli@arm.com Cc: mgorman@suse.de Cc: riel@redhat.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20150515154833.545640346@infradead.org Signed-off-by: Ingo Molnar (cherry picked from commit 25834c73f93af7f0712c98ca4593691592e6b360) Signed-off-by: Punit Agrawal BUG=chrome-os-partner:44828 TEST=Boot kernel on Oak. TEST=smaug-release and strago-release trybots. Change-Id: Id3c898c5ee1a22ed704e83f2ecf5f78199280d38 Reviewed-on: https://chromium-review.googlesource.com/321264 Commit-Ready: Ricky Liang Tested-by: Ricky Liang Reviewed-by: Ricky Liang Conflicts: kernel/sched/core.c --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e001ee1e3175..4f8f3464b982 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5160,6 +5160,7 @@ void init_idle(struct task_struct *idle, int cpu) raw_spin_lock(&rq->lock); __sched_fork(0, idle); + idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); From 6bd4858471d0c44deaa97e9ba4a8f0672af27690 Mon Sep 17 00:00:00 2001 From: Matt Wagantall Date: Tue, 17 Jun 2014 21:43:35 -0700 Subject: [PATCH 562/813] sched/rt: print RT tasks when RT throttling is activated Existing debug prints do not provide any clues about which tasks may have triggered RT throttling. Print the names and PIDs of all tasks on the throttled rt_rq to help narrow down the source of the problem. 
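The dump added below uses the usual bounded-append idiom: every
snprintf() writes at 'pos' with the space remaining up to 'end'. A
condensed sketch of the pattern (hypothetical data):

#include <linux/kernel.h>

/* Hypothetical sketch: append lines into a fixed-size report. */
static void fill_report(char *buf, size_t size)
{
	char *pos = buf;
	char *end = buf + size;
	int i;

	pos += snprintf(pos, end - pos, "potential CPU hogs:\n");
	for (i = 0; i < 8 && pos < end; i++)
		pos += snprintf(pos, end - pos, "\ttask-%d (%d)\n",
				i, 1000 + i);
	/* snprintf() returns the length it *wanted* to write, so pos
	 * can overshoot end on truncation; the pos < end guard keeps
	 * (end - pos) from going negative on the next iteration. */
}

The kernel also provides scnprintf(), which returns the number of
bytes actually written and avoids the overshoot entirely.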
Change-Id: I180534c8a647254ed38e89d0c981a8f8bccd741c Signed-off-by: Matt Wagantall [rameezmustafa@codeaurora.org]: Port to msm-3.18] Signed-off-by: Syed Rameez Mustafa --- kernel/sched/rt.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index be700bfa1ae4..2b9121ea91bf 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -891,6 +891,42 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se) return rt_task_of(rt_se)->prio; } +static void dump_throttled_rt_tasks(struct rt_rq *rt_rq) +{ + struct rt_prio_array *array = &rt_rq->active; + struct sched_rt_entity *rt_se; + char buf[500]; + char *pos = buf; + char *end = buf + sizeof(buf); + int idx; + + pos += snprintf(pos, sizeof(buf), + "sched: RT throttling activated for rt_rq %p (cpu %d)\n", + rt_rq, cpu_of(rq_of_rt_rq(rt_rq))); + + if (bitmap_empty(array->bitmap, MAX_RT_PRIO)) + goto out; + + pos += snprintf(pos, end - pos, "potential CPU hogs:\n"); + idx = sched_find_first_bit(array->bitmap); + while (idx < MAX_RT_PRIO) { + list_for_each_entry(rt_se, array->queue + idx, run_list) { + struct task_struct *p; + + if (!rt_entity_is_task(rt_se)) + continue; + + p = rt_task_of(rt_se); + if (pos < end) + pos += snprintf(pos, end - pos, "\t%s (%d)\n", + p->comm, p->pid); + } + idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1); + } +out: + printk_deferred("%s", buf); +} + static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); @@ -914,8 +950,14 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) * but accrue some time due to boosting. */ if (likely(rt_b->rt_runtime)) { + static bool once = false; + rt_rq->rt_throttled = 1; - printk_deferred_once("sched: RT throttling activated\n"); + + if (!once) { + once = true; + dump_throttled_rt_tasks(rt_rq); + } } else { /* * In case we did anyway, make it go away, From 6c6dc1632a4c9de940818cfdf86f1a436e638414 Mon Sep 17 00:00:00 2001 From: Matt Wagantall Date: Thu, 19 Jun 2014 14:23:33 -0700 Subject: [PATCH 563/813] sched/rt: Add Kconfig option to enable panicking for RT throttling This may be useful for detecting and debugging RT throttling issues. Change-Id: I5807a897d11997d76421c1fcaa2918aad988c6c9 Signed-off-by: Matt Wagantall [rameezmustafa@codeaurora.org]: Port to msm-3.18] Signed-off-by: Syed Rameez Mustafa [jstultz: forwardported to 4.4] Signed-off-by: John Stultz --- kernel/sched/rt.c | 9 +++++++++ lib/Kconfig.debug | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2b9121ea91bf..8a16cba968c4 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -924,7 +924,16 @@ static void dump_throttled_rt_tasks(struct rt_rq *rt_rq) idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1); } out: +#ifdef CONFIG_PANIC_ON_RT_THROTTLING + /* + * Use pr_err() in the BUG() case since printk_sched() will + * not get flushed and deadlock is not a concern. + */ + pr_err("%s", buf); + BUG(); +#else printk_deferred("%s", buf); +#endif } static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index fc9f2adc0c9b..63d14d9b51d8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -867,6 +867,15 @@ config SCHED_INFO bool default n +config PANIC_ON_RT_THROTTLING + bool "Panic on RT throttling" + help + Say Y here to enable the kernel to panic when a realtime + runqueue is throttled. 
This may be useful for detecting
+	  and debugging RT throttling issues.
+
+	  Say N if unsure.
+
 config SCHEDSTATS
 	bool "Collect scheduler statistics"
 	depends on DEBUG_KERNEL && PROC_FS

From ac6f9bad52a79154bebd1626344dfc206d1cbefe Mon Sep 17 00:00:00 2001
From: Ricky Liang
Date: Tue, 2 Feb 2016 01:12:06 +0800
Subject: [PATCH 564/813] FIXUP: sched: scheduler-driven cpu frequency
 selection

Two fixups that have been reported on LKML. The next version of the
scheduler-driven cpu frequency selection patch set should include these
fixes and we can drop this patch then.

Signed-off-by: Ricky Liang
Change-Id: Ia2f8b5c0dd5dac06580256eeb4b259929688af68
---
 kernel/sched/cpufreq_sched.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
index 4fea269a6598..f6f9b9b3a4a8 100644
--- a/kernel/sched/cpufreq_sched.c
+++ b/kernel/sched/cpufreq_sched.c
@@ -131,6 +131,8 @@ static int cpufreq_sched_thread(void *data)
 		new_request = gd->requested_freq;
 		if (new_request == last_request) {
 			set_current_state(TASK_INTERRUPTIBLE);
+			if (kthread_should_stop())
+				break;
 			schedule();
 		} else {
 			/*
@@ -293,6 +295,7 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
 		goto err;
 	}
 
+	policy->governor_data = gd;
 	if (cpufreq_driver_is_slow()) {
 		cpufreq_driver_slow = true;
 		gd->task = kthread_create(cpufreq_sched_thread, policy,
@@ -309,12 +312,12 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
 		init_irq_work(&gd->irq_work, cpufreq_sched_irq_work);
 	}
 
-	policy->governor_data = gd;
 	set_sched_freq();
 
 	return 0;
 
 err:
+	policy->governor_data = NULL;
 	kfree(gd);
 	return -ENOMEM;
 }

From 674d9a6b8286a49bc8e2f57a11ed81b2fa5d88bf Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Wed, 25 Nov 2015 14:09:38 -0500
Subject: [PATCH 565/813] sched/fair: Avoid redundant idle_cpu() call in
 update_sg_lb_stats()

Part of the responsibility of the update_sg_lb_stats() function is to
update the idle_cpus statistical counter in struct sg_lb_stats. This
check is done by calling idle_cpu(). The idle_cpu() function, in turn,
checks a number of fields within the run queue structure such as
rq->curr and rq->nr_running.

With the current layout of the run queue structure, rq->curr and
rq->nr_running are in separate cachelines. The rq->curr variable is
checked first followed by nr_running. As nr_running is also accessed
by update_sg_lb_stats() earlier, it makes no sense to load another
cacheline when nr_running is not 0 as idle_cpu() will always return
false in this case.

This patch eliminates this redundant cacheline load by checking the
cached nr_running before calling idle_cpu().
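The change is a micro-optimization, but the shape is reusable: test the
cheap, already-loaded field first so the expensive check only runs when
it could still change the answer. A tiny hypothetical sketch:

#include <linux/sched.h>

/* Hypothetical sketch: idle_cpu() touches rq->curr in a different
 * cacheline, so skip it whenever the nr_running value we already
 * loaded proves the CPU is busy. */
static inline bool cpu_counted_idle(int cpu, unsigned int nr_running)
{
	return nr_running == 0 && idle_cpu(cpu);
}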
Signed-off-by: Waiman Long
Signed-off-by: Peter Zijlstra (Intel)
Cc: Douglas Hatch
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Scott J Norton
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1448478580-26467-2-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Ingo Molnar
(cherry picked from commit a426f99c91d1036767a7819aaaba6bd3191b7f06)
Signed-off-by: Javi Merino
---
 kernel/sched/fair.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e7f9afe29c61..efa516dfd6bc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7308,7 +7308,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			bool *overload, bool *overutilized)
 {
 	unsigned long load;
-	int i;
+	int i, nr_running;
 
 	memset(sgs, 0, sizeof(*sgs));
 
@@ -7325,7 +7325,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		sgs->group_util += cpu_util(i);
 		sgs->sum_nr_running += rq->cfs.h_nr_running;
 
-		if (rq->nr_running > 1)
+		nr_running = rq->nr_running;
+		if (nr_running > 1)
 			*overload = true;
 
 #ifdef CONFIG_NUMA_BALANCING
@@ -7333,7 +7334,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		sgs->nr_preferred_running += rq->nr_preferred_running;
 #endif
 		sgs->sum_weighted_load += weighted_cpuload(i);
-		if (idle_cpu(i))
+		/*
+		 * No need to call idle_cpu() if nr_running is not 0
+		 */
+		if (!nr_running && idle_cpu(i))
 			sgs->idle_cpus++;
 
 		if (cpu_overutilized(i)) {

From 2bf81a5752c429dd3289bf09085f6d40fa41bae6 Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Thu, 8 Sep 2016 16:43:21 -0700
Subject: [PATCH 566/813] cpufreq: Kconfig: Fixup incorrect selection by
 CPU_FREQ_DEFAULT_GOV_SCHED

The CPU_FREQ_DEFAULT_GOV_SCHED option is incorrectly selecting
CPU_FREQ_GOV_INTERACTIVE, when it should be selecting
CPU_FREQ_GOV_SCHED.

Signed-off-by: John Stultz
---
 drivers/cpufreq/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index e93877f38cae..d43c401ff190 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -114,7 +114,7 @@ config CPU_FREQ_DEFAULT_GOV_INTERACTIVE
 
 config CPU_FREQ_DEFAULT_GOV_SCHED
 	bool "sched"
-	select CPU_FREQ_GOV_INTERACTIVE
+	select CPU_FREQ_GOV_SCHED
 	help
 	  Use the CPUfreq governor 'sched' as default. This scales
 	  cpu frequency using CPU utilization estimates from the

From 884090a34a90609f11f1d5b926e347718ca72aa2 Mon Sep 17 00:00:00 2001
From: Amit Pundir
Date: Thu, 25 Aug 2016 11:06:37 +0530
Subject: [PATCH 567/813] sched/walt: include missing header for
 arch_timer_read_counter()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Include clocksource/arm_arch_timer.h to fix implicit function
declaration of ‘arch_timer_read_counter’ build error for ARCH=arm.
Signed-off-by: Amit Pundir
[jstultz: Cherry-picked from common/android-3.18]
Signed-off-by: John Stultz
---
 kernel/sched/walt.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
index d9d09914ce30..07b7f84b37e2 100644
--- a/kernel/sched/walt.c
+++ b/kernel/sched/walt.c
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include <clocksource/arm_arch_timer.h>
 
 #include "sched.h"
 #include "walt.h"

From 8a187171b09aa3b63086daea3ba4a5f06acea0be Mon Sep 17 00:00:00 2001
From: Amit Pundir
Date: Mon, 29 Aug 2016 19:48:17 +0530
Subject: [PATCH 568/813] DEBUG: cpufreq: fix cpu_capacity tracing build for
 non-smp systems
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cpu curr capacity can only be traced for SMP systems. Non-SMP builds
will fail with:

drivers/cpufreq/cpufreq.c: In function ‘cpufreq_freq_transition_begin’:
drivers/cpufreq/cpufreq.c:438:22: error: implicit declaration of
function ‘capacity_curr_of’ [-Werror=implicit-function-declaration]
   trace_cpu_capacity(capacity_curr_of(cpu), cpu);
                      ^

Fixes: Change-Id: Icd0930d11068fcb7d2b6a9a48e7ed974904e1081 ("DEBUG: sched,cpufreq: add cpu_capacity change tracepoint")
Signed-off-by: Amit Pundir
[jstultz: Cherry-picked from common/android-3.18]
Signed-off-by: John Stultz
---
 drivers/cpufreq/cpufreq.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 7264820e6443..7b728143440d 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -29,7 +29,9 @@
 #include
 #include
 #include
+#ifdef CONFIG_SMP
 #include
+#endif
 #include
 
 static LIST_HEAD(cpufreq_policy_list);
@@ -474,7 +476,9 @@ static void cpufreq_notify_post_transition(struct cpufreq_policy *policy,
 void cpufreq_freq_transition_begin(struct cpufreq_policy *policy,
 		struct cpufreq_freqs *freqs)
 {
+#ifdef CONFIG_SMP
 	int cpu;
+#endif
 
 	/*
 	 * Catch double invocations of _begin() which lead to self-deadlock.
@@ -503,8 +507,10 @@ wait:
 	spin_unlock(&policy->transition_lock);
 
 	scale_freq_capacity(policy, freqs);
+#ifdef CONFIG_SMP
 	for_each_cpu(cpu, policy->cpus)
 		trace_cpu_capacity(capacity_curr_of(cpu), cpu);
+#endif
 
 	cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE);
 }

From 6b84e134d8420374c1640f4615953dc4f8563c0d Mon Sep 17 00:00:00 2001
From: Amit Pundir
Date: Wed, 24 Aug 2016 11:52:17 +0530
Subject: [PATCH 569/813] sched/walt: use do_div instead of division operator

Use do_div() instead of "/" operator to fix undefined references to
"__aeabi_uldivmod" build error for ARCH=arm.

Also in TP_fast_assign(), along with do_div() usage, replace "," with
";" which would have resulted in a syntax error (!), because
'#define TP_fast_assign(args...) args' would have stripped off the ","
and left white space between these two assignments after CPP phase.
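For reference, this is the standard pattern for 64-bit division in
code shared with 32-bit targets, where a plain '/' on a u64 makes gcc
emit a call to a libgcc helper (__aeabi_uldivmod on ARM) that the
kernel does not link against. A small sketch (hypothetical function):

#include <linux/types.h>
#include <asm/div64.h>

static u64 windowed_util(u64 runnable_sum, u32 window)
{
	u64 util = runnable_sum << 10;	/* scale up first */

	/* do_div() divides 'util' in place by a 32-bit divisor and
	 * returns the remainder (unused here); it compiles to a plain
	 * division on 64-bit and to kernel-provided code on 32-bit. */
	do_div(util, window);
	return util;
}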
Signed-off-by: Amit Pundir
[jstultz: Cherry-picked from common/android-3.18]
Signed-off-by: John Stultz
---
 include/trace/events/sched.h | 3 ++-
 kernel/sched/sched.h         | 7 ++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index c50310a7fd6d..dffaffab4bc8 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -1057,7 +1057,8 @@ TRACE_EVENT(walt_update_history,
 		__entry->samples = samples;
 		__entry->evt = evt;
 		__entry->demand = p->ravg.demand;
-		__entry->walt_avg = (__entry->demand << 10) / walt_ravg_window,
+		__entry->walt_avg = (__entry->demand << 10);
+		do_div(__entry->walt_avg, walt_ravg_window);
 		__entry->pelt_avg = p->se.avg.util_avg;
 		memcpy(__entry->hist, p->ravg.sum_history,
 					RAVG_HIST_SIZE_MAX * sizeof(u32));
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b2d8ad59f41f..2f2b959ad244 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1577,9 +1577,10 @@ static inline unsigned long __cpu_util(int cpu, int delta)
 	unsigned long capacity = capacity_orig_of(cpu);
 
 #ifdef CONFIG_SCHED_WALT
-	if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
-		util = (cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT) /
-			walt_ravg_window;
+	if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
+		util = cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT;
+		do_div(util, walt_ravg_window);
+	}
 #endif
 	delta += util;
 	if (delta < 0)

From fbc1f826a81fd7cb78f2d9a7508379f493038d88 Mon Sep 17 00:00:00 2001
From: Steve Muckle
Date: Wed, 4 May 2016 18:56:45 -0700
Subject: [PATCH 570/813] arm: Fix build error "conflicting types for
 'scale_cpu_capacity'"

Commit "arm: Update arch_scale_cpu_capacity() to reflect change to
define" introduced a dependency on struct sched_domain in
arch/arm/include/asm/topology.h, but that structure is currently only
defined when CONFIG_CPU_FREQ is enabled, since that is what causes
include/linux/cpufreq.h, which defines it, to get pulled in. Include
it regardless of CONFIG_CPU_FREQ so struct sched_domain is always
defined.

Fixes: Change-Id: I372bd5e4c1e203428d72b18c8a806b06f3567ef6 ("arm: Update arch_scale_cpu_capacity() to reflect change to define")
Signed-off-by: Steve Muckle
Signed-off-by: Amit Pundir
[jstultz: Cherry-picked from android-3.18]
Signed-off-by: John Stultz
---
 arch/arm/include/asm/topology.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index e3e596cbb1a7..d06064120694 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -3,6 +3,7 @@
 
 #ifdef CONFIG_ARM_CPU_TOPOLOGY
 
+#include <linux/cpufreq.h>
 #include
 
 struct cputopo_arm {
@@ -25,7 +26,6 @@ void store_cpu_topology(unsigned int cpuid);
 const struct cpumask *cpu_coregroup_mask(int cpu);
 
 #ifdef CONFIG_CPU_FREQ
-#include <linux/cpufreq.h>
 #define arch_scale_freq_capacity cpufreq_scale_freq_capacity
 #endif
 #define arch_scale_cpu_capacity scale_cpu_capacity

From 5151bbef1a652dda43f88d8f21c4ccef5022daf2 Mon Sep 17 00:00:00 2001
From: Jon Medhurst
Date: Thu, 2 Jun 2016 12:18:08 +0000
Subject: [PATCH 571/813] arm: Fix #if/#ifdef typo in topology.c

Probably a typo in arch/arm/kernel/topology.c. This patch fixes the
warning...
arch/arm/kernel/topology.c: In function 'scale_cpu_capacity':
arch/arm/kernel/topology.c:47:5: warning: "CONFIG_CPU_FREQ" is not defined [-Wundef]

Fixes: Change-Id: If5e9e0ba8ff5a5d3236b373dbce8c72ea71b5e18 ("arm: Enable max freq invariant scheduler load-tracking and capacity support")
Signed-off-by: Jon Medhurst
Signed-off-by: Amit Pundir
[jstultz: Cherry-picked from android-3.18]
Signed-off-by: John Stultz
---
 arch/arm/kernel/topology.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index f5941004efba..4f2c51ef162d 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -44,7 +44,7 @@ static DEFINE_PER_CPU(unsigned long, cpu_scale);
 
 unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
 {
-#if CONFIG_CPU_FREQ
+#ifdef CONFIG_CPU_FREQ
 	unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu);
 
 	return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT;

From 592c519f5c8d191df3f1132ee717331d118f8a7b Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Wed, 24 Aug 2016 11:02:29 +0100
Subject: [PATCH 572/813] FIXUP: sched/tune: add fixes missing from a previous
 patch

The previous patch:

 e7ce26f - FIXUP: sched/tune: fix accounting for runnable tasks

squashed together patches of a series to fix SchedTune's accounting
issues. However, in the consolidation and cleanup of the series to
merge in the Android Common Kernel, we somehow missed a couple of
important changes:

1) the schedtune_exit function is no longer required, because e7ce26f
   fixes accounting of exiting tasks in a different way
2) the schedtune_initialized flag was not set at the end of
   schedtune_init_cgroups(), thus failing to enable SchedTune at boot.

This patch thus is to be considered an integration of e7ce26f.

Signed-off-by: Patrick Bellasi
[jstultz: Cherry-picked from android-3.18. It should be noted that some of this patch was already applied in the 4.4 patches (schedtune_exit doesn't exist for example), but this patch just ensures things are totally synced up]
Signed-off-by: John Stultz
---
 kernel/sched/tune.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index bd7f319ce53e..505d7b35b0e1 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -715,7 +715,7 @@ schedtune_css_free(struct cgroup_subsys_state *css)
 struct cgroup_subsys schedtune_cgrp_subsys = {
 	.css_alloc	= schedtune_css_alloc,
 	.css_free	= schedtune_css_free,
-//	.allow_attach	= schedtune_allow_attach,
+	.allow_attach	= schedtune_allow_attach,
 	.can_attach	= schedtune_can_attach,
 	.cancel_attach	= schedtune_cancel_attach,
 	.legacy_cftypes	= files,
@@ -736,6 +736,8 @@ schedtune_init_cgroups(void)
 
 	pr_info("schedtune: configured to support %d boost groups\n",
 		BOOSTGROUPS_COUNT);
+
+	schedtune_initialized = true;
 }
 
 #else /* CONFIG_CGROUP_SCHEDTUNE */

From 41e58098703b24e33f8372f82a65bb0f82ecf14a Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Wed, 24 Aug 2016 11:27:27 +0100
Subject: [PATCH 573/813] FIXUP: sched/tune: update accounting before CPU
 capacity

The SchedTune task accounting is used to identify how many tasks are in
a boostgroup and thus to bias the selection of an OPP based on the
maximum boost value of the active boostgroups.

The current implementation however updates the accounting after the CPU
capacity has been updated.
This has two effects:
a) when we enqueue a boosted task, we do not immediately boost its CPU
b) when we dequeue a boosted task, we can keep a CPU boosted even if
   not required

This patch changes the order of the SchedTune accounting and SchedFreq
updates to ensure we always have an up-to-date representation of which
boosted tasks are runnable on a CPU before updating its capacity.

Reported-by: Leo Yan
Signed-off-by: Patrick Bellasi
---
 kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index efa516dfd6bc..87a3d793f35b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4255,6 +4255,25 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
+	/*
+	 * Update SchedTune accounting.
+	 *
+	 * We do it before updating the CPU capacity to ensure the
+	 * boost value of the current task is accounted for in the
+	 * selection of the OPP.
+	 *
+	 * We do it also in the case where we enqueue a throttled task;
+	 * we could argue that a throttled task should not boost a CPU,
+	 * however:
+	 * a) properly implementing CPU boosting considering throttled
+	 *    tasks will increase a lot the complexity of the solution
+	 * b) it's not easy to quantify the benefits introduced by
+	 *    such a more complex solution.
+	 * Thus, for the time being we go for the simple solution and boost
+	 * also for throttled RQs.
+	 */
+	schedtune_enqueue_task(p, cpu_of(rq));
+
 	if (!se) {
 		walt_inc_cumulative_runnable_avg(rq, p);
 		if (!task_new && !rq->rd->overutilized &&
@@ -4274,9 +4293,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			update_capacity_of(cpu_of(rq));
 	}
 
-	/* Update SchedTune accouting */
-	schedtune_enqueue_task(p, cpu_of(rq));
-
 #endif /* CONFIG_SMP */
 	hrtick_update(rq);
 }
@@ -4342,6 +4358,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
+	/*
+	 * Update SchedTune accounting
+	 *
+	 * We do it before updating the CPU capacity to ensure the
+	 * boost value of the current task is accounted for in the
+	 * selection of the OPP.
+	 */
+	schedtune_dequeue_task(p, cpu_of(rq));
+
 	if (!se) {
 		walt_dec_cumulative_runnable_avg(rq, p);
 
@@ -4361,9 +4386,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		}
 	}
 
-	/* Update SchedTune accouting */
-	schedtune_dequeue_task(p, cpu_of(rq));
-
 #endif /* CONFIG_SMP */
 	hrtick_update(rq);

From 15023395079accf4c345412f2b972ad5825f76bf Mon Sep 17 00:00:00 2001
From: Jungseung Lee
Date: Tue, 29 Dec 2015 04:47:00 +0800
Subject: [PATCH 574/813] UPSTREAM: ARM: 8494/1: mm: Enable PXN when running
 non-LPAE kernel on LPAE processor

The VMSA field of MMFR0 (bottom 4 bits) is incremented for each added
feature. PXN is supported if the value is >= 4 and LPAE is supported if
it is >= 5.

In case a kernel with CONFIG_ARM_LPAE disabled is used on a processor
that supports LPAE, we can still use PXN in short descriptors. So check
for >= 4 not == 4.

Signed-off-by: Jungseung Lee
Acked-by: Catalin Marinas
Signed-off-by: Ben Hutchings
Signed-off-by: Russell King
---
 arch/arm/mm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 4867f5daf82c..de9f8921e407 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -572,7 +572,7 @@ static void __init build_mem_type_table(void)
 	 * in the Short-descriptor translation table format descriptors.
	 */
 	if (cpu_arch == CPU_ARCH_ARMv7 &&
-	    (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) == 4) {
+	    (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) >= 4) {
 		user_pmd_table |= PMD_PXNTABLE;
 	}
 #endif

From d3a1d035ef89cfb89cf6e2128d44b95771f2c7b4 Mon Sep 17 00:00:00 2001
From: Paul Moore
Date: Tue, 19 Jul 2016 17:42:57 -0400
Subject: [PATCH 575/813] UPSTREAM: audit: fix a double fetch in
 audit_log_single_execve_arg()

(cherry picked from commit 43761473c254b45883a64441dd0bc85a42f3645c)

There is a double fetch problem in audit_log_single_execve_arg() where
we first check the execve(2) arguments for any "bad" characters which
would require hex encoding and then re-fetch the arguments for logging
in the audit record[1]. Of course this leaves a window of opportunity
for an unsavory application to munge with the data.

This patch reworks things by only fetching the argument data once[2]
into a buffer where it is scanned and logged into the audit record(s).
In addition to fixing the double fetch, this patch improves on the
original code in a few other ways: better handling of large arguments
which require encoding, stricter record length checking, and some
performance improvements (completely unverified, but we got rid of
some strlen() calls, that's got to be a good thing).

As part of the development of this patch, I've also created a basic
regression test for the audit-testsuite, the test can be tracked on
GitHub at the following link:

 * https://github.com/linux-audit/audit-testsuite/issues/25

[1] If you pay careful attention, there is actually a triple fetch
problem due to a strnlen_user() call at the top of the function.

[2] This is a tiny white lie, we do make a call to strnlen_user()
prior to fetching the argument data. I don't like it, but due to the
way the audit record is structured we really have no choice unless we
copy the entire argument at once (which would require a rather
wasteful allocation). The good news is that with this patch the kernel
no longer relies on this strnlen_user() value for anything beyond
recording it in the log; we also update it with a trustworthy value
whenever possible.

Reported-by: Pengfei Wang
Cc:
Signed-off-by: Paul Moore
Signed-off-by: Sasha Levin
Change-Id: I10e979e94605e3cf8d461e3e521f8f9837228aa5
Bug: 30956807
---
 kernel/auditsc.c | 332 +++++++++++++++++++++++------------------------
 1 file changed, 164 insertions(+), 168 deletions(-)

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b86cc04959de..48f45987dc6c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -73,6 +73,7 @@
 #include
 #include
 #include
+#include
 #include
 
 #include "audit.h"
@@ -82,7 +83,8 @@
 #define AUDITSC_SUCCESS 1
 #define AUDITSC_FAILURE 2
 
-/* no execve audit message should be longer than this (userspace limits) */
+/* no execve audit message should be longer than this (userspace limits),
+ * see the note near the top of audit_log_execve_info() about this value */
 #define MAX_EXECVE_AUDIT_LEN 7500
 
 /* max length to print of cmdline/proctitle value during audit */
@@ -988,184 +990,178 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
 	return rc;
 }
 
-/*
- * to_send and len_sent accounting are very loose estimates.  We aren't
- * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
- * within about 500 bytes (next page boundary)
- *
- * why snprintf? an int is up to 12 digits long. if we just assumed when
- * logging that a[%d]= was going to be 16 characters long we would be wasting
- * space in every audit message.
In one 7500 byte message we can log up to - * about 1000 min size arguments. That comes down to about 50% waste of space - * if we didn't do the snprintf to find out how long arg_num_len was. - */ -static int audit_log_single_execve_arg(struct audit_context *context, - struct audit_buffer **ab, - int arg_num, - size_t *len_sent, - const char __user *p, - char *buf) -{ - char arg_num_len_buf[12]; - const char __user *tmp_p = p; - /* how many digits are in arg_num? 5 is the length of ' a=""' */ - size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5; - size_t len, len_left, to_send; - size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; - unsigned int i, has_cntl = 0, too_long = 0; - int ret; - - /* strnlen_user includes the null we don't want to send */ - len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1; - - /* - * We just created this mm, if we can't find the strings - * we just copied into it something is _very_ wrong. Similar - * for strings that are too long, we should not have created - * any. - */ - if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) { - send_sig(SIGKILL, current, 0); - return -1; - } - - /* walk the whole argument looking for non-ascii chars */ - do { - if (len_left > MAX_EXECVE_AUDIT_LEN) - to_send = MAX_EXECVE_AUDIT_LEN; - else - to_send = len_left; - ret = copy_from_user(buf, tmp_p, to_send); - /* - * There is no reason for this copy to be short. We just - * copied them here, and the mm hasn't been exposed to user- - * space yet. - */ - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; - } - buf[to_send] = '\0'; - has_cntl = audit_string_contains_control(buf, to_send); - if (has_cntl) { - /* - * hex messages get logged as 2 bytes, so we can only - * send half as much in each message - */ - max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2; - break; - } - len_left -= to_send; - tmp_p += to_send; - } while (len_left > 0); - - len_left = len; - - if (len > max_execve_audit_len) - too_long = 1; - - /* rewalk the argument actually logging the message */ - for (i = 0; len_left > 0; i++) { - int room_left; - - if (len_left > max_execve_audit_len) - to_send = max_execve_audit_len; - else - to_send = len_left; - - /* do we have space left to send this argument in this ab? */ - room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent; - if (has_cntl) - room_left -= (to_send * 2); - else - room_left -= to_send; - if (room_left < 0) { - *len_sent = 0; - audit_log_end(*ab); - *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE); - if (!*ab) - return 0; - } - - /* - * first record needs to say how long the original string was - * so we can be sure nothing was lost. - */ - if ((i == 0) && (too_long)) - audit_log_format(*ab, " a%d_len=%zu", arg_num, - has_cntl ? 
2*len : len); - - /* - * normally arguments are small enough to fit and we already - * filled buf above when we checked for control characters - * so don't bother with another copy_from_user - */ - if (len >= max_execve_audit_len) - ret = copy_from_user(buf, p, to_send); - else - ret = 0; - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; - } - buf[to_send] = '\0'; - - /* actually log it */ - audit_log_format(*ab, " a%d", arg_num); - if (too_long) - audit_log_format(*ab, "[%d]", i); - audit_log_format(*ab, "="); - if (has_cntl) - audit_log_n_hex(*ab, buf, to_send); - else - audit_log_string(*ab, buf); - - p += to_send; - len_left -= to_send; - *len_sent += arg_num_len; - if (has_cntl) - *len_sent += to_send * 2; - else - *len_sent += to_send; - } - /* include the null we didn't log */ - return len + 1; -} - static void audit_log_execve_info(struct audit_context *context, struct audit_buffer **ab) { - int i, len; - size_t len_sent = 0; - const char __user *p; + long len_max; + long len_rem; + long len_full; + long len_buf; + long len_abuf; + long len_tmp; + bool require_data; + bool encode; + unsigned int iter; + unsigned int arg; + char *buf_head; char *buf; + const char __user *p = (const char __user *)current->mm->arg_start; - p = (const char __user *)current->mm->arg_start; + /* NOTE: this buffer needs to be large enough to hold all the non-arg + * data we put in the audit record for this argument (see the + * code below) ... at this point in time 96 is plenty */ + char abuf[96]; - audit_log_format(*ab, "argc=%d", context->execve.argc); + /* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the + * current value of 7500 is not as important as the fact that it + * is less than 8k, a setting of 7500 gives us plenty of wiggle + * room if we go over a little bit in the logging below */ + WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500); + len_max = MAX_EXECVE_AUDIT_LEN; - /* - * we need some kernel buffer to hold the userspace args. Just - * allocate one big one rather than allocating one of the right size - * for every single argument inside audit_log_single_execve_arg() - * should be <8k allocation so should be pretty safe. - */ - buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); - if (!buf) { + /* scratch buffer to hold the userspace args */ + buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); + if (!buf_head) { audit_panic("out of memory for argv string"); return; } + buf = buf_head; - for (i = 0; i < context->execve.argc; i++) { - len = audit_log_single_execve_arg(context, ab, i, - &len_sent, p, buf); - if (len <= 0) - break; - p += len; - } - kfree(buf); + audit_log_format(*ab, "argc=%d", context->execve.argc); + + len_rem = len_max; + len_buf = 0; + len_full = 0; + require_data = true; + encode = false; + iter = 0; + arg = 0; + do { + /* NOTE: we don't ever want to trust this value for anything + * serious, but the audit record format insists we + * provide an argument length for really long arguments, + * e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but + * to use strncpy_from_user() to obtain this value for + * recording in the log, although we don't use it + * anywhere here to avoid a double-fetch problem */ + if (len_full == 0) + len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1; + + /* read more data from userspace */ + if (require_data) { + /* can we make more room in the buffer? 
*/ + if (buf != buf_head) { + memmove(buf_head, buf, len_buf); + buf = buf_head; + } + + /* fetch as much as we can of the argument */ + len_tmp = strncpy_from_user(&buf_head[len_buf], p, + len_max - len_buf); + if (len_tmp == -EFAULT) { + /* unable to copy from userspace */ + send_sig(SIGKILL, current, 0); + goto out; + } else if (len_tmp == (len_max - len_buf)) { + /* buffer is not large enough */ + require_data = true; + /* NOTE: if we are going to span multiple + * buffers force the encoding so we stand + * a chance at a sane len_full value and + * consistent record encoding */ + encode = true; + len_full = len_full * 2; + p += len_tmp; + } else { + require_data = false; + if (!encode) + encode = audit_string_contains_control( + buf, len_tmp); + /* try to use a trusted value for len_full */ + if (len_full < len_max) + len_full = (encode ? + len_tmp * 2 : len_tmp); + p += len_tmp + 1; + } + len_buf += len_tmp; + buf_head[len_buf] = '\0'; + + /* length of the buffer in the audit record? */ + len_abuf = (encode ? len_buf * 2 : len_buf + 2); + } + + /* write as much as we can to the audit log */ + if (len_buf > 0) { + /* NOTE: some magic numbers here - basically if we + * can't fit a reasonable amount of data into the + * existing audit buffer, flush it and start with + * a new buffer */ + if ((sizeof(abuf) + 8) > len_rem) { + len_rem = len_max; + audit_log_end(*ab); + *ab = audit_log_start(context, + GFP_KERNEL, AUDIT_EXECVE); + if (!*ab) + goto out; + } + + /* create the non-arg portion of the arg record */ + len_tmp = 0; + if (require_data || (iter > 0) || + ((len_abuf + sizeof(abuf)) > len_rem)) { + if (iter == 0) { + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d_len=%lu", + arg, len_full); + } + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d[%d]=", arg, iter++); + } else + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d=", arg); + WARN_ON(len_tmp >= sizeof(abuf)); + abuf[sizeof(abuf) - 1] = '\0'; + + /* log the arg in the audit record */ + audit_log_format(*ab, "%s", abuf); + len_rem -= len_tmp; + len_tmp = len_buf; + if (encode) { + if (len_abuf > len_rem) + len_tmp = len_rem / 2; /* encoding */ + audit_log_n_hex(*ab, buf, len_tmp); + len_rem -= len_tmp * 2; + len_abuf -= len_tmp * 2; + } else { + if (len_abuf > len_rem) + len_tmp = len_rem - 2; /* quotes */ + audit_log_n_string(*ab, buf, len_tmp); + len_rem -= len_tmp + 2; + /* don't subtract the "2" because we still need + * to add quotes to the remaining string */ + len_abuf -= len_tmp; + } + len_buf -= len_tmp; + buf += len_tmp; + } + + /* ready to move to the next argument? */ + if ((len_buf == 0) && !require_data) { + arg++; + iter = 0; + len_full = 0; + require_data = true; + encode = false; + } + } while (arg < context->execve.argc); + + /* NOTE: the caller handles the final audit_log_end() call */ + +out: + kfree(buf_head); } static void show_special(struct audit_context *context, int *call_panic) From 410d31b9ffca731afcf2b62872fed27bbdc892d4 Mon Sep 17 00:00:00 2001 From: Anjali Singhai Jain Date: Wed, 21 Oct 2015 19:47:07 -0400 Subject: [PATCH 576/813] i40e: Workaround fix for mss < 256 issue [ Upstream commit 4f2f017c6101ab2ba202d6059c238c15577ad38b ] HW/NVM sets a limit of no less than 256 bytes for MSS. Stack can send as low as 76 bytes MSS. This patch lowers the HW limit to 64 bytes to avoid MDDs from firing and causing a reset when the MSS is lower than 256. 
Change-ID: I36b500a6bb227d283c3e321a7718e0672b11fab0 Signed-off-by: Anjali Singhai Jain Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/i40e/i40e_main.c | 27 +++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 4a9873ec28c7..59b0bcc76a12 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -6685,6 +6685,7 @@ static void i40e_reset_and_rebuild(struct i40e_pf *pf, bool reinit) struct i40e_hw *hw = &pf->hw; u8 set_fc_aq_fail = 0; i40e_status ret; + u32 val; u32 v; /* Now we wait for GRST to settle out. @@ -6823,6 +6824,20 @@ static void i40e_reset_and_rebuild(struct i40e_pf *pf, bool reinit) } } + /* Reconfigure hardware for allowing smaller MSS in the case + * of TSO, so that we avoid the MDD being fired and causing + * a reset in the case of small MSS+TSO. + */ +#define I40E_REG_MSS 0x000E64DC +#define I40E_REG_MSS_MIN_MASK 0x3FF0000 +#define I40E_64BYTE_MSS 0x400000 + val = rd32(hw, I40E_REG_MSS); + if ((val & I40E_REG_MSS_MIN_MASK) > I40E_64BYTE_MSS) { + val &= ~I40E_REG_MSS_MIN_MASK; + val |= I40E_64BYTE_MSS; + wr32(hw, I40E_REG_MSS, val); + } + if (((pf->hw.aq.fw_maj_ver == 4) && (pf->hw.aq.fw_min_ver < 33)) || (pf->hw.aq.fw_maj_ver < 4)) { msleep(75); @@ -10183,6 +10198,7 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) u16 link_status; int err; u32 len; + u32 val; u32 i; u8 set_fc_aq_fail; @@ -10493,6 +10509,17 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) i40e_stat_str(&pf->hw, err), i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + /* Reconfigure hardware for allowing smaller MSS in the case + * of TSO, so that we avoid the MDD being fired and causing + * a reset in the case of small MSS+TSO. + */ + val = rd32(hw, I40E_REG_MSS); + if ((val & I40E_REG_MSS_MIN_MASK) > I40E_64BYTE_MSS) { + val &= ~I40E_REG_MSS_MIN_MASK; + val |= I40E_64BYTE_MSS; + wr32(hw, I40E_REG_MSS, val); + } + if (((pf->hw.aq.fw_maj_ver == 4) && (pf->hw.aq.fw_min_ver < 33)) || (pf->hw.aq.fw_maj_ver < 4)) { msleep(75); From 355528ece5de4764f61221732b49d53cd51f72f6 Mon Sep 17 00:00:00 2001 From: Mitch Williams Date: Wed, 21 Oct 2015 19:47:12 -0400 Subject: [PATCH 577/813] i40evf: handle many MAC filters correctly [ Upstream commit 1418c3458118c6969d08e23aa377da7e2a7be36c ] When a lot (many hundreds) of MAC or VLAN filters are added at one time, we can overflow the Admin Queue buffer size with all the requests. Unfortunately, the driver would then calculate the message size incorrectly, causing it to be rejected by the PF. Furthermore, there was no mechanism to trigger another request to allow for configuring the rest of the filters that didn't fit into the first request. To fix this, recalculate the correct buffer size when we detect the overflow condition instead of just assuming the max buffer size. Also, don't clear the request bit in adapter->aq_required when we have an overflow, so that the rest of the filters can be processed later. 
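Distilled, the fix is two rules: size the message from the number of
entries that actually fit, and keep the "work pending" bit set whenever
entries are left over. A hypothetical sketch of the pattern (invented
constants and names, not the driver code):

#include <linux/types.h>

#define MAX_AQ_BUF	4096	/* hypothetical admin queue limit */
#define HDR_LEN		4	/* hypothetical list header */
#define ENTRY_LEN	6	/* e.g. one MAC address */

/* Return the byte length of the next request; *more tells the caller
 * to leave its "filters pending" flag set so another request is sent
 * later for the remaining entries. */
static size_t next_chunk_len(unsigned int pending, bool *more)
{
	unsigned int count = pending;

	*more = false;
	if (HDR_LEN + count * ENTRY_LEN > MAX_AQ_BUF) {
		count = (MAX_AQ_BUF - HDR_LEN) / ENTRY_LEN;
		*more = true;
	}
	/* size the message from what fits, not the full buffer */
	return HDR_LEN + count * ENTRY_LEN;
}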
Change-ID: Idd7cbbc5af31315e0dcb1b10e6a02ad9817ce65c Signed-off-by: Mitch Williams Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- .../ethernet/intel/i40evf/i40evf_virtchnl.c | 32 ++++++++++++++----- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c index 32e620e1eb5c..5de3f52fd31f 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c @@ -391,6 +391,7 @@ void i40evf_add_ether_addrs(struct i40evf_adapter *adapter) struct i40e_virtchnl_ether_addr_list *veal; int len, i = 0, count = 0; struct i40evf_mac_filter *f; + bool more = false; if (adapter->current_op != I40E_VIRTCHNL_OP_UNKNOWN) { /* bail because we already have a command pending */ @@ -415,7 +416,9 @@ void i40evf_add_ether_addrs(struct i40evf_adapter *adapter) count = (I40EVF_MAX_AQ_BUF_SIZE - sizeof(struct i40e_virtchnl_ether_addr_list)) / sizeof(struct i40e_virtchnl_ether_addr); - len = I40EVF_MAX_AQ_BUF_SIZE; + len = sizeof(struct i40e_virtchnl_ether_addr_list) + + (count * sizeof(struct i40e_virtchnl_ether_addr)); + more = true; } veal = kzalloc(len, GFP_ATOMIC); @@ -431,7 +434,8 @@ void i40evf_add_ether_addrs(struct i40evf_adapter *adapter) f->add = false; } } - adapter->aq_required &= ~I40EVF_FLAG_AQ_ADD_MAC_FILTER; + if (!more) + adapter->aq_required &= ~I40EVF_FLAG_AQ_ADD_MAC_FILTER; i40evf_send_pf_msg(adapter, I40E_VIRTCHNL_OP_ADD_ETHER_ADDRESS, (u8 *)veal, len); kfree(veal); @@ -450,6 +454,7 @@ void i40evf_del_ether_addrs(struct i40evf_adapter *adapter) struct i40e_virtchnl_ether_addr_list *veal; struct i40evf_mac_filter *f, *ftmp; int len, i = 0, count = 0; + bool more = false; if (adapter->current_op != I40E_VIRTCHNL_OP_UNKNOWN) { /* bail because we already have a command pending */ @@ -474,7 +479,9 @@ void i40evf_del_ether_addrs(struct i40evf_adapter *adapter) count = (I40EVF_MAX_AQ_BUF_SIZE - sizeof(struct i40e_virtchnl_ether_addr_list)) / sizeof(struct i40e_virtchnl_ether_addr); - len = I40EVF_MAX_AQ_BUF_SIZE; + len = sizeof(struct i40e_virtchnl_ether_addr_list) + + (count * sizeof(struct i40e_virtchnl_ether_addr)); + more = true; } veal = kzalloc(len, GFP_ATOMIC); if (!veal) @@ -490,7 +497,8 @@ void i40evf_del_ether_addrs(struct i40evf_adapter *adapter) kfree(f); } } - adapter->aq_required &= ~I40EVF_FLAG_AQ_DEL_MAC_FILTER; + if (!more) + adapter->aq_required &= ~I40EVF_FLAG_AQ_DEL_MAC_FILTER; i40evf_send_pf_msg(adapter, I40E_VIRTCHNL_OP_DEL_ETHER_ADDRESS, (u8 *)veal, len); kfree(veal); @@ -509,6 +517,7 @@ void i40evf_add_vlans(struct i40evf_adapter *adapter) struct i40e_virtchnl_vlan_filter_list *vvfl; int len, i = 0, count = 0; struct i40evf_vlan_filter *f; + bool more = false; if (adapter->current_op != I40E_VIRTCHNL_OP_UNKNOWN) { /* bail because we already have a command pending */ @@ -534,7 +543,9 @@ void i40evf_add_vlans(struct i40evf_adapter *adapter) count = (I40EVF_MAX_AQ_BUF_SIZE - sizeof(struct i40e_virtchnl_vlan_filter_list)) / sizeof(u16); - len = I40EVF_MAX_AQ_BUF_SIZE; + len = sizeof(struct i40e_virtchnl_vlan_filter_list) + + (count * sizeof(u16)); + more = true; } vvfl = kzalloc(len, GFP_ATOMIC); if (!vvfl) @@ -549,7 +560,8 @@ void i40evf_add_vlans(struct i40evf_adapter *adapter) f->add = false; } } - adapter->aq_required &= ~I40EVF_FLAG_AQ_ADD_VLAN_FILTER; + if (!more) + adapter->aq_required &= ~I40EVF_FLAG_AQ_ADD_VLAN_FILTER; 
i40evf_send_pf_msg(adapter, I40E_VIRTCHNL_OP_ADD_VLAN, (u8 *)vvfl, len);
 	kfree(vvfl);
 }
 
@@ -567,6 +579,7 @@ void i40evf_del_vlans(struct i40evf_adapter *adapter)
 	struct i40e_virtchnl_vlan_filter_list *vvfl;
 	struct i40evf_vlan_filter *f, *ftmp;
 	int len, i = 0, count = 0;
+	bool more = false;
 
 	if (adapter->current_op != I40E_VIRTCHNL_OP_UNKNOWN) {
 		/* bail because we already have a command pending */
@@ -592,7 +605,9 @@ void i40evf_del_vlans(struct i40evf_adapter *adapter)
 		count = (I40EVF_MAX_AQ_BUF_SIZE -
 			 sizeof(struct i40e_virtchnl_vlan_filter_list)) /
 			sizeof(u16);
-		len = I40EVF_MAX_AQ_BUF_SIZE;
+		len = sizeof(struct i40e_virtchnl_vlan_filter_list) +
+		      (count * sizeof(u16));
+		more = true;
 	}
 	vvfl = kzalloc(len, GFP_ATOMIC);
 	if (!vvfl)
@@ -608,7 +623,8 @@ void i40evf_del_vlans(struct i40evf_adapter *adapter)
 			kfree(f);
 		}
 	}
-	adapter->aq_required &= ~I40EVF_FLAG_AQ_DEL_VLAN_FILTER;
+	if (!more)
+		adapter->aq_required &= ~I40EVF_FLAG_AQ_DEL_VLAN_FILTER;
 	i40evf_send_pf_msg(adapter, I40E_VIRTCHNL_OP_DEL_VLAN, (u8 *)vvfl, len);
 	kfree(vvfl);
 }

From f36bb42d19d9914afb71a422a29fdbc44f22a037 Mon Sep 17 00:00:00 2001
From: Anjali Singhai Jain
Date: Mon, 26 Oct 2015 19:44:29 -0400
Subject: [PATCH 578/813] i40e/i40evf: Fix RS bit update in Tx path and
 disable force WB workaround

[ Upstream commit 6a7fded776a778f728b13d83a2c9fc893580c080 ]

This patch fixes the issue of forcing WB too often, causing us to not
benefit from NAPI.

Without this patch we were forcing WB/arming interrupt too often,
taking away the benefits of NAPI and causing a performance impact.

With this patch we disable force WB in the clean routine for X710 and
XL710 adapters. X722 adapters do not enable interrupt to force a WB
and benefit from WB_ON_ITR and hence force WB is left enabled for
those adapters. For XL710 and X710 adapters if we have less than 4
packets pending, a software interrupt triggered from the service task
will force a WB.

This patch also changes the conditions for setting the RS bit as
described in code comments. This optimizes when the HW does a tail
bump and when it does a WB. It also optimizes when we do a wmb.
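The tail-bump half of this is the stock xmit_more pattern: skip the
doorbell MMIO write while the stack promises more frames, and pay the
memory barrier only on the write that actually happens. A stripped-down
sketch (hypothetical helper, not the driver code):

#include <linux/io.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static void tx_maybe_kick(struct netdev_queue *txq, struct sk_buff *skb,
			  void __iomem *tail, u32 next_to_use)
{
	/* More frames coming and queue still live: defer the doorbell;
	 * the last frame of the burst rings it once for everybody. */
	if (skb->xmit_more && !netif_xmit_stopped(txq))
		return;

	/* Order descriptor writes before the tail bump -- needed only
	 * on the path that really tells the HW to fetch. */
	wmb();
	writel(next_to_use, tail);
}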
Change-ID: Id831e1ae7d3e2ec3f52cd0917b41ce1d22d75d9d Signed-off-by: Anjali Singhai Jain Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 118 +++++++++++------- drivers/net/ethernet/intel/i40evf/i40e_txrx.h | 2 + 2 files changed, 77 insertions(+), 43 deletions(-) diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c index 47e9a90d6b10..9533c44feee4 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c @@ -245,16 +245,6 @@ static bool i40e_clean_tx_irq(struct i40e_ring *tx_ring, int budget) tx_ring->q_vector->tx.total_bytes += total_bytes; tx_ring->q_vector->tx.total_packets += total_packets; - /* check to see if there are any non-cache aligned descriptors - * waiting to be written back, and kick the hardware to force - * them to be written back in case of napi polling - */ - if (budget && - !((i & WB_STRIDE) == WB_STRIDE) && - !test_bit(__I40E_DOWN, &tx_ring->vsi->state) && - (I40E_DESC_UNUSED(tx_ring) != tx_ring->count)) - tx_ring->arm_wb = true; - netdev_tx_completed_queue(netdev_get_tx_queue(tx_ring->netdev, tx_ring->queue_index), total_packets, total_bytes); @@ -1770,6 +1760,9 @@ static inline void i40evf_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb, u32 td_tag = 0; dma_addr_t dma; u16 gso_segs; + u16 desc_count = 0; + bool tail_bump = true; + bool do_rs = false; if (tx_flags & I40E_TX_FLAGS_HW_VLAN) { td_cmd |= I40E_TX_DESC_CMD_IL2TAG1; @@ -1810,6 +1803,8 @@ static inline void i40evf_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb, tx_desc++; i++; + desc_count++; + if (i == tx_ring->count) { tx_desc = I40E_TX_DESC(tx_ring, 0); i = 0; @@ -1829,6 +1824,8 @@ static inline void i40evf_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb, tx_desc++; i++; + desc_count++; + if (i == tx_ring->count) { tx_desc = I40E_TX_DESC(tx_ring, 0); i = 0; @@ -1843,35 +1840,7 @@ static inline void i40evf_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb, tx_bi = &tx_ring->tx_bi[i]; } - /* Place RS bit on last descriptor of any packet that spans across the - * 4th descriptor (WB_STRIDE aka 0x3) in a 64B cacheline. - */ #define WB_STRIDE 0x3 - if (((i & WB_STRIDE) != WB_STRIDE) && - (first <= &tx_ring->tx_bi[i]) && - (first >= &tx_ring->tx_bi[i & ~WB_STRIDE])) { - tx_desc->cmd_type_offset_bsz = - build_ctob(td_cmd, td_offset, size, td_tag) | - cpu_to_le64((u64)I40E_TX_DESC_CMD_EOP << - I40E_TXD_QW1_CMD_SHIFT); - } else { - tx_desc->cmd_type_offset_bsz = - build_ctob(td_cmd, td_offset, size, td_tag) | - cpu_to_le64((u64)I40E_TXD_CMD << - I40E_TXD_QW1_CMD_SHIFT); - } - - netdev_tx_sent_queue(netdev_get_tx_queue(tx_ring->netdev, - tx_ring->queue_index), - first->bytecount); - - /* Force memory writes to complete before letting h/w - * know there are new descriptors to fetch. (Only - * applicable for weak-ordered memory model archs, - * such as IA-64). 
- */ - wmb(); - /* set next_to_watch value indicating a packet is present */ first->next_to_watch = tx_desc; @@ -1881,15 +1850,78 @@ static inline void i40evf_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb, tx_ring->next_to_use = i; + netdev_tx_sent_queue(netdev_get_tx_queue(tx_ring->netdev, + tx_ring->queue_index), + first->bytecount); i40evf_maybe_stop_tx(tx_ring, DESC_NEEDED); + + /* Algorithm to optimize tail and RS bit setting: + * if xmit_more is supported + * if xmit_more is true + * do not update tail and do not mark RS bit. + * if xmit_more is false and last xmit_more was false + * if every packet spanned less than 4 desc + * then set RS bit on 4th packet and update tail + * on every packet + * else + * update tail and set RS bit on every packet. + * if xmit_more is false and last_xmit_more was true + * update tail and set RS bit. + * else (kernel < 3.18) + * if every packet spanned less than 4 desc + * then set RS bit on 4th packet and update tail + * on every packet + * else + * set RS bit on EOP for every packet and update tail + * + * Optimization: wmb to be issued only in case of tail update. + * Also optimize the Descriptor WB path for RS bit with the same + * algorithm. + * + * Note: If there are less than 4 packets + * pending and interrupts were disabled the service task will + * trigger a force WB. + */ + if (skb->xmit_more && + !netif_xmit_stopped(netdev_get_tx_queue(tx_ring->netdev, + tx_ring->queue_index))) { + tx_ring->flags |= I40E_TXR_FLAGS_LAST_XMIT_MORE_SET; + tail_bump = false; + } else if (!skb->xmit_more && + !netif_xmit_stopped(netdev_get_tx_queue(tx_ring->netdev, + tx_ring->queue_index)) && + (!(tx_ring->flags & I40E_TXR_FLAGS_LAST_XMIT_MORE_SET)) && + (tx_ring->packet_stride < WB_STRIDE) && + (desc_count < WB_STRIDE)) { + tx_ring->packet_stride++; + } else { + tx_ring->packet_stride = 0; + tx_ring->flags &= ~I40E_TXR_FLAGS_LAST_XMIT_MORE_SET; + do_rs = true; + } + if (do_rs) + tx_ring->packet_stride = 0; + + tx_desc->cmd_type_offset_bsz = + build_ctob(td_cmd, td_offset, size, td_tag) | + cpu_to_le64((u64)(do_rs ? I40E_TXD_CMD : + I40E_TX_DESC_CMD_EOP) << + I40E_TXD_QW1_CMD_SHIFT); + /* notify HW of packet */ - if (!skb->xmit_more || - netif_xmit_stopped(netdev_get_tx_queue(tx_ring->netdev, - tx_ring->queue_index))) - writel(i, tx_ring->tail); - else + if (!tail_bump) prefetchw(tx_desc + 1); + if (tail_bump) { + /* Force memory writes to complete before letting h/w + * know there are new descriptors to fetch. (Only + * applicable for weak-ordered memory model archs, + * such as IA-64). + */ + wmb(); + writel(i, tx_ring->tail); + } + return; dma_error: diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.h b/drivers/net/ethernet/intel/i40evf/i40e_txrx.h index ebc1bf77f036..998976844e4e 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.h @@ -267,6 +267,8 @@ struct i40e_ring { bool ring_active; /* is ring online or not */ bool arm_wb; /* do something to arm write back */ + u8 packet_stride; +#define I40E_TXR_FLAGS_LAST_XMIT_MORE_SET BIT(2) u16 flags; #define I40E_TXR_FLAGS_WB_ON_ITR BIT(0) From be98bdf8c42915677fc8460fbb376b0bfee1de28 Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Thu, 5 Nov 2015 17:01:01 -0800 Subject: [PATCH 579/813] i40e: fix: do not sleep in netdev_ops [ Upstream commit 0e4425ed641f3eef67c892bc541949cd745a9ba9 ] The driver was being called by VLAN, bonding, teaming operations that expected to be able to hold locks like rcu_read_lock(). 
This causes the driver to be held to the requirement to not sleep, and was found by the kernel debug options for checking sleep inside critical section, and the locking validator. Change-ID: Ibc68c835f5ffa8ffe0638ffe910a66fc5649a7f7 Signed-off-by: Jesse Brandeburg Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/i40e/i40e_main.c | 44 ++++++++++----------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 59b0bcc76a12..1c09f62b85f0 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1547,9 +1547,11 @@ static int i40e_set_mac(struct net_device *netdev, void *p) spin_unlock_bh(&vsi->mac_filter_list_lock); } - i40e_sync_vsi_filters(vsi, false); ether_addr_copy(netdev->dev_addr, addr->sa_data); - + /* schedule our worker thread which will take care of + * applying the new filter changes + */ + i40e_service_event_schedule(vsi->back); return 0; } @@ -2112,12 +2114,7 @@ int i40e_sync_vsi_filters(struct i40e_vsi *vsi, bool grab_rtnl) */ if (pf->cur_promisc != cur_promisc) { pf->cur_promisc = cur_promisc; - if (grab_rtnl) - i40e_do_reset_safe(pf, - BIT(__I40E_PF_RESET_REQUESTED)); - else - i40e_do_reset(pf, - BIT(__I40E_PF_RESET_REQUESTED)); + set_bit(__I40E_PF_RESET_REQUESTED, &pf->state); } } else { ret = i40e_aq_set_vsi_unicast_promiscuous( @@ -2377,16 +2374,13 @@ int i40e_vsi_add_vlan(struct i40e_vsi *vsi, s16 vid) } } - /* Make sure to release before sync_vsi_filter because that - * function will lock/unlock as necessary - */ spin_unlock_bh(&vsi->mac_filter_list_lock); - if (test_bit(__I40E_DOWN, &vsi->back->state) || - test_bit(__I40E_RESET_RECOVERY_PENDING, &vsi->back->state)) - return 0; - - return i40e_sync_vsi_filters(vsi, false); + /* schedule our worker thread which will take care of + * applying the new filter changes + */ + i40e_service_event_schedule(vsi->back); + return 0; } /** @@ -2459,16 +2453,13 @@ int i40e_vsi_kill_vlan(struct i40e_vsi *vsi, s16 vid) } } - /* Make sure to release before sync_vsi_filter because that - * function with lock/unlock as necessary - */ spin_unlock_bh(&vsi->mac_filter_list_lock); - if (test_bit(__I40E_DOWN, &vsi->back->state) || - test_bit(__I40E_RESET_RECOVERY_PENDING, &vsi->back->state)) - return 0; - - return i40e_sync_vsi_filters(vsi, false); + /* schedule our worker thread which will take care of + * applying the new filter changes + */ + i40e_service_event_schedule(vsi->back); + return 0; } /** @@ -2711,6 +2702,11 @@ static void i40e_config_xps_tx_ring(struct i40e_ring *ring) netif_set_xps_queue(ring->netdev, mask, ring->queue_index); free_cpumask_var(mask); } + + /* schedule our worker thread which will take care of + * applying the new filter changes + */ + i40e_service_event_schedule(vsi->back); } /** From 3a87d06fb95e9d8ec3f87edc9383cc3538cd598a Mon Sep 17 00:00:00 2001 From: Kiran Patil Date: Fri, 6 Nov 2015 15:26:03 -0800 Subject: [PATCH 580/813] i40e: Fix memory leaks, sideband filter programming [ Upstream commit a42e7a369ea2b73a554a85dea7d6243af51cd4f0 ] This patch fixes the memory leak which would be seen otherwise when user programs flow-director filter using ethtool (sideband filter programming). When ethtool is used to program flow directory filter, 'raw_buf' gets allocated and it is supposed to be freed as part of queue cleanup. 
But check of 'tx_buffer->skb' was preventing it from being freed. Change-ID: Ief4f0a1a32a653180498bf6e987c1b4342ab8923 Signed-off-by: Kiran Patil Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 19 ++++++++++++++----- drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 10 +++++----- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 635b3ac17877..1e1211875005 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -235,6 +235,9 @@ static int i40e_add_del_fdir_udpv4(struct i40e_vsi *vsi, "Filter deleted for PCTYPE %d loc = %d\n", fd_data->pctype, fd_data->fd_id); } + if (err) + kfree(raw_packet); + return err ? -EOPNOTSUPP : 0; } @@ -312,6 +315,9 @@ static int i40e_add_del_fdir_tcpv4(struct i40e_vsi *vsi, fd_data->pctype, fd_data->fd_id); } + if (err) + kfree(raw_packet); + return err ? -EOPNOTSUPP : 0; } @@ -387,6 +393,9 @@ static int i40e_add_del_fdir_ipv4(struct i40e_vsi *vsi, } } + if (err) + kfree(raw_packet); + return err ? -EOPNOTSUPP : 0; } @@ -526,11 +535,7 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring, struct i40e_tx_buffer *tx_buffer) { if (tx_buffer->skb) { - if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB) - kfree(tx_buffer->raw_buf); - else - dev_kfree_skb_any(tx_buffer->skb); - + dev_kfree_skb_any(tx_buffer->skb); if (dma_unmap_len(tx_buffer, len)) dma_unmap_single(ring->dev, dma_unmap_addr(tx_buffer, dma), @@ -542,6 +547,10 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring, dma_unmap_len(tx_buffer, len), DMA_TO_DEVICE); } + + if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB) + kfree(tx_buffer->raw_buf); + tx_buffer->next_to_watch = NULL; tx_buffer->skb = NULL; dma_unmap_len_set(tx_buffer, len, 0); diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c index 9533c44feee4..a8a29c374224 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c @@ -51,11 +51,7 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring, struct i40e_tx_buffer *tx_buffer) { if (tx_buffer->skb) { - if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB) - kfree(tx_buffer->raw_buf); - else - dev_kfree_skb_any(tx_buffer->skb); - + dev_kfree_skb_any(tx_buffer->skb); if (dma_unmap_len(tx_buffer, len)) dma_unmap_single(ring->dev, dma_unmap_addr(tx_buffer, dma), @@ -67,6 +63,10 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring, dma_unmap_len(tx_buffer, len), DMA_TO_DEVICE); } + + if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB) + kfree(tx_buffer->raw_buf); + tx_buffer->next_to_watch = NULL; tx_buffer->skb = NULL; dma_unmap_len_set(tx_buffer, len, 0); From b085e4dd706156b5fe7322b9dc6cf80dedc53a78 Mon Sep 17 00:00:00 2001 From: Mitch Williams Date: Thu, 19 Nov 2015 11:34:16 -0800 Subject: [PATCH 581/813] i40e: properly delete VF MAC filters [ Upstream commit b36e9ab59b7e3a5b14bf88dc0536e6579db7b54d ] The virtual channel interface was using incorrect semantics to remove MAC addresses, which would leave incorrect filters active when using VLANs. To correct this, add a new function that unconditionally removes MAC addresses from all VLANs, and call this function when the VF requests a MAC filter removal. 
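A compact userspace sketch of the remove-across-all-VLANs semantics the new helper implements (hypothetical types; the driver instead walks vsi->mac_filter_list under its spinlock and marks entries for the filter-sync code):

    #include <stdio.h>
    #include <string.h>

    struct filt { const char *mac; int vlan; int active; };

    /* Deactivate every matching MAC, regardless of which VLAN it is on. */
    static int del_mac_all_vlan(struct filt *f, int n, const char *mac)
    {
        int i, changed = 0;

        for (i = 0; i < n; i++)
            if (f[i].active && !strcmp(f[i].mac, mac)) {
                f[i].active = 0;
                changed = 1;
            }
        return changed ? 0 : -1;        /* -ENOENT analogue */
    }

    int main(void)
    {
        struct filt tbl[] = {
            { "aa:bb", 0, 1 }, { "aa:bb", 100, 1 }, { "cc:dd", 0, 1 },
        };
        int i;

        del_mac_all_vlan(tbl, 3, "aa:bb");      /* both aa:bb entries go */
        for (i = 0; i < 3; i++)
            printf("%s vlan %d active=%d\n",
                   tbl[i].mac, tbl[i].vlan, tbl[i].active);
        return 0;
    }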
Change-ID: I69826908ae4f6c847f5bf9b32f11faa760189c74 Signed-off-by: Mitch Williams Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/i40e/i40e.h | 2 ++ drivers/net/ethernet/intel/i40e/i40e_main.c | 36 +++++++++++++++++++ .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 8 +++-- 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index 4dd3e26129b4..7e258a83ccab 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -767,6 +767,8 @@ int i40e_vsi_add_vlan(struct i40e_vsi *vsi, s16 vid); int i40e_vsi_kill_vlan(struct i40e_vsi *vsi, s16 vid); struct i40e_mac_filter *i40e_put_mac_in_vlan(struct i40e_vsi *vsi, u8 *macaddr, bool is_vf, bool is_netdev); +int i40e_del_mac_all_vlan(struct i40e_vsi *vsi, u8 *macaddr, + bool is_vf, bool is_netdev); bool i40e_is_vsi_in_vlan(struct i40e_vsi *vsi); struct i40e_mac_filter *i40e_find_mac(struct i40e_vsi *vsi, u8 *macaddr, bool is_vf, bool is_netdev); diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 1c09f62b85f0..cd207f6e7f91 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1316,6 +1316,42 @@ struct i40e_mac_filter *i40e_put_mac_in_vlan(struct i40e_vsi *vsi, u8 *macaddr, struct i40e_mac_filter, list); } +/** + * i40e_del_mac_all_vlan - Remove a MAC filter from all VLANS + * @vsi: the VSI to be searched + * @macaddr: the mac address to be removed + * @is_vf: true if it is a VF + * @is_netdev: true if it is a netdev + * + * Removes a given MAC address from a VSI, regardless of VLAN + * + * Returns 0 for success, or error + **/ +int i40e_del_mac_all_vlan(struct i40e_vsi *vsi, u8 *macaddr, + bool is_vf, bool is_netdev) +{ + struct i40e_mac_filter *f = NULL; + int changed = 0; + + WARN(!spin_is_locked(&vsi->mac_filter_list_lock), + "Missing mac_filter_list_lock\n"); + list_for_each_entry(f, &vsi->mac_filter_list, list) { + if ((ether_addr_equal(macaddr, f->macaddr)) && + (is_vf == f->is_vf) && + (is_netdev == f->is_netdev)) { + f->counter--; + f->changed = true; + changed = 1; + } + } + if (changed) { + vsi->flags |= I40E_VSI_FLAG_FILTER_CHANGED; + vsi->back->flags |= I40E_FLAG_FILTER_SYNC; + return 0; + } + return -ENOENT; +} + /** * i40e_rm_default_mac_filter - Remove the default MAC filter set by NVM * @vsi: the PF Main VSI - inappropriate for any other VSI diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 44462b40f2d7..388069150cab 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -1680,8 +1680,12 @@ static int i40e_vc_del_mac_addr_msg(struct i40e_vf *vf, u8 *msg, u16 msglen) spin_lock_bh(&vsi->mac_filter_list_lock); /* delete addresses from the list */ for (i = 0; i < al->num_elements; i++) - i40e_del_filter(vsi, al->list[i].addr, - I40E_VLAN_ANY, true, false); + if (i40e_del_mac_all_vlan(vsi, al->list[i].addr, true, false)) { + ret = I40E_ERR_INVALID_MAC_ADDR; + spin_unlock_bh(&vsi->mac_filter_list_lock); + goto error_param; + } + spin_unlock_bh(&vsi->mac_filter_list_lock); /* program the updated filter list */ From 796adc8527c3ce901bd813816ac0ec9ab212ee81 Mon Sep 17 00:00:00 2001 From: Mitch Williams Date: Thu, 19 Nov 2015 11:34:17 -0800 Subject: [PATCH 582/813] i40e: 
don't add zero MAC filter [ Upstream commit b7b713a8eaf325607d37229f024ad0b9f3e7f320 ] When VFs are created, the MAC address defaults to all zeros, indicating to the VF driver that it should use a random MAC address. However, the PF driver was incorrectly adding this zero MAC to the filter table, along with the VF's randomly generated MAC address. Check for a good address before adding the default filter. While we're at it, make the error message a bit more useful. Change-ID: Ia100947d68140e0f73a19ba755cbffc3e79a8fcf Signed-off-by: Mitch Williams Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- .../net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 388069150cab..e116d9a99b8e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -549,12 +549,15 @@ static int i40e_alloc_vsi_res(struct i40e_vf *vf, enum i40e_vsi_type type) i40e_vsi_add_pvid(vsi, vf->port_vlan_id); spin_lock_bh(&vsi->mac_filter_list_lock); - f = i40e_add_filter(vsi, vf->default_lan_addr.addr, - vf->port_vlan_id ? vf->port_vlan_id : -1, - true, false); - if (!f) - dev_info(&pf->pdev->dev, - "Could not allocate VF MAC addr\n"); + if (is_valid_ether_addr(vf->default_lan_addr.addr)) { + f = i40e_add_filter(vsi, vf->default_lan_addr.addr, + vf->port_vlan_id ? vf->port_vlan_id : -1, + true, false); + if (!f) + dev_info(&pf->pdev->dev, + "Could not add MAC filter %pM for VF %d\n", + vf->default_lan_addr.addr, vf->vf_id); + } f = i40e_add_filter(vsi, brdcast, vf->port_vlan_id ? vf->port_vlan_id : -1, true, false); From 0af31973b5fd298a52e9429751be5e4bade42a83 Mon Sep 17 00:00:00 2001 From: Mitch Williams Date: Thu, 19 Nov 2015 11:34:18 -0800 Subject: [PATCH 583/813] i40evf: check rings before freeing resources [ Upstream commit fdb47ae87af537b24977a03bc69cfe1c5c55ca62 ] If the driver gets unloaded during reset recovery, it's possible that it will attempt to free resources when they're already free. Add a check to make sure that the Tx and Rx rings actually exist before dereferencing them to free resources. 
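The guard itself is the usual protection against teardown racing with an aborted or half-finished bring-up; a standalone model (hypothetical types):

    #include <stdio.h>
    #include <stdlib.h>

    struct adapter { void **tx_rings; int num_active_queues; };

    static void free_all_tx_resources(struct adapter *ad)
    {
        int i;

        if (!ad->tx_rings)      /* never allocated, or already freed */
            return;
        for (i = 0; i < ad->num_active_queues; i++)
            free(ad->tx_rings[i]);
    }

    int main(void)
    {
        struct adapter ad = { NULL, 4 };        /* reset left rings unallocated */

        free_all_tx_resources(&ad);     /* safe no-op instead of a NULL deref */
        puts("ok");
        return 0;
    }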
Change-ID: I4d2b7e9ede49f634d421a4c5deaa5446bc755eee Signed-off-by: Mitch Williams Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/i40evf/i40evf_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c index 99d2cffae0cd..5f03ab3dfa19 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c @@ -1864,6 +1864,9 @@ void i40evf_free_all_tx_resources(struct i40evf_adapter *adapter) { int i; + if (!adapter->tx_rings) + return; + for (i = 0; i < adapter->num_active_queues; i++) if (adapter->tx_rings[i]->desc) i40evf_free_tx_resources(adapter->tx_rings[i]); @@ -1932,6 +1935,9 @@ void i40evf_free_all_rx_resources(struct i40evf_adapter *adapter) { int i; + if (!adapter->rx_rings) + return; + for (i = 0; i < adapter->num_active_queues; i++) if (adapter->rx_rings[i]->desc) i40evf_free_rx_resources(adapter->rx_rings[i]); From 42622b1b7a5211f046ab6d56f67e156d05019c10 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 19 Nov 2015 11:34:23 -0800 Subject: [PATCH 584/813] i40e: clean whole mac filter list [ Upstream commit f11999987bc0b5559ab56dedc6f4ca32fab5438a ] Clean the whole mac filter list when resetting after an intermediate add or delete push to the firmware. The code had evolved from using a list from the stack to a heap allocation, but the memset() didn't follow the change correctly. This now cleans the whole list rather than just part of the first element. Change-ID: I4cd03d5a103b7407dd8556a3a231e800f2d6f2d5 Reported-by: Rasmus Villemoes Signed-off-by: Shannon Nelson Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/i40e/i40e_main.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index cd207f6e7f91..2215bebe208e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1973,11 +1973,13 @@ int i40e_sync_vsi_filters(struct i40e_vsi *vsi, bool grab_rtnl) /* Now process 'del_list' outside the lock */ if (!list_empty(&tmp_del_list)) { + int del_list_size; + filter_list_len = pf->hw.aq.asq_buf_size / sizeof(struct i40e_aqc_remove_macvlan_element_data); - del_list = kcalloc(filter_list_len, - sizeof(struct i40e_aqc_remove_macvlan_element_data), - GFP_KERNEL); + del_list_size = filter_list_len * + sizeof(struct i40e_aqc_remove_macvlan_element_data); + del_list = kzalloc(del_list_size, GFP_KERNEL); if (!del_list) { i40e_cleanup_add_list(&tmp_add_list); @@ -2009,7 +2011,7 @@ int i40e_sync_vsi_filters(struct i40e_vsi *vsi, bool grab_rtnl) NULL); aq_err = pf->hw.aq.asq_last_status; num_del = 0; - memset(del_list, 0, sizeof(*del_list)); + memset(del_list, 0, del_list_size); if (ret && aq_err != I40E_AQ_RC_ENOENT) dev_err(&pf->pdev->dev, @@ -2042,13 +2044,14 @@ int i40e_sync_vsi_filters(struct i40e_vsi *vsi, bool grab_rtnl) } if (!list_empty(&tmp_add_list)) { + int add_list_size; /* do all the adds now */ filter_list_len = pf->hw.aq.asq_buf_size / sizeof(struct i40e_aqc_add_macvlan_element_data), - add_list = kcalloc(filter_list_len, - sizeof(struct i40e_aqc_add_macvlan_element_data), - GFP_KERNEL); + add_list_size = filter_list_len * + sizeof(struct
i40e_aqc_add_macvlan_element_data); + add_list = kzalloc(add_list_size, GFP_KERNEL); if (!add_list) { /* Purge element from temporary lists */ i40e_cleanup_add_list(&tmp_add_list); @@ -2086,7 +2089,7 @@ int i40e_sync_vsi_filters(struct i40e_vsi *vsi, bool grab_rtnl) if (ret) break; - memset(add_list, 0, sizeof(*add_list)); + memset(add_list, 0, add_list_size); } /* Entries from tmp_add_list were cloned from MAC * filter list, hence clean those cloned entries From b909cfc8722a7ed3eb3c881a9ed96ba0f0c42fbe Mon Sep 17 00:00:00 2001 From: Anjali Singhai Jain Date: Wed, 9 Dec 2015 15:50:21 -0800 Subject: [PATCH 585/813] i40e: Fix Rx hash reported to the stack by our driver [ Upstream commit 857942fd1aa15edf7356a4a4bad5369c8e70a633 ] If the driver calls skb_set_hash even with a zero hash, that indicates to the stack that the hash calculation is offloaded in hardware. So the Stack doesn't do a SW hash which is required for load balancing if the user decides to turn of rx-hashing on our device. This patch fixes the path so that we do not call skb_set_hash if the feature is disabled. Change-ID: Ic4debfa4ff91b5a72e447348a75768ed7a2d3e1b Signed-off-by: Anjali Singhai Jain Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 54 ++++++++++--------- drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 54 ++++++++++--------- 2 files changed, 58 insertions(+), 50 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 1e1211875005..26c55bba4bf3 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1425,31 +1425,12 @@ checksum_fail: } /** - * i40e_rx_hash - returns the hash value from the Rx descriptor - * @ring: descriptor ring - * @rx_desc: specific descriptor - **/ -static inline u32 i40e_rx_hash(struct i40e_ring *ring, - union i40e_rx_desc *rx_desc) -{ - const __le64 rss_mask = - cpu_to_le64((u64)I40E_RX_DESC_FLTSTAT_RSS_HASH << - I40E_RX_DESC_STATUS_FLTSTAT_SHIFT); - - if ((ring->netdev->features & NETIF_F_RXHASH) && - (rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) - return le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss); - else - return 0; -} - -/** - * i40e_ptype_to_hash - get a hash type + * i40e_ptype_to_htype - get a hash type * @ptype: the ptype value from the descriptor * * Returns a hash type to be used by skb_set_hash **/ -static inline enum pkt_hash_types i40e_ptype_to_hash(u8 ptype) +static inline enum pkt_hash_types i40e_ptype_to_htype(u8 ptype) { struct i40e_rx_ptype_decoded decoded = decode_rx_desc_ptype(ptype); @@ -1466,6 +1447,30 @@ static inline enum pkt_hash_types i40e_ptype_to_hash(u8 ptype) return PKT_HASH_TYPE_L2; } +/** + * i40e_rx_hash - set the hash value in the skb + * @ring: descriptor ring + * @rx_desc: specific descriptor + **/ +static inline void i40e_rx_hash(struct i40e_ring *ring, + union i40e_rx_desc *rx_desc, + struct sk_buff *skb, + u8 rx_ptype) +{ + u32 hash; + const __le64 rss_mask = + cpu_to_le64((u64)I40E_RX_DESC_FLTSTAT_RSS_HASH << + I40E_RX_DESC_STATUS_FLTSTAT_SHIFT); + + if (ring->netdev->features & NETIF_F_RXHASH) + return; + + if ((rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) { + hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss); + skb_set_hash(skb, hash, i40e_ptype_to_htype(rx_ptype)); + } +} + /** * i40e_clean_rx_irq_ps - Reclaim resources after receive; packet split * @rx_ring: rx ring to clean @@ 
-1615,8 +1620,8 @@ static int i40e_clean_rx_irq_ps(struct i40e_ring *rx_ring, int budget) continue; } - skb_set_hash(skb, i40e_rx_hash(rx_ring, rx_desc), - i40e_ptype_to_hash(rx_ptype)); + i40e_rx_hash(rx_ring, rx_desc, skb, rx_ptype); + if (unlikely(rx_status & I40E_RXD_QW1_STATUS_TSYNVALID_MASK)) { i40e_ptp_rx_hwtstamp(vsi->back, skb, (rx_status & I40E_RXD_QW1_STATUS_TSYNINDX_MASK) >> @@ -1745,8 +1750,7 @@ static int i40e_clean_rx_irq_1buf(struct i40e_ring *rx_ring, int budget) continue; } - skb_set_hash(skb, i40e_rx_hash(rx_ring, rx_desc), - i40e_ptype_to_hash(rx_ptype)); + i40e_rx_hash(rx_ring, rx_desc, skb, rx_ptype); if (unlikely(rx_status & I40E_RXD_QW1_STATUS_TSYNVALID_MASK)) { i40e_ptp_rx_hwtstamp(vsi->back, skb, (rx_status & I40E_RXD_QW1_STATUS_TSYNINDX_MASK) >> diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c index a8a29c374224..39db70a597ed 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c @@ -879,31 +879,12 @@ checksum_fail: } /** - * i40e_rx_hash - returns the hash value from the Rx descriptor - * @ring: descriptor ring - * @rx_desc: specific descriptor - **/ -static inline u32 i40e_rx_hash(struct i40e_ring *ring, - union i40e_rx_desc *rx_desc) -{ - const __le64 rss_mask = - cpu_to_le64((u64)I40E_RX_DESC_FLTSTAT_RSS_HASH << - I40E_RX_DESC_STATUS_FLTSTAT_SHIFT); - - if ((ring->netdev->features & NETIF_F_RXHASH) && - (rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) - return le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss); - else - return 0; -} - -/** - * i40e_ptype_to_hash - get a hash type + * i40e_ptype_to_htype - get a hash type * @ptype: the ptype value from the descriptor * * Returns a hash type to be used by skb_set_hash **/ -static inline enum pkt_hash_types i40e_ptype_to_hash(u8 ptype) +static inline enum pkt_hash_types i40e_ptype_to_htype(u8 ptype) { struct i40e_rx_ptype_decoded decoded = decode_rx_desc_ptype(ptype); @@ -920,6 +901,30 @@ static inline enum pkt_hash_types i40e_ptype_to_hash(u8 ptype) return PKT_HASH_TYPE_L2; } +/** + * i40e_rx_hash - set the hash value in the skb + * @ring: descriptor ring + * @rx_desc: specific descriptor + **/ +static inline void i40e_rx_hash(struct i40e_ring *ring, + union i40e_rx_desc *rx_desc, + struct sk_buff *skb, + u8 rx_ptype) +{ + u32 hash; + const __le64 rss_mask = + cpu_to_le64((u64)I40E_RX_DESC_FLTSTAT_RSS_HASH << + I40E_RX_DESC_STATUS_FLTSTAT_SHIFT); + + if (ring->netdev->features & NETIF_F_RXHASH) + return; + + if ((rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) { + hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss); + skb_set_hash(skb, hash, i40e_ptype_to_htype(rx_ptype)); + } +} + /** * i40e_clean_rx_irq_ps - Reclaim resources after receive; packet split * @rx_ring: rx ring to clean @@ -1061,8 +1066,8 @@ static int i40e_clean_rx_irq_ps(struct i40e_ring *rx_ring, int budget) continue; } - skb_set_hash(skb, i40e_rx_hash(rx_ring, rx_desc), - i40e_ptype_to_hash(rx_ptype)); + i40e_rx_hash(rx_ring, rx_desc, skb, rx_ptype); + /* probably a little skewed due to removing CRC */ total_rx_bytes += skb->len; total_rx_packets++; @@ -1179,8 +1184,7 @@ static int i40e_clean_rx_irq_1buf(struct i40e_ring *rx_ring, int budget) continue; } - skb_set_hash(skb, i40e_rx_hash(rx_ring, rx_desc), - i40e_ptype_to_hash(rx_ptype)); + i40e_rx_hash(rx_ring, rx_desc, skb, rx_ptype); /* probably a little skewed due to removing CRC */ total_rx_bytes += skb->len; total_rx_packets++; From 
d5463792a1c1bd788aca2024f96e3136c875c01d Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Thu, 10 Sep 2015 15:37:50 -0400 Subject: [PATCH 586/813] igb: don't unmap NULL hw_addr [ Upstream commit 73bf8048d7c86a20a59d427e55deb1a778e94df7 ] I've got a startech thunderbolt dock someone loaned me, which among other things, has the following device in it: 08:00.0 Ethernet controller: Intel Corporation I210 Gigabit Network Connection (rev 03) This hotplugs just fine (kernel 4.2.0 plus a patch or two here): [ 863.020315] igb: Intel(R) Gigabit Ethernet Network Driver - version 5.2.18-k [ 863.020316] igb: Copyright (c) 2007-2014 Intel Corporation. [ 863.028657] igb 0000:08:00.0: enabling device (0000 -> 0002) [ 863.062089] igb 0000:08:00.0: added PHC on eth0 [ 863.062090] igb 0000:08:00.0: Intel(R) Gigabit Ethernet Network Connection [ 863.062091] igb 0000:08:00.0: eth0: (PCIe:2.5Gb/s:Width x1) e8:ea:6a:00:1b:2a [ 863.062194] igb 0000:08:00.0: eth0: PBA No: 000200-000 [ 863.062196] igb 0000:08:00.0: Using MSI-X interrupts. 4 rx queue(s), 4 tx queue(s) [ 863.064889] igb 0000:08:00.0 enp8s0: renamed from eth0 But disconnecting it is another story: [ 1002.807932] igb 0000:08:00.0: removed PHC on enp8s0 [ 1002.807944] igb 0000:08:00.0 enp8s0: PCIe link lost, device now detached [ 1003.341141] ------------[ cut here ]------------ [ 1003.341148] WARNING: CPU: 0 PID: 199 at lib/iomap.c:43 bad_io_access+0x38/0x40() [ 1003.341149] Bad IO access at port 0x0 () [ 1003.342767] Modules linked in: snd_usb_audio snd_usbmidi_lib snd_rawmidi igb dca firewire_ohci firewire_core crc_itu_t rfcomm ctr ccm arc4 iwlmvm mac80211 fuse xt_CHECKSUM ipt_MASQUERADE nf_nat_masquerade_ipv4 tun ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw iptable_filter bnep dm_mirror dm_region_hash dm_log dm_mod coretemp x86_pkg_temp_thermal intel_powerclamp kvm_intel snd_hda_codec_hdmi kvm crct10dif_pclmul crc32_pclmul ghash_clmulni_intel drbg [ 1003.342793] ansi_cprng aesni_intel hp_wmi aes_x86_64 iTCO_wdt lrw iTCO_vendor_support ppdev gf128mul sparse_keymap glue_helper ablk_helper cryptd snd_hda_codec_realtek snd_hda_codec_generic microcode snd_hda_intel uvcvideo iwlwifi snd_hda_codec videobuf2_vmalloc videobuf2_memops snd_hda_core videobuf2_core snd_hwdep btusb v4l2_common btrtl snd_seq btbcm btintel videodev cfg80211 snd_seq_device rtsx_pci_ms bluetooth pcspkr input_leds i2c_i801 media parport_pc memstick rfkill sg lpc_ich snd_pcm 8250_fintek parport joydev snd_timer snd soundcore hp_accel ie31200_edac mei_me lis3lv02d edac_core input_polldev mei hp_wireless shpchp tpm_infineon sch_fq_codel nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables autofs4 xfs libcrc32c sd_mod sr_mod cdrom rtsx_pci_sdmmc mmc_core crc32c_intel serio_raw rtsx_pci [ 1003.342822] nouveau ahci libahci mxm_wmi e1000e xhci_pci hwmon ptp drm_kms_helper pps_core xhci_hcd ttm wmi video ipv6 [ 1003.342839] CPU: 0 PID: 199 Comm: kworker/0:2 Not tainted 4.2.0-2.el7_UNSUPPORTED.x86_64 #1 [ 1003.342840] Hardware name: Hewlett-Packard HP ZBook 15 G2/2253, BIOS M70 Ver. 
01.07 02/26/2015 [ 1003.342843] Workqueue: pciehp-3 pciehp_power_thread [ 1003.342844] ffffffff81a90655 ffff8804866d3b48 ffffffff8164763a 0000000000000000 [ 1003.342846] ffff8804866d3b98 ffff8804866d3b88 ffffffff8107134a ffff8804866d3b88 [ 1003.342847] ffff880486f46000 ffff88046c8a8000 ffff880486f46840 ffff88046c8a8098 [ 1003.342848] Call Trace: [ 1003.342852] [] dump_stack+0x45/0x57 [ 1003.342855] [] warn_slowpath_common+0x8a/0xc0 [ 1003.342857] [] warn_slowpath_fmt+0x46/0x50 [ 1003.342859] [] ? pci_disable_msix+0x3e/0x50 [ 1003.342860] [] bad_io_access+0x38/0x40 [ 1003.342861] [] pci_iounmap+0x27/0x40 [ 1003.342865] [] igb_remove+0xc7/0x160 [igb] [ 1003.342867] [] pci_device_remove+0x3f/0xc0 [ 1003.342869] [] __device_release_driver+0x96/0x130 [ 1003.342870] [] device_release_driver+0x23/0x30 [ 1003.342871] [] pci_stop_bus_device+0x94/0xa0 [ 1003.342872] [] pci_stop_bus_device+0x3d/0xa0 [ 1003.342873] [] pci_stop_bus_device+0x3d/0xa0 [ 1003.342874] [] pci_stop_and_remove_bus_device+0x16/0x30 [ 1003.342876] [] pciehp_unconfigure_device+0x9b/0x180 [ 1003.342877] [] pciehp_disable_slot+0x43/0xb0 [ 1003.342878] [] pciehp_power_thread+0x8d/0xb0 [ 1003.342885] [] process_one_work+0x152/0x3d0 [ 1003.342886] [] worker_thread+0x11a/0x460 [ 1003.342887] [] ? process_one_work+0x3d0/0x3d0 [ 1003.342890] [] kthread+0xc9/0xe0 [ 1003.342891] [] ? kthread_create_on_node+0x180/0x180 [ 1003.342893] [] ret_from_fork+0x3f/0x70 [ 1003.342894] [] ? kthread_create_on_node+0x180/0x180 [ 1003.342895] ---[ end trace 65a77e06d5aa9358 ]--- Upon looking at the igb driver, I see that igb_rd32() attempted to read from hw_addr and failed, so it set hw->hw_addr to NULL and spit out the message in the log output above, "PCIe link lost, device now detached". Well, now that hw_addr is NULL, the attempt to call pci_iounmap is obviously not going to go well. As suggested by Mark Rustad, do something similar to what ixgbe does, and save a copy of hw_addr as adapter->io_addr, so we can still call pci_iounmap on it on teardown. Additionally, for consistency, make the pci_iomap call assignment directly to io_addr, so map and unmap match. 
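The essence of the fix can be modeled in plain C, with malloc/free standing in for pci_iomap/pci_iounmap: keep a private copy of the mapping cookie so teardown never depends on a field that the surprise-removal path may have cleared.

    #include <stdio.h>
    #include <stdlib.h>

    struct hw { char *hw_addr; };

    int main(void)
    {
        struct hw hw;
        char *io_addr = malloc(64);     /* stand-in for pci_iomap() */

        hw.hw_addr = io_addr;   /* working copy, used for register access */
        hw.hw_addr = NULL;      /* what the driver does on PCIe link loss */
        free(io_addr);          /* teardown still holds the original cookie */
        puts("unmapped via the saved copy");
        return 0;
    }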
Signed-off-by: Jarod Wilson Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/igb/igb.h | 2 ++ drivers/net/ethernet/intel/igb/igb_main.c | 10 ++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index 1a2f1cc44b28..e3cb93bdb21a 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -389,6 +389,8 @@ struct igb_adapter { u16 link_speed; u16 link_duplex; + u8 __iomem *io_addr; /* Mainly for iounmap use */ + struct work_struct reset_task; struct work_struct watchdog_task; bool fc_autoneg; diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index ea7b09887245..061e1026af76 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2294,9 +2294,11 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent) adapter->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE); err = -EIO; - hw->hw_addr = pci_iomap(pdev, 0, 0); - if (!hw->hw_addr) + adapter->io_addr = pci_iomap(pdev, 0, 0); + if (!adapter->io_addr) goto err_ioremap; + /* hw->hw_addr can be altered, we'll use adapter->io_addr for unmap */ + hw->hw_addr = adapter->io_addr; netdev->netdev_ops = &igb_netdev_ops; igb_set_ethtool_ops(netdev); @@ -2656,7 +2658,7 @@ err_sw_init: #ifdef CONFIG_PCI_IOV igb_disable_sriov(pdev); #endif - pci_iounmap(pdev, hw->hw_addr); + pci_iounmap(pdev, adapter->io_addr); err_ioremap: free_netdev(netdev); err_alloc_etherdev: @@ -2823,7 +2825,7 @@ static void igb_remove(struct pci_dev *pdev) igb_clear_interrupt_scheme(adapter); - pci_iounmap(pdev, hw->hw_addr); + pci_iounmap(pdev, adapter->io_addr); if (hw->flash_address) iounmap(hw->flash_address); pci_release_selected_regions(pdev, From 1231f5a2e59477f896c69d3ed66d9efde20603ba Mon Sep 17 00:00:00 2001 From: Todd Fujinaka Date: Fri, 18 Sep 2015 15:43:51 -0700 Subject: [PATCH 587/813] igb: use the correct i210 register for EEMNGCTL [ Upstream commit 08c991297582114a6e1220f913eec91789c4eac6 ] The i210 has two EEPROM access registers that are located in non-standard offsets: EEARBC and EEMNGCTL. EEARBC was fixed previously and EEMNGCTL should also be corrected. 
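A standalone model of the config-done poll the patch introduces; the two register offsets are the ones from the patch, while the register read and the done mask are stubs for illustration:

    #include <stdio.h>

    #define E1000_EEMNGCTL      0x01010     /* legacy parts */
    #define E1000_EEMNGCTL_I210 0x12030     /* i210 moved the register */

    static unsigned int rd32(unsigned int reg)
    {
        (void)reg;
        return ~0u;     /* stub: pretend the done bit is set */
    }

    int main(void)
    {
        unsigned int mask = 1u;     /* stand-in for the port-0 done bit */
        int timeout = 100;

        printf("legacy 0x%05X, i210 0x%05X\n",
               E1000_EEMNGCTL, E1000_EEMNGCTL_I210);
        while (timeout && !(rd32(E1000_EEMNGCTL_I210) & mask))
            timeout--;
        puts(timeout ? "config done" : "timed out (EEPROM-less part?)");
        return 0;
    }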
Reported-by: Roman Hodek Signed-off-by: Todd Fujinaka Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/igb/e1000_82575.c | 1 + drivers/net/ethernet/intel/igb/e1000_i210.c | 27 ++++++++++++++++++++ drivers/net/ethernet/intel/igb/e1000_i210.h | 1 + drivers/net/ethernet/intel/igb/e1000_regs.h | 1 + 4 files changed, 30 insertions(+) diff --git a/drivers/net/ethernet/intel/igb/e1000_82575.c b/drivers/net/ethernet/intel/igb/e1000_82575.c index 7a73510e547c..97bf0c3d5c69 100644 --- a/drivers/net/ethernet/intel/igb/e1000_82575.c +++ b/drivers/net/ethernet/intel/igb/e1000_82575.c @@ -294,6 +294,7 @@ static s32 igb_init_phy_params_82575(struct e1000_hw *hw) case I210_I_PHY_ID: phy->type = e1000_phy_i210; phy->ops.check_polarity = igb_check_polarity_m88; + phy->ops.get_cfg_done = igb_get_cfg_done_i210; phy->ops.get_phy_info = igb_get_phy_info_m88; phy->ops.get_cable_length = igb_get_cable_length_m88_gen2; phy->ops.set_d0_lplu_state = igb_set_d0_lplu_state_82580; diff --git a/drivers/net/ethernet/intel/igb/e1000_i210.c b/drivers/net/ethernet/intel/igb/e1000_i210.c index 65d931669f81..29f59c76878a 100644 --- a/drivers/net/ethernet/intel/igb/e1000_i210.c +++ b/drivers/net/ethernet/intel/igb/e1000_i210.c @@ -900,3 +900,30 @@ s32 igb_pll_workaround_i210(struct e1000_hw *hw) wr32(E1000_MDICNFG, mdicnfg); return ret_val; } + +/** + * igb_get_cfg_done_i210 - Read config done bit + * @hw: pointer to the HW structure + * + * Read the management control register for the config done bit for + * completion status. NOTE: silicon which is EEPROM-less will fail trying + * to read the config done bit, so an error is *ONLY* logged and returns + * 0. If we were to return with error, EEPROM-less silicon + * would not be able to be reset or change link. 
+ **/ +s32 igb_get_cfg_done_i210(struct e1000_hw *hw) +{ + s32 timeout = PHY_CFG_TIMEOUT; + u32 mask = E1000_NVM_CFG_DONE_PORT_0; + + while (timeout) { + if (rd32(E1000_EEMNGCTL_I210) & mask) + break; + usleep_range(1000, 2000); + timeout--; + } + if (!timeout) + hw_dbg("MNG configuration cycle has not completed.\n"); + + return 0; +} diff --git a/drivers/net/ethernet/intel/igb/e1000_i210.h b/drivers/net/ethernet/intel/igb/e1000_i210.h index 3442b6357d01..eaa68a50cb3b 100644 --- a/drivers/net/ethernet/intel/igb/e1000_i210.h +++ b/drivers/net/ethernet/intel/igb/e1000_i210.h @@ -34,6 +34,7 @@ s32 igb_write_xmdio_reg(struct e1000_hw *hw, u16 addr, u8 dev_addr, u16 data); s32 igb_init_nvm_params_i210(struct e1000_hw *hw); bool igb_get_flash_presence_i210(struct e1000_hw *hw); s32 igb_pll_workaround_i210(struct e1000_hw *hw); +s32 igb_get_cfg_done_i210(struct e1000_hw *hw); #define E1000_STM_OPCODE 0xDB00 #define E1000_EEPROM_FLASH_SIZE_WORD 0x11 diff --git a/drivers/net/ethernet/intel/igb/e1000_regs.h b/drivers/net/ethernet/intel/igb/e1000_regs.h index 4af2870e49f8..0fdcd4d1b982 100644 --- a/drivers/net/ethernet/intel/igb/e1000_regs.h +++ b/drivers/net/ethernet/intel/igb/e1000_regs.h @@ -66,6 +66,7 @@ #define E1000_PBA 0x01000 /* Packet Buffer Allocation - RW */ #define E1000_PBS 0x01008 /* Packet Buffer Size */ #define E1000_EEMNGCTL 0x01010 /* MNG EEprom Control */ +#define E1000_EEMNGCTL_I210 0x12030 /* MNG EEprom Control */ #define E1000_EEARBC_I210 0x12024 /* EEPROM Auto Read Bus Control */ #define E1000_EEWR 0x0102C /* EEPROM Write Register - RW */ #define E1000_I2CCMD 0x01028 /* SFPI2C Command Register - RW */ From fe52e1255397ac4a1233db10e8f651847f992a4b Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 19 Oct 2015 04:23:29 -0600 Subject: [PATCH 588/813] igb: fix NULL derefs due to skipped SR-IOV enabling [ Upstream commit be06998f96ecb93938ad2cce46c4289bf7cf45bc ] The combined effect of commits 6423fc3416 ("igb: do not re-init SR-IOV during probe") and ceee3450b3 ("igb: make sure SR-IOV init uses the right number of queues") causes VFs to no longer get set up, leading to NULL pointer dereferences due to the adapter's ->vf_data being NULL while ->vfs_allocated_count is non-zero. The first commit not only neglected the side effect of igb_sriov_reinit() that the second commit tried to account for, but also that of setting IGB_FLAG_HAS_MSIX, without which igb_enable_sriov() is effectively a no-op. Calling igb_{,re}set_interrupt_capability() as done here seems to address this, but I'm not sure whether this is better than simply reverting the other two commits. Signed-off-by: Jan Beulich Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/igb/igb_main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 061e1026af76..fa3b4cbea23b 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2858,6 +2858,13 @@ static void igb_probe_vfs(struct igb_adapter *adapter) if ((hw->mac.type == e1000_i210) || (hw->mac.type == e1000_i211)) return; + /* Of the below we really only want the effect of getting + * IGB_FLAG_HAS_MSIX set (if available), without which + * igb_enable_sriov() has no effect.
+ */ + igb_set_interrupt_capability(adapter, true); + igb_reset_interrupt_capability(adapter); + pci_sriov_set_totalvfs(pdev, 7); igb_enable_sriov(pdev, max_vfs); From 5c5989e0dde275bae778d6b4706cf606aca749cc Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 22 Sep 2015 14:35:41 -0700 Subject: [PATCH 589/813] ixgbe: Fix handling of NAPI budget when multiple queues are enabled per vector [ Upstream commit 5d6002b7b822c7423e75d4651e6790bfb5642b1b ] This patch corrects an issue in which the polling routine would increase the budget for Rx to at least 1 per queue if multiple queues were present. This would result in Rx packets being processed when the budget was 0 which is meant to indicate that no Rx can be handled. Signed-off-by: Alexander Duyck Tested-by: Darin Miller Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index aed8d029b23d..cd9b284bc83b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2786,7 +2786,8 @@ int ixgbe_poll(struct napi_struct *napi, int budget) ixgbe_for_each_ring(ring, q_vector->tx) clean_complete &= !!ixgbe_clean_tx_irq(q_vector, ring); - if (!ixgbe_qv_lock_napi(q_vector)) + /* Exit if we are called by netpoll or busy polling is active */ + if ((budget <= 0) || !ixgbe_qv_lock_napi(q_vector)) return budget; /* attempt to distribute budget to each queue fairly, but don't allow From abe2792fab39c606f21a6b4a4dab2d5090723a01 Mon Sep 17 00:00:00 2001 From: Dmitriy Vyukov Date: Tue, 8 Sep 2015 10:52:44 +0200 Subject: [PATCH 590/813] e1000: fix data race between tx_ring->next_to_clean [ Upstream commit 9eab46b7cb8d0b0dcf014bf7b25e0e72b9e4d929 ] e1000_clean_tx_irq cleans buffers and sets tx_ring->next_to_clean, then e1000_xmit_frame reuses the cleaned buffers. But there are no memory barriers when buffers gets recycled, so the recycled buffers can be corrupted. Use smp_store_release to update tx_ring->next_to_clean and smp_load_acquire to read tx_ring->next_to_clean to properly hand off buffers from e1000_clean_tx_irq to e1000_xmit_frame. The data race was found with KernelThreadSanitizer (KTSAN). Signed-off-by: Dmitry Vyukov Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/e1000/e1000.h | 7 +++++-- drivers/net/ethernet/intel/e1000/e1000_main.c | 5 ++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h index 69707108d23c..98fe5a2cd6e3 100644 --- a/drivers/net/ethernet/intel/e1000/e1000.h +++ b/drivers/net/ethernet/intel/e1000/e1000.h @@ -213,8 +213,11 @@ struct e1000_rx_ring { }; #define E1000_DESC_UNUSED(R) \ - ((((R)->next_to_clean > (R)->next_to_use) \ - ? 0 : (R)->count) + (R)->next_to_clean - (R)->next_to_use - 1) +({ \ + unsigned int clean = smp_load_acquire(&(R)->next_to_clean); \ + unsigned int use = READ_ONCE((R)->next_to_use); \ + (clean > use ? 
0 : (R)->count) + clean - use - 1; \ +}) #define E1000_RX_DESC_EXT(R, i) \ (&(((union e1000_rx_desc_extended *)((R).desc))[i])) diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c index fd7be860c201..068023595d84 100644 --- a/drivers/net/ethernet/intel/e1000/e1000_main.c +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c @@ -3876,7 +3876,10 @@ static bool e1000_clean_tx_irq(struct e1000_adapter *adapter, eop_desc = E1000_TX_DESC(*tx_ring, eop); } - tx_ring->next_to_clean = i; + /* Synchronize with E1000_DESC_UNUSED called from e1000_xmit_frame, + * which will reuse the cleaned buffers. + */ + smp_store_release(&tx_ring->next_to_clean, i); netdev_completed_queue(netdev, pkts_compl, bytes_compl); From c6210760f2428690255e1fd91cf084b12e827a0a Mon Sep 17 00:00:00 2001 From: Dmitry Fleytman Date: Tue, 13 Oct 2015 12:48:18 +0300 Subject: [PATCH 591/813] e1000e: fix division by zero on jumbo MTUs [ Upstream commit b77ac46bbae862dcb3f51296825c940404c69b0f ] This patch fixes possible division by zero in the receive interrupt handler when working without adaptive interrupt moderation. The adaptive interrupt moderation mechanism is typically disabled on jumbo MTUs. Signed-off-by: Dmitry Fleytman Signed-off-by: Leonid Bloch Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/e1000e/netdev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 0a854a47d31a..80ec587d510e 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -1959,8 +1959,10 @@ static irqreturn_t e1000_intr_msix_rx(int __always_unused irq, void *data) * previous interrupt. */ if (rx_ring->set_itr) { - writel(1000000000 / (rx_ring->itr_val * 256), - rx_ring->itr_register); + u32 itr = rx_ring->itr_val ? + 1000000000 / (rx_ring->itr_val * 256) : 0; + + writel(itr, rx_ring->itr_register); rx_ring->set_itr = 0; } From 4aa1324340dabca412d39ba98de7eabd495af7f0 Mon Sep 17 00:00:00 2001 From: Loc Ho Date: Thu, 19 Nov 2015 12:20:30 -0700 Subject: [PATCH 592/813] clk: xgene: Fix divider with non-zero shift value [ Upstream commit 1382ea631ddddb634850a3795527db0feeff5aaf ] The X-Gene clock driver missed the divider shift operation when setting the divider value.
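The off-by-shift bug is easiest to see in isolation. A standalone sketch with made-up field values (width 4, shift 8; the divider value is shown already shifted into place):

    #include <stdio.h>

    int main(void)
    {
        unsigned int data = 0xABCD;
        unsigned int width = 4, shift = 8;
        unsigned int divider = 0x5u << 8;       /* new divider, in position */

        /* old code cleared bits 0..3 instead of the real field */
        unsigned int wrong = (data & ~((1u << width) - 1)) | divider;
        /* fixed code clears the field at its actual position */
        unsigned int right = (data & ~(((1u << width) - 1) << shift)) | divider;

        printf("wrong 0x%04X (stale bits left in the field)\n", wrong);
        printf("right 0x%04X\n", right);
        return 0;
    }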
Signed-off-by: Loc Ho Fixes: 308964caeebc ("clk: Add APM X-Gene SoC clock driver") Signed-off-by: Stephen Boyd Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/clk/clk-xgene.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/clk/clk-xgene.c b/drivers/clk/clk-xgene.c index 27c0da29eca3..10224b01b97c 100644 --- a/drivers/clk/clk-xgene.c +++ b/drivers/clk/clk-xgene.c @@ -351,7 +351,8 @@ static int xgene_clk_set_rate(struct clk_hw *hw, unsigned long rate, /* Set new divider */ data = xgene_clk_read(pclk->param.divider_reg + pclk->param.reg_divider_offset); - data &= ~((1 << pclk->param.reg_divider_width) - 1); + data &= ~((1 << pclk->param.reg_divider_width) - 1) + << pclk->param.reg_divider_shift; data |= divider; xgene_clk_write(data, pclk->param.divider_reg + pclk->param.reg_divider_offset); From 87a7bea68d32ef961f0d6883feb7a83f45c2a74e Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 24 Aug 2015 17:27:24 -0700 Subject: [PATCH 593/813] fm10k: do not assume VF always has 1 queue [ Upstream commit 1340181fe435ccb8ca2f996b8680bd9566860619 ] It is possible that the PF has not yet assigned resources to the VF. Although rare, this could result in the VF attempting to read queues it does not own and result in FUM or THI faults in the PF. To prevent this, check queue 0 before we continue in init_hw_vf. Signed-off-by: Jacob Keller Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/fm10k/fm10k_type.h | 1 + drivers/net/ethernet/intel/fm10k/fm10k_vf.c | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_type.h b/drivers/net/ethernet/intel/fm10k/fm10k_type.h index 318a212f0a78..35afd711d144 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_type.h +++ b/drivers/net/ethernet/intel/fm10k/fm10k_type.h @@ -77,6 +77,7 @@ struct fm10k_hw; #define FM10K_PCIE_SRIOV_CTRL_VFARI 0x10 #define FM10K_ERR_PARAM -2 +#define FM10K_ERR_NO_RESOURCES -3 #define FM10K_ERR_REQUESTS_PENDING -4 #define FM10K_ERR_RESET_REQUESTED -5 #define FM10K_ERR_DMA_PENDING -6 diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_vf.c b/drivers/net/ethernet/intel/fm10k/fm10k_vf.c index 36c8b0aa08fd..3a18ef1cc017 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_vf.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_vf.c @@ -103,7 +103,12 @@ static s32 fm10k_init_hw_vf(struct fm10k_hw *hw) s32 err; u16 i; - /* assume we always have at least 1 queue */ + /* verify we have at least 1 queue */ + if (!~fm10k_read_reg(hw, FM10K_TXQCTL(0)) || + !~fm10k_read_reg(hw, FM10K_RXQCTL(0))) + return FM10K_ERR_NO_RESOURCES; + + /* determine how many queues we have */ for (i = 1; tqdloc0 && (i < FM10K_MAX_QUEUES_POOL); i++) { /* verify the Descriptor cache offsets are increasing */ tqdloc = ~fm10k_read_reg(hw, FM10K_TQDLOC(i)); From 199ffa62bf2e6c599f07badf90f996a3f773af77 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 25 Aug 2015 13:49:11 -0700 Subject: [PATCH 594/813] fm10k: Correct MTU for jumbo frames [ Upstream commit 8c7ee6d2cacc7794a91875ef5fd8284b4a900d8c ] Based on hardware testing, the host interface supports up to 15368 bytes as the maximum frame size. To determine the correct MTU, we subtract 8 for the internal switch tag, 14 for the L2 header, and 4 for the appended FCS header, resulting in 15342 bytes of payload for our maximum MTU on jumbo frames. 
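The 15342 figure follows directly from the frame-size budget in the message above; a one-line check:

    #include <stdio.h>

    int main(void)
    {
        int max_frame = 15368;  /* host interface limit per the commit */
        int sw_tag = 8, l2_hdr = 14, fcs = 4;

        /* prints 15342, the new FM10K_MAX_JUMBO_FRAME_SIZE */
        printf("max jumbo payload = %d\n",
               max_frame - sw_tag - l2_hdr - fcs);
        return 0;
    }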
Signed-off-by: Matthew Vick Signed-off-by: Jacob Keller Acked-by: Bruce Allan Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/fm10k/fm10k.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h index 14440200499b..48809e5d3f79 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k.h +++ b/drivers/net/ethernet/intel/fm10k/fm10k.h @@ -33,7 +33,7 @@ #include "fm10k_pf.h" #include "fm10k_vf.h" -#define FM10K_MAX_JUMBO_FRAME_SIZE 15358 /* Maximum supported size 15K */ +#define FM10K_MAX_JUMBO_FRAME_SIZE 15342 /* Maximum supported size 15K */ #define MAX_QUEUES FM10K_MAX_QUEUES_PF From 031d16e75c5dbc96cdbfcb1ebd5c60d95d33786d Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 22 Sep 2015 14:35:35 -0700 Subject: [PATCH 595/813] fm10k: Fix handling of NAPI budget when multiple queues are enabled per vector [ Upstream commit 9f872986479b6e0543eb5c615e5f9491bb04e5c1 ] This patch corrects an issue in which the polling routine would increase the budget for Rx to at least 1 per queue if multiple queues were present. This would result in Rx packets being processed when the budget was 0 which is meant to indicate that no Rx can be handled. Signed-off-by: Alexander Duyck Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/fm10k/fm10k_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c index e76a44cf330c..746a1986690b 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c @@ -1428,6 +1428,10 @@ static int fm10k_poll(struct napi_struct *napi, int budget) fm10k_for_each_ring(ring, q_vector->tx) clean_complete &= fm10k_clean_tx_irq(q_vector, ring); + /* Handle case where we are called by netpoll with a budget of 0 */ + if (budget <= 0) + return budget; + /* attempt to distribute budget to each queue fairly, but don't * allow the budget to go below 1 because we'll exit polling */ From 587e0cfdad34aa76d6bafa9151e4afabadbdbfbd Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 16 Oct 2015 10:56:57 -0700 Subject: [PATCH 596/813] fm10k: reset max_queues on init_hw_vf failure [ Upstream commit 0e8d5b5975401c83641efd5d4595e6cdbe9e9e2f ] VF drivers must detect how many queues are available. Previously, the driver assumed that each VF has at minimum 1 queue. This assumption is incorrect, since it is possible that the PF has not yet assigned the queues to the VF by the time the VF checks. To resolve this, we added a check first to ensure that the first queue is in fact owned by the VF at init_hw_vf time. However, the code flow did not reset hw->mac.max_queues to 0. In some cases, such as during reinit flows, we call init_hw_vf without clearing the previous value of hw->mac.max_queues. Due to this, when init_hw_vf errors out, if its error code is not properly handled, the VF driver may still believe it has queues which no longer belong to it. Fix this by clearing the hw->mac.max_queues on exit due to errors.
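A condensed standalone model of the error-path rule (illustrative names; the real function also disables the queues it probed before recording the count):

    #include <errno.h>
    #include <stdio.h>

    struct vf_hw { int max_queues; int owns_queue0; };

    static int init_hw_vf(struct vf_hw *hw)
    {
        int err = 0;

        if (!hw->owns_queue0) {     /* PF has not assigned queues yet */
            err = -ENODEV;
            goto reset_max_queues;
        }
        hw->max_queues = 4;         /* stand-in for the probe loop */
        return 0;

    reset_max_queues:
        hw->max_queues = 0;         /* never leave a stale count behind */
        return err;
    }

    int main(void)
    {
        struct vf_hw hw = { 4, 0 }; /* stale count from an earlier init */

        printf("err=%d max_queues=%d\n", init_hw_vf(&hw), hw.max_queues);
        return 0;
    }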
Signed-off-by: Jacob Keller Reviewed-by: Bruce Allan Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/fm10k/fm10k_vf.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_vf.c b/drivers/net/ethernet/intel/fm10k/fm10k_vf.c index 3a18ef1cc017..d512575c33f3 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_vf.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_vf.c @@ -105,8 +105,10 @@ static s32 fm10k_init_hw_vf(struct fm10k_hw *hw) /* verify we have at least 1 queue */ if (!~fm10k_read_reg(hw, FM10K_TXQCTL(0)) || - !~fm10k_read_reg(hw, FM10K_RXQCTL(0))) - return FM10K_ERR_NO_RESOURCES; + !~fm10k_read_reg(hw, FM10K_RXQCTL(0))) { + err = FM10K_ERR_NO_RESOURCES; + goto reset_max_queues; + } /* determine how many queues we have */ for (i = 1; tqdloc0 && (i < FM10K_MAX_QUEUES_POOL); i++) { @@ -124,7 +126,7 @@ static s32 fm10k_init_hw_vf(struct fm10k_hw *hw) /* shut down queues we own and reset DMA configuration */ err = fm10k_disable_queues_generic(hw, i); if (err) - return err; + goto reset_max_queues; /* record maximum queue count */ hw->mac.max_queues = i; @@ -134,6 +136,11 @@ static s32 fm10k_init_hw_vf(struct fm10k_hw *hw) FM10K_TXQCTL_VID_MASK) >> FM10K_TXQCTL_VID_SHIFT; return 0; + +reset_max_queues: + hw->mac.max_queues = 0; + + return err; } /* This structure defines the attibutes to be parsed below */ From 134d78e9d4625aa237d9e6a516965e753bb02e67 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 16 Oct 2015 10:56:58 -0700 Subject: [PATCH 597/813] fm10k: always check init_hw for errors [ Upstream commit 1343c65f70ee1b1f968a08b30e1836a4e37116cd ] A recent change modified init_hw in some flows the function may fail on VF devices. For example, if a VF doesn't yet own its own queues. However, many callers of init_hw didn't bother to check the error code. Other callers checked but only displayed diagnostic messages without actually handling the consequences. Fix this by (a) always returning and preventing the netdevice from going up, and (b) printing the diagnostic in every flow for consistency. This should resolve an issue where VF drivers would attempt to come up before the PF has finished assigning queues. In addition, change the dmesg output to explicitly show the actual function that failed, instead of combining reset_hw and init_hw into a single check, to help for future debugging. Fixes: 1d568b0f6424 ("fm10k: do not assume VF always has 1 queue") Signed-off-by: Jacob Keller Reviewed-by: Bruce Allan Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 34 +++++++++++++++++--- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c index 74be792f3f1b..1af13763ccb0 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c @@ -163,9 +163,17 @@ static void fm10k_reinit(struct fm10k_intfc *interface) interface->last_reset = jiffies + (10 * HZ); /* reset and initialize the hardware so it is in a known state */ - err = hw->mac.ops.reset_hw(hw) ? 
: hw->mac.ops.init_hw(hw); - if (err) + err = hw->mac.ops.reset_hw(hw); + if (err) { + dev_err(&interface->pdev->dev, "reset_hw failed: %d\n", err); + goto reinit_err; + } + + err = hw->mac.ops.init_hw(hw); + if (err) { dev_err(&interface->pdev->dev, "init_hw failed: %d\n", err); + goto reinit_err; + } /* reassociate interrupts */ fm10k_mbx_request_irq(interface); @@ -193,6 +201,10 @@ static void fm10k_reinit(struct fm10k_intfc *interface) fm10k_iov_resume(interface->pdev); +reinit_err: + if (err) + netif_device_detach(netdev); + rtnl_unlock(); clear_bit(__FM10K_RESETTING, &interface->state); @@ -1684,7 +1696,13 @@ static int fm10k_sw_init(struct fm10k_intfc *interface, interface->last_reset = jiffies + (10 * HZ); /* reset and initialize the hardware so it is in a known state */ - err = hw->mac.ops.reset_hw(hw) ? : hw->mac.ops.init_hw(hw); + err = hw->mac.ops.reset_hw(hw); + if (err) { + dev_err(&pdev->dev, "reset_hw failed: %d\n", err); + return err; + } + + err = hw->mac.ops.init_hw(hw); if (err) { dev_err(&pdev->dev, "init_hw failed: %d\n", err); return err; @@ -2071,8 +2089,10 @@ static int fm10k_resume(struct pci_dev *pdev) /* reset hardware to known state */ err = hw->mac.ops.init_hw(&interface->hw); - if (err) + if (err) { + dev_err(&pdev->dev, "init_hw failed: %d\n", err); return err; + } /* reset statistics starting values */ hw->mac.ops.rebind_hw_stats(hw, &interface->stats); @@ -2248,7 +2268,11 @@ static void fm10k_io_resume(struct pci_dev *pdev) int err = 0; /* reset hardware to known state */ - hw->mac.ops.init_hw(&interface->hw); + err = hw->mac.ops.init_hw(&interface->hw); + if (err) { + dev_err(&pdev->dev, "init_hw failed: %d\n", err); + return; + } /* reset statistics starting values */ hw->mac.ops.rebind_hw_stats(hw, &interface->stats); From 6628b26348d9338571b9f024b2bd266a60298442 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 16 Oct 2015 10:56:59 -0700 Subject: [PATCH 598/813] fm10k: reinitialize queuing scheme after calling init_hw [ Upstream commit 875328e4bce696e85edcda3c4b0ec80fd525e3a3 ] The init_hw function may fail, and in the case of VFs, it might change the number of maximum queues available. Thus, for every flow which checks init_hw, we need to ensure that we clear the queue scheme before, and initialize it after. The fm10k_io_slot_reset path will end up triggering a reset so fm10k_reinit needs this change. The fm10k_io_error_detected and fm10k_io_resume also need to properly clear and reinitialize the queue scheme. 
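[Editor's note: a hedged sketch of the required ordering, simplified from the diff below; the error handling shown here is illustrative, and the real flows differ per path.]

static void example_reinit(struct fm10k_intfc *interface)
{
	struct fm10k_hw *hw = &interface->hw;
	int err;

	/* tear down queues while the old max_queues value is still valid */
	fm10k_clear_queueing_scheme(interface);

	err = hw->mac.ops.reset_hw(hw);
	if (!err)
		err = hw->mac.ops.init_hw(hw);	/* may change max_queues */

	/* rebuild the queue layout against the possibly new queue count */
	if (!err)
		err = fm10k_init_queueing_scheme(interface);

	if (err)
		netif_device_detach(interface->netdev);	/* keep the netdev down */
}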
Signed-off-by: Jacob Keller Reviewed-by: Bruce Allan Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c index 1af13763ccb0..e4ec1361eaf4 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c @@ -159,6 +159,9 @@ static void fm10k_reinit(struct fm10k_intfc *interface) fm10k_mbx_free_irq(interface); + /* free interrupts */ + fm10k_clear_queueing_scheme(interface); + /* delay any future reset requests */ interface->last_reset = jiffies + (10 * HZ); @@ -175,6 +178,12 @@ static void fm10k_reinit(struct fm10k_intfc *interface) goto reinit_err; } + err = fm10k_init_queueing_scheme(interface); + if (err) { + dev_err(&interface->pdev->dev, "init_queueing_scheme failed: %d\n", err); + goto reinit_err; + } + /* reassociate interrupts */ fm10k_mbx_request_irq(interface); @@ -2205,6 +2214,9 @@ static pci_ers_result_t fm10k_io_error_detected(struct pci_dev *pdev, if (netif_running(netdev)) fm10k_close(netdev); + /* free interrupts */ + fm10k_clear_queueing_scheme(interface); + fm10k_mbx_free_irq(interface); pci_disable_device(pdev); @@ -2277,6 +2289,12 @@ static void fm10k_io_resume(struct pci_dev *pdev) /* reset statistics starting values */ hw->mac.ops.rebind_hw_stats(hw, &interface->stats); + err = fm10k_init_queueing_scheme(interface); + if (err) { + dev_err(&interface->pdev->dev, "init_queueing_scheme failed: %d\n", err); + return; + } + /* reassociate interrupts */ fm10k_mbx_request_irq(interface); From e0ee2ce51824680b687a73857ef64182bbfd3a14 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 27 Oct 2015 16:59:12 -0700 Subject: [PATCH 599/813] fm10k: Cleanup MSI-X interrupts in case of failure [ Upstream commit 587731e684dcf3522215194a02357d26b9bc7277 ] If the q_vector allocation fails we should free the resources associated with the MSI-X vector table. Signed-off-by: Alexander Duyck Reviewed-by: Bruce Allan Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/fm10k/fm10k_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c index 746a1986690b..09281558bfbc 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c @@ -1970,8 +1970,10 @@ int fm10k_init_queueing_scheme(struct fm10k_intfc *interface) /* Allocate memory for queues */ err = fm10k_alloc_q_vectors(interface); - if (err) + if (err) { + fm10k_reset_msix_capability(interface); return err; + } /* Map rings to devices, and map devices to physical queues */ fm10k_assign_rings(interface); From acafdb31f521b6728f865646612e6fb5e605fe97 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 27 Oct 2015 16:59:18 -0700 Subject: [PATCH 600/813] fm10k: Cleanup exception handling for mailbox interrupt [ Upstream commit e00e23bceba48a8f0c94fefe26948404cbd43d0a ] This patch addresses two issues. First is the fact that the fm10k_mbx_free_irq was assuming msix_entries was valid and that will not always be the case. As such we need to add a check for if it is NULL. 
Second is the fact that we weren't freeing the IRQ if the mailbox API returned an error on trying to connect. Signed-off-by: Alexander Duyck Reviewed-by: Bruce Allan Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c index e4ec1361eaf4..7f3fb51bc37b 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c @@ -1122,6 +1122,10 @@ void fm10k_mbx_free_irq(struct fm10k_intfc *interface) struct fm10k_hw *hw = &interface->hw; int itr_reg; + /* no mailbox IRQ to free if MSI-X is not enabled */ + if (!interface->msix_entries) + return; + /* disconnect the mailbox */ hw->mbx.ops.disconnect(hw, &hw->mbx); @@ -1444,10 +1448,15 @@ int fm10k_mbx_request_irq(struct fm10k_intfc *interface) err = fm10k_mbx_request_irq_pf(interface); else err = fm10k_mbx_request_irq_vf(interface); + if (err) + return err; /* connect mailbox */ - if (!err) - err = hw->mbx.ops.connect(hw, &hw->mbx); + err = hw->mbx.ops.connect(hw, &hw->mbx); + + /* if the mailbox failed to connect, then free IRQ */ + if (err) + fm10k_mbx_free_irq(interface); return err; } From edf9459008c299d90abc5c7a045d167e5f46f68c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 2 Dec 2015 17:26:28 -0600 Subject: [PATCH 601/813] cxlflash: a couple off by one bugs [ Upstream commit e37390bee6fe7dfbe507a9d50cdc11344b53fa08 ] The "> MAX_CONTEXT" should be ">= MAX_CONTEXT". Otherwise we go one step beyond the end of the cfg->ctx_tbl[] array. Signed-off-by: Dan Carpenter Reviewed-by: Manoj Kumar Reviewed-by: Johannes Thumshirn Acked-by: Matthew R. Ochs Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/cxlflash/superpipe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/cxlflash/superpipe.c b/drivers/scsi/cxlflash/superpipe.c index cac2e6a50efd..34b21a0a926a 100644 --- a/drivers/scsi/cxlflash/superpipe.c +++ b/drivers/scsi/cxlflash/superpipe.c @@ -1380,7 +1380,7 @@ static int cxlflash_disk_attach(struct scsi_device *sdev, } ctxid = cxl_process_element(ctx); - if (unlikely((ctxid > MAX_CONTEXT) || (ctxid < 0))) { + if (unlikely((ctxid >= MAX_CONTEXT) || (ctxid < 0))) { dev_err(dev, "%s: ctxid (%d) invalid!\n", __func__, ctxid); rc = -EPERM; goto err2; @@ -1508,7 +1508,7 @@ static int recover_context(struct cxlflash_cfg *cfg, struct ctx_info *ctxi) } ctxid = cxl_process_element(ctx); - if (unlikely((ctxid > MAX_CONTEXT) || (ctxid < 0))) { + if (unlikely((ctxid >= MAX_CONTEXT) || (ctxid < 0))) { dev_err(dev, "%s: ctxid (%d) invalid!\n", __func__, ctxid); rc = -EPERM; goto err1; From cf48dc15f10a8524c37dd0a7f2821b3123d54744 Mon Sep 17 00:00:00 2001 From: Wenwei Tao Date: Tue, 12 Jan 2016 07:49:15 +0100 Subject: [PATCH 602/813] lightnvm: fix bio submission issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 3cd485b1f8e25a6534eb4c542e7eba1b944fbaaf ] Put bio when submission fails, since we get it before submission. And return error when backend device driver doesn't provide a submit_io method, thus we can end IO properly. 
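[Editor's note: a hedged illustration of the reference pairing this fix restores; rrpc_submit_io's real body does more than this.]

static int example_submit(struct rrpc *rrpc, struct bio *bio,
			  struct nvm_rq *rqd)
{
	int err;

	rqd->bio = bio;			/* the request now references the bio */
	err = nvm_submit_io(rrpc->dev, rqd);
	if (err) {
		pr_err("rrpc: I/O submission failed: %d\n", err);
		bio_put(bio);		/* drop the reference on failure too */
		return NVM_IO_ERR;
	}
	return NVM_IO_OK;		/* the end_io path does the bio_put() */
}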
Signed-off-by: Wenwei Tao Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/lightnvm/gennvm.c | 2 +- drivers/lightnvm/rrpc.c | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index a54b339951a3..62c6f4d11f24 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -345,7 +345,7 @@ static void gennvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd) static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) { if (!dev->ops->submit_io) - return 0; + return -ENODEV; /* Convert address space */ gennvm_generic_to_addr_mode(dev, rqd); diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index 134e4faba482..a1e7488c1f3e 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -650,11 +650,12 @@ static int rrpc_end_io(struct nvm_rq *rqd, int error) if (bio_data_dir(rqd->bio) == WRITE) rrpc_end_io_write(rrpc, rrqd, laddr, npages); + bio_put(rqd->bio); + if (rrqd->flags & NVM_IOTYPE_GC) return 0; rrpc_unlock_rq(rrpc, rqd); - bio_put(rqd->bio); if (npages > 1) nvm_dev_dma_free(rrpc->dev, rqd->ppa_list, rqd->dma_ppa_list); @@ -841,6 +842,7 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio, err = nvm_submit_io(rrpc->dev, rqd); if (err) { pr_err("rrpc: I/O submission failed: %d\n", err); + bio_put(bio); return NVM_IO_ERR; } From 15de5f8f3c5d1ea982a4aa270fc10cd78c6682a0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 12 Jan 2016 07:49:16 +0100 Subject: [PATCH 603/813] lightnvm: fix incorrect nr_free_blocks stat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit bdded1552085b12d23c9be76147d2e96647a098f ] When initializing the bad block list in gennvm_block_bb, once we move a bad block from free_list to bb_list, we should update both statistics, nr_free_blocks and nr_bad_blocks. This patch adds the missing update of nr_free_blocks. Signed-off-by: Chao Yu Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/lightnvm/gennvm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index 62c6f4d11f24..2a96ff6923f0 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -89,6 +89,7 @@ static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks, list_move_tail(&blk->list, &lun->bb_list); lun->vlun.nr_bad_blocks++; + lun->vlun.nr_free_blocks--; } return 0; From d3f89f3a91dfa4758fdd6e98b780095d2de66943 Mon Sep 17 00:00:00 2001 From: Javier Gonzalez Date: Tue, 12 Jan 2016 07:49:17 +0100 Subject: [PATCH 604/813] lightnvm: add check after mempool allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 3bfbc6adbc5031e8a5907baa5beb27b41637742a ] The mempool allocation might fail. Make sure to return an error when it does, instead of causing a kernel panic.
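[Editor's note: the fix itself is a single defensive check; this fragment just restates the rule the hunk below enforces, that every allocator return is fallible.]

page = mempool_alloc(rrpc->page_pool, GFP_NOIO);
if (!page)
	return -ENOMEM;	/* propagate the failure instead of dereferencing NULL */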
Signed-off-by: Javier Gonzalez Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/lightnvm/rrpc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index a1e7488c1f3e..f4bc98687d7f 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -287,6 +287,8 @@ static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk) } page = mempool_alloc(rrpc->page_pool, GFP_NOIO); + if (!page) + return -ENOMEM; while ((slot = find_first_zero_bit(rblk->invalid_pages, nr_pgs_per_blk)) < nr_pgs_per_blk) { From d239b53a14bb9bbe6fa45405af6fa0d86f68a874 Mon Sep 17 00:00:00 2001 From: Wenwei Tao Date: Tue, 12 Jan 2016 07:49:18 +0100 Subject: [PATCH 605/813] lightnvm: unlock rq and free ppa_list on submission fail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit c27278bddd75a3ee755c8e83c6bcc3fdd7271ef6 ] When rrpc_write_ppalist_rq and rrpc_read_ppalist_rq succeed, we set up the rq correctly, but nvm_submit_io may afterwards fail since it cannot allocate a request or nvme_nvm_command. We return an error but forget to clean up the previous work. Signed-off-by: Wenwei Tao Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/lightnvm/rrpc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index f4bc98687d7f..748cab499580 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -845,6 +845,12 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio, if (err) { pr_err("rrpc: I/O submission failed: %d\n", err); bio_put(bio); + if (!(flags & NVM_IOTYPE_GC)) { + rrpc_unlock_rq(rrpc, rqd); + if (rqd->nr_pages > 1) + nvm_dev_dma_free(rrpc->dev, + rqd->ppa_list, rqd->dma_ppa_list); + } return NVM_IO_ERR; } From 651b328d97d2bdf62518093628f765824cb638fd Mon Sep 17 00:00:00 2001 From: Wenwei Tao Date: Tue, 12 Jan 2016 07:49:25 +0100 Subject: [PATCH 606/813] lightnvm: fix locking and mempool in rrpc_lun_gc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit b262924be03d5d2ae735bc9a4b37eb2c613f61f8 ] This patch fixes two issues in rrpc_lun_gc: 1. prio_list is protected by rrpc_lun's lock, not nvm_lun's, so acquire rlun's lock instead of lun's before operating on the list. 2. We delete a block from prio_list before allocating the gcb, but gcb allocation may fail and we end up without putting the block back on the list, which means the block will never be reclaimed. To solve this, delete the block only after gcb allocation succeeds.
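[Editor's note: a hedged sketch of the resulting ordering; need_blocks() and queue_gc_work() are hypothetical helpers standing in for the driver's logic.]

spin_lock(&rlun->lock);			/* rlun->lock guards prio_list */
while (need_blocks(rrpc, rlun)) {
	struct rrpc_block *rblk = block_prio_find_max(rlun);
	struct rrpc_block_gc *gcb;

	gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC);
	if (!gcb)
		break;			/* rblk stays on prio_list for a later pass */

	list_del_init(&rblk->prio);	/* unlink only once GC is guaranteed */
	queue_gc_work(rrpc, gcb, rblk);	/* hypothetical: fill gcb, queue work */
}
spin_unlock(&rlun->lock);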
Signed-off-by: Wenwei Tao Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/lightnvm/rrpc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index 748cab499580..a9859489acf6 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -429,7 +429,7 @@ static void rrpc_lun_gc(struct work_struct *work) if (nr_blocks_need < rrpc->nr_luns) nr_blocks_need = rrpc->nr_luns; - spin_lock(&lun->lock); + spin_lock(&rlun->lock); while (nr_blocks_need > lun->nr_free_blocks && !list_empty(&rlun->prio_list)) { struct rrpc_block *rblock = block_prio_find_max(rlun); @@ -438,16 +438,16 @@ if (!rblock->nr_invalid_pages) break; + gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC); + if (!gcb) + break; + list_del_init(&rblock->prio); BUG_ON(!block_is_full(rrpc, rblock)); pr_debug("rrpc: selected block '%lu' for GC\n", block->id); - gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC); - if (!gcb) - break; - gcb->rrpc = rrpc; gcb->rblk = rblock; INIT_WORK(&gcb->ws_gc, rrpc_block_gc); @@ -456,7 +456,7 @@ nr_blocks_need--; } - spin_unlock(&lun->lock); + spin_unlock(&rlun->lock); /* TODO: Hint that request queue can be started again */ } From 12e2d36594f98bef28aebe5ebf4c8c48be70ba3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 12 Jan 2016 07:49:32 +0100 Subject: [PATCH 607/813] lightnvm: fix missing grown bad block type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit b5d4acd4cbf5029a2616084d9e9f392046d53a37 ] The get/set bad block interface defines good block, factory bad block, grown bad block, device reserved block, and host reserved block. Unfortunately the grown bad block was missing, leaving the offsets wrong for device and host side reserved blocks. This patch adds the missing type and corrects the offsets. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- include/linux/lightnvm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 034117b3be5f..f09648d14694 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -58,8 +58,9 @@ enum { /* Block Types */ NVM_BLK_T_FREE = 0x0, NVM_BLK_T_BAD = 0x1, - NVM_BLK_T_DEV = 0x2, - NVM_BLK_T_HOST = 0x4, + NVM_BLK_T_GRWN_BAD = 0x2, + NVM_BLK_T_DEV = 0x4, + NVM_BLK_T_HOST = 0x8, }; struct nvm_id_group { From e31409cd1ae44450964fc29e2bb0519cc50f5923 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Mon, 14 Dec 2015 16:01:32 -0800 Subject: [PATCH 608/813] Drivers: hv: util: Increase the timeout for util services [ Upstream commit c0b200cfb0403740171c7527b3ac71d03f82947a ] Util services such as KVP and FCOPY need assistance from daemons running in user space. Increase the timeout so we don't prematurely terminate the transaction in the kernel. The host sets up a 60 second timeout for all util driver transactions. The host will retry the transaction if it times out. Set the guest timeout at 30 seconds. Signed-off-by: K. Y. 
Srinivasan Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/hv/hv_fcopy.c | 3 ++- drivers/hv/hv_kvp.c | 3 ++- drivers/hv/hyperv_vmbus.h | 5 +++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c index db4b887b889d..bbdec50c4a1c 100644 --- a/drivers/hv/hv_fcopy.c +++ b/drivers/hv/hv_fcopy.c @@ -275,7 +275,8 @@ void hv_fcopy_onchannelcallback(void *context) * Send the information to the user-level daemon. */ schedule_work(&fcopy_send_work); - schedule_delayed_work(&fcopy_timeout_work, 5*HZ); + schedule_delayed_work(&fcopy_timeout_work, + HV_UTIL_TIMEOUT * HZ); return; } icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index 74c38a9f34a6..e6aa33a89b0e 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -668,7 +668,8 @@ void hv_kvp_onchannelcallback(void *context) * user-mode not responding. */ schedule_work(&kvp_sendkey_work); - schedule_delayed_work(&kvp_timeout_work, 5*HZ); + schedule_delayed_work(&kvp_timeout_work, + HV_UTIL_TIMEOUT * HZ); return; diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 3782636562a1..225b96bcf7fe 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -30,6 +30,11 @@ #include #include +/* + * Timeout for services such as KVP and fcopy. + */ +#define HV_UTIL_TIMEOUT 30 + /* * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent * is set by CPUID(HVCPUID_VERSION_FEATURES). From b125fabf22d44f7c57e6df2a21d27add8385df0d Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Mon, 14 Dec 2015 16:01:33 -0800 Subject: [PATCH 609/813] Drivers: hv: utils: run polling callback always in interrupt context [ Upstream commit 3cace4a616108539e2730f8dc21a636474395e0f ] All channel interrupts are bound to specific VCPUs in the guest at the point channel is created. While currently, we invoke the polling function on the correct CPU (the CPU to which the channel is bound to) in some cases we may run the polling function in a non-interrupt context. This potentially can cause an issue as the polling function can be interrupted by the channel callback function. Fix the issue by running the polling function on the appropriate CPU at interrupt level. Additional details of the issue being addressed by this patch are given below: Currently hv_fcopy_onchannelcallback is called from interrupts and also via the ->write function of hv_utils. Since the used global variables to maintain state are not thread safe the state can get out of sync. This affects the variable state as well as the channel inbound buffer. As suggested by KY adjust hv_poll_channel to always run the given callback on the cpu which the channel is bound to. This avoids the need for locking because all the util services are single threaded and only one transaction is active at any given point in time. Additionally, remove the context variable, they will always be the same as recv_channel. Signed-off-by: Olaf Hering Signed-off-by: K. Y. 
Srinivasan Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/hv/hv_fcopy.c | 34 ++++++++++++---------------------- drivers/hv/hv_kvp.c | 28 ++++++++++------------------ drivers/hv/hv_snapshot.c | 29 +++++++++++------------------ drivers/hv/hyperv_vmbus.h | 6 +----- 4 files changed, 34 insertions(+), 63 deletions(-) diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c index bbdec50c4a1c..c37a71e13de0 100644 --- a/drivers/hv/hv_fcopy.c +++ b/drivers/hv/hv_fcopy.c @@ -51,7 +51,6 @@ static struct { struct hv_fcopy_hdr *fcopy_msg; /* current message */ struct vmbus_channel *recv_channel; /* chn we got the request */ u64 recv_req_id; /* request ID. */ - void *fcopy_context; /* for the channel callback */ } fcopy_transaction; static void fcopy_respond_to_host(int error); @@ -67,6 +66,13 @@ static struct hvutil_transport *hvt; */ static int dm_reg_value; +static void fcopy_poll_wrapper(void *channel) +{ + /* Transaction is finished, reset the state here to avoid races. */ + fcopy_transaction.state = HVUTIL_READY; + hv_fcopy_onchannelcallback(channel); +} + static void fcopy_timeout_func(struct work_struct *dummy) { /* @@ -74,13 +80,7 @@ static void fcopy_timeout_func(struct work_struct *dummy) * process the pending transaction. */ fcopy_respond_to_host(HV_E_FAIL); - - /* Transaction is finished, reset the state. */ - if (fcopy_transaction.state > HVUTIL_READY) - fcopy_transaction.state = HVUTIL_READY; - - hv_poll_channel(fcopy_transaction.fcopy_context, - hv_fcopy_onchannelcallback); + hv_poll_channel(fcopy_transaction.recv_channel, fcopy_poll_wrapper); } static int fcopy_handle_handshake(u32 version) @@ -108,9 +108,7 @@ static int fcopy_handle_handshake(u32 version) return -EINVAL; } pr_debug("FCP: userspace daemon ver. %d registered\n", version); - fcopy_transaction.state = HVUTIL_READY; - hv_poll_channel(fcopy_transaction.fcopy_context, - hv_fcopy_onchannelcallback); + hv_poll_channel(fcopy_transaction.recv_channel, fcopy_poll_wrapper); return 0; } @@ -227,15 +225,8 @@ void hv_fcopy_onchannelcallback(void *context) int util_fw_version; int fcopy_srv_version; - if (fcopy_transaction.state > HVUTIL_READY) { - /* - * We will defer processing this callback once - * the current transaction is complete. - */ - fcopy_transaction.fcopy_context = context; + if (fcopy_transaction.state > HVUTIL_READY) return; - } - fcopy_transaction.fcopy_context = NULL; vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 2, &recvlen, &requestid); @@ -305,9 +296,8 @@ static int fcopy_on_msg(void *msg, int len) if (cancel_delayed_work_sync(&fcopy_timeout_work)) { fcopy_transaction.state = HVUTIL_USERSPACE_RECV; fcopy_respond_to_host(*val); - fcopy_transaction.state = HVUTIL_READY; - hv_poll_channel(fcopy_transaction.fcopy_context, - hv_fcopy_onchannelcallback); + hv_poll_channel(fcopy_transaction.recv_channel, + fcopy_poll_wrapper); } return 0; diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index e6aa33a89b0e..2a3420c4ca59 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -66,7 +66,6 @@ static struct { struct hv_kvp_msg *kvp_msg; /* current message */ struct vmbus_channel *recv_channel; /* chn we got the request */ u64 recv_req_id; /* request ID. */ - void *kvp_context; /* for the channel callback */ } kvp_transaction; /* @@ -94,6 +93,13 @@ static struct hvutil_transport *hvt; */ #define HV_DRV_VERSION "3.1" +static void kvp_poll_wrapper(void *channel) +{ + /* Transaction is finished, reset the state here to avoid races. 
*/ + kvp_transaction.state = HVUTIL_READY; + hv_kvp_onchannelcallback(channel); +} + static void kvp_register(int reg_value) { @@ -121,12 +127,7 @@ static void kvp_timeout_func(struct work_struct *dummy) */ kvp_respond_to_host(NULL, HV_E_FAIL); - /* Transaction is finished, reset the state. */ - if (kvp_transaction.state > HVUTIL_READY) - kvp_transaction.state = HVUTIL_READY; - - hv_poll_channel(kvp_transaction.kvp_context, - hv_kvp_onchannelcallback); + hv_poll_channel(kvp_transaction.recv_channel, kvp_poll_wrapper); } static int kvp_handle_handshake(struct hv_kvp_msg *msg) @@ -218,9 +219,7 @@ static int kvp_on_msg(void *msg, int len) */ if (cancel_delayed_work_sync(&kvp_timeout_work)) { kvp_respond_to_host(message, error); - kvp_transaction.state = HVUTIL_READY; - hv_poll_channel(kvp_transaction.kvp_context, - hv_kvp_onchannelcallback); + hv_poll_channel(kvp_transaction.recv_channel, kvp_poll_wrapper); } return 0; @@ -596,15 +595,8 @@ void hv_kvp_onchannelcallback(void *context) int util_fw_version; int kvp_srv_version; - if (kvp_transaction.state > HVUTIL_READY) { - /* - * We will defer processing this callback once - * the current transaction is complete. - */ - kvp_transaction.kvp_context = context; + if (kvp_transaction.state > HVUTIL_READY) return; - } - kvp_transaction.kvp_context = NULL; vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 4, &recvlen, &requestid); diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c index 815405f2e777..a548ae42c927 100644 --- a/drivers/hv/hv_snapshot.c +++ b/drivers/hv/hv_snapshot.c @@ -53,7 +53,6 @@ static struct { struct vmbus_channel *recv_channel; /* chn we got the request */ u64 recv_req_id; /* request ID. */ struct hv_vss_msg *msg; /* current message */ - void *vss_context; /* for the channel callback */ } vss_transaction; @@ -74,6 +73,13 @@ static void vss_timeout_func(struct work_struct *dummy); static DECLARE_DELAYED_WORK(vss_timeout_work, vss_timeout_func); static DECLARE_WORK(vss_send_op_work, vss_send_op); +static void vss_poll_wrapper(void *channel) +{ + /* Transaction is finished, reset the state here to avoid races. */ + vss_transaction.state = HVUTIL_READY; + hv_vss_onchannelcallback(channel); +} + /* * Callback when data is received from user mode. */ @@ -86,12 +92,7 @@ static void vss_timeout_func(struct work_struct *dummy) pr_warn("VSS: timeout waiting for daemon to reply\n"); vss_respond_to_host(HV_E_FAIL); - /* Transaction is finished, reset the state. */ - if (vss_transaction.state > HVUTIL_READY) - vss_transaction.state = HVUTIL_READY; - - hv_poll_channel(vss_transaction.vss_context, - hv_vss_onchannelcallback); + hv_poll_channel(vss_transaction.recv_channel, vss_poll_wrapper); } static int vss_handle_handshake(struct hv_vss_msg *vss_msg) @@ -138,9 +139,8 @@ static int vss_on_msg(void *msg, int len) if (cancel_delayed_work_sync(&vss_timeout_work)) { vss_respond_to_host(vss_msg->error); /* Transaction is finished, reset the state. */ - vss_transaction.state = HVUTIL_READY; - hv_poll_channel(vss_transaction.vss_context, - hv_vss_onchannelcallback); + hv_poll_channel(vss_transaction.recv_channel, + vss_poll_wrapper); } } else { /* This is a spurious call! */ @@ -238,15 +238,8 @@ void hv_vss_onchannelcallback(void *context) struct icmsg_hdr *icmsghdrp; struct icmsg_negotiate *negop = NULL; - if (vss_transaction.state > HVUTIL_READY) { - /* - * We will defer processing this callback once - * the current transaction is complete. 
- */ - vss_transaction.vss_context = context; + if (vss_transaction.state > HVUTIL_READY) return; - } - vss_transaction.vss_context = NULL; vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 2, &recvlen, &requestid); diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 225b96bcf7fe..12156db2e88e 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -764,11 +764,7 @@ static inline void hv_poll_channel(struct vmbus_channel *channel, if (!channel) return; - if (channel->target_cpu != smp_processor_id()) - smp_call_function_single(channel->target_cpu, - cb, channel, true); - else - cb(channel); + smp_call_function_single(channel->target_cpu, cb, channel, true); } enum hvutil_device_state { From 10b8e4ebc4b3b3f4f2ff3e5651429ff4427d3397 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Mon, 14 Dec 2015 16:01:34 -0800 Subject: [PATCH 610/813] tools: hv: report ENOSPC errors in hv_fcopy_daemon [ Upstream commit b4ed5d1682c6613988c2eb1de55df5ac9988afcc ] Currently some "Unspecified error 0x80004005" is reported on the Windows side if something fails. Handle the ENOSPC case and return ERROR_DISK_FULL, which allows at least Copy-VMFile to report a meaningful error. Signed-off-by: Olaf Hering Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/hyperv.h | 1 + tools/hv/hv_fcopy_daemon.c | 20 +++++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/hyperv.h b/include/uapi/linux/hyperv.h index e4c0a35d6417..e347b24ef9fb 100644 --- a/include/uapi/linux/hyperv.h +++ b/include/uapi/linux/hyperv.h @@ -313,6 +313,7 @@ enum hv_kvp_exchg_pool { #define HV_INVALIDARG 0x80070057 #define HV_GUID_NOTFOUND 0x80041002 #define HV_ERROR_ALREADY_EXISTS 0x80070050 +#define HV_ERROR_DISK_FULL 0x80070070 #define ADDR_FAMILY_NONE 0x00 #define ADDR_FAMILY_IPV4 0x01 diff --git a/tools/hv/hv_fcopy_daemon.c b/tools/hv/hv_fcopy_daemon.c index 5480e4e424eb..f1d742682317 100644 --- a/tools/hv/hv_fcopy_daemon.c +++ b/tools/hv/hv_fcopy_daemon.c @@ -37,12 +37,14 @@ static int target_fd; static char target_fname[W_MAX_PATH]; +static unsigned long long filesize; static int hv_start_fcopy(struct hv_start_fcopy *smsg) { int error = HV_E_FAIL; char *q, *p; + filesize = 0; p = (char *)smsg->path_name; snprintf(target_fname, sizeof(target_fname), "%s/%s", (char *)smsg->path_name, (char *)smsg->file_name); @@ -98,14 +100,26 @@ done: static int hv_copy_data(struct hv_do_fcopy *cpmsg) { ssize_t bytes_written; + int ret = 0; bytes_written = pwrite(target_fd, cpmsg->data, cpmsg->size, cpmsg->offset); - if (bytes_written != cpmsg->size) - return HV_E_FAIL; + filesize += cpmsg->size; + if (bytes_written != cpmsg->size) { + switch (errno) { + case ENOSPC: + ret = HV_ERROR_DISK_FULL; + break; + default: + ret = HV_E_FAIL; + break; + } + syslog(LOG_ERR, "pwrite failed to write %llu bytes: %ld (%s)", + filesize, (long)bytes_written, strerror(errno)); + } - return 0; + return ret; } static int hv_copy_finished(void) From e35b50490e2612b42edbd08b602a4ea597fce7f2 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Mon, 14 Dec 2015 16:01:36 -0800 Subject: [PATCH 611/813] Drivers: hv: util: catch allocation errors [ Upstream commit cdc0c0c94e4e6dfa371d497a3130f83349b6ead6 ] Catch allocation errors in hvutil_transport_send. Fixes: 14b50f80c32d ('Drivers: hv: util: introduce hv_utils_transport abstraction') Signed-off-by: Olaf Hering Signed-off-by: K. Y. 
Srinivasan Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/hv/hv_utils_transport.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/hv/hv_utils_transport.c b/drivers/hv/hv_utils_transport.c index 6a9d80a5332d..1505ee6e6605 100644 --- a/drivers/hv/hv_utils_transport.c +++ b/drivers/hv/hv_utils_transport.c @@ -204,9 +204,12 @@ int hvutil_transport_send(struct hvutil_transport *hvt, void *msg, int len) goto out_unlock; } hvt->outmsg = kzalloc(len, GFP_KERNEL); - memcpy(hvt->outmsg, msg, len); - hvt->outmsg_len = len; - wake_up_interruptible(&hvt->outmsg_q); + if (hvt->outmsg) { + memcpy(hvt->outmsg, msg, len); + hvt->outmsg_len = len; + wake_up_interruptible(&hvt->outmsg_q); + } else + ret = -ENOMEM; out_unlock: mutex_unlock(&hvt->outmsg_lock); return ret; From c1d32939e9e1951eaf7af573b15d54d8495aafff Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Mon, 14 Dec 2015 16:01:38 -0800 Subject: [PATCH 612/813] drivers/hv: cleanup synic msrs if vmbus connect failed [ Upstream commit 17efbee8ba02ef00d3b270998978f8a1a90f1d92 ] Before vmbus_connect(), the synic is set up per vcpu - this means the hypervisor receives writes to the synic MSRs and probably allocates hypervisor resources per synic setup. If vmbus_connect() fails for some reason, it's necessary to clean up the synic setup by calling hv_synic_cleanup() on each vcpu, so the hypervisor gets a chance to free the resources it allocated per synic. This patch does the appropriate cleanup in case of vmbus_connect() failure. Signed-off-by: Andrey Smetanin Signed-off-by: Denis V. Lunev Reviewed-by: Vitaly Kuznetsov CC: "K. Y. Srinivasan" CC: Haiyang Zhang CC: Vitaly Kuznetsov Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/hv/vmbus_drv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 9b5440f6b3b4..acd03b2f6568 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -870,7 +870,7 @@ static int vmbus_bus_init(int irq) on_each_cpu(hv_synic_init, NULL, 1); ret = vmbus_connect(); if (ret) - goto err_alloc; + goto err_connect; if (vmbus_proto_version > VERSION_WIN7) cpu_hotplug_disable(); @@ -888,6 +888,8 @@ static int vmbus_bus_init(int irq) return 0; +err_connect: + on_each_cpu(hv_synic_cleanup, NULL, 1); err_alloc: hv_synic_free(); hv_remove_vmbus_irq(); From bf59c816fb104b2e92d29f8c13f1118f4c5333c4 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Mon, 14 Dec 2015 16:01:42 -0800 Subject: [PATCH 613/813] Drivers: hv: vss: run only on supported host versions [ Upstream commit ed9ba608e4851144af8c7061cbb19f751c73e998 ] The Backup integration service on WS2012 apparently has trouble negotiating with a guest which does not support the provided util version. Currently the VSS driver supports only version 5/0. WS2012 offers only versions 1/x and 3/x, and vmbus_prep_negotiate_resp correctly returns an empty icframe_vercnt/icmsg_vercnt. But the host ignores that and continues to send ICMSGTYPE_NEGOTIATE messages. The result is weird errors during boot and general misbehaviour. Check the Windows version to work around the host bug and skip hv_vss_init on WS2012 and older. Signed-off-by: Olaf Hering Signed-off-by: K. Y. 
Srinivasan Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/hv/hv_snapshot.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c index a548ae42c927..81882d4848bd 100644 --- a/drivers/hv/hv_snapshot.c +++ b/drivers/hv/hv_snapshot.c @@ -331,6 +331,11 @@ static void vss_on_reset(void) int hv_vss_init(struct hv_util_service *srv) { + if (vmbus_proto_version < VERSION_WIN8_1) { + pr_warn("Integration service 'Backup (volume snapshot)'" + " not supported on this host version.\n"); + return -ENOTSUPP; + } recv_buffer = srv->recv_buffer; /* From ff83d7914b749f8c654c0e08d21667035901ab75 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Mon, 14 Dec 2015 16:01:47 -0800 Subject: [PATCH 614/813] Drivers: hv: vmbus: serialize process_chn_event() and vmbus_close_internal() [ Upstream commit 63d55b2aeb5e4faa170316fee73c3c47ea9268c7 ] process_chn_event(), running in the tasklet, can race with vmbus_close_internal() in the case of SMP guest, e.g., when the former is accessing channel->inbound.ring_buffer, the latter could be freeing the ring_buffer pages. To resolve the race, we can serialize them by disabling the tasklet when the latter is running here. Signed-off-by: Dexuan Cui Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/hv/channel.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 9098f13f2f44..6a90c69bc71c 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "hyperv_vmbus.h" @@ -496,8 +497,21 @@ static void reset_channel_cb(void *arg) static int vmbus_close_internal(struct vmbus_channel *channel) { struct vmbus_channel_close_channel *msg; + struct tasklet_struct *tasklet; int ret; + /* + * process_chn_event(), running in the tasklet, can race + * with vmbus_close_internal() in the case of SMP guest, e.g., when + * the former is accessing channel->inbound.ring_buffer, the latter + * could be freeing the ring_buffer pages. + * + * To resolve the race, we can serialize them by disabling the + * tasklet when the latter is running here. + */ + tasklet = hv_context.event_dpc[channel->target_cpu]; + tasklet_disable(tasklet); + channel->state = CHANNEL_OPEN_STATE; channel->sc_creation_callback = NULL; /* Stop callback and cancel the timer asap */ @@ -525,7 +539,7 @@ static int vmbus_close_internal(struct vmbus_channel *channel) * If we failed to post the close msg, * it is perhaps better to leak memory. */ - return ret; + goto out; } /* Tear down the gpadl for the channel's ring buffer */ @@ -538,7 +552,7 @@ static int vmbus_close_internal(struct vmbus_channel *channel) * If we failed to teardown gpadl, * it is perhaps better to leak memory. 
*/ - return ret; + goto out; } } @@ -555,6 +569,9 @@ static int vmbus_close_internal(struct vmbus_channel *channel) if (channel->rescind) hv_process_channel_removal(channel, channel->offermsg.child_relid); +out: + tasklet_enable(tasklet); + return ret; } From 64bdc685ccc1d207632a6cf17cd5771e10623c85 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Mon, 14 Dec 2015 16:01:49 -0800 Subject: [PATCH 615/813] Drivers: hv: vmbus: fix rescind-offer handling for device without a driver [ Upstream commit 34c6801e3310ad286c7bb42bc88d42926b8f99bf ] In the path vmbus_onoffer_rescind() -> vmbus_device_unregister() -> device_unregister() -> ... -> __device_release_driver(), we can see for a device without a driver loaded: dev->driver is NULL, so dev->bus->remove(dev), namely vmbus_remove(), isn't invoked. As a result, vmbus_remove() -> hv_process_channel_removal() isn't invoked and some cleanups(like sending a CHANNELMSG_RELID_RELEASED message to the host) aren't done. We can demo the issue this way: 1. rmmod hv_utils; 2. disable the Heartbeat Integration Service in Hyper-V Manager and lsvmbus shows the device disappears. 3. re-enable the Heartbeat in Hyper-V Manager and modprobe hv_utils, but lsvmbus shows the device can't appear again. This is because, the host thinks the VM hasn't released the relid, so can't re-offer the device to the VM. We can fix the issue by moving hv_process_channel_removal() from vmbus_close_internal() to vmbus_device_release(), since the latter is always invoked on device_unregister(), whether or not the dev has a driver loaded. Signed-off-by: Dexuan Cui Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/hv/channel.c | 6 ------ drivers/hv/channel_mgmt.c | 6 +++--- drivers/hv/vmbus_drv.c | 15 +++------------ 3 files changed, 6 insertions(+), 21 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 6a90c69bc71c..1ef37c727572 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -563,12 +563,6 @@ static int vmbus_close_internal(struct vmbus_channel *channel) free_pages((unsigned long)channel->ringbuffer_pages, get_order(channel->ringbuffer_pagecount * PAGE_SIZE)); - /* - * If the channel has been rescinded; process device removal. - */ - if (channel->rescind) - hv_process_channel_removal(channel, - channel->offermsg.child_relid); out: tasklet_enable(tasklet); diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 652afd11a9ef..bd2e9f60272a 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -191,6 +191,8 @@ void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid) if (channel == NULL) return; + BUG_ON(!channel->rescind); + if (channel->target_cpu != get_cpu()) { put_cpu(); smp_call_function_single(channel->target_cpu, @@ -230,9 +232,7 @@ void vmbus_free_channels(void) list_for_each_entry_safe(channel, tmp, &vmbus_connection.chn_list, listentry) { - /* if we don't set rescind to true, vmbus_close_internal() - * won't invoke hv_process_channel_removal(). 
- */ + /* hv_process_channel_removal() needs this */ channel->rescind = true; vmbus_device_unregister(channel->device_obj); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index acd03b2f6568..55952d1ed336 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -603,23 +603,11 @@ static int vmbus_remove(struct device *child_device) { struct hv_driver *drv; struct hv_device *dev = device_to_hv_device(child_device); - u32 relid = dev->channel->offermsg.child_relid; if (child_device->driver) { drv = drv_to_hv_drv(child_device->driver); if (drv->remove) drv->remove(dev); - else { - hv_process_channel_removal(dev->channel, relid); - pr_err("remove not set for driver %s\n", - dev_name(child_device)); - } - } else { - /* - * We don't have a driver for this device; deal with the - * rescind message by removing the channel. - */ - hv_process_channel_removal(dev->channel, relid); } return 0; @@ -654,7 +642,10 @@ static void vmbus_shutdown(struct device *child_device) static void vmbus_device_release(struct device *device) { struct hv_device *hv_dev = device_to_hv_device(device); + struct vmbus_channel *channel = hv_dev->channel; + hv_process_channel_removal(channel, + channel->offermsg.child_relid); kfree(hv_dev); } From c0b72ca1ae92147e0714e7a780ba7ccc4e282370 Mon Sep 17 00:00:00 2001 From: Vaibhav Jain Date: Mon, 16 Nov 2015 09:33:45 +0530 Subject: [PATCH 616/813] cxl: Fix possible idr warning when contexts are released [ Upstream commit 1b5df59e50874b9034c0fa389cd52b65f1f93292 ] An idr warning is reported when a context is released after the capi card is unbound from the cxl driver via sysfs. Below are the steps to reproduce: 1. Create multiple afu contexts in a user-space application using libcxl. 2. Unbind the capi card from cxl using a command of the form echo > /sys/bus/pci/drivers/cxl-pci/unbind 3. Exit/kill the application owning afu contexts. After the above steps a warning message is usually seen in the kernel logs of the form "idr_remove called for id= which is not allocated." This is caused by the function cxl_release_afu which destroys the contexts_idr table. So when a context is released, no entry for the context pe is found in the contexts_idr table and the idr code prints this warning. This patch fixes this issue by increasing & decreasing the ref-count on the afu device when a context is initialized or when it's freed, respectively. This prevents the afu from being released until all the afu contexts have been released. The patch introduces two new functions, namely cxl_afu_get/put, that manage the ref-count on the afu device. Also the patch removes code inside cxl_dev_context_init that increases the ref on the afu device, as it's guaranteed to be alive during this function.
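[Editor's note: the new helper pair is a thin wrapper over the driver core's refcounting on the AFU's embedded struct device; this sketch mirrors the cxl.h hunk in the diff below. The reference is taken in cxl_context_init() and dropped in reclaim_ctx(), so the last context put is what finally lets the AFU device be released.]

static inline struct cxl_afu *cxl_afu_get(struct cxl_afu *afu)
{
	/* get_device() pins afu->dev; NULL means the device is going away */
	return (get_device(&afu->dev) == NULL) ? NULL : afu;
}

static inline void cxl_afu_put(struct cxl_afu *afu)
{
	put_device(&afu->dev);	/* may trigger the AFU's release callback */
}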
Reported-by: Ian Munsie Signed-off-by: Vaibhav Jain Acked-by: Ian Munsie Signed-off-by: Michael Ellerman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cxl/api.c | 4 ---- drivers/misc/cxl/context.c | 9 +++++++++ drivers/misc/cxl/cxl.h | 12 ++++++++++++ drivers/misc/cxl/file.c | 19 +++++++++++-------- 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c index 103baf0e0c5b..a6543aefa299 100644 --- a/drivers/misc/cxl/api.c +++ b/drivers/misc/cxl/api.c @@ -25,7 +25,6 @@ struct cxl_context *cxl_dev_context_init(struct pci_dev *dev) afu = cxl_pci_to_afu(dev); - get_device(&afu->dev); ctx = cxl_context_alloc(); if (IS_ERR(ctx)) { rc = PTR_ERR(ctx); @@ -61,7 +60,6 @@ err_mapping: err_ctx: kfree(ctx); err_dev: - put_device(&afu->dev); return ERR_PTR(rc); } EXPORT_SYMBOL_GPL(cxl_dev_context_init); @@ -87,8 +85,6 @@ int cxl_release_context(struct cxl_context *ctx) if (ctx->status >= STARTED) return -EBUSY; - put_device(&ctx->afu->dev); - cxl_context_free(ctx); return 0; diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index 2faa1270d085..6dde7a9d6a7e 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -97,6 +97,12 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master, ctx->pe = i; ctx->elem = &ctx->afu->spa[i]; ctx->pe_inserted = false; + + /* + * take a ref on the afu so that it stays alive at-least till + * this context is reclaimed inside reclaim_ctx. + */ + cxl_afu_get(afu); return 0; } @@ -278,6 +284,9 @@ static void reclaim_ctx(struct rcu_head *rcu) if (ctx->irq_bitmap) kfree(ctx->irq_bitmap); + /* Drop ref to the afu device taken during cxl_context_init */ + cxl_afu_put(ctx->afu); + kfree(ctx); } diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index 0cfb9c129f27..25ae57fa79b0 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -403,6 +403,18 @@ struct cxl_afu { bool enabled; }; +/* AFU refcount management */ +static inline struct cxl_afu *cxl_afu_get(struct cxl_afu *afu) +{ + + return (get_device(&afu->dev) == NULL) ? NULL : afu; +} + +static inline void cxl_afu_put(struct cxl_afu *afu) +{ + put_device(&afu->dev); +} + struct cxl_irq_name { struct list_head list; diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c index 7ccd2998be92..5cc14599837d 100644 --- a/drivers/misc/cxl/file.c +++ b/drivers/misc/cxl/file.c @@ -67,7 +67,13 @@ static int __afu_open(struct inode *inode, struct file *file, bool master) spin_unlock(&adapter->afu_list_lock); goto err_put_adapter; } - get_device(&afu->dev); + + /* + * taking a ref to the afu so that it doesn't go away + * for rest of the function. This ref is released before + * we return. + */ + cxl_afu_get(afu); spin_unlock(&adapter->afu_list_lock); if (!afu->current_mode) @@ -90,13 +96,12 @@ static int __afu_open(struct inode *inode, struct file *file, bool master) file->private_data = ctx; cxl_ctx_get(); - /* Our ref on the AFU will now hold the adapter */ - put_device(&adapter->dev); - - return 0; + /* indicate success */ + rc = 0; err_put_afu: - put_device(&afu->dev); + /* release the ref taken earlier */ + cxl_afu_put(afu); err_put_adapter: put_device(&adapter->dev); return rc; @@ -131,8 +136,6 @@ int afu_release(struct inode *inode, struct file *file) mutex_unlock(&ctx->mapping_lock); } - put_device(&ctx->afu->dev); - /* * At this this point all bottom halfs have finished and we should be * getting no more IRQs from the hardware for this context. 
Once it's From e66e0a249d9dacb9fda4135d0e38ba8ae9af9cac Mon Sep 17 00:00:00 2001 From: Vaibhav Jain Date: Tue, 24 Nov 2015 16:26:18 +0530 Subject: [PATCH 617/813] cxl: Fix DSI misses when the context owning task exits [ Upstream commit 7b8ad495d59280b634a7b546f4cdf58cf4d65f61 ] Presently when a user-space process issues the CXL_IOCTL_START_WORK ioctl we store the pid of the current task_struct and use it to get a pointer to the mm_struct of the process, while processing page or segment faults from the capi card. However this causes issues when the thread that had originally issued the start-work ioctl exits, in which case the stored pid is no longer valid and the cxl driver is unable to handle faults as the mm_struct corresponding to the process is no longer accessible. This patch fixes this issue by using the mm_struct of the next alive task in the thread group. This is done by iterating over all the tasks in the thread group starting from the thread group leader and calling get_task_mm on each one of them. When a valid mm_struct is obtained, the pid of the associated task is stored in the context, replacing the exiting one for handling future faults. The patch introduces a new function named get_mem_context that checks whether the current task pointed to by ctx->pid is dead. If so, it performs the steps described above. Also a new variable cxl_context.glpid is introduced which stores the pid of the thread group leader associated with the context owning task. Reported-by: Matthew R. Ochs Reported-by: Frank Haverkamp Suggested-by: Ian Munsie Signed-off-by: Vaibhav Jain Acked-by: Ian Munsie Reviewed-by: Frederic Barrat Reviewed-by: Matthew R. Ochs Signed-off-by: Michael Ellerman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cxl/api.c | 2 +- drivers/misc/cxl/context.c | 6 +- drivers/misc/cxl/cxl.h | 3 + drivers/misc/cxl/fault.c | 129 +++++++++++++++++++++++++++---------- drivers/misc/cxl/file.c | 6 +- 5 files changed, 109 insertions(+), 37 deletions(-) diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c index a6543aefa299..ea3eeb7011e1 100644 --- a/drivers/misc/cxl/api.c +++ b/drivers/misc/cxl/api.c @@ -172,7 +172,7 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed, if (task) { ctx->pid = get_task_pid(task, PIDTYPE_PID); - get_pid(ctx->pid); + ctx->glpid = get_task_pid(task->group_leader, PIDTYPE_PID); kernel = false; } diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index 6dde7a9d6a7e..262b88eac414 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -42,7 +42,7 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master, spin_lock_init(&ctx->sste_lock); ctx->afu = afu; ctx->master = master; - ctx->pid = NULL; /* Set in start work ioctl */ + ctx->pid = ctx->glpid = NULL; /* Set in start work ioctl */ mutex_init(&ctx->mapping_lock); ctx->mapping = mapping; @@ -217,7 +217,11 @@ int __detach_context(struct cxl_context *ctx) WARN_ON(cxl_detach_process(ctx) && cxl_adapter_link_ok(ctx->afu->adapter)); flush_work(&ctx->fault_work); /* Only needed for dedicated process */ + + /* release the reference to the group leader and mm handling pid */ put_pid(ctx->pid); + put_pid(ctx->glpid); + cxl_ctx_put(); return 0; } diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index 25ae57fa79b0..a521bc72cec2 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -445,6 +445,9 @@ struct cxl_context { unsigned int sst_size, sst_lru; wait_queue_head_t wq; /* pid of the group leader associated with the 
pid */ + struct pid *glpid; + /* use mm context associated with this pid for ds faults */ struct pid *pid; spinlock_t lock; /* Protects pending_irq_mask, pending_fault and fault_addr */ /* Only used in PR mode */ diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index 25a5418c55cb..81c3f75b7330 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -166,13 +166,92 @@ static void cxl_handle_page_fault(struct cxl_context *ctx, cxl_ack_irq(ctx, CXL_PSL_TFC_An_R, 0); } +/* + * Returns the mm_struct corresponding to the context ctx via ctx->pid + * In case the task has exited we use the task group leader accessible + * via ctx->glpid to find the next task in the thread group that has a + * valid mm_struct associated with it. If a task with valid mm_struct + * is found the ctx->pid is updated to use the task struct for subsequent + * translations. In case no valid mm_struct is found in the task group to + * service the fault a NULL is returned. + */ +static struct mm_struct *get_mem_context(struct cxl_context *ctx) +{ + struct task_struct *task = NULL; + struct mm_struct *mm = NULL; + struct pid *old_pid = ctx->pid; + + if (old_pid == NULL) { + pr_warn("%s: Invalid context for pe=%d\n", + __func__, ctx->pe); + return NULL; + } + + task = get_pid_task(old_pid, PIDTYPE_PID); + + /* + * pid_alive may look racy but this saves us from costly + * get_task_mm when the task is a zombie. In worst case + * we may think a task is alive, which is about to die + * but get_task_mm will return NULL. + */ + if (task != NULL && pid_alive(task)) + mm = get_task_mm(task); + + /* release the task struct that was taken earlier */ + if (task) + put_task_struct(task); + else + pr_devel("%s: Context owning pid=%i for pe=%i dead\n", + __func__, pid_nr(old_pid), ctx->pe); + + /* + * If we couldn't find the mm context then use the group + * leader to iterate over the task group and find a task + * that gives us mm_struct. 
+ */ + if (unlikely(mm == NULL && ctx->glpid != NULL)) { + + rcu_read_lock(); + task = pid_task(ctx->glpid, PIDTYPE_PID); + if (task) + do { + mm = get_task_mm(task); + if (mm) { + ctx->pid = get_task_pid(task, + PIDTYPE_PID); + break; + } + task = next_thread(task); + } while (task && !thread_group_leader(task)); + rcu_read_unlock(); + + /* check if we switched pid */ + if (ctx->pid != old_pid) { + if (mm) + pr_devel("%s:pe=%i switch pid %i->%i\n", + __func__, ctx->pe, pid_nr(old_pid), + pid_nr(ctx->pid)); + else + pr_devel("%s:Cannot find mm for pid=%i\n", + __func__, pid_nr(old_pid)); + + /* drop the reference to older pid */ + put_pid(old_pid); + } + } + + return mm; +} + + + void cxl_handle_fault(struct work_struct *fault_work) { struct cxl_context *ctx = container_of(fault_work, struct cxl_context, fault_work); u64 dsisr = ctx->dsisr; u64 dar = ctx->dar; - struct task_struct *task = NULL; struct mm_struct *mm = NULL; if (cxl_p2n_read(ctx->afu, CXL_PSL_DSISR_An) != dsisr || @@ -195,17 +274,17 @@ void cxl_handle_fault(struct work_struct *fault_work) "DSISR: %#llx DAR: %#llx\n", ctx->pe, dsisr, dar); if (!ctx->kernel) { - if (!(task = get_pid_task(ctx->pid, PIDTYPE_PID))) { - pr_devel("cxl_handle_fault unable to get task %i\n", - pid_nr(ctx->pid)); + + mm = get_mem_context(ctx); + /* indicates all the thread in task group have exited */ + if (mm == NULL) { + pr_devel("%s: unable to get mm for pe=%d pid=%i\n", + __func__, ctx->pe, pid_nr(ctx->pid)); cxl_ack_ae(ctx); return; - } - if (!(mm = get_task_mm(task))) { - pr_devel("cxl_handle_fault unable to get mm %i\n", - pid_nr(ctx->pid)); - cxl_ack_ae(ctx); - goto out; + } else { + pr_devel("Handling page fault for pe=%d pid=%i\n", + ctx->pe, pid_nr(ctx->pid)); } } @@ -218,33 +297,22 @@ void cxl_handle_fault(struct work_struct *fault_work) if (mm) mmput(mm); -out: - if (task) - put_task_struct(task); } static void cxl_prefault_one(struct cxl_context *ctx, u64 ea) { - int rc; - struct task_struct *task; struct mm_struct *mm; - if (!(task = get_pid_task(ctx->pid, PIDTYPE_PID))) { - pr_devel("cxl_prefault_one unable to get task %i\n", - pid_nr(ctx->pid)); - return; - } - if (!(mm = get_task_mm(task))) { + mm = get_mem_context(ctx); + if (mm == NULL) { pr_devel("cxl_prefault_one unable to get mm %i\n", pid_nr(ctx->pid)); - put_task_struct(task); return; } - rc = cxl_fault_segment(ctx, mm, ea); + cxl_fault_segment(ctx, mm, ea); mmput(mm); - put_task_struct(task); } static u64 next_segment(u64 ea, u64 vsid) @@ -263,18 +331,13 @@ static void cxl_prefault_vma(struct cxl_context *ctx) struct copro_slb slb; struct vm_area_struct *vma; int rc; - struct task_struct *task; struct mm_struct *mm; - if (!(task = get_pid_task(ctx->pid, PIDTYPE_PID))) { - pr_devel("cxl_prefault_vma unable to get task %i\n", - pid_nr(ctx->pid)); - return; - } - if (!(mm = get_task_mm(task))) { + mm = get_mem_context(ctx); + if (mm == NULL) { pr_devel("cxl_prefault_vm unable to get mm %i\n", pid_nr(ctx->pid)); - goto out1; + return; } down_read(&mm->mmap_sem); @@ -295,8 +358,6 @@ static void cxl_prefault_vma(struct cxl_context *ctx) up_read(&mm->mmap_sem); mmput(mm); -out1: - put_task_struct(task); } void cxl_prefault(struct cxl_context *ctx, u64 wed) diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c index 5cc14599837d..783337d22f36 100644 --- a/drivers/misc/cxl/file.c +++ b/drivers/misc/cxl/file.c @@ -201,8 +201,12 @@ static long afu_ioctl_start_work(struct cxl_context *ctx, * where a process (master, some daemon, etc) has opened the chardev on * behalf of another 
process, so the AFU's mm gets bound to the process * that performs this ioctl and not the process that opened the file. + * Also we grab the PID of the group leader so that if the task that + * has performed the attach operation exits the mm context of the + * process is still accessible. */ - ctx->pid = get_pid(get_task_pid(current, PIDTYPE_PID)); + ctx->pid = get_task_pid(current, PIDTYPE_PID); + ctx->glpid = get_task_pid(current->group_leader, PIDTYPE_PID); trace_cxl_attach(ctx, work.work_element_descriptor, work.num_interrupts, amr); From 284555fc7f5d2d4aab34dbf2b5766c6f8da84873 Mon Sep 17 00:00:00 2001 From: Manoj Kumar Date: Mon, 14 Dec 2015 15:07:02 -0600 Subject: [PATCH 618/813] cxlflash: Fix to resolve cmd leak after host reset [ Upstream commit ee91e332a6e6e9b939f60f6e1bd72fb2def5290d ] After a few iterations of resetting the card, either during EEH recovery, or a host_reset the following is seen in the logs. cxlflash 0008:00: cxlflash_queuecommand: could not get a free command At every reset of the card, the commands that are outstanding are being leaked. No effort is being made to reap these commands. A few more resets later, the above error message floods the logs and the card is rendered totally unusable as no free commands are available. Iterated through the 'cmd' queue and printed out the 'free' counter and found that on each reset certain commands were in-use and stayed in-use through subsequent resets. To resolve this issue, when the card is reset, reap all the commands that are active/outstanding. Signed-off-by: Manoj N. Kumar Acked-by: Matthew R. Ochs Reviewed-by: Andrew Donnellan Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/cxlflash/main.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c index 1e5bf0ca81da..3fb0e1ca6809 100644 --- a/drivers/scsi/cxlflash/main.c +++ b/drivers/scsi/cxlflash/main.c @@ -632,15 +632,30 @@ static void free_mem(struct cxlflash_cfg *cfg) * @cfg: Internal structure associated with the host. * * Safe to call with AFU in a partially allocated/initialized state. + * + * Cleans up all state associated with the command queue, and unmaps + * the MMIO space. + * + * - complete() will take care of commands we initiated (they'll be checked + * in as part of the cleanup that occurs after the completion) + * + * - cmd_checkin() will take care of entries that we did not initiate and that + * have not (and will not) complete because they are sitting on a [now stale] + * hardware queue */ static void stop_afu(struct cxlflash_cfg *cfg) { int i; struct afu *afu = cfg->afu; + struct afu_cmd *cmd; if (likely(afu)) { - for (i = 0; i < CXLFLASH_NUM_CMDS; i++) - complete(&afu->cmd[i].cevent); + for (i = 0; i < CXLFLASH_NUM_CMDS; i++) { + cmd = &afu->cmd[i]; + complete(&cmd->cevent); + if (!atomic_read(&cmd->free)) + cmd_checkin(cmd); + } if (likely(afu->afu_map)) { cxl_psa_unmap((void __iomem *)afu->afu_map); From 20ab6948f1c09ad84ec8f5b8fdf02ea86e33fa63 Mon Sep 17 00:00:00 2001 From: Manoj Kumar Date: Mon, 14 Dec 2015 15:07:23 -0600 Subject: [PATCH 619/813] cxlflash: Resolve oops in wait_port_offline [ Upstream commit b45cdbaf9f7f0486847c52f60747fb108724652a ] If an async error interrupt is generated, and the error requires the FC link to be reset, it cannot be performed in the interrupt context. So a work element is scheduled to complete the link reset in a process context. 
If either an EEH event or an escalation occurs in between when the interrupt is generated and the scheduled work is started, the MMIO space may no longer be available. This will cause an oops in the worker thread. [ 606.806583] NIP kthread_data+0x28/0x40 [ 606.806633] LR wq_worker_sleeping+0x30/0x100 [ 606.806694] Call Trace: [ 606.806721] 0x50 (unreliable) [ 606.806796] wq_worker_sleeping+0x30/0x100 [ 606.806884] __schedule+0x69c/0x8a0 [ 606.806959] schedule+0x44/0xc0 [ 606.807034] do_exit+0x770/0xb90 [ 606.807109] die+0x300/0x460 [ 606.807185] bad_page_fault+0xd8/0x150 [ 606.807259] handle_page_fault+0x2c/0x30 [ 606.807338] wait_port_offline.constprop.12+0x60/0x130 [cxlflash] To prevent the problem space area from being unmapped, when there is pending work, a mapcount (using the kref mechanism) is held. The mapcount is released only when the work is completed. The last reference release is tied to the unmapping service. Signed-off-by: Manoj N. Kumar Acked-by: Matthew R. Ochs Reviewed-by: Uma Krishnan Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/cxlflash/common.h | 2 ++ drivers/scsi/cxlflash/main.c | 27 ++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/cxlflash/common.h b/drivers/scsi/cxlflash/common.h index c11cd193f896..5ada9268a450 100644 --- a/drivers/scsi/cxlflash/common.h +++ b/drivers/scsi/cxlflash/common.h @@ -165,6 +165,8 @@ struct afu { struct sisl_host_map __iomem *host_map; /* MC host map */ struct sisl_ctrl_map __iomem *ctrl_map; /* MC control map */ + struct kref mapcount; + ctx_hndl_t ctx_hndl; /* master's context handle */ u64 *hrrq_start; u64 *hrrq_end; diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c index 3fb0e1ca6809..463d7e503d88 100644 --- a/drivers/scsi/cxlflash/main.c +++ b/drivers/scsi/cxlflash/main.c @@ -368,6 +368,7 @@ out: no_room: afu->read_room = true; + kref_get(&cfg->afu->mapcount); schedule_work(&cfg->work_q); rc = SCSI_MLQUEUE_HOST_BUSY; goto out; @@ -473,6 +474,16 @@ out: return rc; } +static void afu_unmap(struct kref *ref) +{ + struct afu *afu = container_of(ref, struct afu, mapcount); + + if (likely(afu->afu_map)) { + cxl_psa_unmap((void __iomem *)afu->afu_map); + afu->afu_map = NULL; + } +} + /** * cxlflash_driver_info() - information handler for this host driver * @host: SCSI host associated with device. 
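The hunk above is the heart of the fix: afu_unmap() is written as a kref release callback, so whichever path drops the last reference is the one that unmaps the AFU's MMIO space. The following is a self-contained userspace sketch of that reference-counting pattern, for illustration only; struct kref is the kernel's implementation of the same idea, and the names below are made up.

/*
 * Userspace sketch of the kref pattern (illustrative only). Whoever
 * drops the final reference runs the release callback, which is
 * where the unmap would happen.
 */
#include <stdatomic.h>
#include <stdio.h>

struct ref {
	atomic_int count;
	void (*release)(struct ref *r);
};

static void ref_init(struct ref *r, void (*release)(struct ref *r))
{
	atomic_init(&r->count, 1);
	r->release = release;
}

static void ref_get(struct ref *r)
{
	atomic_fetch_add(&r->count, 1);
}

static void ref_put(struct ref *r)
{
	/* The thread that takes the count to zero does the teardown. */
	if (atomic_fetch_sub(&r->count, 1) == 1)
		r->release(r);
}

static void unmap_release(struct ref *r)
{
	(void)r;
	printf("last reference dropped: unmap the MMIO space\n");
}

int main(void)
{
	struct ref map;

	ref_init(&map, unmap_release);	/* like kref_init() in init_afu()   */
	ref_get(&map);			/* like kref_get() before the work  */
	ref_put(&map);			/* worker finished: kref_put()      */
	ref_put(&map);			/* teardown drops the initial ref   */
	return 0;
}

The remaining hunks take a reference wherever work is scheduled or the map is used, and drop it when that work finishes.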
@@ -503,6 +514,7 @@ static int cxlflash_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scp) ulong lock_flags; short lflag = 0; int rc = 0; + int kref_got = 0; dev_dbg_ratelimited(dev, "%s: (scp=%p) %d/%d/%d/%llu " "cdb=(%08X-%08X-%08X-%08X)\n", @@ -547,6 +559,9 @@ static int cxlflash_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scp) goto out; } + kref_get(&cfg->afu->mapcount); + kref_got = 1; + cmd->rcb.ctx_id = afu->ctx_hndl; cmd->rcb.port_sel = port_sel; cmd->rcb.lun_id = lun_to_lunid(scp->device->lun); @@ -587,6 +602,8 @@ static int cxlflash_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scp) } out: + if (kref_got) + kref_put(&afu->mapcount, afu_unmap); pr_devel("%s: returning rc=%d\n", __func__, rc); return rc; } @@ -661,6 +678,7 @@ static void stop_afu(struct cxlflash_cfg *cfg) cxl_psa_unmap((void __iomem *)afu->afu_map); afu->afu_map = NULL; } + kref_put(&afu->mapcount, afu_unmap); } } @@ -746,8 +764,8 @@ static void cxlflash_remove(struct pci_dev *pdev) scsi_remove_host(cfg->host); /* fall through */ case INIT_STATE_AFU: - term_afu(cfg); cancel_work_sync(&cfg->work_q); + term_afu(cfg); case INIT_STATE_PCI: pci_release_regions(cfg->dev); pci_disable_device(pdev); @@ -1331,6 +1349,7 @@ static irqreturn_t cxlflash_async_err_irq(int irq, void *data) __func__, port); cfg->lr_state = LINK_RESET_REQUIRED; cfg->lr_port = port; + kref_get(&cfg->afu->mapcount); schedule_work(&cfg->work_q); } @@ -1351,6 +1370,7 @@ static irqreturn_t cxlflash_async_err_irq(int irq, void *data) if (info->action & SCAN_HOST) { atomic_inc(&cfg->scan_host_needed); + kref_get(&cfg->afu->mapcount); schedule_work(&cfg->work_q); } } @@ -1746,6 +1766,7 @@ static int init_afu(struct cxlflash_cfg *cfg) rc = -ENOMEM; goto err1; } + kref_init(&afu->mapcount); /* No byte reverse on reading afu_version or string will be backwards */ reg = readq(&afu->afu_map->global.regs.afu_version); @@ -1780,8 +1801,7 @@ out: return rc; err2: - cxl_psa_unmap((void __iomem *)afu->afu_map); - afu->afu_map = NULL; + kref_put(&afu->mapcount, afu_unmap); err1: term_mc(cfg, UNDO_START); goto out; @@ -2354,6 +2374,7 @@ static void cxlflash_worker_thread(struct work_struct *work) if (atomic_dec_if_positive(&cfg->scan_host_needed) >= 0) scsi_scan_host(cfg->host); + kref_put(&afu->mapcount, afu_unmap); } /** From 8c67424e2a37005364d0aecd6541d221517a1c4f Mon Sep 17 00:00:00 2001 From: Manoj Kumar Date: Mon, 14 Dec 2015 15:07:43 -0600 Subject: [PATCH 620/813] cxlflash: Enable device id for future IBM CXL adapter [ Upstream commit a2746fb16e41b7c8f02aa4d2605ecce97abbebbd ] This drop enables a future card with a device id of 0x0600 to be recognized by the cxlflash driver. As per the design, the Accelerator Function Unit (AFU) for this new IBM CXL Flash Adapter retains the same host interface as the previous generation. For the early prototypes of the new card, the driver with this change behaves exactly as the driver prior to this behaved with the earlier generation card. Therefore, no card specific programming has been added. These card specific changes can be staged in later if needed. Signed-off-by: Manoj N. Kumar Acked-by: Matthew R. Ochs Reviewed-by: Andrew Donnellan Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/cxlflash/main.c | 3 +++ drivers/scsi/cxlflash/main.h | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c index 463d7e503d88..d1d077420964 100644 --- a/drivers/scsi/cxlflash/main.c +++ b/drivers/scsi/cxlflash/main.c @@ -2309,6 +2309,7 @@ static struct scsi_host_template driver_template = { * Device dependent values */ static struct dev_dependent_vals dev_corsa_vals = { CXLFLASH_MAX_SECTORS }; +static struct dev_dependent_vals dev_flash_gt_vals = { CXLFLASH_MAX_SECTORS }; /* * PCI device binding table @@ -2316,6 +2317,8 @@ static struct dev_dependent_vals dev_corsa_vals = { CXLFLASH_MAX_SECTORS }; static struct pci_device_id cxlflash_pci_table[] = { {PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_CORSA, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (kernel_ulong_t)&dev_corsa_vals}, + {PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_FLASH_GT, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, (kernel_ulong_t)&dev_flash_gt_vals}, {} }; diff --git a/drivers/scsi/cxlflash/main.h b/drivers/scsi/cxlflash/main.h index 60324566c14f..3d2d606fafb3 100644 --- a/drivers/scsi/cxlflash/main.h +++ b/drivers/scsi/cxlflash/main.h @@ -24,8 +24,8 @@ #define CXLFLASH_ADAPTER_NAME "IBM POWER CXL Flash Adapter" #define CXLFLASH_DRIVER_DATE "(August 13, 2015)" -#define PCI_DEVICE_ID_IBM_CORSA 0x04F0 -#define CXLFLASH_SUBS_DEV_ID 0x04F0 +#define PCI_DEVICE_ID_IBM_CORSA 0x04F0 +#define PCI_DEVICE_ID_IBM_FLASH_GT 0x0600 /* Since there is only one target, make it 0 */ #define CXLFLASH_TARGET 0 From 0f874d9433bd6b2acd668ef0a0359e30af38f8b0 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 8 Jan 2016 10:30:09 -0800 Subject: [PATCH 621/813] cxl: fix build for GCC 4.6.x [ Upstream commit aa09545589ceeff884421d8eb38d04963190afbe ] GCC 4.6.3 does not support -Wno-unused-const-variable. Instead, use the kbuild infrastructure that checks if this options exists. Fixes: 2cd55c68c0a4 ("cxl: Fix build failure due to -Wunused-variable behaviour change") Suggested-by: Michal Marek Suggested-by: Arnd Bergmann Signed-off-by: Brian Norris Signed-off-by: Michael Ellerman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cxl/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/cxl/Makefile b/drivers/misc/cxl/Makefile index 6982f603fadc..ab6f392d3504 100644 --- a/drivers/misc/cxl/Makefile +++ b/drivers/misc/cxl/Makefile @@ -1,4 +1,4 @@ -ccflags-y := -Werror -Wno-unused-const-variable +ccflags-y := -Werror $(call cc-disable-warning, unused-const-variable) cxl-y += main.o file.o irq.o fault.o native.o cxl-y += context.o sysfs.o debugfs.o pci.o trace.o From bcc975680df41bddcff13c3477087c7f51280b00 Mon Sep 17 00:00:00 2001 From: Uma Krishnan Date: Mon, 7 Dec 2015 16:03:32 -0600 Subject: [PATCH 622/813] cxl: Enable PCI device ID for future IBM CXL adapter [ Upstream commit 68adb7bfd66504e97364651fb7dac3f9c8aa8561 ] Add support for future IBM Coherent Accelerator (CXL) device with ID of 0x0601. Signed-off-by: Uma Krishnan Reviewed-by: Matthew R. 
Ochs Signed-off-by: Michael Ellerman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cxl/pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index be2c8e248e2e..0c6c17a1c59e 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -138,6 +138,7 @@ static const struct pci_device_id cxl_pci_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0477), }, { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x044b), }, { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x04cf), }, + { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0601), }, { PCI_DEVICE_CLASS(0x120000, ~0), }, { } From 3f499a426420ab8b6f5c5a8a728cd6f713b5f398 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 16 Dec 2015 18:11:52 -0500 Subject: [PATCH 623/813] lpfc: Fix FCF Infinite loop in lpfc_sli4_fcf_rr_next_index_get. [ Upstream commit f5cb5304eb26d307c9b30269fb0e007e0b262b7d ] Fix FCF Infinite loop in lpfc_sli4_fcf_rr_next_index_get. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinicke Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/lpfc/lpfc_sli.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index f9585cdd8933..6aae828208e2 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -16173,7 +16173,7 @@ fail_fcf_read: } /** - * lpfc_check_next_fcf_pri + * lpfc_check_next_fcf_pri_level * phba pointer to the lpfc_hba struct for this port. * This routine is called from the lpfc_sli4_fcf_rr_next_index_get * routine when the rr_bmask is empty. The FCF indecies are put into the @@ -16329,8 +16329,12 @@ next_priority: if (next_fcf_index < LPFC_SLI4_FCF_TBL_INDX_MAX && phba->fcf.fcf_pri[next_fcf_index].fcf_rec.flag & - LPFC_FCF_FLOGI_FAILED) + LPFC_FCF_FLOGI_FAILED) { + if (list_is_singular(&phba->fcf.fcf_pri_list)) + return LPFC_FCOE_FCF_NEXT_NONE; + goto next_priority; + } lpfc_printf_log(phba, KERN_INFO, LOG_FIP, "2845 Get next roundrobin failover FCF (x%x)\n", From 1a1455583df0a4f9fbbbaec3f7aad2ef09934c66 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 16 Dec 2015 18:11:53 -0500 Subject: [PATCH 624/813] lpfc: Fix the FLOGI discovery logic to comply with T11 standards [ Upstream commit d6de08cc46269899988b4f40acc7337279693d4b ] Fix the FLOGI discovery logic to comply with T11 standards We weren't properly setting fabric parameters, such as R_A_TOV and E_D_TOV, when we registered the vfi object in default configs and pt2pt configs. Revise to now pass service params with the values to the firmware and ensure they are reset on link bounce. Required reworking the call sequence in the discovery threads. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinicke Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/lpfc/lpfc_crtn.h | 1 + drivers/scsi/lpfc/lpfc_els.c | 346 +++++++++++++---------------- drivers/scsi/lpfc/lpfc_hbadisc.c | 12 +- drivers/scsi/lpfc/lpfc_nportdisc.c | 124 +++++++---- 4 files changed, 242 insertions(+), 241 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_crtn.h b/drivers/scsi/lpfc/lpfc_crtn.h index b0e6fe46448d..80d3c740a8a8 100644 --- a/drivers/scsi/lpfc/lpfc_crtn.h +++ b/drivers/scsi/lpfc/lpfc_crtn.h @@ -72,6 +72,7 @@ void lpfc_cancel_all_vport_retry_delay_timer(struct lpfc_hba *); void lpfc_retry_pport_discovery(struct lpfc_hba *); void lpfc_release_rpi(struct lpfc_hba *, struct lpfc_vport *, uint16_t); +void lpfc_mbx_cmpl_local_config_link(struct lpfc_hba *, LPFC_MBOXQ_t *); void lpfc_mbx_cmpl_reg_login(struct lpfc_hba *, LPFC_MBOXQ_t *); void lpfc_mbx_cmpl_dflt_rpi(struct lpfc_hba *, LPFC_MBOXQ_t *); void lpfc_mbx_cmpl_fabric_reg_login(struct lpfc_hba *, LPFC_MBOXQ_t *); diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index b6fa257ea3e0..f6dd15b22383 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -455,9 +455,9 @@ int lpfc_issue_reg_vfi(struct lpfc_vport *vport) { struct lpfc_hba *phba = vport->phba; - LPFC_MBOXQ_t *mboxq; + LPFC_MBOXQ_t *mboxq = NULL; struct lpfc_nodelist *ndlp; - struct lpfc_dmabuf *dmabuf; + struct lpfc_dmabuf *dmabuf = NULL; int rc = 0; /* move forward in case of SLI4 FC port loopback test and pt2pt mode */ @@ -471,25 +471,33 @@ lpfc_issue_reg_vfi(struct lpfc_vport *vport) } } - dmabuf = kzalloc(sizeof(struct lpfc_dmabuf), GFP_KERNEL); - if (!dmabuf) { - rc = -ENOMEM; - goto fail; - } - dmabuf->virt = lpfc_mbuf_alloc(phba, MEM_PRI, &dmabuf->phys); - if (!dmabuf->virt) { - rc = -ENOMEM; - goto fail_free_dmabuf; - } - mboxq = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); if (!mboxq) { rc = -ENOMEM; - goto fail_free_coherent; + goto fail; } + + /* Supply CSP's only if we are fabric connect or pt-to-pt connect */ + if ((vport->fc_flag & FC_FABRIC) || (vport->fc_flag & FC_PT2PT)) { + dmabuf = kzalloc(sizeof(struct lpfc_dmabuf), GFP_KERNEL); + if (!dmabuf) { + rc = -ENOMEM; + goto fail; + } + dmabuf->virt = lpfc_mbuf_alloc(phba, MEM_PRI, &dmabuf->phys); + if (!dmabuf->virt) { + rc = -ENOMEM; + goto fail; + } + memcpy(dmabuf->virt, &phba->fc_fabparam, + sizeof(struct serv_parm)); + } + vport->port_state = LPFC_FABRIC_CFG_LINK; - memcpy(dmabuf->virt, &phba->fc_fabparam, sizeof(vport->fc_sparam)); - lpfc_reg_vfi(mboxq, vport, dmabuf->phys); + if (dmabuf) + lpfc_reg_vfi(mboxq, vport, dmabuf->phys); + else + lpfc_reg_vfi(mboxq, vport, 0); mboxq->mbox_cmpl = lpfc_mbx_cmpl_reg_vfi; mboxq->vport = vport; @@ -497,17 +505,19 @@ lpfc_issue_reg_vfi(struct lpfc_vport *vport) rc = lpfc_sli_issue_mbox(phba, mboxq, MBX_NOWAIT); if (rc == MBX_NOT_FINISHED) { rc = -ENXIO; - goto fail_free_mbox; + goto fail; } return 0; -fail_free_mbox: - mempool_free(mboxq, phba->mbox_mem_pool); -fail_free_coherent: - lpfc_mbuf_free(phba, dmabuf->virt, dmabuf->phys); -fail_free_dmabuf: - kfree(dmabuf); fail: + if (mboxq) + mempool_free(mboxq, phba->mbox_mem_pool); + if (dmabuf) { + if (dmabuf->virt) + lpfc_mbuf_free(phba, dmabuf->virt, dmabuf->phys); + kfree(dmabuf); + } + lpfc_vport_set_state(vport, FC_VPORT_FAILED); lpfc_printf_vlog(vport, KERN_ERR, LOG_ELS, "0289 Issue Register VFI failed: Err %d\n", rc); @@ -711,9 +721,10 @@ lpfc_cmpl_els_flogi_fabric(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp, * For FC we need to do some special 
processing because of the SLI * Port's default settings of the Common Service Parameters. */ - if (phba->sli4_hba.lnk_info.lnk_tp == LPFC_LNK_TYPE_FC) { + if ((phba->sli_rev == LPFC_SLI_REV4) && + (phba->sli4_hba.lnk_info.lnk_tp == LPFC_LNK_TYPE_FC)) { /* If physical FC port changed, unreg VFI and ALL VPIs / RPIs */ - if ((phba->sli_rev == LPFC_SLI_REV4) && fabric_param_changed) + if (fabric_param_changed) lpfc_unregister_fcf_prep(phba); /* This should just update the VFI CSPs*/ @@ -824,13 +835,21 @@ lpfc_cmpl_els_flogi_nport(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp, spin_lock_irq(shost->host_lock); vport->fc_flag &= ~(FC_FABRIC | FC_PUBLIC_LOOP); + vport->fc_flag |= FC_PT2PT; spin_unlock_irq(shost->host_lock); - phba->fc_edtov = FF_DEF_EDTOV; - phba->fc_ratov = FF_DEF_RATOV; + /* If physical FC port changed, unreg VFI and ALL VPIs / RPIs */ + if ((phba->sli_rev == LPFC_SLI_REV4) && phba->fc_topology_changed) { + lpfc_unregister_fcf_prep(phba); + + spin_lock_irq(shost->host_lock); + vport->fc_flag &= ~FC_VFI_REGISTERED; + spin_unlock_irq(shost->host_lock); + phba->fc_topology_changed = 0; + } + rc = memcmp(&vport->fc_portname, &sp->portName, sizeof(vport->fc_portname)); - memcpy(&phba->fc_fabparam, sp, sizeof(struct serv_parm)); if (rc >= 0) { /* This side will initiate the PLOGI */ @@ -839,38 +858,14 @@ lpfc_cmpl_els_flogi_nport(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp, spin_unlock_irq(shost->host_lock); /* - * N_Port ID cannot be 0, set our to LocalID the other - * side will be RemoteID. + * N_Port ID cannot be 0, set our Id to LocalID + * the other side will be RemoteID. */ /* not equal */ if (rc) vport->fc_myDID = PT2PT_LocalID; - mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); - if (!mbox) - goto fail; - - lpfc_config_link(phba, mbox); - - mbox->mbox_cmpl = lpfc_sli_def_mbox_cmpl; - mbox->vport = vport; - rc = lpfc_sli_issue_mbox(phba, mbox, MBX_NOWAIT); - if (rc == MBX_NOT_FINISHED) { - mempool_free(mbox, phba->mbox_mem_pool); - goto fail; - } - - /* - * For SLI4, the VFI/VPI are registered AFTER the - * Nport with the higher WWPN sends the PLOGI with - * an assigned NPortId. - */ - - /* not equal */ - if ((phba->sli_rev == LPFC_SLI_REV4) && rc) - lpfc_issue_reg_vfi(vport); - /* Decrement ndlp reference count indicating that ndlp can be * safely released when other references to it are done. */ @@ -912,29 +907,20 @@ lpfc_cmpl_els_flogi_nport(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp, /* If we are pt2pt with another NPort, force NPIV off! */ phba->sli3_options &= ~LPFC_SLI3_NPIV_ENABLED; - spin_lock_irq(shost->host_lock); - vport->fc_flag |= FC_PT2PT; - spin_unlock_irq(shost->host_lock); - /* If physical FC port changed, unreg VFI and ALL VPIs / RPIs */ - if ((phba->sli_rev == LPFC_SLI_REV4) && phba->fc_topology_changed) { - lpfc_unregister_fcf_prep(phba); + mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); + if (!mbox) + goto fail; - /* The FC_VFI_REGISTERED flag will get clear in the cmpl - * handler for unreg_vfi, but if we don't force the - * FC_VFI_REGISTERED flag then the reg_vfi mailbox could be - * built with the update bit set instead of just the vp bit to - * change the Nport ID. We need to have the vp set and the - * Upd cleared on topology changes. 
- */ - spin_lock_irq(shost->host_lock); - vport->fc_flag &= ~FC_VFI_REGISTERED; - spin_unlock_irq(shost->host_lock); - phba->fc_topology_changed = 0; - lpfc_issue_reg_vfi(vport); + lpfc_config_link(phba, mbox); + + mbox->mbox_cmpl = lpfc_mbx_cmpl_local_config_link; + mbox->vport = vport; + rc = lpfc_sli_issue_mbox(phba, mbox, MBX_NOWAIT); + if (rc == MBX_NOT_FINISHED) { + mempool_free(mbox, phba->mbox_mem_pool); + goto fail; } - /* Start discovery - this should just do CLEAR_LA */ - lpfc_disc_start(vport); return 0; fail: return -ENXIO; @@ -1157,6 +1143,7 @@ flogifail: spin_lock_irq(&phba->hbalock); phba->fcf.fcf_flag &= ~FCF_DISCOVERY; spin_unlock_irq(&phba->hbalock); + lpfc_nlp_put(ndlp); if (!lpfc_error_lost_link(irsp)) { @@ -3898,6 +3885,7 @@ lpfc_els_rsp_acc(struct lpfc_vport *vport, uint32_t flag, IOCB_t *oldcmd; struct lpfc_iocbq *elsiocb; uint8_t *pcmd; + struct serv_parm *sp; uint16_t cmdsize; int rc; ELS_PKT *els_pkt_ptr; @@ -3927,6 +3915,7 @@ lpfc_els_rsp_acc(struct lpfc_vport *vport, uint32_t flag, "Issue ACC: did:x%x flg:x%x", ndlp->nlp_DID, ndlp->nlp_flag, 0); break; + case ELS_CMD_FLOGI: case ELS_CMD_PLOGI: cmdsize = (sizeof(struct serv_parm) + sizeof(uint32_t)); elsiocb = lpfc_prep_els_iocb(vport, 0, cmdsize, oldiocb->retry, @@ -3944,10 +3933,34 @@ lpfc_els_rsp_acc(struct lpfc_vport *vport, uint32_t flag, *((uint32_t *) (pcmd)) = ELS_CMD_ACC; pcmd += sizeof(uint32_t); - memcpy(pcmd, &vport->fc_sparam, sizeof(struct serv_parm)); + sp = (struct serv_parm *)pcmd; + + if (flag == ELS_CMD_FLOGI) { + /* Copy the received service parameters back */ + memcpy(sp, &phba->fc_fabparam, + sizeof(struct serv_parm)); + + /* Clear the F_Port bit */ + sp->cmn.fPort = 0; + + /* Mark all class service parameters as invalid */ + sp->cls1.classValid = 0; + sp->cls2.classValid = 0; + sp->cls3.classValid = 0; + sp->cls4.classValid = 0; + + /* Copy our worldwide names */ + memcpy(&sp->portName, &vport->fc_sparam.portName, + sizeof(struct lpfc_name)); + memcpy(&sp->nodeName, &vport->fc_sparam.nodeName, + sizeof(struct lpfc_name)); + } else { + memcpy(pcmd, &vport->fc_sparam, + sizeof(struct serv_parm)); + } lpfc_debugfs_disc_trc(vport, LPFC_DISC_TRC_ELS_RSP, - "Issue ACC PLOGI: did:x%x flg:x%x", + "Issue ACC FLOGI/PLOGI: did:x%x flg:x%x", ndlp->nlp_DID, ndlp->nlp_flag, 0); break; case ELS_CMD_PRLO: @@ -5739,7 +5752,6 @@ lpfc_els_rcv_flogi(struct lpfc_vport *vport, struct lpfc_iocbq *cmdiocb, IOCB_t *icmd = &cmdiocb->iocb; struct serv_parm *sp; LPFC_MBOXQ_t *mbox; - struct ls_rjt stat; uint32_t cmd, did; int rc; uint32_t fc_flag = 0; @@ -5765,135 +5777,92 @@ lpfc_els_rcv_flogi(struct lpfc_vport *vport, struct lpfc_iocbq *cmdiocb, return 1; } - if ((lpfc_check_sparm(vport, ndlp, sp, CLASS3, 1))) { - /* For a FLOGI we accept, then if our portname is greater - * then the remote portname we initiate Nport login. 
- */ + (void) lpfc_check_sparm(vport, ndlp, sp, CLASS3, 1); - rc = memcmp(&vport->fc_portname, &sp->portName, - sizeof(struct lpfc_name)); - if (!rc) { - if (phba->sli_rev < LPFC_SLI_REV4) { - mbox = mempool_alloc(phba->mbox_mem_pool, - GFP_KERNEL); - if (!mbox) - return 1; - lpfc_linkdown(phba); - lpfc_init_link(phba, mbox, - phba->cfg_topology, - phba->cfg_link_speed); - mbox->u.mb.un.varInitLnk.lipsr_AL_PA = 0; - mbox->mbox_cmpl = lpfc_sli_def_mbox_cmpl; - mbox->vport = vport; - rc = lpfc_sli_issue_mbox(phba, mbox, - MBX_NOWAIT); - lpfc_set_loopback_flag(phba); - if (rc == MBX_NOT_FINISHED) - mempool_free(mbox, phba->mbox_mem_pool); + /* + * If our portname is greater than the remote portname, + * then we initiate Nport login. + */ + + rc = memcmp(&vport->fc_portname, &sp->portName, + sizeof(struct lpfc_name)); + + if (!rc) { + if (phba->sli_rev < LPFC_SLI_REV4) { + mbox = mempool_alloc(phba->mbox_mem_pool, + GFP_KERNEL); + if (!mbox) return 1; - } else { - /* abort the flogi coming back to ourselves - * due to external loopback on the port. - */ - lpfc_els_abort_flogi(phba); - return 0; - } - } else if (rc > 0) { /* greater than */ - spin_lock_irq(shost->host_lock); - vport->fc_flag |= FC_PT2PT_PLOGI; - spin_unlock_irq(shost->host_lock); + lpfc_linkdown(phba); + lpfc_init_link(phba, mbox, + phba->cfg_topology, + phba->cfg_link_speed); + mbox->u.mb.un.varInitLnk.lipsr_AL_PA = 0; + mbox->mbox_cmpl = lpfc_sli_def_mbox_cmpl; + mbox->vport = vport; + rc = lpfc_sli_issue_mbox(phba, mbox, + MBX_NOWAIT); + lpfc_set_loopback_flag(phba); + if (rc == MBX_NOT_FINISHED) + mempool_free(mbox, phba->mbox_mem_pool); + return 1; + } - /* If we have the high WWPN we can assign our own - * myDID; otherwise, we have to WAIT for a PLOGI - * from the remote NPort to find out what it - * will be. - */ - vport->fc_myDID = PT2PT_LocalID; - } else - vport->fc_myDID = PT2PT_RemoteID; - - /* - * The vport state should go to LPFC_FLOGI only - * AFTER we issue a FLOGI, not receive one. + /* abort the flogi coming back to ourselves + * due to external loopback on the port. */ + lpfc_els_abort_flogi(phba); + return 0; + + } else if (rc > 0) { /* greater than */ spin_lock_irq(shost->host_lock); - fc_flag = vport->fc_flag; - port_state = vport->port_state; - vport->fc_flag |= FC_PT2PT; - vport->fc_flag &= ~(FC_FABRIC | FC_PUBLIC_LOOP); + vport->fc_flag |= FC_PT2PT_PLOGI; spin_unlock_irq(shost->host_lock); - lpfc_printf_vlog(vport, KERN_INFO, LOG_ELS, - "3311 Rcv Flogi PS x%x new PS x%x " - "fc_flag x%x new fc_flag x%x\n", - port_state, vport->port_state, - fc_flag, vport->fc_flag); - /* - * We temporarily set fc_myDID to make it look like we are - * a Fabric. This is done just so we end up with the right - * did / sid on the FLOGI ACC rsp. + /* If we have the high WWPN we can assign our own + * myDID; otherwise, we have to WAIT for a PLOGI + * from the remote NPort to find out what it + * will be. */ - did = vport->fc_myDID; - vport->fc_myDID = Fabric_DID; - + vport->fc_myDID = PT2PT_LocalID; } else { - /* Reject this request because invalid parameters */ - stat.un.b.lsRjtRsvd0 = 0; - stat.un.b.lsRjtRsnCode = LSRJT_UNABLE_TPC; - stat.un.b.lsRjtRsnCodeExp = LSEXP_SPARM_OPTIONS; - stat.un.b.vendorUnique = 0; - - /* - * We temporarily set fc_myDID to make it look like we are - * a Fabric. This is done just so we end up with the right - * did / sid on the FLOGI LS_RJT rsp. 
- */ - did = vport->fc_myDID; - vport->fc_myDID = Fabric_DID; - - lpfc_els_rsp_reject(vport, stat.un.lsRjtError, cmdiocb, ndlp, - NULL); - - /* Now lets put fc_myDID back to what its supposed to be */ - vport->fc_myDID = did; - - return 1; + vport->fc_myDID = PT2PT_RemoteID; } - /* send our FLOGI first */ - if (vport->port_state < LPFC_FLOGI) { - vport->fc_myDID = 0; - lpfc_initial_flogi(vport); - vport->fc_myDID = Fabric_DID; - } + /* + * The vport state should go to LPFC_FLOGI only + * AFTER we issue a FLOGI, not receive one. + */ + spin_lock_irq(shost->host_lock); + fc_flag = vport->fc_flag; + port_state = vport->port_state; + vport->fc_flag |= FC_PT2PT; + vport->fc_flag &= ~(FC_FABRIC | FC_PUBLIC_LOOP); + spin_unlock_irq(shost->host_lock); + lpfc_printf_vlog(vport, KERN_INFO, LOG_ELS, + "3311 Rcv Flogi PS x%x new PS x%x " + "fc_flag x%x new fc_flag x%x\n", + port_state, vport->port_state, + fc_flag, vport->fc_flag); + + /* + * We temporarily set fc_myDID to make it look like we are + * a Fabric. This is done just so we end up with the right + * did / sid on the FLOGI ACC rsp. + */ + did = vport->fc_myDID; + vport->fc_myDID = Fabric_DID; + + memcpy(&phba->fc_fabparam, sp, sizeof(struct serv_parm)); /* Send back ACC */ - lpfc_els_rsp_acc(vport, ELS_CMD_PLOGI, cmdiocb, ndlp, NULL); + lpfc_els_rsp_acc(vport, ELS_CMD_FLOGI, cmdiocb, ndlp, NULL); /* Now lets put fc_myDID back to what its supposed to be */ vport->fc_myDID = did; - if (!(vport->fc_flag & FC_PT2PT_PLOGI)) { - - mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); - if (!mbox) - goto fail; - - lpfc_config_link(phba, mbox); - - mbox->mbox_cmpl = lpfc_sli_def_mbox_cmpl; - mbox->vport = vport; - rc = lpfc_sli_issue_mbox(phba, mbox, MBX_NOWAIT); - if (rc == MBX_NOT_FINISHED) { - mempool_free(mbox, phba->mbox_mem_pool); - goto fail; - } - } - return 0; -fail: - return 1; } /** @@ -7345,7 +7314,7 @@ lpfc_els_unsol_buffer(struct lpfc_hba *phba, struct lpfc_sli_ring *pring, /* reject till our FLOGI completes */ if ((vport->port_state < LPFC_FABRIC_CFG_LINK) && - (cmd != ELS_CMD_FLOGI)) { + (cmd != ELS_CMD_FLOGI)) { rjt_err = LSRJT_UNABLE_TPC; rjt_exp = LSEXP_NOTHING_MORE; goto lsrjt; @@ -7381,6 +7350,7 @@ lpfc_els_unsol_buffer(struct lpfc_hba *phba, struct lpfc_sli_ring *pring, rjt_exp = LSEXP_NOTHING_MORE; break; } + if (vport->port_state < LPFC_DISC_AUTH) { if (!(phba->pport->fc_flag & FC_PT2PT) || (phba->pport->fc_flag & FC_PT2PT_PLOGI)) { diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index bfc2442dd74a..c96532cc5af0 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -1083,7 +1083,7 @@ out: } -static void +void lpfc_mbx_cmpl_local_config_link(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb) { struct lpfc_vport *vport = pmb->vport; @@ -1113,8 +1113,10 @@ lpfc_mbx_cmpl_local_config_link(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb) /* Start discovery by sending a FLOGI. 
port_state is identically * LPFC_FLOGI while waiting for FLOGI cmpl */ - if (vport->port_state != LPFC_FLOGI || vport->fc_flag & FC_PT2PT_PLOGI) + if (vport->port_state != LPFC_FLOGI) lpfc_initial_flogi(vport); + else if (vport->fc_flag & FC_PT2PT) + lpfc_disc_start(vport); return; out: @@ -2963,8 +2965,10 @@ lpfc_mbx_cmpl_reg_vfi(struct lpfc_hba *phba, LPFC_MBOXQ_t *mboxq) out_free_mem: mempool_free(mboxq, phba->mbox_mem_pool); - lpfc_mbuf_free(phba, dmabuf->virt, dmabuf->phys); - kfree(dmabuf); + if (dmabuf) { + lpfc_mbuf_free(phba, dmabuf->virt, dmabuf->phys); + kfree(dmabuf); + } return; } diff --git a/drivers/scsi/lpfc/lpfc_nportdisc.c b/drivers/scsi/lpfc/lpfc_nportdisc.c index ed9a2c80c4aa..daeda6d7fb25 100644 --- a/drivers/scsi/lpfc/lpfc_nportdisc.c +++ b/drivers/scsi/lpfc/lpfc_nportdisc.c @@ -280,38 +280,12 @@ lpfc_rcv_plogi(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp, uint32_t *lp; IOCB_t *icmd; struct serv_parm *sp; + uint32_t ed_tov; LPFC_MBOXQ_t *mbox; struct ls_rjt stat; int rc; memset(&stat, 0, sizeof (struct ls_rjt)); - if (vport->port_state <= LPFC_FDISC) { - /* Before responding to PLOGI, check for pt2pt mode. - * If we are pt2pt, with an outstanding FLOGI, abort - * the FLOGI and resend it first. - */ - if (vport->fc_flag & FC_PT2PT) { - lpfc_els_abort_flogi(phba); - if (!(vport->fc_flag & FC_PT2PT_PLOGI)) { - /* If the other side is supposed to initiate - * the PLOGI anyway, just ACC it now and - * move on with discovery. - */ - phba->fc_edtov = FF_DEF_EDTOV; - phba->fc_ratov = FF_DEF_RATOV; - /* Start discovery - this should just do - CLEAR_LA */ - lpfc_disc_start(vport); - } else - lpfc_initial_flogi(vport); - } else { - stat.un.b.lsRjtRsnCode = LSRJT_LOGICAL_BSY; - stat.un.b.lsRjtRsnCodeExp = LSEXP_NOTHING_MORE; - lpfc_els_rsp_reject(vport, stat.un.lsRjtError, cmdiocb, - ndlp, NULL); - return 0; - } - } pcmd = (struct lpfc_dmabuf *) cmdiocb->context2; lp = (uint32_t *) pcmd->virt; sp = (struct serv_parm *) ((uint8_t *) lp + sizeof (uint32_t)); @@ -404,30 +378,46 @@ lpfc_rcv_plogi(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp, /* Check for Nport to NPort pt2pt protocol */ if ((vport->fc_flag & FC_PT2PT) && !(vport->fc_flag & FC_PT2PT_PLOGI)) { - /* rcv'ed PLOGI decides what our NPortId will be */ vport->fc_myDID = icmd->un.rcvels.parmRo; - mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); - if (mbox == NULL) - goto out; - lpfc_config_link(phba, mbox); - mbox->mbox_cmpl = lpfc_sli_def_mbox_cmpl; - mbox->vport = vport; - rc = lpfc_sli_issue_mbox(phba, mbox, MBX_NOWAIT); - if (rc == MBX_NOT_FINISHED) { - mempool_free(mbox, phba->mbox_mem_pool); - goto out; + + ed_tov = be32_to_cpu(sp->cmn.e_d_tov); + if (sp->cmn.edtovResolution) { + /* E_D_TOV ticks are in nanoseconds */ + ed_tov = (phba->fc_edtov + 999999) / 1000000; } + /* - * For SLI4, the VFI/VPI are registered AFTER the - * Nport with the higher WWPN sends us a PLOGI with - * our assigned NPortId. 
+ * For pt-to-pt, use the larger EDTOV + * RATOV = 2 * EDTOV */ + if (ed_tov > phba->fc_edtov) + phba->fc_edtov = ed_tov; + phba->fc_ratov = (2 * phba->fc_edtov) / 1000; + + memcpy(&phba->fc_fabparam, sp, sizeof(struct serv_parm)); + + /* Issue config_link / reg_vfi to account for updated TOV's */ + if (phba->sli_rev == LPFC_SLI_REV4) lpfc_issue_reg_vfi(vport); + else { + mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); + if (mbox == NULL) + goto out; + lpfc_config_link(phba, mbox); + mbox->mbox_cmpl = lpfc_sli_def_mbox_cmpl; + mbox->vport = vport; + rc = lpfc_sli_issue_mbox(phba, mbox, MBX_NOWAIT); + if (rc == MBX_NOT_FINISHED) { + mempool_free(mbox, phba->mbox_mem_pool); + goto out; + } + } lpfc_can_disctmo(vport); } + mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); if (!mbox) goto out; @@ -1038,7 +1028,9 @@ lpfc_cmpl_plogi_plogi_issue(struct lpfc_vport *vport, uint32_t *lp; IOCB_t *irsp; struct serv_parm *sp; + uint32_t ed_tov; LPFC_MBOXQ_t *mbox; + int rc; cmdiocb = (struct lpfc_iocbq *) arg; rspiocb = cmdiocb->context_un.rsp_iocb; @@ -1053,6 +1045,16 @@ lpfc_cmpl_plogi_plogi_issue(struct lpfc_vport *vport, if (irsp->ulpStatus) goto out; + mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); + if (!mbox) { + lpfc_printf_vlog(vport, KERN_ERR, LOG_ELS, + "0133 PLOGI: no memory for reg_login " + "Data: x%x x%x x%x x%x\n", + ndlp->nlp_DID, ndlp->nlp_state, + ndlp->nlp_flag, ndlp->nlp_rpi); + goto out; + } + pcmd = (struct lpfc_dmabuf *) cmdiocb->context2; prsp = list_get_first(&pcmd->list, struct lpfc_dmabuf, list); @@ -1094,14 +1096,38 @@ lpfc_cmpl_plogi_plogi_issue(struct lpfc_vport *vport, ndlp->nlp_maxframe = ((sp->cmn.bbRcvSizeMsb & 0x0F) << 8) | sp->cmn.bbRcvSizeLsb; - mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); - if (!mbox) { - lpfc_printf_vlog(vport, KERN_ERR, LOG_ELS, - "0133 PLOGI: no memory for reg_login " - "Data: x%x x%x x%x x%x\n", - ndlp->nlp_DID, ndlp->nlp_state, - ndlp->nlp_flag, ndlp->nlp_rpi); - goto out; + if ((vport->fc_flag & FC_PT2PT) && + (vport->fc_flag & FC_PT2PT_PLOGI)) { + ed_tov = be32_to_cpu(sp->cmn.e_d_tov); + if (sp->cmn.edtovResolution) { + /* E_D_TOV ticks are in nanoseconds */ + ed_tov = (phba->fc_edtov + 999999) / 1000000; + } + + /* + * Use the larger EDTOV + * RATOV = 2 * EDTOV for pt-to-pt + */ + if (ed_tov > phba->fc_edtov) + phba->fc_edtov = ed_tov; + phba->fc_ratov = (2 * phba->fc_edtov) / 1000; + + memcpy(&phba->fc_fabparam, sp, sizeof(struct serv_parm)); + + /* Issue config_link / reg_vfi to account for updated TOV's */ + if (phba->sli_rev == LPFC_SLI_REV4) { + lpfc_issue_reg_vfi(vport); + } else { + lpfc_config_link(phba, mbox); + + mbox->mbox_cmpl = lpfc_sli_def_mbox_cmpl; + mbox->vport = vport; + rc = lpfc_sli_issue_mbox(phba, mbox, MBX_NOWAIT); + if (rc == MBX_NOT_FINISHED) { + mempool_free(mbox, phba->mbox_mem_pool); + goto out; + } + } } lpfc_unreg_rpi(vport, ndlp); From 0217efc5b83a7443c2234e5426493b2d06b7067c Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 16 Dec 2015 18:11:55 -0500 Subject: [PATCH 625/813] lpfc: Fix RegLogin failed error seen on Lancer FC during port bounce [ Upstream commit 4b7789b71c916f79a3366da080101014473234c3 ] Fix RegLogin failed error seen on Lancer FC during port bounce Fix the statemachine and ref counting. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinicke Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/lpfc/lpfc_els.c | 14 +++++++++----- drivers/scsi/lpfc/lpfc_hbadisc.c | 8 ++++---- drivers/scsi/lpfc/lpfc_nportdisc.c | 3 +++ 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index f6dd15b22383..d508378510f1 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -3779,14 +3779,17 @@ lpfc_cmpl_els_rsp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, lpfc_nlp_set_state(vport, ndlp, NLP_STE_REG_LOGIN_ISSUE); } + + ndlp->nlp_flag |= NLP_REG_LOGIN_SEND; if (lpfc_sli_issue_mbox(phba, mbox, MBX_NOWAIT) != MBX_NOT_FINISHED) goto out; - else - /* Decrement the ndlp reference count we - * set for this failed mailbox command. - */ - lpfc_nlp_put(ndlp); + + /* Decrement the ndlp reference count we + * set for this failed mailbox command. + */ + lpfc_nlp_put(ndlp); + ndlp->nlp_flag &= ~NLP_REG_LOGIN_SEND; /* ELS rsp: Cannot issue reg_login for */ lpfc_printf_vlog(vport, KERN_ERR, LOG_ELS, @@ -3843,6 +3846,7 @@ out: * the routine lpfc_els_free_iocb. */ cmdiocb->context1 = NULL; + } lpfc_els_free_iocb(phba, cmdiocb); diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index c96532cc5af0..d3668aa555d5 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -3452,10 +3452,10 @@ lpfc_mbx_cmpl_reg_login(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb) spin_lock_irq(shost->host_lock); ndlp->nlp_flag &= ~NLP_IGNR_REG_CMPL; spin_unlock_irq(shost->host_lock); - } else - /* Good status, call state machine */ - lpfc_disc_state_machine(vport, ndlp, pmb, - NLP_EVT_CMPL_REG_LOGIN); + } + + /* Call state machine */ + lpfc_disc_state_machine(vport, ndlp, pmb, NLP_EVT_CMPL_REG_LOGIN); lpfc_mbuf_free(phba, mp->virt, mp->phys); kfree(mp); diff --git a/drivers/scsi/lpfc/lpfc_nportdisc.c b/drivers/scsi/lpfc/lpfc_nportdisc.c index daeda6d7fb25..9e571dd41687 100644 --- a/drivers/scsi/lpfc/lpfc_nportdisc.c +++ b/drivers/scsi/lpfc/lpfc_nportdisc.c @@ -2325,6 +2325,9 @@ lpfc_cmpl_reglogin_npr_node(struct lpfc_vport *vport, if (vport->phba->sli_rev < LPFC_SLI_REV4) ndlp->nlp_rpi = mb->un.varWords[0]; ndlp->nlp_flag |= NLP_RPI_REGISTERED; + if (ndlp->nlp_flag & NLP_LOGO_ACC) { + lpfc_unreg_rpi(vport, ndlp); + } } else { if (ndlp->nlp_flag & NLP_NODEV_REMOVE) { lpfc_drop_node(vport, ndlp); From 7cf5c223ccf949e6d5ef6b87c33edcad67a8ebbe Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 16 Dec 2015 18:11:56 -0500 Subject: [PATCH 626/813] lpfc: Fix driver crash when module parameter lpfc_fcp_io_channel set to 16 [ Upstream commit 6690e0d4fc5cccf74534abe0c9f9a69032bc02f0 ] Fix driver crash when module parameter lpfc_fcp_io_channel set to 16 Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinicke Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/lpfc/lpfc_init.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index b0d92b84bcdc..c14ab6c3ae40 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -8834,9 +8834,12 @@ found: * already mapped to this phys_id. 
*/ if (cpup->irq != LPFC_VECTOR_MAP_EMPTY) { - chann[saved_chann] = - cpup->channel_id; - saved_chann++; + if (saved_chann <= + LPFC_FCP_IO_CHAN_MAX) { + chann[saved_chann] = + cpup->channel_id; + saved_chann++; + } goto out; } From 6b9bd78515c42966540c1edf11253d76fd57eb14 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 16 Dec 2015 18:11:57 -0500 Subject: [PATCH 627/813] lpfc: Fix crash in fcp command completion path. [ Upstream commit c90261dcd86e4eb5c9c1627fde037e902db8aefa ] Fix crash in fcp command completion path. Missed null check. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinicke Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/lpfc/lpfc_scsi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c index 9e165bc05ee1..bae36cc3740b 100644 --- a/drivers/scsi/lpfc/lpfc_scsi.c +++ b/drivers/scsi/lpfc/lpfc_scsi.c @@ -3908,9 +3908,9 @@ lpfc_scsi_cmd_iocb_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *pIocbIn, uint32_t logit = LOG_FCP; /* Sanity check on return of outstanding command */ - if (!(lpfc_cmd->pCmd)) - return; cmd = lpfc_cmd->pCmd; + if (!cmd) + return; shost = cmd->device->host; lpfc_cmd->result = (pIocbOut->iocb.un.ulpWord[4] & IOERR_PARAM_MASK); From 13c61f04687d95b650e2ac7091b7a221d0caf1f0 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 16 Dec 2015 18:11:59 -0500 Subject: [PATCH 628/813] lpfc: Fix RDP Speed reporting. [ Upstream commit 81e7517723fc17396ba91f59312b3177266ddbda ] Fix RDP Speed reporting. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinicke Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/lpfc/lpfc_els.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index d508378510f1..59ced8864b2f 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -4698,28 +4698,25 @@ lpfc_rdp_res_speed(struct fc_rdp_port_speed_desc *desc, struct lpfc_hba *phba) desc->tag = cpu_to_be32(RDP_PORT_SPEED_DESC_TAG); - switch (phba->sli4_hba.link_state.speed) { - case LPFC_FC_LA_SPEED_1G: + switch (phba->fc_linkspeed) { + case LPFC_LINK_SPEED_1GHZ: rdp_speed = RDP_PS_1GB; break; - case LPFC_FC_LA_SPEED_2G: + case LPFC_LINK_SPEED_2GHZ: rdp_speed = RDP_PS_2GB; break; - case LPFC_FC_LA_SPEED_4G: + case LPFC_LINK_SPEED_4GHZ: rdp_speed = RDP_PS_4GB; break; - case LPFC_FC_LA_SPEED_8G: + case LPFC_LINK_SPEED_8GHZ: rdp_speed = RDP_PS_8GB; break; - case LPFC_FC_LA_SPEED_10G: + case LPFC_LINK_SPEED_10GHZ: rdp_speed = RDP_PS_10GB; break; - case LPFC_FC_LA_SPEED_16G: + case LPFC_LINK_SPEED_16GHZ: rdp_speed = RDP_PS_16GB; break; - case LPFC_FC_LA_SPEED_32G: - rdp_speed = RDP_PS_32GB; - break; default: rdp_speed = RDP_PS_UNKNOWN; break; From 10103c5228e85f2878ff77cce36afe3c05468984 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 16 Dec 2015 18:12:03 -0500 Subject: [PATCH 629/813] lpfc: Fix mbox reuse in PLOGI completion [ Upstream commit 01c73bbcd7cc4f31f45a1b0caeacdba46acd9c9c ] Fix mbox reuse in PLOGI completion. Moved allocations so that buffer properly init'd. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinicke Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/lpfc/lpfc_nportdisc.c | 31 ++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_nportdisc.c b/drivers/scsi/lpfc/lpfc_nportdisc.c index 9e571dd41687..193733e8c823 100644 --- a/drivers/scsi/lpfc/lpfc_nportdisc.c +++ b/drivers/scsi/lpfc/lpfc_nportdisc.c @@ -1045,16 +1045,6 @@ lpfc_cmpl_plogi_plogi_issue(struct lpfc_vport *vport, if (irsp->ulpStatus) goto out; - mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); - if (!mbox) { - lpfc_printf_vlog(vport, KERN_ERR, LOG_ELS, - "0133 PLOGI: no memory for reg_login " - "Data: x%x x%x x%x x%x\n", - ndlp->nlp_DID, ndlp->nlp_state, - ndlp->nlp_flag, ndlp->nlp_rpi); - goto out; - } - pcmd = (struct lpfc_dmabuf *) cmdiocb->context2; prsp = list_get_first(&pcmd->list, struct lpfc_dmabuf, list); @@ -1118,6 +1108,17 @@ lpfc_cmpl_plogi_plogi_issue(struct lpfc_vport *vport, if (phba->sli_rev == LPFC_SLI_REV4) { lpfc_issue_reg_vfi(vport); } else { + mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); + if (!mbox) { + lpfc_printf_vlog(vport, KERN_ERR, LOG_ELS, + "0133 PLOGI: no memory " + "for config_link " + "Data: x%x x%x x%x x%x\n", + ndlp->nlp_DID, ndlp->nlp_state, + ndlp->nlp_flag, ndlp->nlp_rpi); + goto out; + } + lpfc_config_link(phba, mbox); mbox->mbox_cmpl = lpfc_sli_def_mbox_cmpl; @@ -1132,6 +1133,16 @@ lpfc_cmpl_plogi_plogi_issue(struct lpfc_vport *vport, lpfc_unreg_rpi(vport, ndlp); + mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); + if (!mbox) { + lpfc_printf_vlog(vport, KERN_ERR, LOG_ELS, + "0018 PLOGI: no memory for reg_login " + "Data: x%x x%x x%x x%x\n", + ndlp->nlp_DID, ndlp->nlp_state, + ndlp->nlp_flag, ndlp->nlp_rpi); + goto out; + } + if (lpfc_reg_rpi(phba, vport->vpi, irsp->un.elsreq64.remoteID, (uint8_t *) sp, mbox, ndlp->nlp_rpi) == 0) { switch (ndlp->nlp_DID) { From 2dfdfa34392530885c09f50caccf8fa4053f6cde Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 16 Dec 2015 18:12:04 -0500 Subject: [PATCH 630/813] lpfc: Fix external loopback failure. [ Upstream commit 4360ca9c24388e44cb0e14861a62fff43cf225c0 ] Fix external loopback failure. Rx sequence reassembly was incorrect. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinicke Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/lpfc/lpfc_sli.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 6aae828208e2..92dfd6a5178c 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -14842,10 +14842,12 @@ lpfc_fc_frame_add(struct lpfc_vport *vport, struct hbq_dmabuf *dmabuf) struct lpfc_dmabuf *h_buf; struct hbq_dmabuf *seq_dmabuf = NULL; struct hbq_dmabuf *temp_dmabuf = NULL; + uint8_t found = 0; INIT_LIST_HEAD(&dmabuf->dbuf.list); dmabuf->time_stamp = jiffies; new_hdr = (struct fc_frame_header *)dmabuf->hbuf.virt; + /* Use the hdr_buf to find the sequence that this frame belongs to */ list_for_each_entry(h_buf, &vport->rcv_buffer_list, list) { temp_hdr = (struct fc_frame_header *)h_buf->virt; @@ -14885,7 +14887,8 @@ lpfc_fc_frame_add(struct lpfc_vport *vport, struct hbq_dmabuf *dmabuf) return seq_dmabuf; } /* find the correct place in the sequence to insert this frame */ - list_for_each_entry_reverse(d_buf, &seq_dmabuf->dbuf.list, list) { + d_buf = list_entry(seq_dmabuf->dbuf.list.prev, typeof(*d_buf), list); + while (!found) { temp_dmabuf = container_of(d_buf, struct hbq_dmabuf, dbuf); temp_hdr = (struct fc_frame_header *)temp_dmabuf->hbuf.virt; /* @@ -14895,9 +14898,17 @@ lpfc_fc_frame_add(struct lpfc_vport *vport, struct hbq_dmabuf *dmabuf) if (be16_to_cpu(new_hdr->fh_seq_cnt) > be16_to_cpu(temp_hdr->fh_seq_cnt)) { list_add(&dmabuf->dbuf.list, &temp_dmabuf->dbuf.list); - return seq_dmabuf; + found = 1; + break; } + + if (&d_buf->list == &seq_dmabuf->dbuf.list) + break; + d_buf = list_entry(d_buf->list.prev, typeof(*d_buf), list); } + + if (found) + return seq_dmabuf; return NULL; } From 2b2c611c802b1c73ee346f3f213f4356bd6510dd Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Fri, 11 Dec 2015 12:27:55 +0100 Subject: [PATCH 631/813] qeth: initialize net_device with carrier off [ Upstream commit e5ebe63214d44d4dcf43df02edf3613e04d671b9 ] /sys/class/net//operstate for an active qeth network interface often shows "unknown", which translates to "state UNKNOWN" in the output of "ip link show". It is caused by a missing initialization of the __LINK_STATE_NOCARRIER bit in the net_device state field. This patch adds a netif_carrier_off() invocation when creating the net_device for a qeth device. Signed-off-by: Ursula Braun Acked-by: Hendrik Brueckner Reference-ID: Bugzilla 133209 Signed-off-by: David S.
Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/s390/net/qeth_l2_main.c | 1 + drivers/s390/net/qeth_l3_main.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 12b2cb7769f9..df036b872b05 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -1127,6 +1127,7 @@ static int qeth_l2_setup_netdev(struct qeth_card *card) qeth_l2_request_initial_mac(card); SET_NETDEV_DEV(card->dev, &card->gdev->dev); netif_napi_add(card->dev, &card->napi, qeth_l2_poll, QETH_NAPI_WEIGHT); + netif_carrier_off(card->dev); return register_netdev(card->dev); } diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 50cec6b13d27..cc4d3c3d8cc5 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -3220,6 +3220,7 @@ static int qeth_l3_setup_netdev(struct qeth_card *card) SET_NETDEV_DEV(card->dev, &card->gdev->dev); netif_napi_add(card->dev, &card->napi, qeth_l3_poll, QETH_NAPI_WEIGHT); + netif_carrier_off(card->dev); return register_netdev(card->dev); } From 10de05f6c3cd24bed1832d205e94da2c3cee324e Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Mon, 25 Jan 2016 10:30:27 +0100 Subject: [PATCH 632/813] s390/cio: fix measurement characteristics memleak [ Upstream commit 0d9bfe9123cfde59bf5c2e375b59d2a7d5061c4c ] Measurement characteristics are allocated during channel path registration but not freed during deregistration. Fix this by embedding these characteristics inside struct channel_path. Signed-off-by: Sebastian Ott Reviewed-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/s390/cio/chp.c | 6 +++--- drivers/s390/cio/chp.h | 2 +- drivers/s390/cio/chsc.c | 16 ++-------------- 3 files changed, 6 insertions(+), 18 deletions(-) diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c index c692dfebd0ba..3d2b6c48c18e 100644 --- a/drivers/s390/cio/chp.c +++ b/drivers/s390/cio/chp.c @@ -139,11 +139,11 @@ static ssize_t chp_measurement_chars_read(struct file *filp, device = container_of(kobj, struct device, kobj); chp = to_channelpath(device); - if (!chp->cmg_chars) + if (chp->cmg == -1) return 0; - return memory_read_from_buffer(buf, count, &off, - chp->cmg_chars, sizeof(struct cmg_chars)); + return memory_read_from_buffer(buf, count, &off, &chp->cmg_chars, + sizeof(chp->cmg_chars)); } static struct bin_attribute chp_measurement_chars_attr = { diff --git a/drivers/s390/cio/chp.h b/drivers/s390/cio/chp.h index 4efd5b867cc3..af0232290dc4 100644 --- a/drivers/s390/cio/chp.h +++ b/drivers/s390/cio/chp.h @@ -48,7 +48,7 @@ struct channel_path { /* Channel-measurement related stuff: */ int cmg; int shared; - void *cmg_chars; + struct cmg_chars cmg_chars; }; /* Return channel_path struct for given chpid. 
*/ diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c index a831d18596a5..5df0efee54db 100644 --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -967,22 +967,19 @@ static void chsc_initialize_cmg_chars(struct channel_path *chp, u8 cmcv, struct cmg_chars *chars) { - struct cmg_chars *cmg_chars; int i, mask; - cmg_chars = chp->cmg_chars; for (i = 0; i < NR_MEASUREMENT_CHARS; i++) { mask = 0x80 >> (i + 3); if (cmcv & mask) - cmg_chars->values[i] = chars->values[i]; + chp->cmg_chars.values[i] = chars->values[i]; else - cmg_chars->values[i] = 0; + chp->cmg_chars.values[i] = 0; } } int chsc_get_channel_measurement_chars(struct channel_path *chp) { - struct cmg_chars *cmg_chars; int ccode, ret; struct { @@ -1006,11 +1003,6 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp) u32 data[NR_MEASUREMENT_CHARS]; } __attribute__ ((packed)) *scmc_area; - chp->cmg_chars = NULL; - cmg_chars = kmalloc(sizeof(*cmg_chars), GFP_KERNEL); - if (!cmg_chars) - return -ENOMEM; - spin_lock_irq(&chsc_page_lock); memset(chsc_page, 0, PAGE_SIZE); scmc_area = chsc_page; @@ -1042,14 +1034,10 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp) /* No cmg-dependent data. */ goto out; } - chp->cmg_chars = cmg_chars; chsc_initialize_cmg_chars(chp, scmc_area->cmcv, (struct cmg_chars *) &scmc_area->data); out: spin_unlock_irq(&chsc_page_lock); - if (!chp->cmg_chars) - kfree(cmg_chars); - return ret; } From eb9f26c99639de3b04c89085855196f56b3f69ea Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Mon, 25 Jan 2016 10:31:33 +0100 Subject: [PATCH 633/813] s390/cio: ensure consistent measurement state [ Upstream commit 61f0bfcf8020f02eb09adaef96745d1c1d1b3623 ] Make sure that in all cases where we could not obtain measurement characteristics the associated fields are set to invalid values. Note: without this change the "shared" capability of a channel path for which we could not obtain the measurement characteristics was incorrectly displayed as 0 (not shared). We will now correctly report "unknown" in this case. Signed-off-by: Sebastian Ott Reviewed-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/s390/cio/chp.c | 13 +++++-------- drivers/s390/cio/chsc.c | 12 ++++++++---- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c index 3d2b6c48c18e..8504629cbf72 100644 --- a/drivers/s390/cio/chp.c +++ b/drivers/s390/cio/chp.c @@ -466,14 +466,11 @@ int chp_new(struct chp_id chpid) ret = -ENODEV; goto out_free; } - /* Get channel-measurement characteristics. 
*/ - if (css_chsc_characteristics.scmc && css_chsc_characteristics.secm) { - ret = chsc_get_channel_measurement_chars(chp); - if (ret) - goto out_free; - } else { - chp->cmg = -1; - } + + ret = chsc_get_channel_measurement_chars(chp); + if (ret) + goto out_free; + dev_set_name(&chp->dev, "chp%x.%02x", chpid.cssid, chpid.id); /* make it known to the system */ diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c index 5df0efee54db..13747c510c0b 100644 --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -1003,6 +1003,12 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp) u32 data[NR_MEASUREMENT_CHARS]; } __attribute__ ((packed)) *scmc_area; + chp->shared = -1; + chp->cmg = -1; + + if (!css_chsc_characteristics.scmc || !css_chsc_characteristics.secm) + return 0; + spin_lock_irq(&chsc_page_lock); memset(chsc_page, 0, PAGE_SIZE); scmc_area = chsc_page; @@ -1023,11 +1029,9 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp) scmc_area->response.code); goto out; } - if (scmc_area->not_valid) { - chp->cmg = -1; - chp->shared = -1; + if (scmc_area->not_valid) goto out; - } + chp->cmg = scmc_area->cmg; chp->shared = scmc_area->shared; if (chp->cmg != 2 && chp->cmg != 3) { From 9eccfa31f0024f0407a122b1afdc91bba52cf511 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Mon, 25 Jan 2016 10:32:51 +0100 Subject: [PATCH 634/813] s390/cio: update measurement characteristics [ Upstream commit 9f3d6d7a40a178b8a5b5274f4e55fec8c30147c9 ] Per channel path measurement characteristics are obtained during channel path registration. However if some properties of a channel path change we don't update the measurement characteristics. Make sure to update the characteristics when we change the properties of a channel path or receive a notification from FW about such a change. Signed-off-by: Sebastian Ott Reviewed-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/s390/cio/chp.c | 12 +++++------- drivers/s390/cio/chsc.c | 17 +++++++++++++++-- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c index 8504629cbf72..50597f9522fe 100644 --- a/drivers/s390/cio/chp.c +++ b/drivers/s390/cio/chp.c @@ -416,7 +416,8 @@ static void chp_release(struct device *dev) * chp_update_desc - update channel-path description * @chp - channel-path * - * Update the channel-path description of the specified channel-path. + * Update the channel-path description of the specified channel-path + * including channel measurement related information. * Return zero on success, non-zero otherwise. 
*/ int chp_update_desc(struct channel_path *chp) @@ -428,8 +429,10 @@ return rc; rc = chsc_determine_fmt1_channel_path_desc(chp->chpid, &chp->desc_fmt1); + if (rc) + return rc; - return rc; + return chsc_get_channel_measurement_chars(chp); } /** @@ -466,11 +469,6 @@ int chp_new(struct chp_id chpid) ret = -ENODEV; goto out_free; } - - ret = chsc_get_channel_measurement_chars(chp); - if (ret) - goto out_free; - dev_set_name(&chp->dev, "chp%x.%02x", chpid.cssid, chpid.id); /* make it known to the system */ diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c index 13747c510c0b..c424c0c7367e 100644 --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -224,8 +225,9 @@ out_unreg: void chsc_chp_offline(struct chp_id chpid) { - char dbf_txt[15]; + struct channel_path *chp = chpid_to_chp(chpid); struct chp_link link; + char dbf_txt[15]; sprintf(dbf_txt, "chpr%x.%02x", chpid.cssid, chpid.id); CIO_TRACE_EVENT(2, dbf_txt); @@ -236,6 +238,11 @@ void chsc_chp_offline(struct chp_id chpid) link.chpid = chpid; /* Wait until previous actions have settled. */ css_wait_for_slow_path(); + + mutex_lock(&chp->lock); + chp_update_desc(chp); + mutex_unlock(&chp->lock); + for_each_subchannel_staged(s390_subchannel_remove_chpid, NULL, &link); } @@ -690,8 +697,9 @@ static void chsc_process_crw(struct crw *crw0, struct crw *crw1, int overflow) void chsc_chp_online(struct chp_id chpid) { - char dbf_txt[15]; + struct channel_path *chp = chpid_to_chp(chpid); struct chp_link link; + char dbf_txt[15]; sprintf(dbf_txt, "cadd%x.%02x", chpid.cssid, chpid.id); CIO_TRACE_EVENT(2, dbf_txt); @@ -701,6 +709,11 @@ void chsc_chp_online(struct chp_id chpid) link.chpid = chpid; /* Wait until previous actions have settled. */ css_wait_for_slow_path(); + + mutex_lock(&chp->lock); + chp_update_desc(chp); + mutex_unlock(&chp->lock); + for_each_subchannel_staged(__s390_process_res_acc, NULL, &link); css_schedule_reprobe(); From f7aeba6bd2bf61cc397d5e7dcc9eed2020a14661 Mon Sep 17 00:00:00 2001 From: Sumit Saxena Date: Thu, 28 Jan 2016 21:04:22 +0530 Subject: [PATCH 635/813] megaraid_sas: Do not allow PCI access during OCR [ Upstream commit 11c71cb4ab7cd901b9d6f0ff267c102778c1c8ef ] This patch will do synchronization between OCR function and AEN function using "reset_mutex" lock. reset_mutex will be acquired only in the first half of the AEN function which issues a DCMD. Second half of the function which calls SCSI API (scsi_add_device/scsi_remove_device) should be out of reset_mutex to avoid deadlock between scsi_eh thread and driver. During chip reset (inside OCR function), there should not be any PCI access and AEN function (which is called in delayed context) may be firing DCMDs (doing PCI writes) when chip reset is happening in parallel which will cause a FW fault. This patch will solve the problem by making AEN thread and OCR thread mutually exclusive. Signed-off-by: Sumit Saxena Signed-off-by: Kashyap Desai Reviewed-by: Tomas Henzl Signed-off-by: Martin K.
Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/megaraid/megaraid_sas.h | 2 + drivers/scsi/megaraid/megaraid_sas_base.c | 280 +++++++--------------- 2 files changed, 95 insertions(+), 187 deletions(-) diff --git a/drivers/scsi/megaraid/megaraid_sas.h b/drivers/scsi/megaraid/megaraid_sas.h index c0f7c8ce54aa..ef4ff03242ea 100644 --- a/drivers/scsi/megaraid/megaraid_sas.h +++ b/drivers/scsi/megaraid/megaraid_sas.h @@ -1083,6 +1083,8 @@ struct megasas_ctrl_info { #define VD_EXT_DEBUG 0 +#define SCAN_PD_CHANNEL 0x1 +#define SCAN_VD_CHANNEL 0x2 enum MR_SCSI_CMD_TYPE { READ_WRITE_LDIO = 0, diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index e994ff944091..8ab174694e19 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -5476,7 +5476,6 @@ static int megasas_probe_one(struct pci_dev *pdev, spin_lock_init(&instance->hba_lock); spin_lock_init(&instance->completion_lock); - mutex_init(&instance->aen_mutex); mutex_init(&instance->reset_mutex); /* @@ -6443,10 +6442,10 @@ static int megasas_mgmt_ioctl_aen(struct file *file, unsigned long arg) } spin_unlock_irqrestore(&instance->hba_lock, flags); - mutex_lock(&instance->aen_mutex); + mutex_lock(&instance->reset_mutex); error = megasas_register_aen(instance, aen.seq_num, aen.class_locale_word); - mutex_unlock(&instance->aen_mutex); + mutex_unlock(&instance->reset_mutex); return error; } @@ -6648,6 +6647,7 @@ megasas_aen_polling(struct work_struct *work) int i, j, doscan = 0; u32 seq_num, wait_time = MEGASAS_RESET_WAIT_TIME; int error; + u8 dcmd_ret = 0; if (!instance) { printk(KERN_ERR "invalid instance!\n"); @@ -6660,16 +6660,7 @@ megasas_aen_polling(struct work_struct *work) wait_time = MEGASAS_ROUTINE_WAIT_TIME_VF; /* Don't run the event workqueue thread if OCR is running */ - for (i = 0; i < wait_time; i++) { - if (instance->adprecovery == MEGASAS_HBA_OPERATIONAL) - break; - if (!(i % MEGASAS_RESET_NOTICE_INTERVAL)) { - dev_notice(&instance->pdev->dev, "%s waiting for " - "controller reset to finish for scsi%d\n", - __func__, instance->host->host_no); - } - msleep(1000); - } + mutex_lock(&instance->reset_mutex); instance->ev = NULL; host = instance->host; @@ -6677,47 +6668,73 @@ megasas_aen_polling(struct work_struct *work) megasas_decode_evt(instance); switch (le32_to_cpu(instance->evt_detail->code)) { + case MR_EVT_PD_INSERTED: - if (megasas_get_pd_list(instance) == 0) { - for (i = 0; i < MEGASAS_MAX_PD_CHANNELS; i++) { - for (j = 0; - j < MEGASAS_MAX_DEV_PER_CHANNEL; - j++) { - - pd_index = - (i * MEGASAS_MAX_DEV_PER_CHANNEL) + j; - - sdev1 = scsi_device_lookup(host, i, j, 0); - - if (instance->pd_list[pd_index].driveState - == MR_PD_STATE_SYSTEM) { - if (!sdev1) - scsi_add_device(host, i, j, 0); - - if (sdev1) - scsi_device_put(sdev1); - } - } - } - } - doscan = 0; + case MR_EVT_PD_REMOVED: + dcmd_ret = megasas_get_pd_list(instance); + if (dcmd_ret == 0) + doscan = SCAN_PD_CHANNEL; break; - case MR_EVT_PD_REMOVED: - if (megasas_get_pd_list(instance) == 0) { - for (i = 0; i < MEGASAS_MAX_PD_CHANNELS; i++) { - for (j = 0; - j < MEGASAS_MAX_DEV_PER_CHANNEL; - j++) { + case MR_EVT_LD_OFFLINE: + case MR_EVT_CFG_CLEARED: + case MR_EVT_LD_DELETED: + case MR_EVT_LD_CREATED: + if (!instance->requestorId || + (instance->requestorId && megasas_get_ld_vf_affiliation(instance, 0))) + dcmd_ret = megasas_ld_list_query(instance, MR_LD_QUERY_TYPE_EXPOSED_TO_HOST); - pd_index = - (i * MEGASAS_MAX_DEV_PER_CHANNEL) + j; + if 
(dcmd_ret == 0) + doscan = SCAN_VD_CHANNEL; + break; + + case MR_EVT_CTRL_HOST_BUS_SCAN_REQUESTED: + case MR_EVT_FOREIGN_CFG_IMPORTED: + case MR_EVT_LD_STATE_CHANGE: + dcmd_ret = megasas_get_pd_list(instance); + + if (dcmd_ret != 0) + break; + + if (!instance->requestorId || + (instance->requestorId && megasas_get_ld_vf_affiliation(instance, 0))) + dcmd_ret = megasas_ld_list_query(instance, MR_LD_QUERY_TYPE_EXPOSED_TO_HOST); + + if (dcmd_ret != 0) + break; + + doscan = SCAN_VD_CHANNEL | SCAN_PD_CHANNEL; + dev_info(&instance->pdev->dev, "scanning for scsi%d...\n", + instance->host->host_no); + break; + + case MR_EVT_CTRL_PROP_CHANGED: + dcmd_ret = megasas_get_ctrl_info(instance); + break; + default: + doscan = 0; + break; + } + } else { + dev_err(&instance->pdev->dev, "invalid evt_detail!\n"); + mutex_unlock(&instance->reset_mutex); + kfree(ev); + return; + } + + mutex_unlock(&instance->reset_mutex); + + if (doscan & SCAN_PD_CHANNEL) { + for (i = 0; i < MEGASAS_MAX_PD_CHANNELS; i++) { + for (j = 0; j < MEGASAS_MAX_DEV_PER_CHANNEL; j++) { + pd_index = i*MEGASAS_MAX_DEV_PER_CHANNEL + j; sdev1 = scsi_device_lookup(host, i, j, 0); - - if (instance->pd_list[pd_index].driveState - == MR_PD_STATE_SYSTEM) { - if (sdev1) + if (instance->pd_list[pd_index].driveState == + MR_PD_STATE_SYSTEM) { + if (!sdev1) + scsi_add_device(host, i, j, 0); + else scsi_device_put(sdev1); } else { if (sdev1) { @@ -6725,164 +6742,53 @@ megasas_aen_polling(struct work_struct *work) scsi_device_put(sdev1); } } - } } - } - doscan = 0; - break; - - case MR_EVT_LD_OFFLINE: - case MR_EVT_CFG_CLEARED: - case MR_EVT_LD_DELETED: - if (!instance->requestorId || - megasas_get_ld_vf_affiliation(instance, 0)) { - if (megasas_ld_list_query(instance, - MR_LD_QUERY_TYPE_EXPOSED_TO_HOST)) - megasas_get_ld_list(instance); - for (i = 0; i < MEGASAS_MAX_LD_CHANNELS; i++) { - for (j = 0; - j < MEGASAS_MAX_DEV_PER_CHANNEL; - j++) { - - ld_index = - (i * MEGASAS_MAX_DEV_PER_CHANNEL) + j; - - sdev1 = scsi_device_lookup(host, MEGASAS_MAX_PD_CHANNELS + i, j, 0); - - if (instance->ld_ids[ld_index] - != 0xff) { - if (sdev1) - scsi_device_put(sdev1); - } else { - if (sdev1) { - scsi_remove_device(sdev1); - scsi_device_put(sdev1); - } - } - } - } - doscan = 0; - } - break; - case MR_EVT_LD_CREATED: - if (!instance->requestorId || - megasas_get_ld_vf_affiliation(instance, 0)) { - if (megasas_ld_list_query(instance, - MR_LD_QUERY_TYPE_EXPOSED_TO_HOST)) - megasas_get_ld_list(instance); - for (i = 0; i < MEGASAS_MAX_LD_CHANNELS; i++) { - for (j = 0; - j < MEGASAS_MAX_DEV_PER_CHANNEL; - j++) { - ld_index = - (i * MEGASAS_MAX_DEV_PER_CHANNEL) + j; - - sdev1 = scsi_device_lookup(host, MEGASAS_MAX_PD_CHANNELS + i, j, 0); - - if (instance->ld_ids[ld_index] - != 0xff) { - if (!sdev1) - scsi_add_device(host, MEGASAS_MAX_PD_CHANNELS + i, j, 0); - } - if (sdev1) - scsi_device_put(sdev1); - } - } - doscan = 0; - } - break; - case MR_EVT_CTRL_HOST_BUS_SCAN_REQUESTED: - case MR_EVT_FOREIGN_CFG_IMPORTED: - case MR_EVT_LD_STATE_CHANGE: - doscan = 1; - break; - case MR_EVT_CTRL_PROP_CHANGED: - megasas_get_ctrl_info(instance); - break; - default: - doscan = 0; - break; } - } else { - dev_err(&instance->pdev->dev, "invalid evt_detail!\n"); - kfree(ev); - return; } - if (doscan) { - dev_info(&instance->pdev->dev, "scanning for scsi%d...\n", - instance->host->host_no); - if (megasas_get_pd_list(instance) == 0) { - for (i = 0; i < MEGASAS_MAX_PD_CHANNELS; i++) { - for (j = 0; j < MEGASAS_MAX_DEV_PER_CHANNEL; j++) { - pd_index = i*MEGASAS_MAX_DEV_PER_CHANNEL + j; - sdev1 = 
scsi_device_lookup(host, i, j, 0); - if (instance->pd_list[pd_index].driveState == - MR_PD_STATE_SYSTEM) { - if (!sdev1) { - scsi_add_device(host, i, j, 0); - } - if (sdev1) - scsi_device_put(sdev1); - } else { - if (sdev1) { - scsi_remove_device(sdev1); - scsi_device_put(sdev1); - } - } - } - } - } - - if (!instance->requestorId || - megasas_get_ld_vf_affiliation(instance, 0)) { - if (megasas_ld_list_query(instance, - MR_LD_QUERY_TYPE_EXPOSED_TO_HOST)) - megasas_get_ld_list(instance); - for (i = 0; i < MEGASAS_MAX_LD_CHANNELS; i++) { - for (j = 0; j < MEGASAS_MAX_DEV_PER_CHANNEL; - j++) { - ld_index = - (i * MEGASAS_MAX_DEV_PER_CHANNEL) + j; - - sdev1 = scsi_device_lookup(host, - MEGASAS_MAX_PD_CHANNELS + i, j, 0); - if (instance->ld_ids[ld_index] - != 0xff) { - if (!sdev1) - scsi_add_device(host, MEGASAS_MAX_PD_CHANNELS + i, j, 0); - else - scsi_device_put(sdev1); - } else { - if (sdev1) { - scsi_remove_device(sdev1); - scsi_device_put(sdev1); - } + if (doscan & SCAN_VD_CHANNEL) { + for (i = 0; i < MEGASAS_MAX_LD_CHANNELS; i++) { + for (j = 0; j < MEGASAS_MAX_DEV_PER_CHANNEL; j++) { + ld_index = (i * MEGASAS_MAX_DEV_PER_CHANNEL) + j; + sdev1 = scsi_device_lookup(host, MEGASAS_MAX_PD_CHANNELS + i, j, 0); + if (instance->ld_ids[ld_index] != 0xff) { + if (!sdev1) + scsi_add_device(host, MEGASAS_MAX_PD_CHANNELS + i, j, 0); + else + scsi_device_put(sdev1); + } else { + if (sdev1) { + scsi_remove_device(sdev1); + scsi_device_put(sdev1); } } } } } - if (instance->aen_cmd != NULL) { - kfree(ev); - return ; - } - - seq_num = le32_to_cpu(instance->evt_detail->seq_num) + 1; + if (dcmd_ret == 0) + seq_num = le32_to_cpu(instance->evt_detail->seq_num) + 1; + else + seq_num = instance->last_seq_num; /* Register AEN with FW for latest sequence number plus 1 */ class_locale.members.reserved = 0; class_locale.members.locale = MR_EVT_LOCALE_ALL; class_locale.members.class = MR_EVT_CLASS_DEBUG; - mutex_lock(&instance->aen_mutex); + + if (instance->aen_cmd != NULL) { + kfree(ev); + return; + } + + mutex_lock(&instance->reset_mutex); error = megasas_register_aen(instance, seq_num, class_locale.word); - mutex_unlock(&instance->aen_mutex); - if (error) - dev_err(&instance->pdev->dev, "register aen failed error %x\n", error); + dev_err(&instance->pdev->dev, + "register aen failed error %x\n", error); + mutex_unlock(&instance->reset_mutex); kfree(ev); } From 948cb1ca1fc3b88443051c17c58ee94608465552 Mon Sep 17 00:00:00 2001 From: Sumit Saxena Date: Thu, 28 Jan 2016 21:14:26 +0530 Subject: [PATCH 636/813] megaraid_sas: Fix SMAP issue [ Upstream commit ea1c928bb6051ec4ccf24826898aa2361eaa71e5 ] Inside compat IOCTL hook of driver, driver was using wrong address of ioc->frame.raw which leads sense_ioc_ptr to be calculated wrongly and failing IOCTL. Signed-off-by: Sumit Saxena Reviewed-by: Tomas Henzl Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/megaraid/megaraid_sas_base.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 8ab174694e19..6dbaa3c38d1e 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -6476,9 +6476,9 @@ static int megasas_mgmt_compat_ioctl_fw(struct file *file, unsigned long arg) int i; int error = 0; compat_uptr_t ptr; - unsigned long local_raw_ptr; u32 local_sense_off; u32 local_sense_len; + u32 user_sense_off; if (clear_user(ioc, sizeof(*ioc))) return -EFAULT; @@ -6496,17 +6496,16 @@ static int megasas_mgmt_compat_ioctl_fw(struct file *file, unsigned long arg) * sense_len is not null, so prepare the 64bit value under * the same condition. */ - if (get_user(local_raw_ptr, ioc->frame.raw) || - get_user(local_sense_off, &ioc->sense_off) || - get_user(local_sense_len, &ioc->sense_len)) + if (get_user(local_sense_off, &ioc->sense_off) || + get_user(local_sense_len, &ioc->sense_len) || + get_user(user_sense_off, &cioc->sense_off)) return -EFAULT; - if (local_sense_len) { void __user **sense_ioc_ptr = - (void __user **)((u8*)local_raw_ptr + local_sense_off); + (void __user **)((u8 *)((unsigned long)&ioc->frame.raw) + local_sense_off); compat_uptr_t *sense_cioc_ptr = - (compat_uptr_t *)(cioc->frame.raw + cioc->sense_off); + (compat_uptr_t *)(((unsigned long)&cioc->frame.raw) + user_sense_off); if (get_user(ptr, sense_cioc_ptr) || put_user(compat_ptr(ptr), sense_ioc_ptr)) return -EFAULT; From e77b14a6f3cee18cb03f124fd283a122b6516900 Mon Sep 17 00:00:00 2001 From: Tomas Henzl Date: Mon, 1 Feb 2016 15:12:04 +0100 Subject: [PATCH 637/813] megaraid_sas: Add an i/o barrier [ Upstream commit b99dbe56d511eb07de33bfa1b99ac5a6ff76ae08 ] A barrier should be added to ensure proper ordering of memory mapped writes. Signed-off-by: Tomas Henzl Reviewed-by: Kashyap Desai Acked-by: Kashyap Desai Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/megaraid/megaraid_sas_base.c | 1 + drivers/scsi/megaraid/megaraid_sas_fusion.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 6dbaa3c38d1e..3f8d357b1bac 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -735,6 +735,7 @@ megasas_fire_cmd_skinny(struct megasas_instance *instance, &(regs)->inbound_high_queue_port); writel((lower_32_bits(frame_phys_addr) | (frame_count<<1))|1, &(regs)->inbound_low_queue_port); + mmiowb(); spin_unlock_irqrestore(&instance->hba_lock, flags); } diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c index 4f391e747be2..021b994fdae8 100644 --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c @@ -201,6 +201,7 @@ megasas_fire_cmd_fusion(struct megasas_instance *instance, &instance->reg_set->inbound_low_queue_port); writel(le32_to_cpu(req_desc->u.high), &instance->reg_set->inbound_high_queue_port); + mmiowb(); spin_unlock_irqrestore(&instance->hba_lock, flags); #endif } From 67a915d817eb2929008e1bd05ef7875fcce09a16 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Mon, 23 Nov 2015 14:45:07 -0800 Subject: [PATCH 638/813] pwm: fsl-ftm: Fix clock enable/disable when using PM [ Upstream commit 816aec2325e620b6454474372a21f90a8740cb28 ] An FTM PWM instance enables/disables three clocks: the bus clock, the counter clock and the PWM clock. The bus clock gets enabled on pwm_request, whereas the counter and PWM clocks are enabled upon pwm_enable. The driver has three closely related issues when enabling/disabling clocks during suspend/resume: - The three clocks are not treated differently with regard to the individual PWM state (enabled/requested). This can lead to clocks getting disabled which have not been enabled in the first place (a PWM channel which has only been requested going through suspend/resume). - When entering suspend, the current behavior relies on the FTM_OUTMASK register: if a PWM output is unmasked, the driver assumes the clocks are enabled. However, some PWM instances have only 2 channels connected (e.g. Vybrid's FTM1). In that case, FTM_OUTMASK reads 0x3 if all channels are disabled, even if the code wrote 0xff to it before. For those PWM instances, the current approach to detecting enabled PWM signals does not work. - A third issue applies to the bus clock only, which can get enabled multiple times (once for each PWM channel of a PWM chip). This is fine; however, when entering suspend mode, the clock only gets disabled once. This change introduces a different approach: it relies on the enable and prepare counters of the clock framework and uses the PWM framework's signal states to address all three issues. Clocks get disabled during suspend and re-enabled on resume according to each PWM channel's individual state (requested/enabled). Since we do not count the clock enables in the driver, this change no longer clears the Status and Control register's Clock Source Selection (FTM_SC[CLKS]). However, since we disable the selected clock anyway, and we explicitly select the clock source when re-enabling a PWM channel, this approach should not make a difference in practice.
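In outline, the reworked suspend path walks every channel and drops only the clocks that the channel's state implies are held (a condensed sketch of the logic in the diff below, omitting the regcache handling; names as in the driver):

    static int fsl_pwm_suspend(struct device *dev)
    {
            struct fsl_pwm_chip *fpc = dev_get_drvdata(dev);
            int i;

            for (i = 0; i < fpc->chip.npwm; i++) {
                    struct pwm_device *pwm = &fpc->chip.pwms[i];

                    /* Never requested: this channel holds no clocks. */
                    if (!test_bit(PWMF_REQUESTED, &pwm->flags))
                            continue;

                    /* Requested: the bus clock is held once per channel. */
                    clk_disable_unprepare(fpc->clk[FSL_PWM_CLK_SYS]);

                    /* Only enabled channels also hold counter and PWM clocks. */
                    if (!pwm_is_enabled(pwm))
                            continue;

                    clk_disable_unprepare(fpc->clk[FSL_PWM_CLK_CNTEN]);
                    clk_disable_unprepare(fpc->clk[fpc->cnt_select]);
            }

            return 0;
    }

The resume path mirrors this with clk_prepare_enable() calls, so the clock framework's reference counts stay balanced per channel.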
Signed-off-by: Stefan Agner Signed-off-by: Thierry Reding Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/pwm/pwm-fsl-ftm.c | 58 +++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 33 deletions(-) diff --git a/drivers/pwm/pwm-fsl-ftm.c b/drivers/pwm/pwm-fsl-ftm.c index f9dfc8b6407a..7225ac6b3df5 100644 --- a/drivers/pwm/pwm-fsl-ftm.c +++ b/drivers/pwm/pwm-fsl-ftm.c @@ -80,7 +80,6 @@ struct fsl_pwm_chip { struct mutex lock; - unsigned int use_count; unsigned int cnt_select; unsigned int clk_ps; @@ -300,9 +299,6 @@ static int fsl_counter_clock_enable(struct fsl_pwm_chip *fpc) { int ret; - if (fpc->use_count++ != 0) - return 0; - /* select counter clock source */ regmap_update_bits(fpc->regmap, FTM_SC, FTM_SC_CLK_MASK, FTM_SC_CLK(fpc->cnt_select)); @@ -334,25 +330,6 @@ static int fsl_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) return ret; } -static void fsl_counter_clock_disable(struct fsl_pwm_chip *fpc) -{ - /* - * already disabled, do nothing - */ - if (fpc->use_count == 0) - return; - - /* there are still users, so can't disable yet */ - if (--fpc->use_count > 0) - return; - - /* no users left, disable PWM counter clock */ - regmap_update_bits(fpc->regmap, FTM_SC, FTM_SC_CLK_MASK, 0); - - clk_disable_unprepare(fpc->clk[FSL_PWM_CLK_CNTEN]); - clk_disable_unprepare(fpc->clk[fpc->cnt_select]); -} - static void fsl_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) { struct fsl_pwm_chip *fpc = to_fsl_chip(chip); @@ -362,7 +339,8 @@ static void fsl_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) regmap_update_bits(fpc->regmap, FTM_OUTMASK, BIT(pwm->hwpwm), BIT(pwm->hwpwm)); - fsl_counter_clock_disable(fpc); + clk_disable_unprepare(fpc->clk[FSL_PWM_CLK_CNTEN]); + clk_disable_unprepare(fpc->clk[fpc->cnt_select]); regmap_read(fpc->regmap, FTM_OUTMASK, &val); if ((val & 0xFF) == 0xFF) @@ -492,17 +470,24 @@ static int fsl_pwm_remove(struct platform_device *pdev) static int fsl_pwm_suspend(struct device *dev) { struct fsl_pwm_chip *fpc = dev_get_drvdata(dev); - u32 val; + int i; regcache_cache_only(fpc->regmap, true); regcache_mark_dirty(fpc->regmap); - /* read from cache */ - regmap_read(fpc->regmap, FTM_OUTMASK, &val); - if ((val & 0xFF) != 0xFF) { + for (i = 0; i < fpc->chip.npwm; i++) { + struct pwm_device *pwm = &fpc->chip.pwms[i]; + + if (!test_bit(PWMF_REQUESTED, &pwm->flags)) + continue; + + clk_disable_unprepare(fpc->clk[FSL_PWM_CLK_SYS]); + + if (!pwm_is_enabled(pwm)) + continue; + clk_disable_unprepare(fpc->clk[FSL_PWM_CLK_CNTEN]); clk_disable_unprepare(fpc->clk[fpc->cnt_select]); - clk_disable_unprepare(fpc->clk[FSL_PWM_CLK_SYS]); } return 0; @@ -511,12 +496,19 @@ static int fsl_pwm_suspend(struct device *dev) static int fsl_pwm_resume(struct device *dev) { struct fsl_pwm_chip *fpc = dev_get_drvdata(dev); - u32 val; + int i; + + for (i = 0; i < fpc->chip.npwm; i++) { + struct pwm_device *pwm = &fpc->chip.pwms[i]; + + if (!test_bit(PWMF_REQUESTED, &pwm->flags)) + continue; - /* read from cache */ - regmap_read(fpc->regmap, FTM_OUTMASK, &val); - if ((val & 0xFF) != 0xFF) { clk_prepare_enable(fpc->clk[FSL_PWM_CLK_SYS]); + + if (!pwm_is_enabled(pwm)) + continue; + clk_prepare_enable(fpc->clk[fpc->cnt_select]); clk_prepare_enable(fpc->clk[FSL_PWM_CLK_CNTEN]); } From a5880180d33e982962322de3f46f731f82a9b9ff Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Sun, 6 Dec 2015 13:31:59 +0200 Subject: [PATCH 639/813] pwm: lpc32xx: correct number of PWM channels from 2 to 1 [ Upstream commit 
ebe1fca35038df28b5c183e8486863e765364ec1 ] The LPC32xx SoC has two independent PWM controllers; they have different clock parents, clock gates and even slightly different controls, and each of these two PWM controllers has one output channel. Because the almost identical controls are arranged in a row, the driver incorrectly described one PWM controller with two channels; fix this problem, which at the moment prevents separate configuration of different clock parents and gates for the two PWM controllers. Note that the change makes the previous PWM device node description incompatible with this update. Signed-off-by: Vladimir Zapolskiy Signed-off-by: Thierry Reding Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/pwm/pwm-lpc32xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-lpc32xx.c b/drivers/pwm/pwm-lpc32xx.c index 9fde60ce8e7b..ce8ab202be8b 100644 --- a/drivers/pwm/pwm-lpc32xx.c +++ b/drivers/pwm/pwm-lpc32xx.c @@ -134,7 +134,7 @@ static int lpc32xx_pwm_probe(struct platform_device *pdev) lpc32xx->chip.dev = &pdev->dev; lpc32xx->chip.ops = &lpc32xx_pwm_ops; - lpc32xx->chip.npwm = 2; + lpc32xx->chip.npwm = 1; lpc32xx->chip.base = -1; ret = pwmchip_add(&lpc32xx->chip); From e941af922e512e4b4b2653a33c8372e5e4d57579 Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Sun, 6 Dec 2015 13:32:01 +0200 Subject: [PATCH 640/813] pwm: lpc32xx: fix and simplify duty cycle and period calculations [ Upstream commit 5a9fc9c666d5d759699cf5495bda85f1da0d747e ] The change fixes a problem: if duty_ns is too small in comparison to period_ns (a valid corner case is duty_ns = 0 ns), then, due to the PWM_DUTY() macro being applied to the value, the result overflows past 8 bits, and instead of the highest duty cycle bitfield value 0xff the invalid duty cycle bitfield value 0x00 is written. For reference, the LPC32xx spec describes the PWMx_DUTY bitfield this way, and it seems to be correct: [Low]/[High] = [PWM_DUTY]/[256-PWM_DUTY], where 0 < PWM_DUTY <= 255. In addition, according to my oscilloscope measurements, the LPC32xx PWM is "tristate" in the sense that it produces a wave with floating min/max voltage levels for different duty cycle values; for the corner cases: PWM_DUTY == 0x01 => signal is in range from -1.05v to 0v .... PWM_DUTY == 0x80 => signal is in range from -0.75v to +0.75v .... PWM_DUTY == 0xff => signal is in range from 0v to +1.05v PWM_DUTY == 0x00 => signal is around 0v, PWM is off Due to this peculiarity, for very long periods (frequencies below 1 kHz) and odd pre-divider values the generated PWM wave does not resemble a clock-shaped signal, but rather a heartbeat-shaped signal with positive and negative peaks, so I would recommend using the high-speed HCLK clock as the PWM parent clock and avoiding the RTC clock as a parent. The change corrects the PWM output in the corner cases and prevents any possible overflow in the calculation of the PWM_DUTY and PWM_RELOADV bitfield values; the helper macro definitions can therefore be removed.
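To make the overflow concrete, here is the corner case worked through the old code quoted in the diff below:

    /* Old calculation with duty_ns = 0: */
    c = 256 * duty_ns;      /* c = 0                        */
    do_div(c, period_ns);   /* c = 0, so the (c > 255)      */
                            /* clamp does not trigger       */
    duty_cycles = 256 - c;  /* duty_cycles = 256            */
    /* PWM_DUTY(256) = 256 & 0xFF = 0x00 -> PWM off,
     * instead of the intended 0xff (minimal high time). */

The new code computes 256 * (period_ns - duty_ns) / period_ns directly and clamps the result into 1..255, so the register value can never wrap to the "PWM off" encoding.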
Signed-off-by: Vladimir Zapolskiy Signed-off-by: Thierry Reding Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/pwm/pwm-lpc32xx.c | 49 ++++++++++++++------------------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/drivers/pwm/pwm-lpc32xx.c b/drivers/pwm/pwm-lpc32xx.c index ce8ab202be8b..6e203a65effb 100644 --- a/drivers/pwm/pwm-lpc32xx.c +++ b/drivers/pwm/pwm-lpc32xx.c @@ -24,9 +24,7 @@ struct lpc32xx_pwm_chip { void __iomem *base; }; -#define PWM_ENABLE (1 << 31) -#define PWM_RELOADV(x) (((x) & 0xFF) << 8) -#define PWM_DUTY(x) ((x) & 0xFF) +#define PWM_ENABLE BIT(31) #define to_lpc32xx_pwm_chip(_chip) \ container_of(_chip, struct lpc32xx_pwm_chip, chip) @@ -38,40 +36,27 @@ static int lpc32xx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, unsigned long long c; int period_cycles, duty_cycles; u32 val; + c = clk_get_rate(lpc32xx->clk); - c = clk_get_rate(lpc32xx->clk) / 256; - c = c * period_ns; - do_div(c, NSEC_PER_SEC); + /* The highest acceptable divisor is 256, which is represented by 0 */ + period_cycles = div64_u64(c * period_ns, + (unsigned long long)NSEC_PER_SEC * 256); + if (!period_cycles) + period_cycles = 1; + if (period_cycles > 255) + period_cycles = 0; - /* Handle high and low extremes */ - if (c == 0) - c = 1; - if (c > 255) - c = 0; /* 0 set division by 256 */ - period_cycles = c; - - /* The duty-cycle value is as follows: - * - * DUTY-CYCLE HIGH LEVEL - * 1 99.9% - * 25 90.0% - * 128 50.0% - * 220 10.0% - * 255 0.1% - * 0 0.0% - * - * In other words, the register value is duty-cycle % 256 with - * duty-cycle in the range 1-256. - */ - c = 256 * duty_ns; - do_div(c, period_ns); - if (c > 255) - c = 255; - duty_cycles = 256 - c; + /* Compute 256 x #duty/period value and care for corner cases */ + duty_cycles = div64_u64((unsigned long long)(period_ns - duty_ns) * 256, + period_ns); + if (!duty_cycles) + duty_cycles = 1; + if (duty_cycles > 255) + duty_cycles = 255; val = readl(lpc32xx->base + (pwm->hwpwm << 2)); val &= ~0xFFFF; - val |= PWM_RELOADV(period_cycles) | PWM_DUTY(duty_cycles); + val |= (period_cycles << 8) | duty_cycles; writel(val, lpc32xx->base + (pwm->hwpwm << 2)); return 0; From d5251a1952bff1148ceeea5283e34d07a502da62 Mon Sep 17 00:00:00 2001 From: Tirumalesh Chalamarla Date: Thu, 4 Feb 2016 10:45:25 -0800 Subject: [PATCH 641/813] irqchip/gic-v3: Make sure read from ICC_IAR1_EL1 is visible on redestributor [ Upstream commit 1a1ebd5fb1e203ee8cc73508cc7a38ac4b804596 ] The ARM GICv3 specification mentions the need for dsb after a read from the ICC_IAR1_EL1 register: 4.1.1 Physical CPU Interface: The effects of reading ICC_IAR0_EL1 and ICC_IAR1_EL1 on the state of a returned INTID are not guaranteed to be visible until after the execution of a DSB. Not having this could result in missed interrupts, so let's add the required barrier. 
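The resulting accessor (reconstructed from the one-line diff below) simply orders any use of the returned INTID after a DSB:

    static inline u64 gic_read_iar_common(void)
    {
            u64 irqstat;

            asm volatile("mrs_s %0, " __stringify(ICC_IAR1_EL1) : "=r" (irqstat));
            /* GICv3 4.1.1: the IAR read's effects on the returned INTID
             * are not guaranteed visible until a DSB has executed. */
            dsb(sy);
            return irqstat;
    }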
[Marc: fixed commit message] Acked-by: Marc Zyngier Signed-off-by: Tirumalesh Chalamarla Signed-off-by: Marc Zyngier Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/arch_gicv3.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index 2731d3b25ed2..8ec88e5b290f 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -103,6 +103,7 @@ static inline u64 gic_read_iar_common(void) u64 irqstat; asm volatile("mrs_s %0, " __stringify(ICC_IAR1_EL1) : "=r" (irqstat)); + dsb(sy); return irqstat; } From 849e28efb04c4c46f6189f75e30d3f541eb6dfb4 Mon Sep 17 00:00:00 2001 From: Tirumalesh Chalamarla Date: Wed, 10 Feb 2016 10:46:53 -0800 Subject: [PATCH 642/813] arm64: KVM: Configure TCR_EL2.PS at runtime [ Upstream commit 3c5b1d92b3b02be07873d611a27950addff544d3 ] Setting TCR_EL2.PS to 40 bits is wrong on systems with less than 40 bits of physical addresses, and breaks KVM on systems where the RAM is above 40 bits. This patch uses ID_AA64MMFR0_EL1.PARange to set TCR_EL2.PS dynamically, just like we already do for VTCR_EL2.PS. [Marc: rewrote commit message, patch tidy up] Reviewed-by: Marc Zyngier Signed-off-by: Tirumalesh Chalamarla Signed-off-by: Marc Zyngier Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/kvm_arm.h | 2 -- arch/arm64/kvm/hyp-init.S | 13 ++++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 5e6857b6bdc4..2d960f8588b0 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -107,8 +107,6 @@ #define TCR_EL2_MASK (TCR_EL2_TG0 | TCR_EL2_SH0 | \ TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ) -#define TCR_EL2_FLAGS (TCR_EL2_RES1 | TCR_EL2_PS_40B) - /* VTCR_EL2 Registers bits */ #define VTCR_EL2_RES1 (1 << 31) #define VTCR_EL2_PS_MASK (7 << 16) diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 178ba2248a98..84c338f017b2 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -64,7 +64,7 @@ __do_hyp_init: mrs x4, tcr_el1 ldr x5, =TCR_EL2_MASK and x4, x4, x5 - ldr x5, =TCR_EL2_FLAGS + mov x5, #TCR_EL2_RES1 orr x4, x4, x5 #ifndef CONFIG_ARM64_VA_BITS_48 @@ -85,15 +85,18 @@ __do_hyp_init: ldr_l x5, idmap_t0sz bfi x4, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH #endif - msr tcr_el2, x4 - - ldr x4, =VTCR_EL2_FLAGS /* * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in - * VTCR_EL2. + * TCR_EL2 and VTCR_EL2. */ mrs x5, ID_AA64MMFR0_EL1 bfi x4, x5, #16, #3 + + msr tcr_el2, x4 + + ldr x4, =VTCR_EL2_FLAGS + bfi x4, x5, #16, #3 + msr vtcr_el2, x4 mrs x4, mair_el1 From 6d0e49cf1b432a11bf60b9fc2ee41c3e4e0347fe Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 5 Feb 2016 16:30:39 +0000 Subject: [PATCH 643/813] net: cavium: liquidio: fix check for in progress flag [ Upstream commit 19a6d156a7bd080f3a855a40a4a08ab475e34b4a ] smatch detected a suspicious looking bitop condition: drivers/net/ethernet/cavium/liquidio/lio_main.c:2529 handle_timestamp() warn: suspicious bitop condition skb_shinfo(skb)->tx_flags | SKBTX_IN_PROGRESS is always non-zero, so the logic is definitely not correct. Use & to mask the correct bit. Signed-off-by: Colin Ian King Signed-off-by: David S.
Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/cavium/liquidio/lio_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c index b89504405b72..7445da218bd9 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c @@ -2526,7 +2526,7 @@ static void handle_timestamp(struct octeon_device *oct, octeon_swap_8B_data(&resp->timestamp, 1); - if (unlikely((skb_shinfo(skb)->tx_flags | SKBTX_IN_PROGRESS) != 0)) { + if (unlikely((skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS) != 0)) { struct skb_shared_hwtstamps ts; u64 ns = resp->timestamp; From 8b208a5d0a3b37a68cb1694a125c8deee56be45c Mon Sep 17 00:00:00 2001 From: Tomas Henzl Date: Wed, 23 Dec 2015 14:21:47 +0100 Subject: [PATCH 644/813] mpt3sas: A correction in unmap_resources [ Upstream commit 5f985d88bac34e7f3b4403118eab072902a0b392 ] It might happen that we try to free an already freed pointer. Reported-by: Maurizio Lombardi Signed-off-by: Tomas Henzl Acked-by: Chaitra P B Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/mpt3sas/mpt3sas_base.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index 356233f86064..9137ae2b841f 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -2020,8 +2020,10 @@ mpt3sas_base_unmap_resources(struct MPT3SAS_ADAPTER *ioc) _base_free_irq(ioc); _base_disable_msix(ioc); - if (ioc->msix96_vector) + if (ioc->msix96_vector) { kfree(ioc->replyPostRegisterIndex); + ioc->replyPostRegisterIndex = NULL; + } if (ioc->chip_phys) { iounmap(ioc->chip); From 116a7584cbf95324d2ac66bbce0f2682e71a81f0 Mon Sep 17 00:00:00 2001 From: Suganath prabu Subramani Date: Thu, 28 Jan 2016 12:07:06 +0530 Subject: [PATCH 645/813] mpt3sas: Fix for Asynchronous completion of timedout IO and task abort of timedout IO. [ Upstream commit 03d1fb3a65783979f23bd58b5a0387e6992d9e26 ] Track msix of each IO and use the same msix for issuing abort to timed out IO. With this driver will process IO's reply first followed by TM. Signed-off-by: Suganath prabu Subramani Signed-off-by: Chaitra P B Reviewed-by: Tomas Henzl Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/mpt3sas/mpt3sas_base.c | 20 +++++++++++--------- drivers/scsi/mpt3sas/mpt3sas_base.h | 5 ++++- drivers/scsi/mpt3sas/mpt3sas_ctl.c | 2 +- drivers/scsi/mpt3sas/mpt3sas_scsih.c | 12 +++++++++--- 4 files changed, 25 insertions(+), 14 deletions(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index 9137ae2b841f..5b2c37f1e908 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -2242,6 +2242,12 @@ mpt3sas_base_get_reply_virt_addr(struct MPT3SAS_ADAPTER *ioc, u32 phys_addr) return ioc->reply + (phys_addr - (u32)ioc->reply_dma); } +static inline u8 +_base_get_msix_index(struct MPT3SAS_ADAPTER *ioc) +{ + return ioc->cpu_msix_table[raw_smp_processor_id()]; +} + /** * mpt3sas_base_get_smid - obtain a free smid from internal queue * @ioc: per adapter object @@ -2302,6 +2308,7 @@ mpt3sas_base_get_smid_scsiio(struct MPT3SAS_ADAPTER *ioc, u8 cb_idx, request->scmd = scmd; request->cb_idx = cb_idx; smid = request->smid; + request->msix_io = _base_get_msix_index(ioc); list_del(&request->tracker_list); spin_unlock_irqrestore(&ioc->scsi_lookup_lock, flags); return smid; @@ -2424,12 +2431,6 @@ _base_writeq(__u64 b, volatile void __iomem *addr, spinlock_t *writeq_lock) } #endif -static inline u8 -_base_get_msix_index(struct MPT3SAS_ADAPTER *ioc) -{ - return ioc->cpu_msix_table[raw_smp_processor_id()]; -} - /** * mpt3sas_base_put_smid_scsi_io - send SCSI_IO request to firmware * @ioc: per adapter object @@ -2483,18 +2484,19 @@ mpt3sas_base_put_smid_fast_path(struct MPT3SAS_ADAPTER *ioc, u16 smid, * mpt3sas_base_put_smid_hi_priority - send Task Managment request to firmware * @ioc: per adapter object * @smid: system request message index - * + * @msix_task: msix_task will be same as msix of IO incase of task abort else 0. * Return nothing. 
*/ void -mpt3sas_base_put_smid_hi_priority(struct MPT3SAS_ADAPTER *ioc, u16 smid) +mpt3sas_base_put_smid_hi_priority(struct MPT3SAS_ADAPTER *ioc, u16 smid, + u16 msix_task) { Mpi2RequestDescriptorUnion_t descriptor; u64 *request = (u64 *)&descriptor; descriptor.HighPriority.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY; - descriptor.HighPriority.MSIxIndex = 0; + descriptor.HighPriority.MSIxIndex = msix_task; descriptor.HighPriority.SMID = cpu_to_le16(smid); descriptor.HighPriority.LMID = 0; descriptor.HighPriority.Reserved1 = 0; diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.h b/drivers/scsi/mpt3sas/mpt3sas_base.h index 5ad271efbd45..92648a5ea2d2 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.h +++ b/drivers/scsi/mpt3sas/mpt3sas_base.h @@ -643,6 +643,7 @@ struct chain_tracker { * @cb_idx: callback index * @direct_io: To indicate whether I/O is direct (WARPDRIVE) * @tracker_list: list of free request (ioc->free_list) + * @msix_io: IO's msix */ struct scsiio_tracker { u16 smid; @@ -651,6 +652,7 @@ struct scsiio_tracker { u8 direct_io; struct list_head chain_list; struct list_head tracker_list; + u16 msix_io; }; /** @@ -1213,7 +1215,8 @@ void mpt3sas_base_put_smid_scsi_io(struct MPT3SAS_ADAPTER *ioc, u16 smid, u16 handle); void mpt3sas_base_put_smid_fast_path(struct MPT3SAS_ADAPTER *ioc, u16 smid, u16 handle); -void mpt3sas_base_put_smid_hi_priority(struct MPT3SAS_ADAPTER *ioc, u16 smid); +void mpt3sas_base_put_smid_hi_priority(struct MPT3SAS_ADAPTER *ioc, + u16 smid, u16 msix_task); void mpt3sas_base_put_smid_default(struct MPT3SAS_ADAPTER *ioc, u16 smid); void mpt3sas_base_initialize_callback_handler(void); u8 mpt3sas_base_register_callback_handler(MPT_CALLBACK cb_func); diff --git a/drivers/scsi/mpt3sas/mpt3sas_ctl.c b/drivers/scsi/mpt3sas/mpt3sas_ctl.c index d8366b056b70..4ccde5a05b70 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_ctl.c +++ b/drivers/scsi/mpt3sas/mpt3sas_ctl.c @@ -817,7 +817,7 @@ _ctl_do_mpt_command(struct MPT3SAS_ADAPTER *ioc, struct mpt3_ioctl_command karg, tm_request->DevHandle)); ioc->build_sg_mpi(ioc, psge, data_out_dma, data_out_sz, data_in_dma, data_in_sz); - mpt3sas_base_put_smid_hi_priority(ioc, smid); + mpt3sas_base_put_smid_hi_priority(ioc, smid, 0); break; } case MPI2_FUNCTION_SMP_PASSTHROUGH: diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index 9ab77b06434d..6180f7970bbf 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -2193,6 +2193,7 @@ mpt3sas_scsih_issue_tm(struct MPT3SAS_ADAPTER *ioc, u16 handle, uint channel, unsigned long timeleft; struct scsiio_tracker *scsi_lookup = NULL; int rc; + u16 msix_task = 0; if (m_type == TM_MUTEX_ON) mutex_lock(&ioc->tm_cmds.mutex); @@ -2256,7 +2257,12 @@ mpt3sas_scsih_issue_tm(struct MPT3SAS_ADAPTER *ioc, u16 handle, uint channel, int_to_scsilun(lun, (struct scsi_lun *)mpi_request->LUN); mpt3sas_scsih_set_tm_flag(ioc, handle); init_completion(&ioc->tm_cmds.done); - mpt3sas_base_put_smid_hi_priority(ioc, smid); + if ((type == MPI2_SCSITASKMGMT_TASKTYPE_ABORT_TASK) && + (scsi_lookup->msix_io < ioc->reply_queue_count)) + msix_task = scsi_lookup->msix_io; + else + msix_task = 0; + mpt3sas_base_put_smid_hi_priority(ioc, smid, msix_task); timeleft = wait_for_completion_timeout(&ioc->tm_cmds.done, timeout*HZ); if (!(ioc->tm_cmds.status & MPT3_CMD_COMPLETE)) { pr_err(MPT3SAS_FMT "%s: timeout\n", @@ -3151,7 +3157,7 @@ _scsih_tm_tr_send(struct MPT3SAS_ADAPTER *ioc, u16 handle) mpi_request->Function = MPI2_FUNCTION_SCSI_TASK_MGMT; 
mpi_request->DevHandle = cpu_to_le16(handle); mpi_request->TaskType = MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET; - mpt3sas_base_put_smid_hi_priority(ioc, smid); + mpt3sas_base_put_smid_hi_priority(ioc, smid, 0); mpt3sas_trigger_master(ioc, MASTER_TRIGGER_DEVICE_REMOVAL); out: @@ -3332,7 +3338,7 @@ _scsih_tm_tr_volume_send(struct MPT3SAS_ADAPTER *ioc, u16 handle) mpi_request->Function = MPI2_FUNCTION_SCSI_TASK_MGMT; mpi_request->DevHandle = cpu_to_le16(handle); mpi_request->TaskType = MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET; - mpt3sas_base_put_smid_hi_priority(ioc, smid); + mpt3sas_base_put_smid_hi_priority(ioc, smid, 0); } /** From 8a840b7f37551fd072147968eafe447e2572e180 Mon Sep 17 00:00:00 2001 From: Anjali Singhai Jain Date: Wed, 9 Dec 2015 15:50:24 -0800 Subject: [PATCH 646/813] i40e/i40evf: Fix RSS rx-flow-hash configuration through ethtool [ Upstream commit 6e35c04cf633e55648acb9ccabff42aa37bd4044 ] This patch fixes the Hash PCTYPE enable for X722 since it supports a broader selection of PCTYPES for TCP and UDP. This patch also fixes a bug in XL710, X710, X722 support for RSS, as of now we cannot reduce the (4)tuple for RSS for TCP/IPv4/IPV6 or UDP/IPv4/IPv6 packets since this requires a product feature change that comes in a later release. A VF should never be allowed to change the tuples for RSS for any PCTYPE since that's a global setting for the device in case of i40e devices. Change-ID: I0ee7203c9b24813260f58f3220798bc9d9ac4a12 Signed-off-by: Anjali Singhai Jain Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- .../net/ethernet/intel/i40e/i40e_ethtool.c | 14 ++----- .../ethernet/intel/i40evf/i40evf_ethtool.c | 40 ++++--------------- 2 files changed, 12 insertions(+), 42 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 3f385ffe420f..488a50d59dca 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -2164,8 +2164,7 @@ static int i40e_set_rss_hash_opt(struct i40e_pf *pf, struct ethtool_rxnfc *nfc) case TCP_V4_FLOW: switch (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) { case 0: - hena &= ~BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_TCP); - break; + return -EINVAL; case (RXH_L4_B_0_1 | RXH_L4_B_2_3): hena |= BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_TCP); break; @@ -2176,8 +2175,7 @@ static int i40e_set_rss_hash_opt(struct i40e_pf *pf, struct ethtool_rxnfc *nfc) case TCP_V6_FLOW: switch (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) { case 0: - hena &= ~BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_TCP); - break; + return -EINVAL; case (RXH_L4_B_0_1 | RXH_L4_B_2_3): hena |= BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_TCP); break; @@ -2188,9 +2186,7 @@ static int i40e_set_rss_hash_opt(struct i40e_pf *pf, struct ethtool_rxnfc *nfc) case UDP_V4_FLOW: switch (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) { case 0: - hena &= ~(BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_UDP) | - BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV4)); - break; + return -EINVAL; case (RXH_L4_B_0_1 | RXH_L4_B_2_3): hena |= (BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_UDP) | BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV4)); @@ -2202,9 +2198,7 @@ static int i40e_set_rss_hash_opt(struct i40e_pf *pf, struct ethtool_rxnfc *nfc) case UDP_V6_FLOW: switch (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) { case 0: - hena &= ~(BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_UDP) | - BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV6)); - break; + return -EINVAL; case (RXH_L4_B_0_1 | RXH_L4_B_2_3): hena |= 
(BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_UDP) | BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV6)); diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c index 4790437a50ac..2ac62efc36f7 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c @@ -477,54 +477,30 @@ static int i40evf_set_rss_hash_opt(struct i40evf_adapter *adapter, switch (nfc->flow_type) { case TCP_V4_FLOW: - switch (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) { - case 0: - hena &= ~BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_TCP); - break; - case (RXH_L4_B_0_1 | RXH_L4_B_2_3): + if (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) hena |= BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_TCP); - break; - default: + else return -EINVAL; - } break; case TCP_V6_FLOW: - switch (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) { - case 0: - hena &= ~BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_TCP); - break; - case (RXH_L4_B_0_1 | RXH_L4_B_2_3): + if (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) hena |= BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_TCP); - break; - default: + else return -EINVAL; - } break; case UDP_V4_FLOW: - switch (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) { - case 0: - hena &= ~(BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_UDP) | - BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV4)); - break; - case (RXH_L4_B_0_1 | RXH_L4_B_2_3): + if (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) { hena |= (BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_UDP) | BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV4)); - break; - default: + } else { return -EINVAL; } break; case UDP_V6_FLOW: - switch (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) { - case 0: - hena &= ~(BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_UDP) | - BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV6)); - break; - case (RXH_L4_B_0_1 | RXH_L4_B_2_3): + if (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) { hena |= (BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_UDP) | BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV6)); - break; - default: + } else { return -EINVAL; } break; From 4fc2942b6e2de2efc8a9d3784d4b0d3543149613 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 15 Jan 2016 17:41:09 +0000 Subject: [PATCH 647/813] hrtimer: Catch illegal clockids [ Upstream commit 9006a01829a50cfd6bbd4980910ed46e895e93d7 ] It is way too easy to take any random clockid and feed it to the hrtimer subsystem. At best, it gets mapped to a monotonic base, but it would be better to just catch illegal values as early as possible. This patch does exactly that, mapping illegal clockids to an illegal base index, and panicing when we detect the illegal condition. Signed-off-by: Marc Zyngier Cc: Tomasz Nowicki Cc: Christoffer Dall Link: http://lkml.kernel.org/r/1452879670-16133-3-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- kernel/time/hrtimer.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fa909f9fd559..17f7bcff1e02 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -94,6 +94,9 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { + /* Make sure we catch unsupported clockids */ + [0 ... 
MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, + [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, @@ -102,7 +105,9 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - return hrtimer_clock_to_base_table[clock_id]; + int base = hrtimer_clock_to_base_table[clock_id]; + BUG_ON(base == HRTIMER_MAX_CLOCK_BASES); + return base; } /* From c459dadb94b704e440ab8d3ed9e3df634416088f Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Thu, 28 Jan 2016 16:04:12 +0200 Subject: [PATCH 648/813] drm/i915/bxt: update list of PCIIDs [ Upstream commit 985dd4360fdf2533fe48a33a4a2094f2e4718dc0 ] Add PCIIDs for new versions of the SOC, based on BSpec. Also add the name of the versions as code comment where this is available. The new versions don't have any changes visible to the kernel driver. Signed-off-by: Imre Deak Reviewed-by: Mika Kuoppala Link: http://patchwork.freedesktop.org/patch/msgid/1453989852-13569-1-git-send-email-imre.deak@intel.com Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- include/drm/i915_pciids.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h index 17c445612e01..06b8b47467ee 100644 --- a/include/drm/i915_pciids.h +++ b/include/drm/i915_pciids.h @@ -289,6 +289,8 @@ #define INTEL_BXT_IDS(info) \ INTEL_VGA_DEVICE(0x0A84, info), \ INTEL_VGA_DEVICE(0x1A84, info), \ - INTEL_VGA_DEVICE(0x5A84, info) + INTEL_VGA_DEVICE(0x1A85, info), \ + INTEL_VGA_DEVICE(0x5A84, info), /* APL HD Graphics 505 */ \ + INTEL_VGA_DEVICE(0x5A85, info) /* APL HD Graphics 500 */ #endif /* _I915_PCIIDS_H */ From bea3a6d7c54c81e470e63e1bcca16e638c4ab07f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Winiarski?= Date: Fri, 5 Feb 2016 13:21:42 +0100 Subject: [PATCH 649/813] drm/i915/skl: Add missing SKL ids MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 7157bb27e79875db5603aa1e30f56e873a8300f9 ] Used by production devices: Intel(R) Iris Graphics 540 (Skylake GT3e) Intel(R) Iris Graphics 550 (Skylake GT3e) v2: More ids v3: Less ids (GT1 got duplicated) Cc: Mika Kuoppala Signed-off-by: Michał Winiarski Reviewed-by: Mika Kuoppala Signed-off-by: Mika Kuoppala Link: http://patchwork.freedesktop.org/patch/msgid/1454674902-26207-1-git-send-email-michal.winiarski@intel.com Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- include/drm/i915_pciids.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h index 06b8b47467ee..2cdc723d750f 100644 --- a/include/drm/i915_pciids.h +++ b/include/drm/i915_pciids.h @@ -277,7 +277,9 @@ INTEL_VGA_DEVICE(0x191D, info) /* WKS GT2 */ #define INTEL_SKL_GT3_IDS(info) \ + INTEL_VGA_DEVICE(0x1923, info), /* ULT GT3 */ \ INTEL_VGA_DEVICE(0x1926, info), /* ULT GT3 */ \ + INTEL_VGA_DEVICE(0x1927, info), /* ULT GT3 */ \ INTEL_VGA_DEVICE(0x192B, info), /* Halo GT3 */ \ INTEL_VGA_DEVICE(0x192A, info) /* SRV GT3 */ \ From 934e6279f9cee184f34d1247132bcd43ff74656b Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Thu, 7 Jan 2016 10:59:21 +0100 Subject: [PATCH 650/813] drm/atomic: Do not unset crtc when an encoder is stolen [ Upstream commit 97a8df90875f72ba3b4c3320759fd93cea743261 ] While we steal the encoder away from the connector the connector may be updated to use a different encoder. 
Without this change, if 2 connectors swap encoders, one of them will end up without a crtc. Signed-off-by: Maarten Lankhorst Reviewed-by: Gustavo Padovan Signed-off-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/1452160762-30487-5-git-send-email-maarten.lankhorst@linux.intel.com Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/drm_atomic_helper.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c index e5aec45bf985..1ac29d703c12 100644 --- a/drivers/gpu/drm/drm_atomic_helper.c +++ b/drivers/gpu/drm/drm_atomic_helper.c @@ -108,7 +108,6 @@ steal_encoder(struct drm_atomic_state *state, struct drm_crtc_state *crtc_state; struct drm_connector *connector; struct drm_connector_state *connector_state; - int ret; /* * We can only steal an encoder coming from a connector, which means we @@ -139,9 +138,6 @@ steal_encoder(struct drm_atomic_state *state, if (IS_ERR(connector_state)) return PTR_ERR(connector_state); - ret = drm_atomic_set_crtc_for_connector(connector_state, NULL); - if (ret) - return ret; connector_state->best_encoder = NULL; } From b07ef1af4d0e7130ac1917f662b1e410e9b09555 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 26 Nov 2015 14:00:49 +0200 Subject: [PATCH 651/813] mmc: sdhci: 64-bit DMA actually has 4-byte alignment [ Upstream commit 04a5ae6fdd018af29675eb8b6c2550c87f471570 ] The version 3.00 SDHCI spec. was a bit unclear about the required data alignment for 64-bit DMA, whereas the version 4.10 spec. uses different language and indicates that only 4-byte alignment is required rather than the 8-byte alignment currently implemented. That makes no difference to SD and EMMC, which invariably transfer data in sector-aligned blocks. However, with SDIO, it results in using more DMA descriptors than necessary. Theoretically that slows DMA slightly, although DMA is not the limiting factor for throughput, so there is no discernible impact on performance. Nevertheless, the driver should follow the spec unless there is good reason not to, so this patch corrects the alignment criterion. There is a more complicated criterion for the DMA descriptor table itself. However, the table is allocated by dma_alloc_coherent(), which allocates pages (i.e. aligned to a page boundary). For simplicity just check it is 8-byte aligned, but add a comment that some Intel controllers actually require 8-byte alignment even when using 32-bit DMA. Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/sdhci.c | 31 ++++++++++++------------------- drivers/mmc/host/sdhci.h | 21 ++++++++++++--------- 2 files changed, 24 insertions(+), 28 deletions(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 1a802af827ed..fc9891fdb835 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -492,7 +492,7 @@ static int sdhci_adma_table_pre(struct sdhci_host *host, host->align_buffer, host->align_buffer_sz, direction); if (dma_mapping_error(mmc_dev(host->mmc), host->align_addr)) goto fail; - BUG_ON(host->align_addr & host->align_mask); + BUG_ON(host->align_addr & SDHCI_ADMA2_MASK); host->sg_count = sdhci_pre_dma_transfer(host, data); if (host->sg_count < 0) @@ -514,8 +514,8 @@ static int sdhci_adma_table_pre(struct sdhci_host *host, * the (up to three) bytes that screw up the * alignment.
*/ - offset = (host->align_sz - (addr & host->align_mask)) & - host->align_mask; + offset = (SDHCI_ADMA2_ALIGN - (addr & SDHCI_ADMA2_MASK)) & + SDHCI_ADMA2_MASK; if (offset) { if (data->flags & MMC_DATA_WRITE) { buffer = sdhci_kmap_atomic(sg, &flags); @@ -529,8 +529,8 @@ static int sdhci_adma_table_pre(struct sdhci_host *host, BUG_ON(offset > 65536); - align += host->align_sz; - align_addr += host->align_sz; + align += SDHCI_ADMA2_ALIGN; + align_addr += SDHCI_ADMA2_ALIGN; desc += host->desc_sz; @@ -611,7 +611,7 @@ static void sdhci_adma_table_post(struct sdhci_host *host, /* Do a quick scan of the SG list for any unaligned mappings */ has_unaligned = false; for_each_sg(data->sg, sg, host->sg_count, i) - if (sg_dma_address(sg) & host->align_mask) { + if (sg_dma_address(sg) & SDHCI_ADMA2_MASK) { has_unaligned = true; break; } @@ -623,15 +623,15 @@ static void sdhci_adma_table_post(struct sdhci_host *host, align = host->align_buffer; for_each_sg(data->sg, sg, host->sg_count, i) { - if (sg_dma_address(sg) & host->align_mask) { - size = host->align_sz - - (sg_dma_address(sg) & host->align_mask); + if (sg_dma_address(sg) & SDHCI_ADMA2_MASK) { + size = SDHCI_ADMA2_ALIGN - + (sg_dma_address(sg) & SDHCI_ADMA2_MASK); buffer = sdhci_kmap_atomic(sg, &flags); memcpy(buffer, align, size); sdhci_kunmap_atomic(buffer, &flags); - align += host->align_sz; + align += SDHCI_ADMA2_ALIGN; } } } @@ -2983,24 +2983,17 @@ int sdhci_add_host(struct sdhci_host *host) if (host->flags & SDHCI_USE_64_BIT_DMA) { host->adma_table_sz = (SDHCI_MAX_SEGS * 2 + 1) * SDHCI_ADMA2_64_DESC_SZ; - host->align_buffer_sz = SDHCI_MAX_SEGS * - SDHCI_ADMA2_64_ALIGN; host->desc_sz = SDHCI_ADMA2_64_DESC_SZ; - host->align_sz = SDHCI_ADMA2_64_ALIGN; - host->align_mask = SDHCI_ADMA2_64_ALIGN - 1; } else { host->adma_table_sz = (SDHCI_MAX_SEGS * 2 + 1) * SDHCI_ADMA2_32_DESC_SZ; - host->align_buffer_sz = SDHCI_MAX_SEGS * - SDHCI_ADMA2_32_ALIGN; host->desc_sz = SDHCI_ADMA2_32_DESC_SZ; - host->align_sz = SDHCI_ADMA2_32_ALIGN; - host->align_mask = SDHCI_ADMA2_32_ALIGN - 1; } host->adma_table = dma_alloc_coherent(mmc_dev(mmc), host->adma_table_sz, &host->adma_addr, GFP_KERNEL); + host->align_buffer_sz = SDHCI_MAX_SEGS * SDHCI_ADMA2_ALIGN; host->align_buffer = kmalloc(host->align_buffer_sz, GFP_KERNEL); if (!host->adma_table || !host->align_buffer) { if (host->adma_table) @@ -3014,7 +3007,7 @@ int sdhci_add_host(struct sdhci_host *host) host->flags &= ~SDHCI_USE_ADMA; host->adma_table = NULL; host->align_buffer = NULL; - } else if (host->adma_addr & host->align_mask) { + } else if (host->adma_addr & (SDHCI_ADMA2_DESC_ALIGN - 1)) { pr_warn("%s: unable to allocate aligned ADMA descriptor\n", mmc_hostname(mmc)); host->flags &= ~SDHCI_USE_ADMA; diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index 9c331ac5ad6b..0115e9907bf8 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -272,22 +272,27 @@ /* ADMA2 32-bit DMA descriptor size */ #define SDHCI_ADMA2_32_DESC_SZ 8 -/* ADMA2 32-bit DMA alignment */ -#define SDHCI_ADMA2_32_ALIGN 4 - /* ADMA2 32-bit descriptor */ struct sdhci_adma2_32_desc { __le16 cmd; __le16 len; __le32 addr; -} __packed __aligned(SDHCI_ADMA2_32_ALIGN); +} __packed __aligned(4); + +/* ADMA2 data alignment */ +#define SDHCI_ADMA2_ALIGN 4 +#define SDHCI_ADMA2_MASK (SDHCI_ADMA2_ALIGN - 1) + +/* + * ADMA2 descriptor alignment. Some controllers (e.g. Intel) require 8 byte + * alignment for the descriptor table even in 32-bit DMA mode. 
Memory + * allocation is at least 8 byte aligned anyway, so just stipulate 8 always. + */ +#define SDHCI_ADMA2_DESC_ALIGN 8 /* ADMA2 64-bit DMA descriptor size */ #define SDHCI_ADMA2_64_DESC_SZ 12 -/* ADMA2 64-bit DMA alignment */ -#define SDHCI_ADMA2_64_ALIGN 8 - /* * ADMA2 64-bit descriptor. Note 12-byte descriptor can't always be 8-byte * aligned. @@ -483,8 +488,6 @@ struct sdhci_host { dma_addr_t align_addr; /* Mapped bounce buffer */ unsigned int desc_sz; /* ADMA descriptor size */ - unsigned int align_sz; /* ADMA alignment */ - unsigned int align_mask; /* ADMA alignment mask */ struct tasklet_struct finish_tasklet; /* Tasklet structures */ From 07235140a4c3ed8e0d8fcb58fc51d1cbb5406ff4 Mon Sep 17 00:00:00 2001 From: Swapnil Nagle Date: Thu, 4 Feb 2016 11:45:17 -0500 Subject: [PATCH 652/813] qla2xxx: Use ATIO type to send correct tmr response [ Upstream commit d7236ac368212bd6fc8b45f050136ee53e6a6f2d ] The function value inside se_cmd can change if the TMR is cancelled. Use original ATIO Type to correctly determine CTIO response. Signed-off-by: Swapnil Nagle Signed-off-by: Himanshu Madhani Signed-off-by: Nicholas Bellinger Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/qla2xxx/qla_target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index 75514a15bea0..f57d96984ae4 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -1578,7 +1578,7 @@ void qlt_xmit_tm_rsp(struct qla_tgt_mgmt_cmd *mcmd) qlt_send_notify_ack(vha, &mcmd->orig_iocb.imm_ntfy, 0, 0, 0, 0, 0, 0); else { - if (mcmd->se_cmd.se_tmr_req->function == TMR_ABORT_TASK) + if (mcmd->orig_iocb.atio.u.raw.entry_type == ABTS_RECV_24XX) qlt_24xx_send_abts_resp(vha, &mcmd->orig_iocb.abts, mcmd->fc_tm_rsp, false); else From 8844e74f8e0f1758ad9931b480ec88dd37c8528c Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 17 Dec 2015 09:57:49 -0500 Subject: [PATCH 653/813] drm/amdgpu: fix dp link rate selection (v2) [ Upstream commit 41869c1c7fe583dec932eb3d87de2e010b30a737 ] Need to properly handle the max link rate in the dpcd. This prevents some cases where 5.4 Ghz is selected when it shouldn't be. 
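As a worked example of the selection loop added below: the supported pixel clock is (lane_num * link_rate * 8) / bpp, so with a 24 bpp mode one lane at 162000 kHz carries up to (1 * 162000 * 8) / 24 = 54000 kHz and two lanes carry 108000 kHz. The nested loops return the first (lane count, link rate) pair whose limit reaches the requested pixel clock, preferring fewer lanes and lower rates, and fail with -EINVAL when even the maximum advertised lane count and rate cannot carry the mode.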
v2: simplify logic, add array bounds check Reviewed-by: Tom St Denis Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/atombios_dp.c | 94 +++++++++--------------- 1 file changed, 35 insertions(+), 59 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/atombios_dp.c b/drivers/gpu/drm/amd/amdgpu/atombios_dp.c index 92b6acadfc52..21aacc1f45c1 100644 --- a/drivers/gpu/drm/amd/amdgpu/atombios_dp.c +++ b/drivers/gpu/drm/amd/amdgpu/atombios_dp.c @@ -243,7 +243,7 @@ static void amdgpu_atombios_dp_get_adjust_train(const u8 link_status[DP_LINK_STA /* convert bits per color to bits per pixel */ /* get bpc from the EDID */ -static int amdgpu_atombios_dp_convert_bpc_to_bpp(int bpc) +static unsigned amdgpu_atombios_dp_convert_bpc_to_bpp(int bpc) { if (bpc == 0) return 24; @@ -251,64 +251,32 @@ static int amdgpu_atombios_dp_convert_bpc_to_bpp(int bpc) return bpc * 3; } -/* get the max pix clock supported by the link rate and lane num */ -static int amdgpu_atombios_dp_get_max_dp_pix_clock(int link_rate, - int lane_num, - int bpp) -{ - return (link_rate * lane_num * 8) / bpp; -} - /***** amdgpu specific DP functions *****/ -/* First get the min lane# when low rate is used according to pixel clock - * (prefer low rate), second check max lane# supported by DP panel, - * if the max lane# < low rate lane# then use max lane# instead. - */ -static int amdgpu_atombios_dp_get_dp_lane_number(struct drm_connector *connector, +static int amdgpu_atombios_dp_get_dp_link_config(struct drm_connector *connector, const u8 dpcd[DP_DPCD_SIZE], - int pix_clock) + unsigned pix_clock, + unsigned *dp_lanes, unsigned *dp_rate) { - int bpp = amdgpu_atombios_dp_convert_bpc_to_bpp(amdgpu_connector_get_monitor_bpc(connector)); - int max_link_rate = drm_dp_max_link_rate(dpcd); - int max_lane_num = drm_dp_max_lane_count(dpcd); - int lane_num; - int max_dp_pix_clock; + unsigned bpp = + amdgpu_atombios_dp_convert_bpc_to_bpp(amdgpu_connector_get_monitor_bpc(connector)); + static const unsigned link_rates[3] = { 162000, 270000, 540000 }; + unsigned max_link_rate = drm_dp_max_link_rate(dpcd); + unsigned max_lane_num = drm_dp_max_lane_count(dpcd); + unsigned lane_num, i, max_pix_clock; - for (lane_num = 1; lane_num < max_lane_num; lane_num <<= 1) { - max_dp_pix_clock = amdgpu_atombios_dp_get_max_dp_pix_clock(max_link_rate, lane_num, bpp); - if (pix_clock <= max_dp_pix_clock) - break; + for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { + for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { + max_pix_clock = (lane_num * link_rates[i] * 8) / bpp; + if (max_pix_clock >= pix_clock) { + *dp_lanes = lane_num; + *dp_rate = link_rates[i]; + return 0; + } + } } - return lane_num; -} - -static int amdgpu_atombios_dp_get_dp_link_clock(struct drm_connector *connector, - const u8 dpcd[DP_DPCD_SIZE], - int pix_clock) -{ - int bpp = amdgpu_atombios_dp_convert_bpc_to_bpp(amdgpu_connector_get_monitor_bpc(connector)); - int lane_num, max_pix_clock; - - if (amdgpu_connector_encoder_get_dp_bridge_encoder_id(connector) == - ENCODER_OBJECT_ID_NUTMEG) - return 270000; - - lane_num = amdgpu_atombios_dp_get_dp_lane_number(connector, dpcd, pix_clock); - max_pix_clock = amdgpu_atombios_dp_get_max_dp_pix_clock(162000, lane_num, bpp); - if (pix_clock <= max_pix_clock) - return 162000; - max_pix_clock = amdgpu_atombios_dp_get_max_dp_pix_clock(270000, lane_num, bpp); - if (pix_clock <= max_pix_clock) - return 270000; - if 
(amdgpu_connector_is_dp12_capable(connector)) { - max_pix_clock = amdgpu_atombios_dp_get_max_dp_pix_clock(540000, lane_num, bpp); - if (pix_clock <= max_pix_clock) - return 540000; - } - - return drm_dp_max_link_rate(dpcd); + return -EINVAL; } static u8 amdgpu_atombios_dp_encoder_service(struct amdgpu_device *adev, @@ -422,6 +390,7 @@ void amdgpu_atombios_dp_set_link_config(struct drm_connector *connector, { struct amdgpu_connector *amdgpu_connector = to_amdgpu_connector(connector); struct amdgpu_connector_atom_dig *dig_connector; + int ret; if (!amdgpu_connector->con_priv) return; @@ -429,10 +398,14 @@ void amdgpu_atombios_dp_set_link_config(struct drm_connector *connector, if ((dig_connector->dp_sink_type == CONNECTOR_OBJECT_ID_DISPLAYPORT) || (dig_connector->dp_sink_type == CONNECTOR_OBJECT_ID_eDP)) { - dig_connector->dp_clock = - amdgpu_atombios_dp_get_dp_link_clock(connector, dig_connector->dpcd, mode->clock); - dig_connector->dp_lane_count = - amdgpu_atombios_dp_get_dp_lane_number(connector, dig_connector->dpcd, mode->clock); + ret = amdgpu_atombios_dp_get_dp_link_config(connector, dig_connector->dpcd, + mode->clock, + &dig_connector->dp_lane_count, + &dig_connector->dp_clock); + if (ret) { + dig_connector->dp_clock = 0; + dig_connector->dp_lane_count = 0; + } } } @@ -441,14 +414,17 @@ int amdgpu_atombios_dp_mode_valid_helper(struct drm_connector *connector, { struct amdgpu_connector *amdgpu_connector = to_amdgpu_connector(connector); struct amdgpu_connector_atom_dig *dig_connector; - int dp_clock; + unsigned dp_lanes, dp_clock; + int ret; if (!amdgpu_connector->con_priv) return MODE_CLOCK_HIGH; dig_connector = amdgpu_connector->con_priv; - dp_clock = - amdgpu_atombios_dp_get_dp_link_clock(connector, dig_connector->dpcd, mode->clock); + ret = amdgpu_atombios_dp_get_dp_link_config(connector, dig_connector->dpcd, + mode->clock, &dp_lanes, &dp_clock); + if (ret) + return MODE_CLOCK_HIGH; if ((dp_clock == 540000) && (!amdgpu_connector_is_dp12_capable(connector))) From 724457fa1c0ca2992bec5a3db85ac3b130409cfa Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 17 Dec 2015 10:23:34 -0500 Subject: [PATCH 654/813] drm/radeon: fix dp link rate selection (v2) [ Upstream commit 092c96a8ab9d1bd60ada2ed385cc364ce084180e ] Need to properly handle the max link rate in the dpcd. This prevents some cases where 5.4 Ghz is selected when it shouldn't be. 
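The caller contract of the shared helper then looks like this (a condensed sketch of the radeon_dp_set_link_config hunk in the diff below):

    unsigned dp_lanes, dp_clock;
    int ret;

    ret = radeon_dp_get_dp_link_config(connector, dig_connector->dpcd,
                                       mode->clock, &dp_lanes, &dp_clock);
    if (ret) {
            /* No (lane count, rate) pair can carry this mode. */
            dp_lanes = 0;
            dp_clock = 0;
    }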
v2: simplify logic, add array bounds check Reviewed-by: Tom St Denis Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/atombios_dp.c | 106 ++++++++----------------- drivers/gpu/drm/radeon/radeon_dp_mst.c | 12 ++- drivers/gpu/drm/radeon/radeon_mode.h | 6 +- 3 files changed, 48 insertions(+), 76 deletions(-) diff --git a/drivers/gpu/drm/radeon/atombios_dp.c b/drivers/gpu/drm/radeon/atombios_dp.c index bd73b4069069..44ee72e04df9 100644 --- a/drivers/gpu/drm/radeon/atombios_dp.c +++ b/drivers/gpu/drm/radeon/atombios_dp.c @@ -302,77 +302,31 @@ static int convert_bpc_to_bpp(int bpc) return bpc * 3; } -/* get the max pix clock supported by the link rate and lane num */ -static int dp_get_max_dp_pix_clock(int link_rate, - int lane_num, - int bpp) -{ - return (link_rate * lane_num * 8) / bpp; -} - /***** radeon specific DP functions *****/ -int radeon_dp_get_max_link_rate(struct drm_connector *connector, - const u8 dpcd[DP_DPCD_SIZE]) -{ - int max_link_rate; - - if (radeon_connector_is_dp12_capable(connector)) - max_link_rate = min(drm_dp_max_link_rate(dpcd), 540000); - else - max_link_rate = min(drm_dp_max_link_rate(dpcd), 270000); - - return max_link_rate; -} - -/* First get the min lane# when low rate is used according to pixel clock - * (prefer low rate), second check max lane# supported by DP panel, - * if the max lane# < low rate lane# then use max lane# instead. - */ -static int radeon_dp_get_dp_lane_number(struct drm_connector *connector, - const u8 dpcd[DP_DPCD_SIZE], - int pix_clock) +int radeon_dp_get_dp_link_config(struct drm_connector *connector, + const u8 dpcd[DP_DPCD_SIZE], + unsigned pix_clock, + unsigned *dp_lanes, unsigned *dp_rate) { int bpp = convert_bpc_to_bpp(radeon_get_monitor_bpc(connector)); - int max_link_rate = radeon_dp_get_max_link_rate(connector, dpcd); - int max_lane_num = drm_dp_max_lane_count(dpcd); - int lane_num; - int max_dp_pix_clock; + static const unsigned link_rates[3] = { 162000, 270000, 540000 }; + unsigned max_link_rate = drm_dp_max_link_rate(dpcd); + unsigned max_lane_num = drm_dp_max_lane_count(dpcd); + unsigned lane_num, i, max_pix_clock; - for (lane_num = 1; lane_num < max_lane_num; lane_num <<= 1) { - max_dp_pix_clock = dp_get_max_dp_pix_clock(max_link_rate, lane_num, bpp); - if (pix_clock <= max_dp_pix_clock) - break; + for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { + for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { + max_pix_clock = (lane_num * link_rates[i] * 8) / bpp; + if (max_pix_clock >= pix_clock) { + *dp_lanes = lane_num; + *dp_rate = link_rates[i]; + return 0; + } + } } - return lane_num; -} - -static int radeon_dp_get_dp_link_clock(struct drm_connector *connector, - const u8 dpcd[DP_DPCD_SIZE], - int pix_clock) -{ - int bpp = convert_bpc_to_bpp(radeon_get_monitor_bpc(connector)); - int lane_num, max_pix_clock; - - if (radeon_connector_encoder_get_dp_bridge_encoder_id(connector) == - ENCODER_OBJECT_ID_NUTMEG) - return 270000; - - lane_num = radeon_dp_get_dp_lane_number(connector, dpcd, pix_clock); - max_pix_clock = dp_get_max_dp_pix_clock(162000, lane_num, bpp); - if (pix_clock <= max_pix_clock) - return 162000; - max_pix_clock = dp_get_max_dp_pix_clock(270000, lane_num, bpp); - if (pix_clock <= max_pix_clock) - return 270000; - if (radeon_connector_is_dp12_capable(connector)) { - max_pix_clock = dp_get_max_dp_pix_clock(540000, lane_num, bpp); - if (pix_clock <= max_pix_clock) - return 540000; - } - - return 
radeon_dp_get_max_link_rate(connector, dpcd); + return -EINVAL; } static u8 radeon_dp_encoder_service(struct radeon_device *rdev, @@ -491,6 +445,7 @@ void radeon_dp_set_link_config(struct drm_connector *connector, { struct radeon_connector *radeon_connector = to_radeon_connector(connector); struct radeon_connector_atom_dig *dig_connector; + int ret; if (!radeon_connector->con_priv) return; @@ -498,10 +453,14 @@ void radeon_dp_set_link_config(struct drm_connector *connector, if ((dig_connector->dp_sink_type == CONNECTOR_OBJECT_ID_DISPLAYPORT) || (dig_connector->dp_sink_type == CONNECTOR_OBJECT_ID_eDP)) { - dig_connector->dp_clock = - radeon_dp_get_dp_link_clock(connector, dig_connector->dpcd, mode->clock); - dig_connector->dp_lane_count = - radeon_dp_get_dp_lane_number(connector, dig_connector->dpcd, mode->clock); + ret = radeon_dp_get_dp_link_config(connector, dig_connector->dpcd, + mode->clock, + &dig_connector->dp_lane_count, + &dig_connector->dp_clock); + if (ret) { + dig_connector->dp_clock = 0; + dig_connector->dp_lane_count = 0; + } } } @@ -510,7 +469,8 @@ int radeon_dp_mode_valid_helper(struct drm_connector *connector, { struct radeon_connector *radeon_connector = to_radeon_connector(connector); struct radeon_connector_atom_dig *dig_connector; - int dp_clock; + unsigned dp_clock, dp_lanes; + int ret; if ((mode->clock > 340000) && (!radeon_connector_is_dp12_capable(connector))) @@ -520,8 +480,12 @@ int radeon_dp_mode_valid_helper(struct drm_connector *connector, return MODE_CLOCK_HIGH; dig_connector = radeon_connector->con_priv; - dp_clock = - radeon_dp_get_dp_link_clock(connector, dig_connector->dpcd, mode->clock); + ret = radeon_dp_get_dp_link_config(connector, dig_connector->dpcd, + mode->clock, + &dp_lanes, + &dp_clock); + if (ret) + return MODE_CLOCK_HIGH; if ((dp_clock == 540000) && (!radeon_connector_is_dp12_capable(connector))) diff --git a/drivers/gpu/drm/radeon/radeon_dp_mst.c b/drivers/gpu/drm/radeon/radeon_dp_mst.c index 744f5c49c664..b431c9c2b247 100644 --- a/drivers/gpu/drm/radeon/radeon_dp_mst.c +++ b/drivers/gpu/drm/radeon/radeon_dp_mst.c @@ -525,11 +525,17 @@ static bool radeon_mst_mode_fixup(struct drm_encoder *encoder, drm_mode_set_crtcinfo(adjusted_mode, 0); { struct radeon_connector_atom_dig *dig_connector; + int ret; dig_connector = mst_enc->connector->con_priv; - dig_connector->dp_lane_count = drm_dp_max_lane_count(dig_connector->dpcd); - dig_connector->dp_clock = radeon_dp_get_max_link_rate(&mst_enc->connector->base, - dig_connector->dpcd); + ret = radeon_dp_get_dp_link_config(&mst_enc->connector->base, + dig_connector->dpcd, adjusted_mode->clock, + &dig_connector->dp_lane_count, + &dig_connector->dp_clock); + if (ret) { + dig_connector->dp_lane_count = 0; + dig_connector->dp_clock = 0; + } DRM_DEBUG_KMS("dig clock %p %d %d\n", dig_connector, dig_connector->dp_lane_count, dig_connector->dp_clock); } diff --git a/drivers/gpu/drm/radeon/radeon_mode.h b/drivers/gpu/drm/radeon/radeon_mode.h index bba112628b47..7a0666ac4e23 100644 --- a/drivers/gpu/drm/radeon/radeon_mode.h +++ b/drivers/gpu/drm/radeon/radeon_mode.h @@ -757,8 +757,10 @@ extern u8 radeon_dp_getsinktype(struct radeon_connector *radeon_connector); extern bool radeon_dp_getdpcd(struct radeon_connector *radeon_connector); extern int radeon_dp_get_panel_mode(struct drm_encoder *encoder, struct drm_connector *connector); -int radeon_dp_get_max_link_rate(struct drm_connector *connector, - const u8 *dpcd); +extern int radeon_dp_get_dp_link_config(struct drm_connector *connector, + const u8 *dpcd, + unsigned 
pix_clock, + unsigned *dp_lanes, unsigned *dp_rate); extern void radeon_dp_set_rx_power_state(struct drm_connector *connector, u8 power_state); extern void radeon_dp_aux_init(struct radeon_connector *radeon_connector); From e7c3692b809bfbcf03cbb70104dc365658b3bfb6 Mon Sep 17 00:00:00 2001 From: Sunil Goutham Date: Wed, 24 Feb 2016 16:40:50 +0530 Subject: [PATCH 655/813] net: thunderx: Fix for Qset error due to CQ full [ Upstream commit 4c0b6eaf373a5323f03a3a20c42fc435715b073d ] On Thunderx pass 1.x and pass2 due to a HW errata default CQ DROP_LEVEL of 0x80 is not sufficient to avoid CQ_WR_FULL Qset error when packets are being received at >20Mpps resulting in complete stall of packet reception. This patch will configure it to 0x100 which is what is expected by HW on Thunderx. On future passes of thunderx and other chips HW default/reset value will be 0x100 or higher hence not overwritten. Signed-off-by: Jerin Jacob Signed-off-by: Sunil Goutham Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/cavium/thunder/nic.h | 9 +++++++++ drivers/net/ethernet/cavium/thunder/nic_main.c | 6 ++++++ drivers/net/ethernet/cavium/thunder/nic_reg.h | 2 +- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h index 39ca6744a4e6..22471d283a95 100644 --- a/drivers/net/ethernet/cavium/thunder/nic.h +++ b/drivers/net/ethernet/cavium/thunder/nic.h @@ -116,6 +116,15 @@ #define NIC_PF_INTR_ID_MBOX0 8 #define NIC_PF_INTR_ID_MBOX1 9 +/* Minimum FIFO level before all packets for the CQ are dropped + * + * This value ensures that once a packet has been "accepted" + * for reception it will not get dropped due to non-availability + * of CQ descriptor. An errata in HW mandates this value to be + * atleast 0x100. 
+ */ +#define NICPF_CQM_MIN_DROP_LEVEL 0x100 + /* Global timer for CQ timer thresh interrupts * Calculated for SCLK of 700Mhz * value written should be a 1/16th of what is expected diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c b/drivers/net/ethernet/cavium/thunder/nic_main.c index 5f24d11cb16a..16baaafed26c 100644 --- a/drivers/net/ethernet/cavium/thunder/nic_main.c +++ b/drivers/net/ethernet/cavium/thunder/nic_main.c @@ -309,6 +309,7 @@ static void nic_set_lmac_vf_mapping(struct nicpf *nic) static void nic_init_hw(struct nicpf *nic) { int i; + u64 cqm_cfg; /* Enable NIC HW block */ nic_reg_write(nic, NIC_PF_CFG, 0x3); @@ -345,6 +346,11 @@ static void nic_init_hw(struct nicpf *nic) /* Enable VLAN ethertype matching and stripping */ nic_reg_write(nic, NIC_PF_RX_ETYPE_0_7, (2 << 19) | (ETYPE_ALG_VLAN_STRIP << 16) | ETH_P_8021Q); + + /* Check if HW expected value is higher (could be in future chips) */ + cqm_cfg = nic_reg_read(nic, NIC_PF_CQM_CFG); + if (cqm_cfg < NICPF_CQM_MIN_DROP_LEVEL) + nic_reg_write(nic, NIC_PF_CQM_CFG, NICPF_CQM_MIN_DROP_LEVEL); } /* Channel parse index configuration */ diff --git a/drivers/net/ethernet/cavium/thunder/nic_reg.h b/drivers/net/ethernet/cavium/thunder/nic_reg.h index dd536be20193..afb10e326b4f 100644 --- a/drivers/net/ethernet/cavium/thunder/nic_reg.h +++ b/drivers/net/ethernet/cavium/thunder/nic_reg.h @@ -21,7 +21,7 @@ #define NIC_PF_TCP_TIMER (0x0060) #define NIC_PF_BP_CFG (0x0080) #define NIC_PF_RRM_CFG (0x0088) -#define NIC_PF_CQM_CF (0x00A0) +#define NIC_PF_CQM_CFG (0x00A0) #define NIC_PF_CNM_CF (0x00A8) #define NIC_PF_CNM_STATUS (0x00B0) #define NIC_PF_CQ_AVG_CFG (0x00C0) From 950464b230e007f5206b4a4ac86aeba70524b2f6 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Wed, 24 Feb 2016 17:44:57 -0800 Subject: [PATCH 656/813] arm64: Add workaround for Cavium erratum 27456 [ Upstream commit 104a0c02e8b1936c049e18a6d4e4ab040fb61213 ] On ThunderX T88 pass 1.x through 2.1 parts, broadcast TLBI instructions may cause the icache to become corrupted if it contains data for a non-current ASID. This patch implements the workaround (which invalidates the local icache when switching the mm) by using code patching. Signed-off-by: Andrew Pinski Signed-off-by: David Daney Reviewed-by: Will Deacon Signed-off-by: Catalin Marinas Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm64/Kconfig | 11 +++++++++++ arch/arm64/include/asm/cpufeature.h | 3 ++- arch/arm64/kernel/cpu_errata.c | 9 +++++++++ arch/arm64/mm/proc.S | 12 ++++++++++++ 4 files changed, 34 insertions(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 871f21783866..c7236d1a3f64 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -401,6 +401,17 @@ config CAVIUM_ERRATUM_23154 If unsure, say Y. +config CAVIUM_ERRATUM_27456 + bool "Cavium erratum 27456: Broadcast TLBI instructions may cause icache corruption" + default y + help + On ThunderX T88 pass 1.x through 2.1 parts, broadcast TLBI + instructions may cause the icache to become corrupted if it + contains data for a non-current ASID. The fix is to + invalidate the icache when changing the mm context. + + If unsure, say Y. 
+ endmenu diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 8f271b83f910..8136afc9df0d 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -30,8 +30,9 @@ #define ARM64_HAS_LSE_ATOMICS 5 #define ARM64_WORKAROUND_CAVIUM_23154 6 #define ARM64_WORKAROUND_834220 7 +#define ARM64_WORKAROUND_CAVIUM_27456 8 -#define ARM64_NCAPS 8 +#define ARM64_NCAPS 9 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index feb6b4efa641..a3e846a28b05 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -99,6 +99,15 @@ const struct arm64_cpu_capabilities arm64_errata[] = { .capability = ARM64_WORKAROUND_CAVIUM_23154, MIDR_RANGE(MIDR_THUNDERX, 0x00, 0x01), }, +#endif +#ifdef CONFIG_CAVIUM_ERRATUM_27456 + { + /* Cavium ThunderX, T88 pass 1.x - 2.1 */ + .desc = "Cavium erratum 27456", + .capability = ARM64_WORKAROUND_CAVIUM_27456, + MIDR_RANGE(MIDR_THUNDERX, 0x00, + (1 << MIDR_VARIANT_SHIFT) | 1), + }, #endif { } diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 1f6bb29ca53b..18201e9e8cc7 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include "proc-macros.S" @@ -137,7 +139,17 @@ ENTRY(cpu_do_switch_mm) bfi x0, x1, #48, #16 // set the ASID msr ttbr0_el1, x0 // set TTBR0 isb +alternative_if_not ARM64_WORKAROUND_CAVIUM_27456 ret + nop + nop + nop +alternative_else + ic iallu + dsb nsh + isb + ret +alternative_endif ENDPROC(cpu_do_switch_mm) .section ".text.init", #alloc, #execinstr From 1b82da380dcfd030bd1ee32853b132659bb0ab4c Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Thu, 3 Mar 2016 17:54:54 +0100 Subject: [PATCH 657/813] tipc: fix nullptr crash during subscription cancel [ Upstream commit 4de13d7ed6ffdcbb34317acaa9236f121176f5f8 ] commit 4d5cfcba2f6e ('tipc: fix connection abort during subscription cancel'), removes the check for a valid subscription before calling tipc_nametbl_subscribe(). This will lead to a nullptr exception when we process a subscription cancel request. For a cancel request, a null subscription is passed to tipc_nametbl_subscribe() resulting in exception. In this commit, we call tipc_nametbl_subscribe() only for a valid subscription. Fixes: 4d5cfcba2f6e ('tipc: fix connection abort during subscription cancel') Reported-by: Anders Widell Signed-off-by: Parthasarathy Bhuvaragan Acked-by: Jon Maloy Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/tipc/subscr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 69ee2eeef968..f9ff73a8d815 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -296,7 +296,8 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid, if (tipc_subscrp_create(net, (struct tipc_subscr *)buf, subscrb, &sub)) return tipc_conn_terminate(tn->topsrv, subscrb->conid); - tipc_nametbl_subscribe(sub); + if (sub) + tipc_nametbl_subscribe(sub); } /* Handle one request to establish a new subscriber */ From a554bd7ccf3be8be0ecb430dae27fb97c4915564 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Wed, 10 Feb 2016 19:59:05 +0200 Subject: [PATCH 658/813] drm/i915: Fix hpd live status bits for g4x MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 0780cd36c7af70c55981ee624084f0f48cae9b95 ] Looks like g4x hpd live status bits actually agree with the spec. At least they do on the machine I have, and apparently on Nick Bowler's g4x as well. So gm45 may be the only platform where they don't agree. At least that seems to be the case based on the (somewhat incomplete) logs/dumps in [1], and Daniel has also tested this on his gm45 sometime in the past. So let's change the bits to match the spec on g4x. That actually makes the g4x bits identical to vlv/chv so we can just share the code between those platforms, leaving gm45 as the special case. [1] https://bugzilla.kernel.org/show_bug.cgi?id=52361 Cc: Shashank Sharma Cc: Sonika Jindal Cc: Daniel Vetter Cc: Jani Nikula Cc: Nick Bowler References: https://lists.freedesktop.org/archives/dri-devel/2016-February/100382.html Reported-by: Nick Bowler Cc: stable@vger.kernel.org Fixes: 237ed86c693d ("drm/i915: Check live status before reading edid") Signed-off-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1455127145-20087-1-git-send-email-ville.syrjala@linux.intel.com Reviewed-by: Daniel Vetter Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_reg.h | 15 ++++++++------- drivers/gpu/drm/i915/intel_dp.c | 14 +++++++------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 9ed9f6dde86f..cace154bbdc0 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -3240,19 +3240,20 @@ enum skl_disp_power_wells { #define PORT_HOTPLUG_STAT (dev_priv->info.display_mmio_offset + 0x61114) /* - * HDMI/DP bits are gen4+ + * HDMI/DP bits are g4x+ * * WARNING: Bspec for hpd status bits on gen4 seems to be completely confused. * Please check the detailed lore in the commit message for for experimental * evidence. 
*/ -#define PORTD_HOTPLUG_LIVE_STATUS_G4X (1 << 29) +/* Bspec says GM45 should match G4X/VLV/CHV, but reality disagrees */ +#define PORTD_HOTPLUG_LIVE_STATUS_GM45 (1 << 29) +#define PORTC_HOTPLUG_LIVE_STATUS_GM45 (1 << 28) +#define PORTB_HOTPLUG_LIVE_STATUS_GM45 (1 << 27) +/* G4X/VLV/CHV DP/HDMI bits again match Bspec */ +#define PORTD_HOTPLUG_LIVE_STATUS_G4X (1 << 27) #define PORTC_HOTPLUG_LIVE_STATUS_G4X (1 << 28) -#define PORTB_HOTPLUG_LIVE_STATUS_G4X (1 << 27) -/* VLV DP/HDMI bits again match Bspec */ -#define PORTD_HOTPLUG_LIVE_STATUS_VLV (1 << 27) -#define PORTC_HOTPLUG_LIVE_STATUS_VLV (1 << 28) -#define PORTB_HOTPLUG_LIVE_STATUS_VLV (1 << 29) +#define PORTB_HOTPLUG_LIVE_STATUS_G4X (1 << 29) #define PORTD_HOTPLUG_INT_STATUS (3 << 21) #define PORTD_HOTPLUG_INT_LONG_PULSE (2 << 21) #define PORTD_HOTPLUG_INT_SHORT_PULSE (1 << 21) diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c index 8e1d6d74c203..1f8a10fb95ab 100644 --- a/drivers/gpu/drm/i915/intel_dp.c +++ b/drivers/gpu/drm/i915/intel_dp.c @@ -4592,20 +4592,20 @@ static bool g4x_digital_port_connected(struct drm_i915_private *dev_priv, return I915_READ(PORT_HOTPLUG_STAT) & bit; } -static bool vlv_digital_port_connected(struct drm_i915_private *dev_priv, - struct intel_digital_port *port) +static bool gm45_digital_port_connected(struct drm_i915_private *dev_priv, + struct intel_digital_port *port) { u32 bit; switch (port->port) { case PORT_B: - bit = PORTB_HOTPLUG_LIVE_STATUS_VLV; + bit = PORTB_HOTPLUG_LIVE_STATUS_GM45; break; case PORT_C: - bit = PORTC_HOTPLUG_LIVE_STATUS_VLV; + bit = PORTC_HOTPLUG_LIVE_STATUS_GM45; break; case PORT_D: - bit = PORTD_HOTPLUG_LIVE_STATUS_VLV; + bit = PORTD_HOTPLUG_LIVE_STATUS_GM45; break; default: MISSING_CASE(port->port); @@ -4657,8 +4657,8 @@ bool intel_digital_port_connected(struct drm_i915_private *dev_priv, return cpt_digital_port_connected(dev_priv, port); else if (IS_BROXTON(dev_priv)) return bxt_digital_port_connected(dev_priv, port); - else if (IS_VALLEYVIEW(dev_priv)) - return vlv_digital_port_connected(dev_priv, port); + else if (IS_GM45(dev_priv)) + return gm45_digital_port_connected(dev_priv, port); else return g4x_digital_port_connected(dev_priv, port); } From 5fb01ac92f6138d1f81f54766c80a23bc63e896d Mon Sep 17 00:00:00 2001 From: Libin Yang Date: Thu, 14 Jan 2016 14:09:00 +0800 Subject: [PATCH 659/813] ALSA: hda - add codec support for Kabylake display audio codec [ Upstream commit 91815d8aa7e2f45d30e51caa297061ad893628d9 ] This patch adds codec ID (0x8086280b) for Kabylake display codec and apply the hsw fix-ups to Kabylake. 
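The change itself is a plain vendor-ID match: every Intel display codec is keyed by its 32-bit vendor/device ID, and the platform-check macros are simple comparisons against those IDs. A minimal standalone sketch of that dispatch, with the IDs taken from the patch and the table and helper names invented purely for illustration:

  #include <stdio.h>

  /* Codec IDs from the patch; 0x8086 is the Intel vendor prefix. The
   * table and helper below are illustrative only. */
  struct codec_entry {
      unsigned int id;
      const char *name;
  };

  static const struct codec_entry display_codecs[] = {
      { 0x80862808, "Broadwell HDMI" },
      { 0x80862809, "Skylake HDMI" },
      { 0x8086280a, "Broxton HDMI" },
      { 0x8086280b, "Kabylake HDMI" }, /* the entry this patch adds */
  };

  static int is_kabylake(unsigned int id) { return id == 0x8086280b; }

  int main(void)
  {
      unsigned int probed = 0x8086280b; /* assumed value read from hardware */
      size_t i;

      for (i = 0; i < sizeof(display_codecs) / sizeof(display_codecs[0]); i++)
          if (display_codecs[i].id == probed)
              printf("matched %s (kabylake=%d)\n",
                     display_codecs[i].name, is_kabylake(probed));
      return 0;
  }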
Signed-off-by: Libin Yang Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_hdmi.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index f7bcd8dbac14..a8045b8a2a18 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -51,8 +51,10 @@ MODULE_PARM_DESC(static_hdmi_pcm, "Don't restrict PCM parameters per ELD info"); #define is_broadwell(codec) ((codec)->core.vendor_id == 0x80862808) #define is_skylake(codec) ((codec)->core.vendor_id == 0x80862809) #define is_broxton(codec) ((codec)->core.vendor_id == 0x8086280a) +#define is_kabylake(codec) ((codec)->core.vendor_id == 0x8086280b) #define is_haswell_plus(codec) (is_haswell(codec) || is_broadwell(codec) \ - || is_skylake(codec) || is_broxton(codec)) + || is_skylake(codec) || is_broxton(codec) \ + || is_kabylake(codec)) #define is_valleyview(codec) ((codec)->core.vendor_id == 0x80862882) #define is_cherryview(codec) ((codec)->core.vendor_id == 0x80862883) @@ -3584,6 +3586,7 @@ HDA_CODEC_ENTRY(0x80862807, "Haswell HDMI", patch_generic_hdmi), HDA_CODEC_ENTRY(0x80862808, "Broadwell HDMI", patch_generic_hdmi), HDA_CODEC_ENTRY(0x80862809, "Skylake HDMI", patch_generic_hdmi), HDA_CODEC_ENTRY(0x8086280a, "Broxton HDMI", patch_generic_hdmi), +HDA_CODEC_ENTRY(0x8086280b, "Kabylake HDMI", patch_generic_hdmi), HDA_CODEC_ENTRY(0x80862880, "CedarTrail HDMI", patch_generic_hdmi), HDA_CODEC_ENTRY(0x80862882, "Valleyview2 HDMI", patch_generic_hdmi), HDA_CODEC_ENTRY(0x80862883, "Braswell HDMI", patch_generic_hdmi), From f2b8424f35f5d457c2d35b3823ee0fb272e77728 Mon Sep 17 00:00:00 2001 From: Gavin Guo Date: Wed, 20 Jan 2016 12:36:58 +0800 Subject: [PATCH 660/813] sched/numa: Fix use-after-free bug in the task_numa_compare [ Upstream commit 1dff76b92f69051e579bdc131e01500da9fa2a91 ] The following message can be observed on the Ubuntu v3.13.0-65 with KASan backported: ================================================================== BUG: KASan: use after free in task_numa_find_cpu+0x64c/0x890 at addr ffff880dd393ecd8 Read of size 8 by task qemu-system-x86/3998900 ============================================================================= BUG kmalloc-128 (Tainted: G B ): kasan: bad access detected ----------------------------------------------------------------------------- INFO: Allocated in task_numa_fault+0xc1b/0xed0 age=41980 cpu=18 pid=3998890 __slab_alloc+0x4f8/0x560 __kmalloc+0x1eb/0x280 task_numa_fault+0xc1b/0xed0 do_numa_page+0x192/0x200 handle_mm_fault+0x808/0x1160 __do_page_fault+0x218/0x750 do_page_fault+0x1a/0x70 page_fault+0x28/0x30 SyS_poll+0x66/0x1a0 system_call_fastpath+0x1a/0x1f INFO: Freed in task_numa_free+0x1d2/0x200 age=62 cpu=18 pid=0 __slab_free+0x2ab/0x3f0 kfree+0x161/0x170 task_numa_free+0x1d2/0x200 finish_task_switch+0x1d2/0x210 __schedule+0x5d4/0xc60 schedule_preempt_disabled+0x40/0xc0 cpu_startup_entry+0x2da/0x340 start_secondary+0x28f/0x360 Call Trace: [] dump_stack+0x45/0x56 [] print_trailer+0xfd/0x170 [] object_err+0x36/0x40 [] kasan_report_error+0x1e9/0x3a0 [] kasan_report+0x40/0x50 [] ? task_numa_find_cpu+0x64c/0x890 [] __asan_load8+0x69/0xa0 [] ? find_next_bit+0xd8/0x120 [] task_numa_find_cpu+0x64c/0x890 [] task_numa_migrate+0x4ac/0x7b0 [] numa_migrate_preferred+0xb3/0xc0 [] task_numa_fault+0xb88/0xed0 [] do_numa_page+0x192/0x200 [] handle_mm_fault+0x808/0x1160 [] ? sched_clock_cpu+0x10d/0x160 [] ? native_load_tls+0x82/0xa0 [] __do_page_fault+0x218/0x750 [] ? 
hrtimer_try_to_cancel+0x76/0x160 [] ? schedule_hrtimeout_range_clock.part.24+0xf7/0x1c0 [] do_page_fault+0x1a/0x70 [] page_fault+0x28/0x30 [] ? do_sys_poll+0x1c4/0x6d0 [] ? enqueue_task_fair+0x4b6/0xaa0 [] ? sched_clock+0x9/0x10 [] ? resched_task+0x7a/0xc0 [] ? check_preempt_curr+0xb3/0x130 [] ? poll_select_copy_remaining+0x170/0x170 [] ? wake_up_state+0x10/0x20 [] ? drop_futex_key_refs.isra.14+0x1f/0x90 [] ? futex_requeue+0x3de/0xba0 [] ? do_futex+0xbe/0x8f0 [] ? read_tsc+0x9/0x20 [] ? ktime_get_ts+0x12d/0x170 [] ? timespec_add_safe+0x59/0xe0 [] SyS_poll+0x66/0x1a0 [] system_call_fastpath+0x1a/0x1f As commit 1effd9f19324 ("sched/numa: Fix unsafe get_task_struct() in task_numa_assign()") points out, the rcu_read_lock() cannot protect the task_struct from being freed in the finish_task_switch(). And the bug happens in the process of calculation of imp which requires the access of p->numa_faults being freed in the following path: do_exit() current->flags |= PF_EXITING; release_task() ~~delayed_put_task_struct()~~ schedule() ... ... rq->curr = next; context_switch() finish_task_switch() put_task_struct() __put_task_struct() task_numa_free() The fix here to get_task_struct() early before end of dst_rq->lock to protect the calculation process and also put_task_struct() in the corresponding point if finally the dst_rq->curr somehow cannot be assigned. Additional credit to Liang Chen who helped fix the error logic and add the put_task_struct() to the place it missed. Signed-off-by: Gavin Guo Signed-off-by: Peter Zijlstra (Intel) Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Hugh Dickins Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: jay.vosburgh@canonical.com Cc: liang.chen@canonical.com Link: http://lkml.kernel.org/r/1453264618-17645-1-git-send-email-gavin.guo@canonical.com Signed-off-by: Ingo Molnar Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- kernel/sched/fair.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b8b516c37bf1..8f258f437ac2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1191,8 +1191,6 @@ static void task_numa_assign(struct task_numa_env *env, { if (env->best_task) put_task_struct(env->best_task); - if (p) - get_task_struct(p); env->best_task = p; env->best_imp = imp; @@ -1260,20 +1258,30 @@ static void task_numa_compare(struct task_numa_env *env, long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; int dist = env->dist; + bool assigned = false; rcu_read_lock(); raw_spin_lock_irq(&dst_rq->lock); cur = dst_rq->curr; /* - * No need to move the exiting task, and this ensures that ->curr - * wasn't reaped and thus get_task_struct() in task_numa_assign() - * is safe under RCU read lock. - * Note that rcu_read_lock() itself can't protect from the final - * put_task_struct() after the last schedule(). + * No need to move the exiting task or idle task. 
*/ if ((cur->flags & PF_EXITING) || is_idle_task(cur)) cur = NULL; + else { + /* + * The task_struct must be protected here to protect the + * p->numa_faults access in the task_weight since the + * numa_faults could already be freed in the following path: + * finish_task_switch() + * --> put_task_struct() + * --> __put_task_struct() + * --> task_numa_free() + */ + get_task_struct(cur); + } + raw_spin_unlock_irq(&dst_rq->lock); /* @@ -1357,6 +1365,7 @@ balance: */ if (!load_too_imbalanced(src_load, dst_load, env)) { imp = moveimp - 1; + put_task_struct(cur); cur = NULL; goto assign; } @@ -1382,9 +1391,16 @@ balance: env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); assign: + assigned = true; task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); + /* + * The dst_rq->curr isn't assigned. The protection for task_struct is + * finished. + */ + if (cur && !assigned) + put_task_struct(cur); } static void task_numa_find_cpu(struct task_numa_env *env, From 840a59324e631a167c0aab4136024e58374ddd2c Mon Sep 17 00:00:00 2001 From: Aviv Greenberg Date: Fri, 16 Oct 2015 08:48:51 -0300 Subject: [PATCH 661/813] UVC: Add support for R200 depth camera [ Upstream commit 5d8d8db851ef81337e7026b32a9d5a9cfb2271d5 ] Add support for Intel R200 depth camera in uvc driver. This includes adding new uvc GUIDs for the new pixel formats, adding new V4L pixel format definition to user api headers, and updating the uvc driver GUID-to-4cc tables with the new formats. Tested-by: Greenberg, Aviv D Signed-off-by: Aviv Greenberg Signed-off-by: Sakari Ailus Signed-off-by: Guennadi Liakhovetski Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/media/usb/uvc/uvc_driver.c | 20 ++++++++++++++++++++ drivers/media/usb/uvc/uvcvideo.h | 12 ++++++++++++ include/uapi/linux/videodev2.h | 3 +++ 3 files changed, 35 insertions(+) diff --git a/drivers/media/usb/uvc/uvc_driver.c b/drivers/media/usb/uvc/uvc_driver.c index d11fd6ac2df0..5cefca95734e 100644 --- a/drivers/media/usb/uvc/uvc_driver.c +++ b/drivers/media/usb/uvc/uvc_driver.c @@ -148,6 +148,26 @@ static struct uvc_format_desc uvc_fmts[] = { .guid = UVC_GUID_FORMAT_H264, .fcc = V4L2_PIX_FMT_H264, }, + { + .name = "Greyscale 8 L/R (Y8I)", + .guid = UVC_GUID_FORMAT_Y8I, + .fcc = V4L2_PIX_FMT_Y8I, + }, + { + .name = "Greyscale 12 L/R (Y12I)", + .guid = UVC_GUID_FORMAT_Y12I, + .fcc = V4L2_PIX_FMT_Y12I, + }, + { + .name = "Depth data 16-bit (Z16)", + .guid = UVC_GUID_FORMAT_Z16, + .fcc = V4L2_PIX_FMT_Z16, + }, + { + .name = "Bayer 10-bit (SRGGB10P)", + .guid = UVC_GUID_FORMAT_RW10, + .fcc = V4L2_PIX_FMT_SRGGB10P, + }, }; /* ------------------------------------------------------------------------ diff --git a/drivers/media/usb/uvc/uvcvideo.h b/drivers/media/usb/uvc/uvcvideo.h index f0f2391e1b43..7e4d3eea371b 100644 --- a/drivers/media/usb/uvc/uvcvideo.h +++ b/drivers/media/usb/uvc/uvcvideo.h @@ -119,6 +119,18 @@ #define UVC_GUID_FORMAT_H264 \ { 'H', '2', '6', '4', 0x00, 0x00, 0x10, 0x00, \ 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_Y8I \ + { 'Y', '8', 'I', ' ', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_Y12I \ + { 'Y', '1', '2', 'I', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_Z16 \ + { 'Z', '1', '6', ' ', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_RW10 \ + { 'R', 'W', '1', '0', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 
0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} /* ------------------------------------------------------------------------ * Driver specific constants. diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index a0e87d16b726..421d27413731 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -621,6 +621,9 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_JPGL v4l2_fourcc('J', 'P', 'G', 'L') /* JPEG-Lite */ #define V4L2_PIX_FMT_SE401 v4l2_fourcc('S', '4', '0', '1') /* se401 janggu compressed rgb */ #define V4L2_PIX_FMT_S5C_UYVY_JPG v4l2_fourcc('S', '5', 'C', 'I') /* S5C73M3 interleaved UYVY/JPEG */ +#define V4L2_PIX_FMT_Y8I v4l2_fourcc('Y', '8', 'I', ' ') /* Greyscale 8-bit L/R interleaved */ +#define V4L2_PIX_FMT_Y12I v4l2_fourcc('Y', '1', '2', 'I') /* Greyscale 12-bit L/R interleaved */ +#define V4L2_PIX_FMT_Z16 v4l2_fourcc('Z', '1', '6', ' ') /* Depth data 16-bit */ /* SDR formats - used only for Software Defined Radio devices */ #define V4L2_SDR_FMT_CU8 v4l2_fourcc('C', 'U', '0', '8') /* IQ u8 */ From 4ab4d8c00baf7f8fd0859599e3e72bb68d7daf42 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 26 Nov 2015 14:00:46 +0200 Subject: [PATCH 662/813] mmc: sdhci: Do not BUG on invalid vdd [ Upstream commit 9d5de93f6d543b356e39e225988ef443a7bce34c ] The driver may not be able to set the power correctly but that is not a reason to BUG(). Signed-off-by: Adrian Hunter Reviewed-by: Venu Byravarasu Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/sdhci.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index fc9891fdb835..552a34dc4f82 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1315,7 +1315,9 @@ static void sdhci_set_power(struct sdhci_host *host, unsigned char mode, pwr = SDHCI_POWER_330; break; default: - BUG(); + WARN(1, "%s: Invalid vdd %#x\n", + mmc_hostname(host->mmc), vdd); + break; } } From 3b763fc8f05e755a0c1baa1ba6c2daaf87fa5ac5 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 2 Mar 2016 00:13:37 +0200 Subject: [PATCH 663/813] net/mlx5e: Don't try to modify CQ moderation if it is not supported [ Upstream commit 7524a5d88b94afef8397a79f1e664af5b7052c22 ] If CQ moderation is not supported by the device, print a warning on netdevice load, and return error when trying to modify/query cq moderation via ethtool. Fixes: f62b8bb8f2d3 ('net/mlx5: Extend mlx5_core to support ConnectX-4 Ethernet functionality') Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 6 ++++++ drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 12 ++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 2e022e900939..2094898c78c4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -399,6 +399,9 @@ static int mlx5e_get_coalesce(struct net_device *netdev, { struct mlx5e_priv *priv = netdev_priv(netdev); + if (!MLX5_CAP_GEN(priv->mdev, cq_moderation)) + return -ENOTSUPP; + coal->rx_coalesce_usecs = priv->params.rx_cq_moderation_usec; coal->rx_max_coalesced_frames = priv->params.rx_cq_moderation_pkts; coal->tx_coalesce_usecs = priv->params.tx_cq_moderation_usec; @@ -416,6 +419,9 @@ static int mlx5e_set_coalesce(struct net_device *netdev, int tc; int i; + if (!MLX5_CAP_GEN(mdev, cq_moderation)) + return -ENOTSUPP; + priv->params.tx_cq_moderation_usec = coal->tx_coalesce_usecs; priv->params.tx_cq_moderation_pkts = coal->tx_max_coalesced_frames; priv->params.rx_cq_moderation_usec = coal->rx_coalesce_usecs; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index cbd17e25beeb..90e876ecc720 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -863,12 +863,10 @@ static int mlx5e_open_cq(struct mlx5e_channel *c, if (err) goto err_destroy_cq; - err = mlx5_core_modify_cq_moderation(mdev, &cq->mcq, - moderation_usecs, - moderation_frames); - if (err) - goto err_destroy_cq; - + if (MLX5_CAP_GEN(mdev, cq_moderation)) + mlx5_core_modify_cq_moderation(mdev, &cq->mcq, + moderation_usecs, + moderation_frames); return 0; err_destroy_cq: @@ -1963,6 +1961,8 @@ static int mlx5e_check_required_hca_cap(struct mlx5_core_dev *mdev) } if (!MLX5_CAP_ETH(mdev, self_lb_en_modifiable)) mlx5_core_warn(mdev, "Self loop back prevention is not supported\n"); + if (!MLX5_CAP_GEN(mdev, cq_moderation)) + mlx5_core_warn(mdev, "CQ modiration is not supported\n"); return 0; } From 160790a044c27bbd8b54bed71a7725f7ef6c1964 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 2 Mar 2016 00:13:38 +0200 Subject: [PATCH 664/813] net/mlx5e: Don't modify CQ before it was created [ Upstream commit 2fcb92fbd04eef26dfe7e67839da6262d83d6b65 ] Calling mlx5e_set_coalesce while the interface is down will result in modifying CQs that don't exist. Fixes: f62b8bb8f2d3 ('net/mlx5: Extend mlx5_core to support ConnectX-4 Ethernet functionality') Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 2094898c78c4..7cc9df717323 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -422,11 +422,15 @@ static int mlx5e_set_coalesce(struct net_device *netdev, if (!MLX5_CAP_GEN(mdev, cq_moderation)) return -ENOTSUPP; + mutex_lock(&priv->state_lock); priv->params.tx_cq_moderation_usec = coal->tx_coalesce_usecs; priv->params.tx_cq_moderation_pkts = coal->tx_max_coalesced_frames; priv->params.rx_cq_moderation_usec = coal->rx_coalesce_usecs; priv->params.rx_cq_moderation_pkts = coal->rx_max_coalesced_frames; + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) + goto out; + for (i = 0; i < priv->params.num_channels; ++i) { c = priv->channel[i]; @@ -442,6 +446,8 @@ static int mlx5e_set_coalesce(struct net_device *netdev, coal->rx_max_coalesced_frames); } +out: + mutex_unlock(&priv->state_lock); return 0; } From 1e455f2fdb4bdd372b89b7c1c1cd80f70977010b Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 16 Nov 2015 14:35:48 +0100 Subject: [PATCH 665/813] s390/pci_dma: fix DMA table corruption with > 4 TB main memory [ Upstream commit 69eea95c48857c9dfcac120d6acea43027627b28 ] DMA addresses returned from map_page() are calculated by using an iommu bitmap plus a start_dma offset. The size of this bitmap is based on the main memory size. If we have more than (4 TB - start_dma) main memory, the DMA address calculation will also produce addresses > 4 TB. Such addresses cannot be inserted in the 3-level DMA page table, instead the entries modulo 4 TB will be overwritten. Fix this by restricting the iommu bitmap size to (4 TB - start_dma). Also set zdev->end_dma to the actual end address of the usable range, instead of the theoretical maximum as reported by the hardware, which fixes a sanity check in dma_map() and also the IOMMU API domain geometry aperture calculation. 
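The clamp is easy to check in isolation. Below is a minimal sketch with assumed example values (an 8 TB machine and a DMA window starting at 4 GB; none of these numbers come from real hardware):

  #include <stdio.h>
  #include <stdint.h>

  #define ZPCI_TABLE_SIZE_RT (1ULL << 42) /* 4 TB, 3-level table reach */

  static uint64_t min3(uint64_t a, uint64_t b, uint64_t c)
  {
      uint64_t m = a < b ? a : b;
      return m < c ? m : c;
  }

  int main(void)
  {
      uint64_t memory_size = 8ULL << 40;       /* assumed: 8 TB main memory */
      uint64_t start_dma   = 1ULL << 32;       /* assumed: window starts at 4 GB */
      uint64_t hw_end_dma  = (1ULL << 48) - 1; /* assumed hardware (CLP) limit */

      uint64_t iommu_size = min3(memory_size,
                                 ZPCI_TABLE_SIZE_RT - start_dma,
                                 hw_end_dma - start_dma + 1);
      uint64_t end_dma = start_dma + iommu_size - 1;

      /* prints iommu_size=0x3ff00000000 end_dma=0x3ffffffffff */
      printf("iommu_size=%#llx end_dma=%#llx\n",
             (unsigned long long)iommu_size, (unsigned long long)end_dma);
      return 0;
  }

With the old sizing based on main memory alone, end_dma would have landed well above the 4 TB reach of a 3-level table; with the min3() clamp the usable window ends just below it.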
Signed-off-by: Gerald Schaefer Reviewed-by: Sebastian Ott Signed-off-by: Martin Schwidefsky Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/s390/include/asm/pci_dma.h | 2 ++ arch/s390/pci/pci.c | 3 +-- arch/s390/pci/pci_dma.c | 19 ++++++++++++++----- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/arch/s390/include/asm/pci_dma.h b/arch/s390/include/asm/pci_dma.h index 1aac41e83ea1..92df3eb8d14e 100644 --- a/arch/s390/include/asm/pci_dma.h +++ b/arch/s390/include/asm/pci_dma.h @@ -23,6 +23,8 @@ enum zpci_ioat_dtype { #define ZPCI_IOTA_FS_2G 2 #define ZPCI_KEY (PAGE_DEFAULT_KEY << 5) +#define ZPCI_TABLE_SIZE_RT (1UL << 42) + #define ZPCI_IOTA_STO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_ST) #define ZPCI_IOTA_RTTO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_RT) #define ZPCI_IOTA_RSTO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_RS) diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 19442395f413..f2f6720a3331 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -701,8 +701,7 @@ static int zpci_restore(struct device *dev) goto out; zpci_map_resources(pdev); - zpci_register_ioat(zdev, 0, zdev->start_dma + PAGE_OFFSET, - zdev->start_dma + zdev->iommu_size - 1, + zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, (u64) zdev->dma_table); out: diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c index d348f2c09a1e..3a40f718baef 100644 --- a/arch/s390/pci/pci_dma.c +++ b/arch/s390/pci/pci_dma.c @@ -458,7 +458,19 @@ int zpci_dma_init_device(struct zpci_dev *zdev) goto out_clean; } - zdev->iommu_size = (unsigned long) high_memory - PAGE_OFFSET; + /* + * Restrict the iommu bitmap size to the minimum of the following: + * - main memory size + * - 3-level pagetable address limit minus start_dma offset + * - DMA address range allowed by the hardware (clp query pci fn) + * + * Also set zdev->end_dma to the actual end address of the usable + * range, instead of the theoretical maximum as reported by hardware. + */ + zdev->iommu_size = min3((u64) high_memory, + ZPCI_TABLE_SIZE_RT - zdev->start_dma, + zdev->end_dma - zdev->start_dma + 1); + zdev->end_dma = zdev->start_dma + zdev->iommu_size - 1; zdev->iommu_pages = zdev->iommu_size >> PAGE_SHIFT; zdev->iommu_bitmap = vzalloc(zdev->iommu_pages / 8); if (!zdev->iommu_bitmap) { @@ -466,10 +478,7 @@ int zpci_dma_init_device(struct zpci_dev *zdev) goto out_reg; } - rc = zpci_register_ioat(zdev, - 0, - zdev->start_dma + PAGE_OFFSET, - zdev->start_dma + zdev->iommu_size - 1, + rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, (u64) zdev->dma_table); if (rc) goto out_reg; From c77d6c3c88f359fb760664a2cf054785b69a4f71 Mon Sep 17 00:00:00 2001 From: Ching Huang Date: Wed, 25 Nov 2015 19:36:02 +0800 Subject: [PATCH 666/813] arcmsr: fixed getting wrong configuration data [ Upstream commit 251e2d25bfb72b69edd414abfa42a41191d9657a ] Fixed getting wrong configuration data of adapter type B and type D. Signed-off-by: Ching Huang Reviewed-by: Hannes Reinicke Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/arcmsr/arcmsr_hba.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c index 333db5953607..397cdd52fbfe 100644 --- a/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/drivers/scsi/arcmsr/arcmsr_hba.c @@ -2694,15 +2694,15 @@ static bool arcmsr_hbaB_get_config(struct AdapterControlBlock *acb) acb->firm_model, acb->firm_version); - acb->signature = readl(®->message_rwbuffer[1]); + acb->signature = readl(®->message_rwbuffer[0]); /*firm_signature,1,00-03*/ - acb->firm_request_len = readl(®->message_rwbuffer[2]); + acb->firm_request_len = readl(®->message_rwbuffer[1]); /*firm_request_len,1,04-07*/ - acb->firm_numbers_queue = readl(®->message_rwbuffer[3]); + acb->firm_numbers_queue = readl(®->message_rwbuffer[2]); /*firm_numbers_queue,2,08-11*/ - acb->firm_sdram_size = readl(®->message_rwbuffer[4]); + acb->firm_sdram_size = readl(®->message_rwbuffer[3]); /*firm_sdram_size,3,12-15*/ - acb->firm_hd_channels = readl(®->message_rwbuffer[5]); + acb->firm_hd_channels = readl(®->message_rwbuffer[4]); /*firm_ide_channels,4,16-19*/ acb->firm_cfg_version = readl(®->message_rwbuffer[25]); /*firm_cfg_version,25,100-103*/ /*firm_ide_channels,4,16-19*/ @@ -2880,15 +2880,15 @@ static bool arcmsr_hbaD_get_config(struct AdapterControlBlock *acb) iop_device_map++; count--; } - acb->signature = readl(®->msgcode_rwbuffer[1]); + acb->signature = readl(®->msgcode_rwbuffer[0]); /*firm_signature,1,00-03*/ - acb->firm_request_len = readl(®->msgcode_rwbuffer[2]); + acb->firm_request_len = readl(®->msgcode_rwbuffer[1]); /*firm_request_len,1,04-07*/ - acb->firm_numbers_queue = readl(®->msgcode_rwbuffer[3]); + acb->firm_numbers_queue = readl(®->msgcode_rwbuffer[2]); /*firm_numbers_queue,2,08-11*/ - acb->firm_sdram_size = readl(®->msgcode_rwbuffer[4]); + acb->firm_sdram_size = readl(®->msgcode_rwbuffer[3]); /*firm_sdram_size,3,12-15*/ - acb->firm_hd_channels = readl(®->msgcode_rwbuffer[5]); + acb->firm_hd_channels = readl(®->msgcode_rwbuffer[4]); /*firm_hd_channels,4,16-19*/ acb->firm_cfg_version = readl(®->msgcode_rwbuffer[25]); pr_notice("Areca RAID Controller%d: Model %s, F/W %s\n", From def391c897a2a99da371b74bd70f32f0f80dfed3 Mon Sep 17 00:00:00 2001 From: Ching Huang Date: Wed, 25 Nov 2015 19:41:23 +0800 Subject: [PATCH 667/813] arcmsr: fixes not release allocated resource [ Upstream commit 98f90debc2b64a40a416dd9794ac2d8de6b43af2 ] Releasing allocated resource if get configuration data failed. Signed-off-by: Ching Huang Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinicke Reviewed-by: Tomas Henzl Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/arcmsr/arcmsr_hba.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c index 397cdd52fbfe..41f9a00e4f74 100644 --- a/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/drivers/scsi/arcmsr/arcmsr_hba.c @@ -2664,7 +2664,7 @@ static bool arcmsr_hbaB_get_config(struct AdapterControlBlock *acb) if (!arcmsr_hbaB_wait_msgint_ready(acb)) { printk(KERN_NOTICE "arcmsr%d: wait 'get adapter firmware \ miscellaneous data' timeout \n", acb->host->host_no); - return false; + goto err_free_dma; } count = 8; while (count){ @@ -2707,6 +2707,10 @@ static bool arcmsr_hbaB_get_config(struct AdapterControlBlock *acb) acb->firm_cfg_version = readl(®->message_rwbuffer[25]); /*firm_cfg_version,25,100-103*/ /*firm_ide_channels,4,16-19*/ return true; +err_free_dma: + dma_free_coherent(&acb->pdev->dev, acb->roundup_ccbsize, + acb->dma_coherent2, acb->dma_coherent_handle2); + return false; } static bool arcmsr_hbaC_get_config(struct AdapterControlBlock *pACB) From d0d84ae99d3c11775896420ff2e4bd2a084e1f90 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 27 Jan 2016 22:29:34 -0800 Subject: [PATCH 668/813] Drivers: hv: vmbus: avoid infinite loop in init_vp_index() [ Upstream commit 79fd8e706637a5c7c41f9498fe0fbfb437abfdc8 ] When we pick a CPU to use for a new subchannel we try find a non-used one on the appropriate NUMA node, we keep track of them with the primary->alloced_cpus_in_node mask. Under normal circumstances we don't run out of available CPUs but it is possible when we we don't initialize some cpus in Linux, e.g. when we boot with 'nr_cpus=' limitation. Avoid the infinite loop in init_vp_index() by checking that we still have non-used CPUs in the alloced_cpus_in_node mask and resetting it in case we don't. Signed-off-by: Vitaly Kuznetsov Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/hv/channel_mgmt.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index bd2e9f60272a..08f922356442 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -459,6 +459,17 @@ static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_gui cpumask_of_node(primary->numa_node)); cur_cpu = -1; + + /* + * Normally Hyper-V host doesn't create more subchannels than there + * are VCPUs on the node but it is possible when not all present VCPUs + * on the node are initialized by guest. Clear the alloced_cpus_in_node + * to start over. + */ + if (cpumask_equal(&primary->alloced_cpus_in_node, + cpumask_of_node(primary->numa_node))) + cpumask_clear(&primary->alloced_cpus_in_node); + while (true) { cur_cpu = cpumask_next(cur_cpu, &available_mask); if (cur_cpu >= nr_cpu_ids) { From d6d72d7cbe464e3776370b65f518866953cb4e12 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 27 Jan 2016 22:29:35 -0800 Subject: [PATCH 669/813] Drivers: hv: vmbus: avoid scheduling in interrupt context in vmbus_initiate_unload() [ Upstream commit 415719160de3fae3bb9cbc617664649919cd00d0 ] We have to call vmbus_initiate_unload() on crash to make kdump work but the crash can also be happening in interrupt (e.g. Sysrq + c results in such) where we can't schedule or the following will happen: [ 314.905786] bad: scheduling from the idle thread! 
Just skipping the wait (and even adding some random wait here) won't help: to make host-side magic working we're supposed to receive CHANNELMSG_UNLOAD (and actually confirm the fact that we received it) but we can't use interrupt-base path (vmbus_isr()-> vmbus_on_msg_dpc()). Implement a simple busy wait ignoring all the other messages and use it if we're in an interrupt context. Signed-off-by: Vitaly Kuznetsov Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/hv/channel_mgmt.c | 44 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 08f922356442..37238dffd947 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "hyperv_vmbus.h" @@ -499,6 +500,40 @@ static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_gui channel->target_vp = hv_context.vp_index[cur_cpu]; } +static void vmbus_wait_for_unload(void) +{ + int cpu = smp_processor_id(); + void *page_addr = hv_context.synic_message_page[cpu]; + struct hv_message *msg = (struct hv_message *)page_addr + + VMBUS_MESSAGE_SINT; + struct vmbus_channel_message_header *hdr; + bool unloaded = false; + + while (1) { + if (msg->header.message_type == HVMSG_NONE) { + mdelay(10); + continue; + } + + hdr = (struct vmbus_channel_message_header *)msg->u.payload; + if (hdr->msgtype == CHANNELMSG_UNLOAD_RESPONSE) + unloaded = true; + + msg->header.message_type = HVMSG_NONE; + /* + * header.message_type needs to be written before we do + * wrmsrl() below. + */ + mb(); + + if (msg->header.message_flags.msg_pending) + wrmsrl(HV_X64_MSR_EOM, 0); + + if (unloaded) + break; + } +} + /* * vmbus_unload_response - Handler for the unload response. */ @@ -524,7 +559,14 @@ void vmbus_initiate_unload(void) hdr.msgtype = CHANNELMSG_UNLOAD; vmbus_post_msg(&hdr, sizeof(struct vmbus_channel_message_header)); - wait_for_completion(&vmbus_connection.unload_event); + /* + * vmbus_initiate_unload() is also called on crash and the crash can be + * happening in an interrupt context, where scheduling is impossible. + */ + if (!in_interrupt()) + wait_for_completion(&vmbus_connection.unload_event); + else + vmbus_wait_for_unload(); } /* From bee629f1b07375d467a679797acbf8d8ab3db61f Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 27 Jan 2016 22:29:36 -0800 Subject: [PATCH 670/813] Drivers: hv: vmbus: don't manipulate with clocksources on crash [ Upstream commit 3ccb4fd8f492f99aece21acc1bd6142275f26236 ] clocksource_change_rating() involves mutex usage and can't be called in interrupt context. It also makes sense to avoid doing redundant work on crash. Signed-off-by: Vitaly Kuznetsov Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/hv/hv.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 6341be8739ae..63194a9a7189 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -293,8 +293,14 @@ void hv_cleanup(void) * Cleanup the TSC page based CS. */ if (ms_hyperv.features & HV_X64_MSR_REFERENCE_TSC_AVAILABLE) { - clocksource_change_rating(&hyperv_cs_tsc, 10); - clocksource_unregister(&hyperv_cs_tsc); + /* + * Crash can happen in an interrupt context and unregistering + * a clocksource is impossible and redundant in this case. 
+ */ + if (!oops_in_progress) { + clocksource_change_rating(&hyperv_cs_tsc, 10); + clocksource_unregister(&hyperv_cs_tsc); + } hypercall_msr.as_uint64 = 0; wrmsrl(HV_X64_MSR_REFERENCE_TSC, hypercall_msr.as_uint64); From 321aeea695a2a9343c20d03f33a475978b55c719 Mon Sep 17 00:00:00 2001 From: "Manoj N. Kumar" Date: Fri, 4 Mar 2016 15:55:19 -0600 Subject: [PATCH 671/813] cxlflash: Fix to avoid unnecessary scan with internal LUNs [ Upstream commit 603ecce95f4817074a724a889cd88c3c8210f933 ] When switching to the internal LUN defined on the IBM CXL flash adapter, there is an unnecessary scan occurring on the second port. This scan leads to the following extra lines in the log: Dec 17 10:09:00 tul83p1 kernel: [ 3708.561134] cxlflash 0008:00:00.0: cxlflash_queuecommand: (scp=c0000000fc1f0f00) 11/1/0/0 cdb=(A0000000-00000000-10000000-00000000) Dec 17 10:09:00 tul83p1 kernel: [ 3708.561147] process_cmd_err: cmd failed afu_rc=32 scsi_rc=0 fc_rc=0 afu_extra=0xE, scsi_extra=0x0, fc_extra=0x0 By definition, both of the internal LUNs are on the first port/channel. When the lun_mode is switched to internal LUN the same value for host->max_channel is retained. This causes an unnecessary scan over the second port/channel. This fix alters the host->max_channel to 0 (1 port), if internal LUNs are configured and switches it back to 1 (2 ports) while going back to external LUNs. Signed-off-by: Manoj N. Kumar Acked-by: Matthew R. Ochs Reviewed-by: Uma Krishnan Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/cxlflash/main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c index d1d077420964..ad8dc8d4d1c2 100644 --- a/drivers/scsi/cxlflash/main.c +++ b/drivers/scsi/cxlflash/main.c @@ -2149,6 +2149,16 @@ static ssize_t lun_mode_store(struct device *dev, rc = kstrtouint(buf, 10, &lun_mode); if (!rc && (lun_mode < 5) && (lun_mode != afu->internal_lun)) { afu->internal_lun = lun_mode; + + /* + * When configured for internal LUN, there is only one channel, + * channel number 0, else there will be 2 (default). + */ + if (afu->internal_lun) + shost->max_channel = 0; + else + shost->max_channel = NUM_FC_PORTS - 1; + afu_reset(cfg); scsi_scan_host(cfg->host); } From 25db8d1cb34a0e6eb61152529be268265c91a015 Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Thu, 4 Sep 2014 17:22:54 -0700 Subject: [PATCH 672/813] intel_idle: Support for Intel Xeon Phi Processor x200 Product Family [ Upstream commit 281baf7a702693deaa45c98ef0c5161006b48257 ] Enables "Intel(R) Xeon Phi(TM) Processor x200 Product Family" support, formerly code-named KNL. It is based on modified Intel Atom Silvermont microarchitecture. 
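For reference, each state entry encodes its MWAIT hint in the upper byte of the flags word, and the governor picks the deepest state whose target residency fits the predicted idle time. A small standalone model of that rule using the two KNL states (thresholds copied from the patch; the chooser is a deliberate simplification of the real governor):

  #include <stdio.h>

  /* Same hint encoding as the driver's MWAIT2flg()/flg2MWAIT() macros. */
  #define MWAIT2flg(eax) (((eax) & 0xFF) << 24)
  #define flg2MWAIT(flg) (((flg) >> 24) & 0xFF)

  struct state {
      const char *name;
      unsigned flags, exit_latency_us, target_residency_us;
  };

  static const struct state knl_states[] = {
      { "C1-KNL", MWAIT2flg(0x00),   1,   2 },
      { "C6-KNL", MWAIT2flg(0x10), 120, 500 },
  };

  /* Simplified chooser: deepest state whose target residency still fits
   * the predicted idle time. */
  static int pick_state(unsigned predicted_idle_us)
  {
      int i, best = 0;

      for (i = 0; i < 2; i++)
          if (knl_states[i].target_residency_us <= predicted_idle_us)
              best = i;
      return best;
  }

  int main(void)
  {
      int s = pick_state(1000); /* assumed 1 ms of predicted idle time */

      printf("%s, MWAIT hint %#x\n", knl_states[s].name,
             flg2MWAIT(knl_states[s].flags)); /* C6-KNL, MWAIT hint 0x10 */
      return 0;
  }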
Signed-off-by: Dasaratharaman Chandramouli [micah.barany@intel.com: adjusted values of residency and latency] Signed-off-by: Micah Barany [hubert.chrzaniuk@intel.com: removed deprecated CPUIDLE_FLAG_TIME_VALID flag] Signed-off-by: Hubert Chrzaniuk Signed-off-by: Pawel Karczewski Signed-off-by: Len Brown Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/idle/intel_idle.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 146eed70bdf4..ba947df5a8c7 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -716,6 +716,26 @@ static struct cpuidle_state avn_cstates[] = { { .enter = NULL } }; +static struct cpuidle_state knl_cstates[] = { + { + .name = "C1-KNL", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 2, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze }, + { + .name = "C6-KNL", + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 120, + .target_residency = 500, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze }, + { + .enter = NULL } +}; /** * intel_idle @@ -890,6 +910,10 @@ static const struct idle_cpu idle_cpu_avn = { .disable_promotion_to_c1e = true, }; +static const struct idle_cpu idle_cpu_knl = { + .state_table = knl_cstates, +}; + #define ICPU(model, cpu) \ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT, (unsigned long)&cpu } @@ -921,6 +945,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { ICPU(0x56, idle_cpu_bdw), ICPU(0x4e, idle_cpu_skl), ICPU(0x5e, idle_cpu_skl), + ICPU(0x57, idle_cpu_knl), {} }; MODULE_DEVICE_TABLE(x86cpu, intel_idle_ids); From f3de8fbe2a2a3ec4c612e2e0ddeee68f9c5bd972 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 2 Feb 2016 16:57:29 -0800 Subject: [PATCH 673/813] proc: revert /proc//maps [stack:TID] annotation [ Upstream commit 65376df582174ffcec9e6471bf5b0dd79ba05e4a ] Commit b76437579d13 ("procfs: mark thread stack correctly in proc//maps") added [stack:TID] annotation to /proc//maps. Finding the task of a stack VMA requires walking the entire thread list, turning this into quadratic behavior: a thousand threads means a thousand stacks, so the rendering of /proc//maps needs to look at a million combinations. The cost is not in proportion to the usefulness as described in the patch. Drop the [stack:TID] annotation to make /proc//maps (and /proc//numa_maps) usable again for higher thread counts. The [stack] annotation inside /proc//task//maps is retained, as identifying the stack VMA there is an O(1) operation. Siddesh said: "The end users needed a way to identify thread stacks programmatically and there wasn't a way to do that. I'm afraid I no longer remember (or have access to the resources that would aid my memory since I changed employers) the details of their requirement. However, I did do this on my own time because I thought it was an interesting project for me and nobody really gave any feedback then as to its utility, so as far as I am concerned you could roll back the main thread maps information since the information is available in the thread-specific files" Signed-off-by: Johannes Weiner Cc: "Kirill A. 
Shutemov" Cc: Siddhesh Poyarekar Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- Documentation/filesystems/proc.txt | 9 ++-- fs/proc/task_mmu.c | 66 +++++++++++------------------- fs/proc/task_nommu.c | 47 +++++++++------------ include/linux/mm.h | 3 +- mm/util.c | 27 +----------- 5 files changed, 47 insertions(+), 105 deletions(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 402ab99e409f..6716413c17ba 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -346,7 +346,7 @@ address perms offset dev inode pathname a7cb1000-a7cb2000 ---p 00000000 00:00 0 a7cb2000-a7eb2000 rw-p 00000000 00:00 0 a7eb2000-a7eb3000 ---p 00000000 00:00 0 -a7eb3000-a7ed5000 rw-p 00000000 00:00 0 [stack:1001] +a7eb3000-a7ed5000 rw-p 00000000 00:00 0 a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6 a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6 @@ -378,7 +378,6 @@ is not associated with a file: [heap] = the heap of the program [stack] = the stack of the main process - [stack:1001] = the stack of the thread with tid 1001 [vdso] = the "virtual dynamic shared object", the kernel system call handler @@ -386,10 +385,8 @@ is not associated with a file: The /proc/PID/task/TID/maps is a view of the virtual memory from the viewpoint of the individual tasks of a process. In this file you will see a mapping marked -as [stack] if that task sees it as a stack. This is a key difference from the -content of /proc/PID/maps, where you will see all mappings that are being used -as stack by all of those tasks. Hence, for the example above, the task-level -map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this: +as [stack] if that task sees it as a stack. Hence, for the example above, the +task-level map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this: 08048000-08049000 r-xp 00000000 03:00 8312 /opt/test 08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f6478301db00..d598b9c809c1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -248,23 +248,29 @@ static int do_maps_open(struct inode *inode, struct file *file, sizeof(struct proc_maps_private)); } -static pid_t pid_of_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma, bool is_pid) +/* + * Indicate if the VMA is a stack for the given task; for + * /proc/PID/maps that is the stack of the main task. 
+ */ +static int is_stack(struct proc_maps_private *priv, + struct vm_area_struct *vma, int is_pid) { - struct inode *inode = priv->inode; - struct task_struct *task; - pid_t ret = 0; + int stack = 0; - rcu_read_lock(); - task = pid_task(proc_pid(inode), PIDTYPE_PID); - if (task) { - task = task_of_stack(task, vma, is_pid); + if (is_pid) { + stack = vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack; + } else { + struct inode *inode = priv->inode; + struct task_struct *task; + + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) - ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); + stack = vma_is_stack_for_task(vma, task); + rcu_read_unlock(); } - rcu_read_unlock(); - - return ret; + return stack; } static void @@ -324,8 +330,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) name = arch_vma_name(vma); if (!name) { - pid_t tid; - if (!mm) { name = "[vdso]"; goto done; @@ -337,21 +341,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) goto done; } - tid = pid_of_stack(priv, vma, is_pid); - if (tid != 0) { - /* - * Thread stack in /proc/PID/task/TID/maps or - * the main process stack. - */ - if (!is_pid || (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack)) { - name = "[stack]"; - } else { - /* Thread stack in /proc/PID/maps */ - seq_pad(m, ' '); - seq_printf(m, "[stack:%d]", tid); - } - } + if (is_stack(priv, vma, is_pid)) + name = "[stack]"; } done: @@ -1566,19 +1557,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) seq_file_path(m, file, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_puts(m, " heap"); - } else { - pid_t tid = pid_of_stack(proc_priv, vma, is_pid); - if (tid != 0) { - /* - * Thread stack in /proc/PID/task/TID/maps or - * the main process stack. 
- */ - if (!is_pid || (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack)) - seq_puts(m, " stack"); - else - seq_printf(m, " stack:%d", tid); - } + } else if (is_stack(proc_priv, vma, is_pid)) { + seq_puts(m, " stack"); } if (is_vm_hugetlb_page(vma)) diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index e0d64c92e4f6..faacb0c0d857 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -123,23 +123,26 @@ unsigned long task_statm(struct mm_struct *mm, return size; } -static pid_t pid_of_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma, bool is_pid) +static int is_stack(struct proc_maps_private *priv, + struct vm_area_struct *vma, int is_pid) { - struct inode *inode = priv->inode; - struct task_struct *task; - pid_t ret = 0; + struct mm_struct *mm = vma->vm_mm; + int stack = 0; - rcu_read_lock(); - task = pid_task(proc_pid(inode), PIDTYPE_PID); - if (task) { - task = task_of_stack(task, vma, is_pid); + if (is_pid) { + stack = vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack; + } else { + struct inode *inode = priv->inode; + struct task_struct *task; + + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) - ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); + stack = vma_is_stack_for_task(vma, task); + rcu_read_unlock(); } - rcu_read_unlock(); - - return ret; + return stack; } /* @@ -181,21 +184,9 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, if (file) { seq_pad(m, ' '); seq_file_path(m, file, ""); - } else if (mm) { - pid_t tid = pid_of_stack(priv, vma, is_pid); - - if (tid != 0) { - seq_pad(m, ' '); - /* - * Thread stack in /proc/PID/task/TID/maps or - * the main process stack. - */ - if (!is_pid || (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack)) - seq_printf(m, "[stack]"); - else - seq_printf(m, "[stack:%d]", tid); - } + } else if (mm && is_stack(priv, vma, is_pid)) { + seq_pad(m, ' '); + seq_printf(m, "[stack]"); } seq_putc(m, '\n'); diff --git a/include/linux/mm.h b/include/linux/mm.h index f24df9c0b9df..8a761248d01e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1311,8 +1311,7 @@ static inline int stack_guard_page_end(struct vm_area_struct *vma, !vma_growsup(vma->vm_next, addr); } -extern struct task_struct *task_of_stack(struct task_struct *task, - struct vm_area_struct *vma, bool in_group); +int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t); extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, diff --git a/mm/util.c b/mm/util.c index 9af1c12b310c..d5259b62f8d7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -199,36 +199,11 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, } /* Check if the vma is being used as a stack by this task */ -static int vm_is_stack_for_task(struct task_struct *t, - struct vm_area_struct *vma) +int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t) { return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } -/* - * Check if the vma is being used as a stack. - * If is_group is non-zero, check in the entire thread group or else - * just check in the current task. Returns the task_struct of the task - * that the vma is stack for. Must be called under rcu_read_lock(). 
- */ -struct task_struct *task_of_stack(struct task_struct *task, - struct vm_area_struct *vma, bool in_group) -{ - if (vm_is_stack_for_task(task, vma)) - return task; - - if (in_group) { - struct task_struct *t; - - for_each_thread(task, t) { - if (vm_is_stack_for_task(t, vma)) - return t; - } - } - - return NULL; -} - #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm) { From 72f4972b8812e0a1749d3be19d8017de25a9d15d Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Thu, 17 Mar 2016 14:52:17 +0100 Subject: [PATCH 674/813] s390/crypto: provide correct file mode at device register. [ Upstream commit 74b2375e6767935e6d9220bdbc6ed0db57f71a59 ] When the prng device driver calls misc_register(), it can also provide the recommended file permissions. This fix now gives useful values (0644) where previously just the default was used (resulting in 0600 for the device file). Signed-off-by: Harald Freudenberger Signed-off-by: Martin Schwidefsky Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/s390/crypto/prng.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index b8045b97f4fb..d750cc0dfe30 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -669,11 +669,13 @@ static const struct file_operations prng_tdes_fops = { static struct miscdevice prng_sha512_dev = { .name = "prandom", .minor = MISC_DYNAMIC_MINOR, + .mode = 0644, .fops = &prng_sha512_fops, }; static struct miscdevice prng_tdes_dev = { .name = "prandom", .minor = MISC_DYNAMIC_MINOR, + .mode = 0644, .fops = &prng_tdes_fops, }; From 0b152db0426acee35ae4798f682ce512fc7f6a35 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Thu, 10 Mar 2016 15:32:07 -0800 Subject: [PATCH 675/813] perf/x86/cqm: Fix CQM handling of grouping events into a cache_group [ Upstream commit a223c1c7ab4cc64537dc4b911f760d851683768a ] Currently CQM (cache quality of service monitoring) is grouping all events belonging to the same PID to use one RMID. However, it is not counting all of these different events. Hence we end up with a count of zero for all events other than the group leader. The patch addresses the issue by keeping a flag in the perf_event.hw which has the other CQM related fields. The field is updated at event creation and during grouping. Signed-off-by: Vikas Shivappa [peterz: Changed hw_perf_event::is_group_event to an int] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Tony Luck Acked-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Matt Fleming Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: fenghua.yu@intel.com Cc: h.peter.anvin@intel.com Cc: ravi.v.shankar@intel.com Cc: vikas.shivappa@intel.com Link: http://lkml.kernel.org/r/1457652732-4499-2-git-send-email-vikas.shivappa@linux.intel.com Signed-off-by: Ingo Molnar Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 13 ++++++++++--- include/linux/perf_event.h | 1 + 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index a316ca96f1b6..e6be335ecb54 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -281,9 +281,13 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) /* * Events that target same task are placed into the same cache group. + * Mark it as a multi event group, so that we update ->count + * for every event rather than just the group leader later. */ - if (a->hw.target == b->hw.target) + if (a->hw.target == b->hw.target) { + b->hw.is_group_event = true; return true; + } /* * Are we an inherited event? @@ -849,6 +853,7 @@ static void intel_cqm_setup_event(struct perf_event *event, bool conflict = false; u32 rmid; + event->hw.is_group_event = false; list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { rmid = iter->hw.cqm_rmid; @@ -940,7 +945,9 @@ static u64 intel_cqm_event_count(struct perf_event *event) return __perf_event_count(event); /* - * Only the group leader gets to report values. This stops us + * Only the group leader gets to report values except in case of + * multiple events in the same group, we still need to read the + * other events.This stops us * reporting duplicate values to userspace, and gives us a clear * rule for which task gets to report the values. * @@ -948,7 +955,7 @@ static u64 intel_cqm_event_count(struct perf_event *event) * specific packages - we forfeit that ability when we create * task events. */ - if (!cqm_group_leader(event)) + if (!cqm_group_leader(event) && !event->hw.is_group_event) return 0; /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f9828a48f16a..6cdd50f7f52d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -121,6 +121,7 @@ struct hw_perf_event { struct { /* intel_cqm */ int cqm_state; u32 cqm_rmid; + int is_group_event; struct list_head cqm_events_entry; struct list_head cqm_groups_entry; struct list_head cqm_group_entry; From 12850f3616f89b750630ae757cd8eb15d15027f1 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Thu, 10 Mar 2016 15:32:08 -0800 Subject: [PATCH 676/813] perf/x86/cqm: Fix CQM memory leak and notifier leak [ Upstream commit ada2f634cd50d050269b67b4e2966582387e7c27 ] Fixes the hotcpu notifier leak and other global variable memory leaks during CQM (cache quality of service monitoring) initialization. Signed-off-by: Vikas Shivappa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Tony Luck Acked-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Matt Fleming Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: fenghua.yu@intel.com Cc: h.peter.anvin@intel.com Cc: ravi.v.shankar@intel.com Cc: vikas.shivappa@intel.com Link: http://lkml.kernel.org/r/1457652732-4499-3-git-send-email-vikas.shivappa@linux.intel.com Signed-off-by: Ingo Molnar Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 43 ++++++++++++++++------ 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index e6be335ecb54..fc704ed587e8 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -211,6 +211,20 @@ static void __put_rmid(u32 rmid) list_add_tail(&entry->list, &cqm_rmid_limbo_lru); } +static void cqm_cleanup(void) +{ + int i; + + if (!cqm_rmid_ptrs) + return; + + for (i = 0; i < cqm_max_rmid; i++) + kfree(cqm_rmid_ptrs[i]); + + kfree(cqm_rmid_ptrs); + cqm_rmid_ptrs = NULL; +} + static int intel_cqm_setup_rmid_cache(void) { struct cqm_rmid_entry *entry; @@ -218,7 +232,7 @@ static int intel_cqm_setup_rmid_cache(void) int r = 0; nr_rmids = cqm_max_rmid + 1; - cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) * + cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) * nr_rmids, GFP_KERNEL); if (!cqm_rmid_ptrs) return -ENOMEM; @@ -249,11 +263,9 @@ static int intel_cqm_setup_rmid_cache(void) mutex_unlock(&cache_mutex); return 0; -fail: - while (r--) - kfree(cqm_rmid_ptrs[r]); - kfree(cqm_rmid_ptrs); +fail: + cqm_cleanup(); return -ENOMEM; } @@ -1322,7 +1334,7 @@ static const struct x86_cpu_id intel_cqm_match[] = { static int __init intel_cqm_init(void) { - char *str, scale[20]; + char *str = NULL, scale[20]; int i, cpu, ret; if (!x86_match_cpu(intel_cqm_match)) @@ -1382,16 +1394,25 @@ static int __init intel_cqm_init(void) cqm_pick_event_reader(i); } - __perf_cpu_notifier(intel_cqm_cpu_notifier); - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); - if (ret) + if (ret) { pr_err("Intel CQM perf registration failed: %d\n", ret); - else - pr_info("Intel CQM monitoring enabled\n"); + goto out; + } + pr_info("Intel CQM monitoring enabled\n"); + + /* + * Register the hot cpu notifier once we are sure cqm + * is enabled to avoid notifier leak. + */ + __perf_cpu_notifier(intel_cqm_cpu_notifier); out: cpu_notifier_register_done(); + if (ret) { + kfree(str); + cqm_cleanup(); + } return ret; } From 61c188313c6c0808339c3dc034adfc255d84bf32 Mon Sep 17 00:00:00 2001 From: Sunil Goutham Date: Tue, 16 Feb 2016 16:29:49 +0530 Subject: [PATCH 677/813] net: thunderx: Fix for multiqset not configured upon interface toggle [ Upstream commit 6a9bab79bb79bd9b2eda16f0aba1b4c43f677be9 ] When an interface is assigned more than 8 queues and the logical interface is toggled, i.e. brought down and up, additional queues or qsets are not initialized because the secondary qset count is set to zero while tearing down. Signed-off-by: Sunil Goutham Signed-off-by: David S.
Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index dde8dc720cd3..45740af163f6 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1117,7 +1117,6 @@ int nicvf_stop(struct net_device *netdev) /* Clear multiqset info */ nic->pnicvf = nic; - nic->sqs_count = 0; return 0; } From 4c79345330edabd74f5d2e12067a8c1e762abe53 Mon Sep 17 00:00:00 2001 From: Sunil Goutham Date: Tue, 16 Feb 2016 16:29:51 +0530 Subject: [PATCH 678/813] net: thunderx: Fix receive packet stats [ Upstream commit ad2ecebd67d8a80fe5412d11df375a5ed2db7cd1 ] Counting rx packets for every CQE_RX in CQ irq handler is incorrect. Synchronization is missing when multiple queues are receiving packets simultaneously. Like transmit packet stats use HW stats here. Also removed unused 'cqe_type' parameter in nicvf_rcv_pkt_handler(). Signed-off-by: Sunil Goutham Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 11 ++++++----- drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 8 ++------ drivers/net/ethernet/cavium/thunder/nicvf_queues.h | 3 +-- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 45740af163f6..b7093b9cd1e8 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -566,8 +566,7 @@ static inline void nicvf_set_rxhash(struct net_device *netdev, static void nicvf_rcv_pkt_handler(struct net_device *netdev, struct napi_struct *napi, - struct cmp_queue *cq, - struct cqe_rx_t *cqe_rx, int cqe_type) + struct cqe_rx_t *cqe_rx) { struct sk_buff *skb; struct nicvf *nic = netdev_priv(netdev); @@ -583,7 +582,7 @@ static void nicvf_rcv_pkt_handler(struct net_device *netdev, } /* Check for errors */ - err = nicvf_check_cqe_rx_errs(nic, cq, cqe_rx); + err = nicvf_check_cqe_rx_errs(nic, cqe_rx); if (err && !cqe_rx->rb_cnt) return; @@ -674,8 +673,7 @@ loop: cq_idx, cq_desc->cqe_type); switch (cq_desc->cqe_type) { case CQE_TYPE_RX: - nicvf_rcv_pkt_handler(netdev, napi, cq, - cq_desc, CQE_TYPE_RX); + nicvf_rcv_pkt_handler(netdev, napi, cq_desc); work_done++; break; case CQE_TYPE_SEND: @@ -1345,6 +1343,9 @@ void nicvf_update_stats(struct nicvf *nic) drv_stats->tx_frames_ok = stats->tx_ucast_frames_ok + stats->tx_bcast_frames_ok + stats->tx_mcast_frames_ok; + drv_stats->rx_frames_ok = stats->rx_ucast_frames + + stats->rx_bcast_frames + + stats->rx_mcast_frames; drv_stats->rx_drops = stats->rx_drop_red + stats->rx_drop_overrun; drv_stats->tx_drops = stats->tx_drops; diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c index d1c217eaf417..912ee28ab58b 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c @@ -1414,16 +1414,12 @@ void nicvf_update_sq_stats(struct nicvf *nic, int sq_idx) } /* Check for errors in the receive cmp.queue entry */ -int nicvf_check_cqe_rx_errs(struct nicvf *nic, - struct cmp_queue *cq, struct cqe_rx_t *cqe_rx) +int nicvf_check_cqe_rx_errs(struct nicvf *nic, struct cqe_rx_t *cqe_rx) { struct nicvf_hw_stats *stats = &nic->hw_stats; - 
struct nicvf_drv_stats *drv_stats = &nic->drv_stats; - if (!cqe_rx->err_level && !cqe_rx->err_opcode) { - drv_stats->rx_frames_ok++; + if (!cqe_rx->err_level && !cqe_rx->err_opcode) return 0; - } if (netif_msg_rx_err(nic)) netdev_err(nic->netdev, diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h index 033e8306e91c..5652c612e20b 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h +++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h @@ -344,8 +344,7 @@ u64 nicvf_queue_reg_read(struct nicvf *nic, /* Stats */ void nicvf_update_rq_stats(struct nicvf *nic, int rq_idx); void nicvf_update_sq_stats(struct nicvf *nic, int sq_idx); -int nicvf_check_cqe_rx_errs(struct nicvf *nic, - struct cmp_queue *cq, struct cqe_rx_t *cqe_rx); +int nicvf_check_cqe_rx_errs(struct nicvf *nic, struct cqe_rx_t *cqe_rx); int nicvf_check_cqe_tx_errs(struct nicvf *nic, struct cmp_queue *cq, struct cqe_send_t *cqe_tx); #endif /* NICVF_QUEUES_H */ From efdd094f63fd62357988695de6d5152cbd58d26c Mon Sep 17 00:00:00 2001 From: Pavel Rojtberg Date: Wed, 9 Dec 2015 11:57:01 -0800 Subject: [PATCH 679/813] Input: xpad - correctly handle concurrent LED and FF requests [ Upstream commit 7fc595f4c02636eadaeeecfe7bbc45b57c173004 ] Track the status of the irq_out URB to prevent submission of new requests while the current one is active. Failure to do so results in the "URB submitted while active" warning/stack trace. Store pending brightness and FF effect in the driver structure and replace it with the latest requests until the device is ready to process the next request. Alternate between serving LED and FF requests to make sure one does not starve the other. See [1] for discussion. Inspired by a patch from Sarah Bessmer [2]. [1]: http://www.spinics.net/lists/linux-input/msg40708.html [2]: http://www.spinics.net/lists/linux-input/msg31450.html Signed-off-by: Pavel Rojtberg Signed-off-by: Dmitry Torokhov Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/input/joystick/xpad.c | 322 +++++++++++++++++++++++----------- 1 file changed, 223 insertions(+), 99 deletions(-) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 2b2f9d66c2c7..0679c4143c29 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -317,6 +317,19 @@ static struct usb_device_id xpad_table[] = { MODULE_DEVICE_TABLE(usb, xpad_table); +struct xpad_output_packet { + u8 data[XPAD_PKT_LEN]; + u8 len; + bool pending; +}; + +#define XPAD_OUT_CMD_IDX 0 +#define XPAD_OUT_FF_IDX 1 +#define XPAD_OUT_LED_IDX (1 + IS_ENABLED(CONFIG_JOYSTICK_XPAD_FF)) +#define XPAD_NUM_OUT_PACKETS (1 + \ + IS_ENABLED(CONFIG_JOYSTICK_XPAD_FF) + \ + IS_ENABLED(CONFIG_JOYSTICK_XPAD_LEDS)) + struct usb_xpad { struct input_dev *dev; /* input device interface */ struct usb_device *udev; /* usb device */ @@ -329,9 +342,13 @@ struct usb_xpad { dma_addr_t idata_dma; struct urb *irq_out; /* urb for interrupt out report */ + bool irq_out_active; /* we must not use an active URB */ unsigned char *odata; /* output data */ dma_addr_t odata_dma; - struct mutex odata_mutex; + spinlock_t odata_lock; + + struct xpad_output_packet out_packets[XPAD_NUM_OUT_PACKETS]; + int last_out_packet; #if defined(CONFIG_JOYSTICK_XPAD_LEDS) struct xpad_led *led; @@ -678,18 +695,71 @@ exit: __func__, retval); } +/* Callers must hold xpad->odata_lock spinlock */ +static bool xpad_prepare_next_out_packet(struct usb_xpad *xpad) +{ + struct xpad_output_packet *pkt, *packet = NULL; + int i; + + for (i = 0;
i < XPAD_NUM_OUT_PACKETS; i++) { + if (++xpad->last_out_packet >= XPAD_NUM_OUT_PACKETS) + xpad->last_out_packet = 0; + + pkt = &xpad->out_packets[xpad->last_out_packet]; + if (pkt->pending) { + dev_dbg(&xpad->intf->dev, + "%s - found pending output packet %d\n", + __func__, xpad->last_out_packet); + packet = pkt; + break; + } + } + + if (packet) { + memcpy(xpad->odata, packet->data, packet->len); + xpad->irq_out->transfer_buffer_length = packet->len; + return true; + } + + return false; +} + +/* Callers must hold xpad->odata_lock spinlock */ +static int xpad_try_sending_next_out_packet(struct usb_xpad *xpad) +{ + int error; + + if (!xpad->irq_out_active && xpad_prepare_next_out_packet(xpad)) { + error = usb_submit_urb(xpad->irq_out, GFP_ATOMIC); + if (error) { + dev_err(&xpad->intf->dev, + "%s - usb_submit_urb failed with result %d\n", + __func__, error); + return -EIO; + } + + xpad->irq_out_active = true; + } + + return 0; +} + static void xpad_irq_out(struct urb *urb) { struct usb_xpad *xpad = urb->context; struct device *dev = &xpad->intf->dev; - int retval, status; + int status = urb->status; + int error; + unsigned long flags; - status = urb->status; + spin_lock_irqsave(&xpad->odata_lock, flags); switch (status) { case 0: /* success */ - return; + xpad->out_packets[xpad->last_out_packet].pending = false; + xpad->irq_out_active = xpad_prepare_next_out_packet(xpad); + break; case -ECONNRESET: case -ENOENT: @@ -697,19 +767,26 @@ static void xpad_irq_out(struct urb *urb) /* this urb is terminated, clean up */ dev_dbg(dev, "%s - urb shutting down with status: %d\n", __func__, status); - return; + xpad->irq_out_active = false; + break; default: dev_dbg(dev, "%s - nonzero urb status received: %d\n", __func__, status); - goto exit; + break; } -exit: - retval = usb_submit_urb(urb, GFP_ATOMIC); - if (retval) - dev_err(dev, "%s - usb_submit_urb failed with result %d\n", - __func__, retval); + if (xpad->irq_out_active) { + error = usb_submit_urb(urb, GFP_ATOMIC); + if (error) { + dev_err(dev, + "%s - usb_submit_urb failed with result %d\n", + __func__, error); + xpad->irq_out_active = false; + } + } + + spin_unlock_irqrestore(&xpad->odata_lock, flags); } static int xpad_init_output(struct usb_interface *intf, struct usb_xpad *xpad) @@ -728,7 +805,7 @@ static int xpad_init_output(struct usb_interface *intf, struct usb_xpad *xpad) goto fail1; } - mutex_init(&xpad->odata_mutex); + spin_lock_init(&xpad->odata_lock); xpad->irq_out = usb_alloc_urb(0, GFP_KERNEL); if (!xpad->irq_out) { @@ -770,27 +847,57 @@ static void xpad_deinit_output(struct usb_xpad *xpad) static int xpad_inquiry_pad_presence(struct usb_xpad *xpad) { + struct xpad_output_packet *packet = + &xpad->out_packets[XPAD_OUT_CMD_IDX]; + unsigned long flags; int retval; - mutex_lock(&xpad->odata_mutex); + spin_lock_irqsave(&xpad->odata_lock, flags); - xpad->odata[0] = 0x08; - xpad->odata[1] = 0x00; - xpad->odata[2] = 0x0F; - xpad->odata[3] = 0xC0; - xpad->odata[4] = 0x00; - xpad->odata[5] = 0x00; - xpad->odata[6] = 0x00; - xpad->odata[7] = 0x00; - xpad->odata[8] = 0x00; - xpad->odata[9] = 0x00; - xpad->odata[10] = 0x00; - xpad->odata[11] = 0x00; - xpad->irq_out->transfer_buffer_length = 12; + packet->data[0] = 0x08; + packet->data[1] = 0x00; + packet->data[2] = 0x0F; + packet->data[3] = 0xC0; + packet->data[4] = 0x00; + packet->data[5] = 0x00; + packet->data[6] = 0x00; + packet->data[7] = 0x00; + packet->data[8] = 0x00; + packet->data[9] = 0x00; + packet->data[10] = 0x00; + packet->data[11] = 0x00; + packet->len = 12; + packet->pending = 
true; - retval = usb_submit_urb(xpad->irq_out, GFP_KERNEL); + /* Reset the sequence so we send out presence first */ + xpad->last_out_packet = -1; + retval = xpad_try_sending_next_out_packet(xpad); - mutex_unlock(&xpad->odata_mutex); + spin_unlock_irqrestore(&xpad->odata_lock, flags); + + return retval; +} + +static int xpad_start_xbox_one(struct usb_xpad *xpad) +{ + struct xpad_output_packet *packet = + &xpad->out_packets[XPAD_OUT_CMD_IDX]; + unsigned long flags; + int retval; + + spin_lock_irqsave(&xpad->odata_lock, flags); + + /* Xbox one controller needs to be initialized. */ + packet->data[0] = 0x05; + packet->data[1] = 0x20; + packet->len = 2; + packet->pending = true; + + /* Reset the sequence so we send out start packet first */ + xpad->last_out_packet = -1; + retval = xpad_try_sending_next_out_packet(xpad); + + spin_unlock_irqrestore(&xpad->odata_lock, flags); return retval; } @@ -799,8 +906,11 @@ static int xpad_inquiry_pad_presence(struct usb_xpad *xpad) static int xpad_play_effect(struct input_dev *dev, void *data, struct ff_effect *effect) { struct usb_xpad *xpad = input_get_drvdata(dev); + struct xpad_output_packet *packet = &xpad->out_packets[XPAD_OUT_FF_IDX]; __u16 strong; __u16 weak; + int retval; + unsigned long flags; if (effect->type != FF_RUMBLE) return 0; @@ -808,69 +918,80 @@ static int xpad_play_effect(struct input_dev *dev, void *data, struct ff_effect strong = effect->u.rumble.strong_magnitude; weak = effect->u.rumble.weak_magnitude; + spin_lock_irqsave(&xpad->odata_lock, flags); + switch (xpad->xtype) { case XTYPE_XBOX: - xpad->odata[0] = 0x00; - xpad->odata[1] = 0x06; - xpad->odata[2] = 0x00; - xpad->odata[3] = strong / 256; /* left actuator */ - xpad->odata[4] = 0x00; - xpad->odata[5] = weak / 256; /* right actuator */ - xpad->irq_out->transfer_buffer_length = 6; + packet->data[0] = 0x00; + packet->data[1] = 0x06; + packet->data[2] = 0x00; + packet->data[3] = strong / 256; /* left actuator */ + packet->data[4] = 0x00; + packet->data[5] = weak / 256; /* right actuator */ + packet->len = 6; + packet->pending = true; break; case XTYPE_XBOX360: - xpad->odata[0] = 0x00; - xpad->odata[1] = 0x08; - xpad->odata[2] = 0x00; - xpad->odata[3] = strong / 256; /* left actuator? */ - xpad->odata[4] = weak / 256; /* right actuator? */ - xpad->odata[5] = 0x00; - xpad->odata[6] = 0x00; - xpad->odata[7] = 0x00; - xpad->irq_out->transfer_buffer_length = 8; + packet->data[0] = 0x00; + packet->data[1] = 0x08; + packet->data[2] = 0x00; + packet->data[3] = strong / 256; /* left actuator? */ + packet->data[4] = weak / 256; /* right actuator? 
*/ + packet->data[5] = 0x00; + packet->data[6] = 0x00; + packet->data[7] = 0x00; + packet->len = 8; + packet->pending = true; break; case XTYPE_XBOX360W: - xpad->odata[0] = 0x00; - xpad->odata[1] = 0x01; - xpad->odata[2] = 0x0F; - xpad->odata[3] = 0xC0; - xpad->odata[4] = 0x00; - xpad->odata[5] = strong / 256; - xpad->odata[6] = weak / 256; - xpad->odata[7] = 0x00; - xpad->odata[8] = 0x00; - xpad->odata[9] = 0x00; - xpad->odata[10] = 0x00; - xpad->odata[11] = 0x00; - xpad->irq_out->transfer_buffer_length = 12; + packet->data[0] = 0x00; + packet->data[1] = 0x01; + packet->data[2] = 0x0F; + packet->data[3] = 0xC0; + packet->data[4] = 0x00; + packet->data[5] = strong / 256; + packet->data[6] = weak / 256; + packet->data[7] = 0x00; + packet->data[8] = 0x00; + packet->data[9] = 0x00; + packet->data[10] = 0x00; + packet->data[11] = 0x00; + packet->len = 12; + packet->pending = true; break; case XTYPE_XBOXONE: - xpad->odata[0] = 0x09; /* activate rumble */ - xpad->odata[1] = 0x08; - xpad->odata[2] = 0x00; - xpad->odata[3] = 0x08; /* continuous effect */ - xpad->odata[4] = 0x00; /* simple rumble mode */ - xpad->odata[5] = 0x03; /* L and R actuator only */ - xpad->odata[6] = 0x00; /* TODO: LT actuator */ - xpad->odata[7] = 0x00; /* TODO: RT actuator */ - xpad->odata[8] = strong / 256; /* left actuator */ - xpad->odata[9] = weak / 256; /* right actuator */ - xpad->odata[10] = 0x80; /* length of pulse */ - xpad->odata[11] = 0x00; /* stop period of pulse */ - xpad->irq_out->transfer_buffer_length = 12; + packet->data[0] = 0x09; /* activate rumble */ + packet->data[1] = 0x08; + packet->data[2] = 0x00; + packet->data[3] = 0x08; /* continuous effect */ + packet->data[4] = 0x00; /* simple rumble mode */ + packet->data[5] = 0x03; /* L and R actuator only */ + packet->data[6] = 0x00; /* TODO: LT actuator */ + packet->data[7] = 0x00; /* TODO: RT actuator */ + packet->data[8] = strong / 256; /* left actuator */ + packet->data[9] = weak / 256; /* right actuator */ + packet->data[10] = 0x80; /* length of pulse */ + packet->data[11] = 0x00; /* stop period of pulse */ + packet->len = 12; + packet->pending = true; break; default: dev_dbg(&xpad->dev->dev, "%s - rumble command sent to unsupported xpad type: %d\n", __func__, xpad->xtype); - return -EINVAL; + retval = -EINVAL; + goto out; } - return usb_submit_urb(xpad->irq_out, GFP_ATOMIC); + retval = xpad_try_sending_next_out_packet(xpad); + +out: + spin_unlock_irqrestore(&xpad->odata_lock, flags); + return retval; } static int xpad_init_ff(struct usb_xpad *xpad) @@ -921,36 +1042,44 @@ struct xpad_led { */ static void xpad_send_led_command(struct usb_xpad *xpad, int command) { + struct xpad_output_packet *packet = + &xpad->out_packets[XPAD_OUT_LED_IDX]; + unsigned long flags; + command %= 16; - mutex_lock(&xpad->odata_mutex); + spin_lock_irqsave(&xpad->odata_lock, flags); switch (xpad->xtype) { case XTYPE_XBOX360: - xpad->odata[0] = 0x01; - xpad->odata[1] = 0x03; - xpad->odata[2] = command; - xpad->irq_out->transfer_buffer_length = 3; + packet->data[0] = 0x01; + packet->data[1] = 0x03; + packet->data[2] = command; + packet->len = 3; + packet->pending = true; break; + case XTYPE_XBOX360W: - xpad->odata[0] = 0x00; - xpad->odata[1] = 0x00; - xpad->odata[2] = 0x08; - xpad->odata[3] = 0x40 + command; - xpad->odata[4] = 0x00; - xpad->odata[5] = 0x00; - xpad->odata[6] = 0x00; - xpad->odata[7] = 0x00; - xpad->odata[8] = 0x00; - xpad->odata[9] = 0x00; - xpad->odata[10] = 0x00; - xpad->odata[11] = 0x00; - xpad->irq_out->transfer_buffer_length = 12; + packet->data[0] = 0x00; + 
packet->data[1] = 0x00; + packet->data[2] = 0x08; + packet->data[3] = 0x40 + command; + packet->data[4] = 0x00; + packet->data[5] = 0x00; + packet->data[6] = 0x00; + packet->data[7] = 0x00; + packet->data[8] = 0x00; + packet->data[9] = 0x00; + packet->data[10] = 0x00; + packet->data[11] = 0x00; + packet->len = 12; + packet->pending = true; break; } - usb_submit_urb(xpad->irq_out, GFP_KERNEL); - mutex_unlock(&xpad->odata_mutex); + xpad_try_sending_next_out_packet(xpad); + + spin_unlock_irqrestore(&xpad->odata_lock, flags); } /* @@ -1048,13 +1177,8 @@ static int xpad_open(struct input_dev *dev) if (usb_submit_urb(xpad->irq_in, GFP_KERNEL)) return -EIO; - if (xpad->xtype == XTYPE_XBOXONE) { - /* Xbox one controller needs to be initialized. */ - xpad->odata[0] = 0x05; - xpad->odata[1] = 0x20; - xpad->irq_out->transfer_buffer_length = 2; - return usb_submit_urb(xpad->irq_out, GFP_KERNEL); - } + if (xpad->xtype == XTYPE_XBOXONE) + return xpad_start_xbox_one(xpad); return 0; } From e79e7333c3a3d94a2b4f10f4977b45162ef160cf Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 3 Dec 2015 22:09:31 -0500 Subject: [PATCH 680/813] time: Verify time values in adjtimex ADJ_SETOFFSET to avoid overflow [ Upstream commit 37cf4dc3370fbca0344e23bb96446eb2c3548ba7 ] For adjtimex()'s ADJ_SETOFFSET, make sure the tv_usec value is sane. We might multiply it later, which can cause an overflow and undefined behavior. This patch introduces new helper functions to simplify the checking code and adds comments to clarify. Originally this patch was by Sasha Levin, but I've basically rewritten it, so he should get credit for finding the issue and I should get the blame for any mistakes made since. Also, credit to Richard Cochran for the phrasing used in the comment for what is considered valid here. Cc: Sasha Levin Cc: Richard Cochran Cc: Thomas Gleixner Reported-by: Sasha Levin Signed-off-by: John Stultz Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- include/linux/time.h | 26 ++++++++++++++++++++++++++ kernel/time/ntp.c | 10 ++++++++-- kernel/time/timekeeping.c | 2 +- 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/include/linux/time.h b/include/linux/time.h index beebe3a02d43..297f09f23896 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -125,6 +125,32 @@ static inline bool timeval_valid(const struct timeval *tv) extern struct timespec timespec_trunc(struct timespec t, unsigned gran); +/* + * Validates if a timespec/timeval used to inject a time offset is valid. + * Offsets can be postive or negative. The value of the timeval/timespec + * is the sum of its fields, but *NOTE*: the field tv_usec/tv_nsec must + * always be non-negative.
+ */ +static inline bool timeval_inject_offset_valid(const struct timeval *tv) +{ + /* We don't check the tv_sec as it can be positive or negative */ + + /* Can't have more microseconds then a second */ + if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) + return false; + return true; +} + +static inline bool timespec_inject_offset_valid(const struct timespec *ts) +{ + /* We don't check the tv_sec as it can be positive or negative */ + + /* Can't have more nanoseconds then a second */ + if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) + return false; + return true; +} + #define CURRENT_TIME (current_kernel_time()) #define CURRENT_TIME_SEC ((struct timespec) { get_seconds(), 0 }) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 149cc8086aea..d7654e2f902c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -674,8 +674,14 @@ int ntp_validate_timex(struct timex *txc) return -EINVAL; } - if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) - return -EPERM; + if (txc->modes & ADJ_SETOFFSET) { + /* In order to inject time, you gotta be super-user! */ + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + if (!timeval_inject_offset_valid(&txc->time)) + return -EINVAL; + } /* * Check for potential multiplication overflows that can diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 99188ee5d9d0..d9249daf14ba 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -958,7 +958,7 @@ int timekeeping_inject_offset(struct timespec *ts) struct timespec64 ts64, tmp; int ret = 0; - if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + if (!timespec_inject_offset_valid(ts)) return -EINVAL; ts64 = timespec_to_timespec64(*ts); From 1db396648ca33eb92dd0ed5cc2ea2f58816eeb9a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 14 Apr 2016 10:25:14 -0600 Subject: [PATCH 681/813] ntp: Fix ADJ_SETOFFSET being used w/ ADJ_NANO [ Upstream commit dd4e17ab704269bce71402285f5e8b9ac24b1eff ] Recently, in commit 37cf4dc3370f I forgot to check if the timeval being passed was actually a timespec (as is signaled with ADJ_NANO). This resulted in that patch breaking ADJ_SETOFFSET users who set ADJ_NANO, by rejecting valid timespecs that were compared with valid timeval ranges. This patch addresses this by checking for the ADJ_NANO flag and using the timespec check instead in that case.
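For illustration only (not part of either upstream patch), a minimal userspace sketch of the interface these two patches validate, assuming a libc that exposes ADJ_SETOFFSET and ADJ_NANO; when ADJ_NANO is set, time.tv_usec carries nanoseconds, and CAP_SYS_TIME is required:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = { 0 };

	tx.modes = ADJ_SETOFFSET | ADJ_NANO;	/* step the clock by a signed offset */
	tx.time.tv_sec = 0;
	tx.time.tv_usec = 500000000;		/* nanoseconds here, since ADJ_NANO is set */

	/* Without this fix, the valid timespec above was rejected with
	 * EINVAL because it was range-checked as a timeval. */
	if (adjtimex(&tx) == -1)
		perror("adjtimex");
	return 0;
}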
Reported-by: Harald Hoyer Reported-by: Kay Sievers Fixes: 37cf4dc3370f "time: Verify time values in adjtimex ADJ_SETOFFSET to avoid overflow" Signed-off-by: John Stultz Cc: Sasha Levin Cc: Richard Cochran Cc: Prarit Bhargava Cc: David Herrmann Link: http://lkml.kernel.org/r/1453417415-19110-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- kernel/time/ntp.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index d7654e2f902c..ab861771e37f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -679,8 +679,18 @@ int ntp_validate_timex(struct timex *txc) if (!capable(CAP_SYS_TIME)) return -EPERM; - if (!timeval_inject_offset_valid(&txc->time)) - return -EINVAL; + if (txc->modes & ADJ_NANO) { + struct timespec ts; + + ts.tv_sec = txc->time.tv_sec; + ts.tv_nsec = txc->time.tv_usec; + if (!timespec_inject_offset_valid(&ts)) + return -EINVAL; + + } else { + if (!timeval_inject_offset_valid(&txc->time)) + return -EINVAL; + } } /* From e0df9595ca96607755a04929a7e9df665de01181 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 9 May 2016 12:01:27 -0600 Subject: [PATCH 682/813] drm: Balance error path for GEM handle allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 6984128d01cf935820a0563f3a00c6623ba58109 ] The current error path for failure when establishing a handle for a GEM object is unbalanced, e.g. we call object_close() without first calling object_open(). Use the typical onion structure to only undo what has been set up prior to the error. Signed-off-by: Chris Wilson Reviewed-by: Ville Syrjälä Signed-off-by: Daniel Vetter Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/drm_gem.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index c7de454e8e88..b205224f1a44 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -338,27 +338,32 @@ drm_gem_handle_create_tail(struct drm_file *file_priv, spin_unlock(&file_priv->table_lock); idr_preload_end(); mutex_unlock(&dev->object_name_lock); - if (ret < 0) { - drm_gem_object_handle_unreference_unlocked(obj); - return ret; - } + if (ret < 0) + goto err_unref; + *handlep = ret; ret = drm_vma_node_allow(&obj->vma_node, file_priv->filp); - if (ret) { - drm_gem_handle_delete(file_priv, *handlep); - return ret; - } + if (ret) + goto err_remove; if (dev->driver->gem_open_object) { ret = dev->driver->gem_open_object(obj, file_priv); - if (ret) { - drm_gem_handle_delete(file_priv, *handlep); - return ret; - } + if (ret) + goto err_revoke; } return 0; + +err_revoke: + drm_vma_node_revoke(&obj->vma_node, file_priv->filp); +err_remove: + spin_lock(&file_priv->table_lock); + idr_remove(&file_priv->object_idr, *handlep); + spin_unlock(&file_priv->table_lock); +err_unref: + drm_gem_object_handle_unreference_unlocked(obj); + return ret; } /** From c00414f20096c279d317b7763e5b90eb15d58e21 Mon Sep 17 00:00:00 2001 From: Maruthi Srinivas Bayyavarapu Date: Wed, 11 May 2016 08:16:36 -0400 Subject: [PATCH 683/813] ALSA: hda: add AMD Polaris-10/11 AZ PCI IDs with proper driver caps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 8eb22214b7cb0c0a28be6caf3b81201629d8ea7c ] This commit fixes garbled audio on Polaris-10/11 variants.
Signed-off-by: Maruthi Bayyavarapu Reviewed-by: Alex Deucher Acked-by: Christian König Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/hda_intel.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 12f7f6fdae4d..d4671973d889 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2366,6 +2366,10 @@ static const struct pci_device_id azx_ids[] = { .driver_data = AZX_DRIVER_ATIHDMI_NS | AZX_DCAPS_PRESET_ATI_HDMI_NS }, { PCI_DEVICE(0x1002, 0xaae8), .driver_data = AZX_DRIVER_ATIHDMI_NS | AZX_DCAPS_PRESET_ATI_HDMI_NS }, + { PCI_DEVICE(0x1002, 0xaae0), + .driver_data = AZX_DRIVER_ATIHDMI_NS | AZX_DCAPS_PRESET_ATI_HDMI_NS }, + { PCI_DEVICE(0x1002, 0xaaf0), + .driver_data = AZX_DRIVER_ATIHDMI_NS | AZX_DCAPS_PRESET_ATI_HDMI_NS }, /* VIA VT8251/VT8237A */ { PCI_DEVICE(0x1106, 0x3288), .driver_data = AZX_DRIVER_VIA | AZX_DCAPS_POSFIX_VIA }, From 25dcddef047a2720d8cce5d8a46a9c69d7cbd142 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 8 Jun 2016 19:20:33 +0100 Subject: [PATCH 684/813] ecryptfs: fix handling of directory opening [ Upstream commit 6a480a7842545ec520a91730209ec0bae41694c1 ] First of all, trying to open them r/w is idiocy; it's guaranteed to fail. Moreover, assigning ->f_pos and assuming that everything will work is blatantly broken - try that with e.g. tmpfs as underlying layer and watch the fireworks. There may be a non-trivial amount of state associated with current IO position, well beyond the numeric offset. Using the single struct file associated with underlying inode is really not a good idea; we ought to open one for each ecryptfs directory struct file. Additionally, file_operations both for directories and non-directories are full of pointless methods; non-directories should *not* have ->iterate(), directories should not have ->flush(), ->fasync() and ->splice_read(). Signed-off-by: Al Viro Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/ecryptfs/file.c | 71 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 16 deletions(-) diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 11309683d65f..27794b137b24 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -112,7 +112,6 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx) .sb = inode->i_sb, }; lower_file = ecryptfs_file_to_lower(file); - lower_file->f_pos = ctx->pos; rc = iterate_dir(lower_file, &buf.ctx); ctx->pos = buf.ctx.pos; if (rc < 0) @@ -236,14 +235,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); - if (d_is_dir(ecryptfs_dentry)) { - ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); - mutex_lock(&crypt_stat->cs_mutex); - crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); - mutex_unlock(&crypt_stat->cs_mutex); - rc = 0; - goto out; - } rc = read_or_initialize_metadata(ecryptfs_dentry); if (rc) goto out_put; @@ -260,6 +251,45 @@ out: return rc; } +/** + * ecryptfs_dir_open + * @inode: inode speciying file to open + * @file: Structure to return filled in + * + * Opens the file specified by inode. 
+ * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_dir_open(struct inode *inode, struct file *file) +{ + struct dentry *ecryptfs_dentry = file->f_path.dentry; + /* Private value of ecryptfs_dentry allocated in + * ecryptfs_lookup() */ + struct ecryptfs_file_info *file_info; + struct file *lower_file; + + /* Released in ecryptfs_release or end of function if failure */ + file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); + ecryptfs_set_file_private(file, file_info); + if (unlikely(!file_info)) { + ecryptfs_printk(KERN_ERR, + "Error attempting to allocate memory\n"); + return -ENOMEM; + } + lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry), + file->f_flags, current_cred()); + if (IS_ERR(lower_file)) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%pd]; rc = [%ld]\n", __func__, + ecryptfs_dentry, PTR_ERR(lower_file)); + kmem_cache_free(ecryptfs_file_info_cache, file_info); + return PTR_ERR(lower_file); + } + ecryptfs_set_file_lower(file, lower_file); + return 0; +} + static int ecryptfs_flush(struct file *file, fl_owner_t td) { struct file *lower_file = ecryptfs_file_to_lower(file); @@ -280,6 +310,19 @@ static int ecryptfs_release(struct inode *inode, struct file *file) return 0; } +static int ecryptfs_dir_release(struct inode *inode, struct file *file) +{ + fput(ecryptfs_file_to_lower(file)); + kmem_cache_free(ecryptfs_file_info_cache, + ecryptfs_file_to_private(file)); + return 0; +} + +static loff_t ecryptfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return vfs_llseek(ecryptfs_file_to_lower(file), offset, whence); +} + static int ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { @@ -359,20 +402,16 @@ const struct file_operations ecryptfs_dir_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .open = ecryptfs_open, - .flush = ecryptfs_flush, - .release = ecryptfs_release, + .open = ecryptfs_dir_open, + .release = ecryptfs_dir_release, .fsync = ecryptfs_fsync, - .fasync = ecryptfs_fasync, - .splice_read = generic_file_splice_read, - .llseek = default_llseek, + .llseek = ecryptfs_dir_llseek, }; const struct file_operations ecryptfs_main_fops = { .llseek = generic_file_llseek, .read_iter = ecryptfs_read_update_atime, .write_iter = generic_file_write_iter, - .iterate = ecryptfs_readdir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, From e4b3a13ece27435f509b40235af7bec052b4dddb Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Tue, 22 Mar 2016 09:38:18 +1000 Subject: [PATCH 685/813] drm/radeon/mst: fix regression in lane/link handling. [ Upstream commit b36f7d26a7fdc0b07b1217368ee09bb8560269f8 ] The function this used changed in 092c96a8ab9d1bd60ada2ed385cc364ce084180e drm/radeon: fix dp link rate selection (v2) However for MST we should just always train to the max link/rate. Though we probably need to limit this for future hw, in theory radeon won't support it. This fixes my 30" monitor with MST enabled. 
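For context, a sketch of what the two helpers used in this patch amount to, mirroring the drm_dp_helper accessors of this era; treat the exact shape as an assumption, not code from the patch. Both simply decode the sink's DPCD capability bytes, so training with these values means training at the sink's maximum:

static u8 drm_dp_max_lane_count(const u8 dpcd[DP_RECEIVER_CAP_SIZE])
{
	/* DPCD byte 0x002: supported lane count, 1, 2 or 4 */
	return dpcd[DP_MAX_LANE_COUNT] & DP_MAX_LANE_COUNT_MASK;
}

static int drm_dp_max_link_rate(const u8 dpcd[DP_RECEIVER_CAP_SIZE])
{
	/* DPCD byte 0x001 holds a bandwidth code; the helper converts it
	 * to a link rate in kHz (e.g. 0x14 -> 540000 for HBR2) */
	return drm_dp_bw_code_to_link_rate(dpcd[DP_MAX_LINK_RATE]);
}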
Cc: stable@vger.kernel.org # v4.4 Signed-off-by: Dave Airlie Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/radeon_dp_mst.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_dp_mst.c b/drivers/gpu/drm/radeon/radeon_dp_mst.c index b431c9c2b247..6dd39bdedb97 100644 --- a/drivers/gpu/drm/radeon/radeon_dp_mst.c +++ b/drivers/gpu/drm/radeon/radeon_dp_mst.c @@ -525,17 +525,9 @@ static bool radeon_mst_mode_fixup(struct drm_encoder *encoder, drm_mode_set_crtcinfo(adjusted_mode, 0); { struct radeon_connector_atom_dig *dig_connector; - int ret; - dig_connector = mst_enc->connector->con_priv; - ret = radeon_dp_get_dp_link_config(&mst_enc->connector->base, - dig_connector->dpcd, adjusted_mode->clock, - &dig_connector->dp_lane_count, - &dig_connector->dp_clock); - if (ret) { - dig_connector->dp_lane_count = 0; - dig_connector->dp_clock = 0; - } + dig_connector->dp_lane_count = drm_dp_max_lane_count(dig_connector->dpcd); + dig_connector->dp_clock = drm_dp_max_link_rate(dig_connector->dpcd); DRM_DEBUG_KMS("dig clock %p %d %d\n", dig_connector, dig_connector->dp_lane_count, dig_connector->dp_clock); } From d774bfcec0980b1be5c02405c5ac2e6c4ede56b7 Mon Sep 17 00:00:00 2001 From: "Manoj N. Kumar" Date: Mon, 23 May 2016 14:30:42 -0600 Subject: [PATCH 686/813] cxlflash: Fix to resolve dead-lock during EEH recovery [ Upstream commit 635f6b0893cff193a1774881ebb1e4a4b9a7fead ] When a cxlflash adapter goes into EEH recovery and multiple processes (each having established its own context) are active, the EEH recovery can hang if the processes attempt to recover in parallel. The symptom logged after a couple of minutes is: INFO: task eehd:48 blocked for more than 120 seconds. Not tainted 4.5.0-491-26f710d+ #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. eehd 0 48 2 Call Trace: __switch_to+0x2f0/0x410 __schedule+0x300/0x980 schedule+0x48/0xc0 rwsem_down_write_failed+0x294/0x410 down_write+0x88/0xb0 cxlflash_pci_error_detected+0x100/0x1c0 [cxlflash] cxl_vphb_error_detected+0x88/0x110 [cxl] cxl_pci_error_detected+0xb0/0x1d0 [cxl] eeh_report_error+0xbc/0x130 eeh_pe_dev_traverse+0x94/0x160 eeh_handle_normal_event+0x17c/0x450 eeh_handle_event+0x184/0x370 eeh_event_handler+0x1c8/0x1d0 kthread+0x110/0x130 ret_from_kernel_thread+0x5c/0xa4 INFO: task blockio:33215 blocked for more than 120 seconds. Not tainted 4.5.0-491-26f710d+ #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. blockio 0 33215 33213 Call Trace: 0x1 (unreliable) __switch_to+0x2f0/0x410 __schedule+0x300/0x980 schedule+0x48/0xc0 rwsem_down_read_failed+0x124/0x1d0 down_read+0x68/0x80 cxlflash_ioctl+0x70/0x6f0 [cxlflash] scsi_ioctl+0x3b0/0x4c0 sg_ioctl+0x960/0x1010 do_vfs_ioctl+0xd8/0x8c0 SyS_ioctl+0xd4/0xf0 system_call+0x38/0xb4 INFO: task eehd:48 blocked for more than 120 seconds. The hang is because of a 3 way dead-lock: Process A holds the recovery mutex, and waits for eehd to complete. Process B holds the semaphore and waits for the recovery mutex. eehd waits for semaphore. The fix is to have Process B above release the semaphore before attempting to acquire the recovery mutex. This will allow eehd to proceed to completion. Signed-off-by: Manoj N. Kumar Reviewed-by: Matthew R. Ochs Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/cxlflash/superpipe.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/scsi/cxlflash/superpipe.c b/drivers/scsi/cxlflash/superpipe.c index 34b21a0a926a..babe7ccc1777 100644 --- a/drivers/scsi/cxlflash/superpipe.c +++ b/drivers/scsi/cxlflash/superpipe.c @@ -1590,6 +1590,13 @@ err1: * place at the same time and the failure was due to CXL services being * unable to keep up. + * + * As this routine is called on ioctl context, it holds the ioctl r/w + * semaphore that is used to drain ioctls in recovery scenarios. The + * implementation to achieve the pacing described above (a local mutex) + * requires that the ioctl r/w semaphore be dropped and reacquired to + * avoid a 3-way deadlock when multiple process recoveries operate in + * parallel. + * * Because a user can detect an error condition before the kernel, it is * quite possible for this routine to act as the kernel's EEH detection * source (MMIO read of mbox_r). Because of this, there is a window of @@ -1617,9 +1624,17 @@ static int cxlflash_afu_recover(struct scsi_device *sdev, int rc = 0; atomic_inc(&cfg->recovery_threads); + up_read(&cfg->ioctl_rwsem); rc = mutex_lock_interruptible(mutex); + down_read(&cfg->ioctl_rwsem); if (rc) goto out; + rc = check_state(cfg); + if (rc) { + dev_err(dev, "%s: Failed state! rc=%d\n", __func__, rc); + rc = -ENODEV; + goto out; + } dev_dbg(dev, "%s: reason 0x%016llX rctxid=%016llX\n", __func__, recover->reason, rctxid); From 4af03e19226cebe72d45c669ddc3b33666adf18e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 26 May 2016 10:25:51 -0600 Subject: [PATCH 687/813] blk-mq: End unstarted requests on dying queue [ Upstream commit a59e0f5795fe52dad42a99c00287e3766153b312 ] Go directly to ending a request if it wasn't started. Previously, completing a request could invoke a driver callback for a request it didn't initialize. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Acked-by: Christoph Hellwig Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- block/blk-mq.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 6d6f8feb48c0..839b1e17481b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -601,8 +601,10 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * If a request wasn't started before the queue was * marked dying, kill it here or it'll go unnoticed. */ - if (unlikely(blk_queue_dying(rq->q))) - blk_mq_complete_request(rq, -EIO); + if (unlikely(blk_queue_dying(rq->q))) { + rq->errors = -EIO; + blk_mq_end_request(rq, rq->errors); + } return; } if (rq->cmd_flags & REQ_NO_TIMEOUT) From b5282220fdd4a1e39b59a28fd499d03f2cb0c056 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Fri, 27 May 2016 14:59:00 -0400 Subject: [PATCH 688/813] btrfs: Continue write in case of can_not_nocow [ Upstream commit 4da2e26a2a32b174878744bd0f07db180c875f26 ] btrfs failed in xfstests btrfs/080 with -o nodatacow. It can be reproduced by the following script: DEV=/dev/vdg MNT=/mnt/tmp umount $DEV &>/dev/null mkfs.btrfs -f $DEV mount -o nodatacow $DEV $MNT dd if=/dev/zero of=$MNT/test bs=1 count=2048 & btrfs subvolume snapshot -r $MNT $MNT/test_snap & wait -- We can see that dd failed with NO_SPACE. Reason: __btrfs_buffered_write should fall back to a COW write when nocow is not possible, and the current code is designed with that logic.
However, check_can_nocow() has two kinds of return value (0 and <0) when nocow is not possible, and the current code continues the write only in the first case; the second case occurs while a subvolume snapshot is being taken. Fix: continue the write when check_can_nocow() returns either 0 or <0. Reviewed-by: Filipe Manana Signed-off-by: Zhao Lei Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/file.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5e5db3687e34..353f4bae658c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1526,27 +1526,24 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, reserve_bytes = num_pages << PAGE_CACHE_SHIFT; - if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | - BTRFS_INODE_PREALLOC)) { - ret = check_can_nocow(inode, pos, &write_bytes); - if (ret < 0) - break; - if (ret > 0) { - /* - * For nodata cow case, no need to reserve - * data space. - */ - only_release_metadata = true; - /* - * our prealloc extent may be smaller than - * write_bytes, so scale down. - */ - num_pages = DIV_ROUND_UP(write_bytes + offset, - PAGE_CACHE_SIZE); - reserve_bytes = num_pages << PAGE_CACHE_SHIFT; - goto reserve_metadata; - } + if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) && + check_can_nocow(inode, pos, &write_bytes) > 0) { + /* + * For nodata cow case, no need to reserve + * data space. + */ + only_release_metadata = true; + /* + * our prealloc extent may be smaller than + * write_bytes, so scale down. + */ + num_pages = DIV_ROUND_UP(write_bytes + offset, + PAGE_CACHE_SIZE); + reserve_bytes = num_pages << PAGE_CACHE_SHIFT; + goto reserve_metadata; } + ret = btrfs_check_data_free_space(inode, pos, write_bytes); if (ret < 0) break; From a2350f3d827a4669cf41b976b8cd083e6269c0fa Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 3 Jun 2016 20:38:35 -0600 Subject: [PATCH 689/813] clocksource: Allow unregistering the watchdog [ Upstream commit bbf66d897adf2bb0c310db96c97e8db6369f39e1 ] The Hyper-V vmbus module registers the TSC page clocksource when loaded. This is the clocksource with the highest rating and thus it becomes the watchdog, making unloading of the vmbus module impossible. Separate clocksource_select_watchdog() from clocksource_enqueue_watchdog() and use it on clocksource register/rating change/unregister. After all, lobotomized monkeys may need some love too. Signed-off-by: Vitaly Kuznetsov Cc: John Stultz Cc: Dexuan Cui Cc: K. Y. Srinivasan Link: http://lkml.kernel.org/r/1453483913-25672-1-git-send-email-vkuznets@redhat.com Signed-off-by: Thomas Gleixner Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- kernel/time/clocksource.c | 56 ++++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1347882d131e..b98810d2f3b4 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -323,13 +323,42 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) /* cs is a watchdog. */ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - /* Pick the best watchdog.
*/ - if (!watchdog || cs->rating > watchdog->rating) { - watchdog = cs; - /* Reset watchdog cycles */ - clocksource_reset_watchdog(); - } } + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +static void clocksource_select_watchdog(bool fallback) +{ + struct clocksource *cs, *old_wd; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + /* save current watchdog */ + old_wd = watchdog; + if (fallback) + watchdog = NULL; + + list_for_each_entry(cs, &clocksource_list, list) { + /* cs is a clocksource to be watched. */ + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) + continue; + + /* Skip current if we were requested for a fallback. */ + if (fallback && cs == old_wd) + continue; + + /* Pick the best watchdog. */ + if (!watchdog || cs->rating > watchdog->rating) + watchdog = cs; + } + /* If we failed to find a fallback restore the old one. */ + if (!watchdog) + watchdog = old_wd; + + /* If we changed the watchdog we need to reset cycles. */ + if (watchdog != old_wd) + clocksource_reset_watchdog(); + /* Check if the watchdog timer needs to be started. */ clocksource_start_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); @@ -404,6 +433,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } +static void clocksource_select_watchdog(bool fallback) { } static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } static inline int __clocksource_watchdog_kthread(void) { return 0; } @@ -736,6 +766,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) clocksource_enqueue(cs); clocksource_enqueue_watchdog(cs); clocksource_select(); + clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); return 0; } @@ -758,6 +789,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) mutex_lock(&clocksource_mutex); __clocksource_change_rating(cs, rating); clocksource_select(); + clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); } EXPORT_SYMBOL(clocksource_change_rating); @@ -767,12 +799,12 @@ EXPORT_SYMBOL(clocksource_change_rating); */ static int clocksource_unbind(struct clocksource *cs) { - /* - * I really can't convince myself to support this on hardware - * designed by lobotomized monkeys. - */ - if (clocksource_is_watchdog(cs)) - return -EBUSY; + if (clocksource_is_watchdog(cs)) { + /* Select and try to install a replacement watchdog. */ + clocksource_select_watchdog(true); + if (clocksource_is_watchdog(cs)) + return -EBUSY; + } if (cs == curr_clocksource) { /* Select and try to install a replacement clock source */ From 80cce3cde88e274943085c21241f445935cdfaa3 Mon Sep 17 00:00:00 2001 From: Ganapatrao Kulkarni Date: Wed, 25 May 2016 15:29:20 +0200 Subject: [PATCH 690/813] irqchip/gicv3-its: numa: Enable workaround for Cavium thunderx erratum 23144 [ Upstream commit fbf8f40e1658cb2f17452dbd3c708e329c5d27e0 ] This workaround fixes the ITS SYNC command hang by avoiding inter-node IO and collections/CPU mappings on the ThunderX dual-socket platform. The fix is only applicable to Cavium's ThunderX dual-socket platform.
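For reference, a rough sketch of how such an IIDR-keyed quirk table is walked, modeled on the gic_enable_quirks() pattern in irq-gic-common.c; a sketch under that assumption, not part of this patch:

static void gic_enable_quirks(u32 iidr, const struct gic_quirk *quirks,
			      void *data)
{
	for (; quirks->desc; quirks++) {
		/* e.g. 0xa100034c with mask 0xffff0fff matches ThunderX pass 1.x */
		if (quirks->iidr != (quirks->mask & iidr))
			continue;
		quirks->init(data);
		pr_info("GIC: enabling workaround for %s\n", quirks->desc);
	}
}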
Reviewed-by: Robert Richter Signed-off-by: Ganapatrao Kulkarni Signed-off-by: Robert Richter Signed-off-by: Marc Zyngier Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm64/Kconfig | 9 ++++++ drivers/irqchip/irq-gic-v3-its.c | 49 ++++++++++++++++++++++++++++++-- 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c7236d1a3f64..14cdc6dea493 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -391,6 +391,15 @@ config CAVIUM_ERRATUM_22375 If unsure, say Y. +config CAVIUM_ERRATUM_23144 + bool "Cavium erratum 23144: ITS SYNC hang on dual socket system" + depends on NUMA + default y + help + ITS SYNC command hang for cross node io and collections/cpu mapping. + + If unsure, say Y. + config CAVIUM_ERRATUM_23154 bool "Cavium erratum 23154: Access to ICC_IAR1_EL1 is not sync'ed" default y diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index a159529f9d53..c5f1757ac61d 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -41,6 +41,7 @@ #define ITS_FLAGS_CMDQ_NEEDS_FLUSHING (1ULL << 0) #define ITS_FLAGS_WORKAROUND_CAVIUM_22375 (1ULL << 1) +#define ITS_FLAGS_WORKAROUND_CAVIUM_23144 (1ULL << 2) #define RDIST_FLAGS_PROPBASE_NEEDS_FLUSHING (1 << 0) @@ -71,6 +72,7 @@ struct its_node { struct list_head its_device_list; u64 flags; u32 ite_size; + int numa_node; }; #define ITS_ITT_ALIGN SZ_256 @@ -600,11 +602,23 @@ static void its_unmask_irq(struct irq_data *d) static int its_set_affinity(struct irq_data *d, const struct cpumask *mask_val, bool force) { - unsigned int cpu = cpumask_any_and(mask_val, cpu_online_mask); + unsigned int cpu; + const struct cpumask *cpu_mask = cpu_online_mask; struct its_device *its_dev = irq_data_get_irq_chip_data(d); struct its_collection *target_col; u32 id = its_get_event_id(d); + /* lpi cannot be routed to a redistributor that is on a foreign node */ + if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) { + if (its_dev->its->numa_node >= 0) { + cpu_mask = cpumask_of_node(its_dev->its->numa_node); + if (!cpumask_intersects(mask_val, cpu_mask)) + return -EINVAL; + } + } + + cpu = cpumask_any_and(mask_val, cpu_mask); + if (cpu >= nr_cpu_ids) return -EINVAL; @@ -1081,6 +1095,16 @@ static void its_cpu_init_collection(void) list_for_each_entry(its, &its_nodes, entry) { u64 target; + /* avoid cross node collections and its mapping */ + if (its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) { + struct device_node *cpu_node; + + cpu_node = of_get_cpu_node(cpu, NULL); + if (its->numa_node != NUMA_NO_NODE && + its->numa_node != of_node_to_nid(cpu_node)) + continue; + } + /* * We now have to bind each collection to its target * redistributor. 
@@ -1308,9 +1332,14 @@ static void its_irq_domain_activate(struct irq_domain *domain, { struct its_device *its_dev = irq_data_get_irq_chip_data(d); u32 event = its_get_event_id(d); + const struct cpumask *cpu_mask = cpu_online_mask; + + /* get the cpu_mask of local node */ + if (its_dev->its->numa_node >= 0) + cpu_mask = cpumask_of_node(its_dev->its->numa_node); /* Bind the LPI to the first possible CPU */ - its_dev->event_map.col_map[event] = cpumask_first(cpu_online_mask); + its_dev->event_map.col_map[event] = cpumask_first(cpu_mask); /* Map the GIC IRQ and event to the device */ its_send_mapvi(its_dev, d->hwirq, event); @@ -1400,6 +1429,13 @@ static void __maybe_unused its_enable_quirk_cavium_22375(void *data) its->flags |= ITS_FLAGS_WORKAROUND_CAVIUM_22375; } +static void __maybe_unused its_enable_quirk_cavium_23144(void *data) +{ + struct its_node *its = data; + + its->flags |= ITS_FLAGS_WORKAROUND_CAVIUM_23144; +} + static const struct gic_quirk its_quirks[] = { #ifdef CONFIG_CAVIUM_ERRATUM_22375 { @@ -1408,6 +1444,14 @@ static const struct gic_quirk its_quirks[] = { .mask = 0xffff0fff, .init = its_enable_quirk_cavium_22375, }, +#endif +#ifdef CONFIG_CAVIUM_ERRATUM_23144 + { + .desc = "ITS: Cavium erratum 23144", + .iidr = 0xa100034c, /* ThunderX pass 1.x */ + .mask = 0xffff0fff, + .init = its_enable_quirk_cavium_23144, + }, #endif { } @@ -1470,6 +1514,7 @@ static int its_probe(struct device_node *node, struct irq_domain *parent) its->base = its_base; its->phys_base = res.start; its->ite_size = ((readl_relaxed(its_base + GITS_TYPER) >> 4) & 0xf) + 1; + its->numa_node = of_node_to_nid(node); its->cmd_base = kzalloc(ITS_CMD_QUEUE_SZ, GFP_KERNEL); if (!its->cmd_base) { From f7cd8506b35cf5b357c25a6f052de61ff88a724e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Jun 2016 07:42:17 -0600 Subject: [PATCH 691/813] block: fix blk_rq_get_max_sectors for driver private requests [ Upstream commit f21018427cb007a0894c36ad702990ab639cbbb4 ] Driver private request types should not get the artifical cap for the FS requests. This is important to use the full device capabilities for internal command or NVMe pass through commands. Signed-off-by: Christoph Hellwig Reported-by: Jeff Lien Tested-by: Jeff Lien Reviewed-by: Keith Busch Updated by me to use an explicit check for the one command type that does support extended checking, instead of relying on the ordering of the enum command values - as suggested by Keith. Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 168755791ec8..fe14382f9664 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -890,7 +890,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq) { struct request_queue *q = rq->q; - if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC)) + if (unlikely(rq->cmd_type != REQ_TYPE_FS)) return q->limits.max_hw_sectors; if (!q->limits.chunk_sectors || (rq->cmd_flags & REQ_DISCARD)) From 9f4a5a1c0cac1e655234984b3f877ae0b7585f00 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 3 Jun 2016 07:14:08 -0600 Subject: [PATCH 692/813] lpfc: Fix DMA faults observed upon plugging loopback connector [ Upstream commit ae09c765109293b600ba9169aa3d632e1ac1a843 ] Driver didn't program the REG_VFI mailbox correctly, giving the adapter bad addresses. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/lpfc/lpfc_mbox.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_mbox.c b/drivers/scsi/lpfc/lpfc_mbox.c index f87f90e9b7df..1e34b5408a29 100644 --- a/drivers/scsi/lpfc/lpfc_mbox.c +++ b/drivers/scsi/lpfc/lpfc_mbox.c @@ -2145,10 +2145,12 @@ lpfc_reg_vfi(struct lpfcMboxq *mbox, struct lpfc_vport *vport, dma_addr_t phys) reg_vfi->wwn[1] = cpu_to_le32(reg_vfi->wwn[1]); reg_vfi->e_d_tov = phba->fc_edtov; reg_vfi->r_a_tov = phba->fc_ratov; - reg_vfi->bde.addrHigh = putPaddrHigh(phys); - reg_vfi->bde.addrLow = putPaddrLow(phys); - reg_vfi->bde.tus.f.bdeSize = sizeof(vport->fc_sparam); - reg_vfi->bde.tus.f.bdeFlags = BUFF_TYPE_BDE_64; + if (phys) { + reg_vfi->bde.addrHigh = putPaddrHigh(phys); + reg_vfi->bde.addrLow = putPaddrLow(phys); + reg_vfi->bde.tus.f.bdeSize = sizeof(vport->fc_sparam); + reg_vfi->bde.tus.f.bdeFlags = BUFF_TYPE_BDE_64; + } bf_set(lpfc_reg_vfi_nport_id, reg_vfi, vport->fc_myDID); /* Only FC supports upd bit */ From 5349cdd3b49cea2d57dc05bbf3f313979751fed3 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 14 Jun 2016 10:55:22 -0700 Subject: [PATCH 693/813] HID: core: prevent out-of-bound readings [ Upstream commit 50220dead1650609206efe91f0cc116132d59b3f ] Plugging a Logitech DJ receiver with KASAN activated raises a bunch of out-of-bound readings. The fields are allocated up to MAX_USAGE, meaning that potentially, we do not have enough fields to fit the incoming values. Add checks and silence KASAN. Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/hid/hid-core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index ec791e169f8f..936960202cf4 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1251,6 +1251,7 @@ static void hid_input_field(struct hid_device *hid, struct hid_field *field, /* Ignore report if ErrorRollOver */ if (!(field->flags & HID_MAIN_ITEM_VARIABLE) && value[n] >= min && value[n] <= max && + value[n] - min < field->maxusage && field->usage[value[n] - min].hid == HID_UP_KEYBOARD + 1) goto exit; } @@ -1263,11 +1264,13 @@ static void hid_input_field(struct hid_device *hid, struct hid_field *field, } if (field->value[n] >= min && field->value[n] <= max + && field->value[n] - min < field->maxusage && field->usage[field->value[n] - min].hid && search(value, field->value[n], count)) hid_process_event(hid, field, &field->usage[field->value[n] - min], 0, interrupt); if (value[n] >= min && value[n] <= max + && value[n] - min < field->maxusage && field->usage[value[n] - min].hid && search(field->value, value[n], count)) hid_process_event(hid, field, &field->usage[value[n] - min], 1, interrupt); From e29e85ac08345e9276bff3ca39debc88a46d79a4 Mon Sep 17 00:00:00 2001 From: Paulo Flabiano Smorigo Date: Thu, 5 May 2016 11:09:27 -0300 Subject: [PATCH 694/813] crypto: vmx - comply with ABIs that specify vrsave as reserved. [ Upstream commit 5ca55738201c7ae1b556ad87bbb22c139ecc01dd ] It gives significant improvements ( ~+15%) on some modes. These code has been adopted from OpenSSL project in collaboration with the original author (Andy Polyakov ). 
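Note (editorial aside): the ppc-xlate.pl change below substitutes harmless instructions for accesses to SPR 256 (vrsave) on ABIs that reserve it. A rough C rendering of that emitter logic, under the same substitution rules and with hypothetical names (emit_mtspr/emit_mfspr), might look like this:

	#include <stdio.h>

	#define SPR_VRSAVE 256

	/* Emit a move-to-SPR; on ABIs that reserve vrsave, substitute the
	 * no-op "or ra,ra,ra", as the Perl translator below does. */
	static void emit_mtspr(int spr, const char *ra, int vrsave_reserved)
	{
		if (spr == SPR_VRSAVE && vrsave_reserved)
			printf("\tor %s,%s,%s\n", ra, ra, ra);
		else
			printf("\tmtspr %d,%s\n", spr, ra);
	}

	/* Emit a move-from-SPR; for reserved vrsave, load -1 so callers
	 * behave as if every vector register were live. */
	static void emit_mfspr(const char *rd, int spr, int vrsave_reserved)
	{
		if (spr == SPR_VRSAVE && vrsave_reserved)
			printf("\tli %s,-1\n", rd);
		else
			printf("\tmfspr %s,%d\n", rd, spr);
	}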
Signed-off-by: Paulo Flabiano Smorigo Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/vmx/ppc-xlate.pl | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/crypto/vmx/ppc-xlate.pl b/drivers/crypto/vmx/ppc-xlate.pl index b9997335f193..9f4994cabcc7 100644 --- a/drivers/crypto/vmx/ppc-xlate.pl +++ b/drivers/crypto/vmx/ppc-xlate.pl @@ -139,6 +139,26 @@ my $vmr = sub { " vor $vx,$vy,$vy"; }; +# Some ABIs specify vrsave, special-purpose register #256, as reserved +# for system use. +my $no_vrsave = ($flavour =~ /aix|linux64le/); +my $mtspr = sub { + my ($f,$idx,$ra) = @_; + if ($idx == 256 && $no_vrsave) { + " or $ra,$ra,$ra"; + } else { + " mtspr $idx,$ra"; + } +}; +my $mfspr = sub { + my ($f,$rd,$idx) = @_; + if ($idx == 256 && $no_vrsave) { + " li $rd,-1"; + } else { + " mfspr $rd,$idx"; + } +}; + # PowerISA 2.06 stuff sub vsxmem_op { my ($f, $vrt, $ra, $rb, $op) = @_; From 4be6661ccec8a1dfe8c5242126b8efe316d894c6 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Fri, 10 Jun 2016 16:47:02 +1000 Subject: [PATCH 695/813] crypto: vmx - Fix ABI detection [ Upstream commit 975f57fdff1d0eb9816806cabd27162a8a1a4038 ] When calling ppc-xlate.pl, we pass it either linux-ppc64 or linux-ppc64le. The script however was expecting linux64le, a result of its OpenSSL origins. This means we aren't obeying the ppc64le ABIv2 rules. Fix this by checking for linux-ppc64le. Fixes: 5ca55738201c ("crypto: vmx - comply with ABIs that specify vrsave as reserved.") Cc: stable@vger.kernel.org Signed-off-by: Anton Blanchard Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/vmx/ppc-xlate.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/vmx/ppc-xlate.pl b/drivers/crypto/vmx/ppc-xlate.pl index 9f4994cabcc7..b18e67d0e065 100644 --- a/drivers/crypto/vmx/ppc-xlate.pl +++ b/drivers/crypto/vmx/ppc-xlate.pl @@ -141,7 +141,7 @@ my $vmr = sub { # Some ABIs specify vrsave, special-purpose register #256, as reserved # for system use. -my $no_vrsave = ($flavour =~ /aix|linux64le/); +my $no_vrsave = ($flavour =~ /linux-ppc64le/); my $mtspr = sub { my ($f,$idx,$ra) = @_; if ($idx == 256 && $no_vrsave) { From 1062520ea49aa89b42119e868d44a67699e3a552 Mon Sep 17 00:00:00 2001 From: Matthias Schwarzott Date: Mon, 20 Jun 2016 06:22:12 -0600 Subject: [PATCH 696/813] tda10071: Fix dependency to REGMAP_I2C [ Upstream commit b046d3ad38d90276379c862f15ddd99fa8739906 ] Without it, I get this error for my dvb-card: tda10071: Unknown symbol devm_regmap_init_i2c (err 0) cx23885_dvb_register() dvb_register failed err = -22 cx23885_dev_setup() Failed to register dvb adapters on VID_B Signed-off-by: Matthias Schwarzott Reviewed-by: Antti Palosaari Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/media/dvb-frontends/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/dvb-frontends/Kconfig b/drivers/media/dvb-frontends/Kconfig index 292c9479bb75..310e4b8beae8 100644 --- a/drivers/media/dvb-frontends/Kconfig +++ b/drivers/media/dvb-frontends/Kconfig @@ -264,7 +264,7 @@ config DVB_MB86A16 config DVB_TDA10071 tristate "NXP TDA10071" depends on DVB_CORE && I2C - select REGMAP + select REGMAP_I2C default m if !MEDIA_SUBDRV_AUTOSELECT help Say Y when you want to support this frontend.
From 3d390b83a990e688edc5682bee55c52ee46c6880 Mon Sep 17 00:00:00 2001 From: Leonidas Da Silva Barbosa Date: Mon, 27 Jun 2016 09:12:02 -0600 Subject: [PATCH 697/813] crypto: vmx - IV size failing on skcipher API [ Upstream commit 0d3d054b43719ef33232677ba27ba6097afdafbc ] IV size was zero for the CBC and CTR modes, causing a bug triggered by skcipher. Fix this by adding the correct size. Signed-off-by: Leonidas Da Silva Barbosa Signed-off-by: Paulo Smorigo Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/vmx/aes_cbc.c | 2 +- drivers/crypto/vmx/aes_ctr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/vmx/aes_cbc.c b/drivers/crypto/vmx/aes_cbc.c index f3801b983f42..3f8bb9a40df1 100644 --- a/drivers/crypto/vmx/aes_cbc.c +++ b/drivers/crypto/vmx/aes_cbc.c @@ -191,7 +191,7 @@ struct crypto_alg p8_aes_cbc_alg = { .cra_init = p8_aes_cbc_init, .cra_exit = p8_aes_cbc_exit, .cra_blkcipher = { - .ivsize = 0, + .ivsize = AES_BLOCK_SIZE, .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, .setkey = p8_aes_cbc_setkey, diff --git a/drivers/crypto/vmx/aes_ctr.c b/drivers/crypto/vmx/aes_ctr.c index 404a1b69a3ab..72f138985e18 100644 --- a/drivers/crypto/vmx/aes_ctr.c +++ b/drivers/crypto/vmx/aes_ctr.c @@ -175,7 +175,7 @@ struct crypto_alg p8_aes_ctr_alg = { .cra_init = p8_aes_ctr_init, .cra_exit = p8_aes_ctr_exit, .cra_blkcipher = { - .ivsize = 0, + .ivsize = AES_BLOCK_SIZE, .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, .setkey = p8_aes_ctr_setkey, From c5852a85ed87d963ea5bcfead4f61fc9a276882c Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 15 Apr 2016 15:50:32 +0200 Subject: [PATCH 698/813] x86/hyperv: Avoid reporting bogus NMI status for Gen2 instances [ Upstream commit 1e2ae9ec072f3b7887f456426bc2cf23b80f661a ] Generation2 instances don't support reporting the NMI status on port 0x61; a read from there returns 'ff' and we end up reporting a nonsensical PCI error (as there is no PCI bus in these instances) on all NMIs: NMI: PCI system error (SERR) for reason ff on CPU 0. Dazed and confused, but trying to continue Fix the issue by overriding x86_platform.get_nmi_reason. Use the 'booted on EFI' flag to detect Gen2 instances. Signed-off-by: Vitaly Kuznetsov Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Cathy Avery Cc: Haiyang Zhang Cc: Jiri Olsa Cc: K. Y. Srinivasan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: devel@linuxdriverproject.org Link: http://lkml.kernel.org/r/1460728232-31433-1-git-send-email-vkuznets@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/mshyperv.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 20e242ea1bc4..cfc4a966e2b9 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -152,6 +152,11 @@ static struct clocksource hyperv_cs = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; +static unsigned char hv_get_nmi_reason(void) +{ + return 0; +} + static void __init ms_hyperv_init_platform(void) { /* @@ -191,6 +196,13 @@ static void __init ms_hyperv_init_platform(void) machine_ops.crash_shutdown = hv_machine_crash_shutdown; #endif mark_tsc_unstable("running on Hyper-V"); + + /* + * Generation 2 instances don't support reading the NMI status from + * 0x61 port.
+ */ + if (efi_enabled(EFI_BOOT)) + x86_platform.get_nmi_reason = hv_get_nmi_reason; } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { From be9755f99926603ded05d50b8adef1e73234b3fd Mon Sep 17 00:00:00 2001 From: Sunil Goutham Date: Mon, 27 Jun 2016 15:30:02 +0530 Subject: [PATCH 699/813] net: thunderx: Fix link status reporting [ Upstream commit 3f4c68cfde30caa1f6d8368fd19590671411ade2 ] Check for SMU RX local/remote faults along with SPU LINK status. Otherwise at times link is UP at our end but DOWN at link partner's side. Also due to an issue in BGX it's rarely seen that initialization doesn't happen properly and SMU RX reports faults with everything fine at SPU. This patch tries to reinitialize LMAC to fix it. Also fixed LMAC disable sequence to properly bring down link. Signed-off-by: Sunil Goutham Signed-off-by: Tao Wang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- .../net/ethernet/cavium/thunder/thunder_bgx.c | 91 ++++++++++++------- .../net/ethernet/cavium/thunder/thunder_bgx.h | 2 + 2 files changed, 62 insertions(+), 31 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c index 9df26c2263bc..42718cc7d4e8 100644 --- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c +++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c @@ -549,7 +549,9 @@ static int bgx_xaui_check_link(struct lmac *lmac) } /* Clear rcvflt bit (latching high) and read it back */ - bgx_reg_modify(bgx, lmacid, BGX_SPUX_STATUS2, SPU_STATUS2_RCVFLT); + if (bgx_reg_read(bgx, lmacid, BGX_SPUX_STATUS2) & SPU_STATUS2_RCVFLT) + bgx_reg_modify(bgx, lmacid, + BGX_SPUX_STATUS2, SPU_STATUS2_RCVFLT); if (bgx_reg_read(bgx, lmacid, BGX_SPUX_STATUS2) & SPU_STATUS2_RCVFLT) { dev_err(&bgx->pdev->dev, "Receive fault, retry training\n"); if (bgx->use_training) { @@ -568,13 +570,6 @@ static int bgx_xaui_check_link(struct lmac *lmac) return -1; } - /* Wait for MAC RX to be ready */ - if (bgx_poll_reg(bgx, lmacid, BGX_SMUX_RX_CTL, - SMU_RX_CTL_STATUS, true)) { - dev_err(&bgx->pdev->dev, "SMU RX link not okay\n"); - return -1; - } - /* Wait for BGX RX to be idle */ if (bgx_poll_reg(bgx, lmacid, BGX_SMUX_CTL, SMU_CTL_RX_IDLE, false)) { dev_err(&bgx->pdev->dev, "SMU RX not idle\n"); @@ -587,29 +582,30 @@ static int bgx_xaui_check_link(struct lmac *lmac) return -1; } - if (bgx_reg_read(bgx, lmacid, BGX_SPUX_STATUS2) & SPU_STATUS2_RCVFLT) { - dev_err(&bgx->pdev->dev, "Receive fault\n"); - return -1; - } - - /* Receive link is latching low. Force it high and verify it */ - bgx_reg_modify(bgx, lmacid, BGX_SPUX_STATUS1, SPU_STATUS1_RCV_LNK); - if (bgx_poll_reg(bgx, lmacid, BGX_SPUX_STATUS1, - SPU_STATUS1_RCV_LNK, false)) { - dev_err(&bgx->pdev->dev, "SPU receive link down\n"); - return -1; - } - + /* Clear receive packet disable */ cfg = bgx_reg_read(bgx, lmacid, BGX_SPUX_MISC_CONTROL); cfg &= ~SPU_MISC_CTL_RX_DIS; bgx_reg_write(bgx, lmacid, BGX_SPUX_MISC_CONTROL, cfg); - return 0; + + /* Check for MAC RX faults */ + cfg = bgx_reg_read(bgx, lmacid, BGX_SMUX_RX_CTL); + /* 0 - Link is okay, 1 - Local fault, 2 - Remote fault */ + cfg &= SMU_RX_CTL_STATUS; + if (!cfg) + return 0; + + /* Rx local/remote fault seen. 
+ * Do lmac reinit to see if condition recovers + */ + bgx_lmac_xaui_init(bgx, lmacid, bgx->lmac_type); + + return -1; } static void bgx_poll_for_link(struct work_struct *work) { struct lmac *lmac; - u64 link; + u64 spu_link, smu_link; lmac = container_of(work, struct lmac, dwork.work); @@ -619,8 +615,11 @@ static void bgx_poll_for_link(struct work_struct *work) bgx_poll_reg(lmac->bgx, lmac->lmacid, BGX_SPUX_STATUS1, SPU_STATUS1_RCV_LNK, false); - link = bgx_reg_read(lmac->bgx, lmac->lmacid, BGX_SPUX_STATUS1); - if (link & SPU_STATUS1_RCV_LNK) { + spu_link = bgx_reg_read(lmac->bgx, lmac->lmacid, BGX_SPUX_STATUS1); + smu_link = bgx_reg_read(lmac->bgx, lmac->lmacid, BGX_SMUX_RX_CTL); + + if ((spu_link & SPU_STATUS1_RCV_LNK) && + !(smu_link & SMU_RX_CTL_STATUS)) { lmac->link_up = 1; if (lmac->bgx->lmac_type == BGX_MODE_XLAUI) lmac->last_speed = 40000; @@ -634,9 +633,15 @@ static void bgx_poll_for_link(struct work_struct *work) } if (lmac->last_link != lmac->link_up) { + if (lmac->link_up) { + if (bgx_xaui_check_link(lmac)) { + /* Errors, clear link_up state */ + lmac->link_up = 0; + lmac->last_speed = SPEED_UNKNOWN; + lmac->last_duplex = DUPLEX_UNKNOWN; + } + } lmac->last_link = lmac->link_up; - if (lmac->link_up) - bgx_xaui_check_link(lmac); } queue_delayed_work(lmac->check_link, &lmac->dwork, HZ * 2); @@ -708,7 +713,7 @@ static int bgx_lmac_enable(struct bgx *bgx, u8 lmacid) static void bgx_lmac_disable(struct bgx *bgx, u8 lmacid) { struct lmac *lmac; - u64 cmrx_cfg; + u64 cfg; lmac = &bgx->lmac[lmacid]; if (lmac->check_link) { @@ -717,9 +722,33 @@ static void bgx_lmac_disable(struct bgx *bgx, u8 lmacid) destroy_workqueue(lmac->check_link); } - cmrx_cfg = bgx_reg_read(bgx, lmacid, BGX_CMRX_CFG); - cmrx_cfg &= ~(1 << 15); - bgx_reg_write(bgx, lmacid, BGX_CMRX_CFG, cmrx_cfg); + /* Disable packet reception */ + cfg = bgx_reg_read(bgx, lmacid, BGX_CMRX_CFG); + cfg &= ~CMR_PKT_RX_EN; + bgx_reg_write(bgx, lmacid, BGX_CMRX_CFG, cfg); + + /* Give chance for Rx/Tx FIFO to get drained */ + bgx_poll_reg(bgx, lmacid, BGX_CMRX_RX_FIFO_LEN, (u64)0x1FFF, true); + bgx_poll_reg(bgx, lmacid, BGX_CMRX_TX_FIFO_LEN, (u64)0x3FFF, true); + + /* Disable packet transmission */ + cfg = bgx_reg_read(bgx, lmacid, BGX_CMRX_CFG); + cfg &= ~CMR_PKT_TX_EN; + bgx_reg_write(bgx, lmacid, BGX_CMRX_CFG, cfg); + + /* Disable serdes lanes */ + if (!lmac->is_sgmii) + bgx_reg_modify(bgx, lmacid, + BGX_SPUX_CONTROL1, SPU_CTL_LOW_POWER); + else + bgx_reg_modify(bgx, lmacid, + BGX_GMP_PCS_MRX_CTL, PCS_MRX_CTL_PWR_DN); + + /* Disable LMAC */ + cfg = bgx_reg_read(bgx, lmacid, BGX_CMRX_CFG); + cfg &= ~CMR_EN; + bgx_reg_write(bgx, lmacid, BGX_CMRX_CFG, cfg); + bgx_flush_dmac_addrs(bgx, lmacid); if ((bgx->lmac_type != BGX_MODE_XFI) && diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.h b/drivers/net/ethernet/cavium/thunder/thunder_bgx.h index 149e179363a1..42010d2e5ddf 100644 --- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.h +++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.h @@ -41,6 +41,7 @@ #define BGX_CMRX_RX_STAT10 0xC0 #define BGX_CMRX_RX_BP_DROP 0xC8 #define BGX_CMRX_RX_DMAC_CTL 0x0E8 +#define BGX_CMRX_RX_FIFO_LEN 0x108 #define BGX_CMR_RX_DMACX_CAM 0x200 #define RX_DMACX_CAM_EN BIT_ULL(48) #define RX_DMACX_CAM_LMACID(x) (x << 49) @@ -50,6 +51,7 @@ #define BGX_CMR_CHAN_MSK_AND 0x450 #define BGX_CMR_BIST_STATUS 0x460 #define BGX_CMR_RX_LMACS 0x468 +#define BGX_CMRX_TX_FIFO_LEN 0x518 #define BGX_CMRX_TX_STAT0 0x600 #define BGX_CMRX_TX_STAT1 0x608 #define BGX_CMRX_TX_STAT2 0x610 From 
822480f8f45473c41307e2abe81de681ca2ca349 Mon Sep 17 00:00:00 2001 From: Pavel Rojtberg Date: Fri, 1 Jul 2016 17:32:09 -0400 Subject: [PATCH 700/813] Input: xpad - move pending clear to the correct location [ Upstream commit 4efc6939a83c54fb3417541be48991afd0290ba3 ] otherwise we lose ff commands: https://github.com/paroj/xpad/issues/27 Signed-off-by: Pavel Rojtberg Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/input/joystick/xpad.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 0679c4143c29..aff42d5e2296 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -718,6 +718,7 @@ static bool xpad_prepare_next_out_packet(struct usb_xpad *xpad) if (packet) { memcpy(xpad->odata, packet->data, packet->len); xpad->irq_out->transfer_buffer_length = packet->len; + packet->pending = false; return true; } @@ -757,7 +758,6 @@ static void xpad_irq_out(struct urb *urb) switch (status) { case 0: /* success */ - xpad->out_packets[xpad->last_out_packet].pending = false; xpad->irq_out_active = xpad_prepare_next_out_packet(xpad); break; From af115b1a3adff2d01ff2e55a184adf3775bf2b1c Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 7 Jul 2016 16:24:28 +0800 Subject: [PATCH 701/813] drm/i915: Only ignore eDP ports that are connected MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 457c52d87e5dac9a4cf1a6a287e60ea7645067d4 ] If the VBT says that a certain port should be eDP (and hence fused off from HDMI), but in reality it isn't, we need to try and acquire the HDMI connection instead. So only trust the VBT edp setting if we can connect to an eDP device on that port. Fixes: d2182a6608 (drm/i915: Don't register HDMI connectors for eDP ports on VLV/CHV) References: https://bugs.freedesktop.org/show_bug.cgi?id=96288 Signed-off-by: Chris Wilson Tested-by: Phidias Chiang Cc: Ville Syrjälä Cc: Jani Nikula Cc: Daniel Vetter Reviewed-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1464766070-31623-1-git-send-email-chris@chris-wilson.co.uk Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/intel_display.c | 20 ++++++++++---------- drivers/gpu/drm/i915/intel_dp.c | 12 ++++++------ drivers/gpu/drm/i915/intel_drv.h | 2 +- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 3292495ee10f..a95445fc46d6 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -14160,6 +14160,8 @@ static void intel_setup_outputs(struct drm_device *dev) if (I915_READ(PCH_DP_D) & DP_DETECTED) intel_dp_init(dev, PCH_DP_D, PORT_D); } else if (IS_VALLEYVIEW(dev)) { + bool has_edp; + /* * The DP_DETECTED bit is the latched state of the DDC * SDA pin at boot. However since eDP doesn't require DDC @@ -14169,19 +14171,17 @@ static void intel_setup_outputs(struct drm_device *dev) * eDP ports. Consult the VBT as well as DP_DETECTED to * detect eDP ports. 
*/ - if (I915_READ(VLV_HDMIB) & SDVO_DETECTED && - !intel_dp_is_edp(dev, PORT_B)) + has_edp = intel_dp_is_edp(dev, PORT_B); + if (I915_READ(VLV_DP_B) & DP_DETECTED || has_edp) + has_edp &= intel_dp_init(dev, VLV_DP_B, PORT_B); + if (I915_READ(VLV_HDMIB) & SDVO_DETECTED && !has_edp) intel_hdmi_init(dev, VLV_HDMIB, PORT_B); - if (I915_READ(VLV_DP_B) & DP_DETECTED || - intel_dp_is_edp(dev, PORT_B)) - intel_dp_init(dev, VLV_DP_B, PORT_B); - if (I915_READ(VLV_HDMIC) & SDVO_DETECTED && - !intel_dp_is_edp(dev, PORT_C)) + has_edp = intel_dp_is_edp(dev, PORT_C); + if (I915_READ(VLV_DP_C) & DP_DETECTED || has_edp) + has_edp &= intel_dp_init(dev, VLV_DP_C, PORT_C); + if (I915_READ(VLV_HDMIC) & SDVO_DETECTED && !has_edp) intel_hdmi_init(dev, VLV_HDMIC, PORT_C); - if (I915_READ(VLV_DP_C) & DP_DETECTED || - intel_dp_is_edp(dev, PORT_C)) - intel_dp_init(dev, VLV_DP_C, PORT_C); if (IS_CHERRYVIEW(dev)) { /* eDP not supported on port D, so don't check VBT */ diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c index 1f8a10fb95ab..ebbd23407a80 100644 --- a/drivers/gpu/drm/i915/intel_dp.c +++ b/drivers/gpu/drm/i915/intel_dp.c @@ -6113,8 +6113,9 @@ intel_dp_init_connector(struct intel_digital_port *intel_dig_port, return true; } -void -intel_dp_init(struct drm_device *dev, int output_reg, enum port port) +bool intel_dp_init(struct drm_device *dev, + int output_reg, + enum port port) { struct drm_i915_private *dev_priv = dev->dev_private; struct intel_digital_port *intel_dig_port; @@ -6124,7 +6125,7 @@ intel_dp_init(struct drm_device *dev, int output_reg, enum port port) intel_dig_port = kzalloc(sizeof(*intel_dig_port), GFP_KERNEL); if (!intel_dig_port) - return; + return false; intel_connector = intel_connector_alloc(); if (!intel_connector) @@ -6179,15 +6180,14 @@ intel_dp_init(struct drm_device *dev, int output_reg, enum port port) if (!intel_dp_init_connector(intel_dig_port, intel_connector)) goto err_init_connector; - return; + return true; err_init_connector: drm_encoder_cleanup(encoder); kfree(intel_connector); err_connector_alloc: kfree(intel_dig_port); - - return; + return false; } void intel_dp_mst_suspend(struct drm_device *dev) diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index c5f11e0c5d5b..67f72a7ee7cb 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -1195,7 +1195,7 @@ void intel_csr_ucode_fini(struct drm_device *dev); void assert_csr_loaded(struct drm_i915_private *dev_priv); /* intel_dp.c */ -void intel_dp_init(struct drm_device *dev, int output_reg, enum port port); +bool intel_dp_init(struct drm_device *dev, int output_reg, enum port port); bool intel_dp_init_connector(struct intel_digital_port *intel_dig_port, struct intel_connector *intel_connector); void intel_dp_set_link_params(struct intel_dp *intel_dp, From e153f52df99a88614384dcc6568014ece08802ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Thu, 7 Jul 2016 16:24:29 +0800 Subject: [PATCH 702/813] drm/i915: Check VBT for port presence in addition to the strap on VLV/CHV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 22f35042593c2b369861f0b9740efb8065a42db0 ] Apparently some CHV boards failed to hook up the port presence straps for HDMI ports as well (earlier we assumed this problem only affected eDP ports). So let's check the VBT in addition to the strap, and if either one claims that the port is present go ahead and register the relevant connector. 
While at it, change port D to register DP before HDMI as we do for ports B and C since commit 457c52d87e5d ("drm/i915: Only ignore eDP ports that are connected") Also print a debug message when we register a HDMI connector to aid in diagnosing missing/incorrect ports. We already had such a print for DP/eDP. v2: Improve the comment in the code a bit, note the port D change in the commit message Cc: Radoslav Duda Tested-by: Radoslav Duda Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=96321 Signed-off-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1464945463-14364-1-git-send-email-ville.syrjala@linux.intel.com Reviewed-by: Chris Wilson Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_drv.h | 3 +++ drivers/gpu/drm/i915/intel_bios.c | 39 ++++++++++++++++++++++++++++ drivers/gpu/drm/i915/intel_display.c | 30 ++++++++++++++------- drivers/gpu/drm/i915/intel_hdmi.c | 3 +++ 4 files changed, 66 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index d3ce4da6a6ad..d400d6773bbb 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3313,6 +3313,9 @@ static inline bool intel_gmbus_is_forced_bit(struct i2c_adapter *adapter) } extern void intel_i2c_reset(struct drm_device *dev); +/* intel_bios.c */ +bool intel_bios_is_port_present(struct drm_i915_private *dev_priv, enum port port); + /* intel_opregion.c */ #ifdef CONFIG_ACPI extern int intel_opregion_setup(struct drm_device *dev); diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c index ce82f9c7df24..d14bdc537587 100644 --- a/drivers/gpu/drm/i915/intel_bios.c +++ b/drivers/gpu/drm/i915/intel_bios.c @@ -1351,3 +1351,42 @@ intel_parse_bios(struct drm_device *dev) return 0; } + +/** + * intel_bios_is_port_present - is the specified digital port present + * @dev_priv: i915 device instance + * @port: port to check + * + * Return true if the device in %port is present. + */ +bool intel_bios_is_port_present(struct drm_i915_private *dev_priv, enum port port) +{ + static const struct { + u16 dp, hdmi; + } port_mapping[] = { + [PORT_B] = { DVO_PORT_DPB, DVO_PORT_HDMIB, }, + [PORT_C] = { DVO_PORT_DPC, DVO_PORT_HDMIC, }, + [PORT_D] = { DVO_PORT_DPD, DVO_PORT_HDMID, }, + [PORT_E] = { DVO_PORT_DPE, DVO_PORT_HDMIE, }, + }; + int i; + + /* FIXME maybe deal with port A as well? 
*/ + if (WARN_ON(port == PORT_A) || port >= ARRAY_SIZE(port_mapping)) + return false; + + if (!dev_priv->vbt.child_dev_num) + return false; + + for (i = 0; i < dev_priv->vbt.child_dev_num; i++) { + const union child_device_config *p_child = + &dev_priv->vbt.child_dev[i]; + if ((p_child->common.dvo_port == port_mapping[port].dp || + p_child->common.dvo_port == port_mapping[port].hdmi) && + (p_child->common.device_type & (DEVICE_TYPE_TMDS_DVI_SIGNALING | + DEVICE_TYPE_DISPLAYPORT_OUTPUT))) + return true; + } + + return false; +} diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index a95445fc46d6..a3254c3bcc7c 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -14160,7 +14160,7 @@ static void intel_setup_outputs(struct drm_device *dev) if (I915_READ(PCH_DP_D) & DP_DETECTED) intel_dp_init(dev, PCH_DP_D, PORT_D); } else if (IS_VALLEYVIEW(dev)) { - bool has_edp; + bool has_edp, has_port; /* * The DP_DETECTED bit is the latched state of the DDC @@ -14170,25 +14170,37 @@ static void intel_setup_outputs(struct drm_device *dev) * Thus we can't rely on the DP_DETECTED bit alone to detect * eDP ports. Consult the VBT as well as DP_DETECTED to * detect eDP ports. + * + * Sadly the straps seem to be missing sometimes even for HDMI + * ports (eg. on Voyo V3 - CHT x7-Z8700), so check both strap + * and VBT for the presence of the port. Additionally we can't + * trust the port type the VBT declares as we've seen at least + * HDMI ports that the VBT claim are DP or eDP. */ has_edp = intel_dp_is_edp(dev, PORT_B); - if (I915_READ(VLV_DP_B) & DP_DETECTED || has_edp) + has_port = intel_bios_is_port_present(dev_priv, PORT_B); + if (I915_READ(VLV_DP_B) & DP_DETECTED || has_port) has_edp &= intel_dp_init(dev, VLV_DP_B, PORT_B); - if (I915_READ(VLV_HDMIB) & SDVO_DETECTED && !has_edp) + if ((I915_READ(VLV_HDMIB) & SDVO_DETECTED || has_port) && !has_edp) intel_hdmi_init(dev, VLV_HDMIB, PORT_B); has_edp = intel_dp_is_edp(dev, PORT_C); - if (I915_READ(VLV_DP_C) & DP_DETECTED || has_edp) + has_port = intel_bios_is_port_present(dev_priv, PORT_C); + if (I915_READ(VLV_DP_C) & DP_DETECTED || has_port) has_edp &= intel_dp_init(dev, VLV_DP_C, PORT_C); - if (I915_READ(VLV_HDMIC) & SDVO_DETECTED && !has_edp) + if ((I915_READ(VLV_HDMIC) & SDVO_DETECTED || has_port) && !has_edp) intel_hdmi_init(dev, VLV_HDMIC, PORT_C); if (IS_CHERRYVIEW(dev)) { - /* eDP not supported on port D, so don't check VBT */ - if (I915_READ(CHV_HDMID) & SDVO_DETECTED) - intel_hdmi_init(dev, CHV_HDMID, PORT_D); - if (I915_READ(CHV_DP_D) & DP_DETECTED) + /* + * eDP not supported on port D, + * so no need to worry about it + */ + has_port = intel_bios_is_port_present(dev_priv, PORT_D); + if (I915_READ(CHV_DP_D) & DP_DETECTED || has_port) intel_dp_init(dev, CHV_DP_D, PORT_D); + if (I915_READ(CHV_HDMID) & SDVO_DETECTED || has_port) + intel_hdmi_init(dev, CHV_HDMID, PORT_D); } intel_dsi_init(dev); diff --git a/drivers/gpu/drm/i915/intel_hdmi.c b/drivers/gpu/drm/i915/intel_hdmi.c index 4b8ed9f2dabc..dff69fef47e0 100644 --- a/drivers/gpu/drm/i915/intel_hdmi.c +++ b/drivers/gpu/drm/i915/intel_hdmi.c @@ -2030,6 +2030,9 @@ void intel_hdmi_init_connector(struct intel_digital_port *intel_dig_port, enum port port = intel_dig_port->port; uint8_t alternate_ddc_pin; + DRM_DEBUG_KMS("Adding HDMI connector on port %c\n", + port_name(port)); + drm_connector_init(dev, connector, &intel_hdmi_connector_funcs, DRM_MODE_CONNECTOR_HDMIA); drm_connector_helper_add(connector, 
&intel_hdmi_connector_helper_funcs); From c6d7c62c0092fd5d0fca299eeb9d4eb0a46a8a9d Mon Sep 17 00:00:00 2001 From: Tedd Ho-Jeong An Date: Wed, 13 Jul 2016 16:13:23 +0800 Subject: [PATCH 703/813] Bluetooth: Add support for Intel Bluetooth device 8265 [8087:0a2b] [ Upstream commit a0af53b511423cca93900066512379e21586d7dd ] This patch adds support for Intel Bluetooth device 8265 also known as Windstorm Peak (WsP). T: Bus=01 Lev=01 Prnt=01 Port=01 Cnt=02 Dev#= 6 Spd=12 MxCh= 0 D: Ver= 2.00 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=8087 ProdID=0a2b Rev= 0.10 C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=100mA I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) MxPS= 64 Ivl=1ms E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms Signed-off-by: Tedd Ho-Jeong An Signed-off-by: Marcel Holtmann Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/bluetooth/btusb.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 79107597a594..c306b483de60 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -2056,12 +2056,13 @@ static int btusb_setup_intel_new(struct hci_dev *hdev) return -EINVAL; } - /* At the moment only the hardware variant iBT 3.0 (LnP/SfP) is - * supported by this firmware loading method. This check has been - * put in place to ensure correct forward compatibility options - * when newer hardware variants come along. + /* At the moment the iBT 3.0 hardware variants 0x0b (LnP/SfP) + * and 0x0c (WsP) are supported by this firmware loading method. + * + * This check has been put in place to ensure correct forward + * compatibility options when newer hardware variants come along. */ - if (ver->hw_variant != 0x0b) { + if (ver->hw_variant != 0x0b && ver->hw_variant != 0x0c) { BT_ERR("%s: Unsupported Intel hardware variant (%u)", hdev->name, ver->hw_variant); kfree_skb(skb); From cfd6e7fe434a378127e4964fc0b7ccf32ae2baed Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 13 Jul 2016 11:44:54 +0100 Subject: [PATCH 704/813] netfilter: x_tables: check for size overflow [ Upstream commit d157bd761585605b7882935ffb86286919f62ea1 ] Ben Hawkes says: integer overflow in xt_alloc_table_info, which on 32-bit systems can lead to small structure allocation and a copy_from_user based heap corruption. 
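Note (editorial aside): the guard added below relies on unsigned wrap-around being detectable after the fact. A self-contained sketch of the same overflow check in plain C, where alloc_hdr_plus_payload() is an illustrative name rather than the kernel function:

	#include <stdlib.h>

	struct hdr { size_t size; };

	/* "Header plus caller-controlled payload": if the addition wraps past
	 * SIZE_MAX, sz ends up smaller than the header alone, which is cheap
	 * to detect and reject before allocating a too-small object. */
	static void *alloc_hdr_plus_payload(size_t payload)
	{
		size_t sz = sizeof(struct hdr) + payload;

		if (sz < sizeof(struct hdr))
			return NULL;	/* addition wrapped */
		return calloc(1, sz);
	}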
Reported-by: Ben Hawkes Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/netfilter/x_tables.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 25391fb25516..c7b7cecb5bd1 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -897,6 +897,9 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) struct xt_table_info *info = NULL; size_t sz = sizeof(*info) + size; + if (sz < sizeof(*info)) + return NULL; + /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages) return NULL; From 5fb71611925f734f7fe03a45527e14b296fd5167 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Thu, 14 Jul 2016 15:02:06 +0100 Subject: [PATCH 705/813] tipc: fix an infoleak in tipc_nl_compat_link_dump [ Upstream commit 5d2be1422e02ccd697ccfcd45c85b4a26e6178e2 ] link_info.str is a char array of size 60. Memory after the NULL byte is not initialized. Sending the whole object out can cause a leak. Signed-off-by: Kangjie Lu Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/tipc/netlink_compat.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 2ed732bfe94b..f4f27c7c54fb 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -574,7 +574,8 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg, link_info.dest = nla_get_flag(link[TIPC_NLA_LINK_DEST]); link_info.up = htonl(nla_get_flag(link[TIPC_NLA_LINK_UP])); - strcpy(link_info.str, nla_data(link[TIPC_NLA_LINK_NAME])); + nla_strlcpy(link_info.str, nla_data(link[TIPC_NLA_LINK_NAME]), + TIPC_MAX_LINK_NAME); return tipc_add_tlv(msg->rep, TIPC_TLV_LINK_INFO, &link_info, sizeof(link_info)); From 8b2e345ed50bc140e2334e6b8fa9108ee66683cb Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Thu, 14 Jul 2016 15:02:07 +0100 Subject: [PATCH 706/813] tipc: fix nl compat regression for link statistics [ Upstream commit 55e77a3e8297581c919b45adcc4d0815b69afa84 ] Fix incorrect use of nla_strlcpy() where the first NLA_HDRLEN bytes of the link name were left out. This made the output of tipc-config -ls look something like: Link statistics: dcast-link 1:data0-1.1.2:data0 1:data0-1.1.3:data0 Also, for the record, the patch that introduced this regression claims "Sending the whole object out can cause a leak", which isn't very likely, as this is a compat layer where the data we are parsing is generated by us and we know the string to be NULL terminated. But you can of course never be too secure. Fixes: 5d2be1422e02 (tipc: fix an infoleak in tipc_nl_compat_link_dump) Signed-off-by: Richard Alpe Signed-off-by: David S.
Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/tipc/netlink_compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index f4f27c7c54fb..a0c90572d0e5 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -574,7 +574,7 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg, link_info.dest = nla_get_flag(link[TIPC_NLA_LINK_DEST]); link_info.up = htonl(nla_get_flag(link[TIPC_NLA_LINK_UP])); - nla_strlcpy(link_info.str, nla_data(link[TIPC_NLA_LINK_NAME]), + nla_strlcpy(link_info.str, link[TIPC_NLA_LINK_NAME], TIPC_MAX_LINK_NAME); return tipc_add_tlv(msg->rep, TIPC_TLV_LINK_INFO, From 610f1a8d3700e81d534adb3db07267fa8b20411c Mon Sep 17 00:00:00 2001 From: Manoj Kumar Date: Tue, 30 Aug 2016 00:34:54 -0400 Subject: [PATCH 707/813] cxlflash: Fix to escalate LINK_RESET also on port 1 [ Upstream commit a9be294ecb3b9dc82b15625631b153f871181d16 ] The original fix to escalate a 'login timed out' error to a LINK_RESET was only made for one of the two ports on the card. This fix resolves the same issue for the second port (port 1). Signed-off-by: Manoj N. Kumar Acked-by: Matthew R. Ochs Reviewed-by: Uma Krishnan Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/cxlflash/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c index ad8dc8d4d1c2..75ff7bdecca1 100644 --- a/drivers/scsi/cxlflash/main.c +++ b/drivers/scsi/cxlflash/main.c @@ -1141,7 +1141,7 @@ static const struct asyc_intr_info ainfo[] = { {SISL_ASTATUS_FC1_OTHER, "other error", 1, CLR_FC_ERROR | LINK_RESET}, {SISL_ASTATUS_FC1_LOGO, "target initiated LOGO", 1, 0}, {SISL_ASTATUS_FC1_CRC_T, "CRC threshold exceeded", 1, LINK_RESET}, - {SISL_ASTATUS_FC1_LOGI_R, "login timed out, retrying", 1, 0}, + {SISL_ASTATUS_FC1_LOGI_R, "login timed out, retrying", 1, LINK_RESET}, {SISL_ASTATUS_FC1_LOGI_F, "login failed", 1, CLR_FC_ERROR}, {SISL_ASTATUS_FC1_LOGI_S, "login succeeded", 1, SCAN_HOST}, {SISL_ASTATUS_FC1_LINK_DN, "link down", 1, 0}, From f88503578d823dfe503c9c6b80baf3864915f6a5 Mon Sep 17 00:00:00 2001 From: "Matthew R. Ochs" Date: Tue, 30 Aug 2016 00:34:55 -0400 Subject: [PATCH 708/813] cxlflash: Fix to avoid virtual LUN failover failure [ Upstream commit d5e26bb1d812ba74f29b6bcbc88c3dbfb3eed824 ] Applications which use virtual LUNs that are backed by a physical LUN over both adapter ports may experience an I/O failure in the event of a link loss (e.g. cable pull). Virtual LUNs may be accessed through one or both ports of the adapter. This access is encoded in the translation entries that comprise the virtual LUN and is used by the AFU for load-balancing I/O and handling failover scenarios. In a link loss scenario, even though the AFU is able to maintain connectivity to the LUN, it is up to the application to retry the failed I/O. When applications are unaware of the virtual LUN's underlying topology, they are unable to make a sound decision about when to retry an I/O and therefore are forced to make their reaction to a failed I/O absolute. The result is either a failure to retry I/O or increased latency for scenarios where a retry is pointless. To remedy this scenario, provide feedback to the application at virtual LUN creation as to which ports the LUN may be accessed through. LUNs spanning both ports are candidates for a retry in the presence of an I/O failure.
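Note (editorial aside): from user space, the return flag introduced below turns retry policy into a simple capability test. A hypothetical application-side check (should_retry_io() is an illustrative name, not part of the driver's API) could look like:

	#include <stdbool.h>
	#include <stdint.h>

	/* Mirrors the flag added to include/uapi/scsi/cxlflash_ioctl.h below. */
	#define DK_CXLFLASH_ALL_PORTS_ACTIVE 0x0000000000000001ULL

	/* Retry a failed I/O only when virtual LUN creation reported that the
	 * LUN is reachable through both adapter ports, so a single link loss
	 * need not be fatal. */
	static bool should_retry_io(uint64_t create_return_flags)
	{
		return create_return_flags & DK_CXLFLASH_ALL_PORTS_ACTIVE;
	}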
Signed-off-by: Matthew R. Ochs Acked-by: Manoj Kumar Reviewed-by: Uma Krishnan Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/cxlflash/vlun.c | 2 ++ include/uapi/scsi/cxlflash_ioctl.h | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/drivers/scsi/cxlflash/vlun.c b/drivers/scsi/cxlflash/vlun.c index a53f583e2d7b..50f8e9300770 100644 --- a/drivers/scsi/cxlflash/vlun.c +++ b/drivers/scsi/cxlflash/vlun.c @@ -1008,6 +1008,8 @@ int cxlflash_disk_virtual_open(struct scsi_device *sdev, void *arg) virt->last_lba = last_lba; virt->rsrc_handle = rsrc_handle; + if (lli->port_sel == BOTH_PORTS) + virt->hdr.return_flags |= DK_CXLFLASH_ALL_PORTS_ACTIVE; out: if (likely(ctxi)) put_context(ctxi); diff --git a/include/uapi/scsi/cxlflash_ioctl.h b/include/uapi/scsi/cxlflash_ioctl.h index 831351b2e660..2302f3ce5f86 100644 --- a/include/uapi/scsi/cxlflash_ioctl.h +++ b/include/uapi/scsi/cxlflash_ioctl.h @@ -30,6 +30,16 @@ struct dk_cxlflash_hdr { __u64 return_flags; /* Returned flags */ }; +/* + * Return flag definitions available to all ioctls + * + * Similar to the input flags, these are grown from the bottom-up with the + * intention that ioctl-specific return flag definitions would grow from the + * top-down, allowing the two sets to co-exist. While not required/enforced + * at this time, this provides future flexibility. + */ +#define DK_CXLFLASH_ALL_PORTS_ACTIVE 0x0000000000000001ULL + /* * Notes: * ----- From 4259821921698f26e4a2c67c72f00db4e48833b0 Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Tue, 30 Aug 2016 00:34:57 -0400 Subject: [PATCH 709/813] crypto: nx-842 - Mask XERS0 bit in return value [ Upstream commit 6333ed8f26cf77311088d2e2b7cf16d8480bcbb2 ] The NX842 coprocessor sets the 3rd bit in the CR register from XER[S0], which has nothing to do with the NX request. Since this bit can be set along with other valid return statuses, mask this bit. One of the other bits (INITIATED, BUSY or REJECTED) will be returned for any given NX request. Signed-off-by: Haren Myneni Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/icswx.h | 1 + drivers/crypto/nx/nx-842-powernv.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/icswx.h b/arch/powerpc/include/asm/icswx.h index 9f8402b35115..27e588f6c72e 100644 --- a/arch/powerpc/include/asm/icswx.h +++ b/arch/powerpc/include/asm/icswx.h @@ -164,6 +164,7 @@ struct coprocessor_request_block { #define ICSWX_INITIATED (0x8) #define ICSWX_BUSY (0x4) #define ICSWX_REJECTED (0x2) +#define ICSWX_XERS0 (0x1) /* undefined or set from XERSO. */ static inline int icswx(__be32 ccw, struct coprocessor_request_block *crb) { diff --git a/drivers/crypto/nx/nx-842-powernv.c b/drivers/crypto/nx/nx-842-powernv.c index 9ef51fafdbff..6e105e87b8ff 100644 --- a/drivers/crypto/nx/nx-842-powernv.c +++ b/drivers/crypto/nx/nx-842-powernv.c @@ -442,6 +442,14 @@ static int nx842_powernv_function(const unsigned char *in, unsigned int inlen, (unsigned int)ccw, (unsigned int)be32_to_cpu(crb->ccw)); + /* + * NX842 coprocessor sets 3rd bit in CR register with XER[S0]. + * XER[S0] is the integer summary overflow bit which is nothing + * to do NX. Since this bit can be set with other return values, + * mask this bit.
+ */ + ret &= ~ICSWX_XERS0; + switch (ret) { case ICSWX_INITIATED: ret = wait_for_csb(wmem, csb); @@ -454,10 +462,6 @@ static int nx842_powernv_function(const unsigned char *in, unsigned int inlen, pr_err_ratelimited("ICSWX rejected\n"); ret = -EPROTO; break; - default: - pr_err_ratelimited("Invalid ICSWX return code %x\n", ret); - ret = -EPROTO; - break; } if (!ret) From dfe2042d96065f044a794f684e9f7976a4ca6e24 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Aug 2016 00:34:58 -0400 Subject: [PATCH 710/813] udp: properly support MSG_PEEK with truncated buffers [ Upstream commit 197c949e7798fbf28cfadc69d9ca0c2abbf93191 ] Backport of this upstream commit into stable kernels : 89c22d8c3b27 ("net: Fix skb csum races when peeking") exposed a bug in udp stack vs MSG_PEEK support, when user provides a buffer smaller than skb payload. In this case, skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); returns -EFAULT. This bug does not happen in upstream kernels since Al Viro did a great job to replace this into : skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr), msg); This variant is safe vs short buffers. For the time being, instead reverting Herbert Xu patch and add back skb->ip_summed invalid changes, simply store the result of udp_lib_checksum_complete() so that we avoid computing the checksum a second time, and avoid the problematic skb_copy_and_csum_datagram_iovec() call. This patch can be applied on recent kernels as it avoids a double checksumming, then backported to stable kernels as a bug fix. Signed-off-by: Eric Dumazet Acked-by: Herbert Xu Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/ipv4/udp.c | 6 ++++-- net/ipv6/udp.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 44e1632370dd..0b1ea5abcc04 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1275,6 +1275,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int peeked, off = 0; int err; int is_udplite = IS_UDPLITE(sk); + bool checksum_valid = false; bool slow; if (flags & MSG_ERRQUEUE) @@ -1300,11 +1301,12 @@ try_again: */ if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { - if (udp_lib_checksum_complete(skb)) + checksum_valid = !udp_lib_checksum_complete(skb); + if (!checksum_valid) goto csum_copy_err; } - if (skb_csum_unnecessary(skb)) + if (checksum_valid || skb_csum_unnecessary(skb)) err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), msg, copied); else { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 275af43306f9..e6092bd72ee2 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -402,6 +402,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int peeked, off = 0; int err; int is_udplite = IS_UDPLITE(sk); + bool checksum_valid = false; int is_udp4; bool slow; @@ -433,11 +434,12 @@ try_again: */ if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { - if (udp_lib_checksum_complete(skb)) + checksum_valid = !udp_lib_checksum_complete(skb); + if (!checksum_valid) goto csum_copy_err; } - if (skb_csum_unnecessary(skb)) + if (checksum_valid || skb_csum_unnecessary(skb)) err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), msg, copied); else { From 917f84b8df10b6959f0fb8e5019cdffb670c0362 Mon Sep 17 00:00:00 2001 From: Carol L Soto Date: Tue, 30 Aug 2016 00:34:59 -0400 Subject: [PATCH 711/813] IB/IPoIB: Do not set skb truesize since using one linearskb [ Upstream commit bb6a777369449d15a4a890306d2f925cae720e1c ] We are 
seeing this warning at net/core/skbuff.c:4174. Before commit a44878d10063 ("IB/ipoib: Use one linear skb in RX flow"), skb truesize was not being set when ipoib was using just one skb. Removing this line avoids the warning when running TCP tests like iperf. Fixes: a44878d10063 ("IB/ipoib: Use one linear skb in RX flow") Signed-off-by: Carol L Soto Signed-off-by: Doug Ledford Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 5ea0c14070d1..fa9c42ff1fb0 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -245,8 +245,6 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) skb_reset_mac_header(skb); skb_pull(skb, IPOIB_ENCAP_LEN); - skb->truesize = SKB_TRUESIZE(skb->len); - ++dev->stats.rx_packets; dev->stats.rx_bytes += skb->len; From 6a90aa44065afb34ad4f37676b734c0e3eeafe6a Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Tue, 30 Aug 2016 00:35:00 -0400 Subject: [PATCH 712/813] fs: Check for invalid i_uid in may_follow_link() [ Upstream commit 2d7f9e2ad35e4e7a3086231f19bfab33c6a8a64a ] Filesystem uids which don't map into a user namespace may result in inode->i_uid being INVALID_UID. A symlink and its parent could have different owners in the filesystem yet both get mapped to INVALID_UID, which may result in a symlink being followed when this would not otherwise have been permitted with protected symlinks enabled. Signed-off-by: Seth Forshee Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/namei.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index 209ca7737cb2..0b0acba72a71 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -887,6 +887,7 @@ static inline int may_follow_link(struct nameidata *nd) { const struct inode *inode; const struct inode *parent; + kuid_t puid; if (!sysctl_protected_symlinks) return 0; @@ -902,7 +903,8 @@ static inline int may_follow_link(struct nameidata *nd) return 0; /* Allowed if parent directory and link owner match. */ - if (uid_eq(parent->i_uid, inode->i_uid)) + puid = parent->i_uid; + if (uid_valid(puid) && uid_eq(puid, inode->i_uid)) return 0; if (nd->flags & LOOKUP_RCU) From 4666aa74a3797d23a2344033df41b2aa0cfe5023 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Tue, 30 Aug 2016 00:35:01 -0400 Subject: [PATCH 713/813] cred: Reject inodes with invalid ids in set_create_file_as() [ Upstream commit 5f65e5ca286126a60f62c8421b77c2018a482b8a ] Using INVALID_[UG]ID for the LSM file creation context doesn't make sense, so return an error if the inode passed to set_create_file_as() has an invalid id. Signed-off-by: Seth Forshee Acked-by: Serge Hallyn Signed-off-by: Eric W.
Biederman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- kernel/cred.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/cred.c b/kernel/cred.c index 71179a09c1d6..ff8606f77d90 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -689,6 +689,8 @@ EXPORT_SYMBOL(set_security_override_from_ctx); */ int set_create_files_as(struct cred *new, struct inode *inode) { + if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) + return -EINVAL; new->fsuid = inode->i_uid; new->fsgid = inode->i_gid; return security_kernel_create_files_as(new, inode); From b54698f045e805b577e14285d047ecfae1e706be Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 30 Aug 2016 00:35:03 -0400 Subject: [PATCH 714/813] drm/amdgpu/cz: enable/disable vce dpm even if vce pg is disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit b3dae7828399ef316e3fabf7e82c6415cb03a02e ] I missed this when cleaning up the vce pg handling. Reviewed-by: Christian König Reviewed-by: Rex Zhu Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/cz_dpm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/cz_dpm.c b/drivers/gpu/drm/amd/amdgpu/cz_dpm.c index 8035d4d6a4f5..653917a3bcc2 100644 --- a/drivers/gpu/drm/amd/amdgpu/cz_dpm.c +++ b/drivers/gpu/drm/amd/amdgpu/cz_dpm.c @@ -1955,10 +1955,8 @@ static void cz_dpm_powergate_vce(struct amdgpu_device *adev, bool gate) } } else { /*pi->caps_vce_pg*/ cz_update_vce_dpm(adev); - cz_enable_vce_dpm(adev, true); + cz_enable_vce_dpm(adev, !gate); } - - return; } const struct amd_ip_funcs cz_dpm_ip_funcs = { From 61610c63fa1025468c82be975e7d574e847d00de Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 30 Aug 2016 00:35:04 -0400 Subject: [PATCH 715/813] netfilter: x_tables: check for size overflow [ Upstream commit d157bd761585605b7882935ffb86286919f62ea1 ] Ben Hawkes says: integer overflow in xt_alloc_table_info, which on 32-bit systems can lead to small structure allocation and a copy_from_user based heap corruption. Reported-by: Ben Hawkes Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/netfilter/x_tables.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index c7b7cecb5bd1..2fc6ca9d1286 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -897,6 +897,9 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) struct xt_table_info *info = NULL; size_t sz = sizeof(*info) + size; + if (sz < sizeof(*info)) + return NULL; + if (sz < sizeof(*info)) + return NULL; From d4009e4b6e309e222c5245056f4d6e1a4da88026 Mon Sep 17 00:00:00 2001 From: "Manoj N. Kumar" Date: Tue, 30 Aug 2016 00:35:05 -0400 Subject: [PATCH 716/813] cxlflash: Move to exponential back-off when cmd_room is not available [ Upstream commit ea76543127da32dec28af0a13ea1b06625fc085e ] While profiling the cxlflash_queuecommand() path under a heavy load it was found that the number of retries to find cmd_room was fairly high. There are two problems with the current back-off: a) It starts with a udelay of 0 b) It backs off linearly Several approaches were tried (higher multiples 10*n and 100*n, as well as n^2 and 2^n), and the exponential back-off (2^n) approach had the least overall cost. Cost here being defined as the overall time spent waiting.
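Note (editorial aside): the exponential strategy adopted by the fix described next can be sketched in plain C; usleep() stands in for the kernel's udelay(), and the function and callback names are illustrative only.

	#include <unistd.h>

	#define MC_ROOM_RETRY_CNT 10

	/* Poll for command room, waiting 1, 2, 4, ... microseconds between
	 * attempts (exponential back-off) instead of 0, 1, 2, ... (linear). */
	static int wait_for_room(int (*room_available)(void))
	{
		int nretry = 0;

		do {
			if (room_available())
				return 0;
			usleep(1 << nretry);
		} while (nretry++ < MC_ROOM_RETRY_CNT);

		return -1;	/* gave up, caller escalates */
	}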
The fix is to change the linear back-off to an exponential back-off. This solution also takes care of the problem with the initial delay (starts with 1 usec). Signed-off-by: Manoj N. Kumar Acked-by: Matthew R. Ochs Reviewed-by: Johannes Thumshirn Signed-off-by: Uma Krishnan Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/cxlflash/main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c index 75ff7bdecca1..c86847c68448 100644 --- a/drivers/scsi/cxlflash/main.c +++ b/drivers/scsi/cxlflash/main.c @@ -289,7 +289,7 @@ static void context_reset(struct afu_cmd *cmd) atomic64_set(&afu->room, room); if (room) goto write_rrin; - udelay(nretry); + udelay(1 << nretry); } while (nretry++ < MC_ROOM_RETRY_CNT); pr_err("%s: no cmd_room to send reset\n", __func__); @@ -303,7 +303,7 @@ write_rrin: if (rrin != 0x1) break; /* Double delay each time */ - udelay(2 << nretry); + udelay(1 << nretry); } while (nretry++ < MC_ROOM_RETRY_CNT); } @@ -338,7 +338,7 @@ retry: atomic64_set(&afu->room, room); if (room) goto write_ioarrin; - udelay(nretry); + udelay(1 << nretry); } while (nretry++ < MC_ROOM_RETRY_CNT); dev_err(dev, "%s: no cmd_room to send 0x%X\n", @@ -352,7 +352,7 @@ retry: * afu->room. */ if (nretry++ < MC_ROOM_RETRY_CNT) { - udelay(nretry); + udelay(1 << nretry); goto retry; } From 45a063156fc5822f3f72bee0bc986de63e7ed73b Mon Sep 17 00:00:00 2001 From: Jake Oshins Date: Tue, 30 Aug 2016 00:35:06 -0400 Subject: [PATCH 717/813] drivers:hv: Lock access to hyperv_mmio resource tree [ Upstream commit e16dad6bfe1437aaee565f875a6713ca7ce81bdf ] In existing code, this tree of resources is created in single-threaded code and never modified after it is created, and thus needs no locking. This patch introduces a semaphore for tree access, as other patches in this series introduce run-time modifications of this resource tree which can happen on multiple threads. Signed-off-by: Jake Oshins Signed-off-by: K. Y. 
Srinivasan Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/hv/vmbus_drv.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 55952d1ed336..509ed9731630 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -105,6 +105,7 @@ static struct notifier_block hyperv_panic_block = { }; struct resource *hyperv_mmio; +DEFINE_SEMAPHORE(hyperv_mmio_lock); static int vmbus_exists(void) { @@ -1140,7 +1141,10 @@ int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj, resource_size_t range_min, range_max, start, local_min, local_max; const char *dev_n = dev_name(&device_obj->device); u32 fb_end = screen_info.lfb_base + (screen_info.lfb_size << 1); - int i; + int i, retval; + + retval = -ENXIO; + down(&hyperv_mmio_lock); for (iter = hyperv_mmio; iter; iter = iter->sibling) { if ((iter->start >= max) || (iter->end <= min)) @@ -1177,13 +1181,17 @@ int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj, for (; start + size - 1 <= local_max; start += align) { *new = request_mem_region_exclusive(start, size, dev_n); - if (*new) - return 0; + if (*new) { + retval = 0; + goto exit; + } } } } - return -ENXIO; +exit: + up(&hyperv_mmio_lock); + return retval; } EXPORT_SYMBOL_GPL(vmbus_allocate_mmio); From 5afbd223e60a130f66bddf7598165ebe2b51f8db Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 30 Aug 2016 00:35:07 -0400 Subject: [PATCH 718/813] KEYS: Fix ASN.1 indefinite length object parsing [ Upstream commit 23c8a812dc3c621009e4f0e5342aa4e2ede1ceaa ] This fixes CVE-2016-0758. In the ASN.1 decoder, when the length field of an ASN.1 value is extracted, it isn't validated against the remaining amount of data before being added to the cursor. With a sufficiently large size indicated, the check: datalen - dp < 2 may then fail due to integer overflow. Fix this by checking the length indicated against the amount of remaining data in both places a definite length is determined. Whilst we're at it, make the following changes: (1) Check the maximum size of extended length does not exceed the capacity of the variable it's being stored in (len) rather than the type that variable is assumed to be (size_t). (2) Compare the EOC tag to the symbolic constant ASN1_EOC rather than the integer 0. (3) To reduce confusion, move the initialisation of len outside of: for (len = 0; n > 0; n--) { since it doesn't have anything to do with the loop counter n. Signed-off-by: David Howells Reviewed-by: Mimi Zohar Acked-by: David Woodhouse Acked-by: Peter Jones Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- lib/asn1_decoder.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c index 2b3f46c049d4..554522934c44 100644 --- a/lib/asn1_decoder.c +++ b/lib/asn1_decoder.c @@ -74,7 +74,7 @@ next_tag: /* Extract a tag from the data */ tag = data[dp++]; - if (tag == 0) { + if (tag == ASN1_EOC) { /* It appears to be an EOC. 
*/ if (data[dp++] != 0) goto invalid_eoc; @@ -96,10 +96,8 @@ next_tag: /* Extract the length */ len = data[dp++]; - if (len <= 0x7f) { - dp += len; - goto next_tag; - } + if (len <= 0x7f) + goto check_length; if (unlikely(len == ASN1_INDEFINITE_LENGTH)) { /* Indefinite length */ @@ -110,14 +108,18 @@ next_tag: } n = len - 0x80; - if (unlikely(n > sizeof(size_t) - 1)) + if (unlikely(n > sizeof(len) - 1)) goto length_too_long; if (unlikely(n > datalen - dp)) goto data_overrun_error; - for (len = 0; n > 0; n--) { + len = 0; + for (; n > 0; n--) { len <<= 8; len |= data[dp++]; } +check_length: + if (len > datalen - dp) + goto data_overrun_error; dp += len; goto next_tag; From ad7c1399b7d0c6788b8f5fdb5c274110f3ce6017 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Thu, 2 Jun 2016 23:43:21 -0500 Subject: [PATCH 719/813] kernel: Add noaudit variant of ns_capable() commit 98f368e9e2630a3ce3e80fb10fb2e02038cf9578 upstream. When checking the current cred for a capability in a specific user namespace, it isn't always desirable to have the LSMs audit the check. This patch adds a noaudit variant of ns_capable() for when those situations arise. The common logic between ns_capable() and the new ns_capable_noaudit() is moved into a single, shared function to keep duplicated code to a minimum and ease maintainability. Signed-off-by: Tyler Hicks Acked-by: Serge E. Hallyn Signed-off-by: James Morris Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- include/linux/capability.h | 5 +++++ kernel/capability.c | 46 +++++++++++++++++++++++++++++--------- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/include/linux/capability.h b/include/linux/capability.h index af9f0b9e80e6..5f8249d378a2 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -214,6 +214,7 @@ extern bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); +extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); #else static inline bool has_capability(struct task_struct *t, int cap) { @@ -241,6 +242,10 @@ static inline bool ns_capable(struct user_namespace *ns, int cap) { return true; } +static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap) +{ + return true; +} #endif /* CONFIG_MULTIUSER */ extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); diff --git a/kernel/capability.c b/kernel/capability.c index 45432b54d5c6..00411c82dac5 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -361,6 +361,24 @@ bool has_capability_noaudit(struct task_struct *t, int cap) return has_ns_capability_noaudit(t, &init_user_ns, cap); } +static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) +{ + int capable; + + if (unlikely(!cap_valid(cap))) { + pr_crit("capable() called with invalid cap=%u\n", cap); + BUG(); + } + + capable = audit ? 
security_capable(current_cred(), ns, cap) : + security_capable_noaudit(current_cred(), ns, cap); + if (capable == 0) { + current->flags |= PF_SUPERPRIV; + return true; + } + return false; +} + /** * ns_capable - Determine if the current task has a superior capability in effect * @ns: The usernamespace we want the capability in @@ -374,19 +392,27 @@ bool has_capability_noaudit(struct task_struct *t, int cap) */ bool ns_capable(struct user_namespace *ns, int cap) { - if (unlikely(!cap_valid(cap))) { - pr_crit("capable() called with invalid cap=%u\n", cap); - BUG(); - } - - if (security_capable(current_cred(), ns, cap) == 0) { - current->flags |= PF_SUPERPRIV; - return true; - } - return false; + return ns_capable_common(ns, cap, true); } EXPORT_SYMBOL(ns_capable); +/** + * ns_capable_noaudit - Determine if the current task has a superior capability + * (unaudited) in effect + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +bool ns_capable_noaudit(struct user_namespace *ns, int cap) +{ + return ns_capable_common(ns, cap, false); +} +EXPORT_SYMBOL(ns_capable_noaudit); /** * capable - Determine if the current task has a superior capability in effect From 6af4737361aa55ac90ecb9b41915cd8fd5cade77 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Thu, 2 Jun 2016 23:43:22 -0500 Subject: [PATCH 720/813] net: Use ns_capable_noaudit() when determining net sysctl permissions commit d6e0d306449bcb5fa3c80e7a3edf11d45abf9ae9 upstream. The capability check should not be audited since it is only being used to determine the inode permissions. A failed check does not indicate a violation of security policy but, when an LSM is enabled, a denial audit message was being generated. The denial audit message caused confusion for some application authors because root-running Go applications always triggered the denial. To prevent this confusion, the capability check in net_ctl_permissions() is switched to the noaudit variant. BugLink: https://launchpad.net/bugs/1465724 Signed-off-by: Tyler Hicks Acked-by: Serge E. Hallyn Signed-off-by: James Morris Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/sysctl_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sysctl_net.c b/net/sysctl_net.c index ed98c1fc3de1..46a71c701e7c 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -46,7 +46,7 @@ static int net_ctl_permissions(struct ctl_table_header *head, kgid_t root_gid = make_kgid(net->user_ns, 0); /* Allow network administrator to have same access as root. */ - if (ns_capable(net->user_ns, CAP_NET_ADMIN) || + if (ns_capable_noaudit(net->user_ns, CAP_NET_ADMIN) || uid_eq(root_uid, current_euid())) { int mode = (table->mode >> 6) & 7; return (mode << 6) | (mode << 3) | mode; From dfa0a22733804636fb8ac7963bd24e7ce6dbaa45 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 1 Aug 2016 00:51:02 -0400 Subject: [PATCH 721/813] ext4: validate that metadata blocks do not overlap superblock commit 829fa70dddadf9dd041d62b82cd7cea63943899d upstream. A number of fuzzing failures seem to be caused by allocation bitmaps or other metadata blocks being pointed at the superblock. 
This can cause kernel BUG or WARNings once the superblock is overwritten, so validate the group descriptor blocks to make sure this doesn't happen. Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c542ebcf7a92..60707c9ece36 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2093,6 +2093,7 @@ void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, /* Called at mount-time, super-block is locked */ static int ext4_check_descriptors(struct super_block *sb, + ext4_fsblk_t sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -2123,6 +2124,11 @@ static int ext4_check_descriptors(struct super_block *sb, grp = i; block_bitmap = ext4_block_bitmap(sb, gdp); + if (block_bitmap == sb_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Block bitmap for group %u overlaps " + "superblock", i); + } if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u not in group " @@ -2130,6 +2136,11 @@ static int ext4_check_descriptors(struct super_block *sb, return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); + if (inode_bitmap == sb_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode bitmap for group %u overlaps " + "superblock", i); + } if (inode_bitmap < first_block || inode_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u not in group " @@ -2137,6 +2148,11 @@ static int ext4_check_descriptors(struct super_block *sb, return 0; } inode_table = ext4_inode_table(sb, gdp); + if (inode_table == sb_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode table for group %u overlaps " + "superblock", i); + } if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " @@ -3640,7 +3656,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount2; } } - if (!ext4_check_descriptors(sb, &first_not_zeroed)) { + if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); ret = -EFSCORRUPTED; goto failed_mount2; From f2c06c7321aa080b18f36090cc1c3b2fa03a5cc8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 11 Aug 2016 11:50:30 -0400 Subject: [PATCH 722/813] ext4: fix xattr shifting when expanding inodes commit d0141191a20289f8955c1e03dad08e42e6f71ca9 upstream. The code in ext4_expand_extra_isize_ea() treated new_extra_isize argument sometimes as the desired target i_extra_isize and sometimes as the amount by which we need to grow current i_extra_isize. These happen to coincide when i_extra_isize is 0 which used to be the common case and so nobody noticed this until recently when we added i_projid to the inode and so i_extra_isize now needs to grow from 28 to 32 bytes. The result of these bugs was that we sometimes unnecessarily decided to move xattrs out of inode even if there was enough space and we often ended up corrupting in-inode xattrs because arguments to ext4_xattr_shift_entries() were just wrong. This could demonstrate itself as BUG_ON in ext4_xattr_shift_entries() triggering. Fix the problem by introducing new isize_diff variable and use it where appropriate. 
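The target-versus-delta distinction is easy to state in code; this user-space sketch (hypothetical function and values) shows why the two notions only coincide when i_extra_isize starts at 0:

    #include <assert.h>

    /* how much extra space is actually needed inside the inode */
    static int space_needed(int i_extra_isize, int new_extra_isize)
    {
        return new_extra_isize - i_extra_isize; /* the delta, not the target */
    }

    int main(void)
    {
        /* with i_extra_isize == 0 the delta equals the target ... */
        assert(space_needed(0, 32) == 32);
        /* ... but growing 28 -> 32 needs only 4 bytes, not 32 */
        assert(space_needed(28, 32) == 4);
        return 0;
    }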
Reported-by: Dave Chinner Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/xattr.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 6b6b3e751f8c..bf3130e275ee 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1264,11 +1264,13 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, size_t min_offs, free; int total_ino; void *base, *start, *end; - int extra_isize = 0, error = 0, tried_min_extra_isize = 0; + int error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); + int isize_diff; /* How much do we need to grow i_extra_isize */ down_write(&EXT4_I(inode)->xattr_sem); retry: + isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize; if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) { up_write(&EXT4_I(inode)->xattr_sem); return 0; @@ -1289,7 +1291,7 @@ retry: total_ino = sizeof(struct ext4_xattr_ibody_header); free = ext4_xattr_free_space(last, &min_offs, base, &total_ino); - if (free >= new_extra_isize) { + if (free >= isize_diff) { entry = IFIRST(header); ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - new_extra_isize, (void *)raw_inode + @@ -1321,7 +1323,7 @@ retry: end = bh->b_data + bh->b_size; min_offs = end - base; free = ext4_xattr_free_space(first, &min_offs, base, NULL); - if (free < new_extra_isize) { + if (free < isize_diff) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; @@ -1335,7 +1337,7 @@ retry: free = inode->i_sb->s_blocksize; } - while (new_extra_isize > 0) { + while (isize_diff > 0) { size_t offs, size, entry_size; struct ext4_xattr_entry *small_entry = NULL; struct ext4_xattr_info i = { @@ -1366,7 +1368,7 @@ retry: EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + EXT4_XATTR_LEN(last->e_name_len); if (total_size <= free && total_size < min_total_size) { - if (total_size < new_extra_isize) { + if (total_size < isize_diff) { small_entry = last; } else { entry = last; @@ -1423,20 +1425,19 @@ retry: goto cleanup; entry = IFIRST(header); - if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) - shift_bytes = new_extra_isize; + if (entry_size + EXT4_XATTR_SIZE(size) >= isize_diff) + shift_bytes = isize_diff; else shift_bytes = entry_size + size; /* Adjust the offsets and shift the remaining entries ahead */ - ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - - shift_bytes, (void *)raw_inode + - EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes, + ext4_xattr_shift_entries(entry, -shift_bytes, + (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize + shift_bytes, (void *)header, total_ino - entry_size, inode->i_sb->s_blocksize); - extra_isize += shift_bytes; - new_extra_isize -= shift_bytes; - EXT4_I(inode)->i_extra_isize = extra_isize; + isize_diff -= shift_bytes; + EXT4_I(inode)->i_extra_isize += shift_bytes; i.name = b_entry_name; i.value = buffer; From e6abdbf8ac4adc14b6e91fdeb81492c112e9cc1e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 11 Aug 2016 11:58:32 -0400 Subject: [PATCH 723/813] ext4: fix xattr shifting when expanding inodes part 2 commit 418c12d08dc64a45107c467ec1ba29b5e69b0715 upstream. When multiple xattrs need to be moved out of inode, we did not properly recompute total size of xattr headers in the inode and the new header position. 
Thus when moving the second and further xattr we asked ext4_xattr_shift_entries() to move too much and from the wrong place, resulting in possible xattr value corruption or general memory corruption. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/xattr.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index bf3130e275ee..5b7c078d6709 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1423,6 +1423,7 @@ retry: error = ext4_xattr_ibody_set(handle, inode, &i, is); if (error) goto cleanup; + total_ino -= entry_size; entry = IFIRST(header); if (entry_size + EXT4_XATTR_SIZE(size) >= isize_diff) @@ -1433,11 +1434,11 @@ retry: ext4_xattr_shift_entries(entry, -shift_bytes, (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize + shift_bytes, - (void *)header, total_ino - entry_size, - inode->i_sb->s_blocksize); + (void *)header, total_ino, inode->i_sb->s_blocksize); isize_diff -= shift_bytes; EXT4_I(inode)->i_extra_isize += shift_bytes; + header = IHDR(inode, raw_inode); i.name = b_entry_name; i.value = buffer; From a79f1f7fcbe478e22cb4ff5f524847ec584177fe Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 11 Aug 2016 12:00:01 -0400 Subject: [PATCH 724/813] ext4: properly align shifted xattrs when expanding inodes commit 443a8c41cd49de66a3fda45b32b9860ea0292b84 upstream. We did not count with the padding of xattr value when computing desired shift of xattrs in the inode when expanding i_extra_isize. As a result we could create unaligned start of inline xattrs. Account for alignment properly. Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/ext4/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 5b7c078d6709..313bac633bd0 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1429,7 +1429,7 @@ retry: if (entry_size + EXT4_XATTR_SIZE(size) >= isize_diff) shift_bytes = isize_diff; else - shift_bytes = entry_size + size; + shift_bytes = entry_size + EXT4_XATTR_SIZE(size); /* Adjust the offsets and shift the remaining entries ahead */ ext4_xattr_shift_entries(entry, -shift_bytes, (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + From 77ae14d00650e7f7b2144019b855acbc6e27003f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 11 Aug 2016 12:38:55 -0400 Subject: [PATCH 725/813] ext4: avoid deadlock when expanding inode size commit 2e81a4eeedcaa66e35f58b81e0755b87057ce392 upstream. When we need to move xattrs into external xattr block, we call ext4_xattr_block_set() from ext4_expand_extra_isize_ea(). That may end up calling ext4_mark_inode_dirty() again which will recurse back into the inode expansion code leading to deadlocks. Protect from recursion using EXT4_STATE_NO_EXPAND inode flag and move its management into ext4_expand_extra_isize_ea() since its manipulation is safe there (due to xattr_sem) from possible races with ext4_xattr_set_handle() which plays with it as well. 
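The recursion guard can be sketched in isolation; the following user-space model (the flag name echoes EXT4_STATE_NO_EXPAND, but the code is purely illustrative) shows how setting the state bit first breaks the cycle:

    #include <stdbool.h>
    #include <stdio.h>

    static bool no_expand; /* models the EXT4_STATE_NO_EXPAND bit */

    static void mark_inode_dirty(void);

    static void expand_inode(void)
    {
        no_expand = true;    /* set before calling code that may recurse */
        mark_inode_dirty();
        no_expand = false;   /* cleared again on the success path */
    }

    static void mark_inode_dirty(void)
    {
        if (no_expand) {     /* re-entry detected: skip expansion */
            puts("expansion suppressed; no recursion");
            return;
        }
        expand_inode();      /* without the guard this would recurse forever */
    }

    int main(void)
    {
        expand_inode();
        return 0;
    }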
Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 2 -- fs/ext4/xattr.c | 19 +++++++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9a5ad0f0d3ed..3eebea131164 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5186,8 +5186,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) sbi->s_want_extra_isize, iloc, handle); if (ret) { - ext4_set_inode_state(inode, - EXT4_STATE_NO_EXPAND); if (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count)) { ext4_warning(inode->i_sb, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 313bac633bd0..02fd78cd2cd5 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1269,12 +1269,14 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, int isize_diff; /* How much do we need to grow i_extra_isize */ down_write(&EXT4_I(inode)->xattr_sem); + /* + * Set EXT4_STATE_NO_EXPAND to avoid recursion when marking inode dirty + */ + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); retry: isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize; - if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) { - up_write(&EXT4_I(inode)->xattr_sem); - return 0; - } + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) + goto out; header = IHDR(inode, raw_inode); entry = IFIRST(header); @@ -1299,8 +1301,7 @@ retry: (void *)header, total_ino, inode->i_sb->s_blocksize); EXT4_I(inode)->i_extra_isize = new_extra_isize; - error = 0; - goto cleanup; + goto out; } /* @@ -1460,6 +1461,8 @@ retry: kfree(bs); } brelse(bh); +out: + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); up_write(&EXT4_I(inode)->xattr_sem); return 0; @@ -1471,6 +1474,10 @@ cleanup: kfree(is); kfree(bs); brelse(bh); + /* + * We deliberately leave EXT4_STATE_NO_EXPAND set here since inode + * size expansion failed. + */ up_write(&EXT4_I(inode)->xattr_sem); return error; } From 1d12bad745acfc02d452a13f32b4087865964c9a Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Sun, 3 Jul 2016 17:51:39 -0400 Subject: [PATCH 726/813] ext4: avoid modifying checksum fields directly during checksum verification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit b47820edd1634dc1208f9212b7ecfb4230610a23 upstream. We temporarily change checksum fields in buffers of some types of metadata into '0' for verifying the checksum values. By doing this without locking the buffer, some metadata's checksums, which are being committed or written back to the storage, could be damaged. In our test, several metadata blocks were found with damaged metadata checksum value during recovery process. When we only verify the checksum value, we have to avoid modifying checksum fields directly. Signed-off-by: Daeho Jeong Signed-off-by: Youngjin Gil Signed-off-by: Theodore Ts'o Reviewed-by: Darrick J.
Wong Cc: Török Edwin Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 38 ++++++++++++++++++++++---------------- fs/ext4/namei.c | 9 ++++----- fs/ext4/super.c | 18 +++++++++--------- fs/ext4/xattr.c | 13 +++++++------ 4 files changed, 42 insertions(+), 36 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3eebea131164..28702932a908 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -51,26 +51,32 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - __u16 csum_lo; - __u16 csum_hi = 0; __u32 csum; + __u16 dummy_csum = 0; + int offset = offsetof(struct ext4_inode, i_checksum_lo); + unsigned int csum_size = sizeof(dummy_csum); - csum_lo = le16_to_cpu(raw->i_checksum_lo); - raw->i_checksum_lo = 0; - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && - EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { - csum_hi = le16_to_cpu(raw->i_checksum_hi); - raw->i_checksum_hi = 0; + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset); + csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); + offset += csum_size; + csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + EXT4_GOOD_OLD_INODE_SIZE - offset); + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + offset = offsetof(struct ext4_inode, i_checksum_hi); + csum = ext4_chksum(sbi, csum, (__u8 *)raw + + EXT4_GOOD_OLD_INODE_SIZE, + offset - EXT4_GOOD_OLD_INODE_SIZE); + if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { + csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, + csum_size); + offset += csum_size; + csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + EXT4_INODE_SIZE(inode->i_sb) - + offset); + } } - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, - EXT4_INODE_SIZE(inode->i_sb)); - - raw->i_checksum_lo = cpu_to_le16(csum_lo); - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && - EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) - raw->i_checksum_hi = cpu_to_le16(csum_hi); - return csum; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 91bf36f22dbf..38eb0c8e43b9 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -420,15 +420,14 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; - __le32 save_csum; int size; + __u32 dummy_csum = 0; + int offset = offsetof(struct dx_tail, dt_checksum); size = count_offset + (count * sizeof(struct dx_entry)); - save_csum = t->dt_checksum; - t->dt_checksum = 0; csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); - csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); - t->dt_checksum = save_csum; + csum = ext4_chksum(sbi, csum, (__u8 *)t, offset); + csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); return cpu_to_le32(csum); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 60707c9ece36..5bab28caa9d4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2030,23 +2030,25 @@ failed: static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { - int offset; + int offset = offsetof(struct ext4_group_desc, bg_checksum); __u16 crc = 0; __le32 le_group = cpu_to_le32(block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ - __le16 save_csum; __u32 csum32; + __u16 dummy_csum = 0; - save_csum = gdp->bg_checksum; - gdp->bg_checksum = 
0; csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); - csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, - sbi->s_desc_size); - gdp->bg_checksum = save_csum; + csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset); + csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum, + sizeof(dummy_csum)); + offset += sizeof(dummy_csum); + if (offset < sbi->s_desc_size) + csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset, + sbi->s_desc_size - offset); crc = csum32 & 0xFFFF; goto out; @@ -2056,8 +2058,6 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, if (!ext4_has_feature_gdt_csum(sb)) return 0; - offset = offsetof(struct ext4_group_desc, bg_checksum); - crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); crc = crc16(crc, (__u8 *)gdp, offset); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 02fd78cd2cd5..263002f0389d 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -123,17 +123,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode, { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; - __le32 save_csum; __le64 dsk_block_nr = cpu_to_le64(block_nr); + __u32 dummy_csum = 0; + int offset = offsetof(struct ext4_xattr_header, h_checksum); - save_csum = hdr->h_checksum; - hdr->h_checksum = 0; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - csum = ext4_chksum(sbi, csum, (__u8 *)hdr, - EXT4_BLOCK_SIZE(inode->i_sb)); + csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset); + csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); + offset += sizeof(dummy_csum); + csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset, + EXT4_BLOCK_SIZE(inode->i_sb) - offset); - hdr->h_checksum = save_csum; return cpu_to_le32(csum); } From d3a6bd7b77ce9c9c19b37226ab0640a8918c4663 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 16 Aug 2016 16:48:36 -0700 Subject: [PATCH 727/813] block: Fix race triggered by blk_set_queue_dying() commit 1b856086813be9371929b6cc62045f9fd470f5a0 upstream. blk_set_queue_dying() can be called while another thread is submitting I/O or changing queue flags, e.g. through dm_stop_queue(). Hence protect the QUEUE_FLAG_DYING flag change with locking. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Mike Snitzer Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/blk-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index f8e64cac981a..4fab5d610805 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -515,7 +515,9 @@ EXPORT_SYMBOL_GPL(blk_queue_bypass_end); void blk_set_queue_dying(struct request_queue *q) { - queue_flag_set_unlocked(QUEUE_FLAG_DYING, q); + spin_lock_irq(q->queue_lock); + queue_flag_set(QUEUE_FLAG_DYING, q); + spin_unlock_irq(q->queue_lock); if (q->mq_ops) blk_mq_wake_waiters(q); From 02989f49787e23855d989647d2e37826f841d844 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 23 Aug 2016 21:49:45 +0800 Subject: [PATCH 728/813] block: make sure a big bio is split into at most 256 bvecs commit 4d70dca4eadf2f95abe389116ac02b8439c2d16c upstream. After arbitrary bio size was introduced, the incoming bio may be very big. We have to split the bio into small bios so that each holds at most BIO_MAX_PAGES bvecs for safety reason, such as bio_clone(). 
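A user-space sketch of that capping logic follows; the segment total and the cap are stand-ins for a real bio and BIO_MAX_PAGES, purely for illustration:

    #include <stdio.h>

    #define MAX_SEGS 256 /* stands in for BIO_MAX_PAGES */

    int main(void)
    {
        int total_segs = 1000; /* hypothetical oversized request */
        int chunk = 0, segs = 0, i;

        for (i = 0; i < total_segs; i++) {
            if (segs == MAX_SEGS) { /* cut before exceeding the cap */
                printf("chunk %d: %d segments\n", chunk++, segs);
                segs = 0;
            }
            segs++;
        }
        printf("chunk %d: %d segments\n", chunk, segs);
        return 0;
    }

Running it splits the 1000 segments into chunks of 256, 256, 256 and 232, which is the shape of split the patch produces.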
This patch fixes the following kernel crash: > [ 172.660142] BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 > [ 172.660229] IP: [] bio_trim+0xf/0x2a > [ 172.660289] PGD 7faf3e067 PUD 7f9279067 PMD 0 > [ 172.660399] Oops: 0000 [#1] SMP > [...] > [ 172.664780] Call Trace: > [ 172.664813] [] ? raid1_make_request+0x2e8/0xad7 [raid1] > [ 172.664846] [] ? blk_queue_split+0x377/0x3d4 > [ 172.664880] [] ? md_make_request+0xf6/0x1e9 [md_mod] > [ 172.664912] [] ? generic_make_request+0xb5/0x155 > [ 172.664947] [] ? prio_io+0x85/0x95 [bcache] > [ 172.664981] [] ? register_cache_set+0x355/0x8d0 [bcache] > [ 172.665016] [] ? register_bcache+0x1006/0x1174 [bcache] The issue can be reproduced by the following steps: - create one raid1 over two virtio-blk - build bcache device over the above raid1 and another cache device and bucket size is set as 2Mbytes - set cache mode as writeback - run random write over ext4 on the bcache device Fixes: 54efd50(block: make generic_make_request handle arbitrarily sized bios) Reported-by: Sebastian Roesner Reported-by: Eric Wheeler Cc: Shaohua Li Acked-by: Kent Overstreet Signed-off-by: Ming Lei Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/blk-merge.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/block/blk-merge.c b/block/blk-merge.c index b966db8f3556..7225511cf0b4 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -92,8 +92,30 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, bool do_split = true; struct bio *new = NULL; const unsigned max_sectors = get_max_io_size(q, bio); + unsigned bvecs = 0; bio_for_each_segment(bv, bio, iter) { + /* + * With arbitrary bio size, the incoming bio may be very + * big. We have to split the bio into small bios so that + * each holds at most BIO_MAX_PAGES bvecs because + * bio_clone() can fail to allocate big bvecs. + * + * It should have been better to apply the limit per + * request queue in which bio_clone() is involved, + * instead of globally. The biggest blocker is the + * bio_clone() in bio bounce. + * + * If bio is splitted by this reason, we should have + * allowed to continue bios merging, but don't do + * that now for making the change simple. + * + * TODO: deal with bio bounce's bio_clone() gracefully + * and convert the global limit into per-queue limit. + */ + if (bvecs++ >= BIO_MAX_PAGES) + goto split; + /* * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. From db8c7fff99ef64e0f2702b5477104b6c3a6c6fee Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 10 Aug 2016 15:43:06 -0400 Subject: [PATCH 729/813] cgroup: reduce read locked section of cgroup_threadgroup_rwsem during fork commit 568ac888215c7fb2fabe8ea739b00ec3c1f5d440 upstream. cgroup_threadgroup_rwsem is acquired in read mode during process exit and fork. It is also grabbed in write mode during __cgroups_proc_write(). I've recently run into a scenario with lots of memory pressure and OOM and I am beginning to see systemd __switch_to+0x1f8/0x350 __schedule+0x30c/0x990 schedule+0x48/0xc0 percpu_down_write+0x114/0x170 __cgroup_procs_write.isra.12+0xb8/0x3c0 cgroup_file_write+0x74/0x1a0 kernfs_fop_write+0x188/0x200 __vfs_write+0x6c/0xe0 vfs_write+0xc0/0x230 SyS_write+0x6c/0x110 system_call+0x38/0xb4 This thread is waiting on the reader of cgroup_threadgroup_rwsem to exit. The reader itself is under memory pressure and has gone into reclaim after fork. 
There are times the reader ends up waiting on oom_lock as well. __switch_to+0x1f8/0x350 __schedule+0x30c/0x990 schedule+0x48/0xc0 jbd2_log_wait_commit+0xd4/0x180 ext4_evict_inode+0x88/0x5c0 evict+0xf8/0x2a0 dispose_list+0x50/0x80 prune_icache_sb+0x6c/0x90 super_cache_scan+0x190/0x210 shrink_slab.part.15+0x22c/0x4c0 shrink_zone+0x288/0x3c0 do_try_to_free_pages+0x1dc/0x590 try_to_free_pages+0xdc/0x260 __alloc_pages_nodemask+0x72c/0xc90 alloc_pages_current+0xb4/0x1a0 page_table_alloc+0xc0/0x170 __pte_alloc+0x58/0x1f0 copy_page_range+0x4ec/0x950 copy_process.isra.5+0x15a0/0x1870 _do_fork+0xa8/0x4b0 ppc_clone+0x8/0xc In the meanwhile, all processes exiting/forking are blocked, almost stalling the system. This patch moves the threadgroup_change_begin from before cgroup_fork() to just before cgroup_canfork(). There is no need to worry about threadgroup changes till the task is actually added to the threadgroup. This avoids having to call reclaim with cgroup_threadgroup_rwsem held. tj: Subject and description edits. Signed-off-by: Balbir Singh Acked-by: Zefan Li Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 1155eac61687..c485cb156772 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1369,7 +1369,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; - threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1521,6 +1520,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; + threadgroup_change_begin(current); /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted the the new process's css_set can be changed @@ -1621,6 +1621,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, bad_fork_cancel_cgroup: cgroup_cancel_fork(p, cgrp_ss_priv); bad_fork_free_pid: + threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_io: @@ -1651,7 +1652,6 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: #endif - threadgroup_change_end(current); delayacct_tsk_free(p); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); From 81e9a969c441d43b1a82b7d27848c0c7e1a5d90d Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Thu, 8 Sep 2016 18:10:23 -0300 Subject: [PATCH 730/813] nvme: Call pci_disable_device on the error path. Commit 5706aca74fe4 ("NVMe: Don't unmap controller registers on reset"), which backported b00a726a9fd8 to the 4.4.y kernel, introduced a regression in which it didn't call pci_disable_device in the error path of nvme_pci_enable.
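As a general illustration of the pairing rule this fix restores (each error label must undo exactly what had succeeded, nothing more), here is a sketch with stand-in functions rather than the real PCI API:

    #include <stdio.h>

    static int enable_device(void)   { puts("enable");  return 0; }
    static void disable_device(void) { puts("disable"); }
    static int map_bars(void)        { puts("map");     return -1; /* simulated failure */ }

    static int probe(void)
    {
        int err;

        err = enable_device();
        if (err)
            return err;

        err = map_bars();
        if (err)
            goto disable;   /* undo exactly what succeeded: the enable */

        return 0;

    disable:
        disable_device();   /* not "release regions" that were never requested */
        return err;
    }

    int main(void)
    {
        return probe() ? 1 : 0;
    }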
Reported-by: Jiri Slaby Embarassed-developer: Gabriel Krisman Bertazi Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Greg Kroah-Hartman --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 289a5df0d44a..c851bc53831c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2725,7 +2725,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) return 0; disable: - pci_release_regions(pdev); + pci_disable_device(pdev); return result; } From 0e324f6d66549b6a98122f2bd8da5ae56b018956 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Tue, 28 Jun 2016 13:01:04 +1000 Subject: [PATCH 731/813] powerpc/tm: Avoid SLB faults in treclaim/trecheckpoint when RI=0 commit 190ce8693c23eae09ba5f303a83bf2fbeb6478b1 upstream. Currently we have 2 segments that are bolted for the kernel linear mapping (ie 0xc000... addresses). This is 0 to 1TB and also the kernel stacks. Anything accessed outside of these regions may need to be faulted in. (In practice machines with TM always have 1T segments) If a machine has < 2TB of memory we never fault on the kernel linear mapping as these two segments cover all physical memory. If a machine has > 2TB of memory, there may be structures outside of these two segments that need to be faulted in. This faulting can occur when running as a guest as the hypervisor may remove any SLB that's not bolted. When we treclaim and trecheckpoint we have a window where we need to run with the userspace GPRs. This means that we no longer have a valid stack pointer in r1. For this window we therefore clear MSR RI to indicate that any exceptions taken at this point won't be able to be handled. This means that we can't take segment misses in this RI=0 window. In this RI=0 region, we currently access the thread_struct for the process being context switched to or from. This thread_struct access may cause a segment fault since it's not guaranteed to be covered by the two bolted segment entries described above. 
We've seen this with a crash when running as a guest with > 2TB of memory on PowerVM: Unrecoverable exception 4100 at c00000000004f138 Oops: Unrecoverable exception, sig: 6 [#1] SMP NR_CPUS=2048 NUMA pSeries CPU: 1280 PID: 7755 Comm: kworker/1280:1 Tainted: G X 4.4.13-46-default #1 task: c000189001df4210 ti: c000189001d5c000 task.ti: c000189001d5c000 NIP: c00000000004f138 LR: 0000000010003a24 CTR: 0000000010001b20 REGS: c000189001d5f730 TRAP: 4100 Tainted: G X (4.4.13-46-default) MSR: 8000000100001031 CR: 24000048 XER: 00000000 CFAR: c00000000004ed18 SOFTE: 0 GPR00: ffffffffc58d7b60 c000189001d5f9b0 00000000100d7d00 000000003a738288 GPR04: 0000000000002781 0000000000000006 0000000000000000 c0000d1f4d889620 GPR08: 000000000000c350 00000000000008ab 00000000000008ab 00000000100d7af0 GPR12: 00000000100d7ae8 00003ffe787e67a0 0000000000000000 0000000000000211 GPR16: 0000000010001b20 0000000000000000 0000000000800000 00003ffe787df110 GPR20: 0000000000000001 00000000100d1e10 0000000000000000 00003ffe787df050 GPR24: 0000000000000003 0000000000010000 0000000000000000 00003fffe79e2e30 GPR28: 00003fffe79e2e68 00000000003d0f00 00003ffe787e67a0 00003ffe787de680 NIP [c00000000004f138] restore_gprs+0xd0/0x16c LR [0000000010003a24] 0x10003a24 Call Trace: [c000189001d5f9b0] [c000189001d5f9f0] 0xc000189001d5f9f0 (unreliable) [c000189001d5fb90] [c00000000001583c] tm_recheckpoint+0x6c/0xa0 [c000189001d5fbd0] [c000000000015c40] __switch_to+0x2c0/0x350 [c000189001d5fc30] [c0000000007e647c] __schedule+0x32c/0x9c0 [c000189001d5fcb0] [c0000000007e6b58] schedule+0x48/0xc0 [c000189001d5fce0] [c0000000000deabc] worker_thread+0x22c/0x5b0 [c000189001d5fd80] [c0000000000e7000] kthread+0x110/0x130 [c000189001d5fe30] [c000000000009538] ret_from_kernel_thread+0x5c/0xa4 Instruction dump: 7cb103a6 7cc0e3a6 7ca222a6 78a58402 38c00800 7cc62838 08860000 7cc000a6 38a00006 78c60022 7cc62838 0b060000 7ccff120 e8270078 e8a70098 ---[ end trace 602126d0a1dedd54 ]--- This fixes this by copying the required data from the thread_struct to the stack before we clear MSR RI. Then once we clear RI, we only access the stack, guaranteeing there's no segment miss. We also tighten the region over which we set RI=0 on the treclaim() path. This may have a slight performance impact since we're adding an mtmsr instruction. Fixes: 090b9284d725 ("powerpc/tm: Clear MSR RI in non-recoverable TM code") Signed-off-by: Michael Neuling Reviewed-by: Cyril Bur Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/tm.S | 61 +++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index bf8f34a58670..b7019b559ddb 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -110,17 +110,11 @@ _GLOBAL(tm_reclaim) std r3, STK_PARAM(R3)(r1) SAVE_NVGPRS(r1) - /* We need to setup MSR for VSX register save instructions. Here we - * also clear the MSR RI since when we do the treclaim, we won't have a - * valid kernel pointer for a while. We clear RI here as it avoids - * adding another mtmsr closer to the treclaim. This makes the region - * maked as non-recoverable wider than it needs to be but it saves on - * inserting another mtmsrd later. - */ + /* We need to setup MSR for VSX register save instructions. 
*/ mfmsr r14 mr r15, r14 ori r15, r15, MSR_FP - li r16, MSR_RI + li r16, 0 ori r16, r16, MSR_EE /* IRQs hard off */ andc r15, r15, r16 oris r15, r15, MSR_VEC@h @@ -176,7 +170,17 @@ dont_backup_fp: 1: tdeqi r6, 0 EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0 - /* The moment we treclaim, ALL of our GPRs will switch + /* Clear MSR RI since we are about to change r1, EE is already off. */ + li r4, 0 + mtmsrd r4, 1 + + /* + * BE CAREFUL HERE: + * At this point we can't take an SLB miss since we have MSR_RI + * off. Load only to/from the stack/paca which are in SLB bolted regions + * until we turn MSR RI back on. + * + * The moment we treclaim, ALL of our GPRs will switch * to user register state. (FPRs, CCR etc. also!) * Use an sprg and a tm_scratch in the PACA to shuffle. */ @@ -197,6 +201,11 @@ dont_backup_fp: /* Store the PPR in r11 and reset to decent value */ std r11, GPR11(r1) /* Temporary stash */ + + /* Reset MSR RI so we can take SLB faults again */ + li r11, MSR_RI + mtmsrd r11, 1 + mfspr r11, SPRN_PPR HMT_MEDIUM @@ -397,11 +406,6 @@ restore_gprs: ld r5, THREAD_TM_DSCR(r3) ld r6, THREAD_TM_PPR(r3) - /* Clear the MSR RI since we are about to change R1. EE is already off - */ - li r4, 0 - mtmsrd r4, 1 - REST_GPR(0, r7) /* GPR0 */ REST_2GPRS(2, r7) /* GPR2-3 */ REST_GPR(4, r7) /* GPR4 */ @@ -439,10 +443,33 @@ restore_gprs: ld r6, _CCR(r7) mtcr r6 - REST_GPR(1, r7) /* GPR1 */ - REST_GPR(5, r7) /* GPR5-7 */ REST_GPR(6, r7) - ld r7, GPR7(r7) + + /* + * Store r1 and r5 on the stack so that we can access them + * after we clear MSR RI. + */ + + REST_GPR(5, r7) + std r5, -8(r1) + ld r5, GPR1(r7) + std r5, -16(r1) + + REST_GPR(7, r7) + + /* Clear MSR RI since we are about to change r1. EE is already off */ + li r5, 0 + mtmsrd r5, 1 + + /* + * BE CAREFUL HERE: + * At this point we can't take an SLB miss since we have MSR_RI + * off. Load only to/from the stack/paca which are in SLB bolted regions + * until we turn MSR RI back on. + */ + + ld r5, -8(r1) + ld r1, -16(r1) /* Commit register state as checkpointed state: */ TRECHKPT From ffd5ce2ad5fd140ddd492ab2064e29e86aaa64ea Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Thu, 2 Jun 2016 04:11:20 -0400 Subject: [PATCH 732/813] rds: fix an infoleak in rds_inc_info_copy commit 4116def2337991b39919f3b448326e21c40e0dbb upstream. The last field "flags" of object "minfo" is not initialized. Copying this object out may leak kernel stack data. Assign 0 to it to avoid leak. Signed-off-by: Kangjie Lu Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller Signed-off-by: Juerg Haefliger Signed-off-by: Greg Kroah-Hartman --- net/rds/recv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/rds/recv.c b/net/rds/recv.c index a00462b0d01d..0514af3ab378 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -545,5 +545,7 @@ void rds_inc_info_copy(struct rds_incoming *inc, minfo.fport = inc->i_hdr.h_dport; } + minfo.flags = 0; + rds_info_copy(iter, &minfo, sizeof(minfo)); } From 2d29d6cec3bc5473efdad3b143404d9e32817c86 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 25 Apr 2016 17:54:28 +0200 Subject: [PATCH 733/813] s390/sclp_ctl: fix potential information leak with /dev/sclp commit 532c34b5fbf1687df63b3fcd5b2846312ac943c6 upstream. The sclp_ctl_ioctl_sccb function uses two copy_from_user calls to retrieve the sclp request from user space. The first copy_from_user fetches the length of the request which is stored in the first two bytes of the request. 
The second copy_from_user gets the complete sclp request, but this copies the length field a second time. A malicious user may have changed the length in the meantime. Reported-by: Pengfei Wang Reviewed-by: Michael Holzheu Signed-off-by: Martin Schwidefsky Signed-off-by: Juerg Haefliger Signed-off-by: Greg Kroah-Hartman --- drivers/s390/char/sclp_ctl.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/s390/char/sclp_ctl.c b/drivers/s390/char/sclp_ctl.c index 648cb86afd42..ea607a4a1bdd 100644 --- a/drivers/s390/char/sclp_ctl.c +++ b/drivers/s390/char/sclp_ctl.c @@ -56,6 +56,7 @@ static int sclp_ctl_ioctl_sccb(void __user *user_area) { struct sclp_ctl_sccb ctl_sccb; struct sccb_header *sccb; + unsigned long copied; int rc; if (copy_from_user(&ctl_sccb, user_area, sizeof(ctl_sccb))) @@ -65,14 +66,15 @@ static int sclp_ctl_ioctl_sccb(void __user *user_area) sccb = (void *) get_zeroed_page(GFP_KERNEL | GFP_DMA); if (!sccb) return -ENOMEM; - if (copy_from_user(sccb, u64_to_uptr(ctl_sccb.sccb), sizeof(*sccb))) { + copied = PAGE_SIZE - + copy_from_user(sccb, u64_to_uptr(ctl_sccb.sccb), PAGE_SIZE); + if (offsetof(struct sccb_header, length) + + sizeof(sccb->length) > copied || sccb->length > copied) { rc = -EFAULT; goto out_free; } - if (sccb->length > PAGE_SIZE || sccb->length < 8) - return -EINVAL; - if (copy_from_user(sccb, u64_to_uptr(ctl_sccb.sccb), sccb->length)) { - rc = -EFAULT; + if (sccb->length < 8) { + rc = -EINVAL; goto out_free; } rc = sclp_sync_request(ctl_sccb.cmdw, sccb); From 314c7e8a09e8f23715e0fcd08e8659a67447c0d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 17 Aug 2016 09:46:42 +0200 Subject: [PATCH 734/813] drm/radeon: fix radeon_move_blit on 32bit systems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 13f479b9df4e2bbf2d16e7e1b02f3f55f70e2455 upstream. This bug seems to have been present for a very long time. Signed-off-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/radeon_ttm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index f342aad79cc6..35310336dd0a 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -263,8 +263,8 @@ static int radeon_move_blit(struct ttm_buffer_object *bo, rdev = radeon_get_rdev(bo->bdev); ridx = radeon_copy_ring_index(rdev); - old_start = old_mem->start << PAGE_SHIFT; - new_start = new_mem->start << PAGE_SHIFT; + old_start = (u64)old_mem->start << PAGE_SHIFT; + new_start = (u64)new_mem->start << PAGE_SHIFT; switch (old_mem->mem_type) { case TTM_PL_VRAM: From b7e99f782e6bd2606adbbce0c90804fb8aa4f5c8 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Sat, 20 Aug 2016 12:22:11 +0200 Subject: [PATCH 735/813] drm: Reject page_flip for !DRIVER_MODESET commit 6f00975c619064a18c23fd3aced325ae165a73b9 upstream. Somehow this one slipped through, which means drivers without modeset support can be oopsed (since those also don't call drm_mode_config_init, which means the crtc lookup will chase an uninitialized idr).
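One patch earlier, the radeon_move_blit change is the classic widen-before-shift fix: a 32-bit value shifted left by PAGE_SHIFT wraps before the result is widened to 64 bits. A self-contained sketch (the page shift and offset values here are hypothetical):

    #include <inttypes.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12

    int main(void)
    {
        uint32_t start = 0x00200000;                      /* page index that exceeds 4 GiB once shifted */
        uint64_t wrong = (uint64_t)(start << PAGE_SHIFT); /* shift done in 32 bits: wraps modulo 2^32 */
        uint64_t right = (uint64_t)start << PAGE_SHIFT;   /* widened first: correct 64-bit address */

        printf("wrong: 0x%" PRIx64 "\n", wrong);          /* prints 0x0 after wraparound */
        printf("right: 0x%" PRIx64 "\n", right);          /* prints 0x200000000 */
        return 0;
    }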
Reported-by: Alexander Potapenko Cc: Alexander Potapenko Signed-off-by: Daniel Vetter Reviewed-by: Chris Wilson Signed-off-by: Dave Airlie Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/drm_crtc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c index dc84003f694e..5e4bb4837bae 100644 --- a/drivers/gpu/drm/drm_crtc.c +++ b/drivers/gpu/drm/drm_crtc.c @@ -5231,6 +5231,9 @@ int drm_mode_page_flip_ioctl(struct drm_device *dev, unsigned long flags; int ret = -EINVAL; + if (!drm_core_check_feature(dev, DRIVER_MODESET)) + return -EINVAL; + if (page_flip->flags & ~DRM_MODE_PAGE_FLIP_FLAGS || page_flip->reserved != 0) return -EINVAL; From 103898dd7701bf65ba35c5337ae82f82834bb0c5 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 22 Aug 2016 15:15:23 -0400 Subject: [PATCH 736/813] drm/msm: fix use of copy_from_user() while holding spinlock commit 89f82cbb0d5c0ab768c8d02914188aa2211cd2e3 upstream. Use instead __copy_from_user_inatomic() and fall back to the slow path, where we drop and re-acquire the lock, in case of fault. Reported-by: Vaishali Thakkar Signed-off-by: Rob Clark Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/msm/msm_gem_submit.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 6d7cd3fe21e7..1847f83b1e33 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -55,6 +55,14 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, return submit; } +static inline unsigned long __must_check +copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) +{ + if (access_ok(VERIFY_READ, from, n)) + return __copy_from_user_inatomic(to, from, n); + return -EFAULT; +} + static int submit_lookup_objects(struct msm_gem_submit *submit, struct drm_msm_gem_submit *args, struct drm_file *file) { @@ -62,6 +70,7 @@ static int submit_lookup_objects(struct msm_gem_submit *submit, int ret = 0; spin_lock(&file->table_lock); + pagefault_disable(); for (i = 0; i < args->nr_bos; i++) { struct drm_msm_gem_submit_bo submit_bo; @@ -70,10 +79,15 @@ static int submit_lookup_objects(struct msm_gem_submit *submit, void __user *userptr = to_user_ptr(args->bos + (i * sizeof(submit_bo))); - ret = copy_from_user(&submit_bo, userptr, sizeof(submit_bo)); - if (ret) { - ret = -EFAULT; - goto out_unlock; + ret = copy_from_user_inatomic(&submit_bo, userptr, sizeof(submit_bo)); + if (unlikely(ret)) { + pagefault_enable(); + spin_unlock(&file->table_lock); + ret = copy_from_user(&submit_bo, userptr, sizeof(submit_bo)); + if (ret) + goto out; + spin_lock(&file->table_lock); + pagefault_disable(); } if (submit_bo.flags & ~MSM_SUBMIT_BO_FLAGS) { @@ -113,9 +127,12 @@ static int submit_lookup_objects(struct msm_gem_submit *submit, } out_unlock: - submit->nr_bos = i; + pagefault_enable(); spin_unlock(&file->table_lock); +out: + submit->nr_bos = i; + return ret; } From 4757f7ed8e9473d03074e3a315158492856a9f56 Mon Sep 17 00:00:00 2001 From: Christoph Huber Date: Mon, 15 Aug 2016 18:59:25 +0200 Subject: [PATCH 737/813] ASoC: atmel_ssc_dai: Don't unconditionally reset SSC on stream startup commit 3e103a65514c2947e53f3171b21255fbde8b60c6 upstream. commit cbaadf0f90d6 ("ASoC: atmel_ssc_dai: refactor the startup and shutdown") refactored code such that the SSC is reset on every startup; this breaks duplex audio (e.g.
first start audio playback, then start record, causing the playback to stop/hang) Fixes: cbaadf0f90d6 (ASoC: atmel_ssc_dai: refactor the startup and shutdown) Signed-off-by: Christoph Huber Signed-off-by: Peter Meerwald-Stadler Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/atmel/atmel_ssc_dai.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sound/soc/atmel/atmel_ssc_dai.c b/sound/soc/atmel/atmel_ssc_dai.c index ba8def5665c4..6726143c7fc5 100644 --- a/sound/soc/atmel/atmel_ssc_dai.c +++ b/sound/soc/atmel/atmel_ssc_dai.c @@ -298,8 +298,9 @@ static int atmel_ssc_startup(struct snd_pcm_substream *substream, clk_enable(ssc_p->ssc->clk); ssc_p->mck_rate = clk_get_rate(ssc_p->ssc->clk); - /* Reset the SSC to keep it at a clean status */ - ssc_writel(ssc_p->ssc->regs, CR, SSC_BIT(CR_SWRST)); + /* Reset the SSC unless initialized to keep it in a clean state */ + if (!ssc_p->initialized) + ssc_writel(ssc_p->ssc->regs, CR, SSC_BIT(CR_SWRST)); if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { dir = 0; From f5edb04b45076f498a96e5a6f5b1c37e943b501e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 26 Aug 2016 16:01:30 +1000 Subject: [PATCH 738/813] xfs: fix superblock inprogress check commit f3d7ebdeb2c297bd26272384e955033493ca291c upstream. From inspection, the superblock sb_inprogress check is done in the verifier and triggered only for the primary superblock via a "bp->b_bn == XFS_SB_DADDR" check. Unfortunately, the primary superblock is an uncached buffer, and hence it is configured by xfs_buf_read_uncached() with: bp->b_bn = XFS_BUF_DADDR_NULL; /* always null for uncached buffers */ And so this check never triggers. Fix it. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_sb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 8a53eaa349f4..7088be6afb3c 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -581,7 +581,8 @@ xfs_sb_verify( * Only check the in progress field for the primary superblock as * mkfs.xfs doesn't clear it from secondary superblocks. */ - return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR, + return xfs_mount_validate_sb(mp, &sb, + bp->b_maps[0].bm_bn == XFS_SB_DADDR, check_version); } From 42ef9015e0adec4b5cf8142846a596a7adb8cadc Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 23 Aug 2016 16:08:22 -0700 Subject: [PATCH 739/813] timekeeping: Cap array access in timekeeping_debug commit a4f8f6667f099036c88f231dcad4cf233652c824 upstream. It was reported that hibernation could fail on the 2nd attempt, where the system hangs at hibernate() -> syscore_resume() -> i8237A_resume() -> claim_dma_lock(), because the lock has already been taken. However there is actually no other process would like to grab this lock on that problematic platform. Further investigation showed that the problem is triggered by setting /sys/power/pm_trace to 1 before the 1st hibernation. Since once pm_trace is enabled, the rtc becomes unmeaningful after suspend, and meanwhile some BIOSes would like to adjust the 'invalid' RTC (e.g, smaller than 1970) to the release date of that motherboard during POST stage, thus after resumed, it may seem that the system had a significant long sleep time which is a completely meaningless value. 
Then in timekeeping_resume -> tk_debug_account_sleep_time, if bit 31 of the sleep time happened to be set to 1, fls() returns 32 and we add 1 to sleep_time_bin[32], which causes an out of bounds array access and therefore memory being overwritten. As depicted by System.map: 0xffffffff81c9d080 b sleep_time_bin 0xffffffff81c9d100 B dma_spin_lock the dma_spin_lock.val is set to 1, which caused this problem. This patch adds a sanity check in tk_debug_account_sleep_time() to ensure we don't index past the sleep_time_bin array. [jstultz: Problem diagnosed and original patch by Chen Yu, I've solved the issue slightly differently, but borrowed his excellent explanation of the issue here.] Fixes: 5c83545f24ab "power: Add option to log time spent in suspend" Reported-by: Janek Kozicki Reported-by: Chen Yu Signed-off-by: John Stultz Cc: linux-pm@vger.kernel.org Cc: Peter Zijlstra Cc: Xunlei Pang Cc: "Rafael J. Wysocki" Cc: Zhang Rui Link: http://lkml.kernel.org/r/1471993702-29148-3-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/time/timekeeping_debug.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index f6bd65236712..107310a6f36f 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -23,7 +23,9 @@ #include "timekeeping_internal.h" -static unsigned int sleep_time_bin[32] = {0}; +#define NUM_BINS 32 + +static unsigned int sleep_time_bin[NUM_BINS] = {0}; static int tk_debug_show_sleep_time(struct seq_file *s, void *data) { @@ -69,6 +71,9 @@ late_initcall(tk_debug_sleep_time_init); void tk_debug_account_sleep_time(struct timespec64 *t) { - sleep_time_bin[fls(t->tv_sec)]++; + /* Cap bin index so we don't overflow the array */ + int bin = min(fls(t->tv_sec), NUM_BINS-1); + + sleep_time_bin[bin]++; } From 4eca11dbd272076f05c934dfd3005afc4b77c521 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 23 Aug 2016 16:08:21 -0700 Subject: [PATCH 740/813] timekeeping: Avoid taking lock in NMI path with CONFIG_DEBUG_TIMEKEEPING commit 27727df240c7cc84f2ba6047c6f18d5addfd25ef upstream. When I added some extra sanity checking in timekeeping_get_ns() under CONFIG_DEBUG_TIMEKEEPING, I missed that the NMI safe __ktime_get_fast_ns() method was using timekeeping_get_ns(). Thus the locking added to the debug checks broke the NMI-safety of __ktime_get_fast_ns(). This patch open-codes the timekeeping_get_ns() logic for __ktime_get_fast_ns(), so it can avoid any deadlocks in NMI.
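The lockless pattern that keeps __ktime_get_fast_ns() NMI-safe can be sketched in user space; the following model is greatly simplified (single writer, no memory-barrier subtleties) and is not the kernel's implementation:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static atomic_uint seq;     /* even: data stable; odd: write in progress */
    static uint64_t base_ns;

    static void writer_update(uint64_t ns)
    {
        atomic_fetch_add(&seq, 1);  /* mark update in progress */
        base_ns = ns;
        atomic_fetch_add(&seq, 1);  /* mark update complete */
    }

    static uint64_t reader_get(void)
    {
        unsigned int s;
        uint64_t v;

        do {
            s = atomic_load(&seq);
            v = base_ns;            /* speculative read, no lock taken */
        } while ((s & 1) || atomic_load(&seq) != s);
        return v;
    }

    int main(void)
    {
        writer_update(123456789);
        printf("read %llu ns\n", (unsigned long long)reader_get());
        return 0;
    }

Because the reader only retries instead of blocking, it can safely run in a context that must never sleep or spin on a held lock, which is exactly the NMI constraint.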
Fixes: 4ca22c2648f9 "timekeeping: Add warnings when overflows or underflows are observed" Reported-by: Steven Rostedt Reported-by: Peter Zijlstra Signed-off-by: John Stultz Link: http://lkml.kernel.org/r/1471993702-29148-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/time/timekeeping.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d9249daf14ba..4ff237dbc006 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -383,7 +383,10 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); - now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); + now = ktime_to_ns(tkr->base); + + now += clocksource_delta(tkr->read(tkr->clock), + tkr->cycle_last, tkr->mask); } while (read_seqcount_retry(&tkf->seq, seq)); return now; From 09ca40a6afc93c21ec2ecc2ff9ba5b7c7360a0b7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 22 Jan 2016 15:34:16 -0500 Subject: [PATCH 741/813] lustre: remove unused declaration commit 57b8f112cfe6622ddddb8c2641206bb5fa8a112d upstream. Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- drivers/staging/lustre/lustre/llite/llite_internal.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h index 9096d311e45d..c2d9b793759d 100644 --- a/drivers/staging/lustre/lustre/llite/llite_internal.h +++ b/drivers/staging/lustre/lustre/llite/llite_internal.h @@ -631,8 +631,6 @@ struct ll_file_data { struct lov_stripe_md; -extern spinlock_t inode_lock; - extern struct dentry *llite_root; extern struct kset *llite_kset; From d72e9b2566e79f0ca6fe128d8bb6972209a816fc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 22 Jan 2016 15:40:57 -0500 Subject: [PATCH 742/813] wrappers for ->i_mutex access commit 5955102c9984fa081b2d570cfac75c97eecf8f3b upstream parallel to mutex_{lock,unlock,trylock,is_locked,lock_nested}, inode_foo(inode) being mutex_foo(&inode->i_mutex). Please, use those for access to ->i_mutex; over the coming cycle ->i_mutex will become rwsem, with ->lookup() done with it held only shared. 
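[ed. note: the point of such wrappers is that call sites stop depending on the lock's concrete type. A userspace analogue with pthreads, purely illustrative:]

#include <pthread.h>

/* toy inode embedding its own lock, as struct inode embeds i_mutex */
struct inode {
	pthread_mutex_t i_mutex;
};

/*
 * Callers go through the wrappers and never touch &inode->i_mutex
 * directly, so the mutex can later be swapped for a different lock
 * type by changing these helpers alone, not every call site.
 */
static inline void inode_lock(struct inode *inode)
{
	pthread_mutex_lock(&inode->i_mutex);
}

static inline void inode_unlock(struct inode *inode)
{
	pthread_mutex_unlock(&inode->i_mutex);
}

int main(void)
{
	struct inode inode = { .i_mutex = PTHREAD_MUTEX_INITIALIZER };

	inode_lock(&inode);
	/* ... operate on the inode ... */
	inode_unlock(&inode);
	return 0;
}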
Signed-off-by: Al Viro [only the fs.h change included to make backports easier - gregkh] Signed-off-by: Greg Kroah-Hartman --- include/linux/fs.h | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index ab3d8d9bb3ef..0166582c4d78 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -710,6 +710,31 @@ enum inode_i_mutex_lock_class I_MUTEX_PARENT2, }; +static inline void inode_lock(struct inode *inode) +{ + mutex_lock(&inode->i_mutex); +} + +static inline void inode_unlock(struct inode *inode) +{ + mutex_unlock(&inode->i_mutex); +} + +static inline int inode_trylock(struct inode *inode) +{ + return mutex_trylock(&inode->i_mutex); +} + +static inline int inode_is_locked(struct inode *inode) +{ + return mutex_is_locked(&inode->i_mutex); +} + +static inline void inode_lock_nested(struct inode *inode, unsigned subclass) +{ + mutex_lock_nested(&inode->i_mutex, subclass); +} + void lock_two_nondirectories(struct inode *, struct inode*); void unlock_two_nondirectories(struct inode *, struct inode*); @@ -3029,8 +3054,8 @@ static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx) } static inline bool dir_relax(struct inode *inode) { - mutex_unlock(&inode->i_mutex); - mutex_lock(&inode->i_mutex); + inode_unlock(inode); + inode_lock(inode); return !IS_DEADDIR(inode); } From 48fd20d7237bfefd18750b1b38f426c7e210462d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 8 Aug 2016 15:08:49 +0200 Subject: [PATCH 743/813] ovl: don't copy up opaqueness commit 0956254a2d5b9e2141385514553aeef694dfe3b5 upstream. When a directory that has the opaque xattr set is copied up, the xattr remains in the upper directory. The immediate behavior with overlayfs is that the upper directory is not treated as opaque; however, after a remount the opaque flag is honored and the upper directory is treated as opaque. This causes files created in the lower layer to be hidden when using multiple lower directories. Fix by not copying up the opaque flag.
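[ed. note: a freestanding sketch of the filtering idea in the diff below; the prefix matches the "trusted.overlay." namespace this kernel's overlayfs uses for its private xattrs, everything else here is illustrative.]

#include <stdio.h>
#include <string.h>

#define OVL_XATTR_PRE_NAME "trusted.overlay."
#define OVL_XATTR_PRE_LEN  (sizeof(OVL_XATTR_PRE_NAME) - 1)

static int ovl_is_private_xattr(const char *name)
{
	return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
}

int main(void)
{
	/* a listxattr-style buffer: names separated by NUL bytes */
	const char list[] = "user.foo\0trusted.overlay.opaque\0user.bar";
	const char *name;

	for (name = list; name < list + sizeof(list); name += strlen(name) + 1) {
		if (ovl_is_private_xattr(name))
			continue;	/* private state must not be copied up */
		printf("copy up xattr: %s\n", name);
	}
	return 0;
}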
To reproduce: ----8<---------8<---------8<---------8<---------8<---------8<---- mkdir -p l/d/s u v w mnt mount -t overlay overlay -olowerdir=l,upperdir=u,workdir=w mnt rm -rf mnt/d/ mkdir -p mnt/d/n umount mnt mount -t overlay overlay -olowerdir=u:l,upperdir=v,workdir=w mnt touch mnt/d/foo umount mnt mount -t overlay overlay -olowerdir=u:l,upperdir=v,workdir=w mnt ls mnt/d ----8<---------8<---------8<---------8<---------8<---------8<---- output should be: "foo n" Reported-by: Derek McGowan Link: https://bugzilla.kernel.org/show_bug.cgi?id=151291 Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/copy_up.c | 2 ++ fs/overlayfs/inode.c | 2 +- fs/overlayfs/overlayfs.h | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index eff6319d5037..9e52609cd683 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -48,6 +48,8 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new) } for (name = buf; name < (buf + list_size); name += strlen(name) + 1) { + if (ovl_is_private_xattr(name)) + continue; retry: size = vfs_getxattr(old, name, value, value_size); if (size == -ERANGE) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 4f729ffff75d..8f8bce4d62e1 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -219,7 +219,7 @@ static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) } -static bool ovl_is_private_xattr(const char *name) +bool ovl_is_private_xattr(const char *name) { return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 735e1d49b301..c319d5eaabcf 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -174,6 +174,7 @@ ssize_t ovl_getxattr(struct dentry *dentry, const char *name, ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); int ovl_removexattr(struct dentry *dentry, const char *name); struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags); +bool ovl_is_private_xattr(const char *name); struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, struct ovl_entry *oe); From d57a6c748059eb1f90cde4b40091f57617b72609 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 1 Sep 2016 11:11:59 +0200 Subject: [PATCH 744/813] ovl: remove posix_acl_default from workdir commit c11b9fdd6a612f376a5e886505f1c54c16d8c380 upstream. Clear out posix acl xattrs on workdir and also reset the mode after creation so that an inherited sgid bit is cleared. 
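[ed. note: the create-then-reset pattern above in a userspace sketch; mkdir() can silently inherit the setgid bit from a setgid parent directory regardless of the requested mode, which is what the notify_change() call guards against. Illustrative only.]

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	/* requested mode is 0, but a setgid parent can still add S_ISGID */
	if (mkdir("workdir", 0) != 0) {
		perror("mkdir");
		return 1;
	}

	/* reset the mode explicitly so no inherited bits survive */
	if (chmod("workdir", 0) != 0) {
		perror("chmod");
		return 1;
	}
	return 0;
}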
Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/super.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 70a7bbe199d0..350905f17bcc 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -763,6 +763,10 @@ retry: struct kstat stat = { .mode = S_IFDIR | 0, }; + struct iattr attr = { + .ia_valid = ATTR_MODE, + .ia_mode = stat.mode, + }; if (work->d_inode) { err = -EEXIST; @@ -778,6 +782,21 @@ retry: err = ovl_create_real(dir, work, &stat, NULL, NULL, true); if (err) goto out_dput; + + err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT); + if (err && err != -ENODATA) + goto out_dput; + + err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_ACCESS); + if (err && err != -ENODATA) + goto out_dput; + + /* Clear any inherited mode bits */ + inode_lock(work->d_inode); + err = notify_change(work, &attr, NULL); + inode_unlock(work->d_inode); + if (err) + goto out_dput; } out_unlock: mutex_unlock(&dir->i_mutex); From 708cb42fcaef0874a81100b461833096564db6cf Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 1 Sep 2016 11:12:00 +0200 Subject: [PATCH 745/813] ovl: listxattr: use strnlen() commit 7cb35119d067191ce9ebc380a599db0b03cbd9d9 upstream. Be defensive about what the underlying fs provides us in the returned xattr list buffer. If it's not properly null-terminated, bail out with a warning instead of a BUG. Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/inode.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 8f8bce4d62e1..220b04f04523 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -277,7 +277,8 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) struct path realpath; enum ovl_path_type type = ovl_path_real(dentry, &realpath); ssize_t res; - int off; + size_t len; + char *s; res = vfs_listxattr(realpath.dentry, list, size); if (res <= 0 || size == 0) @@ -287,17 +288,19 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) return res; /* filter out private xattrs */ - for (off = 0; off < res;) { - char *s = list + off; - size_t slen = strlen(s) + 1; + for (s = list, len = res; len;) { + size_t slen = strnlen(s, len) + 1; - BUG_ON(off + slen > res); + /* underlying fs providing us with an broken xattr list? */ + if (WARN_ON(slen > len)) + return -EIO; + len -= slen; if (ovl_is_private_xattr(s)) { res -= slen; - memmove(s, s + slen, res - off); + memmove(s, s + slen, len); } else { - off += slen; + s += slen; } } From 2f949da9c0203bb8b309daddcf532593d019fc2b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 5 Sep 2016 13:55:20 +0200 Subject: [PATCH 746/813] ovl: fix workdir creation commit e1ff3dd1ae52cef5b5373c8cc4ad949c2c25a71c upstream. Workdir creation fails in the latest kernel. Fix by allowing EOPNOTSUPP as a valid return value from vfs_removexattr(XATTR_NAME_POSIX_ACL_*). The upper filesystem may not support ACLs and still be perfectly able to support overlayfs.
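[ed. note: the tolerance added above, restated as a userspace helper: ENODATA means "no such xattr" and EOPNOTSUPP means "this filesystem has no xattr/ACL support at all"; both are harmless for this purpose. The xattr name is the standard one behind XATTR_NAME_POSIX_ACL_DEFAULT; the helper itself is illustrative.]

#include <errno.h>
#include <stdio.h>
#include <sys/xattr.h>

/* remove an xattr, treating "absent" and "unsupported" as success */
static int remove_xattr_tolerant(const char *path, const char *name)
{
	if (removexattr(path, name) == 0)
		return 0;
	if (errno == ENODATA || errno == EOPNOTSUPP)
		return 0;
	return -errno;
}

int main(void)
{
	int err = remove_xattr_tolerant(".", "system.posix_acl_default");

	printf("removing default ACL: %d\n", err);
	return 0;
}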
Reported-by: Martin Ziegler Signed-off-by: Miklos Szeredi Fixes: c11b9fdd6a61 ("ovl: remove posix_acl_default from workdir") Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 350905f17bcc..d70208c0de84 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -784,11 +784,11 @@ retry: goto out_dput; err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT); - if (err && err != -ENODATA) + if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_ACCESS); - if (err && err != -ENODATA) + if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; /* Clear any inherited mode bits */ From 4c10981673bb8aa798dac491bef41389646d02b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vincent=20Stehl=C3=A9?= Date: Fri, 12 Aug 2016 15:26:30 +0200 Subject: [PATCH 747/813] ubifs: Fix assertion in layout_in_gaps() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit c0082e985fdf77b02fc9e0dac3b58504dcf11b7a upstream. An assertion in layout_in_gaps() verifies that the gap_lebs pointer is below the maximum bound. When computing this maximum bound the idx_lebs count is multiplied by sizeof(int), while C pointers arithmetic does take into account the size of the pointed elements implicitly already. Remove the multiplication to fix the assertion. Fixes: 1e51764a3c2ac05a ("UBIFS: add new flash file system") Signed-off-by: Vincent Stehlé Cc: Artem Bityutskiy Signed-off-by: Artem Bityutskiy Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/tnc_commit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index b45345d701e7..51157da3f76e 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -370,7 +370,7 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) p = c->gap_lebs; do { - ubifs_assert(p < c->gap_lebs + sizeof(int) * c->lst.idx_lebs); + ubifs_assert(p < c->gap_lebs + c->lst.idx_lebs); written = layout_leb_in_gaps(c, p); if (written < 0) { err = written; From 2d64cbc819b13ebd503780f39552827516f4ce4a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Aug 2016 18:21:24 -0700 Subject: [PATCH 748/813] bcache: RESERVE_PRIO is too small by one when prio_buckets() is a power of two. commit acc9cf8c66c66b2cbbdb4a375537edee72be64df upstream. This patch fixes a cachedev registration-time allocation deadlock. This can deadlock on boot if your initrd auto-registeres bcache devices: Allocator thread: [ 720.727614] INFO: task bcache_allocato:3833 blocked for more than 120 seconds. [ 720.732361] [] schedule+0x37/0x90 [ 720.732963] [] bch_bucket_alloc+0x188/0x360 [bcache] [ 720.733538] [] ? prepare_to_wait_event+0xf0/0xf0 [ 720.734137] [] bch_prio_write+0x19d/0x340 [bcache] [ 720.734715] [] bch_allocator_thread+0x3ff/0x470 [bcache] [ 720.735311] [] ? __schedule+0x2dc/0x950 [ 720.735884] [] ? invalidate_buckets+0x980/0x980 [bcache] Registration thread: [ 720.710403] INFO: task bash:3531 blocked for more than 120 seconds. [ 720.715226] [] schedule+0x37/0x90 [ 720.715805] [] __bch_btree_map_nodes+0x12d/0x150 [bcache] [ 720.716409] [] ? bch_btree_insert_check_key+0x1c0/0x1c0 [bcache] [ 720.717008] [] bch_btree_insert+0xf4/0x170 [bcache] [ 720.717586] [] ? prepare_to_wait_event+0xf0/0xf0 [ 720.718191] [] bch_journal_replay+0x14a/0x290 [bcache] [ 720.718766] [] ? 
ttwu_do_activate.constprop.94+0x5d/0x70 [ 720.719369] [] ? try_to_wake_up+0x1d4/0x350 [ 720.719968] [] run_cache_set+0x580/0x8e0 [bcache] [ 720.720553] [] register_bcache+0xe2e/0x13b0 [bcache] [ 720.721153] [] kobj_attr_store+0xf/0x20 [ 720.721730] [] sysfs_kf_write+0x3d/0x50 [ 720.722327] [] kernfs_fop_write+0x12a/0x180 [ 720.722904] [] __vfs_write+0x37/0x110 [ 720.723503] [] ? __sb_start_write+0x58/0x110 [ 720.724100] [] ? security_file_permission+0x23/0xa0 [ 720.724675] [] vfs_write+0xa9/0x1b0 [ 720.725275] [] ? do_audit_syscall_entry+0x6c/0x70 [ 720.725849] [] SyS_write+0x55/0xd0 [ 720.726451] [] ? do_page_fault+0x30/0x80 [ 720.727045] [] system_call_fastpath+0x12/0x71 The fifo code in upstream bcache can't use the last element in the buffer, which was the cause of the bug: if you asked for a power of two size, it'd give you a fifo that could hold one less than what you asked for rather than allocating a buffer twice as big. Signed-off-by: Kent Overstreet Tested-by: Eric Wheeler Signed-off-by: Greg Kroah-Hartman --- drivers/md/bcache/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a296425a7270..3d5c0ba13181 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1818,7 +1818,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) free = roundup_pow_of_two(ca->sb.nbuckets) >> 10; if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) || - !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || + !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) || !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || From 9cb2e06aa5ce916bd77cb1c4ec75cbb2a3d0b736 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Mon, 6 Jun 2016 18:07:59 -0400 Subject: [PATCH 749/813] vhost/scsi: fix reuse of &vq->iov[out] in response commit a77ec83a57890240c546df00ca5df1cdeedb1cc3 upstream. The address of the iovec &vq->iov[out] is not guaranteed to contain the scsi command's response iovec throughout the lifetime of the command. Rather, it is more likely to contain an iovec from an immediately following command after looping back around to vhost_get_vq_desc(). Pass along the iovec entirely instead. Fixes: 79c14141a487 ("vhost/scsi: Convert completion path to use copy_to_iter") Signed-off-by: Benjamin Coddington Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Greg Kroah-Hartman --- drivers/vhost/scsi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 29cfc57d496e..e4110d6de0b5 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -88,7 +88,7 @@ struct vhost_scsi_cmd { struct scatterlist *tvc_prot_sgl; struct page **tvc_upages; /* Pointer to response header iovec */ - struct iovec *tvc_resp_iov; + struct iovec tvc_resp_iov; /* Pointer to vhost_scsi for our device */ struct vhost_scsi *tvc_vhost; /* Pointer to vhost_virtqueue for the cmd */ @@ -557,7 +557,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) memcpy(v_rsp.sense, cmd->tvc_sense_buf, se_cmd->scsi_sense_length); - iov_iter_init(&iov_iter, READ, cmd->tvc_resp_iov, + iov_iter_init(&iov_iter, READ, &cmd->tvc_resp_iov, cmd->tvc_in_iovs, sizeof(v_rsp)); ret = copy_to_iter(&v_rsp, sizeof(v_rsp), &iov_iter); if (likely(ret == sizeof(v_rsp))) { @@ -1054,7 +1054,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) } cmd->tvc_vhost = vs; cmd->tvc_vq = vq; - cmd->tvc_resp_iov = &vq->iov[out]; + cmd->tvc_resp_iov = vq->iov[out]; cmd->tvc_in_iovs = in; pr_debug("vhost_scsi got command opcode: %#02x, lun: %d\n", From 2e133f2d2cf40736ebc028ffa11dd907dc0753eb Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 23 Aug 2016 20:07:19 +0800 Subject: [PATCH 750/813] x86/apic: Do not init irq remapping if ioapic is disabled commit 2e63ad4bd5dd583871e6602f9d398b9322d358d9 upstream. native_smp_prepare_cpus -> default_setup_apic_routing -> enable_IR_x2apic -> irq_remapping_prepare -> intel_prepare_irq_remapping -> intel_setup_irq_remapping So the IR table is set up even if the "noapic" boot parameter is added. As a result we crash later when the interrupt affinity is set, due to a half-initialized remapping infrastructure. Prevent remap initialization when IOAPIC is disabled. Signed-off-by: Wanpeng Li Cc: Peter Zijlstra Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1471954039-3942-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/apic/apic.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2f69e3b184f6..a3e1f8497f8c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1587,6 +1587,9 @@ void __init enable_IR_x2apic(void) unsigned long flags; int ret, ir_stat; + if (skip_ioapic_setup) + return; + ir_stat = irq_remapping_prepare(); if (ir_stat < 0 && !x2apic_supported()) return; From f964b3b368b59cc03e26d3f5b5226432b22d9a0d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 17 Aug 2016 17:36:29 +0200 Subject: [PATCH 751/813] uprobes: Fix the memcg accounting commit 6c4687cc17a788a6dd8de3e27dbeabb7cbd3e066 upstream. __replace_page() wrongly calls mem_cgroup_cancel_charge() on the "success" path; it should only do this if page_check_address() fails. This means that every enable/disable leads to an unbalanced mem_cgroup_uncharge() from put_page(old_page), making it trivial to underflow the page_counter->count and trigger an OOM.
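[ed. note: the accounting imbalance in miniature, with a plain counter standing in for the memcg page counter; illustrative only.]

#include <stdio.h>

static long counter;	/* stands in for page_counter->count */

static void charge(void)        { counter++; }
static void cancel_charge(void) { counter--; }

/* mirrors the fixed __replace_page(): cancel only on the failure path */
static int replace_page(int lookup_ok)
{
	charge();
	if (!lookup_ok) {
		cancel_charge();	/* error path: undo the charge */
		return -1;
	}
	/*
	 * Success path: the charge is now owned by the new page and is
	 * released by a later uncharge; cancelling it here as well (the
	 * old bug) would make that uncharge underflow the counter.
	 */
	return 0;
}

int main(void)
{
	replace_page(1);	/* success: net +1 */
	replace_page(0);	/* failure: net  0 */
	printf("counter = %ld (expect 1)\n", counter);
	return 0;
}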
Reported-and-tested-by: Brenden Blanco Signed-off-by: Oleg Nesterov Reviewed-by: Johannes Weiner Acked-by: Michal Hocko Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vladimir Davydov Fixes: 00501b531c47 ("mm: memcontrol: rewrite charge API") Link: http://lkml.kernel.org/r/20160817153629.GB29724@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/events/uprobes.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7dad84913abf..da0c09ff6112 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -171,8 +171,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); err = -EAGAIN; ptep = page_check_address(page, mm, addr, &ptl, 0); - if (!ptep) + if (!ptep) { + mem_cgroup_cancel_charge(kpage, memcg); goto unlock; + } get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr); @@ -199,7 +201,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: - mem_cgroup_cancel_charge(kpage, memcg); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; From f973851a595969f2a57f81e3c915cc116ae5a74f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Horia=20Geant=C4=83?= Date: Mon, 29 Aug 2016 14:52:14 +0300 Subject: [PATCH 752/813] crypto: caam - fix IV loading for authenc (giv)decryption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8b18e2359aff2ab810aba84cebffc9da07fef78f upstream. For algorithms that implement IV generators before the crypto ops, the IV needed for decryption is initially located in req->src scatterlist, not in req->iv. Avoid copying the IV into req->iv by modifying the (givdecrypt) descriptors to load it directly from req->src. aead_givdecrypt() is no longer needed and goes away. 
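[ed. note: the layout point in schematic form: for IV-generating algorithms the decryption IV sits at the front of the source buffer, so it can be consumed from there directly instead of first being copied into the separate IV field. A toy sketch; every name here is invented.]

#include <stdio.h>
#include <string.h>

struct toy_req {
	const unsigned char *src;	/* [IV][ciphertext] for geniv decrypt */
	const unsigned char *iv;	/* caller-supplied IV otherwise */
};

/* return where the IV actually lives for this request */
static const unsigned char *req_iv(const struct toy_req *req, int geniv)
{
	return geniv ? req->src : req->iv;
}

int main(void)
{
	unsigned char buf[] = "0123456789abcdefXYZ";	/* 16-byte IV + data */
	struct toy_req req = { .src = buf, .iv = NULL };
	unsigned char iv[16];

	memcpy(iv, req_iv(&req, 1), sizeof(iv));	/* IV read straight from src */
	printf("iv starts with %.4s\n", iv);
	return 0;
}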
Fixes: 479bcc7c5b9e ("crypto: caam - Convert authenc to new AEAD interface") Signed-off-by: Horia Geantă Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/caam/caamalg.c | 77 +++++++++++++++++------------------ 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c index 6dc597126b79..b3044219772c 100644 --- a/drivers/crypto/caam/caamalg.c +++ b/drivers/crypto/caam/caamalg.c @@ -556,7 +556,10 @@ skip_enc: /* Read and write assoclen bytes */ append_math_add(desc, VARSEQINLEN, ZERO, REG3, CAAM_CMD_SZ); - append_math_add(desc, VARSEQOUTLEN, ZERO, REG3, CAAM_CMD_SZ); + if (alg->caam.geniv) + append_math_add_imm_u32(desc, VARSEQOUTLEN, REG3, IMM, ivsize); + else + append_math_add(desc, VARSEQOUTLEN, ZERO, REG3, CAAM_CMD_SZ); /* Skip assoc data */ append_seq_fifo_store(desc, 0, FIFOST_TYPE_SKIP | FIFOLDST_VLF); @@ -565,6 +568,14 @@ skip_enc: append_seq_fifo_load(desc, 0, FIFOLD_CLASS_CLASS2 | FIFOLD_TYPE_MSG | KEY_VLF); + if (alg->caam.geniv) { + append_seq_load(desc, ivsize, LDST_CLASS_1_CCB | + LDST_SRCDST_BYTE_CONTEXT | + (ctx1_iv_off << LDST_OFFSET_SHIFT)); + append_move(desc, MOVE_SRC_CLASS1CTX | MOVE_DEST_CLASS2INFIFO | + (ctx1_iv_off << MOVE_OFFSET_SHIFT) | ivsize); + } + /* Load Counter into CONTEXT1 reg */ if (is_rfc3686) append_load_imm_u32(desc, be32_to_cpu(1), LDST_IMM | @@ -2150,7 +2161,7 @@ static void init_authenc_job(struct aead_request *req, init_aead_job(req, edesc, all_contig, encrypt); - if (ivsize && (is_rfc3686 || !(alg->caam.geniv && encrypt))) + if (ivsize && ((is_rfc3686 && encrypt) || !alg->caam.geniv)) append_load_as_imm(desc, req->iv, ivsize, LDST_CLASS_1_CCB | LDST_SRCDST_BYTE_CONTEXT | @@ -2537,20 +2548,6 @@ static int aead_decrypt(struct aead_request *req) return ret; } -static int aead_givdecrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - unsigned int ivsize = crypto_aead_ivsize(aead); - - if (req->cryptlen < ivsize) - return -EINVAL; - - req->cryptlen -= ivsize; - req->assoclen += ivsize; - - return aead_decrypt(req); -} - /* * allocate and map the ablkcipher extended descriptor for ablkcipher */ @@ -3210,7 +3207,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = AES_BLOCK_SIZE, .maxauthsize = MD5_DIGEST_SIZE, }, @@ -3256,7 +3253,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = AES_BLOCK_SIZE, .maxauthsize = SHA1_DIGEST_SIZE, }, @@ -3302,7 +3299,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = AES_BLOCK_SIZE, .maxauthsize = SHA224_DIGEST_SIZE, }, @@ -3348,7 +3345,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = AES_BLOCK_SIZE, .maxauthsize = SHA256_DIGEST_SIZE, }, @@ -3394,7 +3391,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = AES_BLOCK_SIZE, .maxauthsize = SHA384_DIGEST_SIZE, }, @@ 
-3440,7 +3437,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = AES_BLOCK_SIZE, .maxauthsize = SHA512_DIGEST_SIZE, }, @@ -3486,7 +3483,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = DES3_EDE_BLOCK_SIZE, .maxauthsize = MD5_DIGEST_SIZE, }, @@ -3534,7 +3531,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = DES3_EDE_BLOCK_SIZE, .maxauthsize = SHA1_DIGEST_SIZE, }, @@ -3582,7 +3579,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = DES3_EDE_BLOCK_SIZE, .maxauthsize = SHA224_DIGEST_SIZE, }, @@ -3630,7 +3627,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = DES3_EDE_BLOCK_SIZE, .maxauthsize = SHA256_DIGEST_SIZE, }, @@ -3678,7 +3675,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = DES3_EDE_BLOCK_SIZE, .maxauthsize = SHA384_DIGEST_SIZE, }, @@ -3726,7 +3723,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = DES3_EDE_BLOCK_SIZE, .maxauthsize = SHA512_DIGEST_SIZE, }, @@ -3772,7 +3769,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = DES_BLOCK_SIZE, .maxauthsize = MD5_DIGEST_SIZE, }, @@ -3818,7 +3815,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = DES_BLOCK_SIZE, .maxauthsize = SHA1_DIGEST_SIZE, }, @@ -3864,7 +3861,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = DES_BLOCK_SIZE, .maxauthsize = SHA224_DIGEST_SIZE, }, @@ -3910,7 +3907,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = DES_BLOCK_SIZE, .maxauthsize = SHA256_DIGEST_SIZE, }, @@ -3956,7 +3953,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = DES_BLOCK_SIZE, .maxauthsize = SHA384_DIGEST_SIZE, }, @@ -4002,7 +3999,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = DES_BLOCK_SIZE, .maxauthsize = SHA512_DIGEST_SIZE, }, @@ -4051,7 
+4048,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = CTR_RFC3686_IV_SIZE, .maxauthsize = MD5_DIGEST_SIZE, }, @@ -4102,7 +4099,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = CTR_RFC3686_IV_SIZE, .maxauthsize = SHA1_DIGEST_SIZE, }, @@ -4153,7 +4150,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = CTR_RFC3686_IV_SIZE, .maxauthsize = SHA224_DIGEST_SIZE, }, @@ -4204,7 +4201,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = CTR_RFC3686_IV_SIZE, .maxauthsize = SHA256_DIGEST_SIZE, }, @@ -4255,7 +4252,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = CTR_RFC3686_IV_SIZE, .maxauthsize = SHA384_DIGEST_SIZE, }, @@ -4306,7 +4303,7 @@ static struct caam_aead_alg driver_aeads[] = { .setkey = aead_setkey, .setauthsize = aead_setauthsize, .encrypt = aead_encrypt, - .decrypt = aead_givdecrypt, + .decrypt = aead_decrypt, .ivsize = CTR_RFC3686_IV_SIZE, .maxauthsize = SHA512_DIGEST_SIZE, }, From 9947ec2c04d30412a06e45a4af03f28c2d76d1f6 Mon Sep 17 00:00:00 2001 From: Ken Lin Date: Fri, 12 Aug 2016 14:08:47 -0400 Subject: [PATCH 753/813] ALSA: usb-audio: Add sample rate inquiry quirk for B850V3 CP2114 commit 83d9956b7e6b310c1062df7894257251c625b22e upstream. Avoid getting sample rate on B850V3 CP2114 as it is unsupported and causes noisy "current rate is different from the runtime rate" messages when playback starts. Signed-off-by: Ken Lin Signed-off-by: Akshay Bhat Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index a3e1252ce242..3039e907f1f8 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1142,6 +1142,7 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip) case USB_ID(0x0556, 0x0014): /* Phoenix Audio TMX320VC */ case USB_ID(0x05A3, 0x9420): /* ELP HD USB Camera */ case USB_ID(0x074D, 0x3553): /* Outlaw RR2150 (Micronas UAC3553B) */ + case USB_ID(0x1901, 0x0191): /* GE B850V3 CP2114 audio interface */ case USB_ID(0x1de7, 0x0013): /* Phoenix Audio MT202exe */ case USB_ID(0x1de7, 0x0014): /* Phoenix Audio TMX320 */ case USB_ID(0x1de7, 0x0114): /* Phoenix Audio MT202pcs */ From ad3bccfd7d6c0d9b263977492f2f712f1859b5b2 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Wed, 31 Aug 2016 20:15:32 +0900 Subject: [PATCH 754/813] ALSA: firewire-tascam: accessing to user space outside spinlock commit 04b2d9c9c319277ad4fbbb71855c256a9f4d5f98 upstream. In the hwdep interface of the firewire-tascam driver, access to user space happens inside a critical section with local interrupts disabled. Depending on the architecture, accessing user space can cause a page fault exception. The local processor then stores the machine status and handles the synchronous event. A handler corresponding to the event can call the task scheduler to wait for the pages to be prepared.
On a single-core system the situation is worse, because with local interrupts disabled the usual hardware interrupts are not handled either. This commit fixes the bug by performing the access outside the spinlock. Reported-by: Vaishali Thakkar Fixes: e5e0c3dd257b('ALSA: firewire-tascam: add hwdep interface') Signed-off-by: Takashi Sakamoto Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/firewire/tascam/tascam-hwdep.c | 33 ++++++++++------------------ 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/sound/firewire/tascam/tascam-hwdep.c b/sound/firewire/tascam/tascam-hwdep.c index 131267c3a042..106406cbfaa3 100644 --- a/sound/firewire/tascam/tascam-hwdep.c +++ b/sound/firewire/tascam/tascam-hwdep.c @@ -16,31 +16,14 @@ #include "tascam.h" -static long hwdep_read_locked(struct snd_tscm *tscm, char __user *buf, - long count) -{ - union snd_firewire_event event; - - memset(&event, 0, sizeof(event)); - - event.lock_status.type = SNDRV_FIREWIRE_EVENT_LOCK_STATUS; - event.lock_status.status = (tscm->dev_lock_count > 0); - tscm->dev_lock_changed = false; - - count = min_t(long, count, sizeof(event.lock_status)); - - if (copy_to_user(buf, &event, count)) - return -EFAULT; - - return count; -} - static long hwdep_read(struct snd_hwdep *hwdep, char __user *buf, long count, loff_t *offset) { struct snd_tscm *tscm = hwdep->private_data; DEFINE_WAIT(wait); - union snd_firewire_event event; + union snd_firewire_event event = { + .lock_status.type = SNDRV_FIREWIRE_EVENT_LOCK_STATUS, + }; spin_lock_irq(&tscm->lock); @@ -54,10 +37,16 @@ static long hwdep_read(struct snd_hwdep *hwdep, char __user *buf, long count, spin_lock_irq(&tscm->lock); } - memset(&event, 0, sizeof(event)); - count = hwdep_read_locked(tscm, buf, count); + event.lock_status.status = (tscm->dev_lock_count > 0); + tscm->dev_lock_changed = false; + spin_unlock_irq(&tscm->lock); + count = min_t(long, count, sizeof(event.lock_status)); + + if (copy_to_user(buf, &event, count)) + return -EFAULT; + return count; } From e6c4138bc1e6021be12eba70bb90a2ea2a3dc96a Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Wed, 31 Aug 2016 22:58:42 +0900 Subject: [PATCH 755/813] ALSA: fireworks: accessing to user space outside spinlock commit 6b1ca4bcadf9ef077cc5f03c6822ba276ed14902 upstream. In the hwdep interface of the fireworks driver, access to user space happens inside a critical section with local interrupts disabled. Depending on the architecture, accessing user space can cause a page fault exception. The local processor then stores the machine status and handles the synchronous event. A handler corresponding to the event can call the task scheduler to wait for the pages to be prepared. On a single-core system the situation is worse, because with local interrupts disabled the usual hardware interrupts are not handled either. This commit fixes the bug by performing the access outside the spinlock. This commit also gives up counting the number of queued response messages to simplify ring-buffer management.
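[ed. note: the reworked read path in a compact userspace model: lock only to snapshot and later to publish the consumer position, do the potentially faulting copy with the lock dropped, and detect "data queued" purely by comparing producer and consumer positions instead of keeping a counter. A pthread mutex stands in for the kernel spinlock; a single consumer is assumed; illustrative only.]

#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define BUF_SIZE 64

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static char resp_buf[BUF_SIZE];
static char *push_ptr = resp_buf;	/* producer position */
static char *pull_ptr = resp_buf;	/* consumer position */

/* no separate counter: data is queued iff the positions differ */
static int queued(void)
{
	return push_ptr != pull_ptr;
}

static void produce(const char *data, size_t len)
{
	pthread_mutex_lock(&lock);
	memcpy(push_ptr, data, len);	/* wrap-around omitted in this toy */
	push_ptr += len;
	pthread_mutex_unlock(&lock);
}

static size_t consume(char *dst, size_t len)
{
	char *pos;
	size_t avail;

	pthread_mutex_lock(&lock);
	pos = pull_ptr;				/* snapshot under the lock */
	avail = (size_t)(push_ptr - pos);
	pthread_mutex_unlock(&lock);

	if (len > avail)
		len = avail;
	memcpy(dst, pos, len);			/* copy_to_user() stand-in, done
						 * without holding the lock */

	pthread_mutex_lock(&lock);
	pull_ptr = pos + len;			/* publish the new position */
	pthread_mutex_unlock(&lock);
	return len;
}

int main(void)
{
	char out[8] = { 0 };

	produce("abc", 3);
	if (queued())
		printf("read %zu bytes: %s\n", consume(out, sizeof(out) - 1), out);
	return 0;
}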
Reported-by: Vaishali Thakkar Fixes: 555e8a8f7f14('ALSA: fireworks: Add command/response functionality into hwdep interface') Signed-off-by: Takashi Sakamoto Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/firewire/fireworks/fireworks.h | 1 - sound/firewire/fireworks/fireworks_hwdep.c | 73 ++++++++++++++----- sound/firewire/fireworks/fireworks_proc.c | 4 +- .../fireworks/fireworks_transaction.c | 5 +- 4 files changed, 57 insertions(+), 26 deletions(-) diff --git a/sound/firewire/fireworks/fireworks.h b/sound/firewire/fireworks/fireworks.h index c7cb7deafe48..2c316a9bc7f6 100644 --- a/sound/firewire/fireworks/fireworks.h +++ b/sound/firewire/fireworks/fireworks.h @@ -106,7 +106,6 @@ struct snd_efw { u8 *resp_buf; u8 *pull_ptr; u8 *push_ptr; - unsigned int resp_queues; }; int snd_efw_transaction_cmd(struct fw_unit *unit, diff --git a/sound/firewire/fireworks/fireworks_hwdep.c b/sound/firewire/fireworks/fireworks_hwdep.c index 33df8655fe81..2e1d9a23920c 100644 --- a/sound/firewire/fireworks/fireworks_hwdep.c +++ b/sound/firewire/fireworks/fireworks_hwdep.c @@ -25,6 +25,7 @@ hwdep_read_resp_buf(struct snd_efw *efw, char __user *buf, long remained, { unsigned int length, till_end, type; struct snd_efw_transaction *t; + u8 *pull_ptr; long count = 0; if (remained < sizeof(type) + sizeof(struct snd_efw_transaction)) @@ -38,8 +39,17 @@ hwdep_read_resp_buf(struct snd_efw *efw, char __user *buf, long remained, buf += sizeof(type); /* write into buffer as many responses as possible */ - while (efw->resp_queues > 0) { - t = (struct snd_efw_transaction *)(efw->pull_ptr); + spin_lock_irq(&efw->lock); + + /* + * When another task reaches here during this task's access to user + * space, it picks up current position in buffer and can read the same + * series of responses. + */ + pull_ptr = efw->pull_ptr; + + while (efw->push_ptr != pull_ptr) { + t = (struct snd_efw_transaction *)(pull_ptr); length = be32_to_cpu(t->length) * sizeof(__be32); /* confirm enough space for this response */ @@ -49,26 +59,39 @@ hwdep_read_resp_buf(struct snd_efw *efw, char __user *buf, long remained, /* copy from ring buffer to user buffer */ while (length > 0) { till_end = snd_efw_resp_buf_size - - (unsigned int)(efw->pull_ptr - efw->resp_buf); + (unsigned int)(pull_ptr - efw->resp_buf); till_end = min_t(unsigned int, length, till_end); - if (copy_to_user(buf, efw->pull_ptr, till_end)) + spin_unlock_irq(&efw->lock); + + if (copy_to_user(buf, pull_ptr, till_end)) return -EFAULT; - efw->pull_ptr += till_end; - if (efw->pull_ptr >= efw->resp_buf + - snd_efw_resp_buf_size) - efw->pull_ptr -= snd_efw_resp_buf_size; + spin_lock_irq(&efw->lock); + + pull_ptr += till_end; + if (pull_ptr >= efw->resp_buf + snd_efw_resp_buf_size) + pull_ptr -= snd_efw_resp_buf_size; length -= till_end; buf += till_end; count += till_end; remained -= till_end; } - - efw->resp_queues--; } + /* + * All of tasks can read from the buffer nearly simultaneously, but the + * last position for each task is different depending on the length of + * given buffer. Here, for simplicity, a position of buffer is set by + * the latest task. It's better for a listening application to allow one + * thread to read from the buffer. Unless, each task can read different + * sequence of responses depending on variation of buffer length. 
+ */ + efw->pull_ptr = pull_ptr; + + spin_unlock_irq(&efw->lock); + return count; } @@ -76,14 +99,17 @@ static long hwdep_read_locked(struct snd_efw *efw, char __user *buf, long count, loff_t *offset) { - union snd_firewire_event event; + union snd_firewire_event event = { + .lock_status.type = SNDRV_FIREWIRE_EVENT_LOCK_STATUS, + }; - memset(&event, 0, sizeof(event)); + spin_lock_irq(&efw->lock); - event.lock_status.type = SNDRV_FIREWIRE_EVENT_LOCK_STATUS; event.lock_status.status = (efw->dev_lock_count > 0); efw->dev_lock_changed = false; + spin_unlock_irq(&efw->lock); + count = min_t(long, count, sizeof(event.lock_status)); if (copy_to_user(buf, &event, count)) @@ -98,10 +124,15 @@ hwdep_read(struct snd_hwdep *hwdep, char __user *buf, long count, { struct snd_efw *efw = hwdep->private_data; DEFINE_WAIT(wait); + bool dev_lock_changed; + bool queued; spin_lock_irq(&efw->lock); - while ((!efw->dev_lock_changed) && (efw->resp_queues == 0)) { + dev_lock_changed = efw->dev_lock_changed; + queued = efw->push_ptr != efw->pull_ptr; + + while (!dev_lock_changed && !queued) { prepare_to_wait(&efw->hwdep_wait, &wait, TASK_INTERRUPTIBLE); spin_unlock_irq(&efw->lock); schedule(); @@ -109,15 +140,17 @@ hwdep_read(struct snd_hwdep *hwdep, char __user *buf, long count, if (signal_pending(current)) return -ERESTARTSYS; spin_lock_irq(&efw->lock); + dev_lock_changed = efw->dev_lock_changed; + queued = efw->push_ptr != efw->pull_ptr; } - if (efw->dev_lock_changed) - count = hwdep_read_locked(efw, buf, count, offset); - else if (efw->resp_queues > 0) - count = hwdep_read_resp_buf(efw, buf, count, offset); - spin_unlock_irq(&efw->lock); + if (dev_lock_changed) + count = hwdep_read_locked(efw, buf, count, offset); + else if (queued) + count = hwdep_read_resp_buf(efw, buf, count, offset); + return count; } @@ -160,7 +193,7 @@ hwdep_poll(struct snd_hwdep *hwdep, struct file *file, poll_table *wait) poll_wait(file, &efw->hwdep_wait, wait); spin_lock_irq(&efw->lock); - if (efw->dev_lock_changed || (efw->resp_queues > 0)) + if (efw->dev_lock_changed || efw->pull_ptr != efw->push_ptr) events = POLLIN | POLLRDNORM; else events = 0; diff --git a/sound/firewire/fireworks/fireworks_proc.c b/sound/firewire/fireworks/fireworks_proc.c index 0639dcb13f7d..beb0a0ffee57 100644 --- a/sound/firewire/fireworks/fireworks_proc.c +++ b/sound/firewire/fireworks/fireworks_proc.c @@ -188,8 +188,8 @@ proc_read_queues_state(struct snd_info_entry *entry, else consumed = (unsigned int)(efw->push_ptr - efw->pull_ptr); - snd_iprintf(buffer, "%d %d/%d\n", - efw->resp_queues, consumed, snd_efw_resp_buf_size); + snd_iprintf(buffer, "%d/%d\n", + consumed, snd_efw_resp_buf_size); } static void diff --git a/sound/firewire/fireworks/fireworks_transaction.c b/sound/firewire/fireworks/fireworks_transaction.c index f550808d1784..36a08ba51ec7 100644 --- a/sound/firewire/fireworks/fireworks_transaction.c +++ b/sound/firewire/fireworks/fireworks_transaction.c @@ -121,11 +121,11 @@ copy_resp_to_buf(struct snd_efw *efw, void *data, size_t length, int *rcode) size_t capacity, till_end; struct snd_efw_transaction *t; - spin_lock_irq(&efw->lock); - t = (struct snd_efw_transaction *)data; length = min_t(size_t, be32_to_cpu(t->length) * sizeof(u32), length); + spin_lock_irq(&efw->lock); + if (efw->push_ptr < efw->pull_ptr) capacity = (unsigned int)(efw->pull_ptr - efw->push_ptr); else @@ -155,7 +155,6 @@ copy_resp_to_buf(struct snd_efw *efw, void *data, size_t length, int *rcode) } /* for hwdep */ - efw->resp_queues++; wake_up(&efw->hwdep_wait); *rcode = 
RCODE_COMPLETE; From 42c51d003bfd58b3d1bc966ad25f69c5b4974f9b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 30 Aug 2016 14:45:46 +0200 Subject: [PATCH 756/813] ALSA: rawmidi: Fix possible deadlock with virmidi registration commit 816f318b2364262a51024096da7ca3b84e78e3b5 upstream. When a seq-virmidi driver is initialized, it registers a rawmidi instance with its callback to create an associated seq kernel client. Currently this is done entirely within rawmidi's register_mutex context. Recently it was found that this may lead to a deadlock when another rawmidi device that is being attached to the sequencer is accessed, as both opens take the same register_mutex. This was actually triggered by syzkaller, as Dmitry Vyukov reported: ====================================================== [ INFO: possible circular locking dependency detected ] 4.8.0-rc1+ #11 Not tainted ------------------------------------------------------- syz-executor/7154 is trying to acquire lock: (register_mutex#5){+.+.+.}, at: [] snd_rawmidi_kernel_open+0x4b/0x260 sound/core/rawmidi.c:341 but task is already holding lock: (&grp->list_mutex){++++.+}, at: [] check_and_subscribe_port+0x5b/0x5c0 sound/core/seq/seq_ports.c:495 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&grp->list_mutex){++++.+}: [] lock_acquire+0x208/0x430 kernel/locking/lockdep.c:3746 [] down_read+0x49/0xc0 kernel/locking/rwsem.c:22 [< inline >] deliver_to_subscribers sound/core/seq/seq_clientmgr.c:681 [] snd_seq_deliver_event+0x35e/0x890 sound/core/seq/seq_clientmgr.c:822 [] snd_seq_kernel_client_dispatch+0x126/0x170 sound/core/seq/seq_clientmgr.c:2418 [] snd_seq_system_broadcast+0xb2/0xf0 sound/core/seq/seq_system.c:101 [] snd_seq_create_kernel_client+0x24a/0x330 sound/core/seq/seq_clientmgr.c:2297 [< inline >] snd_virmidi_dev_attach_seq sound/core/seq/seq_virmidi.c:383 [] snd_virmidi_dev_register+0x29f/0x750 sound/core/seq/seq_virmidi.c:450 [] snd_rawmidi_dev_register+0x30c/0xd40 sound/core/rawmidi.c:1645 [] __snd_device_register.part.0+0x63/0xc0 sound/core/device.c:164 [< inline >] __snd_device_register sound/core/device.c:162 [] snd_device_register_all+0xad/0x110 sound/core/device.c:212 [] snd_card_register+0xef/0x6c0 sound/core/init.c:749 [] snd_virmidi_probe+0x3ef/0x590 sound/drivers/virmidi.c:123 [] platform_drv_probe+0x8b/0x170 drivers/base/platform.c:564 ......
-> #0 (register_mutex#5){+.+.+.}: [< inline >] check_prev_add kernel/locking/lockdep.c:1829 [< inline >] check_prevs_add kernel/locking/lockdep.c:1939 [< inline >] validate_chain kernel/locking/lockdep.c:2266 [] __lock_acquire+0x4d44/0x4d80 kernel/locking/lockdep.c:3335 [] lock_acquire+0x208/0x430 kernel/locking/lockdep.c:3746 [< inline >] __mutex_lock_common kernel/locking/mutex.c:521 [] mutex_lock_nested+0xb1/0xa20 kernel/locking/mutex.c:621 [] snd_rawmidi_kernel_open+0x4b/0x260 sound/core/rawmidi.c:341 [] midisynth_subscribe+0xf7/0x350 sound/core/seq/seq_midi.c:188 [< inline >] subscribe_port sound/core/seq/seq_ports.c:427 [] check_and_subscribe_port+0x467/0x5c0 sound/core/seq/seq_ports.c:510 [] snd_seq_port_connect+0x2c9/0x500 sound/core/seq/seq_ports.c:579 [] snd_seq_ioctl_subscribe_port+0x1d8/0x2b0 sound/core/seq/seq_clientmgr.c:1480 [] snd_seq_do_ioctl+0x184/0x1e0 sound/core/seq/seq_clientmgr.c:2225 [] snd_seq_kernel_client_ctl+0xa8/0x110 sound/core/seq/seq_clientmgr.c:2440 [] snd_seq_oss_midi_open+0x3b4/0x610 sound/core/seq/oss/seq_oss_midi.c:375 [] snd_seq_oss_synth_setup_midi+0x107/0x4c0 sound/core/seq/oss/seq_oss_synth.c:281 [] snd_seq_oss_open+0x748/0x8d0 sound/core/seq/oss/seq_oss_init.c:274 [] odev_open+0x6a/0x90 sound/core/seq/oss/seq_oss.c:138 [] soundcore_open+0x30f/0x640 sound/sound_core.c:639 ...... other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&grp->list_mutex); lock(register_mutex#5); lock(&grp->list_mutex); lock(register_mutex#5); *** DEADLOCK *** ====================================================== The fix is to simply move the registration parts in snd_rawmidi_dev_register() to the outside of the register_mutex lock. The lock is needed only to manage the linked list, and it's not necessarily to cover the whole initialization process. Reported-by: Dmitry Vyukov Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/rawmidi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c index 795437b10082..b450a27588c8 100644 --- a/sound/core/rawmidi.c +++ b/sound/core/rawmidi.c @@ -1633,11 +1633,13 @@ static int snd_rawmidi_dev_register(struct snd_device *device) return -EBUSY; } list_add_tail(&rmidi->list, &snd_rawmidi_devices); + mutex_unlock(®ister_mutex); err = snd_register_device(SNDRV_DEVICE_TYPE_RAWMIDI, rmidi->card, rmidi->device, &snd_rawmidi_f_ops, rmidi, &rmidi->dev); if (err < 0) { rmidi_err(rmidi, "unable to register\n"); + mutex_lock(®ister_mutex); list_del(&rmidi->list); mutex_unlock(®ister_mutex); return err; @@ -1645,6 +1647,7 @@ static int snd_rawmidi_dev_register(struct snd_device *device) if (rmidi->ops && rmidi->ops->dev_register && (err = rmidi->ops->dev_register(rmidi)) < 0) { snd_unregister_device(&rmidi->dev); + mutex_lock(®ister_mutex); list_del(&rmidi->list); mutex_unlock(®ister_mutex); return err; @@ -1677,7 +1680,6 @@ static int snd_rawmidi_dev_register(struct snd_device *device) } } #endif /* CONFIG_SND_OSSEMUL */ - mutex_unlock(®ister_mutex); sprintf(name, "midi%d", rmidi->device); entry = snd_info_create_card_entry(rmidi->card, name, rmidi->card->proc_root); if (entry) { From eea36eb2c42be7e1eba65bb17cfa1018270184b1 Mon Sep 17 00:00:00 2001 From: Shrirang Bagul Date: Mon, 29 Aug 2016 15:19:27 +0800 Subject: [PATCH 757/813] ALSA: hda - Add headset mic quirk for Dell Inspiron 5468 commit 311042d1b67d9a1856a8e1294e7729fb86f64014 upstream. 
This patch enables headset microphone on some variants of Dell Inspiron 5468. (Dell SSID 0x07ad) BugLink: https://bugs.launchpad.net/bugs/1617900 Signed-off-by: Shrirang Bagul Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index f25479ba3981..ad9dd8f9a3cd 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5879,6 +5879,10 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { {0x12, 0x90a60170}, {0x14, 0x90170120}, {0x21, 0x02211030}), + SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell Inspiron 5468", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, + {0x12, 0x90a60180}, + {0x14, 0x90170120}, + {0x21, 0x02211030}), SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, ALC256_STANDARD_PINS), SND_HDA_PIN_QUIRK(0x10ec0280, 0x103c, "HP", ALC280_FIXUP_HP_GPIO4, From 857fbd7a74d2cb1a396f4733536e10e74c4908a6 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Tue, 30 Aug 2016 15:36:34 +0800 Subject: [PATCH 758/813] ALSA: hda - Enable subwoofer on Dell Inspiron 7559 commit fd06c77eb9200b53d421da5fffe0dcd894b5d72a upstream. The subwoofer on Inspiron 7559 was disabled originally. Applying a pin fixup to node 0x1b can enable it and make it work. Old pin: 0x411111f0 New pin: 0x90170151 Signed-off-by: Kai-Heng Feng Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index ad9dd8f9a3cd..eaee626ab185 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -4840,6 +4840,7 @@ enum { ALC221_FIXUP_HP_FRONT_MIC, ALC292_FIXUP_TPT460, ALC298_FIXUP_SPK_VOLUME, + ALC256_FIXUP_DELL_INSPIRON_7559_SUBWOOFER, }; static const struct hda_fixup alc269_fixups[] = { @@ -5501,6 +5502,15 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC298_FIXUP_DELL1_MIC_NO_PRESENCE, }, + [ALC256_FIXUP_DELL_INSPIRON_7559_SUBWOOFER] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x1b, 0x90170151 }, + { } + }, + .chained = true, + .chain_id = ALC255_FIXUP_DELL1_MIC_NO_PRESENCE + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -5545,6 +5555,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x06df, "Dell", ALC293_FIXUP_DISABLE_AAMIX_MULTIJACK), SND_PCI_QUIRK(0x1028, 0x06e0, "Dell", ALC293_FIXUP_DISABLE_AAMIX_MULTIJACK), SND_PCI_QUIRK(0x1028, 0x0704, "Dell XPS 13 9350", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE), + SND_PCI_QUIRK(0x1028, 0x0706, "Dell Inspiron 7559", ALC256_FIXUP_DELL_INSPIRON_7559_SUBWOOFER), SND_PCI_QUIRK(0x1028, 0x0725, "Dell Inspiron 3162", ALC255_FIXUP_DELL_SPK_NOISE), SND_PCI_QUIRK(0x1028, 0x075b, "Dell XPS 13 9360", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE), SND_PCI_QUIRK(0x1028, 0x075d, "Dell AIO", ALC298_FIXUP_SPK_VOLUME), From 6664ad65f31ecb35f4f7d8208036b5dbc7640740 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sun, 28 Aug 2016 10:13:07 +0200 Subject: [PATCH 759/813] ALSA: timer: fix NULL pointer dereference in read()/ioctl() race commit 11749e086b2766cccf6217a527ef5c5604ba069c upstream. 
I got this with syzkaller: ================================================================== BUG: KASAN: null-ptr-deref on address 0000000000000020 Read of size 32 by task syz-executor/22519 CPU: 1 PID: 22519 Comm: syz-executor Not tainted 4.8.0-rc2+ #169 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.3-0-ge2fc41e-prebuilt.qemu-project.org 04/01/2 014 0000000000000001 ffff880111a17a00 ffffffff81f9f141 ffff880111a17a90 ffff880111a17c50 ffff880114584a58 ffff880114584a10 ffff880111a17a80 ffffffff8161fe3f ffff880100000000 ffff880118d74a48 ffff880118d74a68 Call Trace: [] dump_stack+0x83/0xb2 [] kasan_report_error+0x41f/0x4c0 [] kasan_report+0x34/0x40 [] ? snd_timer_user_read+0x554/0x790 [] check_memory_region+0x13e/0x1a0 [] kasan_check_read+0x11/0x20 [] snd_timer_user_read+0x554/0x790 [] ? snd_timer_user_info_compat.isra.5+0x2b0/0x2b0 [] ? proc_fault_inject_write+0x1c1/0x250 [] ? next_tgid+0x2a0/0x2a0 [] ? do_group_exit+0x108/0x330 [] ? fsnotify+0x72a/0xca0 [] __vfs_read+0x10e/0x550 [] ? snd_timer_user_info_compat.isra.5+0x2b0/0x2b0 [] ? do_sendfile+0xc50/0xc50 [] ? __fsnotify_update_child_dentry_flags+0x60/0x60 [] ? kcov_ioctl+0x56/0x190 [] ? common_file_perm+0x2e2/0x380 [] ? __fsnotify_parent+0x5e/0x2b0 [] ? security_file_permission+0x86/0x1e0 [] ? rw_verify_area+0xe5/0x2b0 [] vfs_read+0x115/0x330 [] SyS_read+0xd1/0x1a0 [] ? vfs_write+0x4b0/0x4b0 [] ? __this_cpu_preempt_check+0x1c/0x20 [] ? __context_tracking_exit.part.4+0x3a/0x1e0 [] ? vfs_write+0x4b0/0x4b0 [] do_syscall_64+0x1c4/0x4e0 [] ? syscall_return_slowpath+0x16c/0x1d0 [] entry_SYSCALL64_slow_path+0x25/0x25 ================================================================== There are a couple of problems that I can see: - ioctl(SNDRV_TIMER_IOCTL_SELECT), which potentially sets tu->queue/tu->tqueue to NULL on memory allocation failure, so read() would get a NULL pointer dereference like the above splat - the same ioctl() can free tu->queue/to->tqueue which means read() could potentially see (and dereference) the freed pointer We can fix both by taking the ioctl_lock mutex when dereferencing ->queue/->tqueue, since that's always held over all the ioctl() code. Just looking at the code I find it likely that there are more problems here such as tu->qhead pointing outside the buffer if the size is changed concurrently using SNDRV_TIMER_IOCTL_PARAMS. Signed-off-by: Vegard Nossum Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/timer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index 637d034bb084..8de029c17f90 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1967,6 +1967,7 @@ static ssize_t snd_timer_user_read(struct file *file, char __user *buffer, tu->qused--; spin_unlock_irq(&tu->qlock); + mutex_lock(&tu->ioctl_lock); if (tu->tread) { if (copy_to_user(buffer, &tu->tqueue[qhead], sizeof(struct snd_timer_tread))) @@ -1976,6 +1977,7 @@ static ssize_t snd_timer_user_read(struct file *file, char __user *buffer, sizeof(struct snd_timer_read))) err = -EFAULT; } + mutex_unlock(&tu->ioctl_lock); spin_lock_irq(&tu->qlock); if (err < 0) From 6fd91313bfcfd44cebcb2a55dbeeb606ee3cd710 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Mon, 29 Aug 2016 00:33:50 +0200 Subject: [PATCH 760/813] ALSA: timer: fix division by zero after SNDRV_TIMER_IOCTL_CONTINUE commit 6b760bb2c63a9e322c0e4a0b5daf335ad93d5a33 upstream. 
I got this: divide error: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 1327 Comm: a.out Not tainted 4.8.0-rc2+ #189 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.3-0-ge2fc41e-prebuilt.qemu-project.org 04/01/2014 task: ffff8801120a9580 task.stack: ffff8801120b0000 RIP: 0010:[] [] snd_hrtimer_callback+0x1da/0x3f0 RSP: 0018:ffff88011aa87da8 EFLAGS: 00010006 RAX: 0000000000004f76 RBX: ffff880112655e88 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff880112655ea0 RDI: 0000000000000001 RBP: ffff88011aa87e00 R08: ffff88013fff905c R09: ffff88013fff9048 R10: ffff88013fff9050 R11: 00000001050a7b8c R12: ffff880114778a00 R13: ffff880114778ab4 R14: ffff880114778b30 R15: 0000000000000000 FS: 00007f071647c700(0000) GS:ffff88011aa80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000603001 CR3: 0000000112021000 CR4: 00000000000006e0 Stack: 0000000000000000 ffff880114778ab8 ffff880112655ea0 0000000000004f76 ffff880112655ec8 ffff880112655e80 ffff880112655e88 ffff88011aa98fc0 00000000b97ccf2b dffffc0000000000 ffff88011aa98fc0 ffff88011aa87ef0 Call Trace: [] __hrtimer_run_queues+0x347/0xa00 [] ? snd_hrtimer_close+0x130/0x130 [] ? retrigger_next_event+0x1b0/0x1b0 [] ? hrtimer_interrupt+0x136/0x4b0 [] hrtimer_interrupt+0x1b0/0x4b0 [] local_apic_timer_interrupt+0x6e/0xf0 [] ? kvm_guest_apic_eoi_write+0x13/0xc0 [] smp_apic_timer_interrupt+0x76/0xa0 [] apic_timer_interrupt+0x8c/0xa0 [] ? _raw_spin_unlock_irqrestore+0x2c/0x60 [] snd_timer_start1+0xdd/0x670 [] snd_timer_continue+0x45/0x80 [] snd_timer_user_ioctl+0x1030/0x2830 [] ? __follow_pte.isra.49+0x430/0x430 [] ? snd_timer_pause+0x80/0x80 [] ? do_wp_page+0x3aa/0x1c90 [] ? handle_mm_fault+0xbc8/0x27f0 [] ? __pmd_alloc+0x370/0x370 [] ? snd_timer_pause+0x80/0x80 [] do_vfs_ioctl+0x193/0x1050 [] ? ioctl_preallocate+0x200/0x200 [] ? syscall_trace_enter+0x3cf/0xdb0 [] ? __context_tracking_exit.part.4+0x9a/0x1e0 [] ? exit_to_usermode_loop+0x190/0x190 [] ? check_preemption_disabled+0x37/0x1e0 [] ? security_file_ioctl+0x89/0xb0 [] SyS_ioctl+0x8f/0xc0 [] ? do_vfs_ioctl+0x1050/0x1050 [] do_syscall_64+0x1c4/0x4e0 [] entry_SYSCALL64_slow_path+0x25/0x25 Code: e8 fc 42 7b fe 8b 0d 06 8a 50 03 49 0f af cf 48 85 c9 0f 88 7c 01 00 00 48 89 4d a8 e8 e0 42 7b fe 48 8b 45 c0 48 8b 4d a8 48 99 <48> f7 f9 49 01 c7 e8 cb 42 7b fe 48 8b 55 d0 48 b8 00 00 00 00 RIP [] snd_hrtimer_callback+0x1da/0x3f0 RSP ---[ end trace 6aa380f756a21074 ]--- The problem happens when you call ioctl(SNDRV_TIMER_IOCTL_CONTINUE) on a completely new/unused timer -- it will have ->sticks == 0, which causes a divide by 0 in snd_hrtimer_callback(). Signed-off-by: Vegard Nossum Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index 8de029c17f90..2bf701fac50c 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -837,6 +837,7 @@ int snd_timer_new(struct snd_card *card, char *id, struct snd_timer_id *tid, timer->tmr_subdevice = tid->subdevice; if (id) strlcpy(timer->id, id, sizeof(timer->id)); + timer->sticks = 1; INIT_LIST_HEAD(&timer->device_list); INIT_LIST_HEAD(&timer->open_list_head); INIT_LIST_HEAD(&timer->active_list_head); From f5b004e4edb87ea4c0a72a72191f33d372d08251 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Mon, 29 Aug 2016 00:33:51 +0200 Subject: [PATCH 761/813] ALSA: timer: fix NULL pointer dereference on memory allocation failure commit 8ddc05638ee42b18ba4fe99b5fb647fa3ad20456 upstream. 
I hit this with syzkaller: kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 1327 Comm: a.out Not tainted 4.8.0-rc2+ #190 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.3-0-ge2fc41e-prebuilt.qemu-project.org 04/01/2014 task: ffff88011278d600 task.stack: ffff8801120c0000 RIP: 0010:[] [] snd_hrtimer_start+0x77/0x100 RSP: 0018:ffff8801120c7a60 EFLAGS: 00010006 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000007 RDX: 0000000000000009 RSI: 1ffff10023483091 RDI: 0000000000000048 RBP: ffff8801120c7a78 R08: ffff88011a5cf768 R09: ffff88011a5ba790 R10: 0000000000000002 R11: ffffed00234b9ef1 R12: ffff880114843980 R13: ffffffff84213c00 R14: ffff880114843ab0 R15: 0000000000000286 FS: 00007f72958f3700(0000) GS:ffff88011aa00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000603001 CR3: 00000001126ab000 CR4: 00000000000006f0 Stack: ffff880114843980 ffff880111eb2dc0 ffff880114843a34 ffff8801120c7ad0 ffffffff82c81ab1 0000000000000000 ffffffff842138e0 0000000100000000 ffff880111eb2dd0 ffff880111eb2dc0 0000000000000001 ffff880111eb2dc0 Call Trace: [] snd_timer_start1+0x331/0x670 [] snd_timer_start+0x5d/0xa0 [] snd_timer_user_ioctl+0x88e/0x2830 [] ? __follow_pte.isra.49+0x430/0x430 [] ? snd_timer_pause+0x80/0x80 [] ? do_wp_page+0x3aa/0x1c90 [] ? put_prev_entity+0x108f/0x21a0 [] ? snd_timer_pause+0x80/0x80 [] do_vfs_ioctl+0x193/0x1050 [] ? cpuacct_account_field+0x12f/0x1a0 [] ? ioctl_preallocate+0x200/0x200 [] ? syscall_trace_enter+0x3cf/0xdb0 [] ? __context_tracking_exit.part.4+0x9a/0x1e0 [] ? exit_to_usermode_loop+0x190/0x190 [] ? check_preemption_disabled+0x37/0x1e0 [] ? security_file_ioctl+0x89/0xb0 [] SyS_ioctl+0x8f/0xc0 [] ? 
do_vfs_ioctl+0x1050/0x1050 [] do_syscall_64+0x1c4/0x4e0 [] entry_SYSCALL64_slow_path+0x25/0x25 Code: c7 c7 c4 b9 c8 82 48 89 d9 4c 89 ee e8 63 88 7f fe e8 7e 46 7b fe 48 8d 7b 48 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04 02 84 c0 74 04 84 c0 7e 65 80 7b 48 00 74 0e e8 52 46 RIP [] snd_hrtimer_start+0x77/0x100 RSP ---[ end trace 5955b08db7f2b029 ]--- This can happen if snd_hrtimer_open() fails to allocate memory and returns an error, which is currently not checked by snd_timer_open(): ioctl(SNDRV_TIMER_IOCTL_SELECT) - snd_timer_user_tselect() - snd_timer_close() - snd_hrtimer_close() - (struct snd_timer *) t->private_data = NULL - snd_timer_open() - snd_hrtimer_open() - kzalloc() fails; t->private_data is still NULL ioctl(SNDRV_TIMER_IOCTL_START) - snd_timer_user_start() - snd_timer_start() - snd_timer_start1() - snd_hrtimer_start() - t->private_data == NULL // boom Signed-off-by: Vegard Nossum Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/timer.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/sound/core/timer.c b/sound/core/timer.c index 2bf701fac50c..ae4ea2e2e7fe 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -296,8 +296,21 @@ int snd_timer_open(struct snd_timer_instance **ti, get_device(&timer->card->card_dev); timeri->slave_class = tid->dev_sclass; timeri->slave_id = slave_id; - if (list_empty(&timer->open_list_head) && timer->hw.open) - timer->hw.open(timer); + + if (list_empty(&timer->open_list_head) && timer->hw.open) { + int err = timer->hw.open(timer); + if (err) { + kfree(timeri->owner); + kfree(timeri); + + if (timer->card) + put_device(&timer->card->card_dev); + module_put(timer->module); + mutex_unlock(&register_mutex); + return err; + } + } + list_add_tail(&timeri->open_list, &timer->open_list_head); snd_timer_check_master(timeri); mutex_unlock(&register_mutex); From 6f0caecda50f411841d9791e1fe54c30d33969b7 Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Fri, 12 Aug 2016 17:20:07 -0500 Subject: [PATCH 762/813] scsi: fix upper bounds check of sense key in scsi_sense_key_string() commit a87eeb900dbb9f8202f96604d56e47e67c936b9d upstream. Commit 655ee63cf371 ("scsi constants: command, sense key + additional sense string") added a "Completed" sense string with key 0xF to snstext[], but failed to update the upper bounds check of the sense key in scsi_sense_key_string(). Fixes: 655ee63cf371 ("[SCSI] scsi constants: command, sense key + additional sense strings") Signed-off-by: Tyrel Datwyler Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/constants.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/constants.c b/drivers/scsi/constants.c index fa09d4be2b53..2b456ca69d5c 100644 --- a/drivers/scsi/constants.c +++ b/drivers/scsi/constants.c @@ -1181,8 +1181,9 @@ static const char * const snstext[] = { /* Get sense key string or NULL if not available */ const char * -scsi_sense_key_string(unsigned char key) { - if (key <= 0xE) +scsi_sense_key_string(unsigned char key) +{ + if (key < ARRAY_SIZE(snstext)) return snstext[key]; return NULL; } From 73e6305cf0f89ee66abe522a75eb614212b1c8a2 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 3 May 2016 09:11:21 +0100 Subject: [PATCH 763/813] metag: Fix atomic_*_return inline asm constraints commit 096a8b6d5e7ab9f8ca3d2474b3ca6a1fe79e0371 upstream.
The argument i of atomic_*_return() operations is given to inline asm with the "bd" constraint, which means "An Op2 register where Op1 is a data unit register and the instruction supports O2R"; however, Op1 is constrained by "da", which allows an address unit register to be used. Fix the constraint to use "br", meaning "An Op2 register and the instruction supports O2R", i.e. not requiring Op1 to be a data unit register. Fixes: d6dfe2509da9 ("locking,arch,metag: Fold atomic_ops") Signed-off-by: James Hogan Cc: Peter Zijlstra Cc: linux-metag@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/metag/include/asm/atomic_lnkget.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/metag/include/asm/atomic_lnkget.h b/arch/metag/include/asm/atomic_lnkget.h index a62581815624..88fa25fae8bd 100644 --- a/arch/metag/include/asm/atomic_lnkget.h +++ b/arch/metag/include/asm/atomic_lnkget.h @@ -61,7 +61,7 @@ static inline int atomic_##op##_return(int i, atomic_t *v) \ " CMPT %0, #HI(0x02000000)\n" \ " BNZ 1b\n" \ : "=&d" (temp), "=&da" (result) \ - : "da" (&v->counter), "bd" (i) \ + : "da" (&v->counter), "br" (i) \ : "cc"); \ \ smp_mb(); \ From aaac8cab2f92aa235f6f79e8cff52bf11c0fbf1d Mon Sep 17 00:00:00 2001 From: Sai Gurrappadi Date: Fri, 29 Apr 2016 14:44:37 -0700 Subject: [PATCH 764/813] cpufreq: Fix GOV_LIMITS handling for the userspace governor commit e43e94c1eda76dabd686ddf6f7825f54d747b310 upstream. Currently, the userspace governor only updates frequency on GOV_LIMITS if policy->cur falls outside policy->{min/max}. However, it is also necessary to update the current frequency on GOV_LIMITS to match the user requested value if it can be achieved within the new policy->{max/min}. This was previously the behaviour in the governor until commit d1922f0 ("cpufreq: Simplify userspace governor"), which incorrectly assumed that policy->cur == user requested frequency via scaling_setspeed. This won't be true if the user requested frequency falls outside policy->{min/max}. For example, a temporary thermal cap may have throttled the user requested frequency. Fix this by storing the user requested frequency in a separate variable. The governor will then try to achieve this request on every GOV_LIMITS change. Fixes: d1922f02562f (cpufreq: Simplify userspace governor) Signed-off-by: Sai Gurrappadi Acked-by: Viresh Kumar Signed-off-by: Rafael J.
Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/cpufreq/cpufreq_userspace.c | 43 +++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c index 4dbf1db16aca..9cc8abd3d116 100644 --- a/drivers/cpufreq/cpufreq_userspace.c +++ b/drivers/cpufreq/cpufreq_userspace.c @@ -17,6 +17,7 @@ #include #include #include +#include static DEFINE_PER_CPU(unsigned int, cpu_is_managed); static DEFINE_MUTEX(userspace_mutex); @@ -31,6 +32,7 @@ static DEFINE_MUTEX(userspace_mutex); static int cpufreq_set(struct cpufreq_policy *policy, unsigned int freq) { int ret = -EINVAL; + unsigned int *setspeed = policy->governor_data; pr_debug("cpufreq_set for cpu %u, freq %u kHz\n", policy->cpu, freq); @@ -38,6 +40,8 @@ static int cpufreq_set(struct cpufreq_policy *policy, unsigned int freq) if (!per_cpu(cpu_is_managed, policy->cpu)) goto err; + *setspeed = freq; + ret = __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L); err: mutex_unlock(&userspace_mutex); @@ -49,19 +53,45 @@ static ssize_t show_speed(struct cpufreq_policy *policy, char *buf) return sprintf(buf, "%u\n", policy->cur); } +static int cpufreq_userspace_policy_init(struct cpufreq_policy *policy) +{ + unsigned int *setspeed; + + setspeed = kzalloc(sizeof(*setspeed), GFP_KERNEL); + if (!setspeed) + return -ENOMEM; + + policy->governor_data = setspeed; + return 0; +} + static int cpufreq_governor_userspace(struct cpufreq_policy *policy, unsigned int event) { + unsigned int *setspeed = policy->governor_data; unsigned int cpu = policy->cpu; int rc = 0; + if (event == CPUFREQ_GOV_POLICY_INIT) + return cpufreq_userspace_policy_init(policy); + + if (!setspeed) + return -EINVAL; + switch (event) { + case CPUFREQ_GOV_POLICY_EXIT: + mutex_lock(&userspace_mutex); + policy->governor_data = NULL; + kfree(setspeed); + mutex_unlock(&userspace_mutex); + break; case CPUFREQ_GOV_START: BUG_ON(!policy->cur); pr_debug("started managing cpu %u\n", cpu); mutex_lock(&userspace_mutex); per_cpu(cpu_is_managed, cpu) = 1; + *setspeed = policy->cur; mutex_unlock(&userspace_mutex); break; case CPUFREQ_GOV_STOP: @@ -69,20 +99,23 @@ static int cpufreq_governor_userspace(struct cpufreq_policy *policy, mutex_lock(&userspace_mutex); per_cpu(cpu_is_managed, cpu) = 0; + *setspeed = 0; mutex_unlock(&userspace_mutex); break; case CPUFREQ_GOV_LIMITS: mutex_lock(&userspace_mutex); - pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n", - cpu, policy->min, policy->max, - policy->cur); + pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz, last set to %u kHz\n", + cpu, policy->min, policy->max, policy->cur, *setspeed); - if (policy->max < policy->cur) + if (policy->max < *setspeed) __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); - else if (policy->min > policy->cur) + else if (policy->min > *setspeed) __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); + else + __cpufreq_driver_target(policy, *setspeed, + CPUFREQ_RELATION_L); mutex_unlock(&userspace_mutex); break; } From 390bddc4a36d2ce015a9d78d8624e575cbac9cd3 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 14 Mar 2016 09:07:14 +0900 Subject: [PATCH 765/813] hwrng: exynos - Disable runtime PM on probe failure commit 48a61e1e2af8020f11a2b8f8dc878144477623c6 upstream. Add proper error path (for disabling runtime PM) when registering of hwrng fails. 
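The fix follows the usual probe-unwind rule: once pm_runtime_enable() has run, every later failure path must disable runtime PM again before returning, or the device is left with an unbalanced enable count the next time the driver binds. A minimal sketch of that pattern, not the exynos-rng source itself; example_register() is a placeholder for whatever call can still fail after runtime PM is set up:

    #include <linux/platform_device.h>
    #include <linux/pm_runtime.h>

    extern int example_register(struct device *dev);    /* placeholder */

    static int example_probe(struct platform_device *pdev)
    {
            int ret;

            pm_runtime_use_autosuspend(&pdev->dev);
            pm_runtime_enable(&pdev->dev);

            ret = example_register(&pdev->dev);
            if (ret) {
                    /* unwind in reverse order of the setup above */
                    pm_runtime_dont_use_autosuspend(&pdev->dev);
                    pm_runtime_disable(&pdev->dev);
            }
            return ret;
    }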
Fixes: b329669ea0b5 ("hwrng: exynos - Add support for Exynos random number generator") Signed-off-by: Krzysztof Kozlowski Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- drivers/char/hw_random/exynos-rng.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/char/hw_random/exynos-rng.c b/drivers/char/hw_random/exynos-rng.c index aa30af5f0f2b..7845a38b6604 100644 --- a/drivers/char/hw_random/exynos-rng.c +++ b/drivers/char/hw_random/exynos-rng.c @@ -118,6 +118,7 @@ static int exynos_rng_probe(struct platform_device *pdev) { struct exynos_rng *exynos_rng; struct resource *res; + int ret; exynos_rng = devm_kzalloc(&pdev->dev, sizeof(struct exynos_rng), GFP_KERNEL); @@ -145,7 +146,13 @@ static int exynos_rng_probe(struct platform_device *pdev) pm_runtime_use_autosuspend(&pdev->dev); pm_runtime_enable(&pdev->dev); - return devm_hwrng_register(&pdev->dev, &exynos_rng->rng); + ret = devm_hwrng_register(&pdev->dev, &exynos_rng->rng); + if (ret) { + pm_runtime_dont_use_autosuspend(&pdev->dev); + pm_runtime_disable(&pdev->dev); + } + + return ret; } #ifdef CONFIG_PM From 3838b04f6d5868f5ec5ee52ce1391b6253c6ce16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20B=C3=A5tsman?= Date: Fri, 17 Jun 2016 13:31:37 +0300 Subject: [PATCH 766/813] regulator: anatop: allow regulator to be in bypass mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8a092e682f20f193f2070dba2ea1904e95814126 upstream. Bypass support was added in commit d38018f2019c ("regulator: anatop: Add bypass support to digital LDOs"). A check for valid voltage selectors was added in commit da0607c8df5c ("regulator: anatop: Fail on invalid voltage selector") but it also discards all regulators that are in bypass mode. Add check for the bypass setting. Errors below were seen on a Variscite mx6 board. anatop_regulator 20c8000.anatop:regulator-vddcore@140: Failed to read a valid default voltage selector. anatop_regulator: probe of 20c8000.anatop:regulator-vddcore@140 failed with error -22 anatop_regulator 20c8000.anatop:regulator-vddsoc@140: Failed to read a valid default voltage selector. anatop_regulator: probe of 20c8000.anatop:regulator-vddsoc@140 failed with error -22 Fixes: da0607c8df5c ("regulator: anatop: Fail on invalid voltage selector") Signed-off-by: Mika Båtsman Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- drivers/regulator/anatop-regulator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/anatop-regulator.c b/drivers/regulator/anatop-regulator.c index 63cd5e68c864..3a6d0290c54c 100644 --- a/drivers/regulator/anatop-regulator.c +++ b/drivers/regulator/anatop-regulator.c @@ -296,7 +296,7 @@ static int anatop_regulator_probe(struct platform_device *pdev) if (!sreg->sel && !strcmp(sreg->name, "vddpu")) sreg->sel = 22; - if (!sreg->sel) { + if (!sreg->bypass && !sreg->sel) { dev_err(&pdev->dev, "Failed to read a valid default voltage selector.\n"); return -EINVAL; } From 11c27e850d90372d79c1337be299923b5fb353e8 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Tue, 22 Mar 2016 13:12:35 +0100 Subject: [PATCH 767/813] lib/mpi: mpi_write_sgl(): fix skipping of leading zero limbs commit f2d1362ff7d266b3d2b1c764d6c2ef4a3b457f23 upstream. Currently, if the number of leading zeros is greater than fits into a complete limb, mpi_write_sgl() skips them by iterating over them limb-wise. 
However, it fails to adjust its internal leading zeros tracking variable, lzeros, accordingly: it does a p -= sizeof(alimb); continue; which should really have been a lzeros -= sizeof(alimb); continue; Since lzeros never decreases if its initial value >= sizeof(alimb), nothing gets copied by mpi_write_sgl() in that case. Instead of skipping the high order zero limbs within the loop as shown above, fix the issue by adjusting the copying loop's bounds. Fixes: 2d4d1eea540b ("lib/mpi: Add mpi sgl helpers") Signed-off-by: Nicolai Stange Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- lib/mpi/mpicoder.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/lib/mpi/mpicoder.c b/lib/mpi/mpicoder.c index e00ff00e861c..e37dbf53e226 100644 --- a/lib/mpi/mpicoder.c +++ b/lib/mpi/mpicoder.c @@ -367,7 +367,9 @@ int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned *nbytes, buf_len = sgl->length; p2 = sg_virt(sgl); - for (i = a->nlimbs - 1; i >= 0; i--) { + for (i = a->nlimbs - 1 - lzeros / BYTES_PER_MPI_LIMB, + lzeros %= BYTES_PER_MPI_LIMB; + i >= 0; i--) { alimb = a->d[i]; p = (u8 *)&alimb2; #if BYTES_PER_MPI_LIMB == 4 @@ -388,17 +390,12 @@ int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned *nbytes, #error please implement for this limb size. #endif if (lzeros > 0) { - if (lzeros >= sizeof(alimb)) { - p -= sizeof(alimb); - continue; - } else { - mpi_limb_t *limb1 = (void *)p - sizeof(alimb); - mpi_limb_t *limb2 = (void *)p - sizeof(alimb) + lzeros; - *limb1 = *limb2; - p -= lzeros; - y = lzeros; - } + mpi_limb_t *limb1 = (void *)p - sizeof(alimb); + mpi_limb_t *limb2 = (void *)p - sizeof(alimb) + + lzeros; + *limb1 = *limb2; + p -= lzeros; + y = lzeros; lzeros -= sizeof(alimb); } From 1d074db69c46d62ce82b331c2080e2fcb710bf4a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 15 Sep 2016 08:29:29 +0200 Subject: [PATCH 768/813] Linux 4.4.21 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b74d60081a16..d1cc9e0b7473 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 20 +SUBLEVEL = 21 EXTRAVERSION = NAME = Blurry Fish Butt From dacbe1f2be8f8d84a39a7926c21a2ac0d1bd5263 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:55:12 -0700 Subject: [PATCH 769/813] mm/slub: support left redzone SLUB already has a redzone debugging feature. But it is only positioned at the end of the object (aka the right redzone), so it cannot catch a left OOB. Although the current object's right redzone acts as the left redzone of the next object, the first object in a slab cannot take advantage of this effect. This patch explicitly adds a left redzone to each object to detect a left OOB more precisely. Background: someone complained to me that a left OOB is not caught even if KASAN, which does page allocation debugging, is enabled. That page is out of our control, so it may already be allocated when the left OOB happens, and in this case we can't find the OOB. Moreover, the SLUB debugging feature can be enabled without page allocator debugging, and in this case we will miss that OOB. Before trying to implement this, I expected that the changes would be too complex, but it doesn't look that complex to me now. Almost all of the changes are applied to debug-specific functions, so I feel okay.
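The layout change is easy to model in plain C: poison a zone on both sides of each object, and a write to object[-1] then lands in that object's own left redzone instead of silently hitting whatever precedes it (which, for the first object in a slab, is not another object's right redzone). A userspace sketch of the idea only, not the SLUB code; the sizes and the poison byte are made up:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define RED_ZONE 8          /* bytes of redzone on each side */
    #define OBJ_SIZE 32
    #define POISON   0xbb       /* canary byte, invented for this demo */

    int main(void)
    {
            /* layout: [left redzone][object][right redzone] */
            unsigned char *slot = malloc(RED_ZONE + OBJ_SIZE + RED_ZONE);
            unsigned char *obj = slot + RED_ZONE;

            memset(slot, POISON, RED_ZONE);             /* left redzone */
            memset(obj + OBJ_SIZE, POISON, RED_ZONE);   /* right redzone */

            obj[-1] = 0;        /* a left out-of-bounds write */

            for (int i = 0; i < RED_ZONE; i++)
                    if (slot[i] != POISON)
                            printf("left redzone corrupted at offset %d\n", i);

            free(slot);
            return 0;
    }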
Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit d86bd1bece6fc41d59253002db5441fe960a37f6) Signed-off-by: Alex Shi --- include/linux/slub_def.h | 1 + mm/slub.c | 100 +++++++++++++++++++++++++++------------ 2 files changed, 72 insertions(+), 29 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 33885118523c..f4e857e920cd 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -81,6 +81,7 @@ struct kmem_cache { int reserved; /* Reserved bytes at the end of slabs */ const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ + int red_left_pad; /* Left redzone padding size */ #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ #endif diff --git a/mm/slub.c b/mm/slub.c index fbadb3753d4d..41f7cae64a49 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #endif } +static inline void *fixup_red_left(struct kmem_cache *s, void *p) +{ + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) + p += s->red_left_pad; + + return p; +} + static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) { #ifdef CONFIG_SLUB_CPU_PARTIAL @@ -224,24 +232,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -/* Verify that a pointer has an address that is valid within a slab page */ -static inline int check_valid_pointer(struct kmem_cache *s, - struct page *page, const void *object) -{ - void *base; - - if (!object) - return 1; - - base = page_address(page); - if (object < base || object >= base + page->objects * s->size || - (object - base) % s->size) { - return 0; - } - - return 1; -} - static inline void *get_freepointer(struct kmem_cache *s, void *object) { return *(void **)(object + s->offset); @@ -271,12 +261,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) /* Loop over all objects in a slab */ #define for_each_object(__p, __s, __addr, __objects) \ - for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ - __p += (__s)->size) + for (__p = fixup_red_left(__s, __addr); \ + __p < (__addr) + (__objects) * (__s)->size; \ + __p += (__s)->size) #define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ - for (__p = (__addr), __idx = 1; __idx <= __objects;\ - __p += (__s)->size, __idx++) + for (__p = fixup_red_left(__s, __addr), __idx = 1; \ + __idx <= __objects; \ + __p += (__s)->size, __idx++) /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) @@ -456,6 +448,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) set_bit(slab_index(p, s, addr), map); } +static inline int size_from_object(struct kmem_cache *s) +{ + if (s->flags & SLAB_RED_ZONE) + return s->size - s->red_left_pad; + + return s->size; +} + +static inline void *restore_red_left(struct kmem_cache *s, void *p) +{ + if (s->flags & SLAB_RED_ZONE) + p -= s->red_left_pad; + + return p; +} + /* * Debug settings: */ @@ -489,6 +497,26 @@ static inline void metadata_access_disable(void) /* * Object debugging */ + +/* Verify that a pointer has an address that is valid within a slab page */ +static inline int check_valid_pointer(struct kmem_cache *s, + struct page *page, void *object) +{ + void *base; + + if 
(!object) + return 1; + + base = page_address(page); + object = restore_red_left(s, object); + if (object < base || object >= base + page->objects * s->size || + (object - base) % s->size) { + return 0; + } + + return 1; +} + static void print_section(char *text, u8 *addr, unsigned int length) { metadata_access_enable(); @@ -628,7 +656,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); - if (p > addr + 16) + if (s->flags & SLAB_RED_ZONE) + print_section("Redzone ", p - s->red_left_pad, s->red_left_pad); + else if (p > addr + 16) print_section("Bytes b4 ", p - 16, 16); print_section("Object ", p, min_t(unsigned long, s->object_size, @@ -645,9 +675,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); - if (off != s->size) + if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ - print_section("Padding ", p + off, s->size - off); + print_section("Padding ", p + off, size_from_object(s) - off); dump_stack(); } @@ -677,6 +707,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = object; + if (s->flags & SLAB_RED_ZONE) + memset(p - s->red_left_pad, val, s->red_left_pad); + if (s->flags & __OBJECT_POISON) { memset(p, POISON_FREE, s->object_size - 1); p[s->object_size - 1] = POISON_END; @@ -769,11 +802,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) /* We also have user information there */ off += 2 * sizeof(struct track); - if (s->size == off) + if (size_from_object(s) == off) return 1; return check_bytes_and_report(s, page, p, "Object padding", - p + off, POISON_INUSE, s->size - off); + p + off, POISON_INUSE, size_from_object(s) - off); } /* Check the pad bytes at the end of a slab page */ @@ -817,6 +850,10 @@ static int check_object(struct kmem_cache *s, struct page *page, u8 *endobject = object + s->object_size; if (s->flags & SLAB_RED_ZONE) { + if (!check_bytes_and_report(s, page, object, "Redzone", + object - s->red_left_pad, val, s->red_left_pad)) + return 0; + if (!check_bytes_and_report(s, page, object, "Redzone", endobject, val, s->inuse - s->object_size)) return 0; @@ -1468,7 +1505,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, p, NULL); } - page->freelist = start; + page->freelist = fixup_red_left(s, start); page->inuse = page->objects; page->frozen = 1; @@ -3283,7 +3320,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ size += 2 * sizeof(struct track); - if (flags & SLAB_RED_ZONE) + if (flags & SLAB_RED_ZONE) { /* * Add some empty padding so that we can catch * overwrites from earlier objects rather than let @@ -3292,6 +3329,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * of the object. 
*/ size += sizeof(void *); + + s->red_left_pad = sizeof(void *); + s->red_left_pad = ALIGN(s->red_left_pad, s->align); + size += s->red_left_pad; + } #endif /* From 455003e59e7981b5fc2e660b3d93b74b85a1a55f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 22 Aug 2016 11:53:59 -0500 Subject: [PATCH 770/813] usercopy: fix overlap check for kernel text When running with a local patch which moves the '_stext' symbol to the very beginning of the kernel text area, I got the following panic with CONFIG_HARDENED_USERCOPY: usercopy: kernel memory exposure attempt detected from ffff88103dfff000 () (4096 bytes) ------------[ cut here ]------------ kernel BUG at mm/usercopy.c:79! invalid opcode: 0000 [#1] SMP ... CPU: 0 PID: 4800 Comm: cp Not tainted 4.8.0-rc3.after+ #1 Hardware name: Dell Inc. PowerEdge R720/0X3D66, BIOS 2.5.4 01/22/2016 task: ffff880817444140 task.stack: ffff880816274000 RIP: 0010:[] __check_object_size+0x76/0x413 RSP: 0018:ffff880816277c40 EFLAGS: 00010246 RAX: 000000000000006b RBX: ffff88103dfff000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff88081f80dfa8 RDI: ffff88081f80dfa8 RBP: ffff880816277c90 R08: 000000000000054c R09: 0000000000000000 R10: 0000000000000005 R11: 0000000000000006 R12: 0000000000001000 R13: ffff88103e000000 R14: ffff88103dffffff R15: 0000000000000001 FS: 00007fb9d1750800(0000) GS:ffff88081f800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000021d2000 CR3: 000000081a08f000 CR4: 00000000001406f0 Stack: ffff880816277cc8 0000000000010000 000000043de07000 0000000000000000 0000000000001000 ffff880816277e60 0000000000001000 ffff880816277e28 000000000000c000 0000000000001000 ffff880816277ce8 ffffffff8136c3a6 Call Trace: [] copy_page_to_iter_iovec+0xa6/0x1c0 [] copy_page_to_iter+0x16/0x90 [] generic_file_read_iter+0x3e3/0x7c0 [] ? xfs_file_buffered_aio_write+0xad/0x260 [xfs] [] ? down_read+0x12/0x40 [] xfs_file_buffered_aio_read+0x51/0xc0 [xfs] [] xfs_file_read_iter+0x62/0xb0 [xfs] [] __vfs_read+0xdf/0x130 [] vfs_read+0x8e/0x140 [] SyS_read+0x55/0xc0 [] do_syscall_64+0x67/0x160 [] entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:[<00007fb9d0c33c00>] 0x7fb9d0c33c00 RSP: 002b:00007ffc9c262f28 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: fffffffffff8ffff RCX: 00007fb9d0c33c00 RDX: 0000000000010000 RSI: 00000000021c3000 RDI: 0000000000000004 RBP: 00000000021c3000 R08: 0000000000000000 R09: 00007ffc9c264d6c R10: 00007ffc9c262c50 R11: 0000000000000246 R12: 0000000000010000 R13: 00007ffc9c2630b0 R14: 0000000000000004 R15: 0000000000010000 Code: 81 48 0f 44 d0 48 c7 c6 90 4d a3 81 48 c7 c0 bb b3 a2 81 48 0f 44 f0 4d 89 e1 48 89 d9 48 c7 c7 68 16 a3 81 31 c0 e8 f4 57 f7 ff <0f> 0b 48 8d 90 00 40 00 00 48 39 d3 0f 83 22 01 00 00 48 39 c3 RIP [] __check_object_size+0x76/0x413 RSP The checked object's range [ffff88103dfff000, ffff88103e000000) is valid, so there shouldn't have been a BUG. The hardened usercopy code got confused because the range's ending address is the same as the kernel's text starting address at 0xffff88103e000000. The overlap check is slightly off. 
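Both ranges here are half-open intervals: the copied object is [check_low, check_low + n) and the text region is [low, high). Two half-open intervals are disjoint exactly when one ends at or before the start of the other, so the rejection test needs <=, not <. A standalone sketch of the corrected check against the boundary case from the panic above (the end address of the text region is a made-up value; only its start matters for this case):

    #include <stdbool.h>
    #include <stdio.h>

    /* both intervals half-open: [low, high) and [check_low, check_low + n) */
    static bool overlaps(unsigned long check_low, unsigned long n,
                         unsigned long low, unsigned long high)
    {
            unsigned long check_high = check_low + n;

            /* disjoint if entirely above or entirely below */
            return !(check_low >= high || check_high <= low);
    }

    int main(void)
    {
            unsigned long text_low  = 0xffff88103e000000UL;
            unsigned long text_high = 0xffff88103e100000UL; /* invented end */
            unsigned long obj       = 0xffff88103dfff000UL;

            /* 4096-byte object ends exactly where the text begins: no overlap */
            printf("overlap: %d\n", overlaps(obj, 4096, text_low, text_high));
            return 0;
    }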
Fixes: f5509cc18daa ("mm: Hardened usercopy") Signed-off-by: Josh Poimboeuf Signed-off-by: Kees Cook (cherry picked from commit 94cd97af690dd9537818dc9841d0ec68bb1dd877) Signed-off-by: Alex Shi --- mm/usercopy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index 8ebae91a6b55..6b1c20f9d486 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -83,7 +83,7 @@ static bool overlaps(const void *ptr, unsigned long n, unsigned long low, unsigned long check_high = check_low + n; /* Does not overlap if entirely above or entirely below. */ - if (check_low >= high || check_high < low) + if (check_low >= high || check_high <= low) return false; return true; From 4663a70283d6b3bb1270627d2dd3fc3ba94cd723 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Sep 2016 09:54:34 -0700 Subject: [PATCH 771/813] usercopy: remove page-spanning test for now A custom allocator without __GFP_COMP that copies to userspace has been found in vmw_execbuf_process[1], so this disables the page-span checker by placing it behind a CONFIG for future work where such things can be tracked down later. [1] https://bugzilla.redhat.com/show_bug.cgi?id=1373326 Reported-by: Vinson Lee Fixes: f5509cc18daa ("mm: Hardened usercopy") Signed-off-by: Kees Cook (cherry picked from commit 8e1f74ea02cf4562404c48c6882214821552c13f) Signed-off-by: Alex Shi --- mm/usercopy.c | 61 +++++++++++++++++++++++++++--------------------- security/Kconfig | 11 +++++++++ 2 files changed, 46 insertions(+), 26 deletions(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index 6b1c20f9d486..c56b97b7c49c 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -134,30 +134,15 @@ static inline const char *check_bogus_address(const void *ptr, unsigned long n) return NULL; } -static inline const char *check_heap_object(const void *ptr, unsigned long n, - bool to_user) +/* Checks for allocs that are marked in some way as spanning multiple pages. */ +static inline const char *check_page_span(const void *ptr, unsigned long n, + struct page *page, bool to_user) { - struct page *page, *endpage; +#ifdef CONFIG_HARDENED_USERCOPY_PAGESPAN const void *end = ptr + n - 1; + struct page *endpage; bool is_reserved, is_cma; - /* - * Some architectures (arm64) return true for virt_addr_valid() on - * vmalloced addresses. Work around this by checking for vmalloc - * first. - */ - if (is_vmalloc_addr(ptr)) - return NULL; - - if (!virt_addr_valid(ptr)) - return NULL; - - page = virt_to_head_page(ptr); - - /* Check slab allocator for flags and size. */ - if (PageSlab(page)) - return __check_heap_object(ptr, n, page); - /* * Sometimes the kernel data regions are not marked Reserved (see * check below). And sometimes [_sdata,_edata) does not cover @@ -186,7 +171,7 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n, ((unsigned long)end & (unsigned long)PAGE_MASK))) return NULL; - /* Allow if start and end are inside the same compound page. */ + /* Allow if fully inside the same compound (__GFP_COMP) page. 
*/ endpage = virt_to_head_page(end); if (likely(endpage == page)) return NULL; @@ -199,20 +184,44 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n, is_reserved = PageReserved(page); is_cma = is_migrate_cma_page(page); if (!is_reserved && !is_cma) - goto reject; + return ""; for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) { page = virt_to_head_page(ptr); if (is_reserved && !PageReserved(page)) - goto reject; + return ""; if (is_cma && !is_migrate_cma_page(page)) - goto reject; + return ""; } +#endif return NULL; +} -reject: - return ""; +static inline const char *check_heap_object(const void *ptr, unsigned long n, + bool to_user) +{ + struct page *page; + + /* + * Some architectures (arm64) return true for virt_addr_valid() on + * vmalloced addresses. Work around this by checking for vmalloc + * first. + */ + if (is_vmalloc_addr(ptr)) + return NULL; + + if (!virt_addr_valid(ptr)) + return NULL; + + page = virt_to_head_page(ptr); + + /* Check slab allocator for flags and size. */ + if (PageSlab(page)) + return __check_heap_object(ptr, n, page); + + /* Verify object does not incorrectly span multiple pages. */ + return check_page_span(ptr, n, page, to_user); } /* diff --git a/security/Kconfig b/security/Kconfig index 46c00a674eec..ddb3e8a8d9bd 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -146,6 +146,17 @@ config HARDENED_USERCOPY or are part of the kernel text. This kills entire classes of heap overflow exploits and similar kernel memory exposures. +config HARDENED_USERCOPY_PAGESPAN + bool "Refuse to copy allocations that span multiple pages" + depends on HARDENED_USERCOPY + depends on !COMPILE_TEST + help + When a multi-page allocation is done without __GFP_COMP, + hardened usercopy will reject attempts to copy it. There are, + however, several cases of this in the kernel that have not all + been removed. This config is intended to be used only while + trying to find such users. + source security/selinux/Kconfig source security/smack/Kconfig source security/tomoyo/Kconfig From ed67fb82b17db1f5e3c7818e10560814c7d2e019 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 8 Aug 2016 13:02:01 -0700 Subject: [PATCH 772/813] unsafe_[get|put]_user: change interface to use a error target label When I initially added the unsafe_[get|put]_user() helpers in commit 5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses"), I made the mistake of modeling the interface on our traditional __[get|put]_user() functions, which return zero on success, or -EFAULT on failure. That interface is fairly easy to use, but it's actually fairly nasty for good code generation, since it essentially forces the caller to check the error value for each access. In particular, since the error handling is already internally implemented with an exception handler, and we already use "asm goto" for various other things, we could fairly easily make the error cases just jump directly to an error label instead, and avoid the need for explicit checking after each operation. So switch the interface to pass in an error label, rather than checking the error value in the caller. Best do it now before we start growing more users (the signal handling code in particular would be a good place to use the new interface). So rather than if (unsafe_get_user(x, ptr)) ... handle error .. the interface is now unsafe_get_user(x, ptr, label); where an error during the user mode fetch will now just cause a jump to 'label' in the caller. 
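As a hedged sketch of what a caller looks like under the label-based interface; the function is illustrative rather than taken from the tree, and it assumes the range has already been validated with access_ok():

    static int copy_one_word(unsigned long __user *dst,
                             unsigned long __user *src)
    {
            unsigned long val;

            user_access_begin();
            unsafe_get_user(val, src, efault);  /* fault jumps to efault */
            unsafe_put_user(val, dst, efault);
            user_access_end();
            return 0;

    efault:
            user_access_end();
            return -EFAULT;
    }

Note how the hot path reads straight through with no per-access error checks; the single failure label pairs the error return with the matching user_access_end().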
Right now the actual _implementation_ of this all still ends up being a "if (err) goto label", and does not take advantage of any exception label tricks, but for "unsafe_put_user()" in particular it should be fairly straightforward to convert to using the exception table model. Note that "unsafe_get_user()" is much harder to convert to a clever exception table model, because current versions of gcc do not allow the use of "asm goto" (for the exception) with output values (for the actual value to be fetched). But that is hopefully not a limitation in the long term. [ Also note that it might be a good idea to switch unsafe_get_user() to actually _return_ the value it fetches from user space, but this commit only changes the error handling semantics ] Signed-off-by: Linus Torvalds (cherry picked from commit 1bd4403d86a1c06cb6cc9ac87664a0c9d3413d51) Signed-off-by: Alex Shi --- arch/x86/include/asm/uaccess.h | 16 ++++++++-------- include/linux/uaccess.h | 4 ++-- lib/strncpy_from_user.c | 8 ++++---- lib/strnlen_user.c | 7 +++---- 4 files changed, 17 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index dd73cf90fb18..be439e246d91 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -773,21 +773,21 @@ copy_to_user(void __user *to, const void *from, unsigned long n) #define user_access_begin() __uaccess_begin() #define user_access_end() __uaccess_end() -#define unsafe_put_user(x, ptr) \ -({ \ +#define unsafe_put_user(x, ptr, err_label) \ +do { \ int __pu_err; \ __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \ - __builtin_expect(__pu_err, 0); \ -}) + if (unlikely(__pu_err)) goto err_label; \ +} while (0) -#define unsafe_get_user(x, ptr) \ -({ \ +#define unsafe_get_user(x, ptr, err_label) \ +do { \ int __gu_err; \ unsigned long __gu_val; \ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - __builtin_expect(__gu_err, 0); \ -}) + if (unlikely(__gu_err)) goto err_label; \ +} while (0) #endif /* _ASM_X86_UACCESS_H */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 349557825428..f30c187ed785 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -114,8 +114,8 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); #ifndef user_access_begin #define user_access_begin() do { } while (0) #define user_access_end() do { } while (0) -#define unsafe_get_user(x, ptr) __get_user(x, ptr) -#define unsafe_put_user(x, ptr) __put_user(x, ptr) +#define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0) +#define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0) #endif #endif /* __LINUX_UACCESS_H__ */ diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 33840324138c..5a003a2ebd96 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -39,8 +39,8 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long unsigned long c, data; /* Fall back to byte-at-a-time if we get a page fault */ - if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) - break; + unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time); + *(unsigned long *)(dst+res) = c; if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); @@ -55,8 +55,7 @@ byte_at_a_time: while (max) { char c; - if (unlikely(unsafe_get_user(c,src+res))) - return 
-EFAULT; + unsafe_get_user(c,src+res, efault); dst[res] = c; if (!c) return res; @@ -75,6 +74,7 @@ byte_at_a_time: * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's an EFAULT. */ +efault: return -EFAULT; } diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 2625943625d7..8e105ed4df12 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -45,8 +45,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, src -= align; max += align; - if (unlikely(unsafe_get_user(c,(unsigned long __user *)src))) - return 0; + unsafe_get_user(c, (unsigned long __user *)src, efault); c |= aligned_byte_mask(align); for (;;) { @@ -61,8 +60,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, if (unlikely(max <= sizeof(unsigned long))) break; max -= sizeof(unsigned long); - if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) - return 0; + unsafe_get_user(c, (unsigned long __user *)(src+res), efault); } res -= align; @@ -77,6 +75,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's 0. */ +efault: return 0; } From cb2d7844a28eff4b546b9de8aa95d081fd61b775 Mon Sep 17 00:00:00 2001 From: Ping Li Date: Thu, 22 Sep 2016 12:54:42 -0700 Subject: [PATCH 773/813] msm: mdss: Optimize Gamut Mapping programming sequence Gamut mapping feature has a huge size 3D LUT table, which consumes a lot of time to program. This patch optimizes the programming of the 3D LUT to reduce about half of the register writes. Change-Id: I4a9fcee6c1cd70d9c04426c394e480c11a6f4648 Signed-off-by: Ping Li --- drivers/video/fbdev/msm/mdss_mdp_pp_v1_7.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/video/fbdev/msm/mdss_mdp_pp_v1_7.c b/drivers/video/fbdev/msm/mdss_mdp_pp_v1_7.c index 1e4adc984802..71cab148e1c3 100644 --- a/drivers/video/fbdev/msm/mdss_mdp_pp_v1_7.c +++ b/drivers/video/fbdev/msm/mdss_mdp_pp_v1_7.c @@ -833,6 +833,8 @@ static int pp_gamut_set_config(char __iomem *base_addr, struct mdp_gamut_cfg_data *gamut_cfg_data = NULL; struct mdp_gamut_data_v1_7 *gamut_data = NULL; char __iomem *base_addr_scale = base_addr; + uint64_t gamut_val; + if (!base_addr || !cfg_data || !pp_sts) { pr_err("invalid params base_addr %pK cfg_data %pK pp_sts_type %pK\n", base_addr, cfg_data, pp_sts); @@ -900,12 +902,18 @@ static int pp_gamut_set_config(char __iomem *base_addr, val = index_start; val |= GAMUT_TABLE_SELECT(i); writel_relaxed(val, (base_addr + GAMUT_TABLE_INDEX)); - for (j = 0; j < gamut_data->tbl_size[i]; j++) { - writel_relaxed(gamut_data->c1_c2_data[i][j], - base_addr + GAMUT_TABLE_LOWER_GB); - writel_relaxed(gamut_data->c0_data[i][j], - base_addr + GAMUT_TABLE_UPPER_R); + + writel_relaxed(gamut_data->c1_c2_data[i][0], + base_addr + GAMUT_TABLE_LOWER_GB); + for (j = 0; j < gamut_data->tbl_size[i] - 1 ; j++) { + gamut_val = gamut_data->c1_c2_data[i][j + 1]; + gamut_val = (gamut_val << 32) | + gamut_data->c0_data[i][j]; + writeq_relaxed(gamut_val, + base_addr + GAMUT_TABLE_UPPER_R); } + writel_relaxed(gamut_data->c0_data[i][j], + base_addr + GAMUT_TABLE_UPPER_R); if ((i >= MDP_GAMUT_SCALE_OFF_TABLE_NUM) || (!gamut_data->map_en)) continue; From 757c94f42fc66dac25cc5dd5177ee1c2df552a4b Mon Sep 17 00:00:00 2001 From: Shubhraprakash Das Date: Thu, 17 Nov 2016 23:03:17 -0800 Subject: [PATCH 774/813] msm: camera: isp: 
Initialize registers after reset Initialize the qos registers and turn on interrupts after reset. CRs-Fixed: 1089171 Change-Id: I8ed92a835fec1d5297448f440c19cc22ba52728b Signed-off-by: Shubhraprakash Das --- drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util.c b/drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util.c index 1bf628de4df0..aa3a0e239d87 100644 --- a/drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util.c +++ b/drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util.c @@ -2186,6 +2186,7 @@ static void msm_isp_input_disable(struct vfe_device *vfe_dev, int cmd_type) if (msm_vfe_is_vfe48(vfe_dev)) vfe_dev->hw_info->vfe_ops.core_ops.reset_hw(vfe_dev, 0, 1); + vfe_dev->hw_info->vfe_ops.core_ops.init_hw_reg(vfe_dev); } } From 97587bf5cb1b60e49b24af480ddad47571784da5 Mon Sep 17 00:00:00 2001 From: Shubhraprakash Das Date: Fri, 2 Sep 2016 01:02:45 -0700 Subject: [PATCH 775/813] msm: camera: isp: Add secure mode to isp Add option to put the isp hardware in secure smmu mode. The isp stats will still be in non secure mode. Add ioctl to indicate which buffer queue will be in secure mode so that they can be mapped in secure mode CRs-Fixed: 1060631 Change-Id: Ibf2050d0814cc2aaf22a6f510847054d78fd7477 Signed-off-by: Shubhraprakash Das --- .../platform/msm/camera_v2/isp/msm_buf_mgr.c | 157 +++++++++++++++--- .../platform/msm/camera_v2/isp/msm_buf_mgr.h | 10 +- .../platform/msm/camera_v2/isp/msm_isp.c | 2 - .../msm/camera_v2/isp/msm_isp_stats_util.c | 2 +- .../platform/msm/camera_v2/isp/msm_isp_util.c | 1 + include/uapi/media/msmb_isp.h | 22 ++- 6 files changed, 157 insertions(+), 37 deletions(-) diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_buf_mgr.c b/drivers/media/platform/msm/camera_v2/isp/msm_buf_mgr.c index 4200215705d0..de29692414d2 100644 --- a/drivers/media/platform/msm/camera_v2/isp/msm_buf_mgr.c +++ b/drivers/media/platform/msm/camera_v2/isp/msm_buf_mgr.c @@ -188,11 +188,27 @@ static int msm_isp_prepare_v4l2_buf(struct msm_isp_buf_mgr *buf_mgr, int ret; struct msm_isp_buffer_mapped_info *mapped_info; uint32_t accu_length = 0; + struct msm_isp_bufq *bufq = NULL; + bufq = msm_isp_get_bufq(buf_mgr, buf_info->bufq_handle); + if (!bufq) { + pr_err("%s: Invalid bufq, stream id %x\n", + __func__, stream_id); + return -EINVAL; + } for (i = 0; i < qbuf_buf->num_planes; i++) { mapped_info = &buf_info->mapped_info[i]; mapped_info->buf_fd = qbuf_buf->planes[i].addr; - ret = cam_smmu_get_phy_addr(buf_mgr->iommu_hdl, + + if (bufq->security_mode == SECURE_MODE) + ret = cam_smmu_get_stage2_phy_addr(buf_mgr->iommu_hdl, + mapped_info->buf_fd, + CAM_SMMU_MAP_RW, + buf_mgr->client, + &(mapped_info->paddr), + &(mapped_info->len)); + else + ret = cam_smmu_get_phy_addr(buf_mgr->iommu_hdl, mapped_info->buf_fd, CAM_SMMU_MAP_RW, &(mapped_info->paddr), @@ -242,8 +258,13 @@ static void msm_isp_unprepare_v4l2_buf( for (i = 0; i < buf_info->num_planes; i++) { mapped_info = &buf_info->mapped_info[i]; - - cam_smmu_put_phy_addr(buf_mgr->iommu_hdl, mapped_info->buf_fd); + /* SEC_CAM: check any change is needed for secure_mode */ + if (bufq->security_mode == SECURE_MODE) + cam_smmu_put_stage2_phy_addr(buf_mgr->iommu_hdl, + mapped_info->buf_fd); + else + cam_smmu_put_phy_addr(buf_mgr->iommu_hdl, + mapped_info->buf_fd); } return; } @@ -259,7 +280,15 @@ static int msm_isp_map_buf(struct msm_isp_buf_mgr *buf_mgr, __func__, __LINE__, buf_mgr, mapped_info); return -EINVAL; } - ret = 
cam_smmu_get_phy_addr(buf_mgr->iommu_hdl, + if (buf_mgr->secure_enable == SECURE_MODE) + ret = cam_smmu_get_stage2_phy_addr(buf_mgr->iommu_hdl, + fd, + CAM_SMMU_MAP_RW, + buf_mgr->client, + &(mapped_info->paddr), + &(mapped_info->len)); + else + ret = cam_smmu_get_phy_addr(buf_mgr->iommu_hdl, fd, CAM_SMMU_MAP_RW, &(mapped_info->paddr), @@ -275,7 +304,11 @@ static int msm_isp_map_buf(struct msm_isp_buf_mgr *buf_mgr, return rc; smmu_map_error: - cam_smmu_put_phy_addr(buf_mgr->iommu_hdl, + if (buf_mgr->secure_enable == SECURE_MODE) + cam_smmu_put_stage2_phy_addr(buf_mgr->iommu_hdl, + fd); + else + cam_smmu_put_phy_addr(buf_mgr->iommu_hdl, fd); return rc; } @@ -289,7 +322,12 @@ static int msm_isp_unmap_buf(struct msm_isp_buf_mgr *buf_mgr, return -EINVAL; } - cam_smmu_put_phy_addr(buf_mgr->iommu_hdl, + /* SEC_CAMERA: recheck Put part for stats */ + if (buf_mgr->secure_enable == SECURE_MODE) + cam_smmu_put_stage2_phy_addr(buf_mgr->iommu_hdl, + fd); + else + cam_smmu_put_phy_addr(buf_mgr->iommu_hdl, fd); return 0; @@ -921,7 +959,7 @@ static int msm_isp_get_buf_src(struct msm_isp_buf_mgr *buf_mgr, } static int msm_isp_request_bufq(struct msm_isp_buf_mgr *buf_mgr, - struct msm_isp_buf_request *buf_request) + struct msm_isp_buf_request_ver2 *buf_request) { int i; struct msm_isp_bufq *bufq = NULL; @@ -961,6 +999,7 @@ static int msm_isp_request_bufq(struct msm_isp_buf_mgr *buf_mgr, bufq->num_bufs = buf_request->num_buf; bufq->buf_type = buf_request->buf_type; INIT_LIST_HEAD(&bufq->head); + bufq->security_mode = buf_request->security_mode; for (i = 0; i < buf_request->num_buf; i++) { bufq->bufs[i].state = MSM_ISP_BUFFER_STATE_INITIALIZED; @@ -1032,15 +1071,25 @@ static int msm_isp_buf_put_scratch(struct msm_isp_buf_mgr *buf_mgr) if (!buf_mgr->scratch_buf_addr) return 0; - rc = cam_smmu_put_phy_addr_scratch(buf_mgr->iommu_hdl, + if (buf_mgr->secure_enable == SECURE_MODE) { + rc = cam_smmu_free_stage2_scratch_mem(buf_mgr->iommu_hdl, + buf_mgr->client, buf_mgr->sc_handle); + if (buf_mgr->scratch_buf_stats_addr) + rc = cam_smmu_put_phy_addr_scratch(buf_mgr->iommu_hdl, + buf_mgr->scratch_buf_stats_addr); + } else { + rc = cam_smmu_put_phy_addr_scratch(buf_mgr->iommu_hdl, buf_mgr->scratch_buf_addr); + } if (rc) pr_err("%s: failed to put scratch buffer to img iommu: %d\n", __func__, rc); - if (!rc) + if (!rc) { buf_mgr->scratch_buf_addr = 0; + buf_mgr->scratch_buf_stats_addr = 0; + } return rc; } @@ -1057,17 +1106,40 @@ static int msm_isp_buf_put_scratch(struct msm_isp_buf_mgr *buf_mgr) static int msm_isp_buf_get_scratch(struct msm_isp_buf_mgr *buf_mgr) { int rc; + size_t range = buf_mgr->scratch_buf_range; if (buf_mgr->scratch_buf_addr || !buf_mgr->scratch_buf_range) /* already mapped or not supported */ return 0; - rc = cam_smmu_get_phy_addr_scratch( + if (buf_mgr->secure_enable == SECURE_MODE) { + rc = cam_smmu_alloc_get_stage2_scratch_mem(buf_mgr->iommu_hdl, + CAM_SMMU_MAP_RW, + buf_mgr->client, + &buf_mgr->sc_handle, + &buf_mgr->scratch_buf_addr, + &range); + if (rc) + goto done; + + rc = cam_smmu_get_phy_addr_scratch( + buf_mgr->iommu_hdl, + CAM_SMMU_MAP_RW, + &buf_mgr->scratch_buf_stats_addr, + buf_mgr->scratch_buf_range, + SZ_4K); + if (rc) + msm_isp_buf_put_scratch(buf_mgr); + } else { + rc = cam_smmu_get_phy_addr_scratch( buf_mgr->iommu_hdl, CAM_SMMU_MAP_RW, &buf_mgr->scratch_buf_addr, buf_mgr->scratch_buf_range, SZ_4K); + buf_mgr->scratch_buf_stats_addr = buf_mgr->scratch_buf_addr; + } +done: if (rc) { pr_err("%s: failed to map scratch buffer to img iommu: %d\n", __func__, rc); @@ -1085,20 +1157,23 
@@ int msm_isp_smmu_attach(struct msm_isp_buf_mgr *buf_mgr, pr_debug("%s: cmd->security_mode : %d\n", __func__, cmd->security_mode); mutex_lock(&buf_mgr->lock); if (cmd->iommu_attach_mode == IOMMU_ATTACH) { - buf_mgr->secure_enable = cmd->security_mode; - /* * Call hypervisor thru scm call to notify secure or * non-secure mode */ if (buf_mgr->attach_ref_cnt == 0) { - rc = cam_smmu_ops(buf_mgr->iommu_hdl, - CAM_SMMU_ATTACH); + if (cmd->security_mode == SECURE_MODE) + rc = cam_smmu_ops(buf_mgr->iommu_hdl, + CAM_SMMU_ATTACH_SEC_VFE_NS_STATS); + else + rc = cam_smmu_ops(buf_mgr->iommu_hdl, + CAM_SMMU_ATTACH); if (rc < 0) { pr_err("%s: img smmu attach error, rc :%d\n", __func__, rc); - goto err1; + goto err1; } + buf_mgr->secure_enable = cmd->security_mode; } buf_mgr->attach_ref_cnt++; rc = msm_isp_buf_get_scratch(buf_mgr); @@ -1113,8 +1188,12 @@ int msm_isp_smmu_attach(struct msm_isp_buf_mgr *buf_mgr, if (buf_mgr->attach_ref_cnt == 0) { rc = msm_isp_buf_put_scratch(buf_mgr); - rc |= cam_smmu_ops(buf_mgr->iommu_hdl, - CAM_SMMU_DETACH); + if (buf_mgr->secure_enable == SECURE_MODE) + rc |= cam_smmu_ops(buf_mgr->iommu_hdl, + CAM_SMMU_DETACH_SEC_VFE_NS_STATS); + else + rc |= cam_smmu_ops(buf_mgr->iommu_hdl, + CAM_SMMU_DETACH); if (rc < 0) { pr_err("%s: img/stats smmu detach error, rc :%d\n", __func__, rc); @@ -1126,8 +1205,11 @@ int msm_isp_smmu_attach(struct msm_isp_buf_mgr *buf_mgr, return rc; err2: - if (cam_smmu_ops(buf_mgr->iommu_hdl, CAM_SMMU_DETACH)) - pr_err("%s: img smmu detach error\n", __func__); + if (buf_mgr->secure_enable == SECURE_MODE) + cam_smmu_ops(buf_mgr->iommu_hdl, + CAM_SMMU_DETACH_SEC_VFE_NS_STATS); + else + cam_smmu_ops(buf_mgr->iommu_hdl, CAM_SMMU_DETACH); err1: mutex_unlock(&buf_mgr->lock); return rc; @@ -1162,12 +1244,11 @@ static int msm_isp_init_isp_buf_mgr(struct msm_isp_buf_mgr *buf_mgr, buf_mgr->pagefault_debug_disable = 0; buf_mgr->frameId_mismatch_recovery = 0; - mutex_unlock(&buf_mgr->lock); - return 0; - + /* create ION client */ + buf_mgr->client = msm_ion_client_create("vfe"); get_handle_error: mutex_unlock(&buf_mgr->lock); - return rc; + return 0; } static int msm_isp_deinit_isp_buf_mgr( @@ -1186,10 +1267,21 @@ static int msm_isp_deinit_isp_buf_mgr( buf_mgr->pagefault_debug_disable = 0; msm_isp_buf_put_scratch(buf_mgr); - cam_smmu_ops(buf_mgr->iommu_hdl, CAM_SMMU_DETACH); + if (buf_mgr->attach_ref_cnt != 0) { + if (buf_mgr->secure_enable == SECURE_MODE) + cam_smmu_ops(buf_mgr->iommu_hdl, + CAM_SMMU_DETACH_SEC_VFE_NS_STATS); + else + cam_smmu_ops(buf_mgr->iommu_hdl, CAM_SMMU_DETACH); + } cam_smmu_destroy_handle(buf_mgr->iommu_hdl); - buf_mgr->attach_ref_cnt = 0; + buf_mgr->secure_enable = 0; + buf_mgr->attach_ref_cnt = 0; + if (buf_mgr->client) { + ion_client_destroy(buf_mgr->client); + buf_mgr->client = NULL; + } mutex_unlock(&buf_mgr->lock); return 0; } @@ -1200,8 +1292,20 @@ int msm_isp_proc_buf_cmd(struct msm_isp_buf_mgr *buf_mgr, switch (cmd) { case VIDIOC_MSM_ISP_REQUEST_BUF: { struct msm_isp_buf_request *buf_req = arg; + struct msm_isp_buf_request_ver2 buf_req_ver2; - buf_mgr->ops->request_buf(buf_mgr, buf_req); + memcpy(&buf_req_ver2, buf_req, + sizeof(struct msm_isp_buf_request)); + buf_req_ver2.security_mode = NON_SECURE_MODE; + buf_mgr->ops->request_buf(buf_mgr, &buf_req_ver2); + memcpy(buf_req, &buf_req_ver2, + sizeof(struct msm_isp_buf_request)); + break; + } + case VIDIOC_MSM_ISP_REQUEST_BUF_VER2: { + struct msm_isp_buf_request_ver2 *buf_req_ver2 = arg; + + buf_mgr->ops->request_buf(buf_mgr, buf_req_ver2); break; } case VIDIOC_MSM_ISP_ENQUEUE_BUF: 
{ @@ -1393,7 +1497,6 @@ int msm_isp_create_isp_buf_mgr( buf_mgr->open_count = 0; buf_mgr->pagefault_debug_disable = 0; buf_mgr->secure_enable = NON_SECURE_MODE; - buf_mgr->attach_state = MSM_ISP_BUF_MGR_DETACH; buf_mgr->scratch_buf_range = scratch_buf_range; mutex_init(&buf_mgr->lock); diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_buf_mgr.h b/drivers/media/platform/msm/camera_v2/isp/msm_buf_mgr.h index 43519ee74062..21fab0590b55 100644 --- a/drivers/media/platform/msm/camera_v2/isp/msm_buf_mgr.h +++ b/drivers/media/platform/msm/camera_v2/isp/msm_buf_mgr.h @@ -119,11 +119,12 @@ struct msm_isp_bufq { spinlock_t bufq_lock; /*Native buffer queue*/ struct list_head head; + enum smmu_attach_mode security_mode; }; struct msm_isp_buf_ops { int (*request_buf)(struct msm_isp_buf_mgr *buf_mgr, - struct msm_isp_buf_request *buf_request); + struct msm_isp_buf_request_ver2 *buf_request); int (*enqueue_buf)(struct msm_isp_buf_mgr *buf_mgr, struct msm_isp_qbuf_info *info); @@ -191,21 +192,20 @@ struct msm_isp_buf_mgr { struct msm_sd_req_vb2_q *vb2_ops; - /*IOMMU driver*/ - int iommu_hdl; /*Add secure mode*/ int secure_enable; - int num_iommu_ctx; - int num_iommu_secure_ctx; int attach_ref_cnt; enum msm_isp_buf_mgr_state attach_state; struct device *isp_dev; struct mutex lock; /* Scratch buffer */ dma_addr_t scratch_buf_addr; + dma_addr_t scratch_buf_stats_addr; uint32_t scratch_buf_range; + int iommu_hdl; + struct ion_handle *sc_handle; }; int msm_isp_create_isp_buf_mgr(struct msm_isp_buf_mgr *buf_mgr, diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_isp.c b/drivers/media/platform/msm/camera_v2/isp/msm_isp.c index 094996b2d60b..35daf30bac63 100644 --- a/drivers/media/platform/msm/camera_v2/isp/msm_isp.c +++ b/drivers/media/platform/msm/camera_v2/isp/msm_isp.c @@ -655,8 +655,6 @@ int vfe_hw_probe(struct platform_device *pdev) goto probe_fail3; } msm_isp_enable_debugfs(vfe_dev, msm_isp_bw_request_history); - vfe_dev->buf_mgr->num_iommu_secure_ctx = - vfe_dev->hw_info->num_iommu_secure_ctx; vfe_dev->buf_mgr->init_done = 1; vfe_dev->vfe_open_cnt = 0; return rc; diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_isp_stats_util.c b/drivers/media/platform/msm/camera_v2/isp/msm_isp_stats_util.c index 22a7f6886964..da4ceb400683 100644 --- a/drivers/media/platform/msm/camera_v2/isp/msm_isp_stats_util.c +++ b/drivers/media/platform/msm/camera_v2/isp/msm_isp_stats_util.c @@ -23,7 +23,7 @@ static inline void msm_isp_stats_cfg_wm_scratch(struct vfe_device *vfe_dev, { vfe_dev->hw_info->vfe_ops.stats_ops.update_ping_pong_addr( vfe_dev, stream_info, - pingpong_status, vfe_dev->buf_mgr->scratch_buf_addr); + pingpong_status, vfe_dev->buf_mgr->scratch_buf_stats_addr); } static inline void msm_isp_stats_cfg_stream_scratch( diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_isp_util.c b/drivers/media/platform/msm/camera_v2/isp/msm_isp_util.c index 59b875d6e464..1baba9658523 100644 --- a/drivers/media/platform/msm/camera_v2/isp/msm_isp_util.c +++ b/drivers/media/platform/msm/camera_v2/isp/msm_isp_util.c @@ -821,6 +821,7 @@ static long msm_isp_ioctl_unlocked(struct v4l2_subdev *sd, break; } case VIDIOC_MSM_ISP_REQUEST_BUF: + case VIDIOC_MSM_ISP_REQUEST_BUF_VER2: /* fallthrough */ case VIDIOC_MSM_ISP_ENQUEUE_BUF: /* fallthrough */ diff --git a/include/uapi/media/msmb_isp.h b/include/uapi/media/msmb_isp.h index 9399f6e84004..44d75aa107d9 100644 --- a/include/uapi/media/msmb_isp.h +++ b/include/uapi/media/msmb_isp.h @@ -2,6 +2,7 @@ #define __UAPI_MSMB_ISP__ #include +#include #define 
MAX_PLANES_PER_STREAM 3 #define MAX_NUM_STREAM 7 @@ -556,6 +557,16 @@ struct msm_isp_buf_request { enum msm_isp_buf_type buf_type; }; +struct msm_isp_buf_request_ver2 { + uint32_t session_id; + uint32_t stream_id; + uint8_t num_buf; + uint32_t handle; + enum msm_isp_buf_type buf_type; + enum smmu_attach_mode security_mode; + uint32_t reserved[4]; +}; + struct msm_isp_qbuf_plane { uint32_t addr; uint32_t offset; @@ -884,8 +895,11 @@ enum msm_isp_ioctl_cmd_code { MSM_ISP_SET_DUAL_HW_MASTER_SLAVE, MSM_ISP_MAP_BUF_START_FE, MSM_ISP_UNMAP_BUF, + MSM_ISP_AHB_CLK_CFG, + MSM_ISP_DUAL_HW_MASTER_SLAVE_SYNC, MSM_ISP_FETCH_ENG_MULTI_PASS_START, MSM_ISP_MAP_BUF_START_MULTI_PASS_FE, + MSM_ISP_REQUEST_BUF_VER2, }; #define VIDIOC_MSM_VFE_REG_CFG \ @@ -989,10 +1003,10 @@ enum msm_isp_ioctl_cmd_code { struct msm_isp_unmap_buf_req) #define VIDIOC_MSM_ISP_AHB_CLK_CFG \ - _IOWR('V', BASE_VIDIOC_PRIVATE+25, struct msm_isp_ahb_clk_cfg) + _IOWR('V', MSM_ISP_AHB_CLK_CFG, struct msm_isp_ahb_clk_cfg) #define VIDIOC_MSM_ISP_DUAL_HW_MASTER_SLAVE_SYNC \ - _IOWR('V', BASE_VIDIOC_PRIVATE+26, \ + _IOWR('V', MSM_ISP_DUAL_HW_MASTER_SLAVE_SYNC, \ struct msm_isp_dual_hw_master_slave_sync) #define VIDIOC_MSM_ISP_FETCH_ENG_MULTI_PASS_START \ @@ -1002,4 +1016,8 @@ enum msm_isp_ioctl_cmd_code { #define VIDIOC_MSM_ISP_MAP_BUF_START_MULTI_PASS_FE \ _IOWR('V', MSM_ISP_MAP_BUF_START_MULTI_PASS_FE, \ struct msm_vfe_fetch_eng_multi_pass_start) + +#define VIDIOC_MSM_ISP_REQUEST_BUF_VER2 \ + _IOWR('V', MSM_ISP_REQUEST_BUF_VER2, struct msm_isp_buf_request_ver2) + #endif /* __MSMB_ISP__ */ From f230f0f67bdd137a02230b0ba32cd5ff9075c231 Mon Sep 17 00:00:00 2001 From: Shubhraprakash Das Date: Tue, 22 Nov 2016 15:21:28 -0800 Subject: [PATCH 776/813] msm: camera: isp: Stop stats stream properly When the camif input is disabled the stats streams need to be turned off as well. Stop the stats stream by following the correct stop sequence instead of just turning off the stats write masters. 
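The shape of the fix is a common driver pattern: under each stream's lock, snapshot the handles of the streams that are actually active into one stop command, then issue that single command so every stream runs through the normal stop sequence. A rough sketch of the pattern with simplified, invented types (the real code uses msm_vfe_stats_stream and msm_isp_cfg_stats_stream()):

    #define MAX_STREAMS 16                      /* invented bound */

    struct stream {
            spinlock_t lock;
            int active;                         /* simplified state */
            u32 handle;
    };

    static void stop_all_streams(struct stream *s, int n)
    {
            u32 handles[MAX_STREAMS];
            int count = 0, i;
            unsigned long flags;

            for (i = 0; i < n && count < MAX_STREAMS; i++) {
                    spin_lock_irqsave(&s[i].lock, flags);
                    if (s[i].active)
                            handles[count++] = s[i].handle;
                    spin_unlock_irqrestore(&s[i].lock, flags);
            }
            if (count)
                    issue_stop_cmd(handles, count); /* batched stop, placeholder */
    }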
CRs-Fixed: 1098562 Change-Id: I4789bf9e837b1c0af7288e26ff02c4068638337a Signed-off-by: Shubhraprakash Das --- .../msm/camera_v2/isp/msm_isp_axi_util.c | 10 ++-- .../msm/camera_v2/isp/msm_isp_stats_util.c | 49 ++++++++++++++----- .../msm/camera_v2/isp/msm_isp_stats_util.h | 1 + 3 files changed, 42 insertions(+), 18 deletions(-) diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util.c b/drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util.c index aa3a0e239d87..ae032c588888 100644 --- a/drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util.c +++ b/drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util.c @@ -13,6 +13,7 @@ #include #include #include "msm_isp_util.h" +#include "msm_isp_stats_util.h" #include "msm_isp_axi_util.h" #include "msm_isp48.h" @@ -2759,12 +2760,11 @@ static void __msm_isp_stop_axi_streams(struct vfe_device *vfe_dev, if (!update_vfes[k]) continue; vfe_dev = update_vfes[k]; - axi_data = &vfe_dev->axi_data; - if (axi_data->src_info[VFE_PIX_0].active == 0) { - vfe_dev->hw_info->vfe_ops.stats_ops.enable_module( - vfe_dev, 0xFF, 0); - } + /* make sure all stats are stopped if camif is stopped */ + if (vfe_dev->axi_data.src_info[VFE_PIX_0].active == 0) + msm_isp_stop_all_stats_stream(vfe_dev); } + for (i = 0; i < num_streams; i++) { stream_info = streams[i]; spin_lock_irqsave(&stream_info->lock, flags); diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_isp_stats_util.c b/drivers/media/platform/msm/camera_v2/isp/msm_isp_stats_util.c index da4ceb400683..38ce78d941c9 100644 --- a/drivers/media/platform/msm/camera_v2/isp/msm_isp_stats_util.c +++ b/drivers/media/platform/msm/camera_v2/isp/msm_isp_stats_util.c @@ -548,19 +548,52 @@ int msm_isp_release_stats_stream(struct vfe_device *vfe_dev, void *arg) return 0; } +void msm_isp_stop_all_stats_stream(struct vfe_device *vfe_dev) +{ + struct msm_vfe_stats_stream_cfg_cmd stream_cfg_cmd; + struct msm_vfe_stats_stream *stream_info; + int i; + int vfe_idx; + unsigned long flags; + + stream_cfg_cmd.enable = 0; + stream_cfg_cmd.num_streams = 0; + + for (i = 0; i < MSM_ISP_STATS_MAX; i++) { + stream_info = msm_isp_get_stats_stream_common_data(vfe_dev, i); + spin_lock_irqsave(&stream_info->lock, flags); + if (stream_info->state == STATS_AVAILABLE || + stream_info->state == STATS_INACTIVE) { + spin_unlock_irqrestore(&stream_info->lock, flags); + continue; + } + vfe_idx = msm_isp_get_vfe_idx_for_stats_stream_user(vfe_dev, + stream_info); + if (vfe_idx == -ENOTTY) { + spin_unlock_irqrestore(&stream_info->lock, flags); + continue; + } + stream_cfg_cmd.stream_handle[ + stream_cfg_cmd.num_streams] = + stream_info->stream_handle[vfe_idx]; + stream_cfg_cmd.num_streams++; + spin_unlock_irqrestore(&stream_info->lock, flags); + } + if (stream_cfg_cmd.num_streams) + msm_isp_cfg_stats_stream(vfe_dev, &stream_cfg_cmd); +} + void msm_isp_release_all_stats_stream(struct vfe_device *vfe_dev) { struct msm_vfe_stats_stream_release_cmd stream_release_cmd[MSM_ISP_STATS_MAX]; - struct msm_vfe_stats_stream_cfg_cmd stream_cfg_cmd; struct msm_vfe_stats_stream *stream_info; int i; int vfe_idx; int num_stream = 0; unsigned long flags; - stream_cfg_cmd.enable = 0; - stream_cfg_cmd.num_streams = 0; + msm_isp_stop_all_stats_stream(vfe_dev); for (i = 0; i < MSM_ISP_STATS_MAX; i++) { stream_info = msm_isp_get_stats_stream_common_data(vfe_dev, i); @@ -577,18 +610,8 @@ void msm_isp_release_all_stats_stream(struct vfe_device *vfe_dev) } stream_release_cmd[num_stream++].stream_handle = stream_info->stream_handle[vfe_idx]; - if 
(stream_info->state == STATS_INACTIVE) {
-			spin_unlock_irqrestore(&stream_info->lock, flags);
-			continue;
-		}
-		stream_cfg_cmd.stream_handle[
-			stream_cfg_cmd.num_streams] =
-			stream_info->stream_handle[vfe_idx];
-		stream_cfg_cmd.num_streams++;
 		spin_unlock_irqrestore(&stream_info->lock, flags);
 	}
-	if (stream_cfg_cmd.num_streams)
-		msm_isp_cfg_stats_stream(vfe_dev, &stream_cfg_cmd);

 	for (i = 0; i < num_stream; i++)
 		msm_isp_release_stats_stream(vfe_dev,
 			&stream_release_cmd[i]);
diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_isp_stats_util.h b/drivers/media/platform/msm/camera_v2/isp/msm_isp_stats_util.h
index e9728f33fae1..2e3a24dd1f0d 100644
--- a/drivers/media/platform/msm/camera_v2/isp/msm_isp_stats_util.h
+++ b/drivers/media/platform/msm/camera_v2/isp/msm_isp_stats_util.h
@@ -29,6 +29,7 @@ int msm_isp_stats_restart(struct vfe_device *vfe_dev);
 void msm_isp_release_all_stats_stream(struct vfe_device *vfe_dev);
 void msm_isp_process_stats_reg_upd_epoch_irq(struct vfe_device *vfe_dev,
 	enum msm_isp_comp_irq_types irq);
+void msm_isp_stop_all_stats_stream(struct vfe_device *vfe_dev);

 static inline int msm_isp_get_vfe_idx_for_stats_stream_user(
 	struct vfe_device *vfe_dev,

From b9a96eb2e5208651ddb94c457dcdb64b7e2d02f0 Mon Sep 17 00:00:00 2001
From: Shubhraprakash Das
Date: Fri, 2 Dec 2016 17:53:31 -0800
Subject: [PATCH 777/813] msm: camera: isp: Ignore bus error from RDI write
 master

A bus error can be generated on an RDI write master even if no data is
sent on it. This is not actually an error, so ignore it.

CRs-Fixed: 1098568
Change-Id: I8dc24f3c4926f008d114778c890ad2c2902f84b9
Signed-off-by: Shubhraprakash Das
---
 .../platform/msm/camera_v2/isp/msm_isp.h      |  6 ++++++
 .../platform/msm/camera_v2/isp/msm_isp32.c    |  2 ++
 .../platform/msm/camera_v2/isp/msm_isp40.c    |  2 ++
 .../platform/msm/camera_v2/isp/msm_isp44.c    |  2 ++
 .../platform/msm/camera_v2/isp/msm_isp46.c    |  2 ++
 .../platform/msm/camera_v2/isp/msm_isp47.c    |  2 ++
 .../platform/msm/camera_v2/isp/msm_isp48.c    | 21 +++++++++++++++++++
 .../msm/camera_v2/isp/msm_isp_axi_util.c      | 14 +++++++++++++
 .../platform/msm/camera_v2/isp/msm_isp_util.c | 14 +++++++------
 9 files changed, 59 insertions(+), 6 deletions(-)

diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_isp.h b/drivers/media/platform/msm/camera_v2/isp/msm_isp.h
index 1f860f2c5b12..9c7eba21fde1 100644
--- a/drivers/media/platform/msm/camera_v2/isp/msm_isp.h
+++ b/drivers/media/platform/msm/camera_v2/isp/msm_isp.h
@@ -243,6 +243,10 @@ struct msm_vfe_core_ops {
 	int (*start_fetch_eng_multi_pass)(struct vfe_device *vfe_dev,
 		void *arg);
 	void (*set_halt_restart_mask)(struct vfe_device *vfe_dev);
+	void (*set_bus_err_ign_mask)(struct vfe_device *vfe_dev,
+		int wm, int enable);
+	void (*get_bus_err_mask)(struct vfe_device *vfe_dev,
+		uint32_t *bus_err, uint32_t *irq_status1);
 };

 struct msm_vfe_stats_ops {
@@ -786,6 +790,8 @@ struct vfe_device {
 	/* irq info */
 	uint32_t irq0_mask;
 	uint32_t irq1_mask;
+
+	uint32_t bus_err_ign_mask;
 };

 struct vfe_parent_device {
diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_isp32.c b/drivers/media/platform/msm/camera_v2/isp/msm_isp32.c
index 8275f8cedf2e..43f562b18209 100644
--- a/drivers/media/platform/msm/camera_v2/isp/msm_isp32.c
+++ b/drivers/media/platform/msm/camera_v2/isp/msm_isp32.c
@@ -1474,6 +1474,8 @@ struct msm_vfe_hardware_info vfe32_hw_info = {
 		.is_module_cfg_lock_needed =
 			msm_vfe32_is_module_cfg_lock_needed,
 		.ahb_clk_cfg = NULL,
+		.set_bus_err_ign_mask = NULL,
+		.get_bus_err_mask = NULL,
 	},
 	.stats_ops = {
 		.get_stats_idx =
msm_vfe32_get_stats_idx, diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_isp40.c b/drivers/media/platform/msm/camera_v2/isp/msm_isp40.c index 2d937fc3ed05..a1fb307b09c1 100644 --- a/drivers/media/platform/msm/camera_v2/isp/msm_isp40.c +++ b/drivers/media/platform/msm/camera_v2/isp/msm_isp40.c @@ -2263,6 +2263,8 @@ struct msm_vfe_hardware_info vfe40_hw_info = { msm_vfe40_start_fetch_engine_multi_pass, .set_halt_restart_mask = msm_vfe40_set_halt_restart_mask, + .set_bus_err_ign_mask = NULL, + .get_bus_err_mask = NULL, }, .stats_ops = { .get_stats_idx = msm_vfe40_get_stats_idx, diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_isp44.c b/drivers/media/platform/msm/camera_v2/isp/msm_isp44.c index 15820b5f398b..0a72a041de28 100644 --- a/drivers/media/platform/msm/camera_v2/isp/msm_isp44.c +++ b/drivers/media/platform/msm/camera_v2/isp/msm_isp44.c @@ -1869,6 +1869,8 @@ struct msm_vfe_hardware_info vfe44_hw_info = { .ahb_clk_cfg = NULL, .set_halt_restart_mask = msm_vfe44_set_halt_restart_mask, + .set_bus_err_ign_mask = NULL, + .get_bus_err_mask = NULL, }, .stats_ops = { .get_stats_idx = msm_vfe44_get_stats_idx, diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_isp46.c b/drivers/media/platform/msm/camera_v2/isp/msm_isp46.c index 23fbc4f5e33a..f2d53c956fdc 100644 --- a/drivers/media/platform/msm/camera_v2/isp/msm_isp46.c +++ b/drivers/media/platform/msm/camera_v2/isp/msm_isp46.c @@ -1945,6 +1945,8 @@ struct msm_vfe_hardware_info vfe46_hw_info = { .ahb_clk_cfg = NULL, .set_halt_restart_mask = msm_vfe46_set_halt_restart_mask, + .set_bus_err_ign_mask = NULL, + .get_bus_err_mask = NULL, }, .stats_ops = { .get_stats_idx = msm_vfe46_get_stats_idx, diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_isp47.c b/drivers/media/platform/msm/camera_v2/isp/msm_isp47.c index 56056849e140..efea930a36a7 100644 --- a/drivers/media/platform/msm/camera_v2/isp/msm_isp47.c +++ b/drivers/media/platform/msm/camera_v2/isp/msm_isp47.c @@ -2858,6 +2858,8 @@ struct msm_vfe_hardware_info vfe47_hw_info = { msm_vfe47_start_fetch_engine_multi_pass, .set_halt_restart_mask = msm_vfe47_set_halt_restart_mask, + .set_bus_err_ign_mask = NULL, + .get_bus_err_mask = NULL, }, .stats_ops = { .get_stats_idx = msm_vfe47_get_stats_idx, diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_isp48.c b/drivers/media/platform/msm/camera_v2/isp/msm_isp48.c index c533f23c1163..f346ceb6c9e5 100644 --- a/drivers/media/platform/msm/camera_v2/isp/msm_isp48.c +++ b/drivers/media/platform/msm/camera_v2/isp/msm_isp48.c @@ -241,6 +241,25 @@ static void msm_vfe48_put_regulators(struct vfe_device *vfe_dev) vfe_dev->vfe_num_regulators = 0; } +static void msm_vfe48_get_bus_err_mask(struct vfe_device *vfe_dev, + uint32_t *bus_err, uint32_t *irq_status1) +{ + *bus_err = msm_camera_io_r(vfe_dev->vfe_base + 0xC94); + + *bus_err &= ~vfe_dev->bus_err_ign_mask; + if (*bus_err == 0) + *irq_status1 &= ~(1 << 4); +} + +static void msm_vfe48_set_bus_err_ign_mask(struct vfe_device *vfe_dev, + int wm, int enable) +{ + if (enable) + vfe_dev->bus_err_ign_mask |= (1 << wm); + else + vfe_dev->bus_err_ign_mask &= ~(1 << wm); +} + struct msm_vfe_hardware_info vfe48_hw_info = { .num_iommu_ctx = 1, .num_iommu_secure_ctx = 0, @@ -315,6 +334,8 @@ struct msm_vfe_hardware_info vfe48_hw_info = { msm_vfe47_start_fetch_engine_multi_pass, .set_halt_restart_mask = msm_vfe47_set_halt_restart_mask, + .set_bus_err_ign_mask = msm_vfe48_set_bus_err_ign_mask, + .get_bus_err_mask = msm_vfe48_get_bus_err_mask, }, .stats_ops = { .get_stats_idx = 
msm_vfe47_get_stats_idx,
diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util.c b/drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util.c
index ae032c588888..f6e0d9083b22 100644
--- a/drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util.c
+++ b/drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util.c
@@ -430,6 +430,13 @@ static void msm_isp_axi_reserve_wm(struct vfe_device *vfe_dev,
 			vfe_dev->pdev->id,
 			stream_info->stream_handle[vfe_idx], j);
 		stream_info->wm[vfe_idx][i] = j;
+		/* setup var to ignore bus error from RDI wm */
+		if (stream_info->stream_src >= RDI_INTF_0) {
+			if (vfe_dev->hw_info->vfe_ops.core_ops.
+				set_bus_err_ign_mask)
+				vfe_dev->hw_info->vfe_ops.core_ops.
+					set_bus_err_ign_mask(vfe_dev, j, 1);
+		}
 	}
 }

@@ -443,6 +450,13 @@ void msm_isp_axi_free_wm(struct vfe_device *vfe_dev,
 	for (i = 0; i < stream_info->num_planes; i++) {
 		axi_data->free_wm[stream_info->wm[vfe_idx][i]] = 0;
 		axi_data->num_used_wm--;
+		if (stream_info->stream_src >= RDI_INTF_0) {
+			if (vfe_dev->hw_info->vfe_ops.core_ops.
+				set_bus_err_ign_mask)
+				vfe_dev->hw_info->vfe_ops.core_ops.
+					set_bus_err_ign_mask(vfe_dev,
+					stream_info->wm[vfe_idx][i], 0);
+		}
 	}
 	if (stream_info->stream_src <= IDEAL_RAW)
 		axi_data->num_pix_stream++;
diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_isp_util.c b/drivers/media/platform/msm/camera_v2/isp/msm_isp_util.c
index 1baba9658523..a8825dacd8ab 100644
--- a/drivers/media/platform/msm/camera_v2/isp/msm_isp_util.c
+++ b/drivers/media/platform/msm/camera_v2/isp/msm_isp_util.c
@@ -1791,12 +1791,17 @@ static int msm_isp_process_overflow_irq(
 	uint32_t *irq_status0, uint32_t *irq_status1)
 {
 	uint32_t overflow_mask;
+	uint32_t bus_err = 0;

 	/* if there are no active streams - do not start recovery */
 	if (!vfe_dev->axi_data.num_active_stream)
 		return 0;

-	/*Mask out all other irqs if recovery is started*/
+	if (vfe_dev->hw_info->vfe_ops.core_ops.
+		get_bus_err_mask)
+		vfe_dev->hw_info->vfe_ops.core_ops.get_bus_err_mask(
+			vfe_dev, &bus_err, irq_status1);
+
+	/* Mask out all other irqs if recovery is started */
 	if (atomic_read(&vfe_dev->error_info.overflow_state) != NO_OVERFLOW) {
 		uint32_t halt_restart_mask0, halt_restart_mask1;
 		vfe_dev->hw_info->vfe_ops.core_ops.
@@ -1808,14 +1813,13 @@ static int msm_isp_process_overflow_irq(
 		return 0;
 	}

-	/*Check if any overflow bit is set*/
+	/* Check if any overflow bit is set */
 	vfe_dev->hw_info->vfe_ops.core_ops.
 		get_overflow_mask(&overflow_mask);
 	overflow_mask &= *irq_status1;

 	if (overflow_mask) {
 		struct msm_isp_event_data error_event;
-		uint32_t val = 0;
 		int i;
 		struct msm_vfe_axi_shared_data *axi_data = &vfe_dev->axi_data;

@@ -1830,10 +1834,8 @@ static int msm_isp_process_overflow_irq(
 			*irq_status1 &= ~overflow_mask;
 			return 0;
 		}
-		if (msm_vfe_is_vfe48(vfe_dev))
-			val = msm_camera_io_r(vfe_dev->vfe_base + 0xC94);
 		pr_err("%s: vfe %d overflow mask %x, bus_error %x\n",
-			__func__, vfe_dev->pdev->id, overflow_mask, val);
+			__func__, vfe_dev->pdev->id, overflow_mask, bus_err);
 		for (i = 0; i < axi_data->hw_info->num_wm; i++) {
 			if (!axi_data->free_wm[i])
 				continue;

From 28b53604a0490f50f62d654a24a87e0552bf8739 Mon Sep 17 00:00:00 2001
From: Maheshwar Ajja
Date: Fri, 9 Dec 2016 11:23:32 -0800
Subject: [PATCH 778/813] msm: vidc: fix issue when video session opening
 failed

There is a NULL pointer access when opening a video session fails while
the power collapse thread is sending sys_error to all opened sessions
in parallel. Fix the issue by doing v4l2_fh_exit() after removing the
new session from the core->instances list.
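To see why the ordering matters, here is a small self-contained C model
of the fixed error path, using pthreads and illustrative stand-in types
rather than the driver's own: the instance is unlinked from the shared
list under the core lock before its event-handler state is torn down,
so a concurrent sys_error broadcast walking the list can never observe
a half-destroyed instance.

#include <pthread.h>
#include <stdio.h>

struct inst {
	struct inst *next;
	int fh_valid;			/* models the v4l2_fh state */
};

static struct inst *instances;		/* models core->instances */
static pthread_mutex_t core_lock = PTHREAD_MUTEX_INITIALIZER;

/* Models the power-collapse thread broadcasting sys_error. */
static void broadcast_sys_error(void)
{
	struct inst *i;

	pthread_mutex_lock(&core_lock);
	for (i = instances; i; i = i->next)
		printf("sys_error to inst with fh_valid=%d\n", i->fh_valid);
	pthread_mutex_unlock(&core_lock);
}

/* Fixed open() error path: unlink first, tear down the fh second. */
static void open_error_path(struct inst *in)
{
	struct inst **p;

	pthread_mutex_lock(&core_lock);
	for (p = &instances; *p; p = &(*p)->next)
		if (*p == in) {
			*p = in->next;
			break;
		}
	pthread_mutex_unlock(&core_lock);

	/* Safe: no list walker can reach 'in' any more. */
	in->fh_valid = 0;
}

int main(void)
{
	struct inst a = { .next = 0, .fh_valid = 1 };

	instances = &a;
	open_error_path(&a);
	broadcast_sys_error();		/* sees an empty list */
	return 0;
}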
CRs-Fixed: 1088562
Change-Id: I2523a46487a03ade40afeddd394a4572283d91a2
Signed-off-by: Maheshwar Ajja
---
 drivers/media/platform/msm/vidc/msm_vidc.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/media/platform/msm/vidc/msm_vidc.c b/drivers/media/platform/msm/vidc/msm_vidc.c
index ac5f83f1d034..689af175e2b2 100644
--- a/drivers/media/platform/msm/vidc/msm_vidc.c
+++ b/drivers/media/platform/msm/vidc/msm_vidc.c
@@ -1239,14 +1239,13 @@ void *msm_vidc_open(int core_id, int session_type)
 	return inst;
 fail_init:
-	v4l2_fh_del(&inst->event_handler);
-	v4l2_fh_exit(&inst->event_handler);
-	vb2_queue_release(&inst->bufq[OUTPUT_PORT].vb2_bufq);
-
 	mutex_lock(&core->lock);
 	list_del(&inst->list);
 	mutex_unlock(&core->lock);

+	v4l2_fh_del(&inst->event_handler);
+	v4l2_fh_exit(&inst->event_handler);
+	vb2_queue_release(&inst->bufq[OUTPUT_PORT].vb2_bufq);
 fail_bufq_output:
 	vb2_queue_release(&inst->bufq[CAPTURE_PORT].vb2_bufq);
 fail_bufq_capture:

From 7e4424a1b5f6a6536066cca7aac2c3a23fd39f6f Mon Sep 17 00:00:00 2001
From: Krishnankutty Kolathappilly
Date: Wed, 16 Nov 2016 15:10:18 -0800
Subject: [PATCH 779/813] msm: camera: Synchronize jpeg ISR and userspace call

This fixes the race between the JPEG DMA ISR and userspace calls.
Without this fix, the JPEG DMA driver may randomly crash due to invalid
pointer accesses.

Change-Id: I559ae08b9a46d5d3c35f8be509976a25faa967f9
CRs-Fixed: 1083323
Signed-off-by: Krishnankutty Kolathappilly
---
 .../msm/camera_v2/jpeg_dma/msm_jpeg_dma_dev.c | 14 +++++++++++---
 .../msm/camera_v2/jpeg_dma/msm_jpeg_dma_dev.h |  1 +
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/media/platform/msm/camera_v2/jpeg_dma/msm_jpeg_dma_dev.c b/drivers/media/platform/msm/camera_v2/jpeg_dma/msm_jpeg_dma_dev.c
index 3301fc446193..4b4846907d0f 100644
--- a/drivers/media/platform/msm/camera_v2/jpeg_dma/msm_jpeg_dma_dev.c
+++ b/drivers/media/platform/msm/camera_v2/jpeg_dma/msm_jpeg_dma_dev.c
@@ -537,6 +537,7 @@ static int msm_jpegdma_open(struct file *file)
 	if (!ctx)
 		return -ENOMEM;

+	mutex_init(&ctx->lock);
 	ctx->jdma_device = device;
 	dev_dbg(ctx->jdma_device->dev, "Jpeg v4l2 dma open\n");
 	/* Set ctx defaults */
@@ -835,12 +836,13 @@ static int msm_jpegdma_qbuf(struct file *file, void *fh,
 	int ret;

 	msm_jpegdma_cast_long_to_buff_ptr(buf->m.userptr, &up_buff);
-
+	mutex_lock(&ctx->lock);
 	if (!access_ok(VERIFY_READ, up_buff,
 		sizeof(struct msm_jpeg_dma_buff)) ||
 		get_user(kp_buff.fd, &up_buff->fd) ||
 		get_user(kp_buff.offset, &up_buff->offset)) {
 		dev_err(ctx->jdma_device->dev, "Error getting user data\n");
+		mutex_unlock(&ctx->lock);
 		return -EFAULT;
 	}

@@ -849,6 +851,7 @@ static int msm_jpegdma_qbuf(struct file *file, void *fh,
 		put_user(kp_buff.fd, &up_buff->fd) ||
 		put_user(kp_buff.offset, &up_buff->offset)) {
 		dev_err(ctx->jdma_device->dev, "Error putting user data\n");
+		mutex_unlock(&ctx->lock);
 		return -EFAULT;
 	}

@@ -871,7 +874,7 @@ static int msm_jpegdma_qbuf(struct file *file, void *fh,
 	ret = v4l2_m2m_qbuf(file, ctx->m2m_ctx, buf);
 	if (ret < 0)
 		dev_err(ctx->jdma_device->dev, "QBuf fail\n");
-
+	mutex_unlock(&ctx->lock);
 	return ret;
 }

@@ -1032,10 +1035,11 @@ static int msm_jpegdma_s_crop(struct file *file, void *fh,
 	if (crop->c.height % formats[ctx->format_idx].v_align)
 		return -EINVAL;

+	mutex_lock(&ctx->lock);
 	ctx->crop = crop->c;
 	if (atomic_read(&ctx->active))
 		ret = msm_jpegdma_update_hw_config(ctx);
-
+	mutex_unlock(&ctx->lock);
 	return ret;
 }

@@ -1240,12 +1244,14 @@ void msm_jpegdma_isr_processing_done(struct msm_jpegdma_device *dma)

 	ctx =
v4l2_m2m_get_curr_priv(dma->m2m_dev); if (ctx) { + mutex_lock(&ctx->lock); ctx->plane_idx++; if (ctx->plane_idx >= formats[ctx->format_idx].num_planes) { src_buf = v4l2_m2m_src_buf_remove(ctx->m2m_ctx); dst_buf = v4l2_m2m_dst_buf_remove(ctx->m2m_ctx); if (src_buf == NULL || dst_buf == NULL) { dev_err(ctx->jdma_device->dev, "Error, buffer list empty\n"); + mutex_unlock(&ctx->lock); mutex_unlock(&dma->lock); return; } @@ -1261,11 +1267,13 @@ void msm_jpegdma_isr_processing_done(struct msm_jpegdma_device *dma) src_buf = v4l2_m2m_next_src_buf(ctx->m2m_ctx); if (src_buf == NULL || dst_buf == NULL) { dev_err(ctx->jdma_device->dev, "Error, buffer list empty\n"); + mutex_unlock(&ctx->lock); mutex_unlock(&dma->lock); return; } msm_jpegdma_process_buffers(ctx, src_buf, dst_buf); } + mutex_unlock(&ctx->lock); } mutex_unlock(&dma->lock); } diff --git a/drivers/media/platform/msm/camera_v2/jpeg_dma/msm_jpeg_dma_dev.h b/drivers/media/platform/msm/camera_v2/jpeg_dma/msm_jpeg_dma_dev.h index 6a1205daf1d2..4911ce3aa5bd 100644 --- a/drivers/media/platform/msm/camera_v2/jpeg_dma/msm_jpeg_dma_dev.h +++ b/drivers/media/platform/msm/camera_v2/jpeg_dma/msm_jpeg_dma_dev.h @@ -254,6 +254,7 @@ struct msm_jpegdma_buf_handle { * @format_idx: Current format index. */ struct jpegdma_ctx { + struct mutex lock; struct msm_jpegdma_device *jdma_device; atomic_t active; struct completion completion; From 73c975d6fde1adc6a8bbe28e98809e37c884d123 Mon Sep 17 00:00:00 2001 From: Kuirong Wang Date: Sun, 4 Dec 2016 10:42:02 -0800 Subject: [PATCH 780/813] ARM: dts: msm: update speaker left and right GPIOs for APQ8998 Update speaker left and right GPIOs for APQ8998 platform since it uses different GPIOs from MSM8998 platform. CRs-Fixed: 1099656 Change-Id: Idecd865b7a11e4cd30737f5d800673c87807592c Signed-off-by: Kuirong Wang --- .../boot/dts/qcom/apq8998-v2.1-mediabox.dts | 18 +++++++ arch/arm/boot/dts/qcom/msm8998-pinctrl.dtsi | 54 +++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/arch/arm/boot/dts/qcom/apq8998-v2.1-mediabox.dts b/arch/arm/boot/dts/qcom/apq8998-v2.1-mediabox.dts index bc60d9a08c0b..03fa038d9413 100644 --- a/arch/arm/boot/dts/qcom/apq8998-v2.1-mediabox.dts +++ b/arch/arm/boot/dts/qcom/apq8998-v2.1-mediabox.dts @@ -28,3 +28,21 @@ &mdss_mdp { qcom,mdss-pref-prim-intf = "hdmi"; }; + +&slim_aud { + tasha_codec { + wsa_spkr_sd1: msm_cdc_pinctrll { + compatible = "qcom,msm-cdc-pinctrl"; + pinctrl-names = "aud_active", "aud_sleep"; + pinctrl-0 = <&spkr_1_sd_active_mediabox>; + pinctrl-1 = <&spkr_1_sd_sleep_mediabox>; + }; + + wsa_spkr_sd2: msm_cdc_pinctrlr { + compatible = "qcom,msm-cdc-pinctrl"; + pinctrl-names = "aud_active", "aud_sleep"; + pinctrl-0 = <&spkr_2_sd_active_mediabox>; + pinctrl-1 = <&spkr_2_sd_sleep_mediabox>; + }; + }; +}; diff --git a/arch/arm/boot/dts/qcom/msm8998-pinctrl.dtsi b/arch/arm/boot/dts/qcom/msm8998-pinctrl.dtsi index d4a2290c9b0a..bd20ae0a9b85 100644 --- a/arch/arm/boot/dts/qcom/msm8998-pinctrl.dtsi +++ b/arch/arm/boot/dts/qcom/msm8998-pinctrl.dtsi @@ -2957,5 +2957,59 @@ }; }; }; + + spkr_1_sd_mediabox { + spkr_1_sd_sleep_mediabox: spkr_1_sd_sleep_mediabox { + mux { + pins = "gpio85"; + function = "gpio"; + }; + config { + pins = "gpio85"; + drive-strength = <2>; /* 2 mA */ + bias-pull-down; + input-enable; + }; + }; + spkr_1_sd_active_mediabox: spkr_1_sd_active_mediabox { + mux { + pins = "gpio85"; + function = "gpio"; + }; + config { + pins = "gpio85"; + drive-strength = <8>; /* 8 mA */ + bias-disable; + output-high; + }; + }; + }; + + spkr_2_sd_mediabox_mediabox { + 
spkr_2_sd_sleep_mediabox: spkr_2_sd_sleep_mediabox { + mux { + pins = "gpio112"; + function = "gpio"; + }; + config { + pins = "gpio112"; + drive-strength = <2>; /* 2 mA */ + bias-pull-down; + input-enable; + }; + }; + spkr_2_sd_active_mediabox: spkr_2_sd_active_mediabox { + mux { + pins = "gpio112"; + function = "gpio"; + }; + config { + pins = "gpio112"; + drive-strength = <8>; /* 8 mA */ + bias-disable; + output-high; + }; + }; + }; }; }; From 9286838256169f5928b6e5a576c46a385eb3481e Mon Sep 17 00:00:00 2001 From: Kuirong Wang Date: Mon, 12 Dec 2016 17:47:41 -0800 Subject: [PATCH 781/813] ASoC: msm8998: add asm loopback FE DAI Add asm loopback FE DAI to support the audio asm loopback use case for msm8998 platform. CRs-Fixed: 1099656 Change-Id: Ia1c6d8dfd75eb21cde8de7b9bfcab4e4277e339f Signed-off-by: Kuirong Wang --- sound/soc/msm/msm8998.c | 44 ++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/sound/soc/msm/msm8998.c b/sound/soc/msm/msm8998.c index 7f43ef401a4f..6396cd6aaf39 100644 --- a/sound/soc/msm/msm8998.c +++ b/sound/soc/msm/msm8998.c @@ -4947,6 +4947,26 @@ static struct snd_soc_dai_link msm_tavil_fe_dai_links[] = { }, }; +static struct snd_soc_dai_link msm_common_misc_fe_dai_links[] = { + { + .name = MSM_DAILINK_NAME(ASM Loopback), + .stream_name = "MultiMedia6", + .cpu_dai_name = "MultiMedia6", + .platform_name = "msm-pcm-loopback", + .dynamic = 1, + .dpcm_playback = 1, + .dpcm_capture = 1, + .codec_dai_name = "snd-soc-dummy-dai", + .codec_name = "snd-soc-dummy", + .trigger = {SND_SOC_DPCM_TRIGGER_POST, + SND_SOC_DPCM_TRIGGER_POST}, + .ignore_suspend = 1, + .no_host_mode = SND_SOC_DAI_LINK_NO_HOST, + .ignore_pmdown_time = 1, + .be_id = MSM_FRONTEND_DAI_MULTIMEDIA6, + }, +}; + static struct snd_soc_dai_link msm_common_be_dai_links[] = { /* Backend AFE DAI Links */ { @@ -5870,6 +5890,7 @@ static struct snd_soc_dai_link msm_auxpcm_be_dai_links[] = { static struct snd_soc_dai_link msm_tasha_dai_links[ ARRAY_SIZE(msm_common_dai_links) + ARRAY_SIZE(msm_tasha_fe_dai_links) + + ARRAY_SIZE(msm_common_misc_fe_dai_links) + ARRAY_SIZE(msm_common_be_dai_links) + ARRAY_SIZE(msm_tasha_be_dai_links) + ARRAY_SIZE(msm_wcn_be_dai_links) + @@ -5880,6 +5901,7 @@ static struct snd_soc_dai_link msm_tasha_dai_links[ static struct snd_soc_dai_link msm_tavil_dai_links[ ARRAY_SIZE(msm_common_dai_links) + ARRAY_SIZE(msm_tavil_fe_dai_links) + + ARRAY_SIZE(msm_common_misc_fe_dai_links) + ARRAY_SIZE(msm_common_be_dai_links) + ARRAY_SIZE(msm_tavil_be_dai_links) + ARRAY_SIZE(msm_wcn_be_dai_links) + @@ -6213,7 +6235,7 @@ static struct snd_soc_card *populate_snd_card_dailinks(struct device *dev) { struct snd_soc_card *card = NULL; struct snd_soc_dai_link *dailink; - int len_1, len_2, len_3; + int len_1, len_2, len_3, len_4; int total_links; const struct of_device_id *match; @@ -6228,8 +6250,9 @@ static struct snd_soc_card *populate_snd_card_dailinks(struct device *dev) card = &snd_soc_card_tasha_msm; len_1 = ARRAY_SIZE(msm_common_dai_links); len_2 = len_1 + ARRAY_SIZE(msm_tasha_fe_dai_links); - len_3 = len_2 + ARRAY_SIZE(msm_common_be_dai_links); - total_links = len_3 + ARRAY_SIZE(msm_tasha_be_dai_links); + len_3 = len_2 + ARRAY_SIZE(msm_common_misc_fe_dai_links); + len_4 = len_3 + ARRAY_SIZE(msm_common_be_dai_links); + total_links = len_4 + ARRAY_SIZE(msm_tasha_be_dai_links); memcpy(msm_tasha_dai_links, msm_common_dai_links, sizeof(msm_common_dai_links)); @@ -6237,9 +6260,12 @@ static struct snd_soc_card *populate_snd_card_dailinks(struct device *dev) 
msm_tasha_fe_dai_links,
 	       sizeof(msm_tasha_fe_dai_links));
 	memcpy(msm_tasha_dai_links + len_2,
+	       msm_common_misc_fe_dai_links,
+	       sizeof(msm_common_misc_fe_dai_links));
+	memcpy(msm_tasha_dai_links + len_3,
 	       msm_common_be_dai_links,
 	       sizeof(msm_common_be_dai_links));
-	memcpy(msm_tasha_dai_links + len_3,
+	memcpy(msm_tasha_dai_links + len_4,
 	       msm_tasha_be_dai_links,
 	       sizeof(msm_tasha_be_dai_links));

@@ -6280,8 +6306,9 @@ static struct snd_soc_card *populate_snd_card_dailinks(struct device *dev)
 		card = &snd_soc_card_tavil_msm;
 		len_1 = ARRAY_SIZE(msm_common_dai_links);
 		len_2 = len_1 + ARRAY_SIZE(msm_tavil_fe_dai_links);
-		len_3 = len_2 + ARRAY_SIZE(msm_common_be_dai_links);
-		total_links = len_3 + ARRAY_SIZE(msm_tavil_be_dai_links);
+		len_3 = len_2 + ARRAY_SIZE(msm_common_misc_fe_dai_links);
+		len_4 = len_3 + ARRAY_SIZE(msm_common_be_dai_links);
+		total_links = len_4 + ARRAY_SIZE(msm_tavil_be_dai_links);
 		memcpy(msm_tavil_dai_links,
 		       msm_common_dai_links,
 		       sizeof(msm_common_dai_links));
@@ -6289,9 +6316,12 @@ static struct snd_soc_card *populate_snd_card_dailinks(struct device *dev)
 		       msm_tavil_fe_dai_links,
 		       sizeof(msm_tavil_fe_dai_links));
 		memcpy(msm_tavil_dai_links + len_2,
+		       msm_common_misc_fe_dai_links,
+		       sizeof(msm_common_misc_fe_dai_links));
+		memcpy(msm_tavil_dai_links + len_3,
 		       msm_common_be_dai_links,
 		       sizeof(msm_common_be_dai_links));
-		memcpy(msm_tavil_dai_links + len_3,
+		memcpy(msm_tavil_dai_links + len_4,
 		       msm_tavil_be_dai_links,
 		       sizeof(msm_tavil_be_dai_links));

From cfd44286e54c8f605f4c7284864508e33d946ae0 Mon Sep 17 00:00:00 2001
From: Harry Yang
Date: Tue, 6 Dec 2016 15:10:15 -0800
Subject: [PATCH 782/813] qcom-charger: smblib: drop the BOOST PFM Threshold
 change

The original change to set a 200mV boost PFM threshold, requested as a
HW workaround, is no longer needed; it is now set as the default in
SBL. Remove it.

CRs-Fixed: 1095917
Change-Id: I4e041dafa83d935a5c16c96bea7a3ea488e80d87
Signed-off-by: Harry Yang
---
 drivers/power/qcom-charger/smb-lib.c | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/drivers/power/qcom-charger/smb-lib.c b/drivers/power/qcom-charger/smb-lib.c
index 507704dd469a..f1380af1bfa1 100644
--- a/drivers/power/qcom-charger/smb-lib.c
+++ b/drivers/power/qcom-charger/smb-lib.c
@@ -780,16 +780,6 @@ static int smblib_otg_cl_config(struct smb_charger *chg, int otg_cl_ua)
 		return rc;
 	}

-	/* configure PFM/PWM mode for OTG regulator */
-	rc = smblib_masked_write(chg, DC_ENG_SSUPPLY_CFG3_REG,
-				 ENG_SSUPPLY_CFG_SKIP_TH_V0P2_BIT,
-				 otg_cl_ua > MICRO_250MA ? 1 : 0);
-	if (rc < 0) {
-		smblib_err(chg,
-			"Couldn't write DC_ENG_SSUPPLY_CFG3_REG rc=%d\n", rc);
-		return rc;
-	}
-
 	return rc;
 }

From cf58ac692b4a271d615996eb64911e27bf52ccf2 Mon Sep 17 00:00:00 2001
From: Runmin Wang
Date: Wed, 14 Dec 2016 10:56:40 -0800
Subject: [PATCH 783/813] defconfig: msm: disable AIO support for msm8998

Disable the AIO interface, which we do not use, for performance
benefits.
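With CONFIG_AIO disabled, the legacy AIO syscalls are stubbed out and
return -ENOSYS. As an illustrative userspace check (not part of this
patch), one could verify the resulting kernel with a raw syscall:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	unsigned long ctx = 0;	/* aio_context_t is an unsigned long */

	/* io_setup(nr_events, ctxp); raw syscall, no libaio needed */
	long ret = syscall(SYS_io_setup, 8, &ctx);

	if (ret == -1 && errno == ENOSYS)
		printf("kernel built without CONFIG_AIO\n");
	else if (ret == 0)
		printf("AIO available (ctx=%#lx)\n", ctx);
	else
		printf("io_setup failed: %s\n", strerror(errno));
	return 0;
}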
Change-Id: Ibbe1845c673f3a422b013d17492790c79c78d076
Signed-off-by: Runmin Wang
---
 arch/arm64/configs/msmcortex-perf_defconfig | 1 +
 arch/arm64/configs/msmcortex_defconfig      | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/arm64/configs/msmcortex-perf_defconfig b/arch/arm64/configs/msmcortex-perf_defconfig
index 60bb033be6df..e71e6b0b00f1 100644
--- a/arch/arm64/configs/msmcortex-perf_defconfig
+++ b/arch/arm64/configs/msmcortex-perf_defconfig
@@ -29,6 +29,7 @@ CONFIG_BLK_DEV_INITRD=y
 # CONFIG_RD_LZ4 is not set
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_KALLSYMS_ALL=y
+# CONFIG_AIO is not set
 # CONFIG_MEMBARRIER is not set
 CONFIG_EMBEDDED=y
 # CONFIG_SLUB_DEBUG is not set
diff --git a/arch/arm64/configs/msmcortex_defconfig b/arch/arm64/configs/msmcortex_defconfig
index 1ef2d90f13de..e1f9d61e74db 100644
--- a/arch/arm64/configs/msmcortex_defconfig
+++ b/arch/arm64/configs/msmcortex_defconfig
@@ -29,6 +29,7 @@ CONFIG_BLK_DEV_INITRD=y
 # CONFIG_RD_LZ4 is not set
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_KALLSYMS_ALL=y
+# CONFIG_AIO is not set
 # CONFIG_MEMBARRIER is not set
 CONFIG_EMBEDDED=y
 # CONFIG_COMPAT_BRK is not set

From 3366a508ffb6b0698dd309d1ca19a66522b886b1 Mon Sep 17 00:00:00 2001
From: Syed Rameez Mustafa
Date: Tue, 13 Dec 2016 16:50:58 -0800
Subject: [PATCH 784/813] Revert "sched/cputime: Fix steal time accounting vs.
 CPU hotplug"

This reverts commit 2a8225ef46968444fb1c4c632ec28e4cc2be633f
("sched/cputime: Fix steal time accounting vs. CPU hotplug"). The
commit introduces a bug in scheduler book-keeping whereby if a CPU is
hotplugged out for a small duration of time, subsequent time spent
executing tasks gets mis-attributed to interrupts. This in turn
results in unfair scheduling whereby a task can keep executing for a
very long time without any update to its vruntime. This revert has no
side effects for msm based systems.

Change-Id: Ibb506824c4223551bceb449594ac99f9dfd8064b
Signed-off-by: Syed Rameez Mustafa
---
 kernel/sched/core.c  |  1 -
 kernel/sched/sched.h | 13 -------------
 2 files changed, 14 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ee708909dc17..307f430e5bd5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6066,7 +6066,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 			set_window_start(rq);
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 		rq->calc_load_update = calc_load_update;
-		account_reset_rq(rq);
 		break;

 	case CPU_ONLINE:
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f569c6fe3cbb..6f8123ca878b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2534,16 +2534,3 @@ static inline u64 irq_time_read(int cpu)
 }
 #endif /* CONFIG_64BIT */
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-static inline void account_reset_rq(struct rq *rq)
-{
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-	rq->prev_irq_time = 0;
-#endif
-#ifdef CONFIG_PARAVIRT
-	rq->prev_steal_time = 0;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
-	rq->prev_steal_time_rq = 0;
-#endif
-}

From e18221c7d0b46cd4e12db77581741a6c49503ebb Mon Sep 17 00:00:00 2001
From: Harry Yang
Date: Mon, 5 Dec 2016 14:59:11 -0800
Subject: [PATCH 785/813] qcom-charger: set optimal boost switching frequency

Currently, the default boost switching frequency is fixed at 800KHz,
which shows low efficiency at low USB load. To improve this, use the
default 800KHz only when the boost current goes above 100mA; otherwise,
switch to 2MHz.
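The selection rule itself is simple. A hedged sketch of the decision,
mirroring what smblib_set_prop_boost_current() does in this patch but
with simplified standalone types:

#include <stdio.h>

#define FSW_2MHZ		2000	/* kHz */
#define FSW_800KHZ_RESET	800	/* kHz, hardware default */

/* Pick the boost switching frequency for a given boost load. */
static int boost_fsw_khz(int boost_ua, int threshold_ua)
{
	return boost_ua <= threshold_ua ? FSW_2MHZ : FSW_800KHZ_RESET;
}

int main(void)
{
	int threshold_ua = 100000;	/* 100mA default from DT */

	printf("50mA  -> %d kHz\n", boost_fsw_khz(50000, threshold_ua));
	printf("500mA -> %d kHz\n", boost_fsw_khz(500000, threshold_ua));
	return 0;
}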
CRs-Fixed: 1095917 Change-Id: Iebb62303febfb78738ada4553fdb9b9eb4810ed5 Signed-off-by: Harry Yang --- .../bindings/power/qcom-charger/qpnp-smb2.txt | 6 ++++ drivers/power/qcom-charger/qpnp-smb2.c | 23 +++++++++++++++ drivers/power/qcom-charger/smb-lib.c | 29 ++++++++++++++++++- drivers/power/qcom-charger/smb-lib.h | 5 ++++ drivers/power/qcom-charger/smb-reg.h | 1 + 5 files changed, 63 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/power/qcom-charger/qpnp-smb2.txt b/Documentation/devicetree/bindings/power/qcom-charger/qpnp-smb2.txt index 382587ea5922..94a1fdceec8f 100644 --- a/Documentation/devicetree/bindings/power/qcom-charger/qpnp-smb2.txt +++ b/Documentation/devicetree/bindings/power/qcom-charger/qpnp-smb2.txt @@ -64,6 +64,12 @@ Charger specific properties: Value type: Definition: Specifies the DC input current limit in micro-amps. +- qcom,boost-threshold-ua + Usage: optional + Value type: + Definition: Specifies the boost current threshold in micro-amps. + If the value is not present, 100mA is used as default. + - qcom,wipower-max-uw Usage: optional Value type: diff --git a/drivers/power/qcom-charger/qpnp-smb2.c b/drivers/power/qcom-charger/qpnp-smb2.c index e73ed2f1d288..543189ae5498 100644 --- a/drivers/power/qcom-charger/qpnp-smb2.c +++ b/drivers/power/qcom-charger/qpnp-smb2.c @@ -210,6 +210,13 @@ static struct smb_params v1_params = { .max_u = 2000, .step_u = 200, }, + .freq_boost = { + .name = "boost switching frequency", + .reg = CFG_BUCKBOOST_FREQ_SELECT_BOOST_REG, + .min_u = 600, + .max_u = 2000, + .step_u = 200, + }, }; #define STEP_CHARGING_MAX_STEPS 5 @@ -218,6 +225,7 @@ struct smb_dt_props { int usb_icl_ua; int otg_cl_ua; int dc_icl_ua; + int boost_threshold_ua; int fv_uv; int wipower_max_uw; u32 step_soc_threshold[STEP_CHARGING_MAX_STEPS - 1]; @@ -243,6 +251,7 @@ module_param_named( ); #define MICRO_1P5A 1500000 +#define MICRO_P1A 100000 static int smb2_parse_dt(struct smb2 *chip) { struct smb_charger *chg = &chip->chg; @@ -304,6 +313,12 @@ static int smb2_parse_dt(struct smb2 *chip) if (rc < 0) chip->dt.dc_icl_ua = -EINVAL; + rc = of_property_read_u32(node, + "qcom,boost-threshold-ua", + &chip->dt.boost_threshold_ua); + if (rc < 0) + chip->dt.boost_threshold_ua = MICRO_P1A; + rc = of_property_read_u32(node, "qcom,wipower-max-uw", &chip->dt.wipower_max_uw); if (rc < 0) @@ -370,6 +385,7 @@ static enum power_supply_property smb2_usb_props[] = { POWER_SUPPLY_PROP_PD_ACTIVE, POWER_SUPPLY_PROP_INPUT_CURRENT_SETTLED, POWER_SUPPLY_PROP_INPUT_CURRENT_NOW, + POWER_SUPPLY_PROP_BOOST_CURRENT, POWER_SUPPLY_PROP_PE_START, }; @@ -436,6 +452,9 @@ static int smb2_usb_get_prop(struct power_supply *psy, case POWER_SUPPLY_PROP_INPUT_CURRENT_NOW: rc = smblib_get_prop_usb_current_now(chg, val); break; + case POWER_SUPPLY_PROP_BOOST_CURRENT: + val->intval = chg->boost_current_ua; + break; case POWER_SUPPLY_PROP_PD_IN_HARD_RESET: rc = smblib_get_prop_pd_in_hard_reset(chg, val); break; @@ -490,6 +509,9 @@ static int smb2_usb_set_prop(struct power_supply *psy, case POWER_SUPPLY_PROP_PD_USB_SUSPEND_SUPPORTED: chg->system_suspend_supported = val->intval; break; + case POWER_SUPPLY_PROP_BOOST_CURRENT: + rc = smblib_set_prop_boost_current(chg, val); + break; default: pr_err("set prop %d is not supported\n", psp); rc = -EINVAL; @@ -1073,6 +1095,7 @@ static int smb2_init_hw(struct smb2 *chip) chg->otg_cl_ua = chip->dt.otg_cl_ua; chg->dcp_icl_ua = chip->dt.usb_icl_ua; + chg->boost_threshold_ua = chip->dt.boost_threshold_ua; rc = smblib_read(chg, APSD_RESULT_STATUS_REG, 
&stat); if (rc < 0) { diff --git a/drivers/power/qcom-charger/smb-lib.c b/drivers/power/qcom-charger/smb-lib.c index f1380af1bfa1..64e4354b5707 100644 --- a/drivers/power/qcom-charger/smb-lib.c +++ b/drivers/power/qcom-charger/smb-lib.c @@ -1501,7 +1501,7 @@ int smblib_get_prop_dc_current_max(struct smb_charger *chg, } /******************* - * USB PSY SETTERS * + * DC PSY SETTERS * * *****************/ int smblib_set_prop_dc_current_max(struct smb_charger *chg, @@ -1883,6 +1883,25 @@ int smblib_set_prop_usb_current_max(struct smb_charger *chg, return rc; } +#define FSW_2MHZ 2000 +#define FSW_800KHZ_RESET 800 +int smblib_set_prop_boost_current(struct smb_charger *chg, + const union power_supply_propval *val) +{ + int rc = 0; + + rc = smblib_set_charge_param(chg, &chg->param.freq_boost, + val->intval <= chg->boost_threshold_ua ? + FSW_2MHZ : FSW_800KHZ_RESET); + if (rc < 0) { + dev_err(chg->dev, "Error in setting freq_boost rc=%d\n", rc); + return rc; + } + + chg->boost_current_ua = val->intval; + return rc; +} + int smblib_set_prop_typec_power_role(struct smb_charger *chg, const union power_supply_propval *val) { @@ -2753,6 +2772,12 @@ static void typec_sink_insertion(struct smb_charger *chg) false, 0); } +static void typec_sink_removal(struct smb_charger *chg) +{ + smblib_set_charge_param(chg, &chg->param.freq_boost, FSW_800KHZ_RESET); + chg->boost_current_ua = 0; +} + static void smblib_handle_typec_removal(struct smb_charger *chg) { vote(chg->pd_disallowed_votable_indirect, CC_DETACHED_VOTER, true, 0); @@ -2772,6 +2797,7 @@ static void smblib_handle_typec_removal(struct smb_charger *chg) vote(chg->apsd_disable_votable, PD_HARD_RESET_VOTER, false, 0); typec_source_removal(chg); + typec_sink_removal(chg); smblib_update_usb_type(chg); } @@ -2789,6 +2815,7 @@ static void smblib_handle_typec_insertion(struct smb_charger *chg, typec_sink_insertion(chg); } else { typec_source_insertion(chg); + typec_sink_removal(chg); } vote(chg->pd_disallowed_votable_indirect, LEGACY_CABLE_VOTER, diff --git a/drivers/power/qcom-charger/smb-lib.h b/drivers/power/qcom-charger/smb-lib.h index b309095b04c1..fce76a0732b6 100644 --- a/drivers/power/qcom-charger/smb-lib.h +++ b/drivers/power/qcom-charger/smb-lib.h @@ -110,6 +110,7 @@ struct smb_params { struct smb_chg_param step_soc; struct smb_chg_param step_cc_delta[5]; struct smb_chg_param freq_buck; + struct smb_chg_param freq_boost; }; struct parallel_params { @@ -198,6 +199,7 @@ struct smb_charger { int voltage_max_uv; int pd_active; bool system_suspend_supported; + int boost_threshold_ua; int system_temp_level; int thermal_levels; @@ -216,6 +218,7 @@ struct smb_charger { /* workaround flag */ u32 wa_flags; enum cc2_sink_type cc2_sink_detach_flag; + int boost_current_ua; }; int smblib_read(struct smb_charger *chg, u16 addr, u8 *val); @@ -346,6 +349,8 @@ int smblib_set_prop_usb_voltage_min(struct smb_charger *chg, const union power_supply_propval *val); int smblib_set_prop_usb_voltage_max(struct smb_charger *chg, const union power_supply_propval *val); +int smblib_set_prop_boost_current(struct smb_charger *chg, + const union power_supply_propval *val); int smblib_set_prop_typec_power_role(struct smb_charger *chg, const union power_supply_propval *val); int smblib_set_prop_pd_active(struct smb_charger *chg, diff --git a/drivers/power/qcom-charger/smb-reg.h b/drivers/power/qcom-charger/smb-reg.h index c2a2b0c86d73..1986a185be4a 100644 --- a/drivers/power/qcom-charger/smb-reg.h +++ b/drivers/power/qcom-charger/smb-reg.h @@ -1000,5 +1000,6 @@ enum { #define 
SYSOK_OPTIONS_MASK			GENMASK(2, 0)

 #define CFG_BUCKBOOST_FREQ_SELECT_BUCK_REG	(MISC_BASE + 0xA0)
+#define CFG_BUCKBOOST_FREQ_SELECT_BOOST_REG	(MISC_BASE + 0xA1)

 #endif /* __SMB2_CHARGER_REG_H */

From 841179fa162f8a4f3af799f661be428eb475738f Mon Sep 17 00:00:00 2001
From: Harry Yang
Date: Wed, 7 Dec 2016 09:48:01 -0800
Subject: [PATCH 786/813] ARM: dts: msm: set boost current threshold on
 PMI8998

A boost current threshold will be referenced when deciding the optimal
boost switching frequency for efficiency.

CRs-Fixed: 1095917
Change-Id: I1e738e66cdfb72305dd4256dc0a6e05b03de585e
Signed-off-by: Harry Yang
---
 arch/arm/boot/dts/qcom/msm-pmi8998.dtsi | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/boot/dts/qcom/msm-pmi8998.dtsi b/arch/arm/boot/dts/qcom/msm-pmi8998.dtsi
index 0dc9da9289e2..ce8fa696eb99 100644
--- a/arch/arm/boot/dts/qcom/msm-pmi8998.dtsi
+++ b/arch/arm/boot/dts/qcom/msm-pmi8998.dtsi
@@ -155,6 +155,7 @@
 			"usbin_i",
 			"usbin_v";

+		qcom,boost-threshold-ua = <100000>;
 		qcom,wipower-max-uw = <5000000>;

 		dpdm-supply = <&qusb_phy0>;

From 7210624955249fb4fb936b2ca18ee9bab670fdeb Mon Sep 17 00:00:00 2001
From: Abhijeet Dharmapurikar
Date: Thu, 15 Dec 2016 16:55:32 -0800
Subject: [PATCH 787/813] smb-lib: report FULL regardless of charger presence

For a full battery, the driver currently reports battery full only when
a charger is online, and switches back to reporting discharging when
the charger is removed. This is incorrect: a user who sees a
discharging status right after removing the charger might plug it back
in, only to find that the battery was full all along. So allow the full
status to be reported even when no charger is online.

Change-Id: I0c31e2fbe984420a6754e923cfc3abe77bc54991
Signed-off-by: Abhijeet Dharmapurikar
---
 drivers/power/qcom-charger/smb-lib.c | 23 -----------------------
 1 file changed, 23 deletions(-)

diff --git a/drivers/power/qcom-charger/smb-lib.c b/drivers/power/qcom-charger/smb-lib.c
index 507704dd469a..f7dcd1eef2d0 100644
--- a/drivers/power/qcom-charger/smb-lib.c
+++ b/drivers/power/qcom-charger/smb-lib.c
@@ -1155,32 +1155,9 @@ int smblib_get_prop_batt_capacity(struct smb_charger *chg,
 int smblib_get_prop_batt_status(struct smb_charger *chg,
 				union power_supply_propval *val)
 {
-	union power_supply_propval pval = {0, };
-	bool usb_online, dc_online;
 	u8 stat;
 	int rc;

-	rc = smblib_get_prop_usb_online(chg, &pval);
-	if (rc < 0) {
-		smblib_err(chg, "Couldn't get usb online property rc=%d\n",
-			rc);
-		return rc;
-	}
-	usb_online = (bool)pval.intval;
-
-	rc = smblib_get_prop_dc_online(chg, &pval);
-	if (rc < 0) {
-		smblib_err(chg, "Couldn't get dc online property rc=%d\n",
-			rc);
-		return rc;
-	}
-	dc_online = (bool)pval.intval;
-
-	if (!usb_online && !dc_online) {
-		val->intval = POWER_SUPPLY_STATUS_DISCHARGING;
-		return rc;
-	}
-
 	rc = smblib_read(chg, BATTERY_CHARGER_STATUS_1_REG, &stat);
 	if (rc < 0) {
 		smblib_err(chg, "Couldn't read BATTERY_CHARGER_STATUS_1 rc=%d\n",

From 8599d43e8a6804fea1b9e1f56456287b75d8a45c Mon Sep 17 00:00:00 2001
From: Sahitya Tummala
Date: Mon, 12 Dec 2016 13:38:22 +0530
Subject: [PATCH 788/813] ARM: dts: msm: Add remote fs device node on
 msmfalcon

This is needed to enable the remote file system on the modem processor,
which needs access to store data onto the eMMC/UFS device. The shared
memory size used by the modem and apps for this purpose is 2MB on
msmfalcon.
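The qcom,sharedmem-uio node is exposed to userspace through the UIO
framework, so the remote-fs service can mmap the 2MB region. A minimal
sketch of such a client follows; the /dev/uio0 path and map index 0 are
assumptions for illustration, not something this patch defines:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define RMTFS_SIZE (2 * 1024 * 1024)	/* 2MB, from the reg property */

int main(void)
{
	int fd = open("/dev/uio0", O_RDWR);	/* assumed device node */
	void *shm;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* UIO maps region N at offset N * page size; region 0 here. */
	shm = mmap(NULL, RMTFS_SIZE, PROT_READ | PROT_WRITE,
		   MAP_SHARED, fd, 0);
	if (shm == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}

	printf("rmtfs shared memory mapped at %p\n", shm);
	munmap(shm, RMTFS_SIZE);
	close(fd);
	return 0;
}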
Change-Id: Id0cdd7f27b42c261966bb5e92d8229fa234445c4
Signed-off-by: Sahitya Tummala
---
 arch/arm/boot/dts/qcom/msmfalcon.dtsi | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/arm/boot/dts/qcom/msmfalcon.dtsi b/arch/arm/boot/dts/qcom/msmfalcon.dtsi
index 572a896ad795..97944307f8f3 100644
--- a/arch/arm/boot/dts/qcom/msmfalcon.dtsi
+++ b/arch/arm/boot/dts/qcom/msmfalcon.dtsi
@@ -977,6 +977,13 @@
 		};
 	};

+	qcom,rmtfs_sharedmem@0 {
+		compatible = "qcom,sharedmem-uio";
+		reg = <0x0 0x200000>;
+		reg-names = "rmtfs";
+		qcom,client-id = <0x00000001>;
+	};
+
 	qcom,rmnet-ipa {
 		compatible = "qcom,rmnet-ipa";
 		qcom,rmnet-ipa-ssr;

From fe4987d5446841de4dd9a24803e5d467248f7f74 Mon Sep 17 00:00:00 2001
From: Vic Wei
Date: Wed, 14 Dec 2016 18:47:49 -0800
Subject: [PATCH 789/813] smb-lib : Fix incorrect VALID_INPUT_POWER_SOURCE_BIT
 check

smblib_get_prop_usb_online() and smblib_get_prop_dc_online() used an
incorrect VALID_INPUT_POWER_SOURCE_BIT check. Change them to check
VALID_INPUT_POWER_SOURCE_STS_BIT instead.

Change-Id: I70489775b2a99afff8b26366758834a98b1ffb30
Signed-off-by: Vic Wei
CRs-Fixed: 1102132
---
 drivers/power/qcom-charger/smb-lib.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/power/qcom-charger/smb-lib.c b/drivers/power/qcom-charger/smb-lib.c
index 0faf8aee8aa0..879d33df9d19 100644
--- a/drivers/power/qcom-charger/smb-lib.c
+++ b/drivers/power/qcom-charger/smb-lib.c
@@ -1498,7 +1498,7 @@ int smblib_get_prop_dc_online(struct smb_charger *chg,
 		   stat);

 	val->intval = (stat & USE_DCIN_BIT) &&
-		      (stat & VALID_INPUT_POWER_SOURCE_BIT);
+		      (stat & VALID_INPUT_POWER_SOURCE_STS_BIT);

 	return rc;
 }
@@ -1564,7 +1564,7 @@ int smblib_get_prop_usb_online(struct smb_charger *chg,
 		   stat);

 	val->intval = (stat & USE_USBIN_BIT) &&
-		      (stat & VALID_INPUT_POWER_SOURCE_BIT);
+		      (stat & VALID_INPUT_POWER_SOURCE_STS_BIT);

 	return rc;
 }

From 9d34289a0bbc88d07403024446206bcd31489869 Mon Sep 17 00:00:00 2001
From: Lei Chen
Date: Thu, 15 Dec 2016 10:33:05 +0800
Subject: [PATCH 790/813] clk: msm: hdmi: HDMI 10nm HPG Rev2.0 update

HPG Rev 1.0 is a basic bring-up implementation of the HDMI PHY/PLL.
This change absorbs the changes required to upgrade to HPG Rev 2.0.
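The bring-up here is essentially a fixed sequence of register writes,
which lends itself to a table-driven style. The following standalone
sketch is illustrative only: the register offsets are invented
stand-ins, while the two values shown (PLL_IVCO = 0x7 and the lane 0/3
RES_CODE offsets = 0x3) are the Rev 2.0 settings carried by this patch.

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the driver's register offsets (values invented). */
#define PLL_IVCO_OFF			0x048
#define TX_DRV_LVL_RES_CODE_OFFSET0	0x14c

struct reg_write {
	uint32_t offset;
	uint32_t value;
};

/* HPG Rev 2.0 settings carried by this patch. */
static const struct reg_write hdmi_pll_seq[] = {
	{ PLL_IVCO_OFF, 0x7 },
	{ TX_DRV_LVL_RES_CODE_OFFSET0, 0x3 },
};

/* Models _W(); a real driver would do writel_relaxed(val, base + off). */
static void pll_write(uint32_t off, uint32_t val)
{
	printf("write 0x%x -> reg 0x%x\n", (unsigned)val, (unsigned)off);
}

int main(void)
{
	size_t i;

	for (i = 0; i < sizeof(hdmi_pll_seq) / sizeof(hdmi_pll_seq[0]); i++)
		pll_write(hdmi_pll_seq[i].offset, hdmi_pll_seq[i].value);
	return 0;
}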
CRs-Fixed: 1033918
Change-Id: I768463aaad17f7be5d3fe11d7ca23d422833cfe5
Signed-off-by: Lei Chen
---
 drivers/clk/msm/mdss/mdss-hdmi-pll-8998.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/clk/msm/mdss/mdss-hdmi-pll-8998.c b/drivers/clk/msm/mdss/mdss-hdmi-pll-8998.c
index 029f779979c7..c60c4864442f 100644
--- a/drivers/clk/msm/mdss/mdss-hdmi-pll-8998.c
+++ b/drivers/clk/msm/mdss/mdss-hdmi-pll-8998.c
@@ -494,7 +494,7 @@ static int hdmi_8998_pll_set_clk_rate(struct clk *c, unsigned long rate)
 	_W(pll, SYSCLK_EN_SEL, 0x37);
 	_W(pll, SYS_CLK_CTRL, 0x2);
 	_W(pll, CLK_ENABLE1, 0xE);
-	_W(pll, PLL_IVCO, 0xF);
+	_W(pll, PLL_IVCO, 0x7);
 	_W(pll, VCO_TUNE_CTRL, 0x0);
 	_W(pll, SVS_MODE_CLK_SEL, cfg.svs_mode_clk_sel);
 	_W(pll, CLK_SELECT, 0x30);
@@ -536,10 +536,10 @@ static int hdmi_8998_pll_set_clk_rate(struct clk *c, unsigned long rate)
 	_W(pll, PHY_TX_PRE_DRIVER_2(2), cfg.l2_pre_driver_2);
 	_W(pll, PHY_TX_PRE_DRIVER_2(3), cfg.l3_pre_driver_2);

-	_W(pll, PHY_TX_DRV_LVL_RES_CODE_OFFSET(0), 0x0);
+	_W(pll, PHY_TX_DRV_LVL_RES_CODE_OFFSET(0), 0x3);
 	_W(pll, PHY_TX_DRV_LVL_RES_CODE_OFFSET(1), 0x0);
 	_W(pll, PHY_TX_DRV_LVL_RES_CODE_OFFSET(2), 0x0);
-	_W(pll, PHY_TX_DRV_LVL_RES_CODE_OFFSET(3), 0x0);
+	_W(pll, PHY_TX_DRV_LVL_RES_CODE_OFFSET(3), 0x3);

 	_W(phy, PHY_MODE, cfg.phy_mode);

@@ -627,8 +627,6 @@ static int hdmi_8998_pll_enable(struct clk *c)
 	_W(phy, PHY_CFG, 0x59);
 	udelay(100);

-	_W(phy, PHY_CLOCK, 0x6);
-
 	/* Ensure all registers are flushed to hardware */
 	wmb();

From f8c7c6ffdfb366efce72a4df93d124659a246b8c Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri
Date: Fri, 13 May 2016 02:05:32 -0700
Subject: [PATCH 791/813] sched: Track burst length for tasks

Track the burst length for tasks as the time they run from wakeup to
sleep. This is used to predict the average time a task may run when it
wakes up, and thus avoid waking up an idle CPU for "short-burst" tasks.

Change-Id: Ie71d3163630fb8aa0db8ee8383768f8748270cf9
Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Pavankumar Kondeti
---
 include/linux/sched.h        |  1 +
 include/trace/events/sched.h |  6 ++--
 kernel/sched/core.c          |  4 ++-
 kernel/sched/hmp.c           | 55 ++++++++++++++++++++++++++++--------
 kernel/sched/sched.h         |  4 +++
 5 files changed, 56 insertions(+), 14 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9c3be2d56ac5..2a885fbf542a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1362,6 +1362,7 @@ struct ravg {
 	u32 sum_history[RAVG_HIST_SIZE_MAX];
 	u32 *curr_window_cpu, *prev_window_cpu;
 	u32 curr_window, prev_window;
+	u64 curr_burst, avg_burst;
 	u16 active_windows;
 	u32 pred_demand;
 	u8 busy_buckets[NUM_BUSY_BUCKETS];
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 72bbed9ad5db..27c5c580acc2 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -134,6 +134,7 @@ TRACE_EVENT(sched_task_load,
 		__field( int,	best_cpu	)
 		__field( u64,	latency		)
 		__field( int,	grp_id		)
+		__field( u64,	avg_burst	)
 	),

 	TP_fast_assign(
@@ -150,13 +151,14 @@ TRACE_EVENT(sched_task_load,
 			sched_ktime_clock() -
 			p->ravg.mark_start : 0;
 		__entry->grp_id = p->grp ?
p->grp->id : 0; + __entry->avg_burst = p->ravg.avg_burst; ), - TP_printk("%d (%s): demand=%u boost=%d reason=%d sync=%d need_idle=%d flags=%x grp=%d best_cpu=%d latency=%llu", + TP_printk("%d (%s): demand=%u boost=%d reason=%d sync=%d need_idle=%d flags=%x grp=%d best_cpu=%d latency=%llu avg_burst=%llu", __entry->pid, __entry->comm, __entry->demand, __entry->boost, __entry->reason, __entry->sync, __entry->need_idle, __entry->flags, __entry->grp_id, - __entry->best_cpu, __entry->latency) + __entry->best_cpu, __entry->latency, __entry->avg_burst) ); TRACE_EVENT(sched_set_preferred_cluster, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ee708909dc17..1a288e2de50d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1708,7 +1708,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) return cpu; } -static void update_avg(u64 *avg, u64 sample) +void update_avg(u64 *avg, u64 sample) { s64 diff = sample - *avg; *avg += diff >> 3; @@ -3403,6 +3403,8 @@ static void __sched notrace __schedule(bool preempt) if (likely(prev != next)) { update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); + if (!is_idle_task(prev) && !prev->on_rq) + update_avg_burst(prev); rq->nr_switches++; rq->curr = next; diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 6304c5030137..b2f3013bfe31 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -1552,6 +1552,8 @@ void init_new_task_load(struct task_struct *p, bool idle_task) INIT_LIST_HEAD(&p->grp_list); memset(&p->ravg, 0, sizeof(struct ravg)); p->cpu_cycles = 0; + p->ravg.curr_burst = 0; + p->ravg.avg_burst = 0; p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL); p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL); @@ -2738,12 +2740,14 @@ done: trace_sched_update_history(rq, p, runtime, samples, event); } -static void add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta) +static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta) { delta = scale_exec_time(delta, rq); p->ravg.sum += delta; if (unlikely(p->ravg.sum > sched_ravg_window)) p->ravg.sum = sched_ravg_window; + + return delta; } /* @@ -2796,13 +2800,14 @@ static void add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta) * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time() * depends on it! */ -static void update_task_demand(struct task_struct *p, struct rq *rq, +static u64 update_task_demand(struct task_struct *p, struct rq *rq, int event, u64 wallclock) { u64 mark_start = p->ravg.mark_start; u64 delta, window_start = rq->window_start; int new_window, nr_full_windows; u32 window_size = sched_ravg_window; + u64 runtime; new_window = mark_start < window_start; if (!account_busy_for_task_demand(p, event)) { @@ -2816,7 +2821,7 @@ static void update_task_demand(struct task_struct *p, struct rq *rq, * it is not necessary to account those. */ update_history(rq, p, p->ravg.sum, 1, event); - return; + return 0; } if (!new_window) { @@ -2824,8 +2829,7 @@ static void update_task_demand(struct task_struct *p, struct rq *rq, * The simple case - busy time contained within the existing * window. 
*/ - add_to_task_demand(rq, p, wallclock - mark_start); - return; + return add_to_task_demand(rq, p, wallclock - mark_start); } /* @@ -2837,13 +2841,16 @@ static void update_task_demand(struct task_struct *p, struct rq *rq, window_start -= (u64)nr_full_windows * (u64)window_size; /* Process (window_start - mark_start) first */ - add_to_task_demand(rq, p, window_start - mark_start); + runtime = add_to_task_demand(rq, p, window_start - mark_start); /* Push new sample(s) into task's demand history */ update_history(rq, p, p->ravg.sum, 1, event); - if (nr_full_windows) - update_history(rq, p, scale_exec_time(window_size, rq), - nr_full_windows, event); + if (nr_full_windows) { + u64 scaled_window = scale_exec_time(window_size, rq); + + update_history(rq, p, scaled_window, nr_full_windows, event); + runtime += nr_full_windows * scaled_window; + } /* * Roll window_start back to current to process any remainder @@ -2853,13 +2860,31 @@ static void update_task_demand(struct task_struct *p, struct rq *rq, /* Process (wallclock - window_start) next */ mark_start = window_start; - add_to_task_demand(rq, p, wallclock - mark_start); + runtime += add_to_task_demand(rq, p, wallclock - mark_start); + + return runtime; +} + +static inline void +update_task_burst(struct task_struct *p, struct rq *rq, int event, int runtime) +{ + /* + * update_task_demand() has checks for idle task and + * exit task. The runtime may include the wait time, + * so update the burst only for the cases where the + * task is running. + */ + if (event == PUT_PREV_TASK || (event == TASK_UPDATE && + rq->curr == p)) + p->ravg.curr_burst += runtime; } /* Reflect task activity on its demand and cpu's busy time statistics */ void update_task_ravg(struct task_struct *p, struct rq *rq, int event, u64 wallclock, u64 irqtime) { + u64 runtime; + if (!rq->window_start || sched_disable_window_stats || p->ravg.mark_start == wallclock) return; @@ -2874,7 +2899,9 @@ void update_task_ravg(struct task_struct *p, struct rq *rq, int event, } update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime); - update_task_demand(p, rq, event, wallclock); + runtime = update_task_demand(p, rq, event, wallclock); + if (runtime) + update_task_burst(p, rq, event, runtime); update_cpu_busy_time(p, rq, event, wallclock, irqtime); update_task_pred_demand(rq, p, event); done: @@ -4462,6 +4489,12 @@ bool early_detection_notify(struct rq *rq, u64 wallclock) return 0; } +void update_avg_burst(struct task_struct *p) +{ + update_avg(&p->ravg.avg_burst, p->ravg.curr_burst); + p->ravg.curr_burst = 0; +} + #ifdef CONFIG_CGROUP_SCHED u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f569c6fe3cbb..6b9f11d9a47c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1109,6 +1109,8 @@ extern int update_preferred_cluster(struct related_thread_group *grp, extern void set_preferred_cluster(struct related_thread_group *grp); extern void add_new_task_to_grp(struct task_struct *new); extern unsigned int update_freq_aggregate_threshold(unsigned int threshold); +extern void update_avg_burst(struct task_struct *p); +extern void update_avg(u64 *avg, u64 sample); enum sched_boost_policy { SCHED_BOOST_NONE, @@ -1647,6 +1649,8 @@ static inline int alloc_related_thread_groups(void) { return 0; } #define trace_sched_cpu_load_cgroup(...) #define trace_sched_cpu_load_wakeup(...) 
+static inline void update_avg_burst(struct task_struct *p) {}
+
 #endif /* CONFIG_SCHED_HMP */

 /*

From 0dee0d1411e4ba837089a769a5bcce57a5a14df2 Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri
Date: Fri, 9 Sep 2016 19:38:03 +0530
Subject: [PATCH 792/813] sched: Avoid waking idle cpu for short-burst tasks

Introduce the sched_short_burst tunable to classify "short-burst"
tasks. These tasks are eligible for packing to avoid the overhead
associated with waking up an idle CPU. select_best_cpu() ignores power
cost and selects the CPU with the least wakeup latency which is not
loaded with IRQs and can accommodate this task without exceeding spill
limits. Ties are broken by load, followed by the previous CPU. This
policy does not affect cluster selection; it only affects CPU selection
in the selected cluster. Tasks eligible for "wake-up-idle" and "boost"
are not considered for packing. This policy is applied to both "fair"
and "rt" scheduling class tasks.

Change-Id: I2a05493fde93f58636725f18d0ce8dbce4418a30
Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Pavankumar Kondeti
---
 Documentation/scheduler/sched-hmp.txt | 10 +++++++
 include/linux/sched/sysctl.h          |  1 +
 kernel/sched/fair.c                   | 39 +++++++++++++++++++++------
 kernel/sched/hmp.c                    | 17 +++++++++++-
 kernel/sched/rt.c                     | 18 +++++++++++++
 kernel/sched/sched.h                  | 10 +++++++
 kernel/sysctl.c                       |  7 +++++
 7 files changed, 93 insertions(+), 9 deletions(-)

diff --git a/Documentation/scheduler/sched-hmp.txt b/Documentation/scheduler/sched-hmp.txt
index 091d49ea80cf..766c01d321b5 100644
--- a/Documentation/scheduler/sched-hmp.txt
+++ b/Documentation/scheduler/sched-hmp.txt
@@ -726,6 +726,16 @@ d. /proc/sys/kernel/sched_select_prev_cpu_us
    Default value of sched_select_prev_cpu_us is 2000 (2ms). This can be
    turned off by setting it to 0.

+e. /proc/sys/kernel/sched_short_burst_ns
+   This threshold controls whether a task is considered as "short-burst"
+   or not. "short-burst" tasks are eligible for packing to avoid overhead
+   associated with waking up an idle CPU. "non-idle" CPUs which are not
+   loaded with IRQs and can accommodate the waking task without exceeding
+   spill limits are considered. The ties are broken with load followed
+   by previous CPU. This tunable does not affect cluster selection.
+   It only affects CPU selection in a given cluster. This packing is
+   skipped for tasks that are eligible for "wake-up-idle" and "boost".
+ **** 5.2.4 Wakeup Logic for Task "p" Wakeup task placement logic is as follows: diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 5d0899df64ff..e4aff5e6e17f 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -68,6 +68,7 @@ extern unsigned int sysctl_sched_freq_aggregate; extern unsigned int sysctl_sched_enable_thread_grouping; extern unsigned int sysctl_sched_freq_aggregate_threshold_pct; extern unsigned int sysctl_sched_prefer_sync_wakee_to_waker; +extern unsigned int sysctl_sched_short_burst; #else /* CONFIG_SCHED_HMP */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3db77aff2433..95b961dc7b14 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2591,6 +2591,7 @@ static u32 __compute_runnable_contrib(u64 n) #define SBC_FLAG_CSTATE_LOAD 0x100 #define SBC_FLAG_BEST_SIBLING 0x200 #define SBC_FLAG_WAKER_CPU 0x400 +#define SBC_FLAG_PACK_TASK 0x800 /* Cluster selection flag */ #define SBC_FLAG_COLOC_CLUSTER 0x10000 @@ -2607,6 +2608,7 @@ struct cpu_select_env { u8 sync:1; u8 ignore_prev_cpu:1; enum sched_boost_policy boost_policy; + u8 pack_task:1; int prev_cpu; DECLARE_BITMAP(candidate_list, NR_CPUS); DECLARE_BITMAP(backup_list, NR_CPUS); @@ -2958,8 +2960,17 @@ static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, { int cpu_cost; - cpu_cost = power_cost(cpu, task_load(env->p) + + /* + * We try to find the least loaded *busy* CPU irrespective + * of the power cost. + */ + if (env->pack_task) + cpu_cost = cpu_min_power_cost(cpu); + + else + cpu_cost = power_cost(cpu, task_load(env->p) + cpu_cravg_sync(cpu, env->sync)); + if (cpu_cost <= stats->min_cost) __update_cluster_stats(cpu, stats, env, cpu_cost); } @@ -3034,6 +3045,15 @@ static inline int wake_to_idle(struct task_struct *p) (p->flags & PF_WAKE_UP_IDLE) || sysctl_sched_wake_to_idle; } +static inline bool env_has_special_flags(struct cpu_select_env *env) +{ + if (env->need_idle || env->boost_policy != SCHED_BOOST_NONE || + env->reason) + return true; + + return false; +} + static inline bool bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) { @@ -3041,9 +3061,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) struct task_struct *task = env->p; struct sched_cluster *cluster; - if (env->boost_policy != SCHED_BOOST_NONE || env->reason || - !task->ravg.mark_start || - env->need_idle || !sched_short_sleep_task_threshold) + if (!task->ravg.mark_start || !sched_short_sleep_task_threshold) return false; prev_cpu = env->prev_cpu; @@ -3092,8 +3110,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) static inline bool wake_to_waker_cluster(struct cpu_select_env *env) { - return env->boost_policy == SCHED_BOOST_NONE && - !env->need_idle && !env->reason && env->sync && + return env->sync && task_load(current) > sched_big_waker_task_load && task_load(env->p) < sched_small_wakee_task_load; } @@ -3118,7 +3135,6 @@ cluster_allowed(struct task_struct *p, struct sched_cluster *cluster) return !cpumask_empty(&tmp_mask); } - /* return cheapest cpu that can fit this task */ static int select_best_cpu(struct task_struct *p, int target, int reason, int sync) @@ -3128,6 +3144,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, struct related_thread_group *grp; unsigned int sbc_flag = 0; int cpu = raw_smp_processor_id(); + bool special; struct cpu_select_env env = { .p = p, @@ -3140,6 +3157,7 @@ static int select_best_cpu(struct task_struct *p, 
int target, int reason, .rtg = NULL, .sbc_best_flag = 0, .sbc_best_cluster_flag = 0, + .pack_task = false, }; env.boost_policy = task_sched_boost(p) ? @@ -3149,6 +3167,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, bitmap_zero(env.backup_list, NR_CPUS); init_cluster_cpu_stats(&stats); + special = env_has_special_flags(&env); rcu_read_lock(); @@ -3160,7 +3179,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, clear_bit(pref_cluster->id, env.candidate_list); else env.rtg = grp; - } else { + } else if (!special) { cluster = cpu_rq(cpu)->cluster; if (wake_to_waker_cluster(&env)) { if (bias_to_waker_cpu(p, cpu)) { @@ -3181,6 +3200,10 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, } } + if (!special && is_short_burst_task(p)) { + env.pack_task = true; + sbc_flag = SBC_FLAG_PACK_TASK; + } retry: cluster = select_least_power_cluster(&env); diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index b2f3013bfe31..95e618ee1124 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -961,6 +961,13 @@ sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC; unsigned int __read_mostly sysctl_sched_restrict_cluster_spill; +/* + * Scheduler tries to avoid waking up idle CPUs for tasks running + * in short bursts. If the task average burst is less than + * sysctl_sched_short_burst nanoseconds, it is eligible for packing. + */ +unsigned int __read_mostly sysctl_sched_short_burst; + static void _update_up_down_migrate(unsigned int *up_migrate, unsigned int *down_migrate) { @@ -1553,7 +1560,13 @@ void init_new_task_load(struct task_struct *p, bool idle_task) memset(&p->ravg, 0, sizeof(struct ravg)); p->cpu_cycles = 0; p->ravg.curr_burst = 0; - p->ravg.avg_burst = 0; + /* + * Initialize the avg_burst to twice the threshold, so that + * a task would not be classified as short burst right away + * after fork. It takes at least 6 sleep-wakeup cycles for + * the avg_burst to go below the threshold. + */ + p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst; p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL); p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL); @@ -2987,6 +3000,8 @@ void reset_task_stats(struct task_struct *p) p->ravg.curr_window_cpu = curr_window_ptr; p->ravg.prev_window_cpu = prev_window_ptr; + p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst; + /* Retain EXITING_TASK marker */ p->ravg.sum_history[0] = sum; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 52edd6b158ed..624bededfb85 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1679,6 +1679,7 @@ static int find_lowest_rq_hmp(struct task_struct *task) int i; int restrict_cluster; int boost_on_big; + int pack_task, wakeup_latency, least_wakeup_latency = INT_MAX; boost_on_big = sched_boost() == FULL_THROTTLE_BOOST && sched_boost_policy() == SCHED_BOOST_ON_BIG; @@ -1695,6 +1696,8 @@ static int find_lowest_rq_hmp(struct task_struct *task) if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) return best_cpu; /* No targets found */ + pack_task = is_short_burst_task(task); + /* * At this point we have built a mask of cpus representing the * lowest priority tasks in the system. 
Now we want to elect @@ -1720,6 +1723,20 @@ static int find_lowest_rq_hmp(struct task_struct *task) if (!restrict_cluster) cpu_load = scale_load_to_cpu(cpu_load, i); + if (pack_task) { + wakeup_latency = cpu_rq(i)->wakeup_latency; + + if (wakeup_latency > least_wakeup_latency) + continue; + + if (wakeup_latency < least_wakeup_latency) { + least_wakeup_latency = wakeup_latency; + min_load = cpu_load; + best_cpu = i; + continue; + } + } + if (cpu_load < min_load || (cpu_load == min_load && (i == prev_cpu || (best_cpu != prev_cpu && @@ -1728,6 +1745,7 @@ static int find_lowest_rq_hmp(struct task_struct *task) best_cpu = i; } } + if (restrict_cluster && best_cpu != -1) break; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6b9f11d9a47c..b9a109e5ef94 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1188,6 +1188,11 @@ static inline int cpu_max_power_cost(int cpu) return cpu_rq(cpu)->cluster->max_power_cost; } +static inline int cpu_min_power_cost(int cpu) +{ + return cpu_rq(cpu)->cluster->min_power_cost; +} + static inline u32 cpu_cycles_to_freq(u64 cycles, u32 period) { return div64_u64(cycles, period); } @@ -1385,6 +1390,11 @@ static inline u64 cpu_cravg_sync(int cpu, int sync) return load; } +static inline bool is_short_burst_task(struct task_struct *p) +{ + return p->ravg.avg_burst < sysctl_sched_short_burst; +} + extern void check_for_migration(struct rq *rq, struct task_struct *p); extern void pre_big_task_count_change(const struct cpumask *cpus); extern void post_big_task_count_change(const struct cpumask *cpus); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b7cbd7940f7b..d4682d0cdeb1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -507,6 +507,13 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &three, }, + { + .procname = "sched_short_burst_ns", + .data = &sysctl_sched_short_burst, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif /* CONFIG_SCHED_HMP */ #ifdef CONFIG_SCHED_DEBUG { From 92dc28458ccc3ab3aff715c77c744d0bdebd7506 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Fri, 9 Sep 2016 19:50:27 +0530 Subject: [PATCH 793/813] sched: Track average sleep time Similar to tracking the average burst length for tasks, the average sleep time indicates how long a task sleeps on average before waking up to run. Very low sleep and burst lengths indicate tasks that could be sensitive to task-wake latencies and hence should not be packed.
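As a rough sketch of the intended classification (illustrative only, not this series' exact code: the sleep threshold below is a hypothetical stand-in, and the averaging mirrors the scheduler's update_avg()-style 1/8 weighting):

    /* EWMA update, comparable to the scheduler's update_avg() helper */
    static void ewma_update(u64 *avg, u64 sample)
    {
            s64 diff = sample - *avg;

            *avg += diff / 8;
    }

    /* Illustrative sleep threshold; not a tunable from this series */
    static const u64 hypothetical_sleep_threshold_ns = 100000;

    /* Pack only tasks whose recent bursts and sleeps are both short */
    static bool packable(struct ravg *ra)
    {
            return ra->avg_burst < sysctl_sched_short_burst &&
                   ra->avg_sleep_time < hypothetical_sleep_threshold_ns;
    }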
Change-Id: Ife68a9a9a9e596246aab5029f60e41c5bad781e4 Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Pavankumar Kondeti --- include/linux/sched.h | 2 +- include/trace/events/sched.h | 7 +++++-- kernel/sched/core.c | 4 ++-- kernel/sched/hmp.c | 14 +++++++++----- kernel/sched/sched.h | 4 ++-- 5 files changed, 19 insertions(+), 12 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 2a885fbf542a..7b5135f04233 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1362,7 +1362,7 @@ struct ravg { u32 sum_history[RAVG_HIST_SIZE_MAX]; u32 *curr_window_cpu, *prev_window_cpu; u32 curr_window, prev_window; - u64 curr_burst, avg_burst; + u64 curr_burst, avg_burst, avg_sleep_time; u16 active_windows; u32 pred_demand; u8 busy_buckets[NUM_BUSY_BUCKETS]; diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 27c5c580acc2..c3e41f5f494c 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -135,6 +135,7 @@ TRACE_EVENT(sched_task_load, __field( u64, latency ) __field( int, grp_id ) __field( u64, avg_burst ) + __field( u64, avg_sleep ) ), TP_fast_assign( @@ -152,13 +153,15 @@ TRACE_EVENT(sched_task_load, p->ravg.mark_start : 0; __entry->grp_id = p->grp ? p->grp->id : 0; __entry->avg_burst = p->ravg.avg_burst; + __entry->avg_sleep = p->ravg.avg_sleep_time; ), - TP_printk("%d (%s): demand=%u boost=%d reason=%d sync=%d need_idle=%d flags=%x grp=%d best_cpu=%d latency=%llu avg_burst=%llu", + TP_printk("%d (%s): demand=%u boost=%d reason=%d sync=%d need_idle=%d flags=%x grp=%d best_cpu=%d latency=%llu avg_burst=%llu avg_sleep=%llu", __entry->pid, __entry->comm, __entry->demand, __entry->boost, __entry->reason, __entry->sync, __entry->need_idle, __entry->flags, __entry->grp_id, - __entry->best_cpu, __entry->latency, __entry->avg_burst) + __entry->best_cpu, __entry->latency, __entry->avg_burst, + __entry->avg_sleep) ); TRACE_EVENT(sched_set_preferred_cluster, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1a288e2de50d..1a48b0fc4d3f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2100,7 +2100,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) set_task_cpu(p, cpu); } - set_task_last_wake(p, wallclock); + note_task_waking(p, wallclock); #endif /* CONFIG_SMP */ ttwu_queue(p, cpu); stat: @@ -2169,7 +2169,7 @@ static void try_to_wake_up_local(struct task_struct *p) update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); ttwu_activate(rq, p, ENQUEUE_WAKEUP); - set_task_last_wake(p, wallclock); + note_task_waking(p, wallclock); } ttwu_do_wakeup(rq, p, 0); diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 95e618ee1124..158fc748873b 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -74,11 +74,6 @@ inline void clear_ed_task(struct task_struct *p, struct rq *rq) rq->ed_task = NULL; } -inline void set_task_last_wake(struct task_struct *p, u64 wallclock) -{ - p->last_wake_ts = wallclock; -} - inline void set_task_last_switch_out(struct task_struct *p, u64 wallclock) { p->last_switch_out_ts = wallclock; @@ -1567,6 +1562,7 @@ void init_new_task_load(struct task_struct *p, bool idle_task) * the avg_burst to go below the threshold. 
*/ p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst; + p->ravg.avg_sleep_time = 0; p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL); p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL); @@ -4510,6 +4506,14 @@ void update_avg_burst(struct task_struct *p) p->ravg.curr_burst = 0; } +void note_task_waking(struct task_struct *p, u64 wallclock) +{ + u64 sleep_time = wallclock - p->last_switch_out_ts; + + p->last_wake_ts = wallclock; + update_avg(&p->ravg.avg_sleep_time, sleep_time); +} + #ifdef CONFIG_CGROUP_SCHED u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b9a109e5ef94..74035e1605ec 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1092,7 +1092,7 @@ extern void mark_task_starting(struct task_struct *p); extern void set_window_start(struct rq *rq); extern void migrate_sync_cpu(int cpu, int new_cpu); extern void update_cluster_topology(void); -extern void set_task_last_wake(struct task_struct *p, u64 wallclock); +extern void note_task_waking(struct task_struct *p, u64 wallclock); extern void set_task_last_switch_out(struct task_struct *p, u64 wallclock); extern void init_clusters(void); extern int __init set_sched_enable_hmp(char *str); @@ -1490,7 +1490,7 @@ static inline void set_window_start(struct rq *rq) { } static inline void migrate_sync_cpu(int cpu, int new_cpu) {} static inline void init_clusters(void) {} static inline void update_cluster_topology(void) { } -static inline void set_task_last_wake(struct task_struct *p, u64 wallclock) { } +static inline void note_task_waking(struct task_struct *p, u64 wallclock) { } static inline void set_task_last_switch_out(struct task_struct *p, u64 wallclock) { } From 2352f5e1b49b96ad97b80b698c7e88139cad9dfe Mon Sep 17 00:00:00 2001 From: "Raju P.L.S.S.S.N" Date: Fri, 4 Nov 2016 17:30:51 +0530 Subject: [PATCH 794/813] ARM: dts: msm: Enable APSS-RPM communication for msmfalcon Remove rpm-standalone flag under rpm_bus to allow communication between APPS and RPM processors for msmfalcon. Change-Id: I1736dc8a2a34ece02aaec70c867c7a0993fa82c7 Signed-off-by: Raju P.L.S.S.S.N --- arch/arm/boot/dts/qcom/msmfalcon.dtsi | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/boot/dts/qcom/msmfalcon.dtsi b/arch/arm/boot/dts/qcom/msmfalcon.dtsi index 572a896ad795..abaa950f8574 100644 --- a/arch/arm/boot/dts/qcom/msmfalcon.dtsi +++ b/arch/arm/boot/dts/qcom/msmfalcon.dtsi @@ -1163,7 +1163,6 @@ compatible = "qcom,rpm-glink"; qcom,glink-edge = "rpm"; rpm-channel-name = "rpm_requests"; - rpm-standalone; /* TODO: remove this after bring up */ }; qcom,ipc_router { From cfc8db22aea588a7aeae9f849a727a0ce8479cc0 Mon Sep 17 00:00:00 2001 From: Vinayak Menon Date: Mon, 19 Sep 2016 12:44:15 +0530 Subject: [PATCH 795/813] mm: vmpressure: make vmpressure window variable Right now the vmpressure window is of constant size 2MB, which works well with the following exceptions. 1) False vmpressure triggers are seen when the RAM size is greater than 3GB. This results in lowmemorykiller, which uses vmpressure events, killing tasks unnecessarily. 2) Vmpressure events are received late under memory pressure. This behaviour is seen prominently in <=2GB RAM targets. This results in lowmemorykiller kicking in late to kill tasks resulting in avoidable page cache reclaim. The problem analysis shows that the issue is with the constant size of the vmpressure window which does not adapt to the varying memory conditions. 
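For a rough sense of the adaptive sizing that follows (numbers illustrative, assuming 4 KB pages): with 100 MB of free plus cached memory (25600 pages), a square-root-sized window is 160 pages (640 KB), so pressure events fire promptly; with 2 GB (524288 pages) the window grows to roughly 724 pages (about 2.8 MB), damping spurious events on large-RAM targets.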
This patch recalculates the vmpressure window size at the end of each window. The chosen window size scales with the total of free and cached memory at that point, though sub-linearly (via its square root). Change-Id: I7e9ef4ddd82e2c2dd04ce09ec8d58a8829cfb64d Signed-off-by: Vinayak Menon --- mm/vmpressure.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 75b7ffe9e7a3..f514dc40dab1 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -41,7 +41,7 @@ * TODO: Make the window size depend on machine size, as we do for vmstat * thresholds. Currently we set it to 512 pages (2MB for 4KB pages). */ -static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; +static unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; /* * These thresholds are used when we account memory pressure through @@ -290,6 +290,29 @@ void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, schedule_work(&vmpr->work); } +void calculate_vmpressure_win(void) +{ + long x; + + x = global_page_state(NR_FILE_PAGES) - + global_page_state(NR_SHMEM) - + total_swapcache_pages() + + global_page_state(NR_FREE_PAGES); + if (x < 1) + x = 1; + /* + * For low (free + cached), vmpressure window should be + * small, and high for higher values of (free + cached). + * But it should not be linear either. This ensures + * timely vmpressure notifications when system is under + * memory pressure, and optimal number of events when + * cached is high. The square root function is empirically + * found to serve the purpose. + */ + x = int_sqrt(x); + vmpressure_win = x; +} + void vmpressure_global(gfp_t gfp, unsigned long scanned, unsigned long reclaimed) { @@ -304,6 +327,9 @@ void vmpressure_global(gfp_t gfp, unsigned long scanned, return; spin_lock(&vmpr->sr_lock); + if (!vmpr->scanned) + calculate_vmpressure_win(); + vmpr->scanned += scanned; vmpr->reclaimed += reclaimed; From 4600adf60ef54d10c86ffa2b9a0217b1df421d29 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 13 Oct 2016 17:42:09 +0100 Subject: [PATCH 796/813] arm64: kaslr: fix breakage with CONFIG_MODVERSIONS=y As it turns out, the KASLR code breaks CONFIG_MODVERSIONS, since the kcrctab has an absolute address field that is relocated at runtime when the kernel offset is randomized. This was already fixed for PowerPC in the past, so simply wire up the existing code dealing with this issue.
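A minimal sketch of what that wired-up code does (illustrative; the helper name is made up, and reloc_start is the kimage_vaddr - KIMAGE_VADDR offset this patch defines):

    /*
     * Sketch only: kcrctab entries on a KASLR kernel carry the boot-time
     * random offset, so strip it before comparing against a module's CRC.
     */
    static unsigned long kcrctab_value(unsigned long entry)
    {
            return entry - (unsigned long)reloc_start;
    }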
Change-Id: Ib01e32f2c5cd3d6feff6321a897ed8d276bc3be3 Cc: Fixes: f80fb3a3d508 ("arm64: add support for kernel ASLR") Tested-by: Timur Tabi Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Git-commit: 9c0e83c371cf4696926c95f9c8c77cd6ea803426 Git-repo: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git Signed-off-by: Runmin Wang --- arch/arm64/include/asm/module.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h index e12af6754634..06ff7fd9e81f 100644 --- a/arch/arm64/include/asm/module.h +++ b/arch/arm64/include/asm/module.h @@ -17,6 +17,7 @@ #define __ASM_MODULE_H #include +#include #define MODULE_ARCH_VERMAGIC "aarch64" @@ -32,6 +33,10 @@ u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela, Elf64_Sym *sym); #ifdef CONFIG_RANDOMIZE_BASE +#ifdef CONFIG_MODVERSIONS +#define ARCH_RELOCATES_KCRCTAB +#define reloc_start (kimage_vaddr - KIMAGE_VADDR) +#endif extern u64 module_alloc_base; #else #define module_alloc_base ((u64)_etext - MODULES_VSIZE) From 01fd9333a31961459cac18a9aee0a2bb3d740a41 Mon Sep 17 00:00:00 2001 From: Taniya Das Date: Thu, 12 Feb 2015 12:06:12 +0530 Subject: [PATCH 797/813] qcom: scm: Support register r6 to pass the session id A non-atomic scm call can be interrupted; trustzone stores the session id in a register (r6), which it uses when it resumes the call. To keep the compiler from using r6, HLOS now uses it to send a zero before making the scm call. Change-Id: I2927efe04fd019d551aae5e6548da3fb5c18b655 Signed-off-by: Taniya Das --- drivers/soc/qcom/scm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/soc/qcom/scm.c b/drivers/soc/qcom/scm.c index f2216f968319..045a5001fc9f 100644 --- a/drivers/soc/qcom/scm.c +++ b/drivers/soc/qcom/scm.c @@ -134,6 +134,7 @@ struct scm_response { #define R3_STR "r3" #define R4_STR "r4" #define R5_STR "r5" +#define R6_STR "r6" #endif @@ -481,6 +482,7 @@ static int __scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5, register u32 r3 asm("r3") = w3; register u32 r4 asm("r4") = w4; register u32 r5 asm("r5") = w5; + register u32 r6 asm("r6") = 0; do { asm volatile( @@ -494,13 +496,14 @@ static int __scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5, __asmeq("%7", R3_STR) __asmeq("%8", R4_STR) __asmeq("%9", R5_STR) + __asmeq("%10", R6_STR) #ifdef REQUIRES_SEC ".arch_extension sec\n" #endif "smc #0\n" : "=r" (r0), "=r" (r1), "=r" (r2), "=r" (r3) : "r" (r0), "r" (r1), "r" (r2), "r" (r3), "r" (r4), - "r" (r5)); + "r" (r5), "r" (r6)); } while (r0 == SCM_INTERRUPTED); From d3cb6fa63a8e5566d3fe4a0f13c77d4474e1ea1e Mon Sep 17 00:00:00 2001 From: Siddartha Mohanadoss Date: Mon, 12 Dec 2016 15:45:22 -0800 Subject: [PATCH 798/813] iio: rradc: Update reading BATT_ID channel The RRADC driver needs to return fresh results for the BATT_ID channel, which could take up to 1.5 seconds in round robin mode. Switch the RRADC driver to continuous mode when a client requests the BATT_ID channel to obtain the results faster. In addition, explicitly trigger the batt_id to force a conversion on the new read. The existing USBIN_V channel also requires having the RRADC in continuous mode, so add common support for clients that use continuous mode.
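The shape of that common support is a bounded-retry status poll; a condensed sketch (the helper name is illustrative, constants and the rradc_read() signature are as in the diff below):

    /* Poll a status register until the ready mask is set or retries expire */
    static int poll_channel_ready(struct rradc_chip *chip, u16 sts, u8 mask)
    {
            u8 val = 0;
            int rc, tries;

            for (tries = 0; tries < FG_RR_CONV_MAX_RETRY_CNT; tries++) {
                    rc = rradc_read(chip, sts, &val, 1);
                    if (rc < 0)
                            return rc;
                    if ((val & mask) == mask)
                            return 0;       /* conversion ready */
                    usleep_range(FG_RR_CONV_CONTINUOUS_TIME_MIN_US,
                                 FG_RR_CONV_CONTINUOUS_TIME_MAX_US);
            }

            return -ENODATA;        /* channel never became ready */
    }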
Change-Id: I5b89ed9f0015bc6db3adce22e3ac6d0d0283e2bf Signed-off-by: Siddartha Mohanadoss --- drivers/iio/adc/qcom-rradc.c | 236 ++++++++++++++++++++++++----------- 1 file changed, 162 insertions(+), 74 deletions(-) diff --git a/drivers/iio/adc/qcom-rradc.c b/drivers/iio/adc/qcom-rradc.c index 0e8fda8b9080..ec774917f4a4 100644 --- a/drivers/iio/adc/qcom-rradc.c +++ b/drivers/iio/adc/qcom-rradc.c @@ -28,9 +28,8 @@ #define FG_ADC_RR_SKIN_TEMP_MSB 0x51 #define FG_ADC_RR_RR_ADC_CTL 0x52 #define FG_ADC_RR_ADC_CTL_CONTINUOUS_SEL_MASK 0x8 -#define FG_ADC_RR_ADC_CTL_CONTINUOUS_SEL BIT(3) +#define FG_ADC_RR_ADC_CTL_CONTINUOUS_SEL BIT(3) #define FG_ADC_RR_ADC_LOG 0x53 -#define FG_ADC_RR_ADC_LOG_CLR_CTRL_MASK 0xFE #define FG_ADC_RR_ADC_LOG_CLR_CTRL BIT(0) #define FG_ADC_RR_FAKE_BATT_LOW_LSB 0x58 @@ -40,6 +39,7 @@ #define FG_ADC_RR_BATT_ID_CTRL 0x60 #define FG_ADC_RR_BATT_ID_TRIGGER 0x61 +#define FG_ADC_RR_BATT_ID_TRIGGER_CTL BIT(0) #define FG_ADC_RR_BATT_ID_STS 0x62 #define FG_ADC_RR_BATT_ID_CFG 0x63 #define FG_ADC_RR_BATT_ID_5_LSB 0x66 @@ -182,9 +182,11 @@ #define FG_RR_ADC_COHERENT_CHECK_RETRY 5 #define FG_RR_ADC_MAX_CONTINUOUS_BUFFER_LEN 16 #define FG_RR_ADC_STS_CHANNEL_READING_MASK 0x3 +#define FG_RR_ADC_STS_CHANNEL_STS 0x2 -#define FG_RR_CONV_CONTINUOUS_TIME_MIN 80000 -#define FG_RR_CONV_CONTINUOUS_TIME_MAX 81000 +#define FG_RR_CONV_CONTINUOUS_TIME_MIN_US 50000 +#define FG_RR_CONV_CONTINUOUS_TIME_MAX_US 51000 +#define FG_RR_CONV_MAX_RETRY_CNT 50 /* * The channel number is not a physical index in hardware, @@ -570,40 +572,157 @@ static const struct rradc_channels rradc_chans[] = { FG_ADC_RR_AUX_THERM_STS) }; +static int rradc_enable_continuous_mode(struct rradc_chip *chip) +{ + int rc = 0; + + /* Clear channel log */ + rc = rradc_masked_write(chip, FG_ADC_RR_ADC_LOG, + FG_ADC_RR_ADC_LOG_CLR_CTRL, + FG_ADC_RR_ADC_LOG_CLR_CTRL); + if (rc < 0) { + pr_err("log ctrl update to clear failed:%d\n", rc); + return rc; + } + + rc = rradc_masked_write(chip, FG_ADC_RR_ADC_LOG, + FG_ADC_RR_ADC_LOG_CLR_CTRL, 0); + if (rc < 0) { + pr_err("log ctrl update to not clear failed:%d\n", rc); + return rc; + } + + /* Switch to continuous mode */ + rc = rradc_masked_write(chip, FG_ADC_RR_RR_ADC_CTL, + FG_ADC_RR_ADC_CTL_CONTINUOUS_SEL_MASK, + FG_ADC_RR_ADC_CTL_CONTINUOUS_SEL); + if (rc < 0) { + pr_err("Update to continuous mode failed:%d\n", rc); + return rc; + } + + return rc; +} + +static int rradc_disable_continuous_mode(struct rradc_chip *chip) +{ + int rc = 0; + + /* Switch to non continuous mode */ + rc = rradc_masked_write(chip, FG_ADC_RR_RR_ADC_CTL, + FG_ADC_RR_ADC_CTL_CONTINUOUS_SEL_MASK, 0); + if (rc < 0) { + pr_err("Update to non-continuous mode failed:%d\n", rc); + return rc; + } + + return rc; +} + +static int rradc_check_status_ready_with_retry(struct rradc_chip *chip, + struct rradc_chan_prop *prop, u8 *buf, u16 status) +{ + int rc = 0, retry_cnt = 0, mask = 0; + + switch (prop->channel) { + case RR_ADC_BATT_ID: + /* BATT_ID STS bit does not get set initially */ + mask = FG_RR_ADC_STS_CHANNEL_STS; + break; + default: + mask = FG_RR_ADC_STS_CHANNEL_READING_MASK; + break; + } + + while (((buf[0] & mask) != mask) && + (retry_cnt < FG_RR_CONV_MAX_RETRY_CNT)) { + pr_debug("%s is not ready; nothing to read:0x%x\n", + rradc_chans[prop->channel].datasheet_name, buf[0]); + usleep_range(FG_RR_CONV_CONTINUOUS_TIME_MIN_US, + FG_RR_CONV_CONTINUOUS_TIME_MAX_US); + retry_cnt++; + rc = rradc_read(chip, status, buf, 1); + if (rc < 0) { + pr_err("status read failed:%d\n", rc); + return rc; + } + } + + if (retry_cnt >= 
FG_RR_CONV_MAX_RETRY_CNT) + rc = -ENODATA; + + return rc; +} + +static int rradc_read_channel_with_continuous_mode(struct rradc_chip *chip, + struct rradc_chan_prop *prop, u8 *buf) +{ + int rc = 0; + u16 status = 0; + + rc = rradc_enable_continuous_mode(chip); + if (rc < 0) { + pr_err("Failed to switch to continuous mode\n"); + return rc; + } + + status = rradc_chans[prop->channel].sts; + rc = rradc_read(chip, status, buf, 1); + if (rc < 0) { + pr_err("status read failed:%d\n", rc); + return rc; + } + + rc = rradc_check_status_ready_with_retry(chip, prop, + buf, status); + if (rc < 0) { + pr_err("Status read failed:%d\n", rc); + return rc; + } + + rc = rradc_disable_continuous_mode(chip); + if (rc < 0) { + pr_err("Failed to switch to non continuous mode\n"); + return rc; + } + + return rc; +} + static int rradc_do_conversion(struct rradc_chip *chip, struct rradc_chan_prop *prop, u16 *data) { - int rc = 0, bytes_to_read = 0, retry = 0; + int rc = 0, bytes_to_read = 0; u8 buf[6]; u16 offset = 0, batt_id_5 = 0, batt_id_15 = 0, batt_id_150 = 0; u16 status = 0; mutex_lock(&chip->lock); - if ((prop->channel != RR_ADC_BATT_ID) && - (prop->channel != RR_ADC_CHG_HOT_TEMP) && - (prop->channel != RR_ADC_CHG_TOO_HOT_TEMP) && - (prop->channel != RR_ADC_SKIN_HOT_TEMP) && - (prop->channel != RR_ADC_SKIN_TOO_HOT_TEMP) && - (prop->channel != RR_ADC_USBIN_V)) { - /* BATT_ID STS bit does not get set initially */ - status = rradc_chans[prop->channel].sts; - rc = rradc_read(chip, status, buf, 1); + switch (prop->channel) { + case RR_ADC_BATT_ID: + rc = rradc_masked_write(chip, FG_ADC_RR_BATT_ID_TRIGGER, + FG_ADC_RR_BATT_ID_TRIGGER_CTL, + FG_ADC_RR_BATT_ID_TRIGGER_CTL); if (rc < 0) { - pr_err("status read failed:%d\n", rc); + pr_err("BATT_ID trigger set failed:%d\n", rc); goto fail; } - buf[0] &= FG_RR_ADC_STS_CHANNEL_READING_MASK; - if (buf[0] != FG_RR_ADC_STS_CHANNEL_READING_MASK) { - pr_debug("%s is not ready; nothing to read\n", - rradc_chans[prop->channel].datasheet_name); - rc = -ENODATA; + rc = rradc_read_channel_with_continuous_mode(chip, prop, buf); + if (rc < 0) { + pr_err("Error reading in continuous mode:%d\n", rc); goto fail; } - } - if (prop->channel == RR_ADC_USBIN_V) { + rc = rradc_masked_write(chip, FG_ADC_RR_BATT_ID_TRIGGER, + FG_ADC_RR_BATT_ID_TRIGGER_CTL, 0); + if (rc < 0) { + pr_err("BATT_ID trigger re-set failed:%d\n", rc); + goto fail; + } + break; + case RR_ADC_USBIN_V: /* Force conversion every cycle */ rc = rradc_masked_write(chip, FG_ADC_RR_USB_IN_V_TRIGGER, FG_ADC_RR_USB_IN_V_EVERY_CYCLE_MASK, @@ -613,58 +732,9 @@ static int rradc_do_conversion(struct rradc_chip *chip, goto fail; } - /* Clear channel log */ - rc = rradc_masked_write(chip, FG_ADC_RR_ADC_LOG, - FG_ADC_RR_ADC_LOG_CLR_CTRL_MASK, - FG_ADC_RR_ADC_LOG_CLR_CTRL); + rc = rradc_read_channel_with_continuous_mode(chip, prop, buf); if (rc < 0) { - pr_err("log ctrl update to clear failed:%d\n", rc); - goto fail; - } - - rc = rradc_masked_write(chip, FG_ADC_RR_ADC_LOG, - FG_ADC_RR_ADC_LOG_CLR_CTRL_MASK, 0); - if (rc < 0) { - pr_err("log ctrl update to not clear failed:%d\n", rc); - goto fail; - } - - /* Switch to continuous mode */ - rc = rradc_masked_write(chip, FG_ADC_RR_RR_ADC_CTL, - FG_ADC_RR_ADC_CTL_CONTINUOUS_SEL_MASK, - FG_ADC_RR_ADC_CTL_CONTINUOUS_SEL); - if (rc < 0) { - pr_err("Update to continuous mode failed:%d\n", rc); - goto fail; - } - - status = rradc_chans[prop->channel].sts; - rc = rradc_read(chip, status, buf, 1); - if (rc < 0) { - pr_err("status read failed:%d\n", rc); - goto fail; - } - - buf[0] &= 
FG_RR_ADC_STS_CHANNEL_READING_MASK; - while ((buf[0] != FG_RR_ADC_STS_CHANNEL_READING_MASK) && - (retry < 2)) { - pr_debug("%s is not ready; nothing to read\n", - rradc_chans[prop->channel].datasheet_name); - usleep_range(FG_RR_CONV_CONTINUOUS_TIME_MIN, - FG_RR_CONV_CONTINUOUS_TIME_MAX); - retry++; - rc = rradc_read(chip, status, buf, 1); - if (rc < 0) { - pr_err("status read failed:%d\n", rc); - goto fail; - } - } - - /* Switch to non continuous mode */ - rc = rradc_masked_write(chip, FG_ADC_RR_RR_ADC_CTL, - FG_ADC_RR_ADC_CTL_CONTINUOUS_SEL_MASK, 0); - if (rc < 0) { - pr_err("Update to continuous mode failed:%d\n", rc); + pr_err("Error reading in continuous mode:%d\n", rc); goto fail; } @@ -675,11 +745,29 @@ static int rradc_do_conversion(struct rradc_chip *chip, pr_err("Restore every cycle update failed:%d\n", rc); goto fail; } + break; + case RR_ADC_CHG_HOT_TEMP: + case RR_ADC_CHG_TOO_HOT_TEMP: + case RR_ADC_SKIN_HOT_TEMP: + case RR_ADC_SKIN_TOO_HOT_TEMP: + pr_debug("Read only the data registers\n"); + break; + default: + status = rradc_chans[prop->channel].sts; + rc = rradc_read(chip, status, buf, 1); + if (rc < 0) { + pr_err("status read failed:%d\n", rc); + goto fail; + } - if (retry >= 2) { + rc = rradc_check_status_ready_with_retry(chip, prop, + buf, status); + if (rc < 0) { + pr_debug("Status read failed:%d\n", rc); rc = -ENODATA; goto fail; } + break; } offset = rradc_chans[prop->channel].lsb; From e74a2a20ffbd11e09b4960e596804f725d07e11f Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Fri, 16 Dec 2016 17:25:07 -0800 Subject: [PATCH 799/813] usb: gadget: composite: enable BESL support According to USB 2.0 ECN Errata for Link Power Management (USB2-LPM-Errata-final.pdf), BESL must be enabled if LPM is enabled. This helps with USB30CV TD 9.21 LPM L1 Suspend Resume Test. Change-Id: I3412d36959d0b6b49d369cb1695836af82d7b5b4 Signed-off-by: Felipe Balbi Git-commit: a6615937bcd9234e6d6bb817c3701fce44d0a84d Git-repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git Signed-off-by: Hemant Kumar --- drivers/usb/gadget/composite.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index a480b0a9a238..9866c9fef1f1 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -675,7 +675,7 @@ static int bos_desc(struct usb_composite_dev *cdev) usb_ext->bLength = USB_DT_USB_EXT_CAP_SIZE; usb_ext->bDescriptorType = USB_DT_DEVICE_CAPABILITY; usb_ext->bDevCapabilityType = USB_CAP_TYPE_EXT; - usb_ext->bmAttributes = cpu_to_le32(USB_LPM_SUPPORT); + usb_ext->bmAttributes = cpu_to_le32(USB_LPM_SUPPORT | USB_BESL_SUPPORT); if (gadget_is_superspeed(cdev->gadget)) { /* From 9c19f509863a0d31a857aaf959b59096ea84d4c0 Mon Sep 17 00:00:00 2001 From: Taniya Das Date: Thu, 29 Sep 2016 13:28:59 +0530 Subject: [PATCH 800/813] clk: qcom: Add support for hardware control branch These are branches that have a hardware control bit to enable/disable the branch. They also need to support set rate; before setting any rate, we have to make sure both the current parent and the next parent are prepared and enabled before the RCG is updated. To keep both parents prepared/enabled, use the FORCE_ENABLE_RCGR flag.
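The requirement amounts to a make-before-break handoff; a simplified sketch (the helper name and callback are illustrative; the real code below also has to mirror the branch's existing prepare/enable counts):

    /* Keep both parents running while the RCG configuration switches */
    static int switch_rcg_source(struct clk *curr, struct clk *new_parent,
                                 int (*configure_rcg)(void))
    {
            int rc;

            rc = clk_prepare_enable(curr);
            if (rc)
                    return rc;

            rc = clk_prepare_enable(new_parent);
            if (rc)
                    goto out_curr;

            rc = configure_rcg();   /* both sources on: no glitch */

            clk_disable_unprepare(new_parent);
    out_curr:
            clk_disable_unprepare(curr);
            return rc;
    }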
Change-Id: I14abed3827de8cefc31f3deb3c1e589136c32b8d Signed-off-by: Taniya Das --- drivers/clk/qcom/clk-branch.c | 67 ++++++++++++ drivers/clk/qcom/clk-branch.h | 1 + drivers/clk/qcom/clk-rcg.h | 4 +- drivers/clk/qcom/clk-rcg2.c | 188 +++++++++++++++++++++++++++++----- 4 files changed, 234 insertions(+), 26 deletions(-) diff --git a/drivers/clk/qcom/clk-branch.c b/drivers/clk/qcom/clk-branch.c index cffaf46d732f..096e16db02fe 100644 --- a/drivers/clk/qcom/clk-branch.c +++ b/drivers/clk/qcom/clk-branch.c @@ -248,6 +248,73 @@ const struct clk_ops clk_branch2_ops = { }; EXPORT_SYMBOL_GPL(clk_branch2_ops); +static int clk_branch2_hw_ctl_set_rate(struct clk_hw *hw, unsigned long rate, + unsigned long parent_rate) +{ + /* + * Make sure the branch clock has CLK_SET_RATE_PARENT flag, + * and the RCG has FORCE_ENABLE_RCGR flag set. + */ + if (!(hw->init->flags & CLK_SET_RATE_PARENT)) { + pr_err("set rate would not get propagated to parent\n"); + return -EINVAL; + } + + return 0; +} + +static unsigned long clk_branch2_hw_ctl_recalc_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + return parent_rate; +} + +static int clk_branch2_hw_ctl_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) +{ + struct clk_hw *clkp; + + clkp = __clk_get_hw(clk_get_parent(hw->clk)); + + req->best_parent_hw = clkp; + req->best_parent_rate = clk_round_rate(clkp->clk, req->rate); + + return 0; +} + +static int clk_branch2_hw_ctl_enable(struct clk_hw *hw) +{ + struct clk_hw *parent = __clk_get_hw(clk_get_parent(hw->clk)); + + /* The parent branch clock should have been prepared prior to this. */ + if (!parent || (parent && !clk_hw_is_prepared(parent))) + return -EINVAL; + + return clk_enable_regmap(hw); +} + +static void clk_branch2_hw_ctl_disable(struct clk_hw *hw) +{ + struct clk_hw *parent = __clk_get_hw(clk_get_parent(hw->clk)); + + if (!parent) + return; + + clk_disable_regmap(hw); +} + +const struct clk_ops clk_branch2_hw_ctl_ops = { + .enable = clk_branch2_hw_ctl_enable, + .disable = clk_branch2_hw_ctl_disable, + .is_enabled = clk_is_enabled_regmap, + .set_rate = clk_branch2_hw_ctl_set_rate, + .recalc_rate = clk_branch2_hw_ctl_recalc_rate, + .determine_rate = clk_branch2_hw_ctl_determine_rate, + .set_flags = clk_branch_set_flags, + .list_registers = clk_branch2_list_registers, +}; +EXPORT_SYMBOL_GPL(clk_branch2_hw_ctl_ops); + static int clk_gate_toggle(struct clk_hw *hw, bool en) { struct clk_gate2 *gt = to_clk_gate2(hw); diff --git a/drivers/clk/qcom/clk-branch.h b/drivers/clk/qcom/clk-branch.h index 8a934cf8bed1..b67ac1dfbbf9 100644 --- a/drivers/clk/qcom/clk-branch.h +++ b/drivers/clk/qcom/clk-branch.h @@ -62,6 +62,7 @@ extern const struct clk_ops clk_branch_ops; extern const struct clk_ops clk_branch2_ops; extern const struct clk_ops clk_gate2_ops; extern const struct clk_ops clk_branch_simple_ops; +extern const struct clk_ops clk_branch2_hw_ctl_ops; #define to_clk_branch(_hw) \ container_of(to_clk_regmap(_hw), struct clk_branch, clkr) diff --git a/drivers/clk/qcom/clk-rcg.h b/drivers/clk/qcom/clk-rcg.h index 020bd351bbd8..accdac9fb964 100644 --- a/drivers/clk/qcom/clk-rcg.h +++ b/drivers/clk/qcom/clk-rcg.h @@ -171,10 +171,12 @@ struct clk_rcg2 { const struct parent_map *parent_map; const struct freq_tbl *freq_tbl; unsigned long current_freq; + u32 new_index; + u32 curr_index; struct clk_regmap clkr; -#define FORCE_ENABLE_RCGR BIT(0) u8 flags; +#define FORCE_ENABLE_RCGR BIT(0) }; #define to_clk_rcg2(_hw) container_of(to_clk_regmap(_hw), struct clk_rcg2, clkr) diff --git 
a/drivers/clk/qcom/clk-rcg2.c b/drivers/clk/qcom/clk-rcg2.c index 4104a238c088..653722f9c4b0 100644 --- a/drivers/clk/qcom/clk-rcg2.c +++ b/drivers/clk/qcom/clk-rcg2.c @@ -89,30 +89,6 @@ static int clk_rcg_set_force_enable(struct clk_hw *hw) return ret; } -static int clk_rcg2_enable(struct clk_hw *hw) -{ - int ret = 0; - struct clk_rcg2 *rcg = to_clk_rcg2(hw); - - if (rcg->flags & FORCE_ENABLE_RCGR) - ret = clk_rcg_set_force_enable(hw); - - return ret; -} - -static void clk_rcg2_disable(struct clk_hw *hw) -{ - struct clk_rcg2 *rcg = to_clk_rcg2(hw); - - if (rcg->flags & FORCE_ENABLE_RCGR) { - /* force disable RCG - clear CMD_ROOT_EN bit */ - regmap_update_bits(rcg->clkr.regmap, - rcg->cmd_rcgr + CMD_REG, CMD_ROOT_EN, 0); - /* Add a delay to disable the RCG */ - udelay(100); - } -} - static u8 clk_rcg2_get_parent(struct clk_hw *hw) { struct clk_rcg2 *rcg = to_clk_rcg2(hw); @@ -381,16 +357,178 @@ static long clk_rcg2_list_rate(struct clk_hw *hw, unsigned n, return (rcg->freq_tbl + n)->freq; } +static int prepare_enable_rcg_srcs(struct clk_hw *hw, struct clk *curr, + struct clk *new) +{ + int rc = 0; + + rc = clk_prepare(curr); + if (rc) + return rc; + + if (clk_hw_is_prepared(hw)) { + rc = clk_prepare(new); + if (rc) + goto err_new_src_prepare; + } + + rc = clk_prepare(new); + if (rc) + goto err_new_src_prepare2; + + rc = clk_enable(curr); + if (rc) + goto err_curr_src_enable; + + if (__clk_get_enable_count(hw->clk)) { + rc = clk_enable(new); + if (rc) + goto err_new_src_enable; + } + + rc = clk_enable(new); + if (rc) + goto err_new_src_enable2; + + return rc; + +err_new_src_enable2: + if (__clk_get_enable_count(hw->clk)) + clk_disable(new); +err_new_src_enable: + clk_disable(curr); +err_curr_src_enable: + clk_unprepare(new); +err_new_src_prepare2: + if (clk_hw_is_prepared(hw)) + clk_unprepare(new); +err_new_src_prepare: + clk_unprepare(curr); + + return rc; +} + +static void disable_unprepare_rcg_srcs(struct clk_hw *hw, struct clk *curr, + struct clk *new) +{ + clk_disable(new); + + clk_disable(curr); + + if (__clk_get_enable_count(hw->clk)) + clk_disable(new); + + clk_unprepare(new); + clk_unprepare(curr); + + if (clk_hw_is_prepared(hw)) + clk_unprepare(new); +} + +static struct freq_tbl cxo_f = { + .freq = 19200000, + .src = 0, + .pre_div = 1, + .m = 0, + .n = 0, +}; + +static int clk_enable_disable_prepare_unprepare(struct clk_hw *hw, int cindex, + int nindex, bool enable) +{ + struct clk_hw *new_p, *curr_p; + + curr_p = clk_hw_get_parent_by_index(hw, cindex); + new_p = clk_hw_get_parent_by_index(hw, nindex); + + if (enable) + return prepare_enable_rcg_srcs(hw, curr_p->clk, new_p->clk); + + disable_unprepare_rcg_srcs(hw, curr_p->clk, new_p->clk); + return 0; +} + +static int clk_rcg2_enable(struct clk_hw *hw) +{ + int ret = 0; + const struct freq_tbl *f; + struct clk_rcg2 *rcg = to_clk_rcg2(hw); + + if (rcg->flags & FORCE_ENABLE_RCGR) { + if (!rcg->current_freq) + rcg->current_freq = cxo_f.freq; + + if (rcg->current_freq == cxo_f.freq) + rcg->curr_index = 0; + else { + f = qcom_find_freq(rcg->freq_tbl, rcg->current_freq); + rcg->curr_index = qcom_find_src_index(hw, + rcg->parent_map, f->src); + } + + ret = clk_enable_disable_prepare_unprepare(hw, rcg->curr_index, + rcg->new_index, true); + if (ret) { + pr_err("Failed to prepare_enable new and current sources\n"); + return ret; + } + + clk_rcg_set_force_enable(hw); + + clk_enable_disable_prepare_unprepare(hw, rcg->curr_index, + rcg->new_index, false); + } + + return ret; +} + +static void clk_rcg2_disable(struct clk_hw *hw) +{ + struct 
clk_rcg2 *rcg = to_clk_rcg2(hw); + + if (rcg->flags & FORCE_ENABLE_RCGR) { + /* force disable RCG - clear CMD_ROOT_EN bit */ + regmap_update_bits(rcg->clkr.regmap, + rcg->cmd_rcgr + CMD_REG, CMD_ROOT_EN, 0); + /* Add a delay to disable the RCG */ + udelay(100); + } +} + + static int __clk_rcg2_set_rate(struct clk_hw *hw, unsigned long rate) { struct clk_rcg2 *rcg = to_clk_rcg2(hw); const struct freq_tbl *f; + int ret = 0; + + /* Current frequency */ + if (rcg->flags & FORCE_ENABLE_RCGR) + rcg->current_freq = clk_get_rate(hw->clk); f = qcom_find_freq(rcg->freq_tbl, rate); if (!f) return -EINVAL; - return clk_rcg2_configure(rcg, f); + /* New parent index */ + if (rcg->flags & FORCE_ENABLE_RCGR) { + rcg->new_index = qcom_find_src_index(hw, + rcg->parent_map, f->src); + ret = clk_rcg2_enable(hw); + if (ret) { + pr_err("Failed to enable rcg\n"); + return ret; + } + } + + ret = clk_rcg2_configure(rcg, f); + if (ret) + return ret; + + if (rcg->flags & FORCE_ENABLE_RCGR) + clk_rcg2_disable(hw); + + return ret; } static int clk_rcg2_set_rate(struct clk_hw *hw, unsigned long rate, From 61c423799fb52bbc83070303000e341b587d1ce7 Mon Sep 17 00:00:00 2001 From: Suyog Sarda Date: Tue, 7 Jun 2016 21:15:42 +0530 Subject: [PATCH 801/813] lowmemorykiller: Introduce sysfs node for ALMK and PPR adj threshold The grouping of tasks based on oom_score_adj values change from one framework to another. This requires corresponding changes in the threshold values set for almk and per process reclaim. Introduce sysfs nodes to set threshold adj for process reclaim and adaptive LMK dynamically. Change-Id: Ib7565bfd5d2e93aa4ff8fdd20414cac0a0f38bf7 Signed-off-by: Suyog Sarda Signed-off-by: Vinayak Menon --- drivers/staging/android/lowmemorykiller.c | 2 ++ mm/process_reclaim.c | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index a76a7ff618b9..68a4559f9d26 100644 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c @@ -98,6 +98,8 @@ static unsigned long lowmem_count(struct shrinker *s, static atomic_t shift_adj = ATOMIC_INIT(0); static short adj_max_shift = 353; +module_param_named(adj_max_shift, adj_max_shift, short, + S_IRUGO | S_IWUSR); /* User knob to enable/disable adaptive lmk feature */ static int enable_adaptive_lmk; diff --git a/mm/process_reclaim.c b/mm/process_reclaim.c index 8cf5f13548e8..98e5af190fe0 100644 --- a/mm/process_reclaim.c +++ b/mm/process_reclaim.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, The Linux Foundation. All rights reserved. + * Copyright (c) 2015-2016, The Linux Foundation. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -48,6 +48,10 @@ static unsigned long pressure_max = 90; module_param_named(pressure_min, pressure_min, ulong, S_IRUGO | S_IWUSR); module_param_named(pressure_max, pressure_max, ulong, S_IRUGO | S_IWUSR); +static short min_score_adj = 360; +module_param_named(min_score_adj, min_score_adj, short, + S_IRUGO | S_IWUSR); + /* * Scheduling process reclaim workqueue unecessarily * when the reclaim efficiency is low does not make @@ -114,7 +118,6 @@ static void swap_fn(struct work_struct *work) int i; int tasksize; int total_sz = 0; - short min_score_adj = 360; int total_scan = 0; int total_reclaimed = 0; int nr_to_reclaim; From d6b27f7bddb4ec8ac0f3462dea5eaf3162867bdc Mon Sep 17 00:00:00 2001 From: "Raju P.L.S.S.S.N" Date: Thu, 15 Dec 2016 19:30:27 +0530 Subject: [PATCH 802/813] ARM: dts: msm: Add lpass pmu irq to mpm wakeup list for msmfalcon lpass_pmu_tmr_timeout_irq_cx 519 is triggered if the ADSP is unable to wake up from island retention due to a power management failure. Add it to the MPM wakeup list to handle this scenario. Change-Id: I71bd87330f64322f54693a4c412f0af69cd3c704 Signed-off-by: Raju P.L.S.S.S.N --- arch/arm/boot/dts/qcom/msmfalcon-pm.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/qcom/msmfalcon-pm.dtsi b/arch/arm/boot/dts/qcom/msmfalcon-pm.dtsi index 39c766613b30..139c23fac955 100644 --- a/arch/arm/boot/dts/qcom/msmfalcon-pm.dtsi +++ b/arch/arm/boot/dts/qcom/msmfalcon-pm.dtsi @@ -344,6 +344,7 @@ <0x34 275>, /* qmp_usb3_lfps_rxterm_irq_cx */ <0x4f 379>, /* qusb2phy_intr */ <0x57 358>, /* ee0_apps_hlos_spmi_periph_irq */ + <0x5b 519>, /* lpass_pmu_tmr_timeout_irq_cx */ <0xff 16>, /* APC[0-7]_qgicQTmrHypPhysIrptReq */ <0xff 17>, /* APC[0-7]_qgicQTmrSecPhysIrptReq */ <0xff 18>, /* APC[0-7]_qgicQTmrNonSecPhysIrptReq */ @@ -708,7 +709,6 @@ <0xff 515>, /* turing_irq_out_vmm[3] */ <0xff 516>, /* lpass_irq_out_apcs[41] */ <0xff 517>, /* lpass_irq_out_apcs[42] */ - <0xff 519>, /* lpass_irq_out_apcs[44] */ <0xff 520>, /* lpass_irq_out_apcs[45] */ <0xff 544>, /* turing_irq_out_apcs[00] */ <0xff 545>, /* turing_irq_out_apcs[01] */ From 601e0dc02f4cf8ce8465d5a15d4e50eda12e74c5 Mon Sep 17 00:00:00 2001 From: Pengfei Liu Date: Fri, 9 Dec 2016 14:02:57 +0800 Subject: [PATCH 803/813] ARM: dts: msm: Add dual camera support for qrd8998 skuk board Add dual camera related modifications for the qrd8998 skuk board. The modifications include the rear/aux camera device nodes and the corresponding eeprom/actuator/ois device nodes.
Change-Id: I21110b930cf5a74a656e40f2f3210cc57ae91b95 Signed-off-by: Pengfei Liu --- .../dts/qcom/msm8998-camera-sensor-skuk.dtsi | 204 ++++++++++++++---- 1 file changed, 157 insertions(+), 47 deletions(-) diff --git a/arch/arm/boot/dts/qcom/msm8998-camera-sensor-skuk.dtsi b/arch/arm/boot/dts/qcom/msm8998-camera-sensor-skuk.dtsi index 36441f9aa15a..69b0286dba09 100644 --- a/arch/arm/boot/dts/qcom/msm8998-camera-sensor-skuk.dtsi +++ b/arch/arm/boot/dts/qcom/msm8998-camera-sensor-skuk.dtsi @@ -29,6 +29,89 @@ }; }; +&tlmm{ + cam_sensor_front_active: cam_sensor_front_active { + /* RESET */ + mux { + pins = "gpio9"; + function = "gpio"; + }; + + config { + pins = "gpio9"; + bias-disable; /* No PULL */ + drive-strength = <2>; /* 2 MA */ + }; + }; + + cam_sensor_front_suspend: cam_sensor_front_suspend { + /* RESET */ + mux { + pins = "gpio9"; + function = "gpio"; + }; + + config { + pins = "gpio9"; + bias-disable; /* No PULL */ + drive-strength = <2>; /* 2 MA */ + }; + }; + + cam_sensor_rear2_active: cam_sensor_rear2_active { + /* RESET, STANDBY */ + mux { + pins = "gpio28","gpio27"; + function = "gpio"; + }; + + config { + pins = "gpio28","gpio27"; + bias-disable; /* No PULL */ + drive-strength = <2>; /* 2 MA */ + }; + }; + + cam_sensor_rear2_suspend: cam_sensor_rear2_suspend { + /* RESET, STANDBY */ + mux { + pins = "gpio28","gpio27"; + function = "gpio"; + }; + config { + pins = "gpio28","gpio27"; + bias-disable; /* No PULL */ + drive-strength = <2>; /* 2 MA */ + }; + }; + + cam_sensor_rear_active: cam_sensor_rear_active { + /* RESET, STANDBY */ + mux { + pins = "gpio30","gpio29"; + function = "gpio"; + }; + config { + pins = "gpio30","gpio29"; + bias-disable; /* No PULL */ + drive-strength = <2>; /* 2 MA */ + }; + }; + + cam_sensor_rear_suspend: cam_sensor_rear_suspend { + /* RESET, STANDBY */ + mux { + pins = "gpio30","gpio29"; + function = "gpio"; + }; + config { + pins = "gpio30","gpio29"; + bias-disable; /* No PULL */ + drive-strength = <2>; /* 2 MA */ + }; + }; +}; + &cci { actuator0: qcom,actuator@0 { cell-index = <0>; @@ -50,7 +133,7 @@ reg = <0x1>; compatible = "qcom,actuator"; qcom,cci-master = <0>; - gpios = <&tlmm 29 0>; + gpios = <&tlmm 27 0>; qcom,gpio-vaf = <0>; qcom,gpio-req-tbl-num = <0>; qcom,gpio-req-tbl-flags = <0>; @@ -60,6 +143,22 @@ pinctrl-1 = <&cam_actuator_vaf_suspend>; }; + ois0: qcom,ois@0 { + cell-index = <0>; + reg = <0x0>; + compatible = "qcom,ois"; + qcom,cci-master = <0>; + gpios = <&tlmm 27 0>; + qcom,gpio-vaf = <0>; + qcom,gpio-req-tbl-num = <0>; + qcom,gpio-req-tbl-flags = <0>; + qcom,gpio-req-tbl-label = "CAM_VAF"; + pinctrl-names = "cam_default", "cam_suspend"; + pinctrl-0 = <&cam_actuator_vaf_active>; + pinctrl-1 = <&cam_actuator_vaf_suspend>; + status = "disabled"; + }; + eeprom0: qcom,eeprom@0 { cell-index = <0>; reg = <0>; @@ -104,35 +203,38 @@ cell-index = <1>; reg = <0x1>; compatible = "qcom,eeprom"; - cam_vdig-supply = <&pm8998_lvs1>; cam_vio-supply = <&pm8998_lvs1>; cam_vana-supply = <&pmi8998_bob>; - qcom,cam-vreg-name = "cam_vdig", "cam_vio", "cam_vana"; - qcom,cam-vreg-min-voltage = <0 0 3312000>; - qcom,cam-vreg-max-voltage = <0 0 3600000>; - qcom,cam-vreg-op-mode = <0 0 80000>; + cam_vdig-supply = <&pm8998_s3>; + qcom,cam-vreg-name = "cam_vio", "cam_vana", "cam_vdig"; + qcom,cam-vreg-min-voltage = <0 3312000 1352000>; + qcom,cam-vreg-max-voltage = <0 3600000 1352000>; + qcom,cam-vreg-op-mode = <0 80000 105000>; qcom,gpio-no-mux = <0>; pinctrl-names = "cam_default", "cam_suspend"; - pinctrl-0 = <&cam_sensor_mclk2_active - 
&cam_sensor_rear2_active>; - pinctrl-1 = <&cam_sensor_mclk2_suspend - &cam_sensor_rear2_suspend>; - gpios = <&tlmm 15 0>, - <&tlmm 9 0>, - <&tlmm 8 0>; + pinctrl-0 = <&cam_sensor_mclk1_active + &cam_sensor_rear2_active>; + pinctrl-1 = <&cam_sensor_mclk1_suspend + &cam_sensor_rear2_suspend>; + gpios = <&tlmm 14 0>, + <&tlmm 28 0>, + <&pm8998_gpios 20 0>, + <&tlmm 29 0>; qcom,gpio-reset = <1>; - qcom,gpio-vana = <2>; - qcom,gpio-req-tbl-num = <0 1 2>; - qcom,gpio-req-tbl-flags = <1 0 0>; + qcom,gpio-vdig = <2>; + qcom,gpio-vana = <3>; + qcom,gpio-req-tbl-num = <0 1 2 3>; + qcom,gpio-req-tbl-flags = <1 0 0 0>; qcom,gpio-req-tbl-label = "CAMIF_MCLK1", "CAM_RESET1", - "CAM_VANA1"; + "CAM_VDIG", + "CAM_VANA"; qcom,sensor-position = <0>; qcom,sensor-mode = <0>; - qcom,cci-master = <1>; + qcom,cci-master = <0>; status = "ok"; - clocks = <&clock_mmss clk_mclk2_clk_src>, - <&clock_mmss clk_mmss_camss_mclk2_clk>; + clocks = <&clock_mmss clk_mclk1_clk_src>, + <&clock_mmss clk_mmss_camss_mclk1_clk>; clock-names = "cam_src_clk", "cam_clk"; qcom,clock-rates = <24000000 0>; }; @@ -152,12 +254,12 @@ qcom,cam-vreg-op-mode = <0 80000 105000>; qcom,gpio-no-mux = <0>; pinctrl-names = "cam_default", "cam_suspend"; - pinctrl-0 = <&cam_sensor_mclk1_active + pinctrl-0 = <&cam_sensor_mclk2_active &cam_sensor_front_active>; - pinctrl-1 = <&cam_sensor_mclk1_suspend + pinctrl-1 = <&cam_sensor_mclk2_suspend &cam_sensor_front_suspend>; - gpios = <&tlmm 14 0>, - <&tlmm 28 0>, + gpios = <&tlmm 15 0>, + <&tlmm 9 0>, <&pm8998_gpios 9 0>; qcom,gpio-reset = <1>; qcom,gpio-vdig = <2>; @@ -170,8 +272,8 @@ qcom,sensor-mode = <0>; qcom,cci-master = <1>; status = "ok"; - clocks = <&clock_mmss clk_mclk1_clk_src>, - <&clock_mmss clk_mmss_camss_mclk1_clk>; + clocks = <&clock_mmss clk_mclk2_clk_src>, + <&clock_mmss clk_mmss_camss_mclk2_clk>; clock-names = "cam_src_clk", "cam_clk"; qcom,clock-rates = <24000000 0>; }; @@ -180,7 +282,8 @@ cell-index = <0>; compatible = "qcom,camera"; reg = <0x0>; - qcom,special-support-sensors = "imx362_gt24c64a"; + qcom,special-support-sensors = "imx362_gt24c64a", + "s5k3m3sm", "s5k2l7sx"; qcom,csiphy-sd-index = <0>; qcom,csid-sd-index = <0>; qcom,mount-angle = <270>; @@ -230,36 +333,42 @@ qcom,csiphy-sd-index = <1>; qcom,csid-sd-index = <1>; qcom,mount-angle = <90>; + qcom,led-flash-src = <&led_flash0>; + qcom,actuator-src = <&actuator1>; qcom,eeprom-src = <&eeprom1>; - cam_vdig-supply = <&pm8998_lvs1>; + qcom,ois-src = <&ois0>; cam_vio-supply = <&pm8998_lvs1>; cam_vana-supply = <&pmi8998_bob>; - qcom,cam-vreg-name = "cam_vdig", "cam_vio", "cam_vana"; - qcom,cam-vreg-min-voltage = <0 0 3312000>; - qcom,cam-vreg-max-voltage = <0 0 3600000>; - qcom,cam-vreg-op-mode = <0 0 80000>; + cam_vdig-supply = <&pm8998_s3>; + qcom,cam-vreg-name = "cam_vio", "cam_vana", "cam_vdig"; + qcom,cam-vreg-min-voltage = <0 3312000 1352000>; + qcom,cam-vreg-max-voltage = <0 3600000 1352000>; + qcom,cam-vreg-op-mode = <0 80000 105000>; qcom,gpio-no-mux = <0>; pinctrl-names = "cam_default", "cam_suspend"; - pinctrl-0 = <&cam_sensor_mclk2_active - &cam_sensor_rear2_active>; - pinctrl-1 = <&cam_sensor_mclk2_suspend - &cam_sensor_rear2_suspend>; - gpios = <&tlmm 15 0>, - <&tlmm 9 0>, - <&tlmm 8 0>; + pinctrl-0 = <&cam_sensor_mclk1_active + &cam_sensor_rear2_active>; + pinctrl-1 = <&cam_sensor_mclk1_suspend + &cam_sensor_rear2_suspend>; + gpios = <&tlmm 14 0>, + <&tlmm 28 0>, + <&pm8998_gpios 20 0>, + <&tlmm 29 0>; qcom,gpio-reset = <1>; - qcom,gpio-vana = <2>; - qcom,gpio-req-tbl-num = <0 1 2>; - qcom,gpio-req-tbl-flags = <1 0 0>; 
+ qcom,gpio-vdig = <2>; + qcom,gpio-vana = <3>; + qcom,gpio-req-tbl-num = <0 1 2 3>; + qcom,gpio-req-tbl-flags = <1 0 0 0>; qcom,gpio-req-tbl-label = "CAMIF_MCLK1", "CAM_RESET1", - "CAM_VANA1"; + "CAM_VDIG", + "CAM_VANA"; qcom,sensor-position = <0>; qcom,sensor-mode = <0>; qcom,cci-master = <0>; status = "ok"; - clocks = <&clock_mmss clk_mclk2_clk_src>, - <&clock_mmss clk_mmss_camss_mclk2_clk>; + clocks = <&clock_mmss clk_mclk1_clk_src>, + <&clock_mmss clk_mmss_camss_mclk1_clk>; clock-names = "cam_src_clk", "cam_clk"; qcom,clock-rates = <24000000 0>; }; @@ -271,6 +380,7 @@ qcom,csiphy-sd-index = <2>; qcom,csid-sd-index = <2>; qcom,mount-angle = <90>; + qcom,eeprom-src = <&eeprom2>; cam_vio-supply = <&pm8998_lvs1>; cam_vana-supply = <&pm8998_l22>; cam_vdig-supply = <&pm8998_s3>; @@ -283,9 +393,9 @@ qcom,gpio-no-mux = <0>; pinctrl-names = "cam_default", "cam_suspend"; pinctrl-0 = <&cam_sensor_mclk2_active - &cam_sensor_rear2_active>; + &cam_sensor_front_active>; pinctrl-1 = <&cam_sensor_mclk2_suspend - &cam_sensor_rear2_suspend>; + &cam_sensor_front_suspend>; gpios = <&tlmm 15 0>, <&tlmm 9 0>, <&pm8998_gpios 9 0>; From ce1f9f6dabfb6f6566ac42211033e5f1632f61ba Mon Sep 17 00:00:00 2001 From: cyizhao Date: Tue, 13 Dec 2016 17:55:50 +0800 Subject: [PATCH 804/813] ARM: dts: msm: Enable volume up key for 8998 QRD interposer GPIO7 is used for volume up key on 8998 QRD interposer device, configure it to input and pull-up and add gpio-keys device to enable the key detection. CRs-Fixed: 1098142 Change-Id: I45ec6a02fc6f74ee871455c5662f62f1cfa7bc74 Signed-off-by: cyizhao --- .../msm8998-v2.1-interposer-msmfalcon-qrd.dts | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/arch/arm/boot/dts/qcom/msm8998-v2.1-interposer-msmfalcon-qrd.dts b/arch/arm/boot/dts/qcom/msm8998-v2.1-interposer-msmfalcon-qrd.dts index 013c849c4936..bed8d6d20221 100644 --- a/arch/arm/boot/dts/qcom/msm8998-v2.1-interposer-msmfalcon-qrd.dts +++ b/arch/arm/boot/dts/qcom/msm8998-v2.1-interposer-msmfalcon-qrd.dts @@ -113,3 +113,32 @@ qcom,vdd-voltage-level = <0 925000 925000>; core-supply = <&pmfalcon_l1>; }; + +&pm2falcon_gpios { + /* GPIO 7 for VOL_UP */ + gpio@c600 { + status = "ok"; + qcom,mode = <0>; + qcom,pull = <0>; + qcom,vin-sel = <0>; + qcom,src-sel = <0>; + qcom,out-strength = <1>; + }; +}; + +&soc { + gpio_keys { + compatible = "gpio-keys"; + input-name = "gpio-keys"; + status = "ok"; + + vol_up { + label = "volume_up"; + gpios = <&pm2falcon_gpios 7 0x1>; + linux,input-type = <1>; + linux,code = <115>; + gpio-key,wakeup; + debounce-interval = <15>; + }; + }; +}; From 1dd5515dbb31333c1ffa41c1a7cd613211d530f4 Mon Sep 17 00:00:00 2001 From: cyizhao Date: Wed, 30 Nov 2016 18:06:32 +0800 Subject: [PATCH 805/813] ARM: dts: msm: Delete RGB led trigger properties in QRD 8998 SKUK Delete RGB led trigger properties to remove the led control in kernel space. 
CRs-Fixed: 1094158 Change-Id: I8b28ece4bbfe76e3d9b003e14c6600b856c30043 Signed-off-by: cyizhao --- arch/arm/boot/dts/qcom/msm8998-qrd-skuk.dtsi | 12 ++++++++++++ arch/arm/boot/dts/qcom/msm8998-qrd-vr1.dtsi | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/arch/arm/boot/dts/qcom/msm8998-qrd-skuk.dtsi b/arch/arm/boot/dts/qcom/msm8998-qrd-skuk.dtsi index dcd84e79ba1b..41a727fe2c88 100644 --- a/arch/arm/boot/dts/qcom/msm8998-qrd-skuk.dtsi +++ b/arch/arm/boot/dts/qcom/msm8998-qrd-skuk.dtsi @@ -389,3 +389,15 @@ qcom,thermal-node; }; }; + +&red_led { + /delete-property/ linux,default-trigger; +}; + +&green_led { + /delete-property/ linux,default-trigger; +}; + +&blue_led { + /delete-property/ linux,default-trigger; +}; diff --git a/arch/arm/boot/dts/qcom/msm8998-qrd-vr1.dtsi b/arch/arm/boot/dts/qcom/msm8998-qrd-vr1.dtsi index 2d1616412caa..a4e0017a58dc 100644 --- a/arch/arm/boot/dts/qcom/msm8998-qrd-vr1.dtsi +++ b/arch/arm/boot/dts/qcom/msm8998-qrd-vr1.dtsi @@ -272,3 +272,15 @@ qcom,thermal-node; }; }; + +&red_led { + /delete-property/ linux,default-trigger; +}; + +&green_led { + /delete-property/ linux,default-trigger; +}; + +&blue_led { + /delete-property/ linux,default-trigger; +}; From 66a9333cd8bf4ae801c141088575c6641f2094f0 Mon Sep 17 00:00:00 2001 From: ansharma Date: Mon, 19 Dec 2016 11:49:01 +0530 Subject: [PATCH 806/813] ARM: dts: msm: Configure WLED for mdss on pm2falcon interposer Specify the number of strings in WLED properly to support brightness for mdss on msmfalcon interposer. CRs-Fixed: 1090076 Change-Id: If5dbc043f4708ffe6c474ce688ea86572c1b9ffd Signed-off-by: ansharma --- .../boot/dts/qcom/msm8998-v2.1-interposer-msmfalcon-cdp.dts | 4 ++++ .../boot/dts/qcom/msm8998-v2.1-interposer-msmfalcon-mtp.dts | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/arch/arm/boot/dts/qcom/msm8998-v2.1-interposer-msmfalcon-cdp.dts b/arch/arm/boot/dts/qcom/msm8998-v2.1-interposer-msmfalcon-cdp.dts index e88ee107e280..5ea248f6f2dc 100644 --- a/arch/arm/boot/dts/qcom/msm8998-v2.1-interposer-msmfalcon-cdp.dts +++ b/arch/arm/boot/dts/qcom/msm8998-v2.1-interposer-msmfalcon-cdp.dts @@ -149,3 +149,7 @@ qcom,platform-te-gpio = <&tlmm 10 0>; qcom,panel-mode-gpio = <&tlmm 91 0>; }; + +&pm2falcon_wled { + qcom,led-strings-list = [01 02]; +}; diff --git a/arch/arm/boot/dts/qcom/msm8998-v2.1-interposer-msmfalcon-mtp.dts b/arch/arm/boot/dts/qcom/msm8998-v2.1-interposer-msmfalcon-mtp.dts index ccc94307277d..7c0c53033a44 100644 --- a/arch/arm/boot/dts/qcom/msm8998-v2.1-interposer-msmfalcon-mtp.dts +++ b/arch/arm/boot/dts/qcom/msm8998-v2.1-interposer-msmfalcon-mtp.dts @@ -178,3 +178,7 @@ }; }; }; + +&pm2falcon_wled { + qcom,led-strings-list = [01 02]; +}; From 749935230e8baa591f559ee9e4d59cbdad2972a3 Mon Sep 17 00:00:00 2001 From: ansharma Date: Sun, 18 Dec 2016 00:02:52 +0530 Subject: [PATCH 807/813] ARM: dts: msm: Enable Gm control for pm2falcon Error amp Gm(LOOP_GM) adaptively changes with brightness. As brightness decreases, Gm increases. 
CRs-Fixed: 1102641 Change-Id: I3dde602e434971cca8ec0947198d1c7b441168cf Signed-off-by: ansharma --- arch/arm/boot/dts/qcom/msm-pm2falcon.dtsi | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/boot/dts/qcom/msm-pm2falcon.dtsi b/arch/arm/boot/dts/qcom/msm-pm2falcon.dtsi index 8ec542d953e2..c4d587abfb74 100644 --- a/arch/arm/boot/dts/qcom/msm-pm2falcon.dtsi +++ b/arch/arm/boot/dts/qcom/msm-pm2falcon.dtsi @@ -245,6 +245,7 @@ qcom,en-phase-stag; qcom,led-strings-list = [00 01 02]; qcom,en-ext-pfet-sc-pro; + qcom,loop-auto-gm-en; qcom,pmic-revid = <&pm2falcon_revid>; status = "ok"; }; From 4c79c86589315f2444c629ebf2fdf1e119e6c8d4 Mon Sep 17 00:00:00 2001 From: Maulik Shah Date: Thu, 3 Nov 2016 15:17:44 +0530 Subject: [PATCH 808/813] drivers: cpuidle: lpm-levels: Fix round off error in calculation Fix round off error in calculation of wake up time programmed. Current implementation adds extra delay in wake up time due to round off error in conversion of usec to SCLK. Use nsec instead. CRs-fixed: 1081884 Change-Id: Iecb3b06cde79c59c24a0e56ef05c41a5f9b6204e Signed-off-by: Maulik Shah --- drivers/cpuidle/lpm-levels.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpuidle/lpm-levels.c b/drivers/cpuidle/lpm-levels.c index de033cc37a15..81801605d6e7 100644 --- a/drivers/cpuidle/lpm-levels.c +++ b/drivers/cpuidle/lpm-levels.c @@ -1132,11 +1132,11 @@ static int cluster_configure(struct lpm_cluster *cluster, int idx, goto failed_set_mode; } - us = us + 1; + us = (us + 1) * 1000; clear_predict_history(); clear_cl_predict_history(); - do_div(us, USEC_PER_SEC/SCLK_HZ); + do_div(us, NSEC_PER_SEC/SCLK_HZ); msm_mpm_enter_sleep(us, from_idle, cpumask); } From ab98cfe02e6df5668387c1ce89e1c94af81749dd Mon Sep 17 00:00:00 2001 From: Sandeep Panda Date: Mon, 19 Dec 2016 12:39:52 +0530 Subject: [PATCH 809/813] defconfig: msmfalcon: enable compilation of MDSS PLL driver Add necessary configuration to enable compilation of MDSS PLL driver which is now based on common clock framework for msmfalcon platform. 
Change-Id: Iba01736475b852d8d64f944c371e585c94b85114 Signed-off-by: Sandeep Panda --- arch/arm/configs/msmfalcon-perf_defconfig | 1 + arch/arm/configs/msmfalcon_defconfig | 1 + arch/arm64/configs/msmfalcon-perf_defconfig | 1 + arch/arm64/configs/msmfalcon_defconfig | 1 + 4 files changed, 4 insertions(+) diff --git a/arch/arm/configs/msmfalcon-perf_defconfig b/arch/arm/configs/msmfalcon-perf_defconfig index 72c65b1cea66..9b5f64fe1c44 100644 --- a/arch/arm/configs/msmfalcon-perf_defconfig +++ b/arch/arm/configs/msmfalcon-perf_defconfig @@ -477,6 +477,7 @@ CONFIG_SEEMP_CORE=y CONFIG_USB_BAM=y CONFIG_QCOM_CLK_SMD_RPM=y CONFIG_MSM_GCC_FALCON=y +CONFIG_QCOM_MDSS_PLL=y CONFIG_REMOTE_SPINLOCK_MSM=y CONFIG_ARM_SMMU=y CONFIG_QCOM_COMMON_LOG=y diff --git a/arch/arm/configs/msmfalcon_defconfig b/arch/arm/configs/msmfalcon_defconfig index aa3ee39e616b..d7d1d2cad629 100644 --- a/arch/arm/configs/msmfalcon_defconfig +++ b/arch/arm/configs/msmfalcon_defconfig @@ -479,6 +479,7 @@ CONFIG_USB_BAM=y CONFIG_QCOM_CLK_SMD_RPM=y CONFIG_MSM_GPUCC_FALCON=y CONFIG_MSM_MMCC_FALCON=y +CONFIG_QCOM_MDSS_PLL=y CONFIG_REMOTE_SPINLOCK_MSM=y CONFIG_ARM_SMMU=y CONFIG_IOMMU_DEBUG=y diff --git a/arch/arm64/configs/msmfalcon-perf_defconfig b/arch/arm64/configs/msmfalcon-perf_defconfig index 10c988472268..27c484c4d0d7 100644 --- a/arch/arm64/configs/msmfalcon-perf_defconfig +++ b/arch/arm64/configs/msmfalcon-perf_defconfig @@ -499,6 +499,7 @@ CONFIG_USB_BAM=y CONFIG_QCOM_CLK_SMD_RPM=y CONFIG_MSM_GPUCC_FALCON=y CONFIG_MSM_MMCC_FALCON=y +CONFIG_QCOM_MDSS_PLL=y CONFIG_REMOTE_SPINLOCK_MSM=y CONFIG_IOMMU_IO_PGTABLE_FAST=y CONFIG_ARM_SMMU=y diff --git a/arch/arm64/configs/msmfalcon_defconfig b/arch/arm64/configs/msmfalcon_defconfig index 4d641012da4e..2f9d037ff81d 100644 --- a/arch/arm64/configs/msmfalcon_defconfig +++ b/arch/arm64/configs/msmfalcon_defconfig @@ -508,6 +508,7 @@ CONFIG_USB_BAM=y CONFIG_QCOM_CLK_SMD_RPM=y CONFIG_MSM_GPUCC_FALCON=y CONFIG_MSM_MMCC_FALCON=y +CONFIG_QCOM_MDSS_PLL=y CONFIG_REMOTE_SPINLOCK_MSM=y CONFIG_IOMMU_IO_PGTABLE_FAST=y CONFIG_IOMMU_IO_PGTABLE_FAST_SELFTEST=y From 6e76078030643334b76145a5ed900c3f8fe3dc3d Mon Sep 17 00:00:00 2001 From: Ananda Kishore Date: Wed, 14 Dec 2016 17:05:48 +0530 Subject: [PATCH 810/813] sensors: ssc: update device tree documentation for ssc based sensors Update documentation for ssc based sensors device tree to make firmware name optional. Firmware is optional if sensors driver is sharing processor. Change-Id: Ia417f6fa47d7570a6560b1409fe61c0d22ad6aac Signed-off-by: Ananda Kishore --- Documentation/devicetree/bindings/qdsp/msm-ssc-sensors.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/devicetree/bindings/qdsp/msm-ssc-sensors.txt b/Documentation/devicetree/bindings/qdsp/msm-ssc-sensors.txt index 2fb34fd16258..c4b69d734880 100644 --- a/Documentation/devicetree/bindings/qdsp/msm-ssc-sensors.txt +++ b/Documentation/devicetree/bindings/qdsp/msm-ssc-sensors.txt @@ -5,7 +5,12 @@ msm-ssc-sensors driver implements the mechanism that allows to load SLPI firmwar Required properties: - compatible: This must be "qcom,msm-ssc-sensors" + +Optional properties: + - qcom,firmware-name: SLPI firmware name, must be "slpi_v1" or "slpi_v2" + Firmware name is not required, if sensors driver is sharing processor for execution. + Example: The following for msm8998 version 1. 
From 1713de7137be89ba83c3a4eb178ad894726366ad Mon Sep 17 00:00:00 2001
From: Vinayak Menon
Date: Tue, 13 Dec 2016 20:00:45 +0530
Subject: [PATCH 811/813] mm: memcontrol: fix a compile time error

Fix a compile-time error introduced by commit 334ca3ed18de ("Merge
branch 'linux-linaro-lsk-v4.4' into linux-linaro-lsk-v4.4-android").
The merge left a stale .attach initializer pointing at
mem_cgroup_move_task; drop it, since the task-moving work is now done
from the .post_attach callback.

Change-Id: I1af8d539a1ce07e16f6c46f855c65cc4985de737
Signed-off-by: Vinayak Menon
---
 mm/memcontrol.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 08806bb1f070..12923d118bf2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5271,7 +5271,6 @@ struct cgroup_subsys memory_cgrp_subsys = {
 	.css_reset = mem_cgroup_css_reset,
 	.can_attach = mem_cgroup_can_attach,
 	.cancel_attach = mem_cgroup_cancel_attach,
-	.attach = mem_cgroup_move_task,
 	.allow_attach = mem_cgroup_allow_attach,
 	.post_attach = mem_cgroup_move_task,
 	.bind = mem_cgroup_bind,

From b858dc4085f1387e0a8e26e8fd4fb30561e3f442 Mon Sep 17 00:00:00 2001
From: Taniya Das
Date: Thu, 15 Dec 2016 14:58:47 +0530
Subject: [PATCH 812/813] clk: qcom: mmcc: Update the DSI PLL parent names

The byte and pixel clock RCGs are sourced from their DSI byte/pixel
PLLs; update the parent names so that those parents can be requested.

Change-Id: Ie92df31a5cdfa176e872d721a84475a37172a2dd
Signed-off-by: Taniya Das
---
 drivers/clk/qcom/mmcc-msmfalcon.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/clk/qcom/mmcc-msmfalcon.c b/drivers/clk/qcom/mmcc-msmfalcon.c
index 1d874f6db464..ef4c8c264078 100644
--- a/drivers/clk/qcom/mmcc-msmfalcon.c
+++ b/drivers/clk/qcom/mmcc-msmfalcon.c
@@ -112,8 +112,8 @@ static const struct parent_map mmcc_parent_map_1[] = {
 
 static const char * const mmcc_parent_names_1[] = {
 	"xo",
-	"dsi0_phy_pll_out_byteclk",
-	"dsi1_phy_pll_out_byteclk",
+	"dsi0pll_byte_clk_mux",
+	"dsi1pll_byte_clk_mux",
 	"core_bi_pll_test_se",
 };
 
@@ -240,8 +240,8 @@ static const struct parent_map mmcc_parent_map_8[] = {
 
 static const char * const mmcc_parent_names_8[] = {
 	"xo",
-	"dsi0_phy_pll_out_dsiclk",
-	"dsi1_phy_pll_out_dsiclk",
+	"dsi0pll_pixel_clk_mux",
+	"dsi1pll_pixel_clk_mux",
 	"core_bi_pll_test_se",
 };

From 8e63d4bb1cb90ec48b69fa985942dfa4c22c6896 Mon Sep 17 00:00:00 2001
From: Amey Telawane
Date: Tue, 6 Dec 2016 19:00:40 +0530
Subject: [PATCH 813/813] ARM: dts: msm: add tpdm, tpda and cti nodes on
 msmfalcon

Add support for CTI, TPDA and TPDM on msmfalcon. TPDM is used for
trace, profiling and diagnostics, and TPDA is used as an aggregator.
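In outline, every TPDM (trace source) in the diff below exposes an
output endpoint that is wired to an input endpoint of a TPDA or funnel
through a remote-endpoint pair, with the input side marked slave-mode,
and each TPDA stamps its aggregated stream with a unique qcom,tpda-atid
trace ID. A minimal hypothetical source/aggregator pair illustrating
the pattern is sketched here; the tpdm_foo/tpda_foo names, addresses
and ATID are made up, clock properties are omitted for brevity, and
funnel_in_tpda_foo stands for the matching endpoint on a funnel input
port that is not shown:

	tpdm_foo: tpdm@7300000 {
		compatible = "qcom,coresight-tpdm";
		reg = <0x7300000 0x1000>;
		reg-names = "tpdm-base";

		coresight-name = "coresight-tpdm-foo";

		port{
			tpdm_foo_out_tpda_foo: endpoint {
				remote-endpoint = <&tpda_foo_in_tpdm_foo>;
			};
		};
	};

	tpda_foo: tpda@7301000 {
		compatible = "qcom,coresight-tpda";
		reg = <0x7301000 0x1000>;
		reg-names = "tpda-base";

		coresight-name = "coresight-tpda-foo";

		/* unique trace ID stamped on the aggregated stream */
		qcom,tpda-atid = <71>;

		ports {
			#address-cells = <1>;
			#size-cells = <0>;
			port@0 {
				reg = <0>;
				tpda_foo_out_funnel: endpoint {
					remote-endpoint =
						<&funnel_in_tpda_foo>;
				};
			};
			port@1 {
				reg = <0>;
				tpda_foo_in_tpdm_foo: endpoint {
					slave-mode;
					remote-endpoint =
						<&tpdm_foo_out_tpda_foo>;
				};
			};
		};
	};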
Change-Id: Ifad3259ca0fab45a264596475150d01708db3588 CRs-fixed: 1056777 Signed-off-by: Amey Telawane --- .../boot/dts/qcom/msmfalcon-coresight.dtsi | 772 +++++++++++++++++- 1 file changed, 769 insertions(+), 3 deletions(-) diff --git a/arch/arm/boot/dts/qcom/msmfalcon-coresight.dtsi b/arch/arm/boot/dts/qcom/msmfalcon-coresight.dtsi index 2f1ef974811e..d5e27cc05979 100644 --- a/arch/arm/boot/dts/qcom/msmfalcon-coresight.dtsi +++ b/arch/arm/boot/dts/qcom/msmfalcon-coresight.dtsi @@ -173,7 +173,7 @@ <&funnel_merg_in_funnel_in0>; }; }; - port@3 { + port@2 { reg = <6>; funnel_in0_in_funnel_qatb: endpoint { slave-mode; @@ -181,7 +181,7 @@ <&funnel_qatb_out_funnel_in0>; }; }; - port@4 { + port@3 { reg = <7>; funnel_in0_in_stm: endpoint { slave-mode; @@ -215,7 +215,23 @@ <&funnel_merg_in_funnel_in1>; }; }; - port@5 { + port@1 { + reg = <2>; + funnel_in1_in_tpda_nav: endpoint { + slave-mode; + remote-endpoint = + <&tpda_nav_out_funnel_in1>; + }; + }; + port@2 { + reg = <5>; + funnel_in1_in_tpda_mss: endpoint { + slave-mode; + remote-endpoint = + <&tpda_mss_out_funnel_in1>; + }; + }; + port@3 { reg = <6>; funnel_in1_in_funnel_apss_merg: endpoint { slave-mode; @@ -258,6 +274,22 @@ <&funnel_apss_out_funnel_apss_merg>; }; }; + port@2 { + reg = <1>; + funnel_apss_merg_in_tpda_olc: endpoint { + slave-mode; + remote-endpoint = + <&tpda_olc_out_funnel_apss_merg>; + }; + }; + port@3 { + reg = <3>; + funnel_apss_merg_in_tpda_apss: endpoint { + slave-mode; + remote-endpoint = + <&tpda_apss_out_funnel_apss_merg>; + }; + }; }; }; @@ -829,6 +861,162 @@ clock-names = "core_clk", "core_a_clk"; }; + cti_apss: cti@7b80000 { + compatible = "arm,coresight-cti"; + reg = <0x7b80000 0x1000>; + reg-names = "cti-base"; + + coresight-name = "coresight-cti-apss"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + }; + + cti_apss_dl: cti@7bc1000 { + compatible = "arm,coresight-cti"; + reg = <0x7bc1000 0x1000>; + reg-names = "cti-base"; + + coresight-name = "coresight-cti-apss-dl"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + }; + + cti_olc: cti@7b91000 { + compatible = "arm,coresight-cti"; + reg = <0x7b91000 0x1000>; + reg-names = "cti-base"; + + coresight-name = "coresight-cti-olc"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + }; + + cti_lpass0: cti@7060000 { + compatible = "arm,coresight-cti"; + reg = <0x7060000 0x1000>; + reg-names = "cti-base"; + + coresight-name = "coresight-cti-lpass0"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + }; + + cti_lpass1: cti@7061000 { + compatible = "arm,coresight-cti"; + reg = <0x7061000 0x1000>; + reg-names = "cti-base"; + + coresight-name = "coresight-cti-lpass1"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + }; + + cti_turing: cti@7068000 { + compatible = "arm,coresight-cti"; + reg = <0x7068000 0x1000>; + reg-names = "cti-base"; + + coresight-name = "coresight-cti-turing"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + }; + + cti_wcss0: cti@71a4000 { + compatible = "arm,coresight-cti"; + reg = <0x71a4000 0x1000>; + reg-names = "cti-base"; + + coresight-name = "coresight-cti-wcss0"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + 
clock-names = "core_clk", "core_a_clk"; + }; + + cti_wcss1: cti@71a5000 { + compatible = "arm,coresight-cti"; + reg = <0x71a5000 0x1000>; + reg-names = "cti-base"; + + coresight-name = "coresight-cti-wcss1"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + }; + + cti_wcss2: cti@71a6000 { + compatible = "arm,coresight-cti"; + reg = <0x71a6000 0x1000>; + reg-names = "cti-base"; + + coresight-name = "coresight-cti-wcss2"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + }; + + cti_mmss: cti@7188000 { + compatible = "arm,coresight-cti"; + reg = <0x7188000 0x1000>; + reg-names = "cti-base"; + + coresight-name = "coresight-cti-mmss"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + }; + + cti_isdb: cti@7121000 { + compatible = "arm,coresight-cti"; + reg = <0x7121000 0x1000>; + reg-names = "cti-base"; + + coresight-name = "coresight-cti-isdb"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + }; + + cti_rpm: cti@7048000 { + compatible = "arm,coresight-cti"; + reg = <0x7048000 0x1000>; + reg-names = "cti-base"; + + coresight-name = "coresight-cti-rpm"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + }; + + cti_mss: cti@7041000 { + compatible = "arm,coresight-cti"; + reg = <0x7041000 0x1000>; + reg-names = "cti-base"; + + coresight-name = "coresight-cti-mss"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + }; + funnel_qatb: funnel@6005000 { compatible = "arm,primecell"; arm,primecell-periphid = <0x0003b908>; @@ -861,6 +1049,14 @@ <&tpda_out_funnel_qatb>; }; }; + port@2 { + reg = <3>; + funnel_qatb_in_funnel_dlct: endpoint { + slave-mode; + remote-endpoint = + <&funnel_dlct_out_funnel_qatb>; + }; + }; }; }; @@ -898,7 +1094,31 @@ <&funnel_qatb_in_tpda>; }; }; + port@1 { + reg = <1>; + tpda_in_funnel_gpu_dl: endpoint { + slave-mode; + remote-endpoint = + <&funnel_gpu_dl_out_tpda>; + }; + }; port@2 { + reg = <2>; + tpda_in_funnel_dlct: endpoint { + slave-mode; + remote-endpoint = + <&funnel_dlct_out_tpda>; + }; + }; + port@3 { + reg = <4>; + tpda_in_tpdm_vsense: endpoint { + slave-mode; + remote-endpoint = + <&tpdm_vsense_out_tpda>; + }; + }; + port@4 { reg = <5>; tpda_in_tpdm_dcc: endpoint { slave-mode; @@ -906,6 +1126,110 @@ <&tpdm_dcc_out_tpda>; }; }; + port@5 { + reg = <6>; + tpda_in_tpdm_prng: endpoint { + slave-mode; + remote-endpoint = + <&tpdm_prng_out_tpda>; + }; + }; + port@6 { + reg = <8>; + tpda_in_tpdm_qm: endpoint { + slave-mode; + remote-endpoint = + <&tpdm_qm_out_tpda>; + }; + }; + port@7 { + reg = <10>; + tpda_in_tpdm_pimem: endpoint { + slave-mode; + remote-endpoint = + <&tpdm_pimem_out_tpda>; + }; + }; + port@8 { + reg = <11>; + tpda_in_tpdm: endpoint { + slave-mode; + remote-endpoint = + <&tpdm_out_tpda>; + }; + }; + }; + }; + + funnel_gpu_dl: funnel@7140000 { + compatible = "arm,primecell"; + arm,primecell-periphid = <0x0003b908>; + + reg = <0x71c40000 0x1000>; + reg-names = "funnel-base"; + + coresight-name = "coresight-funnel-gpu-dl"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "apb_pclk", "core_a_clk"; + + ports { + #address-cells = <1>; + #size-cells = <0>; + + port@0 { + reg = <0>; + funnel_gpu_dl_out_tpda: endpoint { 
+ remote-endpoint = + <&tpda_in_funnel_gpu_dl>; + }; + }; + port@2 { + reg = <0>; + funnel_gpu_dl_in_tpdm_gpu: endpoint { + slave-mode; + remote-endpoint = + <&tpdm_gpu_out_funnel_gpu_dl>; + }; + }; + }; + }; + + tpdm_gpu: tpdm@7111000 { + status = "disabled"; + compatible = "qcom,coresight-tpdm"; + reg = <0x7111000 0x1000>; + reg-names = "tpdm-base"; + + coresight-name = "coresight-tpdm-gpu"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + + port{ + tpdm_gpu_out_funnel_gpu_dl: endpoint { + remote-endpoint = <&funnel_gpu_dl_in_tpdm_gpu>; + }; + }; + }; + + tpdm_vsense: tpdm@7038000 { + compatible = "qcom,coresight-tpdm"; + reg = <0x7038000 0x1000>; + reg-names = "tpdm-base"; + + coresight-name = "coresight-tpdm-vsense"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + + port{ + tpdm_vsense_out_tpda: endpoint { + remote-endpoint = <&tpda_in_tpdm_vsense>; + }; }; }; @@ -926,4 +1250,446 @@ }; }; }; + + tpdm_prng: tpdm@704c000 { + compatible = "qcom,coresight-tpdm"; + reg = <0x704c000 0x1000>; + reg-names = "tpdm-base"; + + coresight-name = "coresight-tpdm-prng"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + + port{ + tpdm_prng_out_tpda: endpoint { + remote-endpoint = <&tpda_in_tpdm_prng>; + }; + }; + }; + + tpdm_qm: tpdm@71d0000 { + compatible = "qcom,coresight-tpdm"; + reg = <0x71d0000 0x1000>; + reg-names = "tpdm-base"; + + coresight-name = "coresight-tpdm-qm"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + + port{ + tpdm_qm_out_tpda: endpoint { + remote-endpoint = <&tpda_in_tpdm_qm>; + }; + }; + }; + + tpdm_pimem: tpdm@7050000 { + compatible = "qcom,coresight-tpdm"; + reg = <0x7050000 0x1000>; + reg-names = "tpdm-base"; + + coresight-name = "coresight-tpdm-pimem"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + + port{ + tpdm_pimem_out_tpda: endpoint { + remote-endpoint = <&tpda_in_tpdm_pimem>; + }; + }; + }; + + tpdm: tpdm@6006000 { + compatible = "qcom,coresight-tpdm"; + reg = <0x6006000 0x1000>; + reg-names = "tpdm-base"; + + coresight-name = "coresight-tpdm"; + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + + port{ + tpdm_out_tpda: endpoint { + remote-endpoint = <&tpda_in_tpdm>; + }; + }; + }; + + tpda_nav: tpda@7191000 { + compatible = "qcom,coresight-tpda"; + reg = <0x7191000 0x1000>; + reg-names = "tpda-base"; + + coresight-name = "coresight-tpda-nav"; + + qcom,tpda-atid = <68>; + qcom,cmb-elem-size = <0 32>; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + + ports { + #address-cells = <1>; + #size-cells = <0>; + port@0 { + reg = <0>; + tpda_nav_out_funnel_in1: endpoint { + remote-endpoint = + <&funnel_in1_in_tpda_nav>; + }; + }; + port@1 { + reg = <0>; + tpda_nav_in_tpdm_nav: endpoint { + slave-mode; + remote-endpoint = + <&tpdm_nav_out_tpda_nav>; + }; + }; + }; + }; + + tpda_apss: tpda@7bc2000 { + compatible = "qcom,coresight-tpda"; + reg = <0x7bc2000 0x1000>; + reg-names = "tpda-base"; + + coresight-name = "coresight-tpda-apss"; + + qcom,tpda-atid = <66>; + qcom,dsb-elem-size = <0 128>; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", 
"core_a_clk"; + + ports { + #address-cells = <1>; + #size-cells = <0>; + port@0 { + reg = <0>; + tpda_apss_out_funnel_apss_merg: endpoint { + remote-endpoint = + <&funnel_apss_merg_in_tpda_apss>; + }; + }; + port@1 { + reg = <0>; + tpda_apss_in_tpdm_apss: endpoint { + slave-mode; + remote-endpoint = + <&tpdm_apss_out_tpda_apss>; + }; + }; + }; + }; + + tpdm_apss: tpdm@7bc0000 { + compatible = "qcom,coresight-tpdm"; + reg = <0x7bc0000 0x1000>; + reg-names = "tpdm-base"; + + coresight-name = "coresight-tpdm-apss"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + + port{ + tpdm_apss_out_tpda_apss: endpoint { + remote-endpoint = <&tpda_apss_in_tpdm_apss>; + }; + }; + }; + + tpda_mss: tpda@7043000 { + compatible = "qcom,coresight-tpda"; + reg = <0x7043000 0x1000>; + reg-names = "tpda-base"; + + coresight-name = "coresight-tpda-mss"; + + qcom,tpda-atid = <67>; + qcom,dsb-elem-size = <0 32>; + qcom,cmb-elem-size = <0 32>; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + + ports { + #address-cells = <1>; + #size-cells = <0>; + port@0 { + reg = <0>; + tpda_mss_out_funnel_in1: endpoint { + remote-endpoint = + <&funnel_in1_in_tpda_mss>; + }; + }; + port@1 { + reg = <0>; + tpda_mss_in_tpdm_mss: endpoint { + slave-mode; + remote-endpoint = + <&tpdm_mss_out_tpda_mss>; + }; + }; + }; + }; + + tpdm_mss: tpdm@7042000 { + compatible = "qcom,coresight-tpdm"; + reg = <0x7042000 0x1000>; + reg-names = "tpdm-base"; + + coresight-name = "coresight-tpdm-mss"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + + port{ + tpdm_mss_out_tpda_mss: endpoint { + remote-endpoint = <&tpda_mss_in_tpdm_mss>; + }; + }; + }; + + tpdm_nav: tpdm@7190000 { + compatible = "qcom,coresight-tpdm"; + reg = <0x7190000 0x1000>; + reg-names = "tpdm-base"; + + coresight-name = "coresight-tpdm-nav"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + + port{ + tpdm_nav_out_tpda_nav: endpoint { + remote-endpoint = <&tpda_nav_in_tpdm_nav>; + }; + }; + }; + + tpda_olc: tpda@7b92000 { + compatible = "qcom,coresight-tpda"; + reg = <0x7b92000 0x1000>; + reg-names = "tpda-base"; + + coresight-name = "coresight-tpda-olc"; + + qcom,tpda-atid = <69>; + qcom,cmb-elem-size = <0 64>; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + + ports { + #address-cells = <1>; + #size-cells = <0>; + port@0 { + reg = <0>; + tpda_olc_out_funnel_apss_merg: endpoint { + remote-endpoint = + <&funnel_apss_merg_in_tpda_olc>; + }; + }; + port@1 { + reg = <0>; + tpda_olc_in_tpdm_olc: endpoint { + slave-mode; + remote-endpoint = + <&tpdm_olc_out_tpda_olc>; + }; + }; + }; + }; + + tpdm_olc: tpdm@7b90000 { + compatible = "qcom,coresight-tpdm"; + reg = <0x7b90000 0x1000>; + reg-names = "tpdm-base"; + + coresight-name = "coresight-tpdm-olc"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + + port{ + tpdm_olc_out_tpda_olc: endpoint { + remote-endpoint = <&tpda_olc_in_tpdm_olc>; + }; + }; + }; + + funnel_dlct: funnel@71c3000 { + compatible = "arm,primecell"; + arm,primecell-periphid = <0x0003b908>; + + reg = <0x71c3000 0x1000>; + reg-names = "funnel-base"; + + coresight-name = "coresight-funnel-dlct"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc 
RPM_QDSS_A_CLK>; + clock-names = "apb_pclk", "core_a_clk"; + + ports { + #address-cells = <1>; + #size-cells = <0>; + + port@0 { + reg = <0>; + funnel_dlct_out_tpda: endpoint { + remote-endpoint = + <&tpda_in_funnel_dlct>; + }; + }; + port@1 { + reg = <1>; + funnel_dlct_out_funnel_qatb: endpoint { + remote-endpoint = + <&funnel_qatb_in_funnel_dlct>; + }; + }; + port@2 { + reg = <0>; + funnel_dlct_in_tpdm_dlct: endpoint { + slave-mode; + remote-endpoint = + <&tpdm_dlct_out_funnel_dlct>; + }; + }; + port@3 { + reg = <3>; + funnel_dlct_in_funnel_wcss: endpoint { + slave-mode; + remote-endpoint = + <&funnel_wcss_out_funnel_dlct>; + }; + }; + }; + }; + + tpdm_dlct: tpdm@71c2000 { + compatible = "qcom,coresight-tpdm"; + reg = <0x71c2000 0x1000>; + reg-names = "tpdm-base"; + + coresight-name = "coresight-tpdm-dlct"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + + port{ + tpdm_dlct_out_funnel_dlct: endpoint { + remote-endpoint = <&funnel_dlct_in_tpdm_dlct>; + }; + }; + }; + + funnel_wcss: funnel@719e000 { + compatible = "arm,primecell"; + arm,primecell-periphid = <0x0003b908>; + + reg = <0x719e000 0x1000>; + reg-names = "funnel-base"; + + coresight-name = "coresight-funnel-wcss"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "apb_pclk", "core_a_clk"; + + ports { + #address-cells = <1>; + #size-cells = <0>; + + port@0 { + reg = <0>; + funnel_wcss_out_funnel_dlct: endpoint { + remote-endpoint = + <&funnel_dlct_in_funnel_wcss>; + }; + }; + port@1 { + reg = <1>; + funnel_wcss_in_tpda_wcss: endpoint { + slave-mode; + remote-endpoint = + <&tpda_wcss_out_funnel_wcss>; + }; + }; + }; + }; + + tpda_wcss: tpda@719d000 { + status = "disabled"; + compatible = "qcom,coresight-tpda"; + reg = <0x719d000 0x1000>; + reg-names = "tpda-base"; + + coresight-name = "coresight-tpda-wcss"; + + qcom,tpda-atid = <70>; + qcom,dsb-elem-size = <0 32>; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + + ports { + #address-cells = <1>; + #size-cells = <0>; + port@0 { + reg = <0>; + tpda_wcss_out_funnel_wcss: endpoint { + remote-endpoint = + <&funnel_wcss_in_tpda_wcss>; + }; + }; + port@1 { + reg = <0>; + tpda_wcss_in_tpdm_wcss: endpoint { + slave-mode; + remote-endpoint = + <&tpdm_wcss_out_tpda_wcss>; + }; + }; + }; + }; + + tpdm_wcss: tpdm@719c000 { + status = "disabled"; + compatible = "qcom,coresight-tpdm"; + reg = <0x719c000 0x1000>; + reg-names = "tpdm-base"; + + coresight-name = "coresight-tpdm-wcss"; + + clocks = <&clock_rpmcc RPM_QDSS_CLK>, + <&clock_rpmcc RPM_QDSS_A_CLK>; + clock-names = "core_clk", "core_a_clk"; + + port{ + tpdm_wcss_out_tpda_wcss: endpoint { + remote-endpoint = <&tpda_wcss_in_tpdm_wcss>; + }; + }; + }; };
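Note that the tpdm_gpu, tpda_wcss and tpdm_wcss nodes above are added
with status = "disabled". A target-specific dtsi that wants one of them
active would typically override the status by label, along these lines
(illustrative):

	&tpdm_gpu {
		status = "ok";
	};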