From c6f1725b9cbd208c469142ebfc5bff476352cc08 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 5 Oct 2018 15:52:07 -0700 Subject: [PATCH 01/27] mm/vmstat.c: skip NR_TLB_REMOTE_FLUSH* properly commit 58bc4c34d249bf1bc50730a9a209139347cfacfe upstream. 5dd0b16cdaff ("mm/vmstat: Make NR_TLB_REMOTE_FLUSH_RECEIVED available even on UP") made the availability of the NR_TLB_REMOTE_FLUSH* counters inside the kernel unconditional to reduce #ifdef soup, but (either to avoid showing dummy zero counters to userspace, or because that code was missed) didn't update the vmstat_array, meaning that all following counters would be shown with incorrect values. This only affects kernel builds with CONFIG_VM_EVENT_COUNTERS=y && CONFIG_DEBUG_TLBFLUSH=y && CONFIG_SMP=n. Link: http://lkml.kernel.org/r/20181001143138.95119-2-jannh@google.com Fixes: 5dd0b16cdaff ("mm/vmstat: Make NR_TLB_REMOTE_FLUSH_RECEIVED available even on UP") Signed-off-by: Jann Horn Reviewed-by: Kees Cook Reviewed-by: Andrew Morton Acked-by: Michal Hocko Acked-by: Roman Gushchin Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Christoph Lameter Cc: Kemi Wang Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/vmstat.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/vmstat.c b/mm/vmstat.c index 5712cdaae964..8895eff2d735 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -858,6 +858,9 @@ const char * const vmstat_text[] = { #ifdef CONFIG_SMP "nr_tlb_remote_flush", "nr_tlb_remote_flush_received", +#else + "", /* nr_tlb_remote_flush */ + "", /* nr_tlb_remote_flush_received */ #endif /* CONFIG_SMP */ "nr_tlb_local_flush_all", "nr_tlb_local_flush_one", From 937b51457b5fd9560a2ddb300b5dfbec87082919 Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Wed, 26 Sep 2018 18:11:22 +0200 Subject: [PATCH 02/27] fbdev/omapfb: fix omapfb_memory_read infoleak commit 1bafcbf59fed92af58955024452f45430d3898c5 upstream. OMAPFB_MEMORY_READ ioctl reads pixels from the LCD's memory and copies them to a userspace buffer. The code has two issues: - The user provided width and height could be large enough to overflow the calculations - The copy_to_user() can copy uninitialized memory to the userspace, which might contain sensitive kernel information. Fix these by limiting the width & height parameters, and only copying the amount of data that we actually received from the LCD. 
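For illustration only, here is a standalone userspace sketch (not driver code; the exact ioctl field widths are an assumption here) of how an unchecked width/height pair can defeat a size check through arithmetic wrap-around, which is why the dimensions are clamped before the multiplication is trusted:

#include <stdio.h>

int main(void)
{
        unsigned int w = 65536, h = 65536;      /* user-controlled dimensions */
        unsigned int buffer_size = 4096;        /* small destination buffer   */

        /*
         * 65536 * 65536 * 3 wraps to 0 in 32-bit arithmetic, so the size
         * check below is satisfied even though the request is enormous.
         */
        if (w * h * 3 > buffer_size)
                printf("request rejected\n");
        else
                printf("request accepted: %u x %u pixels into %u bytes\n",
                       w, h, buffer_size);

        /*
         * Clamping w and h first (the patch uses 4096) keeps the product
         * far from the wrap point, so the comparison is meaningful again.
         */
        return 0;
}
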
Signed-off-by: Tomi Valkeinen Reported-by: Jann Horn Cc: stable@vger.kernel.org Cc: security@kernel.org Cc: Will Deacon Cc: Jann Horn Cc: Tony Lindgren Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Greg Kroah-Hartman --- drivers/video/fbdev/omap2/omapfb/omapfb-ioctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/video/fbdev/omap2/omapfb/omapfb-ioctl.c b/drivers/video/fbdev/omap2/omapfb/omapfb-ioctl.c index 9ddfdd63b84c..34ab4f950f0a 100644 --- a/drivers/video/fbdev/omap2/omapfb/omapfb-ioctl.c +++ b/drivers/video/fbdev/omap2/omapfb/omapfb-ioctl.c @@ -496,6 +496,9 @@ static int omapfb_memory_read(struct fb_info *fbi, if (!access_ok(VERIFY_WRITE, mr->buffer, mr->buffer_size)) return -EFAULT; + if (mr->w > 4096 || mr->h > 4096) + return -EINVAL; + if (mr->w * mr->h * 3 > mr->buffer_size) return -EINVAL; @@ -509,7 +512,7 @@ static int omapfb_memory_read(struct fb_info *fbi, mr->x, mr->y, mr->w, mr->h); if (r > 0) { - if (copy_to_user(mr->buffer, buf, mr->buffer_size)) + if (copy_to_user(mr->buffer, buf, r)) r = -EFAULT; } From 5961c3d0064fdc4326d5390b30ceb50ae5d1459e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 1 Oct 2018 12:52:15 -0700 Subject: [PATCH 03/27] x86/vdso: Fix asm constraints on vDSO syscall fallbacks commit 715bd9d12f84d8f5cc8ad21d888f9bc304a8eb0b upstream. The syscall fallbacks in the vDSO have incorrect asm constraints. They are not marked as writing to their outputs -- instead, they are marked as clobbering "memory", which is useless. In particular, gcc is smart enough to know that the timespec parameter hasn't escaped, so a memory clobber doesn't clobber it. And passing a pointer as an asm *input* does not tell gcc that the pointed-to value is changed. Add in the fact that the asm instructions weren't volatile, and gcc was free to omit them entirely unless their sole output (the return value) is used. Which it is (phew!), but that stops happening with some upcoming patches. As a trivial example, the following code: void test_fallback(struct timespec *ts) { vdso_fallback_gettime(CLOCK_MONOTONIC, ts); } compiles to: 00000000000000c0 : c0: c3 retq To add insult to injury, the RCX and R11 clobbers on 64-bit builds were missing. The "memory" clobber is also unnecessary -- no ordering with respect to other memory operations is needed, but that's going to be fixed in a separate not-for-stable patch. 
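As a compiler-behaviour illustration (a simplified userspace sketch, not the vDSO code itself, and omitting the "memory" clobber which, per the explanation above, may not cover a local object anyway), the difference between passing a pointer as a plain register input and declaring the pointed-to object as an "=m" output:

/*
 * The first variant only tells the compiler which register holds the
 * pointer; nothing says *out is written, so a later read of the variable
 * may legally be folded to its old value.  The second variant declares
 * the store as an output, as the patch does for *ts and *tv.
 */
static inline void store_without_output(int *out)
{
        asm ("movl $42, (%0)" : : "r" (out));
}

static inline void store_with_output(int *out)
{
        asm ("movl $42, %0" : "=m" (*out));
}

int demo(void)
{
        int v = 0;

        store_without_output(&v);
        return v;               /* may be optimised to "return 0" */
}
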
Fixes: 2aae950b21e4 ("x86_64: Add vDSO for x86-64 with gettimeofday/clock_gettime/getcpu") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/2c0231690551989d2fafa60ed0e7b5cc8b403908.1538422295.git.luto@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/vdso/vclock_gettime.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 5dd363d54348..722cbed8026b 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -51,8 +51,9 @@ extern u8 pvclock_page notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; - asm("syscall" : "=a" (ret) : - "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory"); + asm ("syscall" : "=a" (ret), "=m" (*ts) : + "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : + "memory", "rcx", "r11"); return ret; } @@ -60,8 +61,9 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) { long ret; - asm("syscall" : "=a" (ret) : - "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); + asm ("syscall" : "=a" (ret), "=m" (*tv), "=m" (*tz) : + "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : + "memory", "rcx", "r11"); return ret; } @@ -143,12 +145,12 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; - asm( + asm ( "mov %%ebx, %%edx \n" "mov %2, %%ebx \n" "call __kernel_vsyscall \n" "mov %%edx, %%ebx \n" - : "=a" (ret) + : "=a" (ret), "=m" (*ts) : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) : "memory", "edx"); return ret; @@ -158,12 +160,12 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) { long ret; - asm( + asm ( "mov %%ebx, %%edx \n" "mov %2, %%ebx \n" "call __kernel_vsyscall \n" "mov %%edx, %%ebx \n" - : "=a" (ret) + : "=a" (ret), "=m" (*tv), "=m" (*tz) : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) : "memory", "edx"); return ret; From e8dc08a109cb0ceb84bd0f5b9e7f13f054f7fba5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Oct 2018 16:23:49 -0700 Subject: [PATCH 04/27] x86/vdso: Fix vDSO syscall fallback asm constraint regression commit 02e425668f5c9deb42787d10001a3b605993ad15 upstream. When I added the missing memory outputs, I failed to update the index of the first argument (ebx) on 32-bit builds, which broke the fallbacks. Somehow I must have screwed up my testing or gotten lucky. Add another test to cover gettimeofday() as well. 
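The hazard behind this regression is worth a tiny generic illustration (not the vDSO code): positional operand references such as %2 silently shift when an operand is added or removed, while symbolic %[name] references keep pointing at the intended operand.

/*
 * With symbolic names, adding another output ahead of [src] later would
 * not change what %[src] or %[dst] refer to; a positional %1/%2 would.
 */
static inline int add_values(int a, int b)
{
        asm ("add %[src], %[dst]"
             : [dst] "+r" (a)
             : [src] "r" (b));
        return a;
}
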
Signed-off-by: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: 715bd9d12f84 ("x86/vdso: Fix asm constraints on vDSO syscall fallbacks") Link: http://lkml.kernel.org/r/21bd45ab04b6d838278fa5bebfa9163eceffa13c.1538608971.git.luto@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/vdso/vclock_gettime.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 722cbed8026b..049327ee8868 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -147,11 +147,11 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) asm ( "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" + "mov %[clock], %%ebx \n" "call __kernel_vsyscall \n" "mov %%edx, %%ebx \n" : "=a" (ret), "=m" (*ts) - : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) + : "0" (__NR_clock_gettime), [clock] "g" (clock), "c" (ts) : "memory", "edx"); return ret; } @@ -162,11 +162,11 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) asm ( "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" + "mov %[tv], %%ebx \n" "call __kernel_vsyscall \n" "mov %%edx, %%ebx \n" : "=a" (ret), "=m" (*tv), "=m" (*tz) - : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) + : "0" (__NR_gettimeofday), [tv] "g" (tv), "c" (tz) : "memory", "edx"); return ret; } From 5527ae62617e69400435aa2f8b95656ad41626a2 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Thu, 27 Sep 2018 15:47:33 -0500 Subject: [PATCH 05/27] PCI: Reprogram bridge prefetch registers on resume commit 083874549fdfefa629dfa752785e20427dde1511 upstream. On 38+ Intel-based ASUS products, the NVIDIA GPU becomes unusable after S3 suspend/resume. The affected products include multiple generations of NVIDIA GPUs and Intel SoCs. After resume, nouveau logs many errors such as: fifo: fault 00 [READ] at 0000005555555000 engine 00 [GR] client 04 [HUB/FE] reason 4a [] on channel -1 [007fa91000 unknown] DRM: failed to idle channel 0 [DRM] Similarly, the NVIDIA proprietary driver also fails after resume (black screen, 100% CPU usage in Xorg process). We shipped a sample to NVIDIA for diagnosis, and their response indicated that it's a problem with the parent PCI bridge (on the Intel SoC), not the GPU. Runtime suspend/resume works fine, only S3 suspend is affected. We found a workaround: on resume, rewrite the Intel PCI bridge 'Prefetchable Base Upper 32 Bits' register (PCI_PREF_BASE_UPPER32). In the cases that I checked, this register has value 0 and we just have to rewrite that value. Linux already saves and restores PCI config space during suspend/resume, but this register was being skipped because upon resume, it already has value 0 (the correct, pre-suspend value). Intel appear to have previously acknowledged this behaviour and the requirement to rewrite this register: https://bugzilla.kernel.org/show_bug.cgi?id=116851#c23 Based on that, rewrite the prefetch register values even when that appears unnecessary. We have confirmed this solution on all the affected models we have in-hands (X542UQ, UX533FD, X530UN, V272UN). Additionally, this solves an issue where r8169 MSI-X interrupts were broken after S3 suspend/resume on ASUS X441UAR. This issue was recently worked around in commit 7bb05b85bc2d ("r8169: don't use MSI-X on RTL8106e"). It also fixes the same issue on RTL6186evl/8111evl on an Aimfor-tech laptop that we had not yet patched. 
I suspect it will also fix the issue that was worked around in commit 7c53a722459c ("r8169: don't use MSI-X on RTL8168g"). Thomas Martitz reports that this change also solves an issue where the AMD Radeon Polaris 10 GPU on the HP Zbook 14u G5 is unresponsive after S3 suspend/resume. Link: https://bugzilla.kernel.org/show_bug.cgi?id=201069 Signed-off-by: Daniel Drake Signed-off-by: Bjorn Helgaas Reviewed-by: Rafael J. Wysocki Reviewed-By: Peter Wu CC: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/pci/pci.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 295bf1472d02..5073ab023123 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1064,12 +1064,12 @@ int pci_save_state(struct pci_dev *dev) EXPORT_SYMBOL(pci_save_state); static void pci_restore_config_dword(struct pci_dev *pdev, int offset, - u32 saved_val, int retry) + u32 saved_val, int retry, bool force) { u32 val; pci_read_config_dword(pdev, offset, &val); - if (val == saved_val) + if (!force && val == saved_val) return; for (;;) { @@ -1088,25 +1088,36 @@ static void pci_restore_config_dword(struct pci_dev *pdev, int offset, } static void pci_restore_config_space_range(struct pci_dev *pdev, - int start, int end, int retry) + int start, int end, int retry, + bool force) { int index; for (index = end; index >= start; index--) pci_restore_config_dword(pdev, 4 * index, pdev->saved_config_space[index], - retry); + retry, force); } static void pci_restore_config_space(struct pci_dev *pdev) { if (pdev->hdr_type == PCI_HEADER_TYPE_NORMAL) { - pci_restore_config_space_range(pdev, 10, 15, 0); + pci_restore_config_space_range(pdev, 10, 15, 0, false); /* Restore BARs before the command register. */ - pci_restore_config_space_range(pdev, 4, 9, 10); - pci_restore_config_space_range(pdev, 0, 3, 0); + pci_restore_config_space_range(pdev, 4, 9, 10, false); + pci_restore_config_space_range(pdev, 0, 3, 0, false); + } else if (pdev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { + pci_restore_config_space_range(pdev, 12, 15, 0, false); + + /* + * Force rewriting of prefetch registers to avoid S3 resume + * issues on Intel PCI bridges that occur when these + * registers are not explicitly written. + */ + pci_restore_config_space_range(pdev, 9, 11, 0, true); + pci_restore_config_space_range(pdev, 0, 8, 0, false); } else { - pci_restore_config_space_range(pdev, 0, 15, 0); + pci_restore_config_space_range(pdev, 0, 15, 0, false); } } From 24479b9d2d03c1fde26dae957b18da8164caf3b0 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 29 Sep 2018 16:01:58 +0200 Subject: [PATCH 06/27] mac80211: fix setting IEEE80211_KEY_FLAG_RX_MGMT for AP mode keys commit 211710ca74adf790b46ab3867fcce8047b573cd1 upstream. key->sta is only valid after ieee80211_key_link, which is called later in this function. Because of that, the IEEE80211_KEY_FLAG_RX_MGMT is never set when management frame protection is enabled. 
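A standalone sketch of the ordering bug (generic C, not mac80211; the structure and flag names are placeholders): a back-pointer that is only filled in by a later link step is still NULL when the earlier setup code tests it, so the check has to use the pointer the caller already holds.

#include <stdio.h>

struct sta { int mfp; };
struct key { struct sta *sta; unsigned int flags; };
#define KEY_FLAG_RX_MGMT 0x1    /* placeholder for IEEE80211_KEY_FLAG_RX_MGMT */

static void add_key(struct key *key, struct sta *sta)
{
        /* Buggy form: key->sta is not linked yet, so this never fires. */
        if (key->sta && key->sta->mfp)
                key->flags |= KEY_FLAG_RX_MGMT;

        /* Fixed form: test the station the caller passed in. */
        if (sta && sta->mfp)
                key->flags |= KEY_FLAG_RX_MGMT;

        key->sta = sta;         /* the real code links the key only after the check */
}

int main(void)
{
        struct sta sta = { .mfp = 1 };
        struct key key = { 0 };

        add_key(&key, &sta);
        printf("flags=%#x\n", key.flags);
        return 0;
}
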
Fixes: e548c49e6dc6b ("mac80211: add key flag for management keys") Cc: stable@vger.kernel.org Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- net/mac80211/cfg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 1f930032253a..67348d8ac35d 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -219,7 +219,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: /* Keys without a station are used for TX only */ - if (key->sta && test_sta_flag(key->sta, WLAN_STA_MFP)) + if (sta && test_sta_flag(sta, WLAN_STA_MFP)) key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT; break; case NL80211_IFTYPE_ADHOC: From 1516d9fa636f1e6276bf37086d06f9847c25bc59 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 4 Oct 2018 11:08:12 +0200 Subject: [PATCH 07/27] PM / core: Clear the direct_complete flag on errors commit 69e445ab8b66a9f30519842ef18be555d3ee9b51 upstream. If __device_suspend() runs asynchronously (in which case the device passed to it is in dpm_suspended_list at that point) and it returns early on an error or pending wakeup, and the power.direct_complete flag has been set for the device already, the subsequent device_resume() will be confused by that and it will call pm_runtime_enable() incorrectly, as runtime PM has not been disabled for the device by __device_suspend(). To avoid that, clear power.direct_complete if __device_suspend() is not going to disable runtime PM for the device before returning. Fixes: aae4518b3124 (PM / sleep: Mechanism to avoid resuming runtime-suspended devices unnecessarily) Reported-by: Al Cooper Tested-by: Al Cooper Reviewed-by: Ulf Hansson Cc: 3.16+ # 3.16+ Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/base/power/main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index e9b713675c7c..05409141ec07 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1355,8 +1355,10 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) dpm_wait_for_children(dev, async); - if (async_error) + if (async_error) { + dev->power.direct_complete = false; goto Complete; + } /* * If a device configured to wake up the system from sleep states @@ -1368,6 +1370,7 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) pm_wakeup_event(dev, 0); if (pm_wakeup_pending()) { + dev->power.direct_complete = false; async_error = -EBUSY; goto Complete; } From dce708809daaced8d63eda3447b59951186f3f1a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 25 Sep 2018 20:56:02 -0400 Subject: [PATCH 08/27] dm cache: fix resize crash if user doesn't reload cache table commit 5d07384a666d4b2f781dc056bfeec2c27fbdf383 upstream. A reload of the cache's DM table is needed during resize because otherwise a crash will occur when attempting to access smq policy entries associated with the portion of the cache that was recently extended. The reason is cache-size based data structures in the policy will not be resized, the only way to safely extend the cache is to allow for a proper cache policy initialization that occurs when the cache table is loaded. For example the smq policy's space_init(), init_allocator(), calc_hotspot_params() must be sized based on the extended cache size. 
The fix for this is to disallow cache resizes of this pattern: 1) suspend "cache" target's device 2) resize the fast device used for the cache 3) resume "cache" target's device Instead, the last step must be a full reload of the cache's DM table. Fixes: 66a636356 ("dm cache: add stochastic-multi-queue (smq) policy") Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer Signed-off-by: Greg Kroah-Hartman --- drivers/md/dm-cache-target.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index b59615ddf6ba..531d6f3a786e 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -3391,8 +3391,13 @@ static dm_cblock_t get_cache_dev_size(struct cache *cache) static bool can_resize(struct cache *cache, dm_cblock_t new_size) { - if (from_cblock(new_size) > from_cblock(cache->cache_size)) - return true; + if (from_cblock(new_size) > from_cblock(cache->cache_size)) { + if (cache->sized) { + DMERR("%s: unable to extend cache due to missing cache table reload", + cache_device_name(cache)); + return false; + } + } /* * We can't drop a dirty block when shrinking the cache. From ed8649a4691b52228bbbeca87780e284f2c62456 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Mon, 1 Oct 2018 18:36:07 +0300 Subject: [PATCH 09/27] xhci: Add missing CAS workaround for Intel Sunrise Point xHCI commit ffe84e01bb1b38c7eb9c6b6da127a6c136d251df upstream. The workaround for missing CAS bit is also needed for xHC on Intel sunrisepoint PCH. For more details see: Intel 100/c230 series PCH specification update Doc #332692-006 Errata #8 Cc: Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index cbf3be66f89c..d6e2199bcfe5 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -174,6 +174,8 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) } if (pdev->vendor == PCI_VENDOR_ID_INTEL && (pdev->device == PCI_DEVICE_ID_INTEL_CHERRYVIEW_XHCI || + pdev->device == PCI_DEVICE_ID_INTEL_SUNRISEPOINT_LP_XHCI || + pdev->device == PCI_DEVICE_ID_INTEL_SUNRISEPOINT_H_XHCI || pdev->device == PCI_DEVICE_ID_INTEL_APL_XHCI || pdev->device == PCI_DEVICE_ID_INTEL_DNV_XHCI)) xhci->quirks |= XHCI_MISSING_CAS; From d70a6783f730d83709ca8173558483dba0741b88 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 24 Sep 2018 15:28:10 +0200 Subject: [PATCH 10/27] USB: serial: simple: add Motorola Tetra MTP6550 id commit f5fad711c06e652f90f581fc7c2caee327c33d31 upstream. Add device-id for the Motorola Tetra radio MTP6550. Bus 001 Device 004: ID 0cad:9012 Motorola CGISS Device Descriptor: bLength 18 bDescriptorType 1 bcdUSB 2.00 bDeviceClass 0 (Defined at Interface level) bDeviceSubClass 0 bDeviceProtocol 0 bMaxPacketSize0 64 idVendor 0x0cad Motorola CGISS idProduct 0x9012 bcdDevice 24.16 iManufacturer 1 Motorola Solutions, Inc. 
iProduct 2 TETRA PEI interface iSerial 0 bNumConfigurations 1 Configuration Descriptor: bLength 9 bDescriptorType 2 wTotalLength 55 bNumInterfaces 2 bConfigurationValue 1 iConfiguration 3 Generic Serial config bmAttributes 0x80 (Bus Powered) MaxPower 500mA Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 0 bAlternateSetting 0 bNumEndpoints 2 bInterfaceClass 255 Vendor Specific Class bInterfaceSubClass 0 bInterfaceProtocol 0 iInterface 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x81 EP 1 IN bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0200 1x 512 bytes bInterval 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x01 EP 1 OUT bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0200 1x 512 bytes Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 1 bAlternateSetting 0 bNumEndpoints 2 bInterfaceClass 255 Vendor Specific Class bInterfaceSubClass 0 bInterfaceProtocol 0 iInterface 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x82 EP 2 IN bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0200 1x 512 bytes bInterval 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x02 EP 2 OUT bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0200 1x 512 bytes bInterval 0 Device Qualifier (for other device speed): bLength 10 bDescriptorType 6 bcdUSB 2.00 bDeviceClass 0 (Defined at Interface level) bDeviceSubClass 0 bDeviceProtocol 0 bMaxPacketSize0 64 bNumConfigurations 1 Device Status: 0x0000 (Bus Powered) Reported-by: Hans Hult Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/usb-serial-simple.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/serial/usb-serial-simple.c b/drivers/usb/serial/usb-serial-simple.c index 2674da40d9cd..6d6acf2c07c3 100644 --- a/drivers/usb/serial/usb-serial-simple.c +++ b/drivers/usb/serial/usb-serial-simple.c @@ -87,7 +87,8 @@ DEVICE(moto_modem, MOTO_IDS); /* Motorola Tetra driver */ #define MOTOROLA_TETRA_IDS() \ - { USB_DEVICE(0x0cad, 0x9011) } /* Motorola Solutions TETRA PEI */ + { USB_DEVICE(0x0cad, 0x9011) }, /* Motorola Solutions TETRA PEI */ \ + { USB_DEVICE(0x0cad, 0x9012) } /* MTP6550 */ DEVICE(motorola_tetra, MOTOROLA_TETRA_IDS); /* Novatel Wireless GPS driver */ From 26f9ef6cebbe7af1893efc7720d3270e0139b9cf Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 25 Sep 2018 21:06:24 -0700 Subject: [PATCH 11/27] of: unittest: Disable interrupt node tests for old world MAC systems commit 8894891446c9380709451b99ab45c5c53adfd2fc upstream. On systems with OF_IMAP_OLDWORLD_MAC set in of_irq_workarounds, the devicetree interrupt parsing code is different, causing unit tests of devicetree interrupt nodes to fail. Due to a bug in unittest code, which tries to dereference an uninitialized pointer, this results in a crash. 
OF: /testcase-data/phandle-tests/consumer-a: arguments longer than property Unable to handle kernel paging request for data at address 0x00bc616e Faulting instruction address: 0xc08e9468 Oops: Kernel access of bad area, sig: 11 [#1] BE PREEMPT PowerMac Modules linked in: CPU: 0 PID: 1 Comm: swapper Not tainted 4.14.72-rc1-yocto-standard+ #1 task: cf8e0000 task.stack: cf8da000 NIP: c08e9468 LR: c08ea5bc CTR: c08ea5ac REGS: cf8dbb50 TRAP: 0300 Not tainted (4.14.72-rc1-yocto-standard+) MSR: 00001032 CR: 82004044 XER: 00000000 DAR: 00bc616e DSISR: 40000000 GPR00: c08ea5bc cf8dbc00 cf8e0000 c13ca517 c13ca517 c13ca8a0 00000066 00000002 GPR08: 00000063 00bc614e c0b05865 000affff 82004048 00000000 c00047f0 00000000 GPR16: c0a80000 c0a9cc34 c13ca517 c0ad1134 05ffffff 000affff c0b05860 c0abeef8 GPR24: cecec278 cecec278 c0a8c4d0 c0a885e0 c13ca8a0 05ffffff c13ca8a0 c13ca517 NIP [c08e9468] device_node_gen_full_name+0x30/0x15c LR [c08ea5bc] device_node_string+0x190/0x3c8 Call Trace: [cf8dbc00] [c007f670] trace_hardirqs_on_caller+0x118/0x1fc (unreliable) [cf8dbc40] [c08ea5bc] device_node_string+0x190/0x3c8 [cf8dbcb0] [c08eb794] pointer+0x25c/0x4d0 [cf8dbd00] [c08ebcbc] vsnprintf+0x2b4/0x5ec [cf8dbd60] [c08ec00c] vscnprintf+0x18/0x48 [cf8dbd70] [c008e268] vprintk_store+0x4c/0x22c [cf8dbda0] [c008ecac] vprintk_emit+0x94/0x130 [cf8dbdd0] [c008ff54] printk+0x5c/0x6c [cf8dbe10] [c0b8ddd4] of_unittest+0x2220/0x26f8 [cf8dbea0] [c0004434] do_one_initcall+0x4c/0x184 [cf8dbf00] [c0b4534c] kernel_init_freeable+0x13c/0x1d8 [cf8dbf30] [c0004814] kernel_init+0x24/0x118 [cf8dbf40] [c0013398] ret_from_kernel_thread+0x5c/0x64 The problem was observed when running a qemu test for the g3beige machine with devicetree unittests enabled. Disable interrupt node tests on affected systems to avoid both false unittest failures and the crash. With this patch in place, unittest on the affected system passes with the following message. 
dt-test ### end of unittest - 144 passed, 0 failed Fixes: 53a42093d96ef ("of: Add device tree selftests") Signed-off-by: Guenter Roeck Reviewed-by: Frank Rowand Signed-off-by: Rob Herring Signed-off-by: Greg Kroah-Hartman --- drivers/of/unittest.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c index 2a547ca3d443..2eac3df7dd29 100644 --- a/drivers/of/unittest.c +++ b/drivers/of/unittest.c @@ -553,6 +553,9 @@ static void __init of_unittest_parse_interrupts(void) struct of_phandle_args args; int i, rc; + if (of_irq_workarounds & OF_IMAP_OLDWORLD_MAC) + return; + np = of_find_node_by_path("/testcase-data/interrupts/interrupts0"); if (!np) { pr_err("missing testcase data\n"); @@ -627,6 +630,9 @@ static void __init of_unittest_parse_interrupts_extended(void) struct of_phandle_args args; int i, rc; + if (of_irq_workarounds & OF_IMAP_OLDWORLD_MAC) + return; + np = of_find_node_by_path("/testcase-data/interrupts/interrupts-extended0"); if (!np) { pr_err("missing testcase data\n"); @@ -778,15 +784,19 @@ static void __init of_unittest_platform_populate(void) pdev = of_find_device_by_node(np); unittest(pdev, "device 1 creation failed\n"); - irq = platform_get_irq(pdev, 0); - unittest(irq == -EPROBE_DEFER, "device deferred probe failed - %d\n", irq); + if (!(of_irq_workarounds & OF_IMAP_OLDWORLD_MAC)) { + irq = platform_get_irq(pdev, 0); + unittest(irq == -EPROBE_DEFER, + "device deferred probe failed - %d\n", irq); - /* Test that a parsing failure does not return -EPROBE_DEFER */ - np = of_find_node_by_path("/testcase-data/testcase-device2"); - pdev = of_find_device_by_node(np); - unittest(pdev, "device 2 creation failed\n"); - irq = platform_get_irq(pdev, 0); - unittest(irq < 0 && irq != -EPROBE_DEFER, "device parsing error failed - %d\n", irq); + /* Test that a parsing failure does not return -EPROBE_DEFER */ + np = of_find_node_by_path("/testcase-data/testcase-device2"); + pdev = of_find_device_by_node(np); + unittest(pdev, "device 2 creation failed\n"); + irq = platform_get_irq(pdev, 0); + unittest(irq < 0 && irq != -EPROBE_DEFER, + "device parsing error failed - %d\n", irq); + } np = of_find_node_by_path("/testcase-data/platform-tests"); unittest(np, "No testcase data in device tree\n"); From fb751efb29d037ee051f326b6f622881ca0b9a14 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 13 Jun 2018 00:51:28 -0400 Subject: [PATCH 12/27] ext4: always verify the magic number in xattr blocks commit 513f86d73855ce556ea9522b6bfd79f87356dc3a upstream. If there an inode points to a block which is also some other type of metadata block (such as a block allocation bitmap), the buffer_verified flag can be set when it was validated as that other metadata block type; however, it would make a really terrible external attribute block. The reason why we use the verified flag is to avoid constantly reverifying the block. However, it doesn't take much overhead to make sure the magic number of the xattr block is correct, and this will avoid potential crashes. This addresses CVE-2018-10879. 
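The hazard being closed is general enough to show in a standalone sketch (not ext4 code; the structure is simplified and the magic constant is a stand-in): a per-buffer "verified" flag shared by every metadata type must not short-circuit the type-specific sanity check.

#include <stdio.h>
#include <stdint.h>

#define XATTR_MAGIC 0xEA020000u         /* stand-in for EXT4_XATTR_MAGIC */

struct buffer { uint32_t magic; int verified; };

/*
 * Old order: trusts a flag that may have been set when the same block
 * was validated as a different kind of metadata (e.g. a bitmap).
 */
static int check_old(const struct buffer *bh)
{
        if (bh->verified)
                return 0;
        return bh->magic == XATTR_MAGIC ? 0 : -1;
}

/* New order: the cheap magic-number test always runs first. */
static int check_new(const struct buffer *bh)
{
        if (bh->magic != XATTR_MAGIC)
                return -1;      /* cheap, catches cross-type reuse */
        if (bh->verified)
                return 0;       /* only the expensive checks are skipped */
        /* ... checksum and name validation would follow here ... */
        return 0;
}

int main(void)
{
        /* A block previously verified as something other than an xattr block. */
        struct buffer reused = { .magic = 0, .verified = 1 };

        printf("old order: %d, new order: %d\n",
               check_old(&reused), check_new(&reused));
        return 0;
}
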
https://bugzilla.kernel.org/show_bug.cgi?id=200001 Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger [Backported to 4.4: adjust context] Signed-off-by: Daniel Rosenberg Signed-off-by: Greg Kroah-Hartman --- fs/ext4/xattr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index b51bb73b06a6..d0aaf338fa9f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -220,12 +220,12 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) { int error; - if (buffer_verified(bh)) - return 0; - if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) return -EFSCORRUPTED; + if (buffer_verified(bh)) + return 0; + if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh))) return -EFSBADCRC; error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size, From 75fe5488b22f10844334c0960b12b2cf5c606028 Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Tue, 19 Dec 2017 12:56:57 +0530 Subject: [PATCH 13/27] cgroup: Fix deadlock in cpu hotplug path commit 116d2f7496c51b2e02e8e4ecdd2bdf5fb9d5a641 upstream. Deadlock during cgroup migration from cpu hotplug path when a task T is being moved from source to destination cgroup. kworker/0:0 cpuset_hotplug_workfn() cpuset_hotplug_update_tasks() hotplug_update_tasks_legacy() remove_tasks_in_empty_cpuset() cgroup_transfer_tasks() // stuck in iterator loop cgroup_migrate() cgroup_migrate_add_task() In cgroup_migrate_add_task() it checks for PF_EXITING flag of task T. Task T will not migrate to destination cgroup. css_task_iter_start() will keep pointing to task T in loop waiting for task T cg_list node to be removed. Task T do_exit() exit_signals() // sets PF_EXITING exit_task_namespaces() switch_task_namespaces() free_nsproxy() put_mnt_ns() drop_collected_mounts() namespace_unlock() synchronize_rcu() _synchronize_rcu_expedited() schedule_work() // on cpu0 low priority worker pool wait_event() // waiting for work item to execute Task T inserted a work item in the worklist of cpu0 low priority worker pool. It is waiting for expedited grace period work item to execute. This work item will only be executed once kworker/0:0 complete execution of cpuset_hotplug_workfn(). kworker/0:0 ==> Task T ==>kworker/0:0 In case of PF_EXITING task being migrated from source to destination cgroup, migrate next available task in source cgroup. Signed-off-by: Prateek Sood Signed-off-by: Tejun Heo [AmitP: Upstream commit cherry-pick failed, so I picked the backported changes from CAF/msm-4.9 tree instead: https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?id=49b74f1696417b270c89cd893ca9f37088928078] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/cgroup.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4cb94b678e9f..5299618d6308 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4083,7 +4083,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) */ do { css_task_iter_start(&from->self, &it); - task = css_task_iter_next(&it); + + do { + task = css_task_iter_next(&it); + } while (task && (task->flags & PF_EXITING)); + if (task) get_task_struct(task); css_task_iter_end(&it); From 023fdb64ee8d171f691a97bffa1d168bef750c7f Mon Sep 17 00:00:00 2001 From: Carl Huang Date: Mon, 5 Mar 2018 14:44:02 +0800 Subject: [PATCH 14/27] ath10k: fix use-after-free in ath10k_wmi_cmd_send_nowait commit 9ef0f58ed7b4a55da4a64641d538e0d9e46579ac upstream. 
The skb may be freed in tx completion context before trace_ath10k_wmi_cmd is called. This can be easily captured when KASAN(Kernel Address Sanitizer) is enabled. The fix is to move trace_ath10k_wmi_cmd before the send operation. As the ret has no meaning in trace_ath10k_wmi_cmd then, so remove this parameter too. Signed-off-by: Carl Huang Tested-by: Brian Norris Reviewed-by: Brian Norris Signed-off-by: Kalle Valo Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/ath/ath10k/trace.h | 12 ++++-------- drivers/net/wireless/ath/ath10k/wmi.c | 2 +- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/trace.h b/drivers/net/wireless/ath/ath10k/trace.h index 71bdb368813d..0194bebbdbf7 100644 --- a/drivers/net/wireless/ath/ath10k/trace.h +++ b/drivers/net/wireless/ath/ath10k/trace.h @@ -152,10 +152,9 @@ TRACE_EVENT(ath10k_log_dbg_dump, ); TRACE_EVENT(ath10k_wmi_cmd, - TP_PROTO(struct ath10k *ar, int id, const void *buf, size_t buf_len, - int ret), + TP_PROTO(struct ath10k *ar, int id, const void *buf, size_t buf_len), - TP_ARGS(ar, id, buf, buf_len, ret), + TP_ARGS(ar, id, buf, buf_len), TP_STRUCT__entry( __string(device, dev_name(ar->dev)) @@ -163,7 +162,6 @@ TRACE_EVENT(ath10k_wmi_cmd, __field(unsigned int, id) __field(size_t, buf_len) __dynamic_array(u8, buf, buf_len) - __field(int, ret) ), TP_fast_assign( @@ -171,17 +169,15 @@ TRACE_EVENT(ath10k_wmi_cmd, __assign_str(driver, dev_driver_string(ar->dev)); __entry->id = id; __entry->buf_len = buf_len; - __entry->ret = ret; memcpy(__get_dynamic_array(buf), buf, buf_len); ), TP_printk( - "%s %s id %d len %zu ret %d", + "%s %s id %d len %zu", __get_str(driver), __get_str(device), __entry->id, - __entry->buf_len, - __entry->ret + __entry->buf_len ) ); diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c index 7569db0f69b5..5bb1be478954 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.c +++ b/drivers/net/wireless/ath/ath10k/wmi.c @@ -1642,8 +1642,8 @@ int ath10k_wmi_cmd_send_nowait(struct ath10k *ar, struct sk_buff *skb, cmd_hdr->cmd_id = __cpu_to_le32(cmd); memset(skb_cb, 0, sizeof(*skb_cb)); + trace_ath10k_wmi_cmd(ar, cmd_id, skb->data, skb->len); ret = ath10k_htc_send(&ar->htc, ar->wmi.eid, skb); - trace_ath10k_wmi_cmd(ar, cmd_id, skb->data, skb->len, ret); if (ret) goto err_pull; From 0cbf366a69887b623108f0a20f8850331a7ffac8 Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Sat, 27 May 2017 17:46:15 +0200 Subject: [PATCH 15/27] powerpc/fadump: Return error when fadump registration fails commit 98b8cd7f75643e0a442d7a4c1cef2c9d53b7e92b upstream. 
- log an error message when registration fails and no error code listed in the switch is returned - translate the hv error code to posix error code and return it from fw_register - return the posix error code from fw_register to the process writing to sysfs - return EEXIST on re-registration - return success on deregistration when fadump is not registered - return ENODEV when no memory is reserved for fadump Signed-off-by: Michal Suchanek Tested-by: Hari Bathini [mpe: Use pr_err() to shrink the error print] Signed-off-by: Michael Ellerman Cc: Kleber Sacilotto de Souza Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/fadump.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index c3c835290131..ca3ad5ebcd41 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -360,9 +360,9 @@ static int __init early_fadump_reserve_mem(char *p) } early_param("fadump_reserve_mem", early_fadump_reserve_mem); -static void register_fw_dump(struct fadump_mem_struct *fdm) +static int register_fw_dump(struct fadump_mem_struct *fdm) { - int rc; + int rc, err; unsigned int wait_time; pr_debug("Registering for firmware-assisted kernel dump...\n"); @@ -379,7 +379,11 @@ static void register_fw_dump(struct fadump_mem_struct *fdm) } while (wait_time); + err = -EIO; switch (rc) { + default: + pr_err("Failed to register. Unknown Error(%d).\n", rc); + break; case -1: printk(KERN_ERR "Failed to register firmware-assisted kernel" " dump. Hardware Error(%d).\n", rc); @@ -387,18 +391,22 @@ static void register_fw_dump(struct fadump_mem_struct *fdm) case -3: printk(KERN_ERR "Failed to register firmware-assisted kernel" " dump. Parameter Error(%d).\n", rc); + err = -EINVAL; break; case -9: printk(KERN_ERR "firmware-assisted kernel dump is already " " registered."); fw_dump.dump_registered = 1; + err = -EEXIST; break; case 0: printk(KERN_INFO "firmware-assisted kernel dump registration" " is successful\n"); fw_dump.dump_registered = 1; + err = 0; break; } + return err; } void crash_fadump(struct pt_regs *regs, const char *str) @@ -997,7 +1005,7 @@ static unsigned long init_fadump_header(unsigned long addr) return addr; } -static void register_fadump(void) +static int register_fadump(void) { unsigned long addr; void *vaddr; @@ -1008,7 +1016,7 @@ static void register_fadump(void) * assisted dump. */ if (!fw_dump.reserve_dump_area_size) - return; + return -ENODEV; ret = fadump_setup_crash_memory_ranges(); if (ret) @@ -1023,7 +1031,7 @@ static void register_fadump(void) fadump_create_elfcore_headers(vaddr); /* register the future kernel dump with firmware. 
*/ - register_fw_dump(&fdm); + return register_fw_dump(&fdm); } static int fadump_unregister_dump(struct fadump_mem_struct *fdm) @@ -1208,7 +1216,6 @@ static ssize_t fadump_register_store(struct kobject *kobj, switch (buf[0]) { case '0': if (fw_dump.dump_registered == 0) { - ret = -EINVAL; goto unlock_out; } /* Un-register Firmware-assisted dump */ @@ -1216,11 +1223,11 @@ static ssize_t fadump_register_store(struct kobject *kobj, break; case '1': if (fw_dump.dump_registered == 1) { - ret = -EINVAL; + ret = -EEXIST; goto unlock_out; } /* Register Firmware-assisted dump */ - register_fadump(); + ret = register_fadump(); break; default: ret = -EINVAL; From b4dd80c33338df403868222045d6427295ff54e5 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Fri, 5 Oct 2018 12:48:48 -0700 Subject: [PATCH 16/27] ARC: clone syscall to setp r25 as thread pointer commit c58a584f05e35d1d4342923cd7aac07d9c3d3d16 upstream. Per ARC TLS ABI, r25 is designated TP (thread pointer register). However so far kernel didn't do any special treatment, like setting up usermode r25, even for CLONE_SETTLS. We instead relied on libc runtime to do this, in say clone libc wrapper [1]. This was deliberate to keep kernel ABI agnostic (userspace could potentially change TP, specially for different ARC ISA say ARCompact vs. ARCv2 with different spare registers etc) However userspace setting up r25, after clone syscall opens a race, if child is not scheduled and gets a signal instead. It starts off in userspace not in clone but in a signal handler and anything TP sepcific there such as pthread_self() fails which showed up with uClibc testsuite nptl/tst-kill6 [2] Fix this by having kernel populate r25 to TP value. So this locks in ABI, but it was not going to change anyways, and fwiw is same for both ARCompact (arc700 core) and ARCvs (HS3x cores) [1] https://cgit.uclibc-ng.org/cgi/cgit/uclibc-ng.git/tree/libc/sysdeps/linux/arc/clone.S [2] https://github.com/wbx-github/uclibc-ng-test/blob/master/test/nptl/tst-kill6.c Fixes: ARC STAR 9001378481 Cc: stable@vger.kernel.org Reported-by: Nikita Sobolev Signed-off-by: Vineet Gupta Signed-off-by: Greg Kroah-Hartman --- arch/arc/kernel/process.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index a3f750e76b68..8f40c6c5d77e 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -153,6 +153,26 @@ int copy_thread(unsigned long clone_flags, task_thread_info(current)->thr_ptr; } + + /* + * setup usermode thread pointer #1: + * when child is picked by scheduler, __switch_to() uses @c_callee to + * populate usermode callee regs: this works (despite being in a kernel + * function) since special return path for child @ret_from_fork() + * ensures those regs are not clobbered all the way to RTIE to usermode + */ + c_callee->r25 = task_thread_info(p)->thr_ptr; + +#ifdef CONFIG_ARC_CURR_IN_REG + /* + * setup usermode thread pointer #2: + * however for this special use of r25 in kernel, __switch_to() sets + * r25 for kernel needs and only in the final return path is usermode + * r25 setup, from pt_regs->user_r25. So set that up as well + */ + c_regs->user_r25 = c_callee->r25; +#endif + return 0; } From 38ea605f919c61fc0c3ca65abeaa87ca83f2110e Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 12 Sep 2018 16:27:44 -0700 Subject: [PATCH 17/27] ucma: fix a use-after-free in ucma_resolve_ip() commit 5fe23f262e0548ca7f19fb79f89059a60d087d22 upstream. 
There is a race condition between ucma_close() and ucma_resolve_ip(): CPU0 CPU1 ucma_resolve_ip(): ucma_close(): ctx = ucma_get_ctx(file, cmd.id); list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) { mutex_lock(&mut); idr_remove(&ctx_idr, ctx->id); mutex_unlock(&mut); ... mutex_lock(&mut); if (!ctx->closing) { mutex_unlock(&mut); rdma_destroy_id(ctx->cm_id); ... ucma_free_ctx(ctx); ret = rdma_resolve_addr(); ucma_put_ctx(ctx); Before idr_remove(), ucma_get_ctx() could still find the ctx and after rdma_destroy_id(), rdma_resolve_addr() may still access id_priv pointer. Also, ucma_put_ctx() may use ctx after ucma_free_ctx() too. ucma_close() should call ucma_put_ctx() too which tests the refcnt and waits for the last one releasing it. The similar pattern is already used by ucma_destroy_id(). Reported-and-tested-by: syzbot+da2591e115d57a9cbb8b@syzkaller.appspotmail.com Reported-by: syzbot+cfe3c1e8ef634ba8964b@syzkaller.appspotmail.com Cc: Jason Gunthorpe Cc: Doug Ledford Cc: Leon Romanovsky Signed-off-by: Cong Wang Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/core/ucma.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 9712a63957e1..7525e9f6949e 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1709,6 +1709,8 @@ static int ucma_close(struct inode *inode, struct file *filp) mutex_lock(&mut); if (!ctx->closing) { mutex_unlock(&mut); + ucma_put_ctx(ctx); + wait_for_completion(&ctx->comp); /* rdma_destroy_id ensures that no event handlers are * inflight for that id before releasing it. */ From c4c84454902516b7648a18d5173304da7dffa9d6 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Mon, 3 Sep 2018 23:06:23 +0200 Subject: [PATCH 18/27] ubifs: Check for name being NULL while mounting commit 37f31b6ca4311b94d985fb398a72e5399ad57925 upstream. The requested device name can be NULL or an empty string. Check for that and refuse to continue. UBIFS has to do this manually since we cannot use mount_bdev(), which checks for this condition. Fixes: 1e51764a3c2ac ("UBIFS: add new flash file system") Reported-by: syzbot+38bd0f7865e5c6379280@syzkaller.appspotmail.com Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/super.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 0bb6de356451..7968b7a5e787 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1918,6 +1918,9 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode) int dev, vol; char *endptr; + if (!name || !*name) + return ERR_PTR(-EINVAL); + /* First, try to open using the device node path method */ ubi = ubi_open_volume_path(name, mode); if (!IS_ERR(ubi)) From ec7055c62714326c56dabcf7757069ac7f276bda Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Sep 2018 16:24:05 +0800 Subject: [PATCH 19/27] tcp: increment sk_drops for dropped rx packets [ Upstream commit 532182cd610782db8c18230c2747626562032205 ] Now ss can report sk_drops, we can instruct TCP to increment this per socket counter when it drops an incoming frame, to refine monitoring and debugging. Following patch takes care of listeners drops. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller Signed-off-by: Mao Wenan Signed-off-by: Greg Kroah-Hartman --- include/net/sock.h | 7 +++++++ net/ipv4/tcp_input.c | 33 ++++++++++++++++++++------------- net/ipv4/tcp_ipv4.c | 1 + net/ipv6/tcp_ipv6.c | 1 + 4 files changed, 29 insertions(+), 13 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 3d5ff7436f41..577075713ad5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2139,6 +2139,13 @@ sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) SOCK_SKB_CB(skb)->dropcount = atomic_read(&sk->sk_drops); } +static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) +{ + int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs); + + atomic_add(segs, &sk->sk_drops); +} + void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9c4c6cd0316e..9df4cbb9c25a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4296,6 +4296,12 @@ static bool tcp_try_coalesce(struct sock *sk, return true; } +static void tcp_drop(struct sock *sk, struct sk_buff *skb) +{ + sk_drops_add(sk, skb); + __kfree_skb(skb); +} + /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. */ @@ -4320,7 +4326,7 @@ static void tcp_ofo_queue(struct sock *sk) __skb_unlink(skb, &tp->out_of_order_queue); if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { SOCK_DEBUG(sk, "ofo packet was already received\n"); - __kfree_skb(skb); + tcp_drop(sk, skb); continue; } SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", @@ -4372,7 +4378,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); - __kfree_skb(skb); + tcp_drop(sk, skb); return; } @@ -4436,7 +4442,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. 
*/ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - __kfree_skb(skb); + tcp_drop(sk, skb); skb = NULL; tcp_dsack_set(sk, seq, end_seq); goto add_sack; @@ -4475,7 +4481,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - __kfree_skb(skb1); + tcp_drop(sk, skb1); } add_sack: @@ -4558,12 +4564,13 @@ err: static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - int eaten = -1; bool fragstolen = false; + int eaten = -1; - if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) - goto drop; - + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { + __kfree_skb(skb); + return; + } skb_dst_drop(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); @@ -4645,7 +4652,7 @@ out_of_window: tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); inet_csk_schedule_ack(sk); drop: - __kfree_skb(skb); + tcp_drop(sk, skb); return; } @@ -5236,7 +5243,7 @@ syn_challenge: return true; discard: - __kfree_skb(skb); + tcp_drop(sk, skb); return false; } @@ -5454,7 +5461,7 @@ csum_error: TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); discard: - __kfree_skb(skb); + tcp_drop(sk, skb); } EXPORT_SYMBOL(tcp_rcv_established); @@ -5684,7 +5691,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, TCP_DELACK_MAX, TCP_RTO_MAX); discard: - __kfree_skb(skb); + tcp_drop(sk, skb); return 0; } else { tcp_send_ack(sk); @@ -6041,7 +6048,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (!queued) { discard: - __kfree_skb(skb); + tcp_drop(sk, skb); } return 0; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index eeda67c3dd11..01715fc75169 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1716,6 +1716,7 @@ discard_it: return 0; discard_and_relse: + sk_drops_add(sk, skb); sock_put(sk); goto discard_it; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 90abe88e1b40..d6c191158e07 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1505,6 +1505,7 @@ discard_it: return 0; discard_and_relse: + sk_drops_add(sk, skb); sock_put(sk); goto discard_it; From 4666b6e2b27d91e05a5b8459e40e4a05dbc1c7b0 Mon Sep 17 00:00:00 2001 From: Yaogong Wang Date: Fri, 14 Sep 2018 16:24:06 +0800 Subject: [PATCH 20/27] tcp: use an RB tree for ooo receive queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 9f5afeae51526b3ad7b7cb21ee8b145ce6ea7a7a ] Over the years, TCP BDP has increased by several orders of magnitude, and some people are considering to reach the 2 Gbytes limit. Even with current window scale limit of 14, ~1 Gbytes maps to ~740,000 MSS. In presence of packet losses (or reorders), TCP stores incoming packets into an out of order queue, and number of skbs sitting there waiting for the missing packets to be received can be in the 10^5 range. Most packets are appended to the tail of this queue, and when packets can finally be transferred to receive queue, we scan the queue from its head. However, in presence of heavy losses, we might have to find an arbitrary point in this queue, involving a linear scan for every incoming packet, throwing away cpu caches. This patch converts it to a RB tree, to get bounded latencies. Yaogong wrote a preliminary patch about 2 years ago. Eric did the rebase, added ofo_last_skb cache, polishing and tests. 
Tested with network dropping between 1 and 10 % packets, with good success (about 30 % increase of throughput in stress tests) Next step would be to also use an RB tree for the write queue at sender side ;) Signed-off-by: Yaogong Wang Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Cc: Ilpo Järvinen Acked-By: Ilpo Järvinen Signed-off-by: David S. Miller Signed-off-by: Mao Wenan Signed-off-by: Greg Kroah-Hartman --- include/linux/skbuff.h | 8 + include/linux/tcp.h | 7 +- include/net/tcp.h | 2 +- net/core/skbuff.c | 19 ++ net/ipv4/tcp.c | 4 +- net/ipv4/tcp_input.c | 362 +++++++++++++++++++++++---------------- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 1 - 8 files changed, 244 insertions(+), 161 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c28bd8be290a..a490dd718654 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2273,6 +2273,8 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) kfree_skb(skb); } +void skb_rbtree_purge(struct rb_root *root); + void *netdev_alloc_frag(unsigned int fragsz); struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, @@ -2807,6 +2809,12 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) return __pskb_trim(skb, len); } +#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode) +#define skb_rb_first(root) rb_to_skb(rb_first(root)) +#define skb_rb_last(root) rb_to_skb(rb_last(root)) +#define skb_rb_next(skb) rb_to_skb(rb_next(&(skb)->rbnode)) +#define skb_rb_prev(skb) rb_to_skb(rb_prev(&(skb)->rbnode)) + #define skb_queue_walk(queue, skb) \ for (skb = (queue)->next; \ skb != (struct sk_buff *)(queue); \ diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 5b6df1a8dc74..747404dbe506 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -279,10 +279,9 @@ struct tcp_sock { struct sk_buff* lost_skb_hint; struct sk_buff *retransmit_skb_hint; - /* OOO segments go in this list. Note that socket lock must be held, - * as we do not use sk_buff_head lock. - */ - struct sk_buff_head out_of_order_queue; + /* OOO segments go in this rbtree. Socket lock must be held. */ + struct rb_root out_of_order_queue; + struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */ /* SACKs data, these 2 need to be together (see tcp_options_write) */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 6c89238f192e..a99f75ef6a73 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -649,7 +649,7 @@ static inline void tcp_fast_path_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (skb_queue_empty(&tp->out_of_order_queue) && + if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && tp->rcv_wnd && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && !tp->urg_data) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 55be076706e5..9703924ed071 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2377,6 +2377,25 @@ void skb_queue_purge(struct sk_buff_head *list) } EXPORT_SYMBOL(skb_queue_purge); +/** + * skb_rbtree_purge - empty a skb rbtree + * @root: root of the rbtree to empty + * + * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from + * the list and one reference dropped. This function does not take + * any lock. Synchronization should be handled by the caller (e.g., TCP + * out-of-order queue is protected by the socket lock). 
+ */ +void skb_rbtree_purge(struct rb_root *root) +{ + struct sk_buff *skb, *next; + + rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode) + kfree_skb(skb); + + *root = RB_ROOT; +} + /** * skb_queue_head - queue a buffer at the list head * @list: list to use diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5e162b8ab184..b7492aabe710 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -382,7 +382,7 @@ void tcp_init_sock(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - __skb_queue_head_init(&tp->out_of_order_queue); + tp->out_of_order_queue = RB_ROOT; tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); INIT_LIST_HEAD(&tp->tsq_node); @@ -2240,7 +2240,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); tcp_write_queue_purge(sk); - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); inet->inet_dport = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9df4cbb9c25a..7832d0d107cd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4073,7 +4073,7 @@ static void tcp_fin(struct sock *sk) /* It _is_ possible, that we have something out-of-order _after_ FIN. * Probably, we should reset in this case. For now drop them. */ - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); sk_mem_reclaim(sk); @@ -4233,7 +4233,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) int this_sack; /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ - if (skb_queue_empty(&tp->out_of_order_queue)) { + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tp->rx_opt.num_sacks = 0; return; } @@ -4309,10 +4309,13 @@ static void tcp_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); __u32 dsack_high = tp->rcv_nxt; + bool fin, fragstolen, eaten; struct sk_buff *skb, *tail; - bool fragstolen, eaten; + struct rb_node *p; - while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { + p = rb_first(&tp->out_of_order_queue); + while (p) { + skb = rb_entry(p, struct sk_buff, rbnode); if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; @@ -4322,9 +4325,10 @@ static void tcp_ofo_queue(struct sock *sk) dsack_high = TCP_SKB_CB(skb)->end_seq; tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); } + p = rb_next(p); + rb_erase(&skb->rbnode, &tp->out_of_order_queue); - __skb_unlink(skb, &tp->out_of_order_queue); - if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { SOCK_DEBUG(sk, "ofo packet was already received\n"); tcp_drop(sk, skb); continue; @@ -4336,12 +4340,19 @@ static void tcp_ofo_queue(struct sock *sk) tail = skb_peek_tail(&sk->sk_receive_queue); eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); + fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) __skb_queue_tail(&sk->sk_receive_queue, skb); - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) - tcp_fin(sk); - if (eaten) + else kfree_skb_partial(skb, fragstolen); + + if (unlikely(fin)) { + tcp_fin(sk); + /* tcp_fin() purges tp->out_of_order_queue, + * so we must end this loop right now. 
+ */ + break; + } } } @@ -4371,8 +4382,10 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + struct rb_node **p, *q, *parent; struct sk_buff *skb1; u32 seq, end_seq; + bool fragstolen; tcp_ecn_check_ce(sk, skb); @@ -4387,89 +4400,86 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) inet_csk_schedule_ack(sk); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); + seq = TCP_SKB_CB(skb)->seq; + end_seq = TCP_SKB_CB(skb)->end_seq; SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + tp->rcv_nxt, seq, end_seq); - skb1 = skb_peek_tail(&tp->out_of_order_queue); - if (!skb1) { + p = &tp->out_of_order_queue.rb_node; + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { /* Initial out of order segment, build 1 SACK. */ if (tcp_is_sack(tp)) { tp->rx_opt.num_sacks = 1; - tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; - tp->selective_acks[0].end_seq = - TCP_SKB_CB(skb)->end_seq; + tp->selective_acks[0].start_seq = seq; + tp->selective_acks[0].end_seq = end_seq; } - __skb_queue_head(&tp->out_of_order_queue, skb); + rb_link_node(&skb->rbnode, NULL, p); + rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); + tp->ooo_last_skb = skb; goto end; } - seq = TCP_SKB_CB(skb)->seq; - end_seq = TCP_SKB_CB(skb)->end_seq; - - if (seq == TCP_SKB_CB(skb1)->end_seq) { - bool fragstolen; - - if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); - } else { - tcp_grow_window(sk, skb); - kfree_skb_partial(skb, fragstolen); - skb = NULL; - } - - if (!tp->rx_opt.num_sacks || - tp->selective_acks[0].end_seq != seq) - goto add_sack; - - /* Common case: data arrive in order after hole. */ - tp->selective_acks[0].end_seq = end_seq; - goto end; + /* In the typical case, we are adding an skb to the end of the list. + * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. + */ + if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { +coalesce_done: + tcp_grow_window(sk, skb); + kfree_skb_partial(skb, fragstolen); + skb = NULL; + goto add_sack; } - /* Find place to insert this segment. */ - while (1) { - if (!after(TCP_SKB_CB(skb1)->seq, seq)) - break; - if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) { - skb1 = NULL; - break; + /* Find place to insert this segment. Handle overlaps on the way. */ + parent = NULL; + while (*p) { + parent = *p; + skb1 = rb_entry(parent, struct sk_buff, rbnode); + if (before(seq, TCP_SKB_CB(skb1)->seq)) { + p = &parent->rb_left; + continue; } - skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1); + + if (before(seq, TCP_SKB_CB(skb1)->end_seq)) { + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + /* All the bits are present. Drop. */ + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); + __kfree_skb(skb); + skb = NULL; + tcp_dsack_set(sk, seq, end_seq); + goto add_sack; + } + if (after(seq, TCP_SKB_CB(skb1)->seq)) { + /* Partial overlap. */ + tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq); + } else { + /* skb's seq == skb1's seq and skb covers skb1. + * Replace skb1 with skb. 
+ */ + rb_replace_node(&skb1->rbnode, &skb->rbnode, + &tp->out_of_order_queue); + tcp_dsack_extend(sk, + TCP_SKB_CB(skb1)->seq, + TCP_SKB_CB(skb1)->end_seq); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); + __kfree_skb(skb1); + goto add_sack; + } + } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { + goto coalesce_done; + } + p = &parent->rb_right; } - /* Do skb overlap to previous one? */ - if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { - if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { - /* All the bits are present. Drop. */ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - tcp_drop(sk, skb); - skb = NULL; - tcp_dsack_set(sk, seq, end_seq); - goto add_sack; - } - if (after(seq, TCP_SKB_CB(skb1)->seq)) { - /* Partial overlap. */ - tcp_dsack_set(sk, seq, - TCP_SKB_CB(skb1)->end_seq); - } else { - if (skb_queue_is_first(&tp->out_of_order_queue, - skb1)) - skb1 = NULL; - else - skb1 = skb_queue_prev( - &tp->out_of_order_queue, - skb1); - } - } - if (!skb1) - __skb_queue_head(&tp->out_of_order_queue, skb); - else - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); - - /* And clean segments covered by new one as whole. */ - while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { - skb1 = skb_queue_next(&tp->out_of_order_queue, skb); + /* Insert segment into RB tree. */ + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); + /* Remove other segments covered by skb. */ + while ((q = rb_next(&skb->rbnode)) != NULL) { + skb1 = rb_entry(q, struct sk_buff, rbnode); if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) break; if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { @@ -4477,12 +4487,15 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) end_seq); break; } - __skb_unlink(skb1, &tp->out_of_order_queue); + rb_erase(&skb1->rbnode, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); tcp_drop(sk, skb1); } + /* If there is no skb after us, we are the last_skb ! */ + if (!q) + tp->ooo_last_skb = skb; add_sack: if (tcp_is_sack(tp)) @@ -4621,13 +4634,13 @@ queue_and_out: if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); - if (!skb_queue_empty(&tp->out_of_order_queue)) { + if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tcp_ofo_queue(sk); /* RFC2581. 4.2. SHOULD send immediate ACK, when * gap in queue is filled. */ - if (skb_queue_empty(&tp->out_of_order_queue)) + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) inet_csk(sk)->icsk_ack.pingpong = 0; } @@ -4679,48 +4692,76 @@ drop: tcp_data_queue_ofo(sk, skb); } -static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, - struct sk_buff_head *list) +static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list) { - struct sk_buff *next = NULL; + if (list) + return !skb_queue_is_last(list, skb) ? 
skb->next : NULL; - if (!skb_queue_is_last(list, skb)) - next = skb_queue_next(list, skb); + return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); +} + +static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, + struct sk_buff_head *list, + struct rb_root *root) +{ + struct sk_buff *next = tcp_skb_next(skb, list); + + if (list) + __skb_unlink(skb, list); + else + rb_erase(&skb->rbnode, root); - __skb_unlink(skb, list); __kfree_skb(skb); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); return next; } +/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ +static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct sk_buff *skb1; + + while (*p) { + parent = *p; + skb1 = rb_entry(parent, struct sk_buff, rbnode); + if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, root); +} + /* Collapse contiguous sequence of skbs head..tail with * sequence numbers start..end. * - * If tail is NULL, this means until the end of the list. + * If tail is NULL, this means until the end of the queue. * * Segments with FIN/SYN are not collapsed (only because this * simplifies code) */ static void -tcp_collapse(struct sock *sk, struct sk_buff_head *list, - struct sk_buff *head, struct sk_buff *tail, - u32 start, u32 end) +tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, + struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end) { - struct sk_buff *skb, *n; + struct sk_buff *skb = head, *n; + struct sk_buff_head tmp; bool end_of_skbs; /* First, check that queue is collapsible and find - * the point where collapsing can be useful. */ - skb = head; + * the point where collapsing can be useful. + */ restart: - end_of_skbs = true; - skb_queue_walk_from_safe(list, skb, n) { - if (skb == tail) - break; + for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) { + n = tcp_skb_next(skb, list); + /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { - skb = tcp_collapse_one(sk, skb, list); + skb = tcp_collapse_one(sk, skb, list, root); if (!skb) break; goto restart; @@ -4738,13 +4779,10 @@ restart: break; } - if (!skb_queue_is_last(list, skb)) { - struct sk_buff *next = skb_queue_next(list, skb); - if (next != tail && - TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) { - end_of_skbs = false; - break; - } + if (n && n != tail && + TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { + end_of_skbs = false; + break; } /* Decided to skip this, advance start seq. */ @@ -4754,17 +4792,22 @@ restart: (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) return; + __skb_queue_head_init(&tmp); + while (before(start, end)) { int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); struct sk_buff *nskb; nskb = alloc_skb(copy, GFP_ATOMIC); if (!nskb) - return; + break; memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; - __skb_queue_before(list, skb, nskb); + if (list) + __skb_queue_before(list, skb, nskb); + else + __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ skb_set_owner_r(nskb, sk); /* Copy data, releasing collapsed skbs. 
*/ @@ -4782,14 +4825,17 @@ restart: start += size; } if (!before(start, TCP_SKB_CB(skb)->end_seq)) { - skb = tcp_collapse_one(sk, skb, list); + skb = tcp_collapse_one(sk, skb, list, root); if (!skb || skb == tail || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) - return; + goto end; } } } +end: + skb_queue_walk_safe(&tmp, skb, n) + tcp_rbtree_insert(root, skb); } /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs @@ -4799,34 +4845,39 @@ static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 range_truesize, sum_tiny = 0; - struct sk_buff *skb = skb_peek(&tp->out_of_order_queue); - struct sk_buff *head; + struct sk_buff *skb, *head; + struct rb_node *p; u32 start, end; - if (!skb) + p = rb_first(&tp->out_of_order_queue); + skb = rb_entry_safe(p, struct sk_buff, rbnode); +new_range: + if (!skb) { + p = rb_last(&tp->out_of_order_queue); + /* Note: This is possible p is NULL here. We do not + * use rb_entry_safe(), as ooo_last_skb is valid only + * if rbtree is not empty. + */ + tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode); return; - + } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; range_truesize = skb->truesize; - head = skb; - for (;;) { - struct sk_buff *next = NULL; + for (head = skb;;) { + skb = tcp_skb_next(skb, NULL); - if (!skb_queue_is_last(&tp->out_of_order_queue, skb)) - next = skb_queue_next(&tp->out_of_order_queue, skb); - skb = next; - - /* Segment is terminated when we see gap or when - * we are at the end of all the queue. */ + /* Range is terminated when we see a gap or when + * we are at the queue end. + */ if (!skb || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { /* Do not attempt collapsing tiny skbs */ if (range_truesize != head->truesize || end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) { - tcp_collapse(sk, &tp->out_of_order_queue, + tcp_collapse(sk, NULL, &tp->out_of_order_queue, head, skb, start, end); } else { sum_tiny += range_truesize; @@ -4834,20 +4885,14 @@ static void tcp_collapse_ofo_queue(struct sock *sk) return; } - head = skb; - if (!skb) - break; - /* Start new segment */ - start = TCP_SKB_CB(skb)->seq; - end = TCP_SKB_CB(skb)->end_seq; - range_truesize = skb->truesize; - } else { - range_truesize += skb->truesize; - if (before(TCP_SKB_CB(skb)->seq, start)) - start = TCP_SKB_CB(skb)->seq; - if (after(TCP_SKB_CB(skb)->end_seq, end)) - end = TCP_SKB_CB(skb)->end_seq; + goto new_range; } + + range_truesize += skb->truesize; + if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) + start = TCP_SKB_CB(skb)->seq; + if (after(TCP_SKB_CB(skb)->end_seq, end)) + end = TCP_SKB_CB(skb)->end_seq; } } @@ -4858,23 +4903,36 @@ static void tcp_collapse_ofo_queue(struct sock *sk) static bool tcp_prune_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - bool res = false; + struct rb_node *node, *prev; - if (!skb_queue_empty(&tp->out_of_order_queue)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); - __skb_queue_purge(&tp->out_of_order_queue); + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) + return false; - /* Reset SACK state. A conforming SACK implementation will - * do the same at a timeout based retransmit. When a connection - * is in a sad state like this, we care only about integrity - * of the connection not performance. 
- */ - if (tp->rx_opt.sack_ok) - tcp_sack_reset(&tp->rx_opt); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); + + node = &tp->ooo_last_skb->rbnode; + do { + prev = rb_prev(node); + rb_erase(node, &tp->out_of_order_queue); + __kfree_skb(rb_to_skb(node)); sk_mem_reclaim(sk); - res = true; - } - return res; + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && + !tcp_under_memory_pressure(sk)) + break; + + node = prev; + } while (node); + tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); + + /* Reset SACK state. A conforming SACK implementation will + * do the same at a timeout based retransmit. When a connection + * is in a sad state like this, we care only about integrity + * of the connection not performance. + */ + if (tp->rx_opt.sack_ok) + tcp_sack_reset(&tp->rx_opt); + + return true; } /* Reduce allocated memory if we can, trying to get @@ -4902,7 +4960,7 @@ static int tcp_prune_queue(struct sock *sk) tcp_collapse_ofo_queue(sk); if (!skb_queue_empty(&sk->sk_receive_queue)) - tcp_collapse(sk, &sk->sk_receive_queue, + tcp_collapse(sk, &sk->sk_receive_queue, NULL, skb_peek(&sk->sk_receive_queue), NULL, tp->copied_seq, tp->rcv_nxt); @@ -5007,7 +5065,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* We have out of order data. */ - (ofo_possible && skb_peek(&tp->out_of_order_queue))) { + (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) { /* Then ack it now */ tcp_send_ack(sk); } else { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 01715fc75169..ee8399f11fd0 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1830,7 +1830,7 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_write_queue_purge(sk); /* Cleans up our, hopefully empty, out_of_order_queue. */ - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); #ifdef CONFIG_TCP_MD5SIG /* Clean up the MD5 key list, if any */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index d270870bf492..a48846d81b41 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -496,7 +496,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_cwnd_cnt = 0; tcp_init_xmit_timers(newsk); - __skb_queue_head_init(&newtp->out_of_order_queue); newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; newtp->rx_opt.saw_tstamp = 0; From e747775172a2d4dc4dae794f248f9687ba793f54 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Sep 2018 16:24:07 +0800 Subject: [PATCH 21/27] tcp: fix a stale ooo_last_skb after a replace [ Upstream commit 76f0dcbb5ae1a7c3dbeec13dd98233b8e6b0b32a ] When skb replaces another one in ooo queue, I forgot to also update tp->ooo_last_skb, if the replaced skb was the last one in the queue. To fix this, we can simply re-use the code that runs after an insertion, trying to merge skbs to the right of the current skb. This not only fixes the bug, but also removes all small skbs that might be a subset of the new one. Example: We receive segments 2001:3001, 4001:5001 Then we receive 2001:8001: We should replace 2001:3001 with the big skb, but also remove 4001:5001 from the queue to save space. packetdrill test demonstrating the bug 0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 bind(3, ..., ...) = 0 +0 listen(3, 1) = 0 +0 < S 0:0(0) win 32792 +0 > S. 0:0(0) ack 1 +0.100 < . 1:1(0) ack 1 win 1024 +0 accept(3, ..., ...) = 4 +0.01 < .
1001:2001(1000) ack 1 win 1024 +0 > . 1:1(0) ack 1 +0.01 < . 1001:3001(2000) ack 1 win 1024 +0 > . 1:1(0) ack 1 Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue") Signed-off-by: Eric Dumazet Reported-by: Yuchung Cheng Cc: Yaogong Wang Signed-off-by: David S. Miller Signed-off-by: Mao Wenan Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7832d0d107cd..4cc0a53652ea 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4465,7 +4465,7 @@ coalesce_done: NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); __kfree_skb(skb1); - goto add_sack; + goto merge_right; } } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { goto coalesce_done; @@ -4477,6 +4477,7 @@ coalesce_done: rb_link_node(&skb->rbnode, parent, p); rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); +merge_right: /* Remove other segments covered by skb. */ while ((q = rb_next(&skb->rbnode)) != NULL) { skb1 = rb_entry(q, struct sk_buff, rbnode); From 352b66932a23fb11f0a7c316361220648bca3c79 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Sep 2018 16:24:08 +0800 Subject: [PATCH 22/27] tcp: free batches of packets in tcp_prune_ofo_queue() [ Upstream commit 72cd43ba64fc172a443410ce01645895850844c8 ] Juha-Matti Tilli reported that malicious peers could inject tiny packets in out_of_order_queue, forcing very expensive calls to tcp_collapse_ofo_queue() and tcp_prune_ofo_queue() for every incoming packet. out_of_order_queue rb-tree can contain thousands of nodes, iterating over all of them is not nice. Before linux-4.9, we would have pruned all packets in ofo_queue in one go, every XXXX packets. XXXX depends on sk_rcvbuf and skbs truesize, but is about 7000 packets with tcp_rmem[2] default of 6 MB. Since we plan to increase tcp_rmem[2] in the future to cope with modern BDP, can not revert to the old behavior, without great pain. Strategy taken in this patch is to purge ~12.5 % of the queue capacity. Fixes: 36a6503fedda ("tcp: refine tcp_prune_ofo_queue() to not drop all packets") Signed-off-by: Eric Dumazet Reported-by: Juha-Matti Tilli Acked-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller Signed-off-by: Mao Wenan Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4cc0a53652ea..4739a93425ed 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4899,27 +4899,33 @@ new_range: /* * Purge the out-of-order queue. + * Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks. * Return true if queue was pruned. 
*/ static bool tcp_prune_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct rb_node *node, *prev; + int goal; if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) return false; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); - + goal = sk->sk_rcvbuf >> 3; node = &tp->ooo_last_skb->rbnode; do { prev = rb_prev(node); rb_erase(node, &tp->out_of_order_queue); + goal -= rb_to_skb(node)->truesize; __kfree_skb(rb_to_skb(node)); - sk_mem_reclaim(sk); - if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && - !tcp_under_memory_pressure(sk)) - break; + if (!prev || goal <= 0) { + sk_mem_reclaim(sk); + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && + !tcp_under_memory_pressure(sk)) + break; + goal = sk->sk_rcvbuf >> 3; + } node = prev; } while (node); From be288481479ca8c1beba02a7e779ffeaa11f1597 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Sep 2018 16:24:09 +0800 Subject: [PATCH 23/27] tcp: call tcp_drop() from tcp_data_queue_ofo() [ Upstream commit 8541b21e781a22dce52a74fef0b9bed00404a1cd ] In order to be able to give better diagnostics and detect malicious traffic, we need to have better sk->sk_drops tracking. Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue") Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. Miller Signed-off-by: Mao Wenan Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4739a93425ed..cbe0ca03bde8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4445,7 +4445,7 @@ coalesce_done: /* All the bits are present. Drop. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - __kfree_skb(skb); + tcp_drop(sk, skb); skb = NULL; tcp_dsack_set(sk, seq, end_seq); goto add_sack; @@ -4464,7 +4464,7 @@ coalesce_done: TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - __kfree_skb(skb1); + tcp_drop(sk, skb1); goto merge_right; } } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { From eee1af4e268e10fecb76bce42a8d7343aeb2a5e6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Sep 2018 16:24:10 +0800 Subject: [PATCH 24/27] tcp: add tcp_ooo_try_coalesce() helper [ Upstream commit 58152ecbbcc6a0ce7fddd5bf5f6ee535834ece0c ] In case skb in out_of_order_queue is the result of multiple skbs coalescing, we would like to get a proper gso_segs counter tracking, so that future tcp_drop() can report an accurate number. I chose to not implement this tracking for skbs in receive queue, since they are not dropped, unless socket is disconnected. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S.
Miller Signed-off-by: Mao Wenan Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cbe0ca03bde8..1aff93d76f24 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4296,6 +4296,23 @@ static bool tcp_try_coalesce(struct sock *sk, return true; } +static bool tcp_ooo_try_coalesce(struct sock *sk, + struct sk_buff *to, + struct sk_buff *from, + bool *fragstolen) +{ + bool res = tcp_try_coalesce(sk, to, from, fragstolen); + + /* In case tcp_drop() is called later, update to->gso_segs */ + if (res) { + u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) + + max_t(u16, 1, skb_shinfo(from)->gso_segs); + + skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF); + } + return res; +} + static void tcp_drop(struct sock *sk, struct sk_buff *skb) { sk_drops_add(sk, skb); @@ -4422,7 +4439,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) /* In the typical case, we are adding an skb to the end of the list. * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. */ - if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { + if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb, + skb, &fragstolen)) { coalesce_done: tcp_grow_window(sk, skb); kfree_skb_partial(skb, fragstolen); @@ -4467,7 +4485,8 @@ coalesce_done: tcp_drop(sk, skb1); goto merge_right; } - } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { + } else if (tcp_ooo_try_coalesce(sk, skb1, + skb, &fragstolen)) { goto coalesce_done; } p = &parent->rb_right; From 367222df917632b354d193cc861ba0d7e622654f Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Mon, 18 Jun 2018 17:00:39 +0300 Subject: [PATCH 25/27] ath10k: fix scan crash due to incorrect length calculation commit c8291988806407e02a01b4b15b4504eafbcc04e0 upstream. Length of WMI scan message was not calculated correctly. The allocated buffer was smaller than what we expected. So WMI message corrupted skb_info, which is at the end of skb->data. This fix takes TLV header into account even if the element is zero-length. 
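(Editorial illustration, not part of the patch; all identifiers below — tlv, cmd, chan_len, ssid_len, bssid_len, ie_len, ath10k_wmi_alloc_skb — are taken from the hunk that follows.) The command encoder emits a TLV header for each of the four optional sections, so the allocation must count sizeof(*tlv) per section even when that section's payload length is zero:

	len = sizeof(*tlv) + sizeof(*cmd) +
	      sizeof(*tlv) + chan_len +   /* header even if chan_len  == 0 */
	      sizeof(*tlv) + ssid_len +   /* header even if ssid_len  == 0 */
	      sizeof(*tlv) + bssid_len +  /* header even if bssid_len == 0 */
	      sizeof(*tlv) + ie_len;      /* header even if ie_len    == 0 */
	skb = ath10k_wmi_alloc_skb(ar, len);

The previous expression dropped both the header and the payload for empty sections, so the encoder wrote past the undersized buffer and corrupted the skb_info at the end of skb->data, producing the crash below.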
Crash log: [49.629986] Unhandled kernel unaligned access[#1]: [49.634932] CPU: 0 PID: 1176 Comm: logd Not tainted 4.4.60 #180 [49.641040] task: 83051460 ti: 8329c000 task.ti: 8329c000 [49.646608] $ 0 : 00000000 00000001 80984a80 00000000 [49.652038] $ 4 : 45259e89 8046d484 8046df30 8024ba70 [49.657468] $ 8 : 00000000 804cc4c0 00000001 20306320 [49.662898] $12 : 33322037 000110f2 00000000 31203930 [49.668327] $16 : 82792b40 80984a80 00000001 804207fc [49.673757] $20 : 00000000 0000012c 00000040 80470000 [49.679186] $24 : 00000000 8024af7c [49.684617] $28 : 8329c000 8329db88 00000001 802c58d0 [49.690046] Hi : 00000000 [49.693022] Lo : 453c0000 [49.696013] epc : 800efae4 put_page+0x0/0x58 [49.700615] ra : 802c58d0 skb_release_data+0x148/0x1d4 [49.706184] Status: 1000fc03 KERNEL EXL IE [49.710531] Cause : 00800010 (ExcCode 04) [49.714669] BadVA : 45259e89 [49.717644] PrId : 00019374 (MIPS 24Kc) Signed-off-by: Zhi Chen Signed-off-by: Kalle Valo Cc: Brian Norris Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/ath/ath10k/wmi-tlv.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.c b/drivers/net/wireless/ath/ath10k/wmi-tlv.c index c72eb4464de9..c27fff39ddae 100644 --- a/drivers/net/wireless/ath/ath10k/wmi-tlv.c +++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.c @@ -1459,10 +1459,10 @@ ath10k_wmi_tlv_op_gen_start_scan(struct ath10k *ar, bssid_len = arg->n_bssids * sizeof(struct wmi_mac_addr); ie_len = roundup(arg->ie_len, 4); len = (sizeof(*tlv) + sizeof(*cmd)) + - (arg->n_channels ? sizeof(*tlv) + chan_len : 0) + - (arg->n_ssids ? sizeof(*tlv) + ssid_len : 0) + - (arg->n_bssids ? sizeof(*tlv) + bssid_len : 0) + - (arg->ie_len ? sizeof(*tlv) + ie_len : 0); + sizeof(*tlv) + chan_len + + sizeof(*tlv) + ssid_len + + sizeof(*tlv) + bssid_len + + sizeof(*tlv) + ie_len; skb = ath10k_wmi_alloc_skb(ar, len); if (!skb) From 3a07d58f20c8e0efd9e5d43ee929c3f5c5f35874 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Tue, 16 May 2017 09:30:18 +0800 Subject: [PATCH 26/27] ebtables: arpreply: Add the standard target sanity check commit c953d63548207a085abcb12a15fefc8a11ffdf0a upstream. The info->target comes from userspace and it would be used directly. So we need to add the sanity check to make sure it is a valid standard target, although the ebtables tool has already checked it. Kernel needs to validate anything coming from userspace. If the target is set as an evil value, it would break the ebtables and cause a panic. Because the non-standard target is treated as one offset. Now add one helper function ebt_invalid_target, and we would replace the macro INVALID_TARGET later. 
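(Editorial usage sketch, not part of the patch: ebt_foo and struct ebt_foo_info are hypothetical names; only ebt_invalid_target() and struct xt_tgchk_param come from the kernel tree.) Any ebtables target module whose userspace-supplied info embeds a verdict can call the new helper from its checkentry hook, just as arpreply does in the hunk below, instead of open-coding the INVALID_TARGET macro:

	static int ebt_foo_tg_check(const struct xt_tgchk_param *par)
	{
		const struct ebt_foo_info *info = par->targinfo;

		/* reject verdicts outside the valid range -NUM_STANDARD_TARGETS..-1 */
		if (ebt_invalid_target(info->target))
			return -EINVAL;

		return 0;
	}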
Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso Cc: Loic Signed-off-by: Greg Kroah-Hartman --- include/linux/netfilter_bridge/ebtables.h | 5 +++++ net/bridge/netfilter/ebt_arpreply.c | 3 +++ 2 files changed, 8 insertions(+) diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 2ea517c7c6b9..bffd096fae3b 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -125,4 +125,9 @@ extern unsigned int ebt_do_table(struct sk_buff *skb, /* True if the target is not a standard target */ #define INVALID_TARGET (info->target < -NUM_STANDARD_TARGETS || info->target >= 0) +static inline bool ebt_invalid_target(int target) +{ + return (target < -NUM_STANDARD_TARGETS || target >= 0); +} + #endif diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c index 070cf134a22f..f2660c1b29e4 100644 --- a/net/bridge/netfilter/ebt_arpreply.c +++ b/net/bridge/netfilter/ebt_arpreply.c @@ -67,6 +67,9 @@ static int ebt_arpreply_tg_check(const struct xt_tgchk_param *par) if (e->ethproto != htons(ETH_P_ARP) || e->invflags & EBT_IPROTO) return -EINVAL; + if (ebt_invalid_target(info->target)) + return -EINVAL; + return 0; } From b001adea66f0e0a7803adfbf9128a2d7969daa4e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 13 Oct 2018 09:11:36 +0200 Subject: [PATCH 27/27] Linux 4.4.161 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 607394a56036..57e4ff1a8b96 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 160 +SUBLEVEL = 161 EXTRAVERSION = NAME = Blurry Fish Butt