From e70c132b2dac4fa9287f08f7572907ed99cbb65b Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 23 Jan 2018 14:34:38 -0800 Subject: [PATCH 01/79] ANDROID: xattr: Pass EOPNOTSUPP to permission2 The permission call for xattr operations happens regardless of whether or not the xattr functions are implemented. The xattr functions currently don't have support for permission2. Passing EOPNOTSUPP as the mount point in xattr_permission allows us to return EOPNOTSUPP early in permission2, if the filesystem supports it. Change-Id: I9d07e4cd633cf40af60450ffbff7ac5c1b4e8c2c Signed-off-by: Daniel Rosenberg Bug: 35848445 --- fs/sdcardfs/inode.c | 2 ++ fs/xattr.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 36c36590bcea..3ef34874fd7b 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -625,6 +625,8 @@ static int sdcardfs_permission(struct vfsmount *mnt, struct inode *inode, int ma struct inode tmp; struct sdcardfs_inode_data *top = top_data_get(SDCARDFS_I(inode)); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); if (!top) return -EINVAL; diff --git a/fs/xattr.c b/fs/xattr.c index 76f01bf4b048..a40f49cc04c3 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -70,7 +70,7 @@ xattr_permission(struct inode *inode, const char *name, int mask) return -EPERM; } - return inode_permission(inode, mask); + return inode_permission2(ERR_PTR(-EOPNOTSUPP), inode, mask); } /** From 202e079275c622e56d67021b84ec792426692445 Mon Sep 17 00:00:00 2001 From: Greg KH Date: Wed, 8 Mar 2017 19:03:44 +0100 Subject: [PATCH 02/79] UPSTREAM: eventpoll.h: add missing epoll event masks [resend due to me forgetting to cc: linux-api the first time around I posted these back on Feb 23] From: Greg Kroah-Hartman For some reason these values are not in the uapi header file, so any libc has to define it themselves. To prevent them from needing to do this, just have the kernel provide the correct values. Reported-by: Elliott Hughes Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 7e040726850a106587485c21bdacc0bfc8a0cbed) Change-Id: I7b1370668faeeb7597288b29dde5bc6d63c95be6 Signed-off-by: Greg Hackmann --- include/uapi/linux/eventpoll.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h index bc81fb2e1f0e..6f04cb419115 100644 --- a/include/uapi/linux/eventpoll.h +++ b/include/uapi/linux/eventpoll.h @@ -26,6 +26,19 @@ #define EPOLL_CTL_DEL 2 #define EPOLL_CTL_MOD 3 +/* Epoll event masks */ +#define EPOLLIN 0x00000001 +#define EPOLLPRI 0x00000002 +#define EPOLLOUT 0x00000004 +#define EPOLLERR 0x00000008 +#define EPOLLHUP 0x00000010 +#define EPOLLRDNORM 0x00000040 +#define EPOLLRDBAND 0x00000080 +#define EPOLLWRNORM 0x00000100 +#define EPOLLWRBAND 0x00000200 +#define EPOLLMSG 0x00000400 +#define EPOLLRDHUP 0x00002000 + /* * Request the handling of system wakeup events so as to prevent system suspends * from happening while those events are being processed. From 962d1f3fe2f44b79f2fb45b82171781a5f98c7ae Mon Sep 17 00:00:00 2001 From: Ke Wang Date: Thu, 11 Jan 2018 11:16:32 +0800 Subject: [PATCH 03/79] ANDROID: sched: EAS: check energy_aware() before calling select_energy_cpu_brute() in up-migrate path In up-migrate path, select_energy_cpu_brute() was called directly without checking energy_aware(). This will make select_energy_cpu_brute() always worked even disabling energy_aware() on the asymmetric cpu capacity system. Signed-off-by: Ke Wang --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ca6ac458016b..06e77d60a510 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9920,7 +9920,7 @@ void check_for_migration(struct rq *rq, struct task_struct *p) int active_balance; int cpu = task_cpu(p); - if (rq->misfit_task) { + if (energy_aware() && rq->misfit_task) { if (rq->curr->state != TASK_RUNNING || rq->curr->nr_cpus_allowed == 1) return; From 8bb0c6a1b3b2fe92707ee6f6901bfdfc392f8cd1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 9 Dec 2016 10:24:05 -0800 Subject: [PATCH 04/79] x86/asm/32: Make sync_core() handle missing CPUID on all 32-bit kernels commit 1c52d859cb2d417e7216d3e56bb7fea88444cec9 upstream. We support various non-Intel CPUs that don't have the CPUID instruction, so the M486 test was wrong. For now, fix it with a big hammer: handle missing CPUID on all 32-bit CPUs. Reported-by: One Thousand Gnomes Signed-off-by: Andy Lutomirski Cc: Juergen Gross Cc: Peter Zijlstra Cc: Brian Gerst Cc: Matthew Whitehead Cc: Borislav Petkov Cc: Henrique de Moraes Holschuh Cc: Andrew Cooper Cc: Boris Ostrovsky Cc: xen-devel Link: http://lkml.kernel.org/r/685bd083a7c036f7769510b6846315b17d6ba71f.1481307769.git.luto@kernel.org Signed-off-by: Thomas Gleixner Cc: "Zhang, Ning A" Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c124d6ab4bf9..86bccb4bd4dc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -574,7 +574,7 @@ static inline void sync_core(void) { int tmp; -#ifdef CONFIG_M486 +#ifdef CONFIG_X86_32 /* * Do a CPUID if available, otherwise do a jump. The jump * can conveniently enough be the jump around CPUID. From 28f467e0bdda754aa36390fd90b01823f0d3b18d Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 7 Dec 2017 14:16:49 -0700 Subject: [PATCH 05/79] usbip: prevent vhci_hcd driver from leaking a socket pointer address commit 2f2d0088eb93db5c649d2a5e34a3800a8a935fc5 upstream. When a client has a USB device attached over IP, the vhci_hcd driver is locally leaking a socket pointer address via the /sys/devices/platform/vhci_hcd/status file (world-readable) and in debug output when "usbip --debug port" is run. Fix it to not leak. The socket pointer address is not used at the moment and it was made visible as a convenient way to find IP address from socket pointer address by looking up /proc/net/{tcp,tcp6}. As this opens a security hole, the fix replaces socket pointer address with sockfd. Reported-by: Secunia Research Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/usbip_common.h | 1 + drivers/usb/usbip/vhci_sysfs.c | 25 +++++++++++++++---------- tools/usb/usbip/libsrc/vhci_driver.c | 8 ++++---- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/drivers/usb/usbip/usbip_common.h b/drivers/usb/usbip/usbip_common.h index 86b08475c254..f875ccaa55f9 100644 --- a/drivers/usb/usbip/usbip_common.h +++ b/drivers/usb/usbip/usbip_common.h @@ -261,6 +261,7 @@ struct usbip_device { /* lock for status */ spinlock_t lock; + int sockfd; struct socket *tcp_socket; struct task_struct *tcp_rx; diff --git a/drivers/usb/usbip/vhci_sysfs.c b/drivers/usb/usbip/vhci_sysfs.c index 211f43f67ea2..84c21c4ccf46 100644 --- a/drivers/usb/usbip/vhci_sysfs.c +++ b/drivers/usb/usbip/vhci_sysfs.c @@ -39,16 +39,20 @@ static ssize_t status_show(struct device *dev, struct device_attribute *attr, /* * output example: - * prt sta spd dev socket local_busid - * 000 004 000 000 c5a7bb80 1-2.3 - * 001 004 000 000 d8cee980 2-3.4 + * port sta spd dev sockfd local_busid + * 0000 004 000 00000000 000003 1-2.3 + * 0001 004 000 00000000 000004 2-3.4 * - * IP address can be retrieved from a socket pointer address by looking - * up /proc/net/{tcp,tcp6}. Also, a userland program may remember a - * port number and its peer IP address. + * Output includes socket fd instead of socket pointer address to + * avoid leaking kernel memory address in: + * /sys/devices/platform/vhci_hcd.0/status and in debug output. + * The socket pointer address is not used at the moment and it was + * made visible as a convenient way to find IP address from socket + * pointer address by looking up /proc/net/{tcp,tcp6}. As this opens + * a security hole, the change is made to use sockfd instead. */ out += sprintf(out, - "prt sta spd bus dev socket local_busid\n"); + "prt sta spd bus dev sockfd local_busid\n"); for (i = 0; i < VHCI_NPORTS; i++) { struct vhci_device *vdev = port_to_vdev(i); @@ -60,11 +64,11 @@ static ssize_t status_show(struct device *dev, struct device_attribute *attr, out += sprintf(out, "%03u %08x ", vdev->speed, vdev->devid); out += sprintf(out, "%16p ", vdev->ud.tcp_socket); + out += sprintf(out, "%06u", vdev->ud.sockfd); out += sprintf(out, "%s", dev_name(&vdev->udev->dev)); - } else { - out += sprintf(out, "000 000 000 0000000000000000 0-0"); - } + } else + out += sprintf(out, "000 000 000 000000 0-0"); out += sprintf(out, "\n"); spin_unlock(&vdev->ud.lock); @@ -223,6 +227,7 @@ static ssize_t store_attach(struct device *dev, struct device_attribute *attr, vdev->devid = devid; vdev->speed = speed; + vdev->ud.sockfd = sockfd; vdev->ud.tcp_socket = socket; vdev->ud.status = VDEV_ST_NOTASSIGNED; diff --git a/tools/usb/usbip/libsrc/vhci_driver.c b/tools/usb/usbip/libsrc/vhci_driver.c index ad9204773533..1274f326242c 100644 --- a/tools/usb/usbip/libsrc/vhci_driver.c +++ b/tools/usb/usbip/libsrc/vhci_driver.c @@ -55,12 +55,12 @@ static int parse_status(const char *value) while (*c != '\0') { int port, status, speed, devid; - unsigned long socket; + int sockfd; char lbusid[SYSFS_BUS_ID_SIZE]; - ret = sscanf(c, "%d %d %d %x %lx %31s\n", + ret = sscanf(c, "%d %d %d %x %u %31s\n", &port, &status, &speed, - &devid, &socket, lbusid); + &devid, &sockfd, lbusid); if (ret < 5) { dbg("sscanf failed: %d", ret); @@ -69,7 +69,7 @@ static int parse_status(const char *value) dbg("port %d status %d speed %d devid %x", port, status, speed, devid); - dbg("socket %lx lbusid %s", socket, lbusid); + dbg("sockfd %u lbusid %s", sockfd, lbusid); /* if a device is connected, look at it */ From 4493f49816acc5531ed31ffc600073330393cc61 Mon Sep 17 00:00:00 2001 From: Jonathan Dieter Date: Mon, 27 Feb 2017 10:31:04 +0200 Subject: [PATCH 06/79] usbip: Fix implicit fallthrough warning commit cfd6ed4537a9e938fa76facecd4b9cd65b6d1563 upstream. GCC 7 now warns when switch statements fall through implicitly, and with -Werror enabled in configure.ac, that makes these tools unbuildable. We fix this by notifying the compiler that this particular case statement is meant to fall through. Reviewed-by: Peter Senna Tschudin Signed-off-by: Jonathan Dieter Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- tools/usb/usbip/src/usbip.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/usb/usbip/src/usbip.c b/tools/usb/usbip/src/usbip.c index d7599d943529..73d8eee8130b 100644 --- a/tools/usb/usbip/src/usbip.c +++ b/tools/usb/usbip/src/usbip.c @@ -176,6 +176,8 @@ int main(int argc, char *argv[]) break; case '?': printf("usbip: invalid option\n"); + /* Terminate after printing error */ + /* FALLTHRU */ default: usbip_usage(); goto out; From 579a9885cfe40645bc62207f318d172c4cad7942 Mon Sep 17 00:00:00 2001 From: Jonathan Dieter Date: Mon, 27 Feb 2017 10:31:03 +0200 Subject: [PATCH 07/79] usbip: Fix potential format overflow in userspace tools commit e5dfa3f902b9a642ae8c6997d57d7c41e384a90b upstream. The usbip userspace tools call sprintf()/snprintf() and don't check for the return value which can lead the paths to overflow, truncating the final file in the path. More urgently, GCC 7 now warns that these aren't checked with -Wformat-overflow, and with -Werror enabled in configure.ac, that makes these tools unbuildable. This patch fixes these problems by replacing sprintf() with snprintf() in one place and adding checks for the return value of snprintf(). Reviewed-by: Peter Senna Tschudin Signed-off-by: Jonathan Dieter Acked-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- tools/usb/usbip/libsrc/usbip_common.c | 9 +++++++- tools/usb/usbip/libsrc/usbip_host_driver.c | 27 ++++++++++++++++++---- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/tools/usb/usbip/libsrc/usbip_common.c b/tools/usb/usbip/libsrc/usbip_common.c index ac73710473de..8000445ff884 100644 --- a/tools/usb/usbip/libsrc/usbip_common.c +++ b/tools/usb/usbip/libsrc/usbip_common.c @@ -215,9 +215,16 @@ int read_usb_interface(struct usbip_usb_device *udev, int i, struct usbip_usb_interface *uinf) { char busid[SYSFS_BUS_ID_SIZE]; + int size; struct udev_device *sif; - sprintf(busid, "%s:%d.%d", udev->busid, udev->bConfigurationValue, i); + size = snprintf(busid, sizeof(busid), "%s:%d.%d", + udev->busid, udev->bConfigurationValue, i); + if (size < 0 || (unsigned int)size >= sizeof(busid)) { + err("busid length %i >= %lu or < 0", size, + (unsigned long)sizeof(busid)); + return -1; + } sif = udev_device_new_from_subsystem_sysname(udev_context, "usb", busid); if (!sif) { diff --git a/tools/usb/usbip/libsrc/usbip_host_driver.c b/tools/usb/usbip/libsrc/usbip_host_driver.c index bef08d5c44e8..071b9ce99420 100644 --- a/tools/usb/usbip/libsrc/usbip_host_driver.c +++ b/tools/usb/usbip/libsrc/usbip_host_driver.c @@ -39,13 +39,19 @@ struct udev *udev_context; static int32_t read_attr_usbip_status(struct usbip_usb_device *udev) { char status_attr_path[SYSFS_PATH_MAX]; + int size; int fd; int length; char status; int value = 0; - snprintf(status_attr_path, SYSFS_PATH_MAX, "%s/usbip_status", - udev->path); + size = snprintf(status_attr_path, SYSFS_PATH_MAX, "%s/usbip_status", + udev->path); + if (size < 0 || (unsigned int)size >= sizeof(status_attr_path)) { + err("usbip_status path length %i >= %lu or < 0", size, + (unsigned long)sizeof(status_attr_path)); + return -1; + } fd = open(status_attr_path, O_RDONLY); if (fd < 0) { @@ -225,6 +231,7 @@ int usbip_host_export_device(struct usbip_exported_device *edev, int sockfd) { char attr_name[] = "usbip_sockfd"; char sockfd_attr_path[SYSFS_PATH_MAX]; + int size; char sockfd_buff[30]; int ret; @@ -244,10 +251,20 @@ int usbip_host_export_device(struct usbip_exported_device *edev, int sockfd) } /* only the first interface is true */ - snprintf(sockfd_attr_path, sizeof(sockfd_attr_path), "%s/%s", - edev->udev.path, attr_name); + size = snprintf(sockfd_attr_path, sizeof(sockfd_attr_path), "%s/%s", + edev->udev.path, attr_name); + if (size < 0 || (unsigned int)size >= sizeof(sockfd_attr_path)) { + err("exported device path length %i >= %lu or < 0", size, + (unsigned long)sizeof(sockfd_attr_path)); + return -1; + } - snprintf(sockfd_buff, sizeof(sockfd_buff), "%d\n", sockfd); + size = snprintf(sockfd_buff, sizeof(sockfd_buff), "%d\n", sockfd); + if (size < 0 || (unsigned int)size >= sizeof(sockfd_buff)) { + err("socket length %i >= %lu or < 0", size, + (unsigned long)sizeof(sockfd_buff)); + return -1; + } ret = write_sysfs_attribute(sockfd_attr_path, sockfd_buff, strlen(sockfd_buff)); From 20e3fa5dd5818b425b18528571e133034833df27 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 24 Jan 2018 02:31:19 +0000 Subject: [PATCH 08/79] x86/microcode/intel: Fix BDW late-loading revision check The backport of commit b94b73733171 ("x86/microcode/intel: Extend BDW late-loading with a revision check") to 4.4-stable deleted a "return true" statement. This bug is not present upstream or other stable branches. Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/microcode/intel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index b428a8174be1..ee011bd7934d 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -1005,6 +1005,7 @@ static bool is_blacklisted(unsigned int cpu) c->microcode < 0x0b000021) { pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); + return true; } return false; From 0dfd5fbcae5d40e5e58bc7c34305498e965bf1ef Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Jun 2016 17:19:27 -0700 Subject: [PATCH 09/79] x86/cpu/intel: Introduce macros for Intel family numbers commit 970442c599b22ccd644ebfe94d1d303bf6f87c05 upstream. Problem: We have a boatload of open-coded family-6 model numbers. Half of them have these model numbers in hex and the other half in decimal. This makes grepping for them tons of fun, if you were to try. Solution: Consolidate all the magic numbers. Put all the definitions in one header. The names here are closely derived from the comments describing the models from arch/x86/events/intel/core.c. We could easily make them shorter by doing things like s/SANDYBRIDGE/SNB/, but they seemed fine even with the longer versions to me. Do not take any of these names too literally, like "DESKTOP" or "MOBILE". These are all colloquial names and not precise descriptions of everywhere a given model will show up. Signed-off-by: Dave Hansen Cc: Adrian Hunter Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Darren Hart Cc: Dave Hansen Cc: Denys Vlasenko Cc: Doug Thompson Cc: Eduardo Valentin Cc: H. Peter Anvin Cc: Jacob Pan Cc: Kan Liang Cc: Len Brown Cc: Linus Torvalds Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Rajneesh Bhardwaj Cc: Souvik Kumar Chakravarty Cc: Srinivas Pandruvada Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Tony Luck Cc: Ulf Hansson Cc: Viresh Kumar Cc: Vishwanath Somayaji Cc: Zhang Rui Cc: jacob.jun.pan@intel.com Cc: linux-acpi@vger.kernel.org Cc: linux-edac@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: platform-driver-x86@vger.kernel.org Link: http://lkml.kernel.org/r/20160603001927.F2A7D828@viggo.jf.intel.com Signed-off-by: Ingo Molnar Cc: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/intel-family.h | 68 +++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 arch/x86/include/asm/intel-family.h diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h new file mode 100644 index 000000000000..6999f7d01a0d --- /dev/null +++ b/arch/x86/include/asm/intel-family.h @@ -0,0 +1,68 @@ +#ifndef _ASM_X86_INTEL_FAMILY_H +#define _ASM_X86_INTEL_FAMILY_H + +/* + * "Big Core" Processors (Branded as Core, Xeon, etc...) + * + * The "_X" parts are generally the EP and EX Xeons, or the + * "Extreme" ones, like Broadwell-E. + * + * Things ending in "2" are usually because we have no better + * name for them. There's no processor called "WESTMERE2". + */ + +#define INTEL_FAM6_CORE_YONAH 0x0E +#define INTEL_FAM6_CORE2_MEROM 0x0F +#define INTEL_FAM6_CORE2_MEROM_L 0x16 +#define INTEL_FAM6_CORE2_PENRYN 0x17 +#define INTEL_FAM6_CORE2_DUNNINGTON 0x1D + +#define INTEL_FAM6_NEHALEM 0x1E +#define INTEL_FAM6_NEHALEM_EP 0x1A +#define INTEL_FAM6_NEHALEM_EX 0x2E +#define INTEL_FAM6_WESTMERE 0x25 +#define INTEL_FAM6_WESTMERE2 0x1F +#define INTEL_FAM6_WESTMERE_EP 0x2C +#define INTEL_FAM6_WESTMERE_EX 0x2F + +#define INTEL_FAM6_SANDYBRIDGE 0x2A +#define INTEL_FAM6_SANDYBRIDGE_X 0x2D +#define INTEL_FAM6_IVYBRIDGE 0x3A +#define INTEL_FAM6_IVYBRIDGE_X 0x3E + +#define INTEL_FAM6_HASWELL_CORE 0x3C +#define INTEL_FAM6_HASWELL_X 0x3F +#define INTEL_FAM6_HASWELL_ULT 0x45 +#define INTEL_FAM6_HASWELL_GT3E 0x46 + +#define INTEL_FAM6_BROADWELL_CORE 0x3D +#define INTEL_FAM6_BROADWELL_XEON_D 0x56 +#define INTEL_FAM6_BROADWELL_GT3E 0x47 +#define INTEL_FAM6_BROADWELL_X 0x4F + +#define INTEL_FAM6_SKYLAKE_MOBILE 0x4E +#define INTEL_FAM6_SKYLAKE_DESKTOP 0x5E +#define INTEL_FAM6_SKYLAKE_X 0x55 +#define INTEL_FAM6_KABYLAKE_MOBILE 0x8E +#define INTEL_FAM6_KABYLAKE_DESKTOP 0x9E + +/* "Small Core" Processors (Atom) */ + +#define INTEL_FAM6_ATOM_PINEVIEW 0x1C +#define INTEL_FAM6_ATOM_LINCROFT 0x26 +#define INTEL_FAM6_ATOM_PENWELL 0x27 +#define INTEL_FAM6_ATOM_CLOVERVIEW 0x35 +#define INTEL_FAM6_ATOM_CEDARVIEW 0x36 +#define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */ +#define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */ +#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */ +#define INTEL_FAM6_ATOM_MERRIFIELD1 0x4A /* Tangier */ +#define INTEL_FAM6_ATOM_MERRIFIELD2 0x5A /* Annidale */ +#define INTEL_FAM6_ATOM_GOLDMONT 0x5C +#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ + +/* Xeon Phi */ + +#define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ + +#endif /* _ASM_X86_INTEL_FAMILY_H */ From 18bb117d1b7690181346e6365c6237b6ceaac4c4 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 12 Jan 2018 17:49:25 +0000 Subject: [PATCH 10/79] x86/retpoline: Fill RSB on context switch for affected CPUs commit c995efd5a740d9cbafbf58bde4973e8b50b4d761 upstream. On context switch from a shallow call stack to a deeper one, as the CPU does 'ret' up the deeper side it may encounter RSB entries (predictions for where the 'ret' goes to) which were populated in userspace. This is problematic if neither SMEP nor KPTI (the latter of which marks userspace pages as NX for the kernel) are active, as malicious code in userspace may then be executed speculatively. Overwrite the CPU's return prediction stack with calls which are predicted to return to an infinite loop, to "capture" speculation if this happens. This is required both for retpoline, and also in conjunction with IBRS for !SMEP && !KPTI. On Skylake+ the problem is slightly different, and an *underflow* of the RSB may cause errant branch predictions to occur. So there it's not so much overwrite, as *filling* the RSB to attempt to prevent it getting empty. This is only a partial solution for Skylake+ since there are many other conditions which may result in the RSB becoming empty. The full solution on Skylake+ is to use IBRS, which will prevent the problem even when the RSB becomes empty. With IBRS, the RSB-stuffing will not be required on context switch. [ tglx: Added missing vendor check and slighty massaged comments and changelog ] [js] backport to 4.4 -- __switch_to_asm does not exist there, we have to patch the switch_to macros for both x86_32 and x86_64. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515779365-9032-1-git-send-email-dwmw@amazon.co.uk Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/include/asm/switch_to.h | 38 +++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/bugs.c | 36 +++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 0fbc98568018..641f0f2c2982 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -199,6 +199,7 @@ #define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */ #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ #define X86_FEATURE_RETPOLINE ( 7*32+29) /* Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_AMD ( 7*32+30) /* AMD Retpoline mitigation for Spectre variant 2 */ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 751bf4b7bf11..025ecfaba9c9 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_SWITCH_TO_H #define _ASM_X86_SWITCH_TO_H +#include + struct task_struct; /* one of the stranger aspects of C forward declarations */ __visible struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next); @@ -24,6 +26,23 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, #define __switch_canary_iparam #endif /* CC_STACKPROTECTOR */ +#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated + * with userspace addresses. On CPUs where those concerns + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ +#define __retpoline_fill_return_buffer \ + ALTERNATIVE("jmp 910f", \ + __stringify(__FILL_RETURN_BUFFER(%%ebx, RSB_CLEAR_LOOPS, %%esp)),\ + X86_FEATURE_RSB_CTXSW) \ + "910:\n\t" +#else +#define __retpoline_fill_return_buffer +#endif + /* * Saving eflags is important. It switches not only IOPL between tasks, * it also protects other tasks from NT leaking through sysenter etc. @@ -46,6 +65,7 @@ do { \ "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ "pushl %[next_ip]\n\t" /* restore EIP */ \ __switch_canary \ + __retpoline_fill_return_buffer \ "jmp __switch_to\n" /* regparm call */ \ "1:\t" \ "popl %%ebp\n\t" /* restore EBP */ \ @@ -100,6 +120,23 @@ do { \ #define __switch_canary_iparam #endif /* CC_STACKPROTECTOR */ +#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated + * with userspace addresses. On CPUs where those concerns + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ +#define __retpoline_fill_return_buffer \ + ALTERNATIVE("jmp 910f", \ + __stringify(__FILL_RETURN_BUFFER(%%r12, RSB_CLEAR_LOOPS, %%rsp)),\ + X86_FEATURE_RSB_CTXSW) \ + "910:\n\t" +#else +#define __retpoline_fill_return_buffer +#endif + /* * There is no need to save or restore flags, because flags are always * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL @@ -112,6 +149,7 @@ do { \ "call __switch_to\n\t" \ "movq "__percpu_arg([current_task])",%%rsi\n\t" \ __switch_canary \ + __retpoline_fill_return_buffer \ "movq %P[thread_info](%%rsi),%%r8\n\t" \ "movq %%rax,%%rdi\n\t" \ "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 49d25ddf0e9f..8cacf62ec458 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -22,6 +22,7 @@ #include #include #include +#include static void __init spectre_v2_select_mitigation(void); @@ -154,6 +155,23 @@ disable: return SPECTRE_V2_CMD_NONE; } +/* Check for Skylake-like CPUs (for RSB handling) */ +static bool __init is_skylake_era(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6) { + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_SKYLAKE_MOBILE: + case INTEL_FAM6_SKYLAKE_DESKTOP: + case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_KABYLAKE_MOBILE: + case INTEL_FAM6_KABYLAKE_DESKTOP: + return true; + } + } + return false; +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -212,6 +230,24 @@ retpoline_auto: spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); + + /* + * If neither SMEP or KPTI are available, there is a risk of + * hitting userspace addresses in the RSB after a context switch + * from a shallow call stack to a deeper one. To prevent this fill + * the entire RSB, even when using IBRS. + * + * Skylake era CPUs have a separate issue with *underflow* of the + * RSB, when they will predict 'ret' targets from the generic BTB. + * The proper mitigation for this is IBRS. If IBRS is not supported + * or deactivated in favour of retpolines the RSB fill on context + * switch is required. + */ + if ((!boot_cpu_has(X86_FEATURE_KAISER) && + !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) { + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); + pr_info("Filling RSB on context switch\n"); + } } #undef pr_fmt From 1d00e3d9b7ed8fbf6729b7c5a8aae9f2711d2655 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Mon, 29 May 2017 16:24:03 +0200 Subject: [PATCH 11/79] sched/deadline: Use the revised wakeup rule for suspending constrained dl tasks commit 3effcb4247e74a51f5d8b775a1ee4abf87cc089a upstream. We have been facing some problems with self-suspending constrained deadline tasks. The main reason is that the original CBS was not designed for such sort of tasks. One problem reported by Xunlei Pang takes place when a task suspends, and then is awakened before the deadline, but so close to the deadline that its remaining runtime can cause the task to have an absolute density higher than allowed. In such situation, the original CBS assumes that the task is facing an early activation, and so it replenishes the task and set another deadline, one deadline in the future. This rule works fine for implicit deadline tasks. Moreover, it allows the system to adapt the period of a task in which the external event source suffered from a clock drift. However, this opens the window for bandwidth leakage for constrained deadline tasks. For instance, a task with the following parameters: runtime = 5 ms deadline = 7 ms [density] = 5 / 7 = 0.71 period = 1000 ms If the task runs for 1 ms, and then suspends for another 1ms, it will be awakened with the following parameters: remaining runtime = 4 laxity = 5 presenting a absolute density of 4 / 5 = 0.80. In this case, the original CBS would assume the task had an early wakeup. Then, CBS will reset the runtime, and the absolute deadline will be postponed by one relative deadline, allowing the task to run. The problem is that, if the task runs this pattern forever, it will keep receiving bandwidth, being able to run 1ms every 2ms. Following this behavior, the task would be able to run 500 ms in 1 sec. Thus running more than the 5 ms / 1 sec the admission control allowed it to run. Trying to address the self-suspending case, Luca Abeni, Giuseppe Lipari, and Juri Lelli [1] revisited the CBS in order to deal with self-suspending tasks. In the new approach, rather than replenishing/postponing the absolute deadline, the revised wakeup rule adjusts the remaining runtime, reducing it to fit into the allowed density. A revised version of the idea is: At a given time t, the maximum absolute density of a task cannot be higher than its relative density, that is: runtime / (deadline - t) <= dl_runtime / dl_deadline Knowing the laxity of a task (deadline - t), it is possible to move it to the other side of the equality, thus enabling to define max remaining runtime a task can use within the absolute deadline, without over-running the allowed density: runtime = (dl_runtime / dl_deadline) * (deadline - t) For instance, in our previous example, the task could still run: runtime = ( 5 / 7 ) * 5 runtime = 3.57 ms Without causing damage for other deadline tasks. It is note worthy that the laxity cannot be negative because that would cause a negative runtime. Thus, this patch depends on the patch: df8eac8cafce ("sched/deadline: Throttle a constrained deadline task activated after the deadline") Which throttles a constrained deadline task activated after the deadline. Finally, it is also possible to use the revised wakeup rule for all other tasks, but that would require some more discussions about pros and cons. [The main difference from the original commit is that the BW_SHIFT define was not present yet. As BW_SHIFT was introduced in a new feature, I just used the value (20), likewise we used to use before the #define. Other changes were required because of comments. - bistrot] Reported-by: Xunlei Pang Signed-off-by: Daniel Bristot de Oliveira [peterz: replaced dl_is_constrained with dl_is_implicit] Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Luca Abeni Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Romulo Silva de Oliveira Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/5c800ab3a74a168a84ee5f3f84d12a02e11383be.1495803804.git.bristot@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Greg Kroah-Hartman --- include/linux/sched.h | 1 + kernel/sched/core.c | 2 + kernel/sched/deadline.c | 98 ++++++++++++++++++++++++++++++++++++----- 3 files changed, 89 insertions(+), 12 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index e887c8d6f395..90bea398e5e0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1313,6 +1313,7 @@ struct sched_dl_entity { u64 dl_deadline; /* relative deadline of each instance */ u64 dl_period; /* separation of two instances (period) */ u64 dl_bw; /* dl_runtime / dl_deadline */ + u64 dl_density; /* dl_runtime / dl_deadline */ /* * Actual scheduling parameters. Initialized with the values above, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9d6b3d869592..e6d1173a2046 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2109,6 +2109,7 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_period = 0; dl_se->flags = 0; dl_se->dl_bw = 0; + dl_se->dl_density = 0; dl_se->dl_throttled = 0; dl_se->dl_new = 1; @@ -3647,6 +3648,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; dl_se->flags = attr->sched_flags; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); + dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); /* * Changing the parameters of a task is 'tricky' and we're not doing diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6be2afd9bfd6..e12b0a4df891 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -480,13 +480,84 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, } /* - * When a -deadline entity is queued back on the runqueue, its runtime and - * deadline might need updating. + * Revised wakeup rule [1]: For self-suspending tasks, rather then + * re-initializing task's runtime and deadline, the revised wakeup + * rule adjusts the task's runtime to avoid the task to overrun its + * density. * - * The policy here is that we update the deadline of the entity only if: - * - the current deadline is in the past, - * - using the remaining runtime with the current deadline would make - * the entity exceed its bandwidth. + * Reasoning: a task may overrun the density if: + * runtime / (deadline - t) > dl_runtime / dl_deadline + * + * Therefore, runtime can be adjusted to: + * runtime = (dl_runtime / dl_deadline) * (deadline - t) + * + * In such way that runtime will be equal to the maximum density + * the task can use without breaking any rule. + * + * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant + * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24. + */ +static void +update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq) +{ + u64 laxity = dl_se->deadline - rq_clock(rq); + + /* + * If the task has deadline < period, and the deadline is in the past, + * it should already be throttled before this check. + * + * See update_dl_entity() comments for further details. + */ + WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq))); + + dl_se->runtime = (dl_se->dl_density * laxity) >> 20; +} + +/* + * Regarding the deadline, a task with implicit deadline has a relative + * deadline == relative period. A task with constrained deadline has a + * relative deadline <= relative period. + * + * We support constrained deadline tasks. However, there are some restrictions + * applied only for tasks which do not have an implicit deadline. See + * update_dl_entity() to know more about such restrictions. + * + * The dl_is_implicit() returns true if the task has an implicit deadline. + */ +static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_deadline == dl_se->dl_period; +} + +/* + * When a deadline entity is placed in the runqueue, its runtime and deadline + * might need to be updated. This is done by a CBS wake up rule. There are two + * different rules: 1) the original CBS; and 2) the Revisited CBS. + * + * When the task is starting a new period, the Original CBS is used. In this + * case, the runtime is replenished and a new absolute deadline is set. + * + * When a task is queued before the begin of the next period, using the + * remaining runtime and deadline could make the entity to overflow, see + * dl_entity_overflow() to find more about runtime overflow. When such case + * is detected, the runtime and deadline need to be updated. + * + * If the task has an implicit deadline, i.e., deadline == period, the Original + * CBS is applied. the runtime is replenished and a new absolute deadline is + * set, as in the previous cases. + * + * However, the Original CBS does not work properly for tasks with + * deadline < period, which are said to have a constrained deadline. By + * applying the Original CBS, a constrained deadline task would be able to run + * runtime/deadline in a period. With deadline < period, the task would + * overrun the runtime/period allowed bandwidth, breaking the admission test. + * + * In order to prevent this misbehave, the Revisited CBS is used for + * constrained deadline tasks when a runtime overflow is detected. In the + * Revisited CBS, rather than replenishing & setting a new absolute deadline, + * the remaining runtime of the task is reduced to avoid runtime overflow. + * Please refer to the comments update_dl_revised_wakeup() function to find + * more about the Revised CBS rule. */ static void update_dl_entity(struct sched_dl_entity *dl_se, struct sched_dl_entity *pi_se) @@ -505,6 +576,14 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + + if (unlikely(!dl_is_implicit(dl_se) && + !dl_time_before(dl_se->deadline, rq_clock(rq)) && + !dl_se->dl_boosted)){ + update_dl_revised_wakeup(dl_se, rq); + return; + } + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } @@ -991,11 +1070,6 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) __dequeue_dl_entity(dl_se); } -static inline bool dl_is_constrained(struct sched_dl_entity *dl_se) -{ - return dl_se->dl_deadline < dl_se->dl_period; -} - static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *pi_task = rt_mutex_get_top_task(p); @@ -1027,7 +1101,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * If that is the case, the task will be throttled and * the replenishment timer will be set to the next period. */ - if (!p->dl.dl_throttled && dl_is_constrained(&p->dl)) + if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl)) dl_check_constrained_dl(&p->dl); /* From e7514af60542830a37cd8f2fc0419fc5e320d70c Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Tue, 16 Jan 2018 19:30:14 +0100 Subject: [PATCH 12/79] can: af_can: can_rcv(): replace WARN_ONCE by pr_warn_once commit 8cb68751c115d176ec851ca56ecfbb411568c9e8 upstream. If an invalid CAN frame is received, from a driver or from a tun interface, a Kernel warning is generated. This patch replaces the WARN_ONCE by a simple pr_warn_once, so that a kernel, bootet with panic_on_warn, does not panic. A printk seems to be more appropriate here. Reported-by: syzbot+4386709c0c1284dca827@syzkaller.appspotmail.com Suggested-by: Dmitry Vyukov Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde Signed-off-by: Oliver Hartkopp Signed-off-by: Greg Kroah-Hartman --- net/can/af_can.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/can/af_can.c b/net/can/af_can.c index 928f58064098..924ad0513af9 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -722,13 +722,12 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev, if (unlikely(!net_eq(dev_net(dev), &init_net))) goto drop; - if (WARN_ONCE(dev->type != ARPHRD_CAN || - skb->len != CAN_MTU || - cfd->len > CAN_MAX_DLEN, - "PF_CAN: dropped non conform CAN skbuf: " - "dev type %d, len %d, datalen %d\n", - dev->type, skb->len, cfd->len)) + if (unlikely(dev->type != ARPHRD_CAN || skb->len != CAN_MTU || + cfd->len > CAN_MAX_DLEN)) { + pr_warn_once("PF_CAN: dropped non conform CAN skbuf: dev type %d, len %d, datalen %d\n", + dev->type, skb->len, cfd->len); goto drop; + } can_receive(skb, dev); return NET_RX_SUCCESS; From 08f39b7bcccb532a6b96fdeaf91ed08e4af7dda4 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Tue, 16 Jan 2018 19:30:14 +0100 Subject: [PATCH 13/79] can: af_can: canfd_rcv(): replace WARN_ONCE by pr_warn_once commit d4689846881d160a4d12a514e991a740bcb5d65a upstream. If an invalid CANFD frame is received, from a driver or from a tun interface, a Kernel warning is generated. This patch replaces the WARN_ONCE by a simple pr_warn_once, so that a kernel, bootet with panic_on_warn, does not panic. A printk seems to be more appropriate here. Reported-by: syzbot+e3b775f40babeff6e68b@syzkaller.appspotmail.com Suggested-by: Dmitry Vyukov Acked-by: Oliver Hartkopp Cc: linux-stable Signed-off-by: Marc Kleine-Budde Signed-off-by: Oliver Hartkopp Signed-off-by: Greg Kroah-Hartman --- net/can/af_can.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/can/af_can.c b/net/can/af_can.c index 924ad0513af9..c866e761651a 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -745,13 +745,12 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, if (unlikely(!net_eq(dev_net(dev), &init_net))) goto drop; - if (WARN_ONCE(dev->type != ARPHRD_CAN || - skb->len != CANFD_MTU || - cfd->len > CANFD_MAX_DLEN, - "PF_CAN: dropped non conform CAN FD skbuf: " - "dev type %d, len %d, datalen %d\n", - dev->type, skb->len, cfd->len)) + if (unlikely(dev->type != ARPHRD_CAN || skb->len != CANFD_MTU || + cfd->len > CANFD_MAX_DLEN)) { + pr_warn_once("PF_CAN: dropped non conform CAN FD skbuf: dev type %d, len %d, datalen %d\n", + dev->type, skb->len, cfd->len); goto drop; + } can_receive(skb, dev); return NET_RX_SUCCESS; From 0cb5ae4db47abc3985016aac6326ef98c533fe4d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Jan 2016 20:08:52 -0600 Subject: [PATCH 14/79] PM / sleep: declare __tracedata symbols as char[] rather than char commit f97238373b8662a6d580e204df2e7bcbfa43e27a upstream. Accessing more than one byte from a symbol declared simply 'char' is undefined behavior, as reported by UBSAN: UBSAN: Undefined behaviour in drivers/base/power/trace.c:178:18 load of address ffffffff8203fc78 with insufficient space for an object of type 'char' Avoid this by declaring the symbols as arrays. Signed-off-by: Eric Biggers Signed-off-by: Rafael J. Wysocki Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- drivers/base/power/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index a311cfa4c5bd..a6975795e7f3 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -166,14 +166,14 @@ void generate_pm_trace(const void *tracedata, unsigned int user) } EXPORT_SYMBOL(generate_pm_trace); -extern char __tracedata_start, __tracedata_end; +extern char __tracedata_start[], __tracedata_end[]; static int show_file_hash(unsigned int value) { int match; char *tracedata; match = 0; - for (tracedata = &__tracedata_start ; tracedata < &__tracedata_end ; + for (tracedata = __tracedata_start ; tracedata < __tracedata_end ; tracedata += 2 + sizeof(unsigned long)) { unsigned short lineno = *(unsigned short *)tracedata; const char *file = *(const char **)(tracedata + 2); From 08b1cf4964d526cf3c2ea9e36ffd752752543ef3 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sat, 13 Aug 2016 01:37:04 +0200 Subject: [PATCH 15/79] time: Avoid undefined behaviour in ktime_add_safe() commit 979515c5645830465739254abc1b1648ada41518 upstream. I ran into this: ================================================================================ UBSAN: Undefined behaviour in kernel/time/hrtimer.c:310:16 signed integer overflow: 9223372036854775807 + 50000 cannot be represented in type 'long long int' CPU: 2 PID: 4798 Comm: trinity-c2 Not tainted 4.8.0-rc1+ #91 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.3-0-ge2fc41e-prebuilt.qemu-project.org 04/01/2014 0000000000000000 ffff88010ce6fb88 ffffffff82344740 0000000041b58ab3 ffffffff84f97a20 ffffffff82344694 ffff88010ce6fbb0 ffff88010ce6fb60 000000000000c350 ffff88010ce6f968 dffffc0000000000 ffffffff857bc320 Call Trace: [] dump_stack+0xac/0xfc [] ? _atomic_dec_and_lock+0xc4/0xc4 [] ubsan_epilogue+0xd/0x8a [] handle_overflow+0x202/0x23d [] ? val_to_string.constprop.6+0x11e/0x11e [] ? timerqueue_add+0x151/0x410 [] ? hrtimer_start_range_ns+0x3b8/0x1380 [] ? memset+0x31/0x40 [] __ubsan_handle_add_overflow+0xe/0x10 [] hrtimer_nanosleep+0x5d9/0x790 [] ? hrtimer_init_sleeper+0x80/0x80 [] ? __might_sleep+0x5b/0x260 [] common_nsleep+0x20/0x30 [] SyS_clock_nanosleep+0x197/0x210 [] ? SyS_clock_getres+0x150/0x150 [] ? __this_cpu_preempt_check+0x13/0x20 [] ? __context_tracking_exit.part.3+0x30/0x1b0 [] ? SyS_clock_getres+0x150/0x150 [] do_syscall_64+0x1b3/0x4b0 [] entry_SYSCALL64_slow_path+0x25/0x25 ================================================================================ Add a new ktime_add_unsafe() helper which doesn't check for overflow, but doesn't throw a UBSAN warning when it does overflow either. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: Vegard Nossum Signed-off-by: John Stultz Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- include/linux/ktime.h | 7 +++++++ kernel/time/hrtimer.c | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 2b6a204bd8d4..3ffc69ebe967 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -63,6 +63,13 @@ static inline ktime_t ktime_set(const s64 secs, const unsigned long nsecs) #define ktime_add(lhs, rhs) \ ({ (ktime_t){ .tv64 = (lhs).tv64 + (rhs).tv64 }; }) +/* + * Same as ktime_add(), but avoids undefined behaviour on overflow; however, + * this means that you must check the result for overflow yourself. + */ +#define ktime_add_unsafe(lhs, rhs) \ + ({ (ktime_t){ .tv64 = (u64) (lhs).tv64 + (rhs).tv64 }; }) + /* * Add a ktime_t variable and a scalar nanosecond value. * res = kt + nsval: diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 17f7bcff1e02..1dc94768b5a3 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -312,7 +312,7 @@ EXPORT_SYMBOL_GPL(__ktime_divns); */ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) { - ktime_t res = ktime_add(lhs, rhs); + ktime_t res = ktime_add_unsafe(lhs, rhs); /* * We use KTIME_SEC_MAX here, the maximum timeout which we can From 839003061aa9362f23866614862b49b57bfdd6ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 24 Oct 2016 11:41:56 +0200 Subject: [PATCH 16/79] timers: Plug locking race vs. timer migration commit b831275a3553c32091222ac619cfddd73a5553fb upstream. Linus noticed that lock_timer_base() lacks a READ_ONCE() for accessing the timer flags. As a consequence the compiler is allowed to reload the flags between the initial check for TIMER_MIGRATION and the following timer base computation and the spin lock of the base. While this has not been observed (yet), we need to make sure that it never happens. Fixes: 0eeda71bc30d ("timer: Replace timer base by a cpu index") Reported-by: Linus Torvalds Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1610241711220.4983@nanos Cc: Andrew Morton Cc: Peter Zijlstra Signed-off-by: Mike Galbraith Signed-off-by: Greg Kroah-Hartman --- kernel/time/timer.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 125407144c01..3d7588a2e97c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -764,8 +764,15 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, __acquires(timer->base->lock) { for (;;) { - u32 tf = timer->flags; struct tvec_base *base; + u32 tf; + + /* + * We need to use READ_ONCE() here, otherwise the compiler + * might re-read @tf between the check for TIMER_MIGRATING + * and spin_lock(). + */ + tf = READ_ONCE(timer->flags); if (!(tf & TIMER_MIGRATING)) { base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); From 29b73cace1b1cc2daf2556b3285d4e79f19bcb3b Mon Sep 17 00:00:00 2001 From: Janakarajan Natarajan Date: Tue, 25 Apr 2017 16:44:03 -0500 Subject: [PATCH 17/79] Prevent timer value 0 for MWAITX commit 88d879d29f9cc0de2d930b584285638cdada6625 upstream. Newer hardware has uncovered a bug in the software implementation of using MWAITX for the delay function. A value of 0 for the timer is meant to indicate that a timeout will not be used to exit MWAITX. On newer hardware this can result in MWAITX never returning, resulting in NMI soft lockup messages being printed. On older hardware, some of the other conditions under which MWAITX can exit masked this issue. The AMD APM does not currently document this and will be updated. Please refer to http://marc.info/?l=kvm&m=148950623231140 for information regarding NMI soft lockup messages on an AMD Ryzen 1800X. This has been root-caused as a 0 passed to MWAITX causing it to wait indefinitely. This change has the added benefit of avoiding the unnecessary setup of MONITORX/MWAITX when the delay value is zero. Signed-off-by: Janakarajan Natarajan Link: http://lkml.kernel.org/r/1493156643-29366-1-git-send-email-Janakarajan.Natarajan@amd.com Signed-off-by: Thomas Gleixner Signed-off-by: Davidlohr Bueso Signed-off-by: Greg Kroah-Hartman --- arch/x86/lib/delay.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index e912b2f6d36e..45772560aceb 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -93,6 +93,13 @@ static void delay_mwaitx(unsigned long __loops) { u64 start, end, delay, loops = __loops; + /* + * Timer value of 0 causes MWAITX to wait indefinitely, unless there + * is a store on the memory monitored by MONITORX. + */ + if (loops == 0) + return; + start = rdtsc_ordered(); for (;;) { From f31a8450c09327339939fe195707749c1d67016d Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Fri, 28 Oct 2016 09:45:28 +0100 Subject: [PATCH 18/79] drivers: base: cacheinfo: fix x86 with CONFIG_OF enabled commit fac51482577d5e05bbb0efa8d602a3c2111098bf upstream. With CONFIG_OF enabled on x86, we get the following error on boot: " Failed to find cpu0 device node Unable to detect cache hierarchy from DT for CPU 0 " and the cacheinfo fails to get populated in the corresponding sysfs entries. This is because cache_setup_of_node looks for of_node for setting up the shared cpu_map without checking that it's already populated in the architecture specific callback. In order to indicate that the shared cpu_map is already populated, this patch introduces a boolean `cpu_map_populated` in struct cpu_cacheinfo that can be used by the generic code to skip cache_shared_cpu_map_setup. This patch also sets that boolean for x86. Cc: Greg Kroah-Hartman Signed-off-by: Sudeep Holla Signed-off-by: Mian Yousaf Kaukab Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/intel_cacheinfo.c | 2 ++ drivers/base/cacheinfo.c | 3 +++ include/linux/cacheinfo.h | 1 + 3 files changed, 6 insertions(+) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index e38d338a6447..b4ca91cf55b0 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -934,6 +934,8 @@ static int __populate_cache_leaves(unsigned int cpu) ci_leaf_init(this_leaf++, &id4_regs); __cache_cpumap_setup(cpu, idx, &id4_regs); } + this_cpu_ci->cpu_map_populated = true; + return 0; } diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c index e9fd32e91668..ecde8957835a 100644 --- a/drivers/base/cacheinfo.c +++ b/drivers/base/cacheinfo.c @@ -106,6 +106,9 @@ static int cache_shared_cpu_map_setup(unsigned int cpu) unsigned int index; int ret; + if (this_cpu_ci->cpu_map_populated) + return 0; + ret = cache_setup_of_node(cpu); if (ret) return ret; diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 2189935075b4..a951fd10aaaa 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -71,6 +71,7 @@ struct cpu_cacheinfo { struct cacheinfo *info_list; unsigned int num_levels; unsigned int num_leaves; + bool cpu_map_populated; }; /* From a91cdfa754905b702a5841b8d2b895599d8fad72 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Fri, 28 Oct 2016 09:45:29 +0100 Subject: [PATCH 19/79] drivers: base: cacheinfo: fix boot error message when acpi is enabled commit 55877ef45fbd7f975d078426866b7d1a2435dcc3 upstream. ARM64 enables both CONFIG_OF and CONFIG_ACPI and the firmware can pass both ACPI tables and the device tree. Based on the kernel parameter, one of the two will be chosen. If acpi is enabled, then device tree is not unflattened. Currently ARM64 platforms report: " Failed to find cpu0 device node Unable to detect cache hierarchy from DT for CPU 0 " which is incorrect when booting with ACPI. Also latest ACPI v6.1 has no support for cache properties/hierarchy. This patch adds check for unflattened device tree and also returns as "not supported" if ACPI is runtime enabled. It also removes the reference to DT from the error message as the cache hierarchy can be detected from the firmware(OF/DT/ACPI) Cc: Greg Kroah-Hartman Signed-off-by: Sudeep Holla Signed-off-by: Mian Yousaf Kaukab Signed-off-by: Greg Kroah-Hartman --- drivers/base/cacheinfo.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c index ecde8957835a..70e13cf06ed0 100644 --- a/drivers/base/cacheinfo.c +++ b/drivers/base/cacheinfo.c @@ -16,6 +16,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ +#include #include #include #include @@ -104,12 +105,16 @@ static int cache_shared_cpu_map_setup(unsigned int cpu) struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); struct cacheinfo *this_leaf, *sib_leaf; unsigned int index; - int ret; + int ret = 0; if (this_cpu_ci->cpu_map_populated) return 0; - ret = cache_setup_of_node(cpu); + if (of_have_populated_dt()) + ret = cache_setup_of_node(cpu); + else if (!acpi_disabled) + /* No cache property/hierarchy support yet in ACPI */ + ret = -ENOTSUPP; if (ret) return ret; @@ -206,8 +211,7 @@ static int detect_cache_attributes(unsigned int cpu) */ ret = cache_shared_cpu_map_setup(cpu); if (ret) { - pr_warn("Unable to detect cache hierarchy from DT for CPU %d\n", - cpu); + pr_warn("Unable to detect cache hierarchy for CPU %d\n", cpu); goto free_ci; } return 0; From 12de6baf0637e2b0067bf5d71c3a7af1ec5b4ff1 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 27 Jan 2016 09:32:05 -0800 Subject: [PATCH 20/79] PCI: layerscape: Add "fsl,ls2085a-pcie" compatible ID commit dbae40b76abef2f8a7e7bf1701f77df9e73def48 upstream. The Layerscape PCI host driver must recognize ls2085a compatible when using firmware with ls2085a compatible property, otherwise the PCI bus won't be detected even though ls2085a compatible is included by the dts. Signed-off-by: Yang Shi Signed-off-by: Bjorn Helgaas Signed-off-by: Matthias Brugger Signed-off-by: Greg Kroah-Hartman --- drivers/pci/host/pci-layerscape.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/host/pci-layerscape.c b/drivers/pci/host/pci-layerscape.c index 3923bed93c7e..c40d8b2ce330 100644 --- a/drivers/pci/host/pci-layerscape.c +++ b/drivers/pci/host/pci-layerscape.c @@ -203,6 +203,7 @@ static const struct of_device_id ls_pcie_of_match[] = { { .compatible = "fsl,ls1021a-pcie", .data = &ls1021_drvdata }, { .compatible = "fsl,ls1043a-pcie", .data = &ls1043_drvdata }, { .compatible = "fsl,ls2080a-pcie", .data = &ls2080_drvdata }, + { .compatible = "fsl,ls2085a-pcie", .data = &ls2080_drvdata }, { }, }; MODULE_DEVICE_TABLE(of, ls_pcie_of_match); From 5ee8fef15d0affce9ff482dd99ff33a0578c2368 Mon Sep 17 00:00:00 2001 From: Minghuan Lian Date: Mon, 29 Feb 2016 17:24:15 -0600 Subject: [PATCH 21/79] PCI: layerscape: Fix MSG TLP drop setting commit 1195c103f6c98d9ff381cac3a8760d4f8a133627 upstream. Some kinds of Layerscape PCIe controllers will forward the received message TLPs to system application address space, which could corrupt system memory or lead to a system hang. Enable MSG_DROP to fix this issue. Signed-off-by: Minghuan Lian Signed-off-by: Bjorn Helgaas Signed-off-by: Matthias Brugger Signed-off-by: Greg Kroah-Hartman --- drivers/pci/host/pci-layerscape.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/pci/host/pci-layerscape.c b/drivers/pci/host/pci-layerscape.c index c40d8b2ce330..a21e229d95e0 100644 --- a/drivers/pci/host/pci-layerscape.c +++ b/drivers/pci/host/pci-layerscape.c @@ -77,6 +77,16 @@ static void ls_pcie_fix_class(struct ls_pcie *pcie) iowrite16(PCI_CLASS_BRIDGE_PCI, pcie->dbi + PCI_CLASS_DEVICE); } +/* Drop MSG TLP except for Vendor MSG */ +static void ls_pcie_drop_msg_tlp(struct ls_pcie *pcie) +{ + u32 val; + + val = ioread32(pcie->dbi + PCIE_STRFMR1); + val &= 0xDFFFFFFF; + iowrite32(val, pcie->dbi + PCIE_STRFMR1); +} + static int ls1021_pcie_link_up(struct pcie_port *pp) { u32 state; @@ -97,7 +107,7 @@ static int ls1021_pcie_link_up(struct pcie_port *pp) static void ls1021_pcie_host_init(struct pcie_port *pp) { struct ls_pcie *pcie = to_ls_pcie(pp); - u32 val, index[2]; + u32 index[2]; pcie->scfg = syscon_regmap_lookup_by_phandle(pp->dev->of_node, "fsl,pcie-scfg"); @@ -116,13 +126,7 @@ static void ls1021_pcie_host_init(struct pcie_port *pp) dw_pcie_setup_rc(pp); - /* - * LS1021A Workaround for internal TKT228622 - * to fix the INTx hang issue - */ - val = ioread32(pcie->dbi + PCIE_STRFMR1); - val &= 0xffff; - iowrite32(val, pcie->dbi + PCIE_STRFMR1); + ls_pcie_drop_msg_tlp(pcie); } static int ls_pcie_link_up(struct pcie_port *pp) @@ -147,6 +151,7 @@ static void ls_pcie_host_init(struct pcie_port *pp) iowrite32(1, pcie->dbi + PCIE_DBI_RO_WR_EN); ls_pcie_fix_class(pcie); ls_pcie_clear_multifunction(pcie); + ls_pcie_drop_msg_tlp(pcie); iowrite32(0, pcie->dbi + PCIE_DBI_RO_WR_EN); } From 0fd49331f9c1fee00ad9ef125faea9d14d7c6a59 Mon Sep 17 00:00:00 2001 From: yangbo lu Date: Wed, 25 Nov 2015 10:05:37 +0800 Subject: [PATCH 22/79] mmc: sdhci-of-esdhc: add/remove some quirks according to vendor version commit 1ef5e49e46b919052474d9b54a15debc79ff0133 upstream. A previous patch had removed esdhc_of_platform_init() by mistake. static void esdhc_of_platform_init(struct sdhci_host *host) { u32 vvn; vvn = in_be32(host->ioaddr + SDHCI_SLOT_INT_STATUS); vvn = (vvn & SDHCI_VENDOR_VER_MASK) >> SDHCI_VENDOR_VER_SHIFT; if (vvn == VENDOR_V_22) host->quirks2 |= SDHCI_QUIRK2_HOST_NO_CMD23; if (vvn > VENDOR_V_22) host->quirks &= ~SDHCI_QUIRK_NO_BUSY_IRQ; } This patch is used to fix it by add/remove some quirks according to verdor version in probe. Signed-off-by: Yangbo Lu Fixes: f4932cfd22f1 ("mmc: sdhci-of-esdhc: support both BE and LE host controller") Signed-off-by: Ulf Hansson Signed-off-by: Matthias Brugger Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/sdhci-of-esdhc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/mmc/host/sdhci-of-esdhc.c b/drivers/mmc/host/sdhci-of-esdhc.c index 90e94a028a49..83b1226471c1 100644 --- a/drivers/mmc/host/sdhci-of-esdhc.c +++ b/drivers/mmc/host/sdhci-of-esdhc.c @@ -584,6 +584,8 @@ static int sdhci_esdhc_probe(struct platform_device *pdev) { struct sdhci_host *host; struct device_node *np; + struct sdhci_pltfm_host *pltfm_host; + struct sdhci_esdhc *esdhc; int ret; np = pdev->dev.of_node; @@ -600,6 +602,14 @@ static int sdhci_esdhc_probe(struct platform_device *pdev) sdhci_get_of_property(pdev); + pltfm_host = sdhci_priv(host); + esdhc = pltfm_host->priv; + if (esdhc->vendor_ver == VENDOR_V_22) + host->quirks2 |= SDHCI_QUIRK2_HOST_NO_CMD23; + + if (esdhc->vendor_ver > VENDOR_V_22) + host->quirks &= ~SDHCI_QUIRK_NO_BUSY_IRQ; + if (of_device_is_compatible(np, "fsl,p5040-esdhc") || of_device_is_compatible(np, "fsl,p5020-esdhc") || of_device_is_compatible(np, "fsl,p4080-esdhc") || From 13757fc50433b6b6f68c21bd88a094d56514cb5b Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 11 Oct 2016 13:51:14 -0700 Subject: [PATCH 23/79] fs/select: add vmalloc fallback for select(2) commit 2d19309cf86883f634a4f8ec55a54bda87db19bf upstream. The select(2) syscall performs a kmalloc(size, GFP_KERNEL) where size grows with the number of fds passed. We had a customer report page allocation failures of order-4 for this allocation. This is a costly order, so it might easily fail, as the VM expects such allocation to have a lower-order fallback. Such trivial fallback is vmalloc(), as the memory doesn't have to be physically contiguous and the allocation is temporary for the duration of the syscall only. There were some concerns, whether this would have negative impact on the system by exposing vmalloc() to userspace. Although an excessive use of vmalloc can cause some system wide performance issues - TLB flushes etc. - a large order allocation is not for free either and an excessive reclaim/compaction can have a similar effect. Also note that the size is effectively limited by RLIMIT_NOFILE which defaults to 1024 on the systems I checked. That means the bitmaps will fit well within single page and thus the vmalloc() fallback could be only excercised for processes where root allows a higher limit. Note that the poll(2) syscall seems to use a linked list of order-0 pages, so it doesn't need this kind of fallback. [eric.dumazet@gmail.com: fix failure path logic] [akpm@linux-foundation.org: use proper type for size] Link: http://lkml.kernel.org/r/20160927084536.5923-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Alexander Viro Cc: Eric Dumazet Cc: David Laight Cc: Hillf Danton Cc: Nicholas Piggin Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/select.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/select.c b/fs/select.c index 015547330e88..f4dd55fc638c 100644 --- a/fs/select.c +++ b/fs/select.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -550,7 +551,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set_bits fds; void *bits; int ret, max_fds; - unsigned int size; + size_t size, alloc_size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; @@ -577,7 +578,14 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; - bits = kmalloc(6 * size, GFP_KERNEL); + if (size > (SIZE_MAX / 6)) + goto out_nofds; + + alloc_size = 6 * size; + bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN); + if (!bits && alloc_size > PAGE_SIZE) + bits = vmalloc(alloc_size); + if (!bits) goto out_nofds; } @@ -614,7 +622,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, out: if (bits != stack_fds) - kfree(bits); + kvfree(bits); out_nofds: return ret; } From 7175e56fa7076ea176ffa6dd41ddad288294ddb5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 10 Jul 2017 15:49:51 -0700 Subject: [PATCH 24/79] mm/mmap.c: do not blow on PROT_NONE MAP_FIXED holes in the stack commit 561b5e0709e4a248c67d024d4d94b6e31e3edf2f upstream. Commit 1be7107fbe18 ("mm: larger stack guard gap, between vmas") has introduced a regression in some rust and Java environments which are trying to implement their own stack guard page. They are punching a new MAP_FIXED mapping inside the existing stack Vma. This will confuse expand_{downwards,upwards} into thinking that the stack expansion would in fact get us too close to an existing non-stack vma which is a correct behavior wrt safety. It is a real regression on the other hand. Let's work around the problem by considering PROT_NONE mapping as a part of the stack. This is a gros hack but overflowing to such a mapping would trap anyway an we only can hope that usespace knows what it is doing and handle it propely. Fixes: 1be7107fbe18 ("mm: larger stack guard gap, between vmas") Link: http://lkml.kernel.org/r/20170705182849.GA18027@dhcp22.suse.cz Signed-off-by: Michal Hocko Debugged-by: Vlastimil Babka Cc: Ben Hutchings Cc: Willy Tarreau Cc: Oleg Nesterov Cc: Rik van Riel Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/mmap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index eaa460ddcaf9..cc84b97ca250 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2188,7 +2188,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) gap_addr = TASK_SIZE; next = vma->vm_next; - if (next && next->vm_start < gap_addr) { + if (next && next->vm_start < gap_addr && + (next->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ @@ -2273,7 +2274,8 @@ int expand_downwards(struct vm_area_struct *vma, if (gap_addr > address) return -ENOMEM; prev = vma->vm_prev; - if (prev && prev->vm_end > gap_addr) { + if (prev && prev->vm_end > gap_addr && + (prev->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) { if (!(prev->vm_flags & VM_GROWSDOWN)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ From 361376a53481a7e042a1dd320b6bdbde8dc1ba48 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 12 May 2017 15:46:26 -0700 Subject: [PATCH 25/79] hwpoison, memcg: forcibly uncharge LRU pages commit 18365225f0440d09708ad9daade2ec11275c3df9 upstream. Laurent Dufour has noticed that hwpoinsoned pages are kept charged. In his particular case he has hit a bad_page("page still charged to cgroup") when onlining a hwpoison page. While this looks like something that shouldn't happen in the first place because onlining hwpages and returning them to the page allocator makes only little sense it shows a real problem. hwpoison pages do not get freed usually so we do not uncharge them (at least not since commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API")). Each charge pins memcg (since e8ea14cc6ead ("mm: memcontrol: take a css reference for each charged page")) as well and so the mem_cgroup and the associated state will never go away. Fix this leak by forcibly uncharging a LRU hwpoisoned page in delete_from_lru_cache(). We also have to tweak uncharge_list because it cannot rely on zero ref count for these pages. [akpm@linux-foundation.org: coding-style fixes] Fixes: 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") Link: http://lkml.kernel.org/r/20170502185507.GB19165@dhcp22.suse.cz Signed-off-by: Michal Hocko Reported-by: Laurent Dufour Tested-by: Laurent Dufour Reviewed-by: Balbir Singh Reviewed-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/memcontrol.c | 2 +- mm/memory-failure.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e25b93a4267d..55a9facb8e8d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5576,7 +5576,7 @@ static void uncharge_list(struct list_head *page_list) next = page->lru.next; VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page); if (!page->mem_cgroup) continue; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 091fe9b06663..92a647957f91 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -539,6 +539,13 @@ static int delete_from_lru_cache(struct page *p) */ ClearPageActive(p); ClearPageUnevictable(p); + + /* + * Poisoned page might never drop its ref count to 0 so we have + * to uncharge it manually from its memcg. + */ + mem_cgroup_uncharge(p); + /* * drop the page count elevated by isolate_lru_page() */ From 1ab0bb9834766841e0119d472c78077c459654ac Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Mon, 10 Jul 2017 15:49:44 -0700 Subject: [PATCH 26/79] cma: fix calculation of aligned offset commit e048cb32f69038aa1c8f11e5c1b331be4181659d upstream. The align_offset parameter is used by bitmap_find_next_zero_area_off() to represent the offset of map's base from the previous alignment boundary; the function ensures that the returned index, plus the align_offset, honors the specified align_mask. The logic introduced by commit b5be83e308f7 ("mm: cma: align to physical address, not CMA region position") has the cma driver calculate the offset to the *next* alignment boundary. In most cases, the base alignment is greater than that specified when making allocations, resulting in a zero offset whether we align up or down. In the example given with the commit, the base alignment (8MB) was half the requested alignment (16MB) so the math also happened to work since the offset is 8MB in both directions. However, when requesting allocations with an alignment greater than twice that of the base, the returned index would not be correctly aligned. Also, the align_order arguments of cma_bitmap_aligned_mask() and cma_bitmap_aligned_offset() should not be negative so the argument type was made unsigned. Fixes: b5be83e308f7 ("mm: cma: align to physical address, not CMA region position") Link: http://lkml.kernel.org/r/20170628170742.2895-1-opendmb@gmail.com Signed-off-by: Angus Clark Signed-off-by: Doug Berger Acked-by: Gregory Fong Cc: Doug Berger Cc: Angus Clark Cc: Laura Abbott Cc: Vlastimil Babka Cc: Greg Kroah-Hartman Cc: Lucas Stach Cc: Catalin Marinas Cc: Shiraz Hashim Cc: Jaewon Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Vlastimil Babka Signed-off-by: Greg Kroah-Hartman --- mm/cma.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index bd0e1412475e..43f4a122e969 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -54,7 +54,7 @@ unsigned long cma_get_size(const struct cma *cma) } static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, - int align_order) + unsigned int align_order) { if (align_order <= cma->order_per_bit) return 0; @@ -62,17 +62,14 @@ static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, } /* - * Find a PFN aligned to the specified order and return an offset represented in - * order_per_bits. + * Find the offset of the base PFN from the specified align_order. + * The value returned is represented in order_per_bits. */ static unsigned long cma_bitmap_aligned_offset(const struct cma *cma, - int align_order) + unsigned int align_order) { - if (align_order <= cma->order_per_bit) - return 0; - - return (ALIGN(cma->base_pfn, (1UL << align_order)) - - cma->base_pfn) >> cma->order_per_bit; + return (cma->base_pfn & ((1UL << align_order) - 1)) + >> cma->order_per_bit; } static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, From 90ca50c56bfba6c670c4b32cc062aa4c2b850938 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 15 Nov 2017 17:38:30 -0800 Subject: [PATCH 27/79] mm, page_alloc: fix potential false positive in __zone_watermark_ok commit b050e3769c6b4013bb937e879fc43bf1847ee819 upstream. Since commit 97a16fc82a7c ("mm, page_alloc: only enforce watermarks for order-0 allocations"), __zone_watermark_ok() check for high-order allocations will shortcut per-migratetype free list checks for ALLOC_HARDER allocations, and return true as long as there's free page of any migratetype. The intention is that ALLOC_HARDER can allocate from MIGRATE_HIGHATOMIC free lists, while normal allocations can't. However, as a side effect, the watermark check will then also return true when there are pages only on the MIGRATE_ISOLATE list, or (prior to CMA conversion to ZONE_MOVABLE) on the MIGRATE_CMA list. Since the allocation cannot actually obtain isolated pages, and might not be able to obtain CMA pages, this can result in a false positive. The condition should be rare and perhaps the outcome is not a fatal one. Still, it's better if the watermark check is correct. There also shouldn't be a performance tradeoff here. Link: http://lkml.kernel.org/r/20171102125001.23708-1-vbabka@suse.cz Fixes: 97a16fc82a7c ("mm, page_alloc: only enforce watermarks for order-0 allocations") Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Rik van Riel Cc: David Rientjes Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3c70f03d91ec..a4c9cd80c7b6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2468,9 +2468,6 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, if (!area->nr_free) continue; - if (alloc_harder) - return true; - for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { if (!list_empty(&area->free_list[mt])) return true; @@ -2482,6 +2479,9 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, return true; } #endif + if (alloc_harder && + !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) + return true; } return false; } From bab93a6196224c862c679a1b1e97be8ceef926a5 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 14 Dec 2016 15:06:07 -0800 Subject: [PATCH 28/79] ipc: msg, make msgrcv work with LONG_MIN commit 999898355e08ae3b92dfd0a08db706e0c6703d30 upstream. When LONG_MIN is passed to msgrcv, one would expect to recieve any message. But convert_mode does *msgtyp = -*msgtyp and -LONG_MIN is undefined. In particular, with my gcc -LONG_MIN produces -LONG_MIN again. So handle this case properly by assigning LONG_MAX to *msgtyp if LONG_MIN was specified as msgtyp to msgrcv. This code: long msg[] = { 100, 200 }; int m = msgget(IPC_PRIVATE, IPC_CREAT | 0644); msgsnd(m, &msg, sizeof(msg), 0); msgrcv(m, &msg, sizeof(msg), LONG_MIN, 0); produces currently nothing: msgget(IPC_PRIVATE, IPC_CREAT|0644) = 65538 msgsnd(65538, {100, "\310\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"}, 16, 0) = 0 msgrcv(65538, ... Except a UBSAN warning: UBSAN: Undefined behaviour in ipc/msg.c:745:13 negation of -9223372036854775808 cannot be represented in type 'long int': With the patch, I see what I expect: msgget(IPC_PRIVATE, IPC_CREAT|0644) = 0 msgsnd(0, {100, "\310\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"}, 16, 0) = 0 msgrcv(0, {100, "\310\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"}, 16, -9223372036854775808, 0) = 16 Link: http://lkml.kernel.org/r/20161024082633.10148-1-jslaby@suse.cz Signed-off-by: Jiri Slaby Cc: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- ipc/msg.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ipc/msg.c b/ipc/msg.c index c6521c205cb4..f993f441f852 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -742,7 +742,10 @@ static inline int convert_mode(long *msgtyp, int msgflg) if (*msgtyp == 0) return SEARCH_ANY; if (*msgtyp < 0) { - *msgtyp = -*msgtyp; + if (*msgtyp == LONG_MIN) /* -LONG_MIN is undefined */ + *msgtyp = LONG_MAX; + else + *msgtyp = -*msgtyp; return SEARCH_LESSEQUAL; } if (msgflg & MSG_EXCEPT) From 699a6cc7dd321113c99e06c6c997e8d15557439c Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Wed, 8 Jun 2016 14:59:52 +0800 Subject: [PATCH 29/79] x86/ioapic: Fix incorrect pointers in ioapic_setup_resources() commit 9d98bcec731756b8688b59ec998707924d716d7b upstream. On a 4-socket Brickland system, hot-removing one ioapic is fine. Hot-removing the 2nd one causes panic in mp_unregister_ioapic() while calling release_resource(). It is because the iomem_res pointer has already been released when removing the first ioapic. To explain the use of &res[num] here: res is assigned to ioapic_resources, and later in ioapic_insert_resources() we do: struct resource *r = ioapic_resources; for_each_ioapic(i) { insert_resource(&iomem_resource, r); r++; } Here 'r' is treated as an arry of 'struct resource', and the r++ ensures that each element of the array is inserted separately. Thus we should call release_resouce() on each element at &res[num]. Fix it by assigning the correct pointers to ioapics[i].iomem_res in ioapic_setup_resources(). Signed-off-by: Rui Wang Signed-off-by: Thomas Gleixner Cc: tony.luck@intel.com Cc: linux-pci@vger.kernel.org Cc: rjw@rjwysocki.net Cc: linux-acpi@vger.kernel.org Cc: bhelgaas@google.com Link: http://lkml.kernel.org/r/1465369193-4816-3-git-send-email-rui.y.wang@intel.com Signed-off-by: Ingo Molnar Acked-by: Joerg Roedel Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index fc91c98bee01..fd945099fc95 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2592,8 +2592,8 @@ static struct resource * __init ioapic_setup_resources(void) res[num].flags = IORESOURCE_MEM | IORESOURCE_BUSY; snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); mem += IOAPIC_RESOURCE_NAME_SIZE; + ioapics[i].iomem_res = &res[num]; num++; - ioapics[i].iomem_res = res; } ioapic_resources = res; From 4d48ba75ec7877939e9f69ef66894f7f9ce4cf38 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 2 Jun 2016 01:57:50 +0200 Subject: [PATCH 30/79] ACPI / processor: Avoid reserving IO regions too early commit 86314751c7945fa0c67f459beeda2e7c610ca429 upstream. Roland Dreier reports that one of his systems cannot boot because of the changes made by commit ac212b6980d8 (ACPI / processor: Use common hotplug infrastructure). The problematic part of it is the request_region() call in acpi_processor_get_info() that used to run at module init time before the above commit and now it runs much earlier. Unfortunately, the region(s) reserved by it fall into a range the PCI subsystem attempts to reserve for AHCI IO BARs. As a result, the PCI reservation fails and AHCI doesn't work, while previously the PCI reservation would be made before acpi_processor_get_info() and it would succeed. That request_region() call, however, was overlooked by commit ac212b6980d8, as it is not necessary for the enumeration of the processors. It only is needed when the ACPI processor driver actually attempts to handle them which doesn't happen before loading the ACPI processor driver module. Therefore that call should have been moved from acpi_processor_get_info() into that module. Address the problem by moving the request_region() call in question out of acpi_processor_get_info() and use the observation that the region reserved by it is only needed if the FADT-based CPU throttling method is going to be used, which means that it should be sufficient to invoke it from acpi_processor_get_throttling_fadt(). Fixes: ac212b6980d8 (ACPI / processor: Use common hotplug infrastructure) Reported-by: Roland Dreier Tested-by: Roland Dreier Signed-off-by: Rafael J. Wysocki Acked-by: Joerg Roedel Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/acpi_processor.c | 9 --------- drivers/acpi/processor_throttling.c | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index 9f77943653fb..b63a173786d5 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -331,15 +331,6 @@ static int acpi_processor_get_info(struct acpi_device *device) pr->throttling.duty_width = acpi_gbl_FADT.duty_width; pr->pblk = object.processor.pblk_address; - - /* - * We don't care about error returns - we just try to mark - * these reserved so that nobody else is confused into thinking - * that this region might be unused.. - * - * (In particular, allocating the IO range for Cardbus) - */ - request_region(pr->throttling.address, 6, "ACPI CPU throttle"); } /* diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c index f170d746336d..c72e64893d03 100644 --- a/drivers/acpi/processor_throttling.c +++ b/drivers/acpi/processor_throttling.c @@ -676,6 +676,15 @@ static int acpi_processor_get_throttling_fadt(struct acpi_processor *pr) if (!pr->flags.throttling) return -ENODEV; + /* + * We don't care about error returns - we just try to mark + * these reserved so that nobody else is confused into thinking + * that this region might be unused.. + * + * (In particular, allocating the IO range for Cardbus) + */ + request_region(pr->throttling.address, 6, "ACPI CPU throttle"); + pr->throttling.state = 0; duty_mask = pr->throttling.state_count - 1; From 1fe277d48f998bd928f1b6ddcbe72f2a34b6f5ad Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 30 Dec 2016 02:27:31 +0100 Subject: [PATCH 31/79] ACPI / scan: Prefer devices without _HID/_CID for _ADR matching commit c2a6bbaf0c5f90463a7011a295bbdb7e33c80b51 upstream. The way acpi_find_child_device() works currently is that, if there are two (or more) devices with the same _ADR value in the same namespace scope (which is not specifically allowed by the spec and the OS behavior in that case is not defined), the first one of them found to be present (with the help of _STA) will be returned. This covers the majority of cases, but is not sufficient if some of the devices in question have a _HID (or _CID) returning some valid ACPI/PNP device IDs (which is disallowed by the spec) and the ASL writers' expectation appears to be that the OS will match devices without a valid ACPI/PNP device ID against a given bus address first. To cover this special case as well, modify find_child_checks() to prefer devices without ACPI/PNP device IDs over devices that have them. Suggested-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki Tested-by: Hans de Goede Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/glue.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/acpi/glue.c b/drivers/acpi/glue.c index 73c9c7fa9001..f06317d6fc38 100644 --- a/drivers/acpi/glue.c +++ b/drivers/acpi/glue.c @@ -99,13 +99,13 @@ static int find_child_checks(struct acpi_device *adev, bool check_children) return -ENODEV; /* - * If the device has a _HID (or _CID) returning a valid ACPI/PNP - * device ID, it is better to make it look less attractive here, so that - * the other device with the same _ADR value (that may not have a valid - * device ID) can be matched going forward. [This means a second spec - * violation in a row, so whatever we do here is best effort anyway.] + * If the device has a _HID returning a valid ACPI/PNP device ID, it is + * better to make it look less attractive here, so that the other device + * with the same _ADR value (that may not have a valid device ID) can be + * matched going forward. [This means a second spec violation in a row, + * so whatever we do here is best effort anyway.] */ - return sta_present && list_empty(&adev->pnp.ids) ? + return sta_present && !adev->pnp.type.platform_id ? FIND_CHILD_MAX_SCORE : FIND_CHILD_MIN_SCORE; } From 4c19b00e5588828f0d3198b926efade766dcf2c8 Mon Sep 17 00:00:00 2001 From: Seunghun Han Date: Wed, 26 Apr 2017 16:18:08 +0800 Subject: [PATCH 32/79] ACPICA: Namespace: fix operand cache leak commit 3b2d69114fefa474fca542e51119036dceb4aa6f upstream. ACPICA commit a23325b2e583556eae88ed3f764e457786bf4df6 I found some ACPI operand cache leaks in ACPI early abort cases. Boot log of ACPI operand cache leak is as follows: >[ 0.174332] ACPI: Added _OSI(Module Device) >[ 0.175504] ACPI: Added _OSI(Processor Device) >[ 0.176010] ACPI: Added _OSI(3.0 _SCP Extensions) >[ 0.177032] ACPI: Added _OSI(Processor Aggregator Device) >[ 0.178284] ACPI: SCI (IRQ16705) allocation failed >[ 0.179352] ACPI Exception: AE_NOT_ACQUIRED, Unable to install System Control Interrupt handler (20160930/evevent-131) >[ 0.180008] ACPI: Unable to start the ACPI Interpreter >[ 0.181125] ACPI Error: Could not remove SCI handler (20160930/evmisc-281) >[ 0.184068] kmem_cache_destroy Acpi-Operand: Slab cache still has objects >[ 0.185358] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.10.0-rc3 #2 >[ 0.186820] Hardware name: innotek gmb_h virtual_box/virtual_box, BIOS virtual_box 12/01/2006 >[ 0.188000] Call Trace: >[ 0.188000] ? dump_stack+0x5c/0x7d >[ 0.188000] ? kmem_cache_destroy+0x224/0x230 >[ 0.188000] ? acpi_sleep_proc_init+0x22/0x22 >[ 0.188000] ? acpi_os_delete_cache+0xa/0xd >[ 0.188000] ? acpi_ut_delete_caches+0x3f/0x7b >[ 0.188000] ? acpi_terminate+0x5/0xf >[ 0.188000] ? acpi_init+0x288/0x32e >[ 0.188000] ? __class_create+0x4c/0x80 >[ 0.188000] ? video_setup+0x7a/0x7a >[ 0.188000] ? do_one_initcall+0x4e/0x1b0 >[ 0.188000] ? kernel_init_freeable+0x194/0x21a >[ 0.188000] ? rest_init+0x80/0x80 >[ 0.188000] ? kernel_init+0xa/0x100 >[ 0.188000] ? ret_from_fork+0x25/0x30 When early abort is occurred due to invalid ACPI information, Linux kernel terminates ACPI by calling acpi_terminate() function. The function calls acpi_ns_terminate() function to delete namespace data and ACPI operand cache (acpi_gbl_module_code_list). But the deletion code in acpi_ns_terminate() function is wrapped in ACPI_EXEC_APP definition, therefore the code is only executed when the definition exists. If the define doesn't exist, ACPI operand cache (acpi_gbl_module_code_list) is leaked, and stack dump is shown in kernel log. This causes a security threat because the old kernel (<= 4.9) shows memory locations of kernel functions in stack dump, therefore kernel ASLR can be neutralized. To fix ACPI operand leak for enhancing security, I made a patch which removes the ACPI_EXEC_APP define in acpi_ns_terminate() function for executing the deletion code unconditionally. Link: https://github.com/acpica/acpica/commit/a23325b2 Signed-off-by: Seunghun Han Signed-off-by: Lv Zheng Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki Acked-by: Lee, Chun-Yi Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/acpica/nsutils.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/acpi/acpica/nsutils.c b/drivers/acpi/acpica/nsutils.c index de325ae04ce1..3b3c5b90bd20 100644 --- a/drivers/acpi/acpica/nsutils.c +++ b/drivers/acpi/acpica/nsutils.c @@ -593,25 +593,20 @@ struct acpi_namespace_node *acpi_ns_validate_handle(acpi_handle handle) void acpi_ns_terminate(void) { acpi_status status; + union acpi_operand_object *prev; + union acpi_operand_object *next; ACPI_FUNCTION_TRACE(ns_terminate); -#ifdef ACPI_EXEC_APP - { - union acpi_operand_object *prev; - union acpi_operand_object *next; + /* Delete any module-level code blocks */ - /* Delete any module-level code blocks */ - - next = acpi_gbl_module_code_list; - while (next) { - prev = next; - next = next->method.mutex; - prev->method.mutex = NULL; /* Clear the Mutex (cheated) field */ - acpi_ut_remove_reference(prev); - } + next = acpi_gbl_module_code_list; + while (next) { + prev = next; + next = next->method.mutex; + prev->method.mutex = NULL; /* Clear the Mutex (cheated) field */ + acpi_ut_remove_reference(prev); } -#endif /* * Free the entire namespace -- all nodes and all objects From 45cf54e13c70ce0ec4875220103916978ce3ed07 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 14 Jul 2016 17:51:26 +0200 Subject: [PATCH 33/79] netfilter: x_tables: speed up jump target validation commit f4dc77713f8016d2e8a3295e1c9c53a21f296def upstream. The dummy ruleset I used to test the original validation change was broken, most rules were unreachable and were not tested by mark_source_chains(). In some cases rulesets that used to load in a few seconds now require several minutes. sample ruleset that shows the behaviour: echo "*filter" for i in $(seq 0 100000);do printf ":chain_%06x - [0:0]\n" $i done for i in $(seq 0 100000);do printf -- "-A INPUT -j chain_%06x\n" $i printf -- "-A INPUT -j chain_%06x\n" $i printf -- "-A INPUT -j chain_%06x\n" $i done echo COMMIT [ pipe result into iptables-restore ] This ruleset will be about 74mbyte in size, with ~500k searches though all 500k[1] rule entries. iptables-restore will take forever (gave up after 10 minutes) Instead of always searching the entire blob for a match, fill an array with the start offsets of every single ipt_entry struct, then do a binary search to check if the jump target is present or not. After this change ruleset restore times get again close to what one gets when reverting 36472341017529e (~3 seconds on my workstation). [1] every user-defined rule gets an implicit RETURN, so we get 300k jumps + 100k userchains + 100k returns -> 500k rule entries Fixes: 36472341017529e ("netfilter: x_tables: validate targets of jumps") Reported-by: Jeff Wu Tested-by: Jeff Wu Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- include/linux/netfilter/x_tables.h | 4 +++ net/ipv4/netfilter/arp_tables.c | 47 +++++++++++++++------------- net/ipv4/netfilter/ip_tables.c | 45 ++++++++++++++------------- net/ipv6/netfilter/ip6_tables.c | 45 ++++++++++++++------------- net/netfilter/x_tables.c | 50 ++++++++++++++++++++++++++++++ 5 files changed, 127 insertions(+), 64 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 04078e8a4803..26a41ddd3dec 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -243,6 +243,10 @@ int xt_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset); +unsigned int *xt_alloc_entry_offsets(unsigned int size); +bool xt_find_jump_offset(const unsigned int *offsets, + unsigned int target, unsigned int size); + int xt_check_match(struct xt_mtchk_param *, unsigned int size, u_int8_t proto, bool inv_proto); int xt_check_target(struct xt_tgchk_param *, unsigned int size, u_int8_t proto, diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 6e3e0e8b1ce3..d6531c544a82 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -367,23 +367,12 @@ static inline bool unconditional(const struct arpt_entry *e) memcmp(&e->arp, &uncond, sizeof(uncond)) == 0; } -static bool find_jump_target(const struct xt_table_info *t, - const struct arpt_entry *target) -{ - struct arpt_entry *iter; - - xt_entry_foreach(iter, t->entries, t->size) { - if (iter == target) - return true; - } - return false; -} - /* Figures out from what hook each rule can be called: returns 0 if * there are loops. Puts hook bitmask in comefrom. */ static int mark_source_chains(const struct xt_table_info *newinfo, - unsigned int valid_hooks, void *entry0) + unsigned int valid_hooks, void *entry0, + unsigned int *offsets) { unsigned int hook; @@ -472,10 +461,11 @@ static int mark_source_chains(const struct xt_table_info *newinfo, /* This a jump; chase it. */ duprintf("Jump rule %u -> %u\n", pos, newpos); + if (!xt_find_jump_offset(offsets, newpos, + newinfo->number)) + return 0; e = (struct arpt_entry *) (entry0 + newpos); - if (!find_jump_target(newinfo, e)) - return 0; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -642,6 +632,7 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, const struct arpt_replace *repl) { struct arpt_entry *iter; + unsigned int *offsets; unsigned int i; int ret = 0; @@ -655,6 +646,9 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, } duprintf("translate_table: size %u\n", newinfo->size); + offsets = xt_alloc_entry_offsets(newinfo->number); + if (!offsets) + return -ENOMEM; i = 0; /* Walk through entries, checking offsets. */ @@ -665,7 +659,9 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, repl->underflow, repl->valid_hooks); if (ret != 0) - break; + goto out_free; + if (i < repl->num_entries) + offsets[i] = (void *)iter - entry0; ++i; if (strcmp(arpt_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0) @@ -673,12 +669,13 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, } duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); if (ret != 0) - return ret; + goto out_free; + ret = -EINVAL; if (i != repl->num_entries) { duprintf("translate_table: %u not %u entries\n", i, repl->num_entries); - return -EINVAL; + goto out_free; } /* Check hooks all assigned */ @@ -689,17 +686,20 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, if (newinfo->hook_entry[i] == 0xFFFFFFFF) { duprintf("Invalid hook entry %u %u\n", i, repl->hook_entry[i]); - return -EINVAL; + goto out_free; } if (newinfo->underflow[i] == 0xFFFFFFFF) { duprintf("Invalid underflow %u %u\n", i, repl->underflow[i]); - return -EINVAL; + goto out_free; } } - if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) - return -ELOOP; + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { + ret = -ELOOP; + goto out_free; + } + kvfree(offsets); /* Finally, each sanity check must pass */ i = 0; @@ -719,6 +719,9 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, return ret; } + return ret; + out_free: + kvfree(offsets); return ret; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index a399c5419622..2ba158f2cb72 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -443,23 +443,12 @@ ipt_do_table(struct sk_buff *skb, #endif } -static bool find_jump_target(const struct xt_table_info *t, - const struct ipt_entry *target) -{ - struct ipt_entry *iter; - - xt_entry_foreach(iter, t->entries, t->size) { - if (iter == target) - return true; - } - return false; -} - /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int mark_source_chains(const struct xt_table_info *newinfo, - unsigned int valid_hooks, void *entry0) + unsigned int valid_hooks, void *entry0, + unsigned int *offsets) { unsigned int hook; @@ -552,10 +541,11 @@ mark_source_chains(const struct xt_table_info *newinfo, /* This a jump; chase it. */ duprintf("Jump rule %u -> %u\n", pos, newpos); + if (!xt_find_jump_offset(offsets, newpos, + newinfo->number)) + return 0; e = (struct ipt_entry *) (entry0 + newpos); - if (!find_jump_target(newinfo, e)) - return 0; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -811,6 +801,7 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, const struct ipt_replace *repl) { struct ipt_entry *iter; + unsigned int *offsets; unsigned int i; int ret = 0; @@ -824,6 +815,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, } duprintf("translate_table: size %u\n", newinfo->size); + offsets = xt_alloc_entry_offsets(newinfo->number); + if (!offsets) + return -ENOMEM; i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { @@ -833,17 +827,20 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, repl->underflow, repl->valid_hooks); if (ret != 0) - return ret; + goto out_free; + if (i < repl->num_entries) + offsets[i] = (void *)iter - entry0; ++i; if (strcmp(ipt_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } + ret = -EINVAL; if (i != repl->num_entries) { duprintf("translate_table: %u not %u entries\n", i, repl->num_entries); - return -EINVAL; + goto out_free; } /* Check hooks all assigned */ @@ -854,17 +851,20 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (newinfo->hook_entry[i] == 0xFFFFFFFF) { duprintf("Invalid hook entry %u %u\n", i, repl->hook_entry[i]); - return -EINVAL; + goto out_free; } if (newinfo->underflow[i] == 0xFFFFFFFF) { duprintf("Invalid underflow %u %u\n", i, repl->underflow[i]); - return -EINVAL; + goto out_free; } } - if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) - return -ELOOP; + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { + ret = -ELOOP; + goto out_free; + } + kvfree(offsets); /* Finally, each sanity check must pass */ i = 0; @@ -884,6 +884,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, return ret; } + return ret; + out_free: + kvfree(offsets); return ret; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 22f39e00bef3..c26bed92f523 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -455,23 +455,12 @@ ip6t_do_table(struct sk_buff *skb, #endif } -static bool find_jump_target(const struct xt_table_info *t, - const struct ip6t_entry *target) -{ - struct ip6t_entry *iter; - - xt_entry_foreach(iter, t->entries, t->size) { - if (iter == target) - return true; - } - return false; -} - /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int mark_source_chains(const struct xt_table_info *newinfo, - unsigned int valid_hooks, void *entry0) + unsigned int valid_hooks, void *entry0, + unsigned int *offsets) { unsigned int hook; @@ -564,10 +553,11 @@ mark_source_chains(const struct xt_table_info *newinfo, /* This a jump; chase it. */ duprintf("Jump rule %u -> %u\n", pos, newpos); + if (!xt_find_jump_offset(offsets, newpos, + newinfo->number)) + return 0; e = (struct ip6t_entry *) (entry0 + newpos); - if (!find_jump_target(newinfo, e)) - return 0; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -823,6 +813,7 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, const struct ip6t_replace *repl) { struct ip6t_entry *iter; + unsigned int *offsets; unsigned int i; int ret = 0; @@ -836,6 +827,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, } duprintf("translate_table: size %u\n", newinfo->size); + offsets = xt_alloc_entry_offsets(newinfo->number); + if (!offsets) + return -ENOMEM; i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { @@ -845,17 +839,20 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, repl->underflow, repl->valid_hooks); if (ret != 0) - return ret; + goto out_free; + if (i < repl->num_entries) + offsets[i] = (void *)iter - entry0; ++i; if (strcmp(ip6t_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } + ret = -EINVAL; if (i != repl->num_entries) { duprintf("translate_table: %u not %u entries\n", i, repl->num_entries); - return -EINVAL; + goto out_free; } /* Check hooks all assigned */ @@ -866,17 +863,20 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (newinfo->hook_entry[i] == 0xFFFFFFFF) { duprintf("Invalid hook entry %u %u\n", i, repl->hook_entry[i]); - return -EINVAL; + goto out_free; } if (newinfo->underflow[i] == 0xFFFFFFFF) { duprintf("Invalid underflow %u %u\n", i, repl->underflow[i]); - return -EINVAL; + goto out_free; } } - if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) - return -ELOOP; + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { + ret = -ELOOP; + goto out_free; + } + kvfree(offsets); /* Finally, each sanity check must pass */ i = 0; @@ -896,6 +896,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, return ret; } + return ret; + out_free: + kvfree(offsets); return ret; } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 2fc6ca9d1286..7b42b0ad3f9b 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -701,6 +701,56 @@ int xt_check_entry_offsets(const void *base, } EXPORT_SYMBOL(xt_check_entry_offsets); +/** + * xt_alloc_entry_offsets - allocate array to store rule head offsets + * + * @size: number of entries + * + * Return: NULL or kmalloc'd or vmalloc'd array + */ +unsigned int *xt_alloc_entry_offsets(unsigned int size) +{ + unsigned int *off; + + off = kcalloc(size, sizeof(unsigned int), GFP_KERNEL | __GFP_NOWARN); + + if (off) + return off; + + if (size < (SIZE_MAX / sizeof(unsigned int))) + off = vmalloc(size * sizeof(unsigned int)); + + return off; +} +EXPORT_SYMBOL(xt_alloc_entry_offsets); + +/** + * xt_find_jump_offset - check if target is a valid jump offset + * + * @offsets: array containing all valid rule start offsets of a rule blob + * @target: the jump target to search for + * @size: entries in @offset + */ +bool xt_find_jump_offset(const unsigned int *offsets, + unsigned int target, unsigned int size) +{ + int m, low = 0, hi = size; + + while (hi > low) { + m = (low + hi) / 2u; + + if (offsets[m] > target) + hi = m; + else if (offsets[m] < target) + low = m + 1; + else + return true; + } + + return false; +} +EXPORT_SYMBOL(xt_find_jump_offset); + int xt_check_target(struct xt_tgchk_param *par, unsigned int size, u_int8_t proto, bool inv_proto) { From a84338dad3c9501a5301db27ee665cda663219fc Mon Sep 17 00:00:00 2001 From: Hongxu Jia Date: Tue, 29 Nov 2016 21:56:26 -0500 Subject: [PATCH 34/79] netfilter: arp_tables: fix invoking 32bit "iptable -P INPUT ACCEPT" failed in 64bit kernel commit 17a49cd549d9dc8707dc9262210166455c612dde upstream. Since 09d9686047db ("netfilter: x_tables: do compat validation via translate_table"), it used compatr structure to assign newinfo structure. In translate_compat_table of ip_tables.c and ip6_tables.c, it used compatr->hook_entry to replace info->hook_entry and compatr->underflow to replace info->underflow, but not do the same replacement in arp_tables.c. It caused invoking 32-bit "arptbale -P INPUT ACCEPT" failed in 64bit kernel. -------------------------------------- root@qemux86-64:~# arptables -P INPUT ACCEPT root@qemux86-64:~# arptables -P INPUT ACCEPT ERROR: Policy for `INPUT' offset 448 != underflow 0 arptables: Incompatible with this kernel -------------------------------------- Fixes: 09d9686047db ("netfilter: x_tables: do compat validation via translate_table") Signed-off-by: Hongxu Jia Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- net/ipv4/netfilter/arp_tables.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index d6531c544a82..c75211528991 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1339,8 +1339,8 @@ static int translate_compat_table(struct xt_table_info **pinfo, newinfo->number = compatr->num_entries; for (i = 0; i < NF_ARP_NUMHOOKS; i++) { - newinfo->hook_entry[i] = info->hook_entry[i]; - newinfo->underflow[i] = info->underflow[i]; + newinfo->hook_entry[i] = compatr->hook_entry[i]; + newinfo->underflow[i] = compatr->underflow[i]; } entry1 = newinfo->entries; pos = entry1; From da00455d38a73c9412dd3f99285b48753ff1b61a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 26 May 2016 19:08:10 +0200 Subject: [PATCH 35/79] netfilter: nf_dup_ipv6: set again FLOWI_FLAG_KNOWN_NH at flowi6_flags commit 83170f3beccccd7ceb4f9a0ac0c4dc736afde90c upstream. With the commit 48e8aa6e3137 ("ipv6: Set FLOWI_FLAG_KNOWN_NH at flowi6_flags") ip6_pol_route() callers were asked to to set the FLOWI_FLAG_KNOWN_NH properly and xt_TEE was updated accordingly, but with the later refactor in commit bbde9fc1824a ("netfilter: factor out packet duplication for IPv4/IPv6") the flowi6_flags update was lost. This commit re-add it just before the routing decision. Fixes: bbde9fc1824a ("netfilter: factor out packet duplication for IPv4/IPv6") Signed-off-by: Paolo Abeni Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- net/ipv6/netfilter/nf_dup_ipv6.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index 6989c70ae29f..4a84b5ad9ecb 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -33,6 +33,7 @@ static bool nf_dup_ipv6_route(struct net *net, struct sk_buff *skb, fl6.daddr = *gw; fl6.flowlabel = (__force __be32)(((iph->flow_lbl[0] & 0xF) << 16) | (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]); + fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); From d8f5cce0a8bce0decd125ef430ee6196692ff8da Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 8 Aug 2016 21:57:58 +0800 Subject: [PATCH 36/79] netfilter: nf_ct_expect: remove the redundant slash when policy name is empty commit b173a28f62cf929324a8a6adcc45adadce311d16 upstream. The 'name' filed in struct nf_conntrack_expect_policy{} is not a pointer, so check it is NULL or not will always return true. Even if the name is empty, slash will always be displayed like follows: # cat /proc/net/nf_conntrack_expect 297 l3proto = 2 proto=6 src=1.1.1.1 dst=2.2.2.2 sport=1 dport=1025 ftp/ ^ Fixes: 3a8fc53a45c4 ("netfilter: nf_ct_helper: allocate 16 bytes for the helper and policy names") Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nf_conntrack_expect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 7f16d19d6198..a91f8bd51d05 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -560,7 +560,7 @@ static int exp_seq_show(struct seq_file *s, void *v) helper = rcu_dereference(nfct_help(expect->master)->helper); if (helper) { seq_printf(s, "%s%s", expect->flags ? " " : "", helper->name); - if (helper->expect_policy[expect->class].name) + if (helper->expect_policy[expect->class].name[0]) seq_printf(s, "/%s", helper->expect_policy[expect->class].name); } From 2adda392d28baa25ca7803002fdafcaa95b7b1fb Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 8 Aug 2016 22:07:27 +0800 Subject: [PATCH 37/79] netfilter: nfnetlink_queue: reject verdict request from different portid commit 00a3101f561816e58de054a470484996f78eb5eb upstream. Like NFQNL_MSG_VERDICT_BATCH do, we should also reject the verdict request when the portid is not same with the initial portid(maybe from another process). Fixes: 97d32cf9440d ("netfilter: nfnetlink_queue: batch verdict support") Signed-off-by: Liping Zhang Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nfnetlink_queue.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index f6837f9b6d6c..c14d2e8eaec3 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1053,10 +1053,8 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, struct net *net = sock_net(ctnl); struct nfnl_queue_net *q = nfnl_queue_pernet(net); - queue = instance_lookup(q, queue_num); - if (!queue) - queue = verdict_instance_lookup(q, queue_num, - NETLINK_CB(skb).portid); + queue = verdict_instance_lookup(q, queue_num, + NETLINK_CB(skb).portid); if (IS_ERR(queue)) return PTR_ERR(queue); From 2cf26badef7bde1f9ecb9fbdea2079ce366f3013 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 25 Aug 2016 15:33:29 +0200 Subject: [PATCH 38/79] netfilter: restart search if moved to other chain commit 95a8d19f28e6b29377a880c6264391a62e07fccc upstream. In case nf_conntrack_tuple_taken did not find a conflicting entry check that all entries in this hash slot were tested and restart in case an entry was moved to another chain. Reported-by: Eric Dumazet Fixes: ea781f197d6a ("netfilter: nf_conntrack: use SLAB_DESTROY_BY_RCU and get rid of call_rcu()") Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nf_conntrack_core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 86a3c6f0c871..5f747089024f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -719,6 +719,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, * least once for the stats anyway. */ rcu_read_lock_bh(); + begin: hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (ct != ignored_conntrack && @@ -730,6 +731,12 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, } NF_CT_STAT_INC(net, searched); } + + if (get_nulls_value(n) != hash) { + NF_CT_STAT_INC(net, search_restart); + goto begin; + } + rcu_read_unlock_bh(); return 0; From 80291566d8f324563f2d889ef5a470e223d44f5c Mon Sep 17 00:00:00 2001 From: Ulrich Weber Date: Mon, 24 Oct 2016 18:07:23 +0200 Subject: [PATCH 39/79] netfilter: nf_conntrack_sip: extend request line validation commit 444f901742d054a4cd5ff045871eac5131646cfb upstream. on SIP requests, so a fragmented TCP SIP packet from an allow header starting with INVITE,NOTIFY,OPTIONS,REFER,REGISTER,UPDATE,SUBSCRIBE Content-Length: 0 will not bet interpreted as an INVITE request. Also Request-URI must start with an alphabetic character. Confirm with RFC 3261 Request-Line = Method SP Request-URI SP SIP-Version CRLF Fixes: 30f33e6dee80 ("[NETFILTER]: nf_conntrack_sip: support method specific request/response handling") Signed-off-by: Ulrich Weber Acked-by: Marco Angaroni Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nf_conntrack_sip.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 885b4aba3695..1665c2159e4b 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1434,9 +1434,12 @@ static int process_sip_request(struct sk_buff *skb, unsigned int protoff, handler = &sip_handlers[i]; if (handler->request == NULL) continue; - if (*datalen < handler->len || + if (*datalen < handler->len + 2 || strncasecmp(*dptr, handler->method, handler->len)) continue; + if ((*dptr)[handler->len] != ' ' || + !isalpha((*dptr)[handler->len+1])) + continue; if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ, &matchoff, &matchlen) <= 0) { From f4ca7cba8ffa94c43913ed322c7d35ab3d6d1e38 Mon Sep 17 00:00:00 2001 From: Pau Espin Pedrol Date: Fri, 6 Jan 2017 20:33:27 +0100 Subject: [PATCH 40/79] netfilter: use fwmark_reflect in nf_send_reset commit cc31d43b4154ad5a7d8aa5543255a93b7e89edc2 upstream. Otherwise, RST packets generated by ipt_REJECT always have mark 0 when the routing is checked later in the same code path. Fixes: e110861f8609 ("net: add a sysctl to reflect the fwmark on replies") Cc: Lorenzo Colitti Signed-off-by: Pau Espin Pedrol Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- net/ipv4/netfilter/nf_reject_ipv4.c | 2 ++ net/ipv6/netfilter/nf_reject_ipv6.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index c747b2d9eb77..d4acf38b60fd 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -124,6 +124,8 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) /* ip_route_me_harder expects skb->dst to be set */ skb_dst_set_noref(nskb, skb_dst(oldskb)); + nskb->mark = IP4_REPLY_MARK(net, oldskb->mark); + skb_reserve(nskb, LL_MAX_HEADER); niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, ip4_dst_hoplimit(skb_dst(nskb))); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index e0f922b777e3..7117e5bef412 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -157,6 +157,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) fl6.daddr = oip6h->saddr; fl6.fl6_sport = otcph->dest; fl6.fl6_dport = otcph->source; + fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark); security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); dst = ip6_route_output(net, NULL, &fl6); if (dst == NULL || dst->error) { @@ -180,6 +181,8 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) skb_dst_set(nskb, dst); + nskb->mark = fl6.flowi6_mark; + skb_reserve(nskb, hh_len + dst->header_len); ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, ip6_dst_hoplimit(dst)); From 936b21419e7c5be2f81e6dea02fc3d8852f3fb83 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 Apr 2016 10:39:34 +0200 Subject: [PATCH 41/79] netfilter: fix IS_ERR_VALUE usage commit 92b4423e3a0bc5d43ecde4bcad871f8b5ba04efd upstream. This is a forward-port of the original patch from Andrzej Hajda, he said: "IS_ERR_VALUE should be used only with unsigned long type. Otherwise it can work incorrectly. To achieve this function xt_percpu_counter_alloc is modified to return unsigned long, and its result is assigned to temporary variable to perform error checking, before assigning to .pcnt field. The patch follows conclusion from discussion on LKML [1][2]. [1]: http://permalink.gmane.org/gmane.linux.kernel/2120927 [2]: http://permalink.gmane.org/gmane.linux.kernel/2150581" Original patch from Andrzej is here: http://patchwork.ozlabs.org/patch/582970/ This patch has clashed with input validation fixes for x_tables. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman Acked-by: Michal Kubecek --- include/linux/netfilter/x_tables.h | 6 +++--- net/ipv4/netfilter/arp_tables.c | 6 ++++-- net/ipv4/netfilter/ip_tables.c | 6 ++++-- net/ipv6/netfilter/ip6_tables.c | 6 ++++-- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 26a41ddd3dec..d6c53fce006b 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -381,16 +381,16 @@ static inline unsigned long ifname_compare_aligned(const char *_a, * allows us to return 0 for single core systems without forcing * callers to deal with SMP vs. NONSMP issues. */ -static inline u64 xt_percpu_counter_alloc(void) +static inline unsigned long xt_percpu_counter_alloc(void) { if (nr_cpu_ids > 1) { void __percpu *res = __alloc_percpu(sizeof(struct xt_counters), sizeof(struct xt_counters)); if (res == NULL) - return (u64) -ENOMEM; + return -ENOMEM; - return (u64) (__force unsigned long) res; + return (__force unsigned long) res; } return 0; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index c75211528991..4cfcc22f7430 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -511,11 +511,13 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) { struct xt_entry_target *t; struct xt_target *target; + unsigned long pcnt; int ret; - e->counters.pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(e->counters.pcnt)) + pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(pcnt)) return -ENOMEM; + e->counters.pcnt = pcnt; t = arpt_get_target(e); target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 2ba158f2cb72..a98173d1ea97 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -653,10 +653,12 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; + unsigned long pcnt; - e->counters.pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(e->counters.pcnt)) + pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(pcnt)) return -ENOMEM; + e->counters.pcnt = pcnt; j = 0; mtpar.net = net; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index c26bed92f523..bb1b5453a7a1 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -666,10 +666,12 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; + unsigned long pcnt; - e->counters.pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(e->counters.pcnt)) + pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(pcnt)) return -ENOMEM; + e->counters.pcnt = pcnt; j = 0; mtpar.net = net; From a359a437fbc6bb08aa9cc8e25ef4ac3b77ca727b Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Sun, 3 Dec 2017 12:12:45 -0800 Subject: [PATCH 42/79] netfilter: nfnetlink_cthelper: Add missing permission checks commit 4b380c42f7d00a395feede754f0bc2292eebe6e5 upstream. The capability check in nfnetlink_rcv() verifies that the caller has CAP_NET_ADMIN in the namespace that "owns" the netlink socket. However, nfnl_cthelper_list is shared by all net namespaces on the system. An unprivileged user can create user and net namespaces in which he holds CAP_NET_ADMIN to bypass the netlink_net_capable() check: $ nfct helper list nfct v1.4.4: netlink error: Operation not permitted $ vpnns -- nfct helper list { .name = ftp, .queuenum = 0, .l3protonum = 2, .l4protonum = 6, .priv_data_len = 24, .status = enabled, }; Add capable() checks in nfnetlink_cthelper, as this is cleaner than trying to generalize the solution. Signed-off-by: Kevin Cernekee Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nfnetlink_cthelper.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 8d34a488efc0..ac143ae4f7b6 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -392,6 +393,9 @@ nfnl_cthelper_new(struct sock *nfnl, struct sk_buff *skb, struct nfnl_cthelper *nlcth; int ret = 0; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE]) return -EINVAL; @@ -595,6 +599,9 @@ nfnl_cthelper_get(struct sock *nfnl, struct sk_buff *skb, struct nfnl_cthelper *nlcth; bool tuple_set = false; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_cthelper_dump_table, @@ -661,6 +668,9 @@ nfnl_cthelper_del(struct sock *nfnl, struct sk_buff *skb, struct nfnl_cthelper *nlcth, *n; int j = 0, ret; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (tb[NFCTH_NAME]) helper_name = nla_data(tb[NFCTH_NAME]); From d01ceb4722cd8d64176272434fe332b596750d9c Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Tue, 5 Dec 2017 15:42:41 -0800 Subject: [PATCH 43/79] netfilter: xt_osf: Add missing permission checks commit 916a27901de01446bcf57ecca4783f6cff493309 upstream. The capability check in nfnetlink_rcv() verifies that the caller has CAP_NET_ADMIN in the namespace that "owns" the netlink socket. However, xt_osf_fingers is shared by all net namespaces on the system. An unprivileged user can create user and net namespaces in which he holds CAP_NET_ADMIN to bypass the netlink_net_capable() check: vpnns -- nfnl_osf -f /tmp/pf.os vpnns -- nfnl_osf -f /tmp/pf.os -d These non-root operations successfully modify the systemwide OS fingerprint list. Add new capable() checks so that they can't. Signed-off-by: Kevin Cernekee Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- net/netfilter/xt_osf.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index df8801e02a32..7eae0d0af89a 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -69,6 +70,9 @@ static int xt_osf_add_callback(struct sock *ctnl, struct sk_buff *skb, struct xt_osf_finger *kf = NULL, *sf; int err = 0; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; @@ -112,6 +116,9 @@ static int xt_osf_remove_callback(struct sock *ctnl, struct sk_buff *skb, struct xt_osf_finger *sf; int err = -ENOENT; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; From 8e8224de0e1848a5905ebda018969e2ce72258ca Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 21 Jun 2017 14:34:15 +0200 Subject: [PATCH 44/79] ext2: Don't clear SGID when inheriting ACLs commit a992f2d38e4ce17b8c7d1f7f67b2de0eebdea069 upstream. When new directory 'DIR1' is created in a directory 'DIR0' with SGID bit set, DIR1 is expected to have SGID bit set (and owning group equal to the owning group of 'DIR0'). However when 'DIR0' also has some default ACLs that 'DIR1' inherits, setting these ACLs will result in SGID bit on 'DIR1' to get cleared if user is not member of the owning group. Fix the problem by creating __ext2_set_acl() function that does not call posix_acl_update_mode() and use it when inheriting ACLs. That prevents SGID bit clearing and the mode has been properly set by posix_acl_create() anyway. Fixes: 073931017b49d9458aa351605b43a7e34598caef CC: stable@vger.kernel.org CC: linux-ext4@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/ext2/acl.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index d6aeb84e90b6..d882d873c5a3 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -178,11 +178,8 @@ ext2_get_acl(struct inode *inode, int type) return acl; } -/* - * inode->i_mutex: down - */ -int -ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +static int +__ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int name_index; void *value = NULL; @@ -192,13 +189,6 @@ ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) switch(type) { case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) - return error; - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - } break; case ACL_TYPE_DEFAULT: @@ -224,6 +214,24 @@ ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) return error; } +/* + * inode->i_mutex: down + */ +int +ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int error; + + if (type == ACL_TYPE_ACCESS && acl) { + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) + return error; + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + } + return __ext2_set_acl(inode, acl, type); +} + /* * Initialize the ACLs of a new inode. Called from ext2_new_inode. * @@ -241,12 +249,12 @@ ext2_init_acl(struct inode *inode, struct inode *dir) return error; if (default_acl) { - error = ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + error = __ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); posix_acl_release(default_acl); } if (acl) { if (!error) - error = ext2_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = __ext2_set_acl(inode, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); } return error; From ac125d89f8e980f100f63634a81b84fed7e997cf Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 22 Jun 2017 16:47:34 -0400 Subject: [PATCH 45/79] reiserfs: fix race in prealloc discard commit 08db141b5313ac2f64b844fb5725b8d81744b417 upstream. The main loop in __discard_prealloc is protected by the reiserfs write lock which is dropped across schedules like the BKL it replaced. The problem is that it checks the value, calls a routine that schedules, and then adjusts the state. As a result, two threads that are calling reiserfs_prealloc_discard at the same time can race when one calls reiserfs_free_prealloc_block, the lock is dropped, and the other calls reiserfs_free_prealloc_block with the same block number. In the right circumstances, it can cause the prealloc count to go negative. Signed-off-by: Jeff Mahoney Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/reiserfs/bitmap.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index dc198bc64c61..73705d4bb069 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -513,9 +513,17 @@ static void __discard_prealloc(struct reiserfs_transaction_handle *th, "inode has negative prealloc blocks count."); #endif while (ei->i_prealloc_count > 0) { - reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block); - ei->i_prealloc_block++; + b_blocknr_t block_to_free; + + /* + * reiserfs_free_prealloc_block can drop the write lock, + * which could allow another caller to free the same block. + * We can protect against it by modifying the prealloc + * state before calling it. + */ + block_to_free = ei->i_prealloc_block++; ei->i_prealloc_count--; + reiserfs_free_prealloc_block(th, inode, block_to_free); dirty = 1; } if (dirty) From 512e3b18531eb9526a1e695c19043f78e3a0abb1 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 22 Jun 2017 16:35:04 -0400 Subject: [PATCH 46/79] reiserfs: don't preallocate blocks for extended attributes commit 54930dfeb46e978b447af0fb8ab4e181c1bf9d7a upstream. Most extended attributes will fit in a single block. More importantly, we drop the reference to the inode while holding the transaction open so the preallocated blocks aren't released. As a result, the inode may be evicted before it's removed from the transaction's prealloc list which can cause memory corruption. Signed-off-by: Jeff Mahoney Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/reiserfs/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index 73705d4bb069..edc8ef78b63f 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -1136,7 +1136,7 @@ static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint) hint->prealloc_size = 0; if (!hint->formatted_node && hint->preallocate) { - if (S_ISREG(hint->inode->i_mode) + if (S_ISREG(hint->inode->i_mode) && !IS_PRIVATE(hint->inode) && hint->inode->i_size >= REISERFS_SB(hint->th->t_super)->s_alloc_options. preallocmin * hint->inode->i_sb->s_blocksize) From e22c8378505e1354b4346c7ecccb446b7c505deb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 22 Jun 2017 09:32:49 +0200 Subject: [PATCH 47/79] reiserfs: Don't clear SGID when inheriting ACLs commit 6883cd7f68245e43e91e5ee583b7550abf14523f upstream. When new directory 'DIR1' is created in a directory 'DIR0' with SGID bit set, DIR1 is expected to have SGID bit set (and owning group equal to the owning group of 'DIR0'). However when 'DIR0' also has some default ACLs that 'DIR1' inherits, setting these ACLs will result in SGID bit on 'DIR1' to get cleared if user is not member of the owning group. Fix the problem by moving posix_acl_update_mode() out of __reiserfs_set_acl() into reiserfs_set_acl(). That way the function will not be called when inheriting ACLs which is what we want as it prevents SGID bit clearing and the mode has been properly set by posix_acl_create() anyway. Fixes: 073931017b49d9458aa351605b43a7e34598caef CC: reiserfs-devel@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/reiserfs/xattr_acl.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 9b1824f35501..91b036902a17 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -37,7 +37,14 @@ reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) error = journal_begin(&th, inode->i_sb, jcreate_blocks); reiserfs_write_unlock(inode->i_sb); if (error == 0) { + if (type == ACL_TYPE_ACCESS && acl) { + error = posix_acl_update_mode(inode, &inode->i_mode, + &acl); + if (error) + goto unlock; + } error = __reiserfs_set_acl(&th, inode, type, acl); +unlock: reiserfs_write_lock(inode->i_sb); error2 = journal_end(&th); reiserfs_write_unlock(inode->i_sb); @@ -245,11 +252,6 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; - if (acl) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) - return error; - } break; case ACL_TYPE_DEFAULT: name = POSIX_ACL_XATTR_DEFAULT; From ad941c49a40237b77c543469e9e8daee0482a851 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 13 Jun 2017 13:35:51 +0200 Subject: [PATCH 48/79] fs/fcntl: f_setown, avoid undefined behaviour commit fc3dc67471461c0efcb1ed22fb7595121d65fad9 upstream. fcntl(0, F_SETOWN, 0x80000000) triggers: UBSAN: Undefined behaviour in fs/fcntl.c:118:7 negation of -2147483648 cannot be represented in type 'int': CPU: 1 PID: 18261 Comm: syz-executor Not tainted 4.8.1-0-syzkaller #1 ... Call Trace: ... [] ? f_setown+0x1d8/0x200 [] ? SyS_fcntl+0x999/0xf30 [] ? entry_SYSCALL_64_fastpath+0x23/0xc1 Fix that by checking the arg parameter properly (against INT_MAX) before "who = -who". And return immediatelly with -EINVAL in case it is wrong. Note that according to POSIX we can return EINVAL: http://pubs.opengroup.org/onlinepubs/9699919799/functions/fcntl.html [EINVAL] The cmd argument is F_SETOWN and the value of the argument is not valid as a process or process group identifier. [v2] returns an error, v1 used to fail silently [v3] implement proper check for the bad value INT_MIN Signed-off-by: Jiri Slaby Cc: Jeff Layton Cc: "J. Bruce Fields" Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Jeff Layton Signed-off-by: Greg Kroah-Hartman --- fs/fcntl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/fcntl.c b/fs/fcntl.c index 62376451bbce..5df914943d96 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -113,6 +113,10 @@ void f_setown(struct file *filp, unsigned long arg, int force) int who = arg; type = PIDTYPE_PID; if (who < 0) { + /* avoid overflow below */ + if (who == INT_MIN) + return; + type = PIDTYPE_PGID; who = -who; } From 509ed20926aa67aa9933b13f96002dc86b71ca60 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 9 Oct 2017 13:33:19 +0200 Subject: [PATCH 49/79] scsi: libiscsi: fix shifting of DID_REQUEUE host byte commit eef9ffdf9cd39b2986367bc8395e2772bc1284ba upstream. The SCSI host byte should be shifted left by 16 in order to have scsi_decide_disposition() do the right thing (.i.e. requeue the command). Signed-off-by: Johannes Thumshirn Fixes: 661134ad3765 ("[SCSI] libiscsi, bnx2i: make bound ep check common") Cc: Lee Duncan Cc: Hannes Reinecke Cc: Bart Van Assche Cc: Chris Leech Acked-by: Lee Duncan Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/libiscsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index c1ccf1ee99ea..efce04df2109 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -1727,7 +1727,7 @@ int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc) if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx)) { reason = FAILURE_SESSION_IN_RECOVERY; - sc->result = DID_REQUEUE; + sc->result = DID_REQUEUE << 16; goto fault; } From 5f1dc06152e1683a9277a2836517d6574757dd1f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 24 Jan 2018 15:28:17 +0100 Subject: [PATCH 50/79] Revert "module: Add retpoline tag to VERMAGIC" commit 5132ede0fe8092b043dae09a7cc32b8ae7272baa upstream. This reverts commit 6cfb521ac0d5b97470883ff9b7facae264b7ab12. Turns out distros do not want to make retpoline as part of their "ABI", so this patch should not have been merged. Sorry Andi, this was my fault, I suggested it when your original patch was the "correct" way of doing this instead. Reported-by: Jiri Kosina Fixes: 6cfb521ac0d5 ("module: Add retpoline tag to VERMAGIC") Acked-by: Andi Kleen Cc: Thomas Gleixner Cc: David Woodhouse Cc: rusty@rustcorp.com.au Cc: arjan.van.de.ven@intel.com Cc: jeyu@kernel.org Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/vermagic.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index a3d04934aa96..6f8fbcf10dfb 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -24,16 +24,10 @@ #ifndef MODULE_ARCH_VERMAGIC #define MODULE_ARCH_VERMAGIC "" #endif -#ifdef RETPOLINE -#define MODULE_VERMAGIC_RETPOLINE "retpoline " -#else -#define MODULE_VERMAGIC_RETPOLINE "" -#endif #define VERMAGIC_STRING \ UTS_RELEASE " " \ MODULE_VERMAGIC_SMP MODULE_VERMAGIC_PREEMPT \ MODULE_VERMAGIC_MODULE_UNLOAD MODULE_VERMAGIC_MODVERSIONS \ - MODULE_ARCH_VERMAGIC \ - MODULE_VERMAGIC_RETPOLINE + MODULE_ARCH_VERMAGIC From 9af75003499c7a441e5a483220f5d2cf65861e73 Mon Sep 17 00:00:00 2001 From: Aaron Ma Date: Fri, 19 Jan 2018 09:43:39 -0800 Subject: [PATCH 51/79] Input: trackpoint - force 3 buttons if 0 button is reported commit f5d07b9e98022d50720e38aa936fc11c67868ece upstream. Lenovo introduced trackpoint compatible sticks with minimum PS/2 commands. They supposed to reply with 0x02, 0x03, or 0x04 in response to the "Read Extended ID" command, so we would know not to try certain extended commands. Unfortunately even some trackpoints reporting the original IBM version (0x01 firmware 0x0e) now respond with incorrect data to the "Get Extended Buttons" command: thinkpad_acpi: ThinkPad BIOS R0DET87W (1.87 ), EC unknown thinkpad_acpi: Lenovo ThinkPad E470, model 20H1004SGE psmouse serio2: trackpoint: IBM TrackPoint firmware: 0x0e, buttons: 0/0 Since there are no trackpoints without buttons, let's assume the trackpoint has 3 buttons when we get 0 response to the extended buttons query. Signed-off-by: Aaron Ma Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=196253 Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/trackpoint.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/input/mouse/trackpoint.c b/drivers/input/mouse/trackpoint.c index 7e2dc5e56632..0b49f29bf0da 100644 --- a/drivers/input/mouse/trackpoint.c +++ b/drivers/input/mouse/trackpoint.c @@ -383,6 +383,9 @@ int trackpoint_detect(struct psmouse *psmouse, bool set_properties) if (trackpoint_read(&psmouse->ps2dev, TP_EXT_BTN, &button_info)) { psmouse_warn(psmouse, "failed to get extended button data, assuming 3 buttons\n"); button_info = 0x33; + } else if (!button_info) { + psmouse_warn(psmouse, "got 0 in extended button data, assuming 3 buttons\n"); + button_info = 0x33; } psmouse->private = kzalloc(sizeof(struct trackpoint_data), GFP_KERNEL); From 18e905a839ed54b135b00e9d9c9dee2ed6908a08 Mon Sep 17 00:00:00 2001 From: Andrew Goodbody Date: Tue, 2 Feb 2016 17:36:39 +0000 Subject: [PATCH 52/79] usb: usbip: Fix possible deadlocks reported by lockdep commit 21619792d1eca7e772ca190ba68588e57f29595b upstream. Change spin_lock calls to spin_lock_irqsave to prevent attmpted recursive lock taking in interrupt context. This patch fixes Bug 109351 https://bugzilla.kernel.org/show_bug.cgi?id=109351 Signed-off-by: Andrew Goodbody Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/usbip_event.c | 5 +- drivers/usb/usbip/vhci_hcd.c | 88 +++++++++++++++++++-------------- drivers/usb/usbip/vhci_rx.c | 30 ++++++----- drivers/usb/usbip/vhci_sysfs.c | 19 ++++--- drivers/usb/usbip/vhci_tx.c | 14 +++--- 5 files changed, 91 insertions(+), 65 deletions(-) diff --git a/drivers/usb/usbip/usbip_event.c b/drivers/usb/usbip/usbip_event.c index 64933b993d7a..2580a32bcdff 100644 --- a/drivers/usb/usbip/usbip_event.c +++ b/drivers/usb/usbip/usbip_event.c @@ -117,11 +117,12 @@ EXPORT_SYMBOL_GPL(usbip_event_add); int usbip_event_happened(struct usbip_device *ud) { int happened = 0; + unsigned long flags; - spin_lock(&ud->lock); + spin_lock_irqsave(&ud->lock, flags); if (ud->event != 0) happened = 1; - spin_unlock(&ud->lock); + spin_unlock_irqrestore(&ud->lock, flags); return happened; } diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c index f9af04d7f02f..0aaa8e524afd 100644 --- a/drivers/usb/usbip/vhci_hcd.c +++ b/drivers/usb/usbip/vhci_hcd.c @@ -121,9 +121,11 @@ static void dump_port_status_diff(u32 prev_status, u32 new_status) void rh_port_connect(int rhport, enum usb_device_speed speed) { + unsigned long flags; + usbip_dbg_vhci_rh("rh_port_connect %d\n", rhport); - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); the_controller->port_status[rhport] |= USB_PORT_STAT_CONNECTION | (1 << USB_PORT_FEAT_C_CONNECTION); @@ -139,22 +141,24 @@ void rh_port_connect(int rhport, enum usb_device_speed speed) break; } - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); usb_hcd_poll_rh_status(vhci_to_hcd(the_controller)); } static void rh_port_disconnect(int rhport) { + unsigned long flags; + usbip_dbg_vhci_rh("rh_port_disconnect %d\n", rhport); - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); the_controller->port_status[rhport] &= ~USB_PORT_STAT_CONNECTION; the_controller->port_status[rhport] |= (1 << USB_PORT_FEAT_C_CONNECTION); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); usb_hcd_poll_rh_status(vhci_to_hcd(the_controller)); } @@ -182,13 +186,14 @@ static int vhci_hub_status(struct usb_hcd *hcd, char *buf) int retval; int rhport; int changed = 0; + unsigned long flags; retval = DIV_ROUND_UP(VHCI_NPORTS + 1, 8); memset(buf, 0, retval); vhci = hcd_to_vhci(hcd); - spin_lock(&vhci->lock); + spin_lock_irqsave(&vhci->lock, flags); if (!HCD_HW_ACCESSIBLE(hcd)) { usbip_dbg_vhci_rh("hw accessible flag not on?\n"); goto done; @@ -209,7 +214,7 @@ static int vhci_hub_status(struct usb_hcd *hcd, char *buf) usb_hcd_resume_root_hub(hcd); done: - spin_unlock(&vhci->lock); + spin_unlock_irqrestore(&vhci->lock, flags); return changed ? retval : 0; } @@ -236,6 +241,7 @@ static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, struct vhci_hcd *dum; int retval = 0; int rhport; + unsigned long flags; u32 prev_port_status[VHCI_NPORTS]; @@ -254,7 +260,7 @@ static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, dum = hcd_to_vhci(hcd); - spin_lock(&dum->lock); + spin_lock_irqsave(&dum->lock, flags); /* store old status and compare now and old later */ if (usbip_dbg_flag_vhci_rh) { @@ -408,7 +414,7 @@ static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, } usbip_dbg_vhci_rh(" bye\n"); - spin_unlock(&dum->lock); + spin_unlock_irqrestore(&dum->lock, flags); return retval; } @@ -431,6 +437,7 @@ static void vhci_tx_urb(struct urb *urb) { struct vhci_device *vdev = get_vdev(urb->dev); struct vhci_priv *priv; + unsigned long flags; if (!vdev) { pr_err("could not get virtual device"); @@ -443,7 +450,7 @@ static void vhci_tx_urb(struct urb *urb) return; } - spin_lock(&vdev->priv_lock); + spin_lock_irqsave(&vdev->priv_lock, flags); priv->seqnum = atomic_inc_return(&the_controller->seqnum); if (priv->seqnum == 0xffff) @@ -457,7 +464,7 @@ static void vhci_tx_urb(struct urb *urb) list_add_tail(&priv->list, &vdev->priv_tx); wake_up(&vdev->waitq_tx); - spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vdev->priv_lock, flags); } static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, @@ -466,15 +473,16 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, struct device *dev = &urb->dev->dev; int ret = 0; struct vhci_device *vdev; + unsigned long flags; /* patch to usb_sg_init() is in 2.5.60 */ BUG_ON(!urb->transfer_buffer && urb->transfer_buffer_length); - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); if (urb->status != -EINPROGRESS) { dev_err(dev, "URB already unlinked!, status %d\n", urb->status); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); return urb->status; } @@ -486,7 +494,7 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, vdev->ud.status == VDEV_ST_ERROR) { dev_err(dev, "enqueue for inactive port %d\n", vdev->rhport); spin_unlock(&vdev->ud.lock); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); return -ENODEV; } spin_unlock(&vdev->ud.lock); @@ -559,14 +567,14 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, out: vhci_tx_urb(urb); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); return 0; no_need_xmit: usb_hcd_unlink_urb_from_ep(hcd, urb); no_need_unlink: - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); if (!ret) usb_hcd_giveback_urb(vhci_to_hcd(the_controller), urb, urb->status); @@ -623,14 +631,15 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) { struct vhci_priv *priv; struct vhci_device *vdev; + unsigned long flags; - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); priv = urb->hcpriv; if (!priv) { /* URB was never linked! or will be soon given back by * vhci_rx. */ - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); return -EIDRM; } @@ -639,7 +648,7 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) ret = usb_hcd_check_unlink_urb(hcd, urb, status); if (ret) { - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); return ret; } } @@ -664,10 +673,10 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) */ usb_hcd_unlink_urb_from_ep(hcd, urb); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); usb_hcd_giveback_urb(vhci_to_hcd(the_controller), urb, urb->status); - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); } else { /* tcp connection is alive */ @@ -679,7 +688,7 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) unlink = kzalloc(sizeof(struct vhci_unlink), GFP_ATOMIC); if (!unlink) { spin_unlock(&vdev->priv_lock); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); usbip_event_add(&vdev->ud, VDEV_EVENT_ERROR_MALLOC); return -ENOMEM; } @@ -698,7 +707,7 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) spin_unlock(&vdev->priv_lock); } - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); usbip_dbg_vhci_hc("leave\n"); return 0; @@ -707,8 +716,9 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) static void vhci_device_unlink_cleanup(struct vhci_device *vdev) { struct vhci_unlink *unlink, *tmp; + unsigned long flags; - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); spin_lock(&vdev->priv_lock); list_for_each_entry_safe(unlink, tmp, &vdev->unlink_tx, list) { @@ -742,19 +752,19 @@ static void vhci_device_unlink_cleanup(struct vhci_device *vdev) list_del(&unlink->list); spin_unlock(&vdev->priv_lock); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); usb_hcd_giveback_urb(vhci_to_hcd(the_controller), urb, urb->status); - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); spin_lock(&vdev->priv_lock); kfree(unlink); } spin_unlock(&vdev->priv_lock); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); } /* @@ -821,8 +831,9 @@ static void vhci_shutdown_connection(struct usbip_device *ud) static void vhci_device_reset(struct usbip_device *ud) { struct vhci_device *vdev = container_of(ud, struct vhci_device, ud); + unsigned long flags; - spin_lock(&ud->lock); + spin_lock_irqsave(&ud->lock, flags); vdev->speed = 0; vdev->devid = 0; @@ -836,14 +847,16 @@ static void vhci_device_reset(struct usbip_device *ud) } ud->status = VDEV_ST_NULL; - spin_unlock(&ud->lock); + spin_unlock_irqrestore(&ud->lock, flags); } static void vhci_device_unusable(struct usbip_device *ud) { - spin_lock(&ud->lock); + unsigned long flags; + + spin_lock_irqsave(&ud->lock, flags); ud->status = VDEV_ST_ERROR; - spin_unlock(&ud->lock); + spin_unlock_irqrestore(&ud->lock, flags); } static void vhci_device_init(struct vhci_device *vdev) @@ -933,12 +946,13 @@ static int vhci_get_frame_number(struct usb_hcd *hcd) static int vhci_bus_suspend(struct usb_hcd *hcd) { struct vhci_hcd *vhci = hcd_to_vhci(hcd); + unsigned long flags; dev_dbg(&hcd->self.root_hub->dev, "%s\n", __func__); - spin_lock(&vhci->lock); + spin_lock_irqsave(&vhci->lock, flags); hcd->state = HC_STATE_SUSPENDED; - spin_unlock(&vhci->lock); + spin_unlock_irqrestore(&vhci->lock, flags); return 0; } @@ -947,15 +961,16 @@ static int vhci_bus_resume(struct usb_hcd *hcd) { struct vhci_hcd *vhci = hcd_to_vhci(hcd); int rc = 0; + unsigned long flags; dev_dbg(&hcd->self.root_hub->dev, "%s\n", __func__); - spin_lock(&vhci->lock); + spin_lock_irqsave(&vhci->lock, flags); if (!HCD_HW_ACCESSIBLE(hcd)) rc = -ESHUTDOWN; else hcd->state = HC_STATE_RUNNING; - spin_unlock(&vhci->lock); + spin_unlock_irqrestore(&vhci->lock, flags); return rc; } @@ -1053,17 +1068,18 @@ static int vhci_hcd_suspend(struct platform_device *pdev, pm_message_t state) int rhport = 0; int connected = 0; int ret = 0; + unsigned long flags; hcd = platform_get_drvdata(pdev); - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); for (rhport = 0; rhport < VHCI_NPORTS; rhport++) if (the_controller->port_status[rhport] & USB_PORT_STAT_CONNECTION) connected += 1; - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); if (connected > 0) { dev_info(&pdev->dev, diff --git a/drivers/usb/usbip/vhci_rx.c b/drivers/usb/usbip/vhci_rx.c index bc4eb0855314..323aa7789989 100644 --- a/drivers/usb/usbip/vhci_rx.c +++ b/drivers/usb/usbip/vhci_rx.c @@ -71,10 +71,11 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev, { struct usbip_device *ud = &vdev->ud; struct urb *urb; + unsigned long flags; - spin_lock(&vdev->priv_lock); + spin_lock_irqsave(&vdev->priv_lock, flags); urb = pickup_urb_and_free_priv(vdev, pdu->base.seqnum); - spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vdev->priv_lock, flags); if (!urb) { pr_err("cannot find a urb of seqnum %u max seqnum %d\n", @@ -103,9 +104,9 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev, usbip_dbg_vhci_rx("now giveback urb %u\n", pdu->base.seqnum); - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); usb_hcd_unlink_urb_from_ep(vhci_to_hcd(the_controller), urb); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); usb_hcd_giveback_urb(vhci_to_hcd(the_controller), urb, urb->status); @@ -116,8 +117,9 @@ static struct vhci_unlink *dequeue_pending_unlink(struct vhci_device *vdev, struct usbip_header *pdu) { struct vhci_unlink *unlink, *tmp; + unsigned long flags; - spin_lock(&vdev->priv_lock); + spin_lock_irqsave(&vdev->priv_lock, flags); list_for_each_entry_safe(unlink, tmp, &vdev->unlink_rx, list) { pr_info("unlink->seqnum %lu\n", unlink->seqnum); @@ -126,12 +128,12 @@ static struct vhci_unlink *dequeue_pending_unlink(struct vhci_device *vdev, unlink->seqnum); list_del(&unlink->list); - spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vdev->priv_lock, flags); return unlink; } } - spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vdev->priv_lock, flags); return NULL; } @@ -141,6 +143,7 @@ static void vhci_recv_ret_unlink(struct vhci_device *vdev, { struct vhci_unlink *unlink; struct urb *urb; + unsigned long flags; usbip_dump_header(pdu); @@ -151,9 +154,9 @@ static void vhci_recv_ret_unlink(struct vhci_device *vdev, return; } - spin_lock(&vdev->priv_lock); + spin_lock_irqsave(&vdev->priv_lock, flags); urb = pickup_urb_and_free_priv(vdev, unlink->unlink_seqnum); - spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vdev->priv_lock, flags); if (!urb) { /* @@ -170,9 +173,9 @@ static void vhci_recv_ret_unlink(struct vhci_device *vdev, urb->status = pdu->u.ret_unlink.status; pr_info("urb->status %d\n", urb->status); - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); usb_hcd_unlink_urb_from_ep(vhci_to_hcd(the_controller), urb); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); usb_hcd_giveback_urb(vhci_to_hcd(the_controller), urb, urb->status); @@ -184,10 +187,11 @@ static void vhci_recv_ret_unlink(struct vhci_device *vdev, static int vhci_priv_tx_empty(struct vhci_device *vdev) { int empty = 0; + unsigned long flags; - spin_lock(&vdev->priv_lock); + spin_lock_irqsave(&vdev->priv_lock, flags); empty = list_empty(&vdev->priv_rx); - spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vdev->priv_lock, flags); return empty; } diff --git a/drivers/usb/usbip/vhci_sysfs.c b/drivers/usb/usbip/vhci_sysfs.c index 84c21c4ccf46..1c7f41a65565 100644 --- a/drivers/usb/usbip/vhci_sysfs.c +++ b/drivers/usb/usbip/vhci_sysfs.c @@ -32,10 +32,11 @@ static ssize_t status_show(struct device *dev, struct device_attribute *attr, { char *s = out; int i = 0; + unsigned long flags; BUG_ON(!the_controller || !out); - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); /* * output example: @@ -74,7 +75,7 @@ static ssize_t status_show(struct device *dev, struct device_attribute *attr, spin_unlock(&vdev->ud.lock); } - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); return out - s; } @@ -84,11 +85,12 @@ static DEVICE_ATTR_RO(status); static int vhci_port_disconnect(__u32 rhport) { struct vhci_device *vdev; + unsigned long flags; usbip_dbg_vhci_sysfs("enter\n"); /* lock */ - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); vdev = port_to_vdev(rhport); @@ -98,14 +100,14 @@ static int vhci_port_disconnect(__u32 rhport) /* unlock */ spin_unlock(&vdev->ud.lock); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); return -EINVAL; } /* unlock */ spin_unlock(&vdev->ud.lock); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); usbip_event_add(&vdev->ud, VDEV_EVENT_DOWN); @@ -181,6 +183,7 @@ static ssize_t store_attach(struct device *dev, struct device_attribute *attr, int sockfd = 0; __u32 rhport = 0, devid = 0, speed = 0; int err; + unsigned long flags; /* * @rhport: port number of vhci_hcd @@ -206,14 +209,14 @@ static ssize_t store_attach(struct device *dev, struct device_attribute *attr, /* now need lock until setting vdev status as used */ /* begin a lock */ - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); vdev = port_to_vdev(rhport); spin_lock(&vdev->ud.lock); if (vdev->ud.status != VDEV_ST_NULL) { /* end of the lock */ spin_unlock(&vdev->ud.lock); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); sockfd_put(socket); @@ -232,7 +235,7 @@ static ssize_t store_attach(struct device *dev, struct device_attribute *attr, vdev->ud.status = VDEV_ST_NOTASSIGNED; spin_unlock(&vdev->ud.lock); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); /* end the lock */ vdev->ud.tcp_rx = kthread_get_run(vhci_rx_loop, &vdev->ud, "vhci_rx"); diff --git a/drivers/usb/usbip/vhci_tx.c b/drivers/usb/usbip/vhci_tx.c index 3c5796c8633a..a9a663a578b6 100644 --- a/drivers/usb/usbip/vhci_tx.c +++ b/drivers/usb/usbip/vhci_tx.c @@ -47,16 +47,17 @@ static void setup_cmd_submit_pdu(struct usbip_header *pdup, struct urb *urb) static struct vhci_priv *dequeue_from_priv_tx(struct vhci_device *vdev) { struct vhci_priv *priv, *tmp; + unsigned long flags; - spin_lock(&vdev->priv_lock); + spin_lock_irqsave(&vdev->priv_lock, flags); list_for_each_entry_safe(priv, tmp, &vdev->priv_tx, list) { list_move_tail(&priv->list, &vdev->priv_rx); - spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vdev->priv_lock, flags); return priv; } - spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vdev->priv_lock, flags); return NULL; } @@ -137,16 +138,17 @@ static int vhci_send_cmd_submit(struct vhci_device *vdev) static struct vhci_unlink *dequeue_from_unlink_tx(struct vhci_device *vdev) { struct vhci_unlink *unlink, *tmp; + unsigned long flags; - spin_lock(&vdev->priv_lock); + spin_lock_irqsave(&vdev->priv_lock, flags); list_for_each_entry_safe(unlink, tmp, &vdev->unlink_tx, list) { list_move_tail(&unlink->list, &vdev->unlink_rx); - spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vdev->priv_lock, flags); return unlink; } - spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vdev->priv_lock, flags); return NULL; } From 80e733a9b37fb6b40351bf1924d5a90d89c375ae Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 7 Dec 2017 14:16:47 -0700 Subject: [PATCH 53/79] usbip: fix stub_rx: get_pipe() to validate endpoint number commit 635f545a7e8be7596b9b2b6a43cab6bbd5a88e43 upstream. get_pipe() routine doesn't validate the input endpoint number and uses to reference ep_in and ep_out arrays. Invalid endpoint number can trigger BUG(). Range check the epnum and returning error instead of calling BUG(). Change caller stub_recv_cmd_submit() to handle the get_pipe() error return. Reported-by: Secunia Research Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/stub_rx.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c index 7de54a66044f..e617c90661b4 100644 --- a/drivers/usb/usbip/stub_rx.c +++ b/drivers/usb/usbip/stub_rx.c @@ -344,15 +344,15 @@ static int get_pipe(struct stub_device *sdev, int epnum, int dir) struct usb_host_endpoint *ep; struct usb_endpoint_descriptor *epd = NULL; + if (epnum < 0 || epnum > 15) + goto err_ret; + if (dir == USBIP_DIR_IN) ep = udev->ep_in[epnum & 0x7f]; else ep = udev->ep_out[epnum & 0x7f]; - if (!ep) { - dev_err(&sdev->interface->dev, "no such endpoint?, %d\n", - epnum); - BUG(); - } + if (!ep) + goto err_ret; epd = &ep->desc; if (usb_endpoint_xfer_control(epd)) { @@ -383,9 +383,10 @@ static int get_pipe(struct stub_device *sdev, int epnum, int dir) return usb_rcvisocpipe(udev, epnum); } +err_ret: /* NOT REACHED */ - dev_err(&sdev->interface->dev, "get pipe, epnum %d\n", epnum); - return 0; + dev_err(&sdev->udev->dev, "get pipe() invalid epnum %d\n", epnum); + return -1; } static void masking_bogus_flags(struct urb *urb) @@ -451,6 +452,9 @@ static void stub_recv_cmd_submit(struct stub_device *sdev, struct usb_device *udev = sdev->udev; int pipe = get_pipe(sdev, pdu->base.ep, pdu->base.direction); + if (pipe == -1) + return; + priv = stub_priv_alloc(sdev, pdu); if (!priv) return; From b6f826ba10dce86f74efd3c0953cb9982a3c51e2 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 7 Dec 2017 14:16:48 -0700 Subject: [PATCH 54/79] usbip: fix stub_rx: harden CMD_SUBMIT path to handle malicious input commit c6688ef9f29762e65bce325ef4acd6c675806366 upstream. Harden CMD_SUBMIT path to handle malicious input that could trigger large memory allocations. Add checks to validate transfer_buffer_length and number_of_packets to protect against bad input requesting for unbounded memory allocations. Validate early in get_pipe() and return failure. Reported-by: Secunia Research Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/stub_rx.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c index e617c90661b4..56cacb68040c 100644 --- a/drivers/usb/usbip/stub_rx.c +++ b/drivers/usb/usbip/stub_rx.c @@ -338,11 +338,13 @@ static struct stub_priv *stub_priv_alloc(struct stub_device *sdev, return priv; } -static int get_pipe(struct stub_device *sdev, int epnum, int dir) +static int get_pipe(struct stub_device *sdev, struct usbip_header *pdu) { struct usb_device *udev = sdev->udev; struct usb_host_endpoint *ep; struct usb_endpoint_descriptor *epd = NULL; + int epnum = pdu->base.ep; + int dir = pdu->base.direction; if (epnum < 0 || epnum > 15) goto err_ret; @@ -355,6 +357,7 @@ static int get_pipe(struct stub_device *sdev, int epnum, int dir) goto err_ret; epd = &ep->desc; + if (usb_endpoint_xfer_control(epd)) { if (dir == USBIP_DIR_OUT) return usb_sndctrlpipe(udev, epnum); @@ -377,6 +380,27 @@ static int get_pipe(struct stub_device *sdev, int epnum, int dir) } if (usb_endpoint_xfer_isoc(epd)) { + /* validate packet size and number of packets */ + unsigned int maxp, packets, bytes; + +#define USB_EP_MAXP_MULT_SHIFT 11 +#define USB_EP_MAXP_MULT_MASK (3 << USB_EP_MAXP_MULT_SHIFT) +#define USB_EP_MAXP_MULT(m) \ + (((m) & USB_EP_MAXP_MULT_MASK) >> USB_EP_MAXP_MULT_SHIFT) + + maxp = usb_endpoint_maxp(epd); + maxp *= (USB_EP_MAXP_MULT( + __le16_to_cpu(epd->wMaxPacketSize)) + 1); + bytes = pdu->u.cmd_submit.transfer_buffer_length; + packets = DIV_ROUND_UP(bytes, maxp); + + if (pdu->u.cmd_submit.number_of_packets < 0 || + pdu->u.cmd_submit.number_of_packets > packets) { + dev_err(&sdev->udev->dev, + "CMD_SUBMIT: isoc invalid num packets %d\n", + pdu->u.cmd_submit.number_of_packets); + return -1; + } if (dir == USBIP_DIR_OUT) return usb_sndisocpipe(udev, epnum); else @@ -385,7 +409,7 @@ static int get_pipe(struct stub_device *sdev, int epnum, int dir) err_ret: /* NOT REACHED */ - dev_err(&sdev->udev->dev, "get pipe() invalid epnum %d\n", epnum); + dev_err(&sdev->udev->dev, "CMD_SUBMIT: invalid epnum %d\n", epnum); return -1; } @@ -450,7 +474,7 @@ static void stub_recv_cmd_submit(struct stub_device *sdev, struct stub_priv *priv; struct usbip_device *ud = &sdev->ud; struct usb_device *udev = sdev->udev; - int pipe = get_pipe(sdev, pdu->base.ep, pdu->base.direction); + int pipe = get_pipe(sdev, pdu); if (pipe == -1) return; From 04e7c734e33c53ca2f2b8d027d9f7251f12bb206 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Fri, 15 Dec 2017 10:50:09 -0700 Subject: [PATCH 55/79] usbip: prevent leaking socket pointer address in messages commit 90120d15f4c397272aaf41077960a157fc4212bf upstream. usbip driver is leaking socket pointer address in messages. Remove the messages that aren't useful and print sockfd in the ones that are useful for debugging. Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/stub_dev.c | 3 +-- drivers/usb/usbip/usbip_common.c | 15 ++++----------- drivers/usb/usbip/vhci_hcd.c | 2 +- 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/drivers/usb/usbip/stub_dev.c b/drivers/usb/usbip/stub_dev.c index a3ec49bdc1e6..ec38370ffcab 100644 --- a/drivers/usb/usbip/stub_dev.c +++ b/drivers/usb/usbip/stub_dev.c @@ -163,8 +163,7 @@ static void stub_shutdown_connection(struct usbip_device *ud) * step 1? */ if (ud->tcp_socket) { - dev_dbg(&sdev->udev->dev, "shutdown tcp_socket %p\n", - ud->tcp_socket); + dev_dbg(&sdev->udev->dev, "shutdown sockfd %d\n", ud->sockfd); kernel_sock_shutdown(ud->tcp_socket, SHUT_RDWR); } diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c index 9752b93f754e..1838f1b2c2fa 100644 --- a/drivers/usb/usbip/usbip_common.c +++ b/drivers/usb/usbip/usbip_common.c @@ -317,18 +317,14 @@ int usbip_recv(struct socket *sock, void *buf, int size) struct msghdr msg; struct kvec iov; int total = 0; - /* for blocks of if (usbip_dbg_flag_xmit) */ char *bp = buf; int osize = size; - usbip_dbg_xmit("enter\n"); - - if (!sock || !buf || !size) { - pr_err("invalid arg, sock %p buff %p size %d\n", sock, buf, - size); + if (!sock || !buf || !size) return -EINVAL; - } + + usbip_dbg_xmit("enter\n"); do { sock->sk->sk_allocation = GFP_NOIO; @@ -341,11 +337,8 @@ int usbip_recv(struct socket *sock, void *buf, int size) msg.msg_flags = MSG_NOSIGNAL; result = kernel_recvmsg(sock, &msg, &iov, 1, size, MSG_WAITALL); - if (result <= 0) { - pr_debug("receive sock %p buf %p size %u ret %d total %d\n", - sock, buf, size, result, total); + if (result <= 0) goto err; - } size -= result; buf += result; diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c index 0aaa8e524afd..00d68945548e 100644 --- a/drivers/usb/usbip/vhci_hcd.c +++ b/drivers/usb/usbip/vhci_hcd.c @@ -778,7 +778,7 @@ static void vhci_shutdown_connection(struct usbip_device *ud) /* need this? see stub_dev.c */ if (ud->tcp_socket) { - pr_debug("shutdown tcp_socket %p\n", ud->tcp_socket); + pr_debug("shutdown sockfd %d\n", ud->sockfd); kernel_sock_shutdown(ud->tcp_socket, SHUT_RDWR); } From e1e457a49544718928282c56b69af082e1ebe405 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Sun, 20 Aug 2017 13:26:04 +0200 Subject: [PATCH 56/79] um: link vmlinux with -no-pie commit 883354afbc109c57f925ccc19840055193da0cc0 upstream. Debian's gcc defaults to pie. The global Makefile already defines the -fno-pie option. Link UML dynamic kernel image also with -no-pie to fix the build. Signed-off-by: Thomas Meyer Signed-off-by: Richard Weinberger Cc: Bernie Innocenti Signed-off-by: Greg Kroah-Hartman --- arch/um/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/Makefile b/arch/um/Makefile index e3abe6f3156d..9ccf462131c4 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -117,7 +117,7 @@ archheaders: archprepare: include/generated/user_constants.h LINK-$(CONFIG_LD_SCRIPT_STATIC) += -static -LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib +LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib $(call cc-option, -no-pie) CFLAGS_NO_HARDENING := $(call cc-option, -fno-PIC,) $(call cc-option, -fno-pic,) \ $(call cc-option, -fno-stack-protector,) \ From ed73df0b7f23c95b3243a0f4bfc40f962e61d349 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 26 Jan 2018 16:23:02 +0000 Subject: [PATCH 57/79] vsyscall: Fix permissions for emulate mode with KAISER/PTI The backport of KAISER to 4.4 turned vsyscall emulate mode into native mode. Add a vsyscall_pgprot variable to hold the correct page protections, like Borislav and Hugh did for 3.2 and 3.18. Cc: Borislav Petkov Cc: Hugh Dickins Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/vsyscall/vsyscall_64.c | 7 ++++--- arch/x86/include/asm/vsyscall.h | 1 + arch/x86/mm/kaiser.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 112178b401a1..2d359991a273 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -46,6 +46,7 @@ static enum { EMULATE, NATIVE, NONE } vsyscall_mode = #else EMULATE; #endif +unsigned long vsyscall_pgprot = __PAGE_KERNEL_VSYSCALL; static int __init vsyscall_setup(char *str) { @@ -336,11 +337,11 @@ void __init map_vsyscall(void) extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); + if (vsyscall_mode != NATIVE) + vsyscall_pgprot = __PAGE_KERNEL_VVAR; if (vsyscall_mode != NONE) __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, - vsyscall_mode == NATIVE - ? PAGE_KERNEL_VSYSCALL - : PAGE_KERNEL_VVAR); + __pgprot(vsyscall_pgprot)); BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index 4865e10dbb55..9ee85066f407 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -13,6 +13,7 @@ extern void map_vsyscall(void); */ extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); extern bool vsyscall_enabled(void); +extern unsigned long vsyscall_pgprot; #else static inline void map_vsyscall(void) {} static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 8af98513d36c..2298434f7bdb 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -345,7 +345,7 @@ void __init kaiser_init(void) if (vsyscall_enabled()) kaiser_add_user_map_early((void *)VSYSCALL_ADDR, PAGE_SIZE, - __PAGE_KERNEL_VSYSCALL); + vsyscall_pgprot); for_each_possible_cpu(cpu) { void *percpu_vaddr = __per_cpu_user_mapped_start + From 506ff217e470bdaa78477406a051c0570403307d Mon Sep 17 00:00:00 2001 From: Greg KH Date: Wed, 8 Mar 2017 19:03:44 +0100 Subject: [PATCH 58/79] eventpoll.h: add missing epoll event masks commit 7e040726850a106587485c21bdacc0bfc8a0cbed upstream. [resend due to me forgetting to cc: linux-api the first time around I posted these back on Feb 23] From: Greg Kroah-Hartman For some reason these values are not in the uapi header file, so any libc has to define it themselves. To prevent them from needing to do this, just have the kernel provide the correct values. Reported-by: Elliott Hughes Signed-off-by: Greg Hackmann Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/eventpoll.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h index bc81fb2e1f0e..6f04cb419115 100644 --- a/include/uapi/linux/eventpoll.h +++ b/include/uapi/linux/eventpoll.h @@ -26,6 +26,19 @@ #define EPOLL_CTL_DEL 2 #define EPOLL_CTL_MOD 3 +/* Epoll event masks */ +#define EPOLLIN 0x00000001 +#define EPOLLPRI 0x00000002 +#define EPOLLOUT 0x00000004 +#define EPOLLERR 0x00000008 +#define EPOLLHUP 0x00000010 +#define EPOLLRDNORM 0x00000040 +#define EPOLLRDBAND 0x00000080 +#define EPOLLWRNORM 0x00000100 +#define EPOLLWRBAND 0x00000200 +#define EPOLLMSG 0x00000400 +#define EPOLLRDHUP 0x00002000 + /* * Request the handling of system wakeup events so as to prevent system suspends * from happening while those events are being processed. From 20c0a04284496b262b172707b7dd122cdd8fe9a1 Mon Sep 17 00:00:00 2001 From: Jia Zhang Date: Tue, 23 Jan 2018 11:41:32 +0100 Subject: [PATCH 59/79] x86/microcode/intel: Extend BDW late-loading further with LLC size check commit 7e702d17ed138cf4ae7c00e8c00681ed464587c7 upstream. Commit b94b73733171 ("x86/microcode/intel: Extend BDW late-loading with a revision check") reduced the impact of erratum BDF90 for Broadwell model 79. The impact can be reduced further by checking the size of the last level cache portion per core. Tony: "The erratum says the problem only occurs on the large-cache SKUs. So we only need to avoid the update if we are on a big cache SKU that is also running old microcode." For more details, see erratum BDF90 in document #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family Specification Update) from September 2017. Fixes: b94b73733171 ("x86/microcode/intel: Extend BDW late-loading with a revision check") Signed-off-by: Jia Zhang Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Acked-by: Tony Luck Link: https://lkml.kernel.org/r/1516321542-31161-1-git-send-email-zhang.jia@linux.alibaba.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/microcode/intel.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index ee011bd7934d..2c76a1801393 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -39,6 +39,9 @@ #include #include +/* last level cache size per core */ +static int llc_size_per_core; + static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; static struct mc_saved_data { unsigned int mc_saved_count; @@ -996,12 +999,14 @@ static bool is_blacklisted(unsigned int cpu) /* * Late loading on model 79 with microcode revision less than 0x0b000021 - * may result in a system hang. This behavior is documented in item - * BDF90, #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family). + * and LLC size per core bigger than 2.5MB may result in a system hang. + * This behavior is documented in item BDF90, #334165 (Intel Xeon + * Processor E7-8800/4800 v4 Product Family). */ if (c->x86 == 6 && c->x86_model == 79 && c->x86_mask == 0x01 && + llc_size_per_core > 2621440 && c->microcode < 0x0b000021) { pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); @@ -1068,6 +1073,15 @@ static struct microcode_ops microcode_intel_ops = { .microcode_fini_cpu = microcode_fini_cpu, }; +static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c) +{ + u64 llc_size = c->x86_cache_size * 1024; + + do_div(llc_size, c->x86_max_cores); + + return (int)llc_size; +} + struct microcode_ops * __init init_intel_microcode(void) { struct cpuinfo_x86 *c = &boot_cpu_data; @@ -1078,6 +1092,8 @@ struct microcode_ops * __init init_intel_microcode(void) return NULL; } + llc_size_per_core = calc_llc_size_per_core(c); + return µcode_intel_ops; } From 360496cab53af2f1dd77dbe353b7b665bcfdc1e3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jan 2018 14:54:32 +0100 Subject: [PATCH 60/79] hrtimer: Reset hrtimer cpu base proper on CPU hotplug commit d5421ea43d30701e03cadc56a38854c36a8b4433 upstream. The hrtimer interrupt code contains a hang detection and mitigation mechanism, which prevents that a long delayed hrtimer interrupt causes a continous retriggering of interrupts which prevent the system from making progress. If a hang is detected then the timer hardware is programmed with a certain delay into the future and a flag is set in the hrtimer cpu base which prevents newly enqueued timers from reprogramming the timer hardware prior to the chosen delay. The subsequent hrtimer interrupt after the delay clears the flag and resumes normal operation. If such a hang happens in the last hrtimer interrupt before a CPU is unplugged then the hang_detected flag is set and stays that way when the CPU is plugged in again. At that point the timer hardware is not armed and it cannot be armed because the hang_detected flag is still active, so nothing clears that flag. As a consequence the CPU does not receive hrtimer interrupts and no timers expire on that CPU which results in RCU stalls and other malfunctions. Clear the flag along with some other less critical members of the hrtimer cpu base to ensure starting from a clean state when a CPU is plugged in. Thanks to Paul, Sebastian and Anna-Maria for their help to get down to the root cause of that hard to reproduce heisenbug. Once understood it's trivial and certainly justifies a brown paperbag. Fixes: 41d2e4949377 ("hrtimer: Tune hrtimer_interrupt hang logic") Reported-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Sebastian Sewior Cc: Anna-Maria Gleixner Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801261447590.2067@nanos Signed-off-by: Greg Kroah-Hartman --- kernel/time/hrtimer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1dc94768b5a3..323282e63865 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -669,7 +669,9 @@ static void hrtimer_reprogram(struct hrtimer *timer, static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { base->expires_next.tv64 = KTIME_MAX; + base->hang_detected = 0; base->hres_active = 0; + base->next_timer = NULL; } /* @@ -1615,6 +1617,7 @@ static void init_hrtimers_cpu(int cpu) timerqueue_init_head(&cpu_base->clock_base[i].active); } + cpu_base->active_bases = 0; cpu_base->cpu = cpu; hrtimer_init_hres(cpu_base); } From e91e5b700fb6f887d4df4a18de0d8865df43d77d Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Fri, 26 Jan 2018 15:14:16 +0300 Subject: [PATCH 61/79] dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state [ Upstream commit dd5684ecae3bd8e44b644f50e2c12c7e57fdfef5 ] ccid2_hc_tx_rto_expire() timer callback always restarts the timer again and can run indefinitely (unless it is stopped outside), and after commit 120e9dabaf55 ("dccp: defer ccid_hc_tx_delete() at dismantle time"), which moved ccid_hc_tx_delete() (also includes sk_stop_timer()) from dccp_destroy_sock() to sk_destruct(), this started to happen quite often. The timer prevents releasing the socket, as a result, sk_destruct() won't be called. Found with LTP/dccp_ipsec tests running on the bonding device, which later couldn't be unloaded after the tests were completed: unregister_netdevice: waiting for bond0 to become free. Usage count = 148 Fixes: 2a91aa396739 ("[DCCP] CCID2: Initial CCID2 (TCP-Like) implementation") Signed-off-by: Alexey Kodanev Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/dccp/ccids/ccid2.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 5e3a7302f774..7753681195c1 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -140,6 +140,9 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) ccid2_pr_debug("RTO_EXPIRE\n"); + if (sk->sk_state == DCCP_CLOSED) + goto out; + /* back-off timer */ hc->tx_rto <<= 1; if (hc->tx_rto > DCCP_RTO_MAX) From c5371a321a4a3c1f181f4482b3b3ceae06b72879 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 22 Jan 2018 20:06:42 +0000 Subject: [PATCH 62/79] ipv6: Fix getsockopt() for sockets with default IPV6_AUTOFLOWLABEL [ Upstream commit e9191ffb65d8e159680ce0ad2224e1acbde6985c ] Commit 513674b5a2c9 ("net: reevalulate autoflowlabel setting after sysctl setting") removed the initialisation of ipv6_pinfo::autoflowlabel and added a second flag to indicate whether this field or the net namespace default should be used. The getsockopt() handling for this case was not updated, so it currently returns 0 for all sockets for which IPV6_AUTOFLOWLABEL is not explicitly enabled. Fix it to return the effective value, whether that has been set at the socket or net namespace level. Fixes: 513674b5a2c9 ("net: reevalulate autoflowlabel setting after sysctl ...") Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/ipv6.h | 1 + net/ipv6/ip6_output.c | 2 +- net/ipv6/ipv6_sockglue.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 7a8066b90289..84f0d0602433 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -281,6 +281,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags); int ip6_flowlabel_init(void); void ip6_flowlabel_cleanup(void); +bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np); static inline void fl6_sock_release(struct ip6_flowlabel *fl) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index b809958f7388..d82b814e6c27 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -148,7 +148,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } -static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) +bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) { if (!np->autoflowlabel_set) return ip6_default_np_autolabel(net); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 435e26210587..9011176c8387 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1313,7 +1313,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case IPV6_AUTOFLOWLABEL: - val = np->autoflowlabel; + val = ip6_autoflowlabel(sock_net(sk), np); break; default: From c867a05df5d1baefbb2040474356b5e02ca5fcaf Mon Sep 17 00:00:00 2001 From: Mike Maloney Date: Wed, 10 Jan 2018 12:45:10 -0500 Subject: [PATCH 63/79] ipv6: fix udpv6 sendmsg crash caused by too small MTU [ Upstream commit 749439bfac6e1a2932c582e2699f91d329658196 ] The logic in __ip6_append_data() assumes that the MTU is at least large enough for the headers. A device's MTU may be adjusted after being added while sendmsg() is processing data, resulting in __ip6_append_data() seeing any MTU. For an mtu smaller than the size of the fragmentation header, the math results in a negative 'maxfraglen', which causes problems when refragmenting any previous skb in the skb_write_queue, leaving it possibly malformed. Instead sendmsg returns EINVAL when the mtu is calculated to be less than IPV6_MIN_MTU. Found by syzkaller: kernel BUG at ./include/linux/skbuff.h:2064! invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 14216 Comm: syz-executor5 Not tainted 4.13.0-rc4+ #2 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff8801d0b68580 task.stack: ffff8801ac6b8000 RIP: 0010:__skb_pull include/linux/skbuff.h:2064 [inline] RIP: 0010:__ip6_make_skb+0x18cf/0x1f70 net/ipv6/ip6_output.c:1617 RSP: 0018:ffff8801ac6bf570 EFLAGS: 00010216 RAX: 0000000000010000 RBX: 0000000000000028 RCX: ffffc90003cce000 RDX: 00000000000001b8 RSI: ffffffff839df06f RDI: ffff8801d9478ca0 RBP: ffff8801ac6bf780 R08: ffff8801cc3f1dbc R09: 0000000000000000 R10: ffff8801ac6bf7a0 R11: 43cb4b7b1948a9e7 R12: ffff8801cc3f1dc8 R13: ffff8801cc3f1d40 R14: 0000000000001036 R15: dffffc0000000000 FS: 00007f43d740c700(0000) GS:ffff8801dc100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7834984000 CR3: 00000001d79b9000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ip6_finish_skb include/net/ipv6.h:911 [inline] udp_v6_push_pending_frames+0x255/0x390 net/ipv6/udp.c:1093 udpv6_sendmsg+0x280d/0x31a0 net/ipv6/udp.c:1363 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 SYSC_sendto+0x352/0x5a0 net/socket.c:1750 SyS_sendto+0x40/0x50 net/socket.c:1718 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x4512e9 RSP: 002b:00007f43d740bc08 EFLAGS: 00000216 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00000000007180a8 RCX: 00000000004512e9 RDX: 000000000000002e RSI: 0000000020d08000 RDI: 0000000000000005 RBP: 0000000000000086 R08: 00000000209c1000 R09: 000000000000001c R10: 0000000000040800 R11: 0000000000000216 R12: 00000000004b9c69 R13: 00000000ffffffff R14: 0000000000000005 R15: 00000000202c2000 Code: 9e 01 fe e9 c5 e8 ff ff e8 7f 9e 01 fe e9 4a ea ff ff 48 89 f7 e8 52 9e 01 fe e9 aa eb ff ff e8 a8 b6 cf fd 0f 0b e8 a1 b6 cf fd <0f> 0b 49 8d 45 78 4d 8d 45 7c 48 89 85 78 fe ff ff 49 8d 85 ba RIP: __skb_pull include/linux/skbuff.h:2064 [inline] RSP: ffff8801ac6bf570 RIP: __ip6_make_skb+0x18cf/0x1f70 net/ipv6/ip6_output.c:1617 RSP: ffff8801ac6bf570 Reported-by: syzbot Signed-off-by: Mike Maloney Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_output.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d82b814e6c27..dfd4a9a02dc3 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1246,14 +1246,16 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, v6_cork->tclass = tclass; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(&rt->dst); + READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); else mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(rt->dst.path); + READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path); if (np->frag_size < mtu) { if (np->frag_size) mtu = np->frag_size; } + if (mtu < IPV6_MIN_MTU) + return -EINVAL; cork->base.fragsize = mtu; if (dst_allfrag(rt->dst.path)) cork->base.flags |= IPCORK_ALLFRAG; From f64568e420e6b9b523611ef564941b83c7527614 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jan 2018 22:31:18 -0800 Subject: [PATCH 64/79] ipv6: ip6_make_skb() needs to clear cork.base.dst [ Upstream commit 95ef498d977bf44ac094778fd448b98af158a3e6 ] In my last patch, I missed fact that cork.base.dst was not initialized in ip6_make_skb() : If ip6_setup_cork() returns an error, we might attempt a dst_release() on some random pointer. Fixes: 862c03ee1deb ("ipv6: fix possible mem leaks in ipv6_make_skb()") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_output.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index dfd4a9a02dc3..3ef81c387923 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1785,6 +1785,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, cork.base.flags = 0; cork.base.addr = 0; cork.base.opt = NULL; + cork.base.dst = NULL; v6_cork.opt = NULL; err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6); if (err) { From 1202fc020465b67297f84dfc7127b83805082c69 Mon Sep 17 00:00:00 2001 From: Yuiko Oshino Date: Mon, 15 Jan 2018 13:24:28 -0500 Subject: [PATCH 65/79] lan78xx: Fix failure in USB Full Speed [ Upstream commit a5b1379afbfabf91e3a689e82ac619a7157336b3 ] Fix initialize the uninitialized tx_qlen to an appropriate value when USB Full Speed is used. Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver") Signed-off-by: Yuiko Oshino Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/lan78xx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 41e9ebd7d0a6..ebdee8f01f65 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1859,6 +1859,7 @@ static int lan78xx_reset(struct lan78xx_net *dev) buf = DEFAULT_BURST_CAP_SIZE / FS_USB_PKT_SIZE; dev->rx_urb_size = DEFAULT_BURST_CAP_SIZE; dev->rx_qlen = 4; + dev->tx_qlen = 4; } ret = lan78xx_write_reg(dev, BURST_CAP, buf); From 6c489ab43ccacb9f26620ded90374c582c466344 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 19 Jan 2018 11:50:46 +0100 Subject: [PATCH 66/79] net: igmp: fix source address check for IGMPv3 reports [ Upstream commit ad23b750933ea7bf962678972a286c78a8fa36aa ] Commit "net: igmp: Use correct source address on IGMPv3 reports" introduced a check to validate the source address of locally generated IGMPv3 packets. Instead of checking the local interface address directly, it uses inet_ifa_match(fl4->saddr, ifa), which checks if the address is on the local subnet (or equal to the point-to-point address if used). This breaks for point-to-point interfaces, so check against ifa->ifa_local directly. Cc: Kevin Cernekee Fixes: a46182b00290 ("net: igmp: Use correct source address on IGMPv3 reports") Reported-by: Sebastian Gottschall Signed-off-by: Felix Fietkau Signed-off-by: David S. Miller Tested-by: Florian Wolters Signed-off-by: Greg Kroah-Hartman --- net/ipv4/igmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index b60106d34346..8212ed80da48 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -338,7 +338,7 @@ static __be32 igmpv3_get_srcaddr(struct net_device *dev, return htonl(INADDR_ANY); for_ifa(in_dev) { - if (inet_ifa_match(fl4->saddr, ifa)) + if (fl4->saddr == ifa->ifa_local) return fl4->saddr; } endfor_ifa(in_dev); From bed42ef5ebbfc046e0440f7e0f941cc07f8ca1e8 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Wed, 10 Feb 2016 11:50:37 -0500 Subject: [PATCH 67/79] tcp: __tcp_hdrlen() helper commit d9b3fca27385eafe61c3ca6feab6cb1e7dc77482 upstream. tcp_hdrlen is wasteful if you already have a pointer to struct tcphdr. This splits the size calculation into a helper function that can be used if a struct tcphdr is already available. Signed-off-by: Craig Gallek Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/tcp.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 318c24612458..2260f92f1492 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -29,9 +29,14 @@ static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb) return (struct tcphdr *)skb_transport_header(skb); } +static inline unsigned int __tcp_hdrlen(const struct tcphdr *th) +{ + return th->doff * 4; +} + static inline unsigned int tcp_hdrlen(const struct sk_buff *skb) { - return tcp_hdr(skb)->doff * 4; + return __tcp_hdrlen(tcp_hdr(skb)); } static inline struct tcphdr *inner_tcp_hdr(const struct sk_buff *skb) From f50fc5f4f3e5f0d2b55cd09c01f3a09d0ddeb9dc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Jan 2018 19:59:19 -0800 Subject: [PATCH 68/79] net: qdisc_pkt_len_init() should be more robust [ Upstream commit 7c68d1a6b4db9012790af7ac0f0fdc0d2083422a ] Without proper validation of DODGY packets, we might very well feed qdisc_pkt_len_init() with invalid GSO packets. tcp_hdrlen() might access out-of-bound data, so let's use skb_header_pointer() and proper checks. Whole story is described in commit d0c081b49137 ("flow_dissector: properly cap thoff field") We have the goal of validating DODGY packets earlier in the stack, so we might very well revert this fix in the future. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Jason Wang Reported-by: syzbot+9da69ebac7dddd804552@syzkaller.appspotmail.com Acked-by: Jason Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/dev.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 3b67c1e5756f..cb58ba15d51e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2889,10 +2889,21 @@ static void qdisc_pkt_len_init(struct sk_buff *skb) hdr_len = skb_transport_header(skb) - skb_mac_header(skb); /* + transport layer */ - if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) - hdr_len += tcp_hdrlen(skb); - else - hdr_len += sizeof(struct udphdr); + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { + const struct tcphdr *th; + struct tcphdr _tcphdr; + + th = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_tcphdr), &_tcphdr); + if (likely(th)) + hdr_len += __tcp_hdrlen(th); + } else { + struct udphdr _udphdr; + + if (skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_udphdr), &_udphdr)) + hdr_len += sizeof(struct udphdr); + } if (shinfo->gso_type & SKB_GSO_DODGY) gso_segs = DIV_ROUND_UP(skb->len - hdr_len, From a49558ebc2c4ce129a381d7cf39c4f01ea02c78b Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 22 Jan 2018 18:06:37 +0100 Subject: [PATCH 69/79] pppoe: take ->needed_headroom of lower device into account on xmit [ Upstream commit 02612bb05e51df8489db5e94d0cf8d1c81f87b0c ] In pppoe_sendmsg(), reserving dev->hard_header_len bytes of headroom was probably fine before the introduction of ->needed_headroom in commit f5184d267c1a ("net: Allow netdevices to specify needed head/tailroom"). But now, virtual devices typically advertise the size of their overhead in dev->needed_headroom, so we must also take it into account in skb_reserve(). Allocation size of skb is also updated to take dev->needed_tailroom into account and replace the arbitrary 32 bytes with the real size of a PPPoE header. This issue was discovered by syzbot, who connected a pppoe socket to a gre device which had dev->header_ops->create == ipgre_header and dev->hard_header_len == 0. Therefore, PPPoE didn't reserve any headroom, and dev_hard_header() crashed when ipgre_header() tried to prepend its header to skb->data. skbuff: skb_under_panic: text:000000001d390b3a len:31 put:24 head:00000000d8ed776f data:000000008150e823 tail:0x7 end:0xc0 dev:gre0 ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:104! invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 3670 Comm: syzkaller801466 Not tainted 4.15.0-rc7-next-20180115+ #97 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:skb_panic+0x162/0x1f0 net/core/skbuff.c:100 RSP: 0018:ffff8801d9bd7840 EFLAGS: 00010282 RAX: 0000000000000083 RBX: ffff8801d4f083c0 RCX: 0000000000000000 RDX: 0000000000000083 RSI: 1ffff1003b37ae92 RDI: ffffed003b37aefc RBP: ffff8801d9bd78a8 R08: 1ffff1003b37ae8a R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: ffffffff86200de0 R13: ffffffff84a981ad R14: 0000000000000018 R15: ffff8801d2d34180 FS: 00000000019c4880(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000208bc000 CR3: 00000001d9111001 CR4: 00000000001606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_under_panic net/core/skbuff.c:114 [inline] skb_push+0xce/0xf0 net/core/skbuff.c:1714 ipgre_header+0x6d/0x4e0 net/ipv4/ip_gre.c:879 dev_hard_header include/linux/netdevice.h:2723 [inline] pppoe_sendmsg+0x58e/0x8b0 drivers/net/ppp/pppoe.c:890 sock_sendmsg_nosec net/socket.c:630 [inline] sock_sendmsg+0xca/0x110 net/socket.c:640 sock_write_iter+0x31a/0x5d0 net/socket.c:909 call_write_iter include/linux/fs.h:1775 [inline] do_iter_readv_writev+0x525/0x7f0 fs/read_write.c:653 do_iter_write+0x154/0x540 fs/read_write.c:932 vfs_writev+0x18a/0x340 fs/read_write.c:977 do_writev+0xfc/0x2a0 fs/read_write.c:1012 SYSC_writev fs/read_write.c:1085 [inline] SyS_writev+0x27/0x30 fs/read_write.c:1082 entry_SYSCALL_64_fastpath+0x29/0xa0 Admittedly PPPoE shouldn't be allowed to run on non Ethernet-like interfaces, but reserving space for ->needed_headroom is a more fundamental issue that needs to be addressed first. Same problem exists for __pppoe_xmit(), which also needs to take dev->needed_headroom into account in skb_cow_head(). Fixes: f5184d267c1a ("net: Allow netdevices to specify needed head/tailroom") Reported-by: syzbot+ed0838d0fa4c4f2b528e20286e6dc63effc7c14d@syzkaller.appspotmail.com Signed-off-by: Guillaume Nault Reviewed-by: Xin Long Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ppp/pppoe.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 4e0068e775f9..b7b859c3a0c7 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -860,6 +860,7 @@ static int pppoe_sendmsg(struct socket *sock, struct msghdr *m, struct pppoe_hdr *ph; struct net_device *dev; char *start; + int hlen; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) { @@ -878,16 +879,16 @@ static int pppoe_sendmsg(struct socket *sock, struct msghdr *m, if (total_len > (dev->mtu + dev->hard_header_len)) goto end; - - skb = sock_wmalloc(sk, total_len + dev->hard_header_len + 32, - 0, GFP_KERNEL); + hlen = LL_RESERVED_SPACE(dev); + skb = sock_wmalloc(sk, hlen + sizeof(*ph) + total_len + + dev->needed_tailroom, 0, GFP_KERNEL); if (!skb) { error = -ENOMEM; goto end; } /* Reserve space for headers. */ - skb_reserve(skb, dev->hard_header_len); + skb_reserve(skb, hlen); skb_reset_network_header(skb); skb->dev = dev; @@ -948,7 +949,7 @@ static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb) /* Copy the data if there is no space for the header or if it's * read-only. */ - if (skb_cow_head(skb, sizeof(*ph) + dev->hard_header_len)) + if (skb_cow_head(skb, LL_RESERVED_SPACE(dev) + sizeof(*ph))) goto abort; __skb_push(skb, sizeof(*ph)); From d6611191f5a18d03314ab4a1b621d1892116af79 Mon Sep 17 00:00:00 2001 From: Francois Romieu Date: Fri, 26 Jan 2018 01:53:26 +0100 Subject: [PATCH 70/79] r8169: fix memory corruption on retrieval of hardware statistics. [ Upstream commit a78e93661c5fd30b9e1dee464b2f62f966883ef7 ] Hardware statistics retrieval hurts in tight invocation loops. Avoid extraneous write and enforce strict ordering of writes targeted to the tally counters dump area address registers. Signed-off-by: Francois Romieu Tested-by: Oliver Freyermuth Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/realtek/r8169.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index c5ea1018cb47..24155380e43c 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -2205,19 +2205,14 @@ static bool rtl8169_do_counters(struct net_device *dev, u32 counter_cmd) void __iomem *ioaddr = tp->mmio_addr; dma_addr_t paddr = tp->counters_phys_addr; u32 cmd; - bool ret; RTL_W32(CounterAddrHigh, (u64)paddr >> 32); + RTL_R32(CounterAddrHigh); cmd = (u64)paddr & DMA_BIT_MASK(32); RTL_W32(CounterAddrLow, cmd); RTL_W32(CounterAddrLow, cmd | counter_cmd); - ret = rtl_udelay_loop_wait_low(tp, &rtl_counters_cond, 10, 1000); - - RTL_W32(CounterAddrLow, 0); - RTL_W32(CounterAddrHigh, 0); - - return ret; + return rtl_udelay_loop_wait_low(tp, &rtl_counters_cond, 10, 1000); } static bool rtl8169_reset_counters(struct net_device *dev) From 23f521bc70b935156e13897db2873312b18b0437 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 15 Jan 2018 17:02:00 +0800 Subject: [PATCH 71/79] sctp: do not allow the v4 socket to bind a v4mapped v6 address [ Upstream commit c5006b8aa74599ce19104b31d322d2ea9ff887cc ] The check in sctp_sockaddr_af is not robust enough to forbid binding a v4mapped v6 addr on a v4 socket. The worse thing is that v4 socket's bind_verify would not convert this v4mapped v6 addr to a v4 addr. syzbot even reported a crash as the v4 socket bound a v6 addr. This patch is to fix it by doing the common sa.sa_family check first, then AF_INET check for v4mapped v6 addrs. Fixes: 7dab83de50c7 ("sctp: Support ipv6only AF_INET6 sockets.") Reported-by: syzbot+7b7b518b1228d2743963@syzkaller.appspotmail.com Acked-by: Neil Horman Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/socket.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index a870d27ca778..acf487283e9f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -332,16 +332,14 @@ static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt, if (len < sizeof (struct sockaddr)) return NULL; + if (!opt->pf->af_supported(addr->sa.sa_family, opt)) + return NULL; + /* V4 mapped address are really of AF_INET family */ if (addr->sa.sa_family == AF_INET6 && - ipv6_addr_v4mapped(&addr->v6.sin6_addr)) { - if (!opt->pf->af_supported(AF_INET, opt)) - return NULL; - } else { - /* Does this PF support this AF? */ - if (!opt->pf->af_supported(addr->sa.sa_family, opt)) - return NULL; - } + ipv6_addr_v4mapped(&addr->v6.sin6_addr) && + !opt->pf->af_supported(AF_INET, opt)) + return NULL; /* If we get this far, af is valid. */ af = sctp_get_af_specific(addr->sa.sa_family); From 50194c3f48de2c2374731d4b6b7f3fbb89250094 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 15 Jan 2018 17:01:36 +0800 Subject: [PATCH 72/79] sctp: return error if the asoc has been peeled off in sctp_wait_for_sndbuf [ Upstream commit a0ff660058b88d12625a783ce9e5c1371c87951f ] After commit cea0cc80a677 ("sctp: use the right sk after waking up from wait_buf sleep"), it may change to lock another sk if the asoc has been peeled off in sctp_wait_for_sndbuf. However, the asoc's new sk could be already closed elsewhere, as it's in the sendmsg context of the old sk that can't avoid the new sk's closing. If the sk's last one refcnt is held by this asoc, later on after putting this asoc, the new sk will be freed, while under it's own lock. This patch is to revert that commit, but fix the old issue by returning error under the old sk's lock. Fixes: cea0cc80a677 ("sctp: use the right sk after waking up from wait_buf sleep") Reported-by: syzbot+ac6ea7baa4432811eb50@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/socket.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index acf487283e9f..e9851198a850 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -83,7 +83,7 @@ static int sctp_writeable(struct sock *sk); static void sctp_wfree(struct sk_buff *skb); static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len, struct sock **orig_sk); + size_t msg_len); static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p); static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p); static int sctp_wait_for_accept(struct sock *sk, long timeo); @@ -1952,7 +1952,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if (!sctp_wspace(asoc)) { /* sk can be changed by peel off when waiting for buf. */ - err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len, &sk); + err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); if (err) { if (err == -ESRCH) { /* asoc is already dead. */ @@ -6974,12 +6974,12 @@ void sctp_sock_rfree(struct sk_buff *skb) /* Helper function to wait for space in the sndbuf. */ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len, struct sock **orig_sk) + size_t msg_len) { struct sock *sk = asoc->base.sk; - int err = 0; long current_timeo = *timeo_p; DEFINE_WAIT(wait); + int err = 0; pr_debug("%s: asoc:%p, timeo:%ld, msg_len:%zu\n", __func__, asoc, *timeo_p, msg_len); @@ -7008,17 +7008,13 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, release_sock(sk); current_timeo = schedule_timeout(current_timeo); lock_sock(sk); - if (sk != asoc->base.sk) { - release_sock(sk); - sk = asoc->base.sk; - lock_sock(sk); - } + if (sk != asoc->base.sk) + goto do_error; *timeo_p = current_timeo; } out: - *orig_sk = sk; finish_wait(&asoc->wait, &wait); /* Release the association's refcnt. */ From 0d9bcadb2226150776c5bf84ef6eb5df03e32b03 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 22 Jan 2018 16:06:37 -0500 Subject: [PATCH 73/79] vmxnet3: repair memory leak [ Upstream commit 848b159835ddef99cc4193083f7e786c3992f580 ] with the introduction of commit b0eb57cb97e7837ebb746404c2c58c6f536f23fa, it appears that rq->buf_info is improperly handled. While it is heap allocated when an rx queue is setup, and freed when torn down, an old line of code in vmxnet3_rq_destroy was not properly removed, leading to rq->buf_info[0] being set to NULL prior to its being freed, causing a memory leak, which eventually exhausts the system on repeated create/destroy operations (for example, when the mtu of a vmxnet3 interface is changed frequently. Fix is pretty straight forward, just move the NULL set to after the free. Tested by myself with successful results Applies to net, and should likely be queued for stable, please Signed-off-by: Neil Horman Reported-By: boyang@redhat.com CC: boyang@redhat.com CC: Shrikrishna Khare CC: "VMware, Inc." CC: David S. Miller Acked-by: Shrikrishna Khare Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/vmxnet3/vmxnet3_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 0cbf520cea77..82bf85ae5d08 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -1563,7 +1563,6 @@ static void vmxnet3_rq_destroy(struct vmxnet3_rx_queue *rq, rq->rx_ring[i].basePA); rq->rx_ring[i].base = NULL; } - rq->buf_info[i] = NULL; } if (rq->comp_ring.base) { @@ -1578,6 +1577,7 @@ static void vmxnet3_rq_destroy(struct vmxnet3_rx_queue *rq, (rq->rx_ring[0].size + rq->rx_ring[1].size); dma_free_coherent(&adapter->pdev->dev, sz, rq->buf_info[0], rq->buf_info_pa); + rq->buf_info[0] = rq->buf_info[1] = NULL; } } From 29837a4a8764c1b73674eb78c99717cbc73aa9f3 Mon Sep 17 00:00:00 2001 From: Jim Westfall Date: Sun, 14 Jan 2018 04:18:50 -0800 Subject: [PATCH 74/79] net: Allow neigh contructor functions ability to modify the primary_key [ Upstream commit 096b9854c04df86f03b38a97d40b6506e5730919 ] Use n->primary_key instead of pkey to account for the possibility that a neigh constructor function may have modified the primary_key value. Signed-off-by: Jim Westfall Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/neighbour.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ae92131c4f89..253c86b78ff0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -496,7 +496,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) nht = neigh_hash_grow(tbl, nht->hash_shift + 1); - hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); + hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift); if (n->parms->dead) { rc = ERR_PTR(-EINVAL); @@ -508,7 +508,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, n1 != NULL; n1 = rcu_dereference_protected(n1->next, lockdep_is_held(&tbl->lock))) { - if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { + if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { if (want_ref) neigh_hold(n1); rc = n1; From cab8451486a6fd9ba4ce798d41352f45abb44fae Mon Sep 17 00:00:00 2001 From: Jim Westfall Date: Sun, 14 Jan 2018 04:18:51 -0800 Subject: [PATCH 75/79] ipv4: Make neigh lookup keys for loopback/point-to-point devices be INADDR_ANY [ Upstream commit cd9ff4de0107c65d69d02253bb25d6db93c3dbc1 ] Map all lookup neigh keys to INADDR_ANY for loopback/point-to-point devices to avoid making an entry for every remote ip the device needs to talk to. This used the be the old behavior but became broken in a263b3093641f (ipv4: Make neigh lookups directly in output packet path) and later removed in 0bb4087cbec0 (ipv4: Fix neigh lookup keying over loopback/point-to-point devices) because it was broken. Signed-off-by: Jim Westfall Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/arp.h | 3 +++ net/ipv4/arp.c | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/net/arp.h b/include/net/arp.h index 5e0f891d476c..1b3f86981757 100644 --- a/include/net/arp.h +++ b/include/net/arp.h @@ -19,6 +19,9 @@ static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { + if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) + key = INADDR_ANY; + return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 711b4dfa17c3..cb5eb649ad5f 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -223,11 +223,16 @@ static bool arp_key_eq(const struct neighbour *neigh, const void *pkey) static int arp_constructor(struct neighbour *neigh) { - __be32 addr = *(__be32 *)neigh->primary_key; + __be32 addr; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; + u32 inaddr_any = INADDR_ANY; + if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) + memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len); + + addr = *(__be32 *)neigh->primary_key; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { From d35cd5e279881ec36ff8cd82a2d9caebc0cce3fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Jan 2018 14:21:13 -0800 Subject: [PATCH 76/79] flow_dissector: properly cap thoff field [ Upstream commit d0c081b49137cd3200f2023c0875723be66e7ce5 ] syzbot reported yet another crash [1] that is caused by insufficient validation of DODGY packets. Two bugs are happening here to trigger the crash. 1) Flow dissection leaves with incorrect thoff field. 2) skb_probe_transport_header() sets transport header to this invalid thoff, even if pointing after skb valid data. 3) qdisc_pkt_len_init() reads out-of-bound data because it trusts tcp_hdrlen(skb) Possible fixes : - Full flow dissector validation before injecting bad DODGY packets in the stack. This approach was attempted here : https://patchwork.ozlabs.org/patch/ 861874/ - Have more robust functions in the core. This might be needed anyway for stable versions. This patch fixes the flow dissection issue. [1] CPU: 1 PID: 3144 Comm: syzkaller271204 Not tainted 4.15.0-rc4-mm1+ #49 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 print_address_description+0x73/0x250 mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:355 [inline] kasan_report+0x23b/0x360 mm/kasan/report.c:413 __asan_report_load2_noabort+0x14/0x20 mm/kasan/report.c:432 __tcp_hdrlen include/linux/tcp.h:35 [inline] tcp_hdrlen include/linux/tcp.h:40 [inline] qdisc_pkt_len_init net/core/dev.c:3160 [inline] __dev_queue_xmit+0x20d3/0x2200 net/core/dev.c:3465 dev_queue_xmit+0x17/0x20 net/core/dev.c:3554 packet_snd net/packet/af_packet.c:2943 [inline] packet_sendmsg+0x3ad5/0x60a0 net/packet/af_packet.c:2968 sock_sendmsg_nosec net/socket.c:628 [inline] sock_sendmsg+0xca/0x110 net/socket.c:638 sock_write_iter+0x31a/0x5d0 net/socket.c:907 call_write_iter include/linux/fs.h:1776 [inline] new_sync_write fs/read_write.c:469 [inline] __vfs_write+0x684/0x970 fs/read_write.c:482 vfs_write+0x189/0x510 fs/read_write.c:544 SYSC_write fs/read_write.c:589 [inline] SyS_write+0xef/0x220 fs/read_write.c:581 entry_SYSCALL_64_fastpath+0x1f/0x96 Fixes: 34fad54c2537 ("net: __skb_flow_dissect() must cap its return value") Fixes: a6e544b0a88b ("flow_dissector: Jump to exit code in __skb_flow_dissect") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reported-by: syzbot Acked-by: Jason Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/flow_dissector.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index ee9082792530..4d14908afaec 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -492,8 +492,8 @@ ip_proto_again: out_good: ret = true; - key_control->thoff = (u16)nhoff; out: + key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen); key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; @@ -501,7 +501,6 @@ out: out_bad: ret = false; - key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen); goto out; } EXPORT_SYMBOL(__skb_flow_dissect); From edaafa805e0f9d09560a4892790b8e19cab8bf09 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Thu, 18 Jan 2018 16:14:26 -0500 Subject: [PATCH 77/79] net: tcp: close sock if net namespace is exiting [ Upstream commit 4ee806d51176ba7b8ff1efd81f271d7252e03a1d ] When a tcp socket is closed, if it detects that its net namespace is exiting, close immediately and do not wait for FIN sequence. For normal sockets, a reference is taken to their net namespace, so it will never exit while the socket is open. However, kernel sockets do not take a reference to their net namespace, so it may begin exiting while the kernel socket is still open. In this case if the kernel socket is a tcp socket, it will stay open trying to complete its close sequence. The sock's dst(s) hold a reference to their interface, which are all transferred to the namespace's loopback interface when the real interfaces are taken down. When the namespace tries to take down its loopback interface, it hangs waiting for all references to the loopback interface to release, which results in messages like: unregister_netdevice: waiting for lo to become free. Usage count = 1 These messages continue until the socket finally times out and closes. Since the net namespace cleanup holds the net_mutex while calling its registered pernet callbacks, any new net namespace initialization is blocked until the current net namespace finishes exiting. After this change, the tcp socket notices the exiting net namespace, and closes immediately, releasing its dst(s) and their reference to the loopback interface, which lets the net namespace continue exiting. Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811 Signed-off-by: Dan Streetman Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/net_namespace.h | 10 ++++++++++ net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_timer.c | 15 +++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 2dcea635ecce..93328c61934a 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -209,6 +209,11 @@ int net_eq(const struct net *net1, const struct net *net2) return net1 == net2; } +static inline int check_net(const struct net *net) +{ + return atomic_read(&net->count) != 0; +} + void net_drop_ns(void *); #else @@ -233,6 +238,11 @@ int net_eq(const struct net *net1, const struct net *net2) return 1; } +static inline int check_net(const struct net *net) +{ + return 1; +} + #define net_drop_ns NULL #endif diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5597120c8ffd..37e8966a457b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2176,6 +2176,9 @@ adjudge_to_death: tcp_send_active_reset(sk, GFP_ATOMIC); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); + } else if (!check_net(sock_net(sk))) { + /* Not possible to send reset; just close */ + tcp_set_state(sk, TCP_CLOSE); } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 1ec12a4f327e..35f638cfc675 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -46,11 +46,19 @@ static void tcp_write_err(struct sock *sk) * to prevent DoS attacks. It is called when a retransmission timeout * or zero probe timeout occurs on orphaned socket. * + * Also close if our net namespace is exiting; in that case there is no + * hope of ever communicating again since all netns interfaces are already + * down (or about to be down), and we need to release our dst references, + * which have been moved to the netns loopback interface, so the namespace + * can finish exiting. This condition is only possible if we are a kernel + * socket, as those do not hold references to the namespace. + * * Criteria is still not confirmed experimentally and may change. * We kill the socket, if: * 1. If number of orphaned sockets exceeds an administratively configured * limit. * 2. If we have strong memory pressure. + * 3. If our net namespace is exiting. */ static int tcp_out_of_resources(struct sock *sk, bool do_reset) { @@ -79,6 +87,13 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); return 1; } + + if (!check_net(sock_net(sk))) { + /* Not possible to send reset; just close */ + tcp_done(sk); + return 1; + } + return 0; } From 3f84339bd344b2cf0afe64b78d3964bb6422d0f3 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 22 Jan 2018 20:11:06 +0000 Subject: [PATCH 78/79] nfsd: auth: Fix gid sorting when rootsquash enabled commit 1995266727fa8143897e89b55f5d3c79aa828420 upstream. Commit bdcf0a423ea1 ("kernel: make groups_sort calling a responsibility group_info allocators") appears to break nfsd rootsquash in a pretty major way. It adds a call to groups_sort() inside the loop that copies/squashes gids, which means the valid gids are sorted along with the following garbage. The net result is that the highest numbered valid gids are replaced with any lower-valued garbage gids, possibly including 0. We should sort only once, after filling in all the gids. Fixes: bdcf0a423ea1 ("kernel: make groups_sort calling a responsibility ...") Signed-off-by: Ben Hutchings Acked-by: J. Bruce Fields Signed-off-by: Linus Torvalds Cc: Wolfgang Walter Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/auth.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index a260060042ad..67eb154af881 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -60,9 +60,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) else GROUP_AT(gi, i) = GROUP_AT(rqgi, i); - /* Each thread allocates its own gi, no race */ - groups_sort(gi); } + + /* Each thread allocates its own gi, no race */ + groups_sort(gi); } else { gi = get_group_info(rqgi); } From 49fe90b853dfb1087d0a734cd7f4af1aa00c8e53 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 31 Jan 2018 12:06:14 +0100 Subject: [PATCH 79/79] Linux 4.4.114 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 39019c9d205c..153440b1bbb0 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 113 +SUBLEVEL = 114 EXTRAVERSION = NAME = Blurry Fish Butt