From 760fed99e2bfcf0c4c7b1880ebbe19fdb4d9738a Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Wed, 18 Jul 2018 14:29:54 -0700 Subject: [PATCH 01/59] scsi: qla2xxx: Fix ISP recovery on unload commit b08abbd9f5996309f021684f9ca74da30dcca36a upstream. During unload process, the chip can encounter problem where a FW dump would be captured. For this case, the full reset sequence will be skip to bring the chip back to full operational state. Fixes: e315cd28b9ef ("[SCSI] qla2xxx: Code changes for qla data structure refactoring") Cc: Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/qla2xxx/qla_os.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 5cbf20ab94aa..18b19744398a 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -4938,8 +4938,9 @@ qla2x00_do_dpc(void *data) } } - if (test_and_clear_bit(ISP_ABORT_NEEDED, - &base_vha->dpc_flags)) { + if (test_and_clear_bit + (ISP_ABORT_NEEDED, &base_vha->dpc_flags) && + !test_bit(UNLOADING, &base_vha->dpc_flags)) { ql_dbg(ql_dbg_dpc, base_vha, 0x4007, "ISP abort scheduled.\n"); From 468926f8dbb747b2612459113a926be404c9a6d6 Mon Sep 17 00:00:00 2001 From: Anil Gurumurthy Date: Wed, 18 Jul 2018 14:29:55 -0700 Subject: [PATCH 02/59] scsi: qla2xxx: Return error when TMF returns commit b4146c4929ef61d5afca011474d59d0918a0cd82 upstream. Propagate the task management completion status properly to avoid unnecessary waits for commands to complete. Fixes: faef62d13463 ("[SCSI] qla2xxx: Fix Task Management command asynchronous handling") Cc: Signed-off-by: Anil Gurumurthy Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/qla2xxx/qla_init.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index a9eb3cd453be..41a646696bab 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -325,11 +325,10 @@ qla2x00_async_tm_cmd(fc_port_t *fcport, uint32_t flags, uint32_t lun, wait_for_completion(&tm_iocb->u.tmf.comp); - rval = tm_iocb->u.tmf.comp_status == CS_COMPLETE ? - QLA_SUCCESS : QLA_FUNCTION_FAILED; + rval = tm_iocb->u.tmf.data; - if ((rval != QLA_SUCCESS) || tm_iocb->u.tmf.data) { - ql_dbg(ql_dbg_taskm, vha, 0x8030, + if (rval != QLA_SUCCESS) { + ql_log(ql_log_warn, vha, 0x8030, "TM IOCB failed (%x).\n", rval); } From 09a0de491c5ec0e40f8f9d21b13af306667360f4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Aug 2018 14:44:59 +0200 Subject: [PATCH 03/59] genirq: Make force irq threading setup more robust commit d1f0301b3333eef5efbfa1fe0f0edbea01863d5d upstream. The support of force threading interrupts which are set up with both a primary and a threaded handler wreckaged the setup of regular requested threaded interrupts (primary handler == NULL). The reason is that it does not check whether the primary handler is set to the default handler which wakes the handler thread. Instead it replaces the thread handler with the primary handler as it would do with force threaded interrupts which have been requested via request_irq(). So both the primary and the thread handler become the same which then triggers the warnon that the thread handler tries to wakeup a not configured secondary thread. Fortunately this only happens when the driver omits the IRQF_ONESHOT flag when requesting the threaded interrupt, which is normaly caught by the sanity checks when force irq threading is disabled. Fix it by skipping the force threading setup when a regular threaded interrupt is requested. As a consequence the interrupt request which lacks the IRQ_ONESHOT flag is rejected correctly instead of silently wreckaging it. Fixes: 2a1d3ab8986d ("genirq: Handle force threading of irqs with primary and thread handler") Reported-by: Kurt Kanzenbach Signed-off-by: Thomas Gleixner Tested-by: Kurt Kanzenbach Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- kernel/irq/manage.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5f55a8bf5264..0df2b44dac7c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1012,6 +1012,13 @@ static int irq_setup_forced_threading(struct irqaction *new) if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) return 0; + /* + * No further action required for interrupts which are requested as + * threaded interrupts already + */ + if (new->handler == irq_default_primary_handler) + return 0; + new->flags |= IRQF_ONESHOT; /* @@ -1019,7 +1026,7 @@ static int irq_setup_forced_threading(struct irqaction *new) * thread handler. We force thread them as well by creating a * secondary action. */ - if (new->handler != irq_default_primary_handler && new->thread_fn) { + if (new->handler && new->thread_fn) { /* Allocate the secondary action */ new->secondary = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!new->secondary) From 0a022859e2304b21a6cd620c2eb008cc60de0ab8 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Tue, 31 Jul 2018 18:13:58 +0200 Subject: [PATCH 04/59] nohz: Fix local_timer_softirq_pending() commit 80d20d35af1edd632a5e7a3b9c0ab7ceff92769e upstream. local_timer_softirq_pending() checks whether the timer softirq is pending with: local_softirq_pending() & TIMER_SOFTIRQ. This is wrong because TIMER_SOFTIRQ is the softirq number and not a bitmask. So the test checks for the wrong bit. Use BIT(TIMER_SOFTIRQ) instead. Fixes: 5d62c183f9e9 ("nohz: Prevent a timer interrupt storm in tick_nohz_stop_sched_tick()") Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Reviewed-by: Paul E. McKenney Reviewed-by: Daniel Bristot de Oliveira Acked-by: Frederic Weisbecker Cc: bigeasy@linutronix.de Cc: peterz@infradead.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180731161358.29472-1-anna-maria@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e5d228f7224c..5ad2e852e9f6 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -570,7 +570,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) static inline bool local_timer_softirq_pending(void) { - return local_softirq_pending() & TIMER_SOFTIRQ; + return local_softirq_pending() & BIT(TIMER_SOFTIRQ); } static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, From 52296ab92b66e2a8c932b2a4cb318dd08cb8bd29 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Fri, 27 Jul 2018 16:54:44 +0100 Subject: [PATCH 05/59] netlink: Do not subscribe to non-existent groups [ Upstream commit 7acf9d4237c46894e0fa0492dd96314a41742e84 ] Make ABI more strict about subscribing to group > ngroups. Code doesn't check for that and it looks bogus. (one can subscribe to non-existing group) Still, it's possible to bind() to all possible groups with (-1) Cc: "David S. Miller" Cc: Herbert Xu Cc: Steffen Klassert Cc: netdev@vger.kernel.org Signed-off-by: Dmitry Safonov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/netlink/af_netlink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 9708fff318d5..aed2cfb1a4fe 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -985,6 +985,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, if (err) return err; } + groups &= (1UL << nlk->ngroups) - 1; bound = nlk->bound; if (bound) { From bc48f46f117e05af226a4c0dff617218f1c13650 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 30 Jul 2018 18:32:36 +0100 Subject: [PATCH 06/59] netlink: Don't shift with UB on nlk->ngroups [ Upstream commit 61f4b23769f0cc72ae62c9a81cf08f0397d40da8 ] On i386 nlk->ngroups might be 32 or 0. Which leads to UB, resulting in hang during boot. Check for 0 ngroups and use (unsigned long long) as a type to shift. Fixes: 7acf9d4237c4 ("netlink: Do not subscribe to non-existent groups"). Reported-by: kernel test robot Signed-off-by: Dmitry Safonov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/netlink/af_netlink.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index aed2cfb1a4fe..87c25918c073 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -985,7 +985,11 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, if (err) return err; } - groups &= (1UL << nlk->ngroups) - 1; + + if (nlk->ngroups == 0) + groups = 0; + else + groups &= (1ULL << nlk->ngroups) - 1; bound = nlk->bound; if (bound) { From a5928d68418768e3e7ed9c75039060c1e70e047e Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Sun, 5 Aug 2018 01:35:53 +0100 Subject: [PATCH 07/59] netlink: Don't shift on 64 for ngroups commit 91874ecf32e41b5d86a4cb9d60e0bee50d828058 upstream. It's legal to have 64 groups for netlink_sock. As user-supplied nladdr->nl_groups is __u32, it's possible to subscribe only to first 32 groups. The check for correctness of .bind() userspace supplied parameter is done by applying mask made from ngroups shift. Which broke Android as they have 64 groups and the shift for mask resulted in an overflow. Fixes: 61f4b23769f0 ("netlink: Don't shift with UB on nlk->ngroups") Cc: "David S. Miller" Cc: Herbert Xu Cc: Steffen Klassert Cc: netdev@vger.kernel.org Cc: stable@vger.kernel.org Reported-and-Tested-by: Nathan Chancellor Signed-off-by: Dmitry Safonov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/netlink/af_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 87c25918c073..bf292010760a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -988,8 +988,8 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, if (nlk->ngroups == 0) groups = 0; - else - groups &= (1ULL << nlk->ngroups) - 1; + else if (nlk->ngroups < 8*sizeof(groups)) + groups &= (1UL << nlk->ngroups) - 1; bound = nlk->bound; if (bound) { From 310eba0dfc8a7d5423516df7f4be7451505ac6ef Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 8 Jul 2018 19:35:02 -0400 Subject: [PATCH 08/59] ext4: fix false negatives *and* false positives in ext4_check_descriptors() commit 44de022c4382541cebdd6de4465d1f4f465ff1dd upstream. Ext4_check_descriptors() was getting called before s_gdb_count was initialized. So for file systems w/o the meta_bg feature, allocation bitmaps could overlap the block group descriptors and ext4 wouldn't notice. For file systems with the meta_bg feature enabled, there was a fencepost error which would cause the ext4_check_descriptors() to incorrectly believe that the block allocation bitmap overlaps with the block group descriptor blocks, and it would reject the mount. Fix both of these problems. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Signed-off-by: Benjamin Gilbert Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 49af3c50b263..3e4d8ac1974e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2102,7 +2102,7 @@ static int ext4_check_descriptors(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); ext4_fsblk_t last_block; - ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0) + 1; + ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0); ext4_fsblk_t block_bitmap; ext4_fsblk_t inode_bitmap; ext4_fsblk_t inode_table; @@ -3777,13 +3777,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount2; } } + sbi->s_gdb_count = db_count; if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); ret = -EFSCORRUPTED; goto failed_mount2; } - sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); From fef9866d278ee726b15cf251339660e77ba5488c Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 14 Sep 2017 16:50:14 +0200 Subject: [PATCH 09/59] ACPI / PCI: Bail early in acpi_pci_add_bus() if there is no ACPI handle commit a0040c0145945d3bd203df8fa97f6dfa819f3f7d upstream. Hyper-V instances support PCI pass-through which is implemented through PV pci-hyperv driver. When a device is passed through, a new root PCI bus is created in the guest. The bus sits on top of VMBus and has no associated information in ACPI. acpi_pci_add_bus() in this case proceeds all the way to acpi_evaluate_dsm(), which reports ACPI: \: failed to evaluate _DSM (0x1001) While acpi_pci_slot_enumerate() and acpiphp_enumerate_slots() are protected against ACPI_HANDLE() being NULL and do nothing, acpi_evaluate_dsm() is not and gives us the error. It seems the correct fix is to not do anything in acpi_pci_add_bus() in such cases. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Bjorn Helgaas Cc: Sinan Kaya Signed-off-by: Greg Kroah-Hartman --- drivers/pci/pci-acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index a32ba753e413..afaf13474796 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -543,7 +543,7 @@ void acpi_pci_add_bus(struct pci_bus *bus) union acpi_object *obj; struct pci_host_bridge *bridge; - if (acpi_pci_disabled || !bus->bridge) + if (acpi_pci_disabled || !bus->bridge || !ACPI_HANDLE(bus->bridge)) return; acpi_pci_slot_enumerate(bus); From 731ccd90b8dc6697fefb62f43ed6f8d253d7fd5b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 14 Jul 2018 01:28:15 +0900 Subject: [PATCH 10/59] ring_buffer: tracing: Inherit the tracing setting to next ring buffer commit 73c8d8945505acdcbae137c2e00a1232e0be709f upstream. Maintain the tracing on/off setting of the ring_buffer when switching to the trace buffer snapshot. Taking a snapshot is done by swapping the backup ring buffer (max_tr_buffer). But since the tracing on/off setting is defined by the ring buffer, when swapping it, the tracing on/off setting can also be changed. This causes a strange result like below: /sys/kernel/debug/tracing # cat tracing_on 1 /sys/kernel/debug/tracing # echo 0 > tracing_on /sys/kernel/debug/tracing # cat tracing_on 0 /sys/kernel/debug/tracing # echo 1 > snapshot /sys/kernel/debug/tracing # cat tracing_on 1 /sys/kernel/debug/tracing # echo 1 > snapshot /sys/kernel/debug/tracing # cat tracing_on 0 We don't touch tracing_on, but snapshot changes tracing_on setting each time. This is an anomaly, because user doesn't know that each "ring_buffer" stores its own tracing-enable state and the snapshot is done by swapping ring buffers. Link: http://lkml.kernel.org/r/153149929558.11274.11730609978254724394.stgit@devbox Cc: Ingo Molnar Cc: Shuah Khan Cc: Tom Zanussi Cc: Hiraku Toyooka Cc: stable@vger.kernel.org Fixes: debdd57f5145 ("tracing: Make a snapshot feature available from userspace") Signed-off-by: Masami Hiramatsu [ Updated commit log and comment in the code ] Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Sudip Mukherjee Signed-off-by: Greg Kroah-Hartman --- include/linux/ring_buffer.h | 1 + kernel/trace/ring_buffer.c | 16 ++++++++++++++++ kernel/trace/trace.c | 6 ++++++ 3 files changed, 23 insertions(+) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 4acc552e9279..19d0778ec382 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -162,6 +162,7 @@ void ring_buffer_record_enable(struct ring_buffer *buffer); void ring_buffer_record_off(struct ring_buffer *buffer); void ring_buffer_record_on(struct ring_buffer *buffer); int ring_buffer_record_is_on(struct ring_buffer *buffer); +int ring_buffer_record_is_set_on(struct ring_buffer *buffer); void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu); void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d9cd6191760b..fdaa88f38aec 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3141,6 +3141,22 @@ int ring_buffer_record_is_on(struct ring_buffer *buffer) return !atomic_read(&buffer->record_disabled); } +/** + * ring_buffer_record_is_set_on - return true if the ring buffer is set writable + * @buffer: The ring buffer to see if write is set enabled + * + * Returns true if the ring buffer is set writable by ring_buffer_record_on(). + * Note that this does NOT mean it is in a writable state. + * + * It may return true when the ring buffer has been disabled by + * ring_buffer_record_disable(), as that is a temporary disabling of + * the ring buffer. + */ +int ring_buffer_record_is_set_on(struct ring_buffer *buffer) +{ + return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF); +} + /** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer * @buffer: The ring buffer to stop writes to. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8aef4e63ac57..1b980a8ef791 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1088,6 +1088,12 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) arch_spin_lock(&tr->max_lock); + /* Inherit the recordable setting from trace_buffer */ + if (ring_buffer_record_is_set_on(tr->trace_buffer.buffer)) + ring_buffer_record_on(tr->max_buffer.buffer); + else + ring_buffer_record_off(tr->max_buffer.buffer); + buf = tr->trace_buffer.buffer; tr->trace_buffer.buffer = tr->max_buffer.buffer; tr->max_buffer.buffer = buf; From a8ec97dbac9027f2f4158aadf86010edc2a9ea5d Mon Sep 17 00:00:00 2001 From: Esben Haabendal Date: Mon, 9 Jul 2018 11:43:01 +0200 Subject: [PATCH 11/59] i2c: imx: Fix reinit_completion() use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 9f9e3e0d4dd3338b3f3dde080789f71901e1e4ff upstream. Make sure to call reinit_completion() before dma is started to avoid race condition where reinit_completion() is called after complete() and before wait_for_completion_timeout(). Signed-off-by: Esben Haabendal Fixes: ce1a78840ff7 ("i2c: imx: add DMA support for freescale i2c driver") Reviewed-by: Uwe Kleine-König Signed-off-by: Wolfram Sang Cc: stable@kernel.org Signed-off-by: Sudip Mukherjee Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-imx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index d4d853680ae4..a4abf7dc9576 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -382,6 +382,7 @@ static int i2c_imx_dma_xfer(struct imx_i2c_struct *i2c_imx, goto err_desc; } + reinit_completion(&dma->cmd_complete); txdesc->callback = i2c_imx_dma_callback; txdesc->callback_param = i2c_imx; if (dma_submit_error(dmaengine_submit(txdesc))) { @@ -631,7 +632,6 @@ static int i2c_imx_dma_write(struct imx_i2c_struct *i2c_imx, * The first byte must be transmitted by the CPU. */ imx_i2c_write_reg(msgs->addr << 1, i2c_imx, IMX_I2C_I2DR); - reinit_completion(&i2c_imx->dma->cmd_complete); time_left = wait_for_completion_timeout( &i2c_imx->dma->cmd_complete, msecs_to_jiffies(DMA_TIMEOUT)); @@ -690,7 +690,6 @@ static int i2c_imx_dma_read(struct imx_i2c_struct *i2c_imx, if (result) return result; - reinit_completion(&i2c_imx->dma->cmd_complete); time_left = wait_for_completion_timeout( &i2c_imx->dma->cmd_complete, msecs_to_jiffies(DMA_TIMEOUT)); From 0749d5b3ec62310b747751ea7d4d5ccca51bc80f Mon Sep 17 00:00:00 2001 From: Shankara Pailoor Date: Tue, 5 Jun 2018 08:33:27 -0500 Subject: [PATCH 12/59] jfs: Fix inconsistency between memory allocation and ea_buf->max_size commit 92d34134193e5b129dc24f8d79cb9196626e8d7a upstream. The code is assuming the buffer is max_size length, but we weren't allocating enough space for it. Signed-off-by: Shankara Pailoor Signed-off-by: Dave Kleikamp Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- fs/jfs/xattr.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 48b15a6e5558..40a26a542341 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -493,15 +493,17 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) if (size > PSIZE) { /* * To keep the rest of the code simple. Allocate a - * contiguous buffer to work with + * contiguous buffer to work with. Make the buffer large + * enough to make use of the whole extent. */ - ea_buf->xattr = kmalloc(size, GFP_KERNEL); + ea_buf->max_size = (size + sb->s_blocksize - 1) & + ~(sb->s_blocksize - 1); + + ea_buf->xattr = kmalloc(ea_buf->max_size, GFP_KERNEL); if (ea_buf->xattr == NULL) return -ENOMEM; ea_buf->flag = EA_MALLOC; - ea_buf->max_size = (size + sb->s_blocksize - 1) & - ~(sb->s_blocksize - 1); if (ea_size == 0) return 0; From 8404ae6c8c9ff23a06cf38112e83002e1088bfe1 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 9 Aug 2018 12:19:28 +0200 Subject: [PATCH 13/59] Linux 4.4.147 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 030f5af05f4e..ee92a12e3a4b 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 146 +SUBLEVEL = 147 EXTRAVERSION = NAME = Blurry Fish Butt From 7736fcede789b412ae1c5c2f12f9bef58903319c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 28 Jul 2018 08:12:04 -0400 Subject: [PATCH 14/59] ext4: fix check to prevent initializing reserved inodes commit 5012284700775a4e6e3fbe7eac4c543c4874b559 upstream. Commit 8844618d8aa7: "ext4: only look at the bg_flags field if it is valid" will complain if block group zero does not have the EXT4_BG_INODE_ZEROED flag set. Unfortunately, this is not correct, since a freshly created file system has this flag cleared. It gets almost immediately after the file system is mounted read-write --- but the following somewhat unlikely sequence will end up triggering a false positive report of a corrupted file system: mkfs.ext4 /dev/vdc mount -o ro /dev/vdc /vdc mount -o remount,rw /dev/vdc Instead, when initializing the inode table for block group zero, test to make sure that itable_unused count is not too large, since that is the case that will result in some or all of the reserved inodes getting cleared. This fixes the failures reported by Eric Whiteney when running generic/230 and generic/231 in the the nojournal test case. Fixes: 8844618d8aa7 ("ext4: only look at the bg_flags field if it is valid") Reported-by: Eric Whitney Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ialloc.c | 5 ++++- fs/ext4/super.c | 8 +------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 041117fd8fd7..0963213e9cd3 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1308,7 +1308,10 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, ext4_itable_unused_count(sb, gdp)), sbi->s_inodes_per_block); - if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { + if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group) || + ((group == 0) && ((EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp)) < + EXT4_FIRST_INO(sb)))) { ext4_error(sb, "Something is wrong with group %u: " "used itable blocks: %d; " "itable unused count: %u", diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3e4d8ac1974e..8d18f6142da5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2875,14 +2875,8 @@ static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) if (!gdp) continue; - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) - continue; - if (group != 0) + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) break; - ext4_error(sb, "Inode table for bg 0 marked as " - "needing zeroing"); - if (sb->s_flags & MS_RDONLY) - return ngroups; } return group; From 215f36e128f2b476cd3bfe91339a5e12b79d010c Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Tue, 22 May 2018 14:37:18 -0700 Subject: [PATCH 15/59] tpm: fix race condition in tpm_common_write() commit 3ab2011ea368ec3433ad49e1b9e1c7b70d2e65df upstream. There is a race condition in tpm_common_write function allowing two threads on the same /dev/tpm, or two different applications on the same /dev/tpmrm to overwrite each other commands/responses. Fixed this by taking the priv->buffer_mutex early in the function. Also converted the priv->data_pending from atomic to a regular size_t type. There is no need for it to be atomic since it is only touched under the protection of the priv->buffer_mutex. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Signed-off-by: Tadeusz Struk Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen Signed-off-by: Greg Kroah-Hartman --- drivers/char/tpm/tpm-dev.c | 43 ++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/drivers/char/tpm/tpm-dev.c b/drivers/char/tpm/tpm-dev.c index 912ad30be585..4719aa781bf2 100644 --- a/drivers/char/tpm/tpm-dev.c +++ b/drivers/char/tpm/tpm-dev.c @@ -25,7 +25,7 @@ struct file_priv { struct tpm_chip *chip; /* Data passed to and from the tpm via the read/write calls */ - atomic_t data_pending; + size_t data_pending; struct mutex buffer_mutex; struct timer_list user_read_timer; /* user needs to claim result */ @@ -46,7 +46,7 @@ static void timeout_work(struct work_struct *work) struct file_priv *priv = container_of(work, struct file_priv, work); mutex_lock(&priv->buffer_mutex); - atomic_set(&priv->data_pending, 0); + priv->data_pending = 0; memset(priv->data_buffer, 0, sizeof(priv->data_buffer)); mutex_unlock(&priv->buffer_mutex); } @@ -72,7 +72,6 @@ static int tpm_open(struct inode *inode, struct file *file) } priv->chip = chip; - atomic_set(&priv->data_pending, 0); mutex_init(&priv->buffer_mutex); setup_timer(&priv->user_read_timer, user_reader_timeout, (unsigned long)priv); @@ -86,28 +85,24 @@ static ssize_t tpm_read(struct file *file, char __user *buf, size_t size, loff_t *off) { struct file_priv *priv = file->private_data; - ssize_t ret_size; + ssize_t ret_size = 0; int rc; del_singleshot_timer_sync(&priv->user_read_timer); flush_work(&priv->work); - ret_size = atomic_read(&priv->data_pending); - if (ret_size > 0) { /* relay data */ - ssize_t orig_ret_size = ret_size; - if (size < ret_size) - ret_size = size; + mutex_lock(&priv->buffer_mutex); - mutex_lock(&priv->buffer_mutex); + if (priv->data_pending) { + ret_size = min_t(ssize_t, size, priv->data_pending); rc = copy_to_user(buf, priv->data_buffer, ret_size); - memset(priv->data_buffer, 0, orig_ret_size); + memset(priv->data_buffer, 0, priv->data_pending); if (rc) ret_size = -EFAULT; - mutex_unlock(&priv->buffer_mutex); + priv->data_pending = 0; } - atomic_set(&priv->data_pending, 0); - + mutex_unlock(&priv->buffer_mutex); return ret_size; } @@ -118,18 +113,20 @@ static ssize_t tpm_write(struct file *file, const char __user *buf, size_t in_size = size; ssize_t out_size; - /* cannot perform a write until the read has cleared - either via tpm_read or a user_read_timer timeout. - This also prevents splitted buffered writes from blocking here. - */ - if (atomic_read(&priv->data_pending) != 0) - return -EBUSY; - if (in_size > TPM_BUFSIZE) return -E2BIG; mutex_lock(&priv->buffer_mutex); + /* Cannot perform a write until the read has cleared either via + * tpm_read or a user_read_timer timeout. This also prevents split + * buffered writes from blocking here. + */ + if (priv->data_pending != 0) { + mutex_unlock(&priv->buffer_mutex); + return -EBUSY; + } + if (copy_from_user (priv->data_buffer, (void __user *) buf, in_size)) { mutex_unlock(&priv->buffer_mutex); @@ -153,7 +150,7 @@ static ssize_t tpm_write(struct file *file, const char __user *buf, return out_size; } - atomic_set(&priv->data_pending, out_size); + priv->data_pending = out_size; mutex_unlock(&priv->buffer_mutex); /* Set a timeout by which the reader must come claim the result */ @@ -172,7 +169,7 @@ static int tpm_release(struct inode *inode, struct file *file) del_singleshot_timer_sync(&priv->user_read_timer); flush_work(&priv->work); file->private_data = NULL; - atomic_set(&priv->data_pending, 0); + priv->data_pending = 0; clear_bit(0, &priv->chip->is_open); kfree(priv); return 0; From e424bee248c38266c6057d43f3e350072fc41c5d Mon Sep 17 00:00:00 2001 From: Thomas Egerer Date: Mon, 25 Jan 2016 12:58:44 +0100 Subject: [PATCH 16/59] ipv4+ipv6: Make INET*_ESP select CRYPTO_ECHAINIV commit 32b6170ca59ccf07d0e394561e54b2cd9726038c upstream. The ESP algorithms using CBC mode require echainiv. Hence INET*_ESP have to select CRYPTO_ECHAINIV in order to work properly. This solves the issues caused by a misconfiguration as described in [1]. The original approach, patching crypto/Kconfig was turned down by Herbert Xu [2]. [1] https://lists.strongswan.org/pipermail/users/2015-December/009074.html [2] http://marc.info/?l=linux-crypto-vger&m=145224655809562&w=2 Signed-off-by: Thomas Egerer Acked-by: Herbert Xu Signed-off-by: David S. Miller Cc: Yongqin Liu Signed-off-by: Greg Kroah-Hartman --- net/ipv4/Kconfig | 1 + net/ipv6/Kconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 93581bba8643..09d6c4a6b53d 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -354,6 +354,7 @@ config INET_ESP select CRYPTO_CBC select CRYPTO_SHA1 select CRYPTO_DES + select CRYPTO_ECHAINIV ---help--- Support for IPsec ESP. diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 851d5c9e3ecc..0f50248bad17 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -69,6 +69,7 @@ config INET6_ESP select CRYPTO_CBC select CRYPTO_SHA1 select CRYPTO_DES + select CRYPTO_ECHAINIV ---help--- Support for IPsec ESP. From 1e4006421429ab672c62ab25afb3c39e6f4aa94f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 20 Apr 2018 14:55:31 -0700 Subject: [PATCH 17/59] fork: unconditionally clear stack on fork commit e01e80634ecdde1dd113ac43b3adad21b47f3957 upstream. One of the classes of kernel stack content leaks[1] is exposing the contents of prior heap or stack contents when a new process stack is allocated. Normally, those stacks are not zeroed, and the old contents remain in place. In the face of stack content exposure flaws, those contents can leak to userspace. Fixing this will make the kernel no longer vulnerable to these flaws, as the stack will be wiped each time a stack is assigned to a new process. There's not a meaningful change in runtime performance; it almost looks like it provides a benefit. Performing back-to-back kernel builds before: Run times: 157.86 157.09 158.90 160.94 160.80 Mean: 159.12 Std Dev: 1.54 and after: Run times: 159.31 157.34 156.71 158.15 160.81 Mean: 158.46 Std Dev: 1.46 Instead of making this a build or runtime config, Andy Lutomirski recommended this just be enabled by default. [1] A noisy search for many kinds of stack content leaks can be seen here: https://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=linux+kernel+stack+leak I did some more with perf and cycle counts on running 100,000 execs of /bin/true. before: Cycles: 218858861551 218853036130 214727610969 227656844122 224980542841 Mean: 221015379122.60 Std Dev: 4662486552.47 after: Cycles: 213868945060 213119275204 211820169456 224426673259 225489986348 Mean: 217745009865.40 Std Dev: 5935559279.99 It continues to look like it's faster, though the deviation is rather wide, but I'm not sure what I could do that would be less noisy. I'm open to ideas! Link: http://lkml.kernel.org/r/20180221021659.GA37073@beast Signed-off-by: Kees Cook Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Andy Lutomirski Cc: Laura Abbott Cc: Rasmus Villemoes Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds [ Srivatsa: Backported to 4.4.y ] Signed-off-by: Srivatsa S. Bhat Reviewed-by: Srinidhi Rao Signed-off-by: Greg Kroah-Hartman --- include/linux/thread_info.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index ff307b548ed3..646891f3bc1e 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -55,11 +55,7 @@ extern long do_no_restart_syscall(struct restart_block *parm); #ifdef __KERNEL__ -#ifdef CONFIG_DEBUG_STACK_USAGE -# define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) -#else -# define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK) -#endif +#define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) /* * flag set/clear/test wrappers From a9252a70174362912fee1556f8c3a25d66cd7637 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 28 Jul 2018 11:47:17 +0200 Subject: [PATCH 18/59] parisc: Enable CONFIG_MLONGCALLS by default commit 66509a276c8c1d19ee3f661a41b418d101c57d29 upstream. Enable the -mlong-calls compiler option by default, because otherwise in most cases linking the vmlinux binary fails due to truncations of R_PARISC_PCREL22F relocations. This fixes building the 64-bit defconfig. Cc: stable@vger.kernel.org # 4.0+ Signed-off-by: Helge Deller Signed-off-by: Greg Kroah-Hartman --- arch/parisc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 729f89163bc3..210b3d675261 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -177,7 +177,7 @@ config PREFETCH config MLONGCALLS bool "Enable the -mlong-calls compiler option for big kernels" - def_bool y if (!MODULES) + default y depends on PA8X00 help If you configure the kernel to include many drivers built-in instead From 277b161b1a1d339985b4c24e796e86eae9511382 Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Sun, 5 Aug 2018 13:30:31 -0400 Subject: [PATCH 19/59] parisc: Define mb() and add memory barriers to assembler unlock sequences MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit fedb8da96355f5f64353625bf96dc69423ad1826 upstream. For years I thought all parisc machines executed loads and stores in order. However, Jeff Law recently indicated on gcc-patches that this is not correct. There are various degrees of out-of-order execution all the way back to the PA7xxx processor series (hit-under-miss). The PA8xxx series has full out-of-order execution for both integer operations, and loads and stores. This is described in the following article: http://web.archive.org/web/20040214092531/http://www.cpus.hp.com/technical_references/advperf.shtml For this reason, we need to define mb() and to insert a memory barrier before the store unlocking spinlocks. This ensures that all memory accesses are complete prior to unlocking. The ldcw instruction performs the same function on entry. Signed-off-by: John David Anglin Cc: stable@vger.kernel.org # 4.0+ Signed-off-by: Helge Deller Signed-off-by: Greg Kroah-Hartman --- arch/parisc/include/asm/barrier.h | 32 +++++++++++++++++++++++++++++++ arch/parisc/kernel/entry.S | 2 ++ arch/parisc/kernel/pacache.S | 1 + arch/parisc/kernel/syscall.S | 4 ++++ 4 files changed, 39 insertions(+) create mode 100644 arch/parisc/include/asm/barrier.h diff --git a/arch/parisc/include/asm/barrier.h b/arch/parisc/include/asm/barrier.h new file mode 100644 index 000000000000..dbaaca84f27f --- /dev/null +++ b/arch/parisc/include/asm/barrier.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_BARRIER_H +#define __ASM_BARRIER_H + +#ifndef __ASSEMBLY__ + +/* The synchronize caches instruction executes as a nop on systems in + which all memory references are performed in order. */ +#define synchronize_caches() __asm__ __volatile__ ("sync" : : : "memory") + +#if defined(CONFIG_SMP) +#define mb() do { synchronize_caches(); } while (0) +#define rmb() mb() +#define wmb() mb() +#define dma_rmb() mb() +#define dma_wmb() mb() +#else +#define mb() barrier() +#define rmb() barrier() +#define wmb() barrier() +#define dma_rmb() barrier() +#define dma_wmb() barrier() +#endif + +#define __smp_mb() mb() +#define __smp_rmb() mb() +#define __smp_wmb() mb() + +#include + +#endif /* !__ASSEMBLY__ */ +#endif /* __ASM_BARRIER_H */ diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index 5dc831955de5..13cb2461fef5 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -481,6 +481,8 @@ /* Release pa_tlb_lock lock without reloading lock address. */ .macro tlb_unlock0 spc,tmp #ifdef CONFIG_SMP + or,COND(=) %r0,\spc,%r0 + sync or,COND(=) %r0,\spc,%r0 stw \spc,0(\tmp) #endif diff --git a/arch/parisc/kernel/pacache.S b/arch/parisc/kernel/pacache.S index 16073f472118..b3434a7fd3c9 100644 --- a/arch/parisc/kernel/pacache.S +++ b/arch/parisc/kernel/pacache.S @@ -354,6 +354,7 @@ ENDPROC(flush_data_cache_local) .macro tlb_unlock la,flags,tmp #ifdef CONFIG_SMP ldi 1,\tmp + sync stw \tmp,0(\la) mtsm \flags #endif diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S index 9f22195b90ed..f68eedc72484 100644 --- a/arch/parisc/kernel/syscall.S +++ b/arch/parisc/kernel/syscall.S @@ -631,6 +631,7 @@ cas_action: sub,<> %r28, %r25, %r0 2: stw,ma %r24, 0(%r26) /* Free lock */ + sync stw,ma %r20, 0(%sr2,%r20) #if ENABLE_LWS_DEBUG /* Clear thread register indicator */ @@ -645,6 +646,7 @@ cas_action: 3: /* Error occurred on load or store */ /* Free lock */ + sync stw %r20, 0(%sr2,%r20) #if ENABLE_LWS_DEBUG stw %r0, 4(%sr2,%r20) @@ -846,6 +848,7 @@ cas2_action: cas2_end: /* Free lock */ + sync stw,ma %r20, 0(%sr2,%r20) /* Enable interrupts */ ssm PSW_SM_I, %r0 @@ -856,6 +859,7 @@ cas2_end: 22: /* Error occurred on load or store */ /* Free lock */ + sync stw %r20, 0(%sr2,%r20) ssm PSW_SM_I, %r0 ldo 1(%r0),%r28 From 6b1f6243b39c4f49d44bafa9e4639be4f124577f Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 9 Aug 2018 16:42:16 +0200 Subject: [PATCH 20/59] xen/netfront: don't cache skb_shinfo() commit d472b3a6cf63cd31cae1ed61930f07e6cd6671b5 upstream. skb_shinfo() can change when calling __pskb_pull_tail(): Don't cache its return value. Cc: stable@vger.kernel.org Signed-off-by: Juergen Gross Reviewed-by: Wei Liu Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/xen-netfront.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index bec9f099573b..68d0a5c9d437 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -879,7 +879,6 @@ static RING_IDX xennet_fill_frags(struct netfront_queue *queue, struct sk_buff *skb, struct sk_buff_head *list) { - struct skb_shared_info *shinfo = skb_shinfo(skb); RING_IDX cons = queue->rx.rsp_cons; struct sk_buff *nskb; @@ -888,15 +887,16 @@ static RING_IDX xennet_fill_frags(struct netfront_queue *queue, RING_GET_RESPONSE(&queue->rx, ++cons); skb_frag_t *nfrag = &skb_shinfo(nskb)->frags[0]; - if (shinfo->nr_frags == MAX_SKB_FRAGS) { + if (skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) { unsigned int pull_to = NETFRONT_SKB_CB(skb)->pull_to; BUG_ON(pull_to <= skb_headlen(skb)); __pskb_pull_tail(skb, pull_to - skb_headlen(skb)); } - BUG_ON(shinfo->nr_frags >= MAX_SKB_FRAGS); + BUG_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS); - skb_add_rx_frag(skb, shinfo->nr_frags, skb_frag_page(nfrag), + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, + skb_frag_page(nfrag), rx->offset, rx->status, PAGE_SIZE); skb_shinfo(nskb)->nr_frags = 0; From 277131baccf9c96e01d5ffdb0c6447770b634eae Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 26 Apr 2018 14:10:24 +0200 Subject: [PATCH 21/59] ACPI / LPSS: Add missing prv_offset setting for byt/cht PWM devices commit fdcb613d49321b5bf5d5a1bd0fba8e7c241dcc70 upstream. The LPSS PWM device on on Bay Trail and Cherry Trail devices has a set of private registers at offset 0x800, the current lpss_device_desc for them already sets the LPSS_SAVE_CTX flag to have these saved/restored over device-suspend, but the current lpss_device_desc was not setting the prv_offset field, leading to the regular device registers getting saved/restored instead. This is causing the PWM controller to no longer work, resulting in a black screen, after a suspend/resume on systems where the firmware clears the APB clock and reset bits at offset 0x804. This commit fixes this by properly setting prv_offset to 0x800 for the PWM devices. Cc: stable@vger.kernel.org Fixes: e1c748179754 ("ACPI / LPSS: Add Intel BayTrail ACPI mode PWM") Fixes: 1bfbd8eb8a7f ("ACPI / LPSS: Add ACPI IDs for Intel Braswell") Signed-off-by: Hans de Goede Acked-by: Rafael J . Wysocki Signed-off-by: Thierry Reding Signed-off-by: Sudip Mukherjee Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/acpi_lpss.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c index f9e0d09f7c66..8a0f77fb5181 100644 --- a/drivers/acpi/acpi_lpss.c +++ b/drivers/acpi/acpi_lpss.c @@ -154,10 +154,12 @@ static const struct lpss_device_desc lpt_sdio_dev_desc = { static const struct lpss_device_desc byt_pwm_dev_desc = { .flags = LPSS_SAVE_CTX, + .prv_offset = 0x800, }; static const struct lpss_device_desc bsw_pwm_dev_desc = { .flags = LPSS_SAVE_CTX | LPSS_NO_D3_DELAY, + .prv_offset = 0x800, }; static const struct lpss_device_desc byt_uart_dev_desc = { From 6aef4c4a1690b0b371d88babc41a8a314d0fd3f9 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 2 Aug 2018 10:44:42 -0700 Subject: [PATCH 22/59] scsi: sr: Avoid that opening a CD-ROM hangs with runtime power management enabled commit 1214fd7b497400d200e3f4e64e2338b303a20949 upstream. Surround scsi_execute() calls with scsi_autopm_get_device() and scsi_autopm_put_device(). Note: removing sr_mutex protection from the scsi_cd_get() and scsi_cd_put() calls is safe because the purpose of sr_mutex is to serialize cdrom_*() calls. This patch avoids that complaints similar to the following appear in the kernel log if runtime power management is enabled: INFO: task systemd-udevd:650 blocked for more than 120 seconds. Not tainted 4.18.0-rc7-dbg+ #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. systemd-udevd D28176 650 513 0x00000104 Call Trace: __schedule+0x444/0xfe0 schedule+0x4e/0xe0 schedule_preempt_disabled+0x18/0x30 __mutex_lock+0x41c/0xc70 mutex_lock_nested+0x1b/0x20 __blkdev_get+0x106/0x970 blkdev_get+0x22c/0x5a0 blkdev_open+0xe9/0x100 do_dentry_open.isra.19+0x33e/0x570 vfs_open+0x7c/0xd0 path_openat+0x6e3/0x1120 do_filp_open+0x11c/0x1c0 do_sys_open+0x208/0x2d0 __x64_sys_openat+0x59/0x70 do_syscall_64+0x77/0x230 entry_SYSCALL_64_after_hwframe+0x49/0xbe Signed-off-by: Bart Van Assche Cc: Maurizio Lombardi Cc: Johannes Thumshirn Cc: Alan Stern Cc: Tested-by: Johannes Thumshirn Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/sr.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index de53c9694b68..5dc288fecace 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -520,18 +520,26 @@ static int sr_init_command(struct scsi_cmnd *SCpnt) static int sr_block_open(struct block_device *bdev, fmode_t mode) { struct scsi_cd *cd; + struct scsi_device *sdev; int ret = -ENXIO; + cd = scsi_cd_get(bdev->bd_disk); + if (!cd) + goto out; + + sdev = cd->device; + scsi_autopm_get_device(sdev); check_disk_change(bdev); mutex_lock(&sr_mutex); - cd = scsi_cd_get(bdev->bd_disk); - if (cd) { - ret = cdrom_open(&cd->cdi, bdev, mode); - if (ret) - scsi_cd_put(cd); - } + ret = cdrom_open(&cd->cdi, bdev, mode); mutex_unlock(&sr_mutex); + + scsi_autopm_put_device(sdev); + if (ret) + scsi_cd_put(cd); + +out: return ret; } @@ -559,6 +567,8 @@ static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, if (ret) goto out; + scsi_autopm_get_device(sdev); + /* * Send SCSI addressing ioctls directly to mid level, send other * ioctls to cdrom/block level. @@ -567,15 +577,18 @@ static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case SCSI_IOCTL_GET_IDLUN: case SCSI_IOCTL_GET_BUS_NUMBER: ret = scsi_ioctl(sdev, cmd, argp); - goto out; + goto put; } ret = cdrom_ioctl(&cd->cdi, bdev, mode, cmd, arg); if (ret != -ENOSYS) - goto out; + goto put; ret = scsi_ioctl(sdev, cmd, argp); +put: + scsi_autopm_put_device(sdev); + out: mutex_unlock(&sr_mutex); return ret; From ba744147871e7c6d3b6b60eede06f74a1a7abcd9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 6 Aug 2018 09:03:58 -0400 Subject: [PATCH 23/59] root dentries need RCU-delayed freeing commit 90bad5e05bcdb0308cfa3d3a60f5c0b9c8e2efb3 upstream. Since mountpoint crossing can happen without leaving lazy mode, root dentries do need the same protection against having their memory freed without RCU delay as everything else in the tree. It's partially hidden by RCU delay between detaching from the mount tree and dropping the vfsmount reference, but the starting point of pathwalk can be on an already detached mount, in which case umount-caused RCU delay has already passed by the time the lazy pathwalk grabs rcu_read_lock(). If the starting point happens to be at the root of that vfsmount *and* that vfsmount covers the entire filesystem, we get trouble. Fixes: 48a066e72d97 ("RCU'd vsfmounts") Cc: stable@vger.kernel.org Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/dcache.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 250c1222e30c..807efaab838e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1954,10 +1954,12 @@ struct dentry *d_make_root(struct inode *root_inode) static const struct qstr name = QSTR_INIT("/", 1); res = __d_alloc(root_inode->i_sb, &name); - if (res) + if (res) { + res->d_flags |= DCACHE_RCUACCESS; d_instantiate(res, root_inode); - else + } else { iput(root_inode); + } } return res; } From a3ababd599e72b9b92420c159564684fcbfa489f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 9 Aug 2018 17:21:17 -0400 Subject: [PATCH 24/59] fix mntput/mntput race commit 9ea0a46ca2c318fcc449c1e6b62a7230a17888f1 upstream. mntput_no_expire() does the calculation of total refcount under mount_lock; unfortunately, the decrement (as well as all increments) are done outside of it, leading to false positives in the "are we dropping the last reference" test. Consider the following situation: * mnt is a lazy-umounted mount, kept alive by two opened files. One of those files gets closed. Total refcount of mnt is 2. On CPU 42 mntput(mnt) (called from __fput()) drops one reference, decrementing component * After it has looked at component #0, the process on CPU 0 does mntget(), incrementing component #0, gets preempted and gets to run again - on CPU 69. There it does mntput(), which drops the reference (component #69) and proceeds to spin on mount_lock. * On CPU 42 our first mntput() finishes counting. It observes the decrement of component #69, but not the increment of component #0. As the result, the total it gets is not 1 as it should've been - it's 0. At which point we decide that vfsmount needs to be killed and proceed to free it and shut the filesystem down. However, there's still another opened file on that filesystem, with reference to (now freed) vfsmount, etc. and we are screwed. It's not a wide race, but it can be reproduced with artificial slowdown of the mnt_get_count() loop, and it should be easier to hit on SMP KVM setups. Fix consists of moving the refcount decrement under mount_lock; the tricky part is that we want (and can) keep the fast case (i.e. mount that still has non-NULL ->mnt_ns) entirely out of mount_lock. All places that zero mnt->mnt_ns are dropping some reference to mnt and they call synchronize_rcu() before that mntput(). IOW, if mntput() observes (under rcu_read_lock()) a non-NULL ->mnt_ns, it is guaranteed that there is another reference yet to be dropped. Reported-by: Jann Horn Tested-by: Jann Horn Fixes: 48a066e72d97 ("RCU'd vsfmounts") Cc: stable@vger.kernel.org Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/namespace.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index a879560ea144..643b7eecf7b7 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1124,12 +1124,22 @@ static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); static void mntput_no_expire(struct mount *mnt) { rcu_read_lock(); - mnt_add_count(mnt, -1); - if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ + if (likely(READ_ONCE(mnt->mnt_ns))) { + /* + * Since we don't do lock_mount_hash() here, + * ->mnt_ns can change under us. However, if it's + * non-NULL, then there's a reference that won't + * be dropped until after an RCU delay done after + * turning ->mnt_ns NULL. So if we observe it + * non-NULL under rcu_read_lock(), the reference + * we are dropping is not the final one. + */ + mnt_add_count(mnt, -1); rcu_read_unlock(); return; } lock_mount_hash(); + mnt_add_count(mnt, -1); if (mnt_get_count(mnt)) { rcu_read_unlock(); unlock_mount_hash(); From b9341f5aebd89f46d2cda7dd9c39aabc0a559bdb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 9 Aug 2018 17:51:32 -0400 Subject: [PATCH 25/59] fix __legitimize_mnt()/mntput() race commit 119e1ef80ecfe0d1deb6378d4ab41f5b71519de1 upstream. __legitimize_mnt() has two problems - one is that in case of success the check of mount_lock is not ordered wrt preceding increment of refcount, making it possible to have successful __legitimize_mnt() on one CPU just before the otherwise final mntpu() on another, with __legitimize_mnt() not seeing mntput() taking the lock and mntput() not seeing the increment done by __legitimize_mnt(). Solved by a pair of barriers. Another is that failure of __legitimize_mnt() on the second read_seqretry() leaves us with reference that'll need to be dropped by caller; however, if that races with final mntput() we can end up with caller dropping rcu_read_lock() and doing mntput() to release that reference - with the first mntput() having freed the damn thing just as rcu_read_lock() had been dropped. Solution: in "do mntput() yourself" failure case grab mount_lock, check if MNT_DOOMED has been set by racing final mntput() that has missed our increment and if it has - undo the increment and treat that as "failure, caller doesn't need to drop anything" case. It's not easy to hit - the final mntput() has to come right after the first read_seqretry() in __legitimize_mnt() *and* manage to miss the increment done by __legitimize_mnt() before the second read_seqretry() in there. The things that are almost impossible to hit on bare hardware are not impossible on SMP KVM, though... Reported-by: Oleg Nesterov Fixes: 48a066e72d97 ("RCU'd vsfmounts") Cc: stable@vger.kernel.org Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/namespace.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index 643b7eecf7b7..b56b50e3da11 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -603,12 +603,21 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) return 0; mnt = real_mount(bastard); mnt_add_count(mnt, 1); + smp_mb(); // see mntput_no_expire() if (likely(!read_seqretry(&mount_lock, seq))) return 0; if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { mnt_add_count(mnt, -1); return 1; } + lock_mount_hash(); + if (unlikely(bastard->mnt_flags & MNT_DOOMED)) { + mnt_add_count(mnt, -1); + unlock_mount_hash(); + return 1; + } + unlock_mount_hash(); + /* caller will mntput() */ return -1; } @@ -1139,6 +1148,11 @@ static void mntput_no_expire(struct mount *mnt) return; } lock_mount_hash(); + /* + * make sure that if __legitimize_mnt() has not seen us grab + * mount_lock, we'll see their refcount increment here. + */ + smp_mb(); mnt_add_count(mnt, -1); if (mnt_get_count(mnt)) { rcu_read_unlock(); From 01b377d3f0d286d071f46c30586cb261c79559f7 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 23 May 2018 15:30:30 +0300 Subject: [PATCH 26/59] IB/core: Make testing MR flags for writability a static inline function commit 08bb558ac11ab944e0539e78619d7b4c356278bd upstream. Make the MR writability flags check, which is performed in umem.c, a static inline function in file ib_verbs.h This allows the function to be used by low-level infiniband drivers. Cc: Signed-off-by: Jason Gunthorpe Signed-off-by: Jack Morgenstein Signed-off-by: Leon Romanovsky Signed-off-by: Sudip Mukherjee Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/core/umem.c | 11 +---------- include/rdma/ib_verbs.h | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 6790ebb366dd..98fd9a594841 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -122,16 +122,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem->address = addr; umem->page_size = PAGE_SIZE; umem->pid = get_task_pid(current, PIDTYPE_PID); - /* - * We ask for writable memory if any of the following - * access flags are set. "Local write" and "remote write" - * obviously require write access. "Remote atomic" can do - * things like fetch and add, which will modify memory, and - * "MW bind" can change permissions by binding a window. - */ - umem->writable = !!(access & - (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND)); + umem->writable = ib_access_writable(access); if (access & IB_ACCESS_ON_DEMAND) { put_pid(umem->pid); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 120da1d7f57e..10fefb0dc640 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -3007,6 +3007,20 @@ static inline int ib_check_mr_access(int flags) return 0; } +static inline bool ib_access_writable(int access_flags) +{ + /* + * We have writable memory backing the MR if any of the following + * access flags are set. "Local write" and "remote write" obviously + * require write access. "Remote atomic" can do things like fetch and + * add, which will modify memory, and "MW bind" can change permissions + * by binding a window. + */ + return access_flags & + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND); +} + /** * ib_check_mr_status: lightweight check of MR status. * This routine may provide status checks on a selected From d803aa2fe665f2dca0e46cefca982ad5c537ca7e Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 23 May 2018 15:30:31 +0300 Subject: [PATCH 27/59] IB/mlx4: Mark user MR as writable if actual virtual memory is writable commit d8f9cc328c8888369880e2527e9186d745f2bbf6 upstream. To allow rereg_user_mr to modify the MR from read-only to writable without using get_user_pages again, we needed to define the initial MR as writable. However, this was originally done unconditionally, without taking into account the writability of the underlying virtual memory. As a result, any attempt to register a read-only MR over read-only virtual memory failed. To fix this, do not add the writable flag bit when the user virtual memory is not writable (e.g. const memory). However, when the underlying memory is NOT writable (and we therefore do not define the initial MR as writable), the IB core adds a "force writable" flag to its user-pages request. If this succeeds, the reg_user_mr caller gets a writable copy of the original pages. If the user-space caller then does a rereg_user_mr operation to enable writability, this will succeed. This should not be allowed, since the original virtual memory was not writable. Cc: Fixes: 9376932d0c26 ("IB/mlx4_ib: Add support for user MR re-registration") Signed-off-by: Jason Gunthorpe Signed-off-by: Jack Morgenstein Signed-off-by: Leon Romanovsky Signed-off-by: Sudip Mukherjee Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/mlx4/mr.c | 50 +++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index ce87e9cc7eff..bf52e35dd506 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -130,6 +130,40 @@ out: return err; } +static struct ib_umem *mlx4_get_umem_mr(struct ib_ucontext *context, u64 start, + u64 length, u64 virt_addr, + int access_flags) +{ + /* + * Force registering the memory as writable if the underlying pages + * are writable. This is so rereg can change the access permissions + * from readable to writable without having to run through ib_umem_get + * again + */ + if (!ib_access_writable(access_flags)) { + struct vm_area_struct *vma; + + down_read(¤t->mm->mmap_sem); + /* + * FIXME: Ideally this would iterate over all the vmas that + * cover the memory, but for now it requires a single vma to + * entirely cover the MR to support RO mappings. + */ + vma = find_vma(current->mm, start); + if (vma && vma->vm_end >= start + length && + vma->vm_start <= start) { + if (vma->vm_flags & VM_WRITE) + access_flags |= IB_ACCESS_LOCAL_WRITE; + } else { + access_flags |= IB_ACCESS_LOCAL_WRITE; + } + + up_read(¤t->mm->mmap_sem); + } + + return ib_umem_get(context, start, length, access_flags, 0); +} + struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata) @@ -144,10 +178,8 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mr) return ERR_PTR(-ENOMEM); - /* Force registering the memory as writable. */ - /* Used for memory re-registeration. HCA protects the access */ - mr->umem = ib_umem_get(pd->uobject->context, start, length, - access_flags | IB_ACCESS_LOCAL_WRITE, 0); + mr->umem = mlx4_get_umem_mr(pd->uobject->context, start, length, + virt_addr, access_flags); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); goto err_free; @@ -214,6 +246,9 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, } if (flags & IB_MR_REREG_ACCESS) { + if (ib_access_writable(mr_access_flags) && !mmr->umem->writable) + return -EPERM; + err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry, convert_access(mr_access_flags)); @@ -227,10 +262,9 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); ib_umem_release(mmr->umem); - mmr->umem = ib_umem_get(mr->uobject->context, start, length, - mr_access_flags | - IB_ACCESS_LOCAL_WRITE, - 0); + mmr->umem = + mlx4_get_umem_mr(mr->uobject->context, start, length, + virt_addr, mr_access_flags); if (IS_ERR(mmr->umem)) { err = PTR_ERR(mmr->umem); /* Prevent mlx4_ib_dereg_mr from free'ing invalid pointer */ From 45c679be34ac44ad24bc7abf60193b9f43a83490 Mon Sep 17 00:00:00 2001 From: Michael Mera Date: Mon, 1 May 2017 15:41:16 +0900 Subject: [PATCH 28/59] IB/ocrdma: fix out of bounds access to local buffer commit 062d0f22a30c39840ea49b72cfcfc1aa4cc538fa upstream. In write to debugfs file 'resource_stats' the local buffer 'tmp_str' is written at index 'count-1' where 'count' is the size of the write, so potentially 0. This patch filters odd values for the write size/position to avoid this type of problem. Signed-off-by: Michael Mera Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/ocrdma/ocrdma_stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c index 748b63b86cbc..40242ead096f 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c @@ -643,7 +643,7 @@ static ssize_t ocrdma_dbgfs_ops_write(struct file *filp, struct ocrdma_stats *pstats = filp->private_data; struct ocrdma_dev *dev = pstats->dev; - if (count > 32) + if (*ppos != 0 || count == 0 || count > sizeof(tmp_str)) goto err; if (copy_from_user(tmp_str, buffer, count)) From 916a57896e00d4f92318c8ff5a7b8ca07e4e95a7 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 15 Jun 2018 09:41:29 +0200 Subject: [PATCH 29/59] ARM: dts: imx6sx: fix irq for pcie bridge commit 1bcfe0564044be578841744faea1c2f46adc8178 upstream. Use the correct IRQ line for the MSI controller in the PCIe host controller. Apparently a different IRQ line is used compared to other i.MX6 variants. Without this change MSI IRQs aren't properly propagated to the upstream interrupt controller. Signed-off-by: Oleksij Rempel Reviewed-by: Lucas Stach Fixes: b1d17f68e5c5 ("ARM: dts: imx: add initial imx6sx device tree source") Signed-off-by: Shawn Guo Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/imx6sx.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/imx6sx.dtsi b/arch/arm/boot/dts/imx6sx.dtsi index 167f77b3bd43..6963dff815dc 100644 --- a/arch/arm/boot/dts/imx6sx.dtsi +++ b/arch/arm/boot/dts/imx6sx.dtsi @@ -1250,7 +1250,7 @@ /* non-prefetchable memory */ 0x82000000 0 0x08000000 0x08000000 0 0x00f00000>; num-lanes = <1>; - interrupts = ; + interrupts = ; clocks = <&clks IMX6SX_CLK_PCIE_REF_125M>, <&clks IMX6SX_CLK_PCIE_AXI>, <&clks IMX6SX_CLK_LVDS1_OUT>, From 8dbce8a2e9cfc8e026565d75f7cb950393d04159 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Aug 2018 16:41:39 +0200 Subject: [PATCH 30/59] x86/paravirt: Fix spectre-v2 mitigations for paravirt guests commit 5800dc5c19f34e6e03b5adab1282535cb102fafd upstream. Nadav reported that on guests we're failing to rewrite the indirect calls to CALLEE_SAVE paravirt functions. In particular the pv_queued_spin_unlock() call is left unpatched and that is all over the place. This obviously wrecks Spectre-v2 mitigation (for paravirt guests) which relies on not actually having indirect calls around. The reason is an incorrect clobber test in paravirt_patch_call(); this function rewrites an indirect call with a direct call to the _SAME_ function, there is no possible way the clobbers can be different because of this. Therefore remove this clobber check. Also put WARNs on the other patch failure case (not enough room for the instruction) which I've not seen trigger in my (limited) testing. Three live kernel image disassemblies for lock_sock_nested (as a small function that illustrates the problem nicely). PRE is the current situation for guests, POST is with this patch applied and NATIVE is with or without the patch for !guests. PRE: (gdb) disassemble lock_sock_nested Dump of assembler code for function lock_sock_nested: 0xffffffff817be970 <+0>: push %rbp 0xffffffff817be971 <+1>: mov %rdi,%rbp 0xffffffff817be974 <+4>: push %rbx 0xffffffff817be975 <+5>: lea 0x88(%rbp),%rbx 0xffffffff817be97c <+12>: callq 0xffffffff819f7160 <_cond_resched> 0xffffffff817be981 <+17>: mov %rbx,%rdi 0xffffffff817be984 <+20>: callq 0xffffffff819fbb00 <_raw_spin_lock_bh> 0xffffffff817be989 <+25>: mov 0x8c(%rbp),%eax 0xffffffff817be98f <+31>: test %eax,%eax 0xffffffff817be991 <+33>: jne 0xffffffff817be9ba 0xffffffff817be993 <+35>: movl $0x1,0x8c(%rbp) 0xffffffff817be99d <+45>: mov %rbx,%rdi 0xffffffff817be9a0 <+48>: callq *0xffffffff822299e8 0xffffffff817be9a7 <+55>: pop %rbx 0xffffffff817be9a8 <+56>: pop %rbp 0xffffffff817be9a9 <+57>: mov $0x200,%esi 0xffffffff817be9ae <+62>: mov $0xffffffff817be993,%rdi 0xffffffff817be9b5 <+69>: jmpq 0xffffffff81063ae0 <__local_bh_enable_ip> 0xffffffff817be9ba <+74>: mov %rbp,%rdi 0xffffffff817be9bd <+77>: callq 0xffffffff817be8c0 <__lock_sock> 0xffffffff817be9c2 <+82>: jmp 0xffffffff817be993 End of assembler dump. POST: (gdb) disassemble lock_sock_nested Dump of assembler code for function lock_sock_nested: 0xffffffff817be970 <+0>: push %rbp 0xffffffff817be971 <+1>: mov %rdi,%rbp 0xffffffff817be974 <+4>: push %rbx 0xffffffff817be975 <+5>: lea 0x88(%rbp),%rbx 0xffffffff817be97c <+12>: callq 0xffffffff819f7160 <_cond_resched> 0xffffffff817be981 <+17>: mov %rbx,%rdi 0xffffffff817be984 <+20>: callq 0xffffffff819fbb00 <_raw_spin_lock_bh> 0xffffffff817be989 <+25>: mov 0x8c(%rbp),%eax 0xffffffff817be98f <+31>: test %eax,%eax 0xffffffff817be991 <+33>: jne 0xffffffff817be9ba 0xffffffff817be993 <+35>: movl $0x1,0x8c(%rbp) 0xffffffff817be99d <+45>: mov %rbx,%rdi 0xffffffff817be9a0 <+48>: callq 0xffffffff810a0c20 <__raw_callee_save___pv_queued_spin_unlock> 0xffffffff817be9a5 <+53>: xchg %ax,%ax 0xffffffff817be9a7 <+55>: pop %rbx 0xffffffff817be9a8 <+56>: pop %rbp 0xffffffff817be9a9 <+57>: mov $0x200,%esi 0xffffffff817be9ae <+62>: mov $0xffffffff817be993,%rdi 0xffffffff817be9b5 <+69>: jmpq 0xffffffff81063aa0 <__local_bh_enable_ip> 0xffffffff817be9ba <+74>: mov %rbp,%rdi 0xffffffff817be9bd <+77>: callq 0xffffffff817be8c0 <__lock_sock> 0xffffffff817be9c2 <+82>: jmp 0xffffffff817be993 End of assembler dump. NATIVE: (gdb) disassemble lock_sock_nested Dump of assembler code for function lock_sock_nested: 0xffffffff817be970 <+0>: push %rbp 0xffffffff817be971 <+1>: mov %rdi,%rbp 0xffffffff817be974 <+4>: push %rbx 0xffffffff817be975 <+5>: lea 0x88(%rbp),%rbx 0xffffffff817be97c <+12>: callq 0xffffffff819f7160 <_cond_resched> 0xffffffff817be981 <+17>: mov %rbx,%rdi 0xffffffff817be984 <+20>: callq 0xffffffff819fbb00 <_raw_spin_lock_bh> 0xffffffff817be989 <+25>: mov 0x8c(%rbp),%eax 0xffffffff817be98f <+31>: test %eax,%eax 0xffffffff817be991 <+33>: jne 0xffffffff817be9ba 0xffffffff817be993 <+35>: movl $0x1,0x8c(%rbp) 0xffffffff817be99d <+45>: mov %rbx,%rdi 0xffffffff817be9a0 <+48>: movb $0x0,(%rdi) 0xffffffff817be9a3 <+51>: nopl 0x0(%rax) 0xffffffff817be9a7 <+55>: pop %rbx 0xffffffff817be9a8 <+56>: pop %rbp 0xffffffff817be9a9 <+57>: mov $0x200,%esi 0xffffffff817be9ae <+62>: mov $0xffffffff817be993,%rdi 0xffffffff817be9b5 <+69>: jmpq 0xffffffff81063ae0 <__local_bh_enable_ip> 0xffffffff817be9ba <+74>: mov %rbp,%rdi 0xffffffff817be9bd <+77>: callq 0xffffffff817be8c0 <__lock_sock> 0xffffffff817be9c2 <+82>: jmp 0xffffffff817be993 End of assembler dump. Fixes: 63f70270ccd9 ("[PATCH] i386: PARAVIRT: add common patching machinery") Fixes: 3010a0663fd9 ("x86/paravirt, objtool: Annotate indirect calls") Reported-by: Nadav Amit Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Juergen Gross Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: David Woodhouse Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/paravirt.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index f534a0e3af53..632195b41688 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -97,10 +97,12 @@ unsigned paravirt_patch_call(void *insnbuf, struct branch *b = insnbuf; unsigned long delta = (unsigned long)target - (addr+5); - if (tgt_clobbers & ~site_clobbers) - return len; /* target would clobber too much for this site */ - if (len < 5) + if (len < 5) { +#ifdef CONFIG_RETPOLINE + WARN_ONCE("Failing to patch indirect CALL in %ps\n", (void *)addr); +#endif return len; /* call too long for patch site */ + } b->opcode = 0xe8; /* call */ b->delta = delta; @@ -115,8 +117,12 @@ unsigned paravirt_patch_jmp(void *insnbuf, const void *target, struct branch *b = insnbuf; unsigned long delta = (unsigned long)target - (addr+5); - if (len < 5) + if (len < 5) { +#ifdef CONFIG_RETPOLINE + WARN_ONCE("Failing to patch indirect JMP in %ps\n", (void *)addr); +#endif return len; /* call too long for patch site */ + } b->opcode = 0xe9; /* jmp */ b->delta = delta; From 7744abbe29a59db367f59b0c9890356732f25a3b Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 26 Jul 2018 13:14:55 +0200 Subject: [PATCH 31/59] x86/speculation: Protect against userspace-userspace spectreRSB commit fdf82a7856b32d905c39afc85e34364491e46346 upstream. The article "Spectre Returns! Speculation Attacks using the Return Stack Buffer" [1] describes two new (sub-)variants of spectrev2-like attacks, making use solely of the RSB contents even on CPUs that don't fallback to BTB on RSB underflow (Skylake+). Mitigate userspace-userspace attacks by always unconditionally filling RSB on context switch when the generic spectrev2 mitigation has been enabled. [1] https://arxiv.org/pdf/1807.07940.pdf Signed-off-by: Jiri Kosina Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Acked-by: Tim Chen Cc: Konrad Rzeszutek Wilk Cc: Borislav Petkov Cc: David Woodhouse Cc: Peter Zijlstra Cc: Linus Torvalds Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/nycvar.YFH.7.76.1807261308190.997@cbobk.fhfr.pm Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 38 +++++++------------------------------- 1 file changed, 7 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 12a8867071f3..7688ce0b26c5 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -309,23 +309,6 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return cmd; } -/* Check for Skylake-like CPUs (for RSB handling) */ -static bool __init is_skylake_era(void) -{ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 6) { - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_SKYLAKE_MOBILE: - case INTEL_FAM6_SKYLAKE_DESKTOP: - case INTEL_FAM6_SKYLAKE_X: - case INTEL_FAM6_KABYLAKE_MOBILE: - case INTEL_FAM6_KABYLAKE_DESKTOP: - return true; - } - } - return false; -} - static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -386,22 +369,15 @@ retpoline_auto: pr_info("%s\n", spectre_v2_strings[mode]); /* - * If neither SMEP nor PTI are available, there is a risk of - * hitting userspace addresses in the RSB after a context switch - * from a shallow call stack to a deeper one. To prevent this fill - * the entire RSB, even when using IBRS. + * If spectre v2 protection has been enabled, unconditionally fill + * RSB during a context switch; this protects against two independent + * issues: * - * Skylake era CPUs have a separate issue with *underflow* of the - * RSB, when they will predict 'ret' targets from the generic BTB. - * The proper mitigation for this is IBRS. If IBRS is not supported - * or deactivated in favour of retpolines the RSB fill on context - * switch is required. + * - RSB underflow (and switch to BTB) on Skylake+ + * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs */ - if ((!boot_cpu_has(X86_FEATURE_KAISER) && - !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) { - setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); - pr_info("Spectre v2 mitigation: Filling RSB on context switch\n"); - } + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); + pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); /* Initialize Indirect Branch Prediction Barrier if supported */ if (boot_cpu_has(X86_FEATURE_IBPB)) { From 866234c373a0f34774d5dcb3886a6c982397bbc9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 28 Apr 2018 21:37:03 +0900 Subject: [PATCH 32/59] kprobes/x86: Fix %p uses in error messages commit 0ea063306eecf300fcf06d2f5917474b580f666f upstream. Remove all %p uses in error messages in kprobes/x86. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Arnd Bergmann Cc: David Howells Cc: David S . Miller Cc: Heiko Carstens Cc: Jon Medhurst Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thomas Richter Cc: Tobin C . Harding Cc: Will Deacon Cc: acme@kernel.org Cc: akpm@linux-foundation.org Cc: brueckner@linux.vnet.ibm.com Cc: linux-arch@vger.kernel.org Cc: rostedt@goodmis.org Cc: schwidefsky@de.ibm.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/lkml/152491902310.9916.13355297638917767319.stgit@devbox Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/kprobes/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 1f5c47a49e35..c6f466d6cc57 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -393,7 +393,6 @@ int __copy_instruction(u8 *dest, u8 *src) newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest; if ((s64) (s32) newdisp != newdisp) { pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp); - pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", src, dest, insn.displacement.value); return 0; } disp = (u8 *) dest + insn_offset_displacement(&insn); @@ -609,8 +608,7 @@ static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs, * Raise a BUG or we'll continue in an endless reentering loop * and eventually a stack overflow. */ - printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n", - p->addr); + pr_err("Unrecoverable kprobe detected.\n"); dump_kprobe(p); BUG(); default: From ec5aa64fec7206537442a2f3cb67decabad252f4 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 3 Aug 2018 10:05:50 -0700 Subject: [PATCH 33/59] x86/irqflags: Provide a declaration for native_save_fl commit 208cbb32558907f68b3b2a081ca2337ac3744794 upstream. It was reported that the commit d0a8d9378d16 is causing users of gcc < 4.9 to observe -Werror=missing-prototypes errors. Indeed, it seems that: extern inline unsigned long native_save_fl(void) { return 0; } compiled with -Werror=missing-prototypes produces this warning in gcc < 4.9, but not gcc >= 4.9. Fixes: d0a8d9378d16 ("x86/paravirt: Make native_save_fl() extern inline"). Reported-by: David Laight Reported-by: Jean Delvare Signed-off-by: Nick Desaulniers Signed-off-by: Thomas Gleixner Cc: hpa@zytor.com Cc: jgross@suse.com Cc: kstewart@linuxfoundation.org Cc: gregkh@linuxfoundation.org Cc: boris.ostrovsky@oracle.com Cc: astrachan@google.com Cc: mka@chromium.org Cc: arnd@arndb.de Cc: tstellar@redhat.com Cc: sedat.dilek@gmail.com Cc: David.Laight@aculab.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180803170550.164688-1-ndesaulniers@google.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/irqflags.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 0056bc945cd1..cb7f04981c6b 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -8,6 +8,8 @@ * Interrupt control: */ +/* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */ +extern inline unsigned long native_save_fl(void); extern inline unsigned long native_save_fl(void) { unsigned long flags; From 90a231c63cc28d896ab353b027011a949e9884d3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 13 Jun 2018 15:48:21 -0700 Subject: [PATCH 34/59] x86/speculation/l1tf: Increase 32bit PAE __PHYSICAL_PAGE_SHIFT commit 50896e180c6aa3a9c61a26ced99e15d602666a4c upstream L1 Terminal Fault (L1TF) is a speculation related vulnerability. The CPU speculates on PTE entries which do not have the PRESENT bit set, if the content of the resulting physical address is available in the L1D cache. The OS side mitigation makes sure that a !PRESENT PTE entry points to a physical address outside the actually existing and cachable memory space. This is achieved by inverting the upper bits of the PTE. Due to the address space limitations this only works for 64bit and 32bit PAE kernels, but not for 32bit non PAE. This mitigation applies to both host and guest kernels, but in case of a 64bit host (hypervisor) and a 32bit PAE guest, inverting the upper bits of the PAE address space (44bit) is not enough if the host has more than 43 bits of populated memory address space, because the speculation treats the PTE content as a physical host address bypassing EPT. The host (hypervisor) protects itself against the guest by flushing L1D as needed, but pages inside the guest are not protected against attacks from other processes inside the same guest. For the guest the inverted PTE mask has to match the host to provide the full protection for all pages the host could possibly map into the guest. The hosts populated address space is not known to the guest, so the mask must cover the possible maximal host address space, i.e. 52 bit. On 32bit PAE the maximum PTE mask is currently set to 44 bit because that is the limit imposed by 32bit unsigned long PFNs in the VMs. This limits the mask to be below what the host could possible use for physical pages. The L1TF PROT_NONE protection code uses the PTE masks to determine which bits to invert to make sure the higher bits are set for unmapped entries to prevent L1TF speculation attacks against EPT inside guests. In order to invert all bits that could be used by the host, increase __PHYSICAL_PAGE_SHIFT to 52 to match 64bit. The real limit for a 32bit PAE kernel is still 44 bits because all Linux PTEs are created from unsigned long PFNs, so they cannot be higher than 44 bits on a 32bit kernel. So these extra PFN bits should be never set. The only users of this macro are using it to look at PTEs, so it's safe. [ tglx: Massaged changelog ] Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Acked-by: Michal Hocko Acked-by: Dave Hansen Signed-off-by: David Woodhouse Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/page_32_types.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index 3a52ee0e726d..bfceb5cc6347 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -27,8 +27,13 @@ #define N_EXCEPTION_STACKS 1 #ifdef CONFIG_X86_PAE -/* 44=32+12, the limit we can fit into an unsigned long pfn */ -#define __PHYSICAL_MASK_SHIFT 44 +/* + * This is beyond the 44 bit limit imposed by the 32bit long pfns, + * but we need the full mask to make sure inverted PROT_NONE + * entries have all the host bits set in a guest. + * The real limit is still 44 bits. + */ +#define __PHYSICAL_MASK_SHIFT 52 #define __VIRTUAL_MASK_SHIFT 32 #else /* !CONFIG_X86_PAE */ From 0a5deacaac102f451bf8c1fb9d007047fcd712f6 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 7 Jul 2016 17:19:11 -0700 Subject: [PATCH 35/59] x86/mm: Move swap offset/type up in PTE to work around erratum commit 00839ee3b299303c6a5e26a0a2485427a3afcbbf upstream This erratum can result in Accessed/Dirty getting set by the hardware when we do not expect them to be (on !Present PTEs). Instead of trying to fix them up after this happens, we just allow the bits to get set and try to ignore them. We do this by shifting the layout of the bits we use for swap offset/type in our 64-bit PTEs. It looks like this: bitnrs: | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| names: | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| before: | OFFSET (9-63) |0|X|X| TYPE(1-5) |0| after: | OFFSET (14-63) | TYPE (9-13) |0|X|X|X| X| X|X|X|0| Note that D was already a don't care (X) even before. We just move TYPE up and turn its old spot (which could be hit by the A bit) into all don't cares. We take 5 bits away from the offset, but that still leaves us with 50 bits which lets us index into a 62-bit swapfile (4 EiB). I think that's probably fine for the moment. We could theoretically reclaim 5 of the bits (1, 2, 3, 4, 7) but it doesn't gain us anything. Signed-off-by: Dave Hansen Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: dave.hansen@intel.com Cc: linux-mm@kvack.org Cc: mhocko@suse.com Link: http://lkml.kernel.org/r/20160708001911.9A3FD2B6@viggo.jf.intel.com Signed-off-by: Ingo Molnar Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pgtable_64.h | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index c810226e741a..225405b690b8 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -163,18 +163,32 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) #define pte_unmap(pte) ((void)(pte))/* NOP */ -/* Encode and de-code a swap entry */ +/* + * Encode and de-code a swap entry + * + * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number + * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names + * | OFFSET (14->63) | TYPE (10-13) |0|X|X|X| X| X|X|X|0| <- swp entry + * + * G (8) is aliased and used as a PROT_NONE indicator for + * !present ptes. We need to start storing swap entries above + * there. We also need to avoid using A and D because of an + * erratum where they can be incorrectly set by hardware on + * non-present PTEs. + */ +#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) #define SWP_TYPE_BITS 5 -#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) +/* Place the offset above the type: */ +#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS + 1) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) -#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ +#define __swp_type(x) (((x).val >> (SWP_TYPE_FIRST_BIT)) \ & ((1U << SWP_TYPE_BITS) - 1)) -#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) +#define __swp_offset(x) ((x).val >> SWP_OFFSET_FIRST_BIT) #define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << (_PAGE_BIT_PRESENT + 1)) \ - | ((offset) << SWP_OFFSET_SHIFT) }) + ((type) << (SWP_TYPE_FIRST_BIT)) \ + | ((offset) << SWP_OFFSET_FIRST_BIT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) From f487cf69cf1456ceb34857a50474373aae42dd8a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 10 Aug 2016 10:23:25 -0700 Subject: [PATCH 36/59] x86/mm: Fix swap entry comment and macro commit ace7fab7a6cdd363a615ec537f2aa94dbc761ee2 upstream A recent patch changed the format of a swap PTE. The comment explaining the format of the swap PTE is wrong about the bits used for the swap type field. Amusingly, the ASCII art and the patch description are correct, but the comment itself is wrong. As I was looking at this, I also noticed that the SWP_OFFSET_FIRST_BIT has an off-by-one error. This does not really hurt anything. It just wasted a bit of space in the PTE, giving us 2^59 bytes of addressable space in our swapfiles instead of 2^60. But, it doesn't match with the comments, and it wastes a bit of space, so fix it. Signed-off-by: Dave Hansen Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Fixes: 00839ee3b299 ("x86/mm: Move swap offset/type up in PTE to work around erratum") Link: http://lkml.kernel.org/r/20160810172325.E56AD7DA@viggo.jf.intel.com Signed-off-by: Ingo Molnar Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pgtable_64.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 225405b690b8..ce97c8c6a310 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -168,7 +168,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } * * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names - * | OFFSET (14->63) | TYPE (10-13) |0|X|X|X| X| X|X|X|0| <- swp entry + * | OFFSET (14->63) | TYPE (9-13) |0|X|X|X| X| X|X|X|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above @@ -179,7 +179,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) #define SWP_TYPE_BITS 5 /* Place the offset above the type: */ -#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS + 1) +#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) From 86b0948d7c546feb01cd2d7ac2bfb15476e6e974 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 8 Sep 2017 16:10:46 -0700 Subject: [PATCH 37/59] mm: x86: move _PAGE_SWP_SOFT_DIRTY from bit 7 to bit 1 commit eee4818baac0f2b37848fdf90e4b16430dc536ac upstream _PAGE_PSE is used to distinguish between a truly non-present (_PAGE_PRESENT=0) PMD, and a PMD which is undergoing a THP split and should be treated as present. But _PAGE_SWP_SOFT_DIRTY currently uses the _PAGE_PSE bit, which would cause confusion between one of those PMDs undergoing a THP split, and a soft-dirty PMD. Dropping _PAGE_PSE check in pmd_present() does not work well, because it can hurt optimization of tlb handling in thp split. Thus, we need to move the bit. In the current kernel, bits 1-4 are not used in non-present format since commit 00839ee3b299 ("x86/mm: Move swap offset/type up in PTE to work around erratum"). So let's move _PAGE_SWP_SOFT_DIRTY to bit 1. Bit 7 is used as reserved (always clear), so please don't use it for other purpose. [dwmw2: Pulled in to 4.9 backport to support L1TF changes] Link: http://lkml.kernel.org/r/20170717193955.20207-3-zi.yan@sent.com Signed-off-by: Naoya Horiguchi Signed-off-by: Zi Yan Acked-by: Dave Hansen Cc: "H. Peter Anvin" Cc: Anshuman Khandual Cc: David Nellans Cc: Ingo Molnar Cc: Kirill A. Shutemov Cc: Mel Gorman Cc: Minchan Kim Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: David Woodhouse Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pgtable_64.h | 12 +++++++++--- arch/x86/include/asm/pgtable_types.h | 10 +++++----- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index ce97c8c6a310..008e1a58f96c 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -166,15 +166,21 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* * Encode and de-code a swap entry * - * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number - * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names - * | OFFSET (14->63) | TYPE (9-13) |0|X|X|X| X| X|X|X|0| <- swp entry + * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number + * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names + * | OFFSET (14->63) | TYPE (9-13) |0|0|X|X| X| X|X|SD|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above * there. We also need to avoid using A and D because of an * erratum where they can be incorrectly set by hardware on * non-present PTEs. + * + * SD (1) in swp entry is used to store soft dirty bit, which helps us + * remember soft dirty over page migration + * + * Bit 7 in swp entry should be 0 because pmd_present checks not only P, + * but also L and G. */ #define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) #define SWP_TYPE_BITS 5 diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 8dba273da25a..7572ce32055e 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -70,15 +70,15 @@ /* * Tracking soft dirty bit when a page goes to a swap is tricky. * We need a bit which can be stored in pte _and_ not conflict - * with swap entry format. On x86 bits 6 and 7 are *not* involved - * into swap entry computation, but bit 6 is used for nonlinear - * file mapping, so we borrow bit 7 for soft dirty tracking. + * with swap entry format. On x86 bits 1-4 are *not* involved + * into swap entry computation, but bit 7 is used for thp migration, + * so we borrow bit 1 for soft dirty tracking. * * Please note that this bit must be treated as swap dirty page - * mark if and only if the PTE has present bit clear! + * mark if and only if the PTE/PMD has present bit clear! */ #ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE +#define _PAGE_SWP_SOFT_DIRTY _PAGE_RW #else #define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) #endif From 614f5e84640e382b9916b6f606328191ed0264b3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 13 Jun 2018 15:48:22 -0700 Subject: [PATCH 38/59] x86/speculation/l1tf: Change order of offset/type in swap entry commit bcd11afa7adad8d720e7ba5ef58bdcd9775cf45f upstream If pages are swapped out, the swap entry is stored in the corresponding PTE, which has the Present bit cleared. CPUs vulnerable to L1TF speculate on PTE entries which have the present bit set and would treat the swap entry as phsyical address (PFN). To mitigate that the upper bits of the PTE must be set so the PTE points to non existent memory. The swap entry stores the type and the offset of a swapped out page in the PTE. type is stored in bit 9-13 and offset in bit 14-63. The hardware ignores the bits beyond the phsyical address space limit, so to make the mitigation effective its required to start 'offset' at the lowest possible bit so that even large swap offsets do not reach into the physical address space limit bits. Move offset to bit 9-58 and type to bit 59-63 which are the bits that hardware generally doesn't care about. That, in turn, means that if you on desktop chip with only 40 bits of physical addressing, now that the offset starts at bit 9, there needs to be 30 bits of offset actually *in use* until bit 39 ends up being set, which means when inverted it will again point into existing memory. So that's 4 terabyte of swap space (because the offset is counted in pages, so 30 bits of offset is 42 bits of actual coverage). With bigger physical addressing, that obviously grows further, until the limit of the offset is hit (at 50 bits of offset - 62 bits of actual swap file coverage). This is a preparatory change for the actual swap entry inversion to protect against L1TF. [ AK: Updated description and minor tweaks. Split into two parts ] [ tglx: Massaged changelog ] Signed-off-by: Linus Torvalds Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Tested-by: Andi Kleen Reviewed-by: Josh Poimboeuf Acked-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Dave Hansen Signed-off-by: David Woodhouse Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pgtable_64.h | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 008e1a58f96c..a72c2ab24006 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -168,7 +168,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } * * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names - * | OFFSET (14->63) | TYPE (9-13) |0|0|X|X| X| X|X|SD|0| <- swp entry + * | TYPE (59-63) | OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above @@ -182,19 +182,28 @@ static inline int pgd_large(pgd_t pgd) { return 0; } * Bit 7 in swp entry should be 0 because pmd_present checks not only P, * but also L and G. */ -#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) -#define SWP_TYPE_BITS 5 -/* Place the offset above the type: */ -#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS) +#define SWP_TYPE_BITS 5 + +#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) + +/* We always extract/encode the offset by shifting it all the way up, and then down again */ +#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) -#define __swp_type(x) (((x).val >> (SWP_TYPE_FIRST_BIT)) \ - & ((1U << SWP_TYPE_BITS) - 1)) -#define __swp_offset(x) ((x).val >> SWP_OFFSET_FIRST_BIT) -#define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << (SWP_TYPE_FIRST_BIT)) \ - | ((offset) << SWP_OFFSET_FIRST_BIT) }) +/* Extract the high bits for type */ +#define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS)) + +/* Shift up (to get rid of type), then down to get value */ +#define __swp_offset(x) ((x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT) + +/* + * Shift the offset up "too far" by TYPE bits, then down again + */ +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \ + | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) }) + #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) From 9bbdab847fc9a0b8cf23fa7354e1210f0b492821 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 13 Jun 2018 15:48:23 -0700 Subject: [PATCH 39/59] x86/speculation/l1tf: Protect swap entries against L1TF commit 2f22b4cd45b67b3496f4aa4c7180a1271c6452f6 upstream With L1 terminal fault the CPU speculates into unmapped PTEs, and resulting side effects allow to read the memory the PTE is pointing too, if its values are still in the L1 cache. For swapped out pages Linux uses unmapped PTEs and stores a swap entry into them. To protect against L1TF it must be ensured that the swap entry is not pointing to valid memory, which requires setting higher bits (between bit 36 and bit 45) that are inside the CPUs physical address space, but outside any real memory. To do this invert the offset to make sure the higher bits are always set, as long as the swap file is not too big. Note there is no workaround for 32bit !PAE, or on systems which have more than MAX_PA/2 worth of memory. The later case is very unlikely to happen on real systems. [AK: updated description and minor tweaks by. Split out from the original patch ] Signed-off-by: Linus Torvalds Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Tested-by: Andi Kleen Reviewed-by: Josh Poimboeuf Acked-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Dave Hansen Signed-off-by: David Woodhouse Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pgtable_64.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index a72c2ab24006..67f2fe43a593 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -168,7 +168,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } * * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names - * | TYPE (59-63) | OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry + * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above @@ -181,6 +181,9 @@ static inline int pgd_large(pgd_t pgd) { return 0; } * * Bit 7 in swp entry should be 0 because pmd_present checks not only P, * but also L and G. + * + * The offset is inverted by a binary not operation to make the high + * physical bits set. */ #define SWP_TYPE_BITS 5 @@ -195,13 +198,15 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS)) /* Shift up (to get rid of type), then down to get value */ -#define __swp_offset(x) ((x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT) +#define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT) /* * Shift the offset up "too far" by TYPE bits, then down again + * The offset is inverted by a binary not operation to make the high + * physical bits set. */ #define __swp_entry(type, offset) ((swp_entry_t) { \ - ((unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \ + (~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \ | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) From 9ee2d2da676c48a459a99f10f45c71ffca8761a8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 13 Jun 2018 15:48:24 -0700 Subject: [PATCH 40/59] x86/speculation/l1tf: Protect PROT_NONE PTEs against speculation commit 6b28baca9b1f0d4a42b865da7a05b1c81424bd5c upstream When PTEs are set to PROT_NONE the kernel just clears the Present bit and preserves the PFN, which creates attack surface for L1TF speculation speculation attacks. This is important inside guests, because L1TF speculation bypasses physical page remapping. While the host has its own migitations preventing leaking data from other VMs into the guest, this would still risk leaking the wrong page inside the current guest. This uses the same technique as Linus' swap entry patch: while an entry is is in PROTNONE state invert the complete PFN part part of it. This ensures that the the highest bit will point to non existing memory. The invert is done by pte/pmd_modify and pfn/pmd/pud_pte for PROTNONE and pte/pmd/pud_pfn undo it. This assume that no code path touches the PFN part of a PTE directly without using these primitives. This doesn't handle the case that MMIO is on the top of the CPU physical memory. If such an MMIO region was exposed by an unpriviledged driver for mmap it would be possible to attack some real memory. However this situation is all rather unlikely. For 32bit non PAE the inversion is not done because there are really not enough bits to protect anything. Q: Why does the guest need to be protected when the HyperVisor already has L1TF mitigations? A: Here's an example: Physical pages 1 2 get mapped into a guest as GPA 1 -> PA 2 GPA 2 -> PA 1 through EPT. The L1TF speculation ignores the EPT remapping. Now the guest kernel maps GPA 1 to process A and GPA 2 to process B, and they belong to different users and should be isolated. A sets the GPA 1 PA 2 PTE to PROT_NONE to bypass the EPT remapping and gets read access to the underlying physical page. Which in this case points to PA 2, so it can read process B's data, if it happened to be in L1, so isolation inside the guest is broken. There's nothing the hypervisor can do about this. This mitigation has to be done in the guest itself. [ tglx: Massaged changelog ] [ dwmw2: backported to 4.9 ] Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Acked-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Dave Hansen Signed-off-by: David Woodhouse Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pgtable-2level.h | 17 ++++++++++++ arch/x86/include/asm/pgtable-3level.h | 2 ++ arch/x86/include/asm/pgtable-invert.h | 32 ++++++++++++++++++++++ arch/x86/include/asm/pgtable.h | 38 +++++++++++++++++++-------- arch/x86/include/asm/pgtable_64.h | 2 ++ 5 files changed, 80 insertions(+), 11 deletions(-) create mode 100644 arch/x86/include/asm/pgtable-invert.h diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index fd74a11959de..89c50332a71e 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -77,4 +77,21 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) +/* No inverted PFNs on 2 level page tables */ + +static inline u64 protnone_mask(u64 val) +{ + return 0; +} + +static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask) +{ + return val; +} + +static inline bool __pte_needs_invert(u64 val) +{ + return false; +} + #endif /* _ASM_X86_PGTABLE_2LEVEL_H */ diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index cdaa58c9b39e..0c89891c7b44 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -184,4 +184,6 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) #define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) +#include + #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ diff --git a/arch/x86/include/asm/pgtable-invert.h b/arch/x86/include/asm/pgtable-invert.h new file mode 100644 index 000000000000..177564187fc0 --- /dev/null +++ b/arch/x86/include/asm/pgtable-invert.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_PGTABLE_INVERT_H +#define _ASM_PGTABLE_INVERT_H 1 + +#ifndef __ASSEMBLY__ + +static inline bool __pte_needs_invert(u64 val) +{ + return (val & (_PAGE_PRESENT|_PAGE_PROTNONE)) == _PAGE_PROTNONE; +} + +/* Get a mask to xor with the page table entry to get the correct pfn. */ +static inline u64 protnone_mask(u64 val) +{ + return __pte_needs_invert(val) ? ~0ull : 0; +} + +static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask) +{ + /* + * When a PTE transitions from NONE to !NONE or vice-versa + * invert the PFN part to stop speculation. + * pte_pfn undoes this when needed. + */ + if (__pte_needs_invert(oldval) != __pte_needs_invert(val)) + val = (val & ~mask) | (~val & mask); + return val; +} + +#endif /* __ASSEMBLY__ */ + +#endif diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 84c62d950023..2ed1556d99b1 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -148,19 +148,29 @@ static inline int pte_special(pte_t pte) return pte_flags(pte) & _PAGE_SPECIAL; } +/* Entries that were set to PROT_NONE are inverted */ + +static inline u64 protnone_mask(u64 val); + static inline unsigned long pte_pfn(pte_t pte) { - return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; + unsigned long pfn = pte_val(pte); + pfn ^= protnone_mask(pfn); + return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT; } static inline unsigned long pmd_pfn(pmd_t pmd) { - return (pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; + unsigned long pfn = pmd_val(pmd); + pfn ^= protnone_mask(pfn); + return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; } static inline unsigned long pud_pfn(pud_t pud) { - return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT; + unsigned long pfn = pud_val(pud); + pfn ^= protnone_mask(pfn); + return (pfn & pud_pfn_mask(pud)) >> PAGE_SHIFT; } #define pte_page(pte) pfn_to_page(pte_pfn(pte)) @@ -359,19 +369,25 @@ static inline pgprotval_t massage_pgprot(pgprot_t pgprot) static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { - return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | - massage_pgprot(pgprot)); + phys_addr_t pfn = page_nr << PAGE_SHIFT; + pfn ^= protnone_mask(pgprot_val(pgprot)); + pfn &= PTE_PFN_MASK; + return __pte(pfn | massage_pgprot(pgprot)); } static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) { - return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | - massage_pgprot(pgprot)); + phys_addr_t pfn = page_nr << PAGE_SHIFT; + pfn ^= protnone_mask(pgprot_val(pgprot)); + pfn &= PHYSICAL_PMD_PAGE_MASK; + return __pmd(pfn | massage_pgprot(pgprot)); } +static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask); + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { - pteval_t val = pte_val(pte); + pteval_t val = pte_val(pte), oldval = val; /* * Chop off the NX bit (if present), and add the NX portion of @@ -379,17 +395,17 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) */ val &= _PAGE_CHG_MASK; val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK; - + val = flip_protnone_guard(oldval, val, PTE_PFN_MASK); return __pte(val); } static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { - pmdval_t val = pmd_val(pmd); + pmdval_t val = pmd_val(pmd), oldval = val; val &= _HPAGE_CHG_MASK; val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK; - + val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK); return __pmd(val); } diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 67f2fe43a593..221a32ed1372 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -235,6 +235,8 @@ extern void cleanup_highmap(void); extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); +#include + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ From 52dc5c9f8eee1c569974308f0bb7be64ec63565c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 13 Jun 2018 15:48:25 -0700 Subject: [PATCH 41/59] x86/speculation/l1tf: Make sure the first page is always reserved commit 10a70416e1f067f6c4efda6ffd8ea96002ac4223 upstream The L1TF workaround doesn't make any attempt to mitigate speculate accesses to the first physical page for zeroed PTEs. Normally it only contains some data from the early real mode BIOS. It's not entirely clear that the first page is reserved in all configurations, so add an extra reservation call to make sure it is really reserved. In most configurations (e.g. with the standard reservations) it's likely a nop. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Acked-by: Dave Hansen Signed-off-by: David Woodhouse Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/setup.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bbaae4cf9e8e..31c4bc0d3372 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -851,6 +851,12 @@ void __init setup_arch(char **cmdline_p) memblock_reserve(__pa_symbol(_text), (unsigned long)__bss_stop - (unsigned long)_text); + /* + * Make sure page 0 is always reserved because on systems with + * L1TF its contents can be leaked to user processes. + */ + memblock_reserve(0, PAGE_SIZE); + early_reserve_initrd(); /* From bf0cca01b8736a5e146a980434ba36eb036e37ac Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 13 Jun 2018 15:48:26 -0700 Subject: [PATCH 42/59] x86/speculation/l1tf: Add sysfs reporting for l1tf commit 17dbca119312b4e8173d4e25ff64262119fcef38 upstream L1TF core kernel workarounds are cheap and normally always enabled, However they still should be reported in sysfs if the system is vulnerable or mitigated. Add the necessary CPU feature/bug bits. - Extend the existing checks for Meltdowns to determine if the system is vulnerable. All CPUs which are not vulnerable to Meltdown are also not vulnerable to L1TF - Check for 32bit non PAE and emit a warning as there is no practical way for mitigation due to the limited physical address bits - If the system has more than MAX_PA/2 physical memory the invert page workarounds don't protect the system against the L1TF attack anymore, because an inverted physical address will also point to valid memory. Print a warning in this case and report that the system is vulnerable. Add a function which returns the PFN limit for the L1TF mitigation, which will be used in follow up patches for sanity and range checks. [ tglx: Renamed the CPU feature bit to L1TF_PTEINV ] [ dwmw2: Backport to 4.9 (cpufeatures.h, E820) ] Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Acked-by: Dave Hansen Signed-off-by: David Woodhouse Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeatures.h | 3 ++- arch/x86/include/asm/processor.h | 5 ++++ arch/x86/kernel/cpu/bugs.c | 40 ++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/common.c | 20 +++++++++++++++ drivers/base/cpu.c | 8 ++++++ include/linux/cpu.h | 2 ++ 6 files changed, 77 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index f4b175db70f4..f3a8479c0d87 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -214,7 +214,7 @@ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ #define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */ - +#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ @@ -331,5 +331,6 @@ #define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ #define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ #define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ +#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 8e415cf65457..a3a53955f01c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -172,6 +172,11 @@ extern const struct seq_operations cpuinfo_op; extern void cpu_detect(struct cpuinfo_x86 *c); +static inline unsigned long l1tf_pfn_limit(void) +{ + return BIT(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT) - 1; +} + extern void early_cpu_init(void); extern void identify_boot_cpu(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 7688ce0b26c5..ce8d0abf3d35 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -26,9 +26,11 @@ #include #include #include +#include static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); +static void __init l1tf_select_mitigation(void); /* * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any @@ -80,6 +82,8 @@ void __init check_bugs(void) */ ssb_select_mitigation(); + l1tf_select_mitigation(); + #ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. @@ -203,6 +207,32 @@ static void x86_amd_ssb_disable(void) wrmsrl(MSR_AMD64_LS_CFG, msrval); } +static void __init l1tf_select_mitigation(void) +{ + u64 half_pa; + + if (!boot_cpu_has_bug(X86_BUG_L1TF)) + return; + +#if CONFIG_PGTABLE_LEVELS == 2 + pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n"); + return; +#endif + + /* + * This is extremely unlikely to happen because almost all + * systems have far more MAX_PA/2 than RAM can be fit into + * DIMM slots. + */ + half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT; + if (e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) { + pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n"); + return; + } + + setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV); +} + #ifdef RETPOLINE static bool spectre_v2_bad_module; @@ -655,6 +685,11 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_SPEC_STORE_BYPASS: return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); + case X86_BUG_L1TF: + if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV)) + return sprintf(buf, "Mitigation: Page Table Inversion\n"); + break; + default: break; } @@ -681,4 +716,9 @@ ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute * { return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS); } + +ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_L1TF); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3d21b28f9826..4d3fa79c0f09 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -880,6 +880,21 @@ static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { {} }; +static const __initconst struct x86_cpu_id cpu_no_l1tf[] = { + /* in addition to cpu_no_speculation */ + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, + {} +}; + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 ia32_cap = 0; @@ -905,6 +920,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) return; setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + + if (x86_match_cpu(cpu_no_l1tf)) + return; + + setup_force_cpu_bug(X86_BUG_L1TF); } /* diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 143edea1076f..41090ef5facb 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -524,16 +524,24 @@ ssize_t __weak cpu_show_spec_store_bypass(struct device *dev, return sprintf(buf, "Not affected\n"); } +ssize_t __weak cpu_show_l1tf(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Not affected\n"); +} + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL); +static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, &dev_attr_spectre_v1.attr, &dev_attr_spectre_v2.attr, &dev_attr_spec_store_bypass.attr, + &dev_attr_l1tf.attr, NULL }; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 2f9d12022100..063c73ed6d78 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -48,6 +48,8 @@ extern ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_l1tf(struct device *dev, + struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, From 0371d9c4c822fceb290a0b4cd21119534f7bae47 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:20 -0800 Subject: [PATCH 43/59] mm: Add vm_insert_pfn_prot() commit 1745cbc5d0dee0749a6bc0ea8e872c5db0074061 upstream The x86 vvar vma contains pages with differing cacheability flags. x86 currently implements this by manually inserting all the ptes using (io_)remap_pfn_range when the vma is set up. x86 wants to move to using .fault with VM_FAULT_NOPAGE to set up the mappings as needed. The correct API to use to insert a pfn in .fault is vm_insert_pfn(), but vm_insert_pfn() can't override the vma's cache mode, and the HPET page in particular needs to be uncached despite the fact that the rest of the VMA is cached. Add vm_insert_pfn_prot() to support varying cacheability within the same non-COW VMA in a more sane manner. x86 could alternatively use multiple VMAs, but that's messy, would break CRIU, and would create unnecessary VMAs that would waste memory. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Acked-by: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d2938d1eb37be7a5e4f86182db646551f11e45aa.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- include/linux/mm.h | 2 ++ mm/memory.c | 25 +++++++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index a100946607a5..1f4366567e7d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2083,6 +2083,8 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long addr, int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); +int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t pgprot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); diff --git a/mm/memory.c b/mm/memory.c index 177cb7d111a9..edb5b0d8a4a0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1604,9 +1604,30 @@ out: */ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) +{ + return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_pfn); + +/** + * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * @pgprot: pgprot flags for the inserted page + * + * This is exactly like vm_insert_pfn, except that it allows drivers to + * to override pgprot on a per-page basis. + * + * This only makes sense for IO mappings, and it makes no sense for + * cow mappings. In general, using multiple vmas is preferable; + * vm_insert_pfn_prot should only be used if using multiple VMAs is + * impractical. + */ +int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t pgprot) { int ret; - pgprot_t pgprot = vma->vm_page_prot; /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like @@ -1628,7 +1649,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, return ret; } -EXPORT_SYMBOL(vm_insert_pfn); +EXPORT_SYMBOL(vm_insert_pfn_prot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) From 9ac0dc7d949db7afd4116d55fa4fcf6a66d820f0 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 7 Oct 2016 17:00:18 -0700 Subject: [PATCH 44/59] mm: fix cache mode tracking in vm_insert_mixed() commit 87744ab3832b83ba71b931f86f9cfdb000d07da5 upstream vm_insert_mixed() unlike vm_insert_pfn_prot() and vmf_insert_pfn_pmd(), fails to check the pgprot_t it uses for the mapping against the one recorded in the memtype tracking tree. Add the missing call to track_pfn_insert() to preclude cases where incompatible aliased mappings are established for a given physical address range. [groeck: Backport to v4.4.y] Link: http://lkml.kernel.org/r/147328717909.35069.14256589123570653697.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Cc: David Airlie Cc: Matthew Wilcox Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- mm/memory.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index edb5b0d8a4a0..78efb8c7ee20 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1654,10 +1654,14 @@ EXPORT_SYMBOL(vm_insert_pfn_prot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { + pgprot_t pgprot = vma->vm_page_prot; + BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; + if (track_pfn_insert(vma, &pgprot, pfn)) + return -EINVAL; /* * If we don't have pte special, then we have to use the pfn_valid() @@ -1670,9 +1674,9 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, struct page *page; page = pfn_to_page(pfn); - return insert_page(vma, addr, page, vma->vm_page_prot); + return insert_page(vma, addr, page, pgprot); } - return insert_pfn(vma, addr, pfn, vma->vm_page_prot); + return insert_pfn(vma, addr, pfn, pgprot); } EXPORT_SYMBOL(vm_insert_mixed); From d71af2dbacb5611c1dcdc16fd1d343821d61bd5e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 13 Jun 2018 15:48:27 -0700 Subject: [PATCH 45/59] x86/speculation/l1tf: Disallow non privileged high MMIO PROT_NONE mappings commit 42e4089c7890725fcd329999252dc489b72f2921 upstream For L1TF PROT_NONE mappings are protected by inverting the PFN in the page table entry. This sets the high bits in the CPU's address space, thus making sure to point to not point an unmapped entry to valid cached memory. Some server system BIOSes put the MMIO mappings high up in the physical address space. If such an high mapping was mapped to unprivileged users they could attack low memory by setting such a mapping to PROT_NONE. This could happen through a special device driver which is not access protected. Normal /dev/mem is of course access protected. To avoid this forbid PROT_NONE mappings or mprotect for high MMIO mappings. Valid page mappings are allowed because the system is then unsafe anyways. It's not expected that users commonly use PROT_NONE on MMIO. But to minimize any impact this is only enforced if the mapping actually refers to a high MMIO address (defined as the MAX_PA-1 bit being set), and also skip the check for root. For mmaps this is straight forward and can be handled in vm_insert_pfn and in remap_pfn_range(). For mprotect it's a bit trickier. At the point where the actual PTEs are accessed a lot of state has been changed and it would be difficult to undo on an error. Since this is a uncommon case use a separate early page talk walk pass for MMIO PROT_NONE mappings that checks for this condition early. For non MMIO and non PROT_NONE there are no changes. [dwmw2: Backport to 4.9] [groeck: Backport to 4.4] Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Acked-by: Dave Hansen Signed-off-by: David Woodhouse Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pgtable.h | 8 ++++++ arch/x86/mm/mmap.c | 21 +++++++++++++++ include/asm-generic/pgtable.h | 12 +++++++++ mm/memory.c | 29 +++++++++++++++----- mm/mprotect.c | 49 ++++++++++++++++++++++++++++++++++ 5 files changed, 112 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2ed1556d99b1..70e2248353cb 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -942,6 +942,14 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) } #endif +#define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1 +extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot); + +static inline bool arch_has_pfn_modify_check(void) +{ + return boot_cpu_has_bug(X86_BUG_L1TF); +} + #include #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 307f60ecfc6d..9a055ea279eb 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -121,3 +121,24 @@ const char *arch_vma_name(struct vm_area_struct *vma) return "[mpx]"; return NULL; } + +/* + * Only allow root to set high MMIO mappings to PROT_NONE. + * This prevents an unpriv. user to set them to PROT_NONE and invert + * them, then pointing to valid memory for L1TF speculation. + * + * Note: for locked down kernels may want to disable the root override. + */ +bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) +{ + if (!boot_cpu_has_bug(X86_BUG_L1TF)) + return true; + if (!__pte_needs_invert(pgprot_val(prot))) + return true; + /* If it's real memory always allow */ + if (pfn_valid(pfn)) + return true; + if (pfn > l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN)) + return false; + return true; +} diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 25b793325b09..976f749099ef 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -805,4 +805,16 @@ static inline int pmd_free_pte_page(pmd_t *pmd) #define io_remap_pfn_range remap_pfn_range #endif +#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED +static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) +{ + return true; +} + +static inline bool arch_has_pfn_modify_check(void) +{ + return false; +} +#endif + #endif /* _ASM_GENERIC_PGTABLE_H */ diff --git a/mm/memory.c b/mm/memory.c index 78efb8c7ee20..d5bb1465d30c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1645,6 +1645,9 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, if (track_pfn_insert(vma, &pgprot, pfn)) return -EINVAL; + if (!pfn_modify_allowed(pfn, pgprot)) + return -EACCES; + ret = insert_pfn(vma, addr, pfn, pgprot); return ret; @@ -1663,6 +1666,9 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, if (track_pfn_insert(vma, &pgprot, pfn)) return -EINVAL; + if (!pfn_modify_allowed(pfn, pgprot)) + return -EACCES; + /* * If we don't have pte special, then we have to use the pfn_valid() * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* @@ -1691,6 +1697,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, { pte_t *pte; spinlock_t *ptl; + int err = 0; pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) @@ -1698,12 +1705,16 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, arch_enter_lazy_mmu_mode(); do { BUG_ON(!pte_none(*pte)); + if (!pfn_modify_allowed(pfn, prot)) { + err = -EACCES; + break; + } set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); - return 0; + return err; } static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, @@ -1712,6 +1723,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, { pmd_t *pmd; unsigned long next; + int err; pfn -= addr >> PAGE_SHIFT; pmd = pmd_alloc(mm, pud, addr); @@ -1720,9 +1732,10 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, VM_BUG_ON(pmd_trans_huge(*pmd)); do { next = pmd_addr_end(addr, end); - if (remap_pte_range(mm, pmd, addr, next, - pfn + (addr >> PAGE_SHIFT), prot)) - return -ENOMEM; + err = remap_pte_range(mm, pmd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) + return err; } while (pmd++, addr = next, addr != end); return 0; } @@ -1733,6 +1746,7 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, { pud_t *pud; unsigned long next; + int err; pfn -= addr >> PAGE_SHIFT; pud = pud_alloc(mm, pgd, addr); @@ -1740,9 +1754,10 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, return -ENOMEM; do { next = pud_addr_end(addr, end); - if (remap_pmd_range(mm, pud, addr, next, - pfn + (addr >> PAGE_SHIFT), prot)) - return -ENOMEM; + err = remap_pmd_range(mm, pud, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) + return err; } while (pud++, addr = next, addr != end); return 0; } diff --git a/mm/mprotect.c b/mm/mprotect.c index c0b4b2a49462..a277f3412a5d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -255,6 +255,42 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, return pages; } +static int prot_none_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? + 0 : -EACCES; +} + +static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? + 0 : -EACCES; +} + +static int prot_none_test(unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + return 0; +} + +static int prot_none_walk(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long newflags) +{ + pgprot_t new_pgprot = vm_get_page_prot(newflags); + struct mm_walk prot_none_walk = { + .pte_entry = prot_none_pte_entry, + .hugetlb_entry = prot_none_hugetlb_entry, + .test_walk = prot_none_test, + .mm = current->mm, + .private = &new_pgprot, + }; + + return walk_page_range(start, end, &prot_none_walk); +} + int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags) @@ -272,6 +308,19 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, return 0; } + /* + * Do PROT_NONE PFN permission checks here when we can still + * bail out without undoing a lot of state. This is a rather + * uncommon case, so doesn't need to be very optimized. + */ + if (arch_has_pfn_modify_check() && + (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && + (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) { + error = prot_none_walk(vma, start, end, newflags); + if (error) + return error; + } + /* * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we From 685b44483f077c949bd5016fdfe734b662b74aba Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 13 Jun 2018 15:48:28 -0700 Subject: [PATCH 46/59] x86/speculation/l1tf: Limit swap file size to MAX_PA/2 commit 377eeaa8e11fe815b1d07c81c4a0e2843a8c15eb upstream For the L1TF workaround its necessary to limit the swap file size to below MAX_PA/2, so that the higher bits of the swap offset inverted never point to valid memory. Add a mechanism for the architecture to override the swap file size check in swapfile.c and add a x86 specific max swapfile check function that enforces that limit. The check is only enabled if the CPU is vulnerable to L1TF. In VMs with 42bit MAX_PA the typical limit is 2TB now, on a native system with 46bit PA it is 32TB. The limit is only per individual swap file, so it's always possible to exceed these limits with multiple swap files or partitions. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Acked-by: Michal Hocko Acked-by: Dave Hansen Signed-off-by: David Woodhouse Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/init.c | 15 +++++++++++++ include/linux/swapfile.h | 2 ++ mm/swapfile.c | 46 ++++++++++++++++++++++++++-------------- 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 151fd33e9043..afde6da2768f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -4,6 +4,8 @@ #include #include #include /* for max_low_pfn */ +#include +#include #include #include @@ -767,3 +769,16 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) __cachemode2pte_tbl[cache] = __cm_idx2pte(entry); __pte2cachemode_tbl[entry] = cache; } + +unsigned long max_swapfile_size(void) +{ + unsigned long pages; + + pages = generic_max_swapfile_size(); + + if (boot_cpu_has_bug(X86_BUG_L1TF)) { + /* Limit the swap file size to MAX_PA/2 for L1TF workaround */ + pages = min_t(unsigned long, l1tf_pfn_limit() + 1, pages); + } + return pages; +} diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h index 388293a91e8c..e4594de79bc4 100644 --- a/include/linux/swapfile.h +++ b/include/linux/swapfile.h @@ -9,5 +9,7 @@ extern spinlock_t swap_lock; extern struct plist_head swap_active_head; extern struct swap_info_struct *swap_info[]; extern int try_to_unuse(unsigned int, bool, unsigned long); +extern unsigned long generic_max_swapfile_size(void); +extern unsigned long max_swapfile_size(void); #endif /* _LINUX_SWAPFILE_H */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 674bf177ce44..8e25ff2b693a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2206,6 +2206,35 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) return 0; } + +/* + * Find out how many pages are allowed for a single swap device. There + * are two limiting factors: + * 1) the number of bits for the swap offset in the swp_entry_t type, and + * 2) the number of bits in the swap pte, as defined by the different + * architectures. + * + * In order to find the largest possible bit mask, a swap entry with + * swap type 0 and swap offset ~0UL is created, encoded to a swap pte, + * decoded to a swp_entry_t again, and finally the swap offset is + * extracted. + * + * This will mask all the bits from the initial ~0UL mask that can't + * be encoded in either the swp_entry_t or the architecture definition + * of a swap pte. + */ +unsigned long generic_max_swapfile_size(void) +{ + return swp_offset(pte_to_swp_entry( + swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; +} + +/* Can be overridden by an architecture for additional checks. */ +__weak unsigned long max_swapfile_size(void) +{ + return generic_max_swapfile_size(); +} + static unsigned long read_swap_header(struct swap_info_struct *p, union swap_header *swap_header, struct inode *inode) @@ -2241,22 +2270,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, p->cluster_next = 1; p->cluster_nr = 0; - /* - * Find out how many pages are allowed for a single swap - * device. There are two limiting factors: 1) the number - * of bits for the swap offset in the swp_entry_t type, and - * 2) the number of bits in the swap pte as defined by the - * different architectures. In order to find the - * largest possible bit mask, a swap entry with swap type 0 - * and swap offset ~0UL is created, encoded to a swap pte, - * decoded to a swp_entry_t again, and finally the swap - * offset is extracted. This will mask all the bits from - * the initial ~0UL mask that can't be encoded in either - * the swp_entry_t or the architecture definition of a - * swap pte. - */ - maxpages = swp_offset(pte_to_swp_entry( - swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; + maxpages = max_swapfile_size(); last_page = swap_header->info.last_page; if (!last_page) { pr_warn("Empty swap-file\n"); From fa86c208d22d8179ef3d295f6084fc87390c8366 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 20 Jun 2018 16:42:57 -0400 Subject: [PATCH 47/59] x86/bugs: Move the l1tf function and define pr_fmt properly commit 56563f53d3066afa9e63d6c997bf67e76a8b05c0 upstream The pr_warn in l1tf_select_mitigation would have used the prior pr_fmt which was defined as "Spectre V2 : ". Move the function to be past SSBD and also define the pr_fmt. Fixes: 17dbca119312 ("x86/speculation/l1tf: Add sysfs reporting for l1tf") Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 55 ++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ce8d0abf3d35..34e4aaaf03d2 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -207,32 +207,6 @@ static void x86_amd_ssb_disable(void) wrmsrl(MSR_AMD64_LS_CFG, msrval); } -static void __init l1tf_select_mitigation(void) -{ - u64 half_pa; - - if (!boot_cpu_has_bug(X86_BUG_L1TF)) - return; - -#if CONFIG_PGTABLE_LEVELS == 2 - pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n"); - return; -#endif - - /* - * This is extremely unlikely to happen because almost all - * systems have far more MAX_PA/2 than RAM can be fit into - * DIMM slots. - */ - half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT; - if (e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) { - pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n"); - return; - } - - setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV); -} - #ifdef RETPOLINE static bool spectre_v2_bad_module; @@ -658,6 +632,35 @@ void x86_spec_ctrl_setup_ap(void) x86_amd_ssb_disable(); } +#undef pr_fmt +#define pr_fmt(fmt) "L1TF: " fmt +static void __init l1tf_select_mitigation(void) +{ + u64 half_pa; + + if (!boot_cpu_has_bug(X86_BUG_L1TF)) + return; + +#if CONFIG_PGTABLE_LEVELS == 2 + pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n"); + return; +#endif + + /* + * This is extremely unlikely to happen because almost all + * systems have far more MAX_PA/2 than RAM can be fit into + * DIMM slots. + */ + half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT; + if (e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) { + pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n"); + return; + } + + setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV); +} +#undef pr_fmt + #ifdef CONFIG_SYSFS static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, From df7fd6ccb358bd4aa3abc8a6ff995b1f3da1b0fb Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 21 Jun 2018 12:36:29 +0200 Subject: [PATCH 48/59] x86/speculation/l1tf: Extend 64bit swap file size limit commit 1a7ed1ba4bba6c075d5ad61bb75e3fbc870840d6 upstream The previous patch has limited swap file size so that large offsets cannot clear bits above MAX_PA/2 in the pte and interfere with L1TF mitigation. It assumed that offsets are encoded starting with bit 12, same as pfn. But on x86_64, offsets are encoded starting with bit 9. Thus the limit can be raised by 3 bits. That means 16TB with 42bit MAX_PA and 256TB with 46bit MAX_PA. Fixes: 377eeaa8e11f ("x86/speculation/l1tf: Limit swap file size to MAX_PA/2") Signed-off-by: Vlastimil Babka Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/init.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index afde6da2768f..8de904926d7f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -778,7 +778,15 @@ unsigned long max_swapfile_size(void) if (boot_cpu_has_bug(X86_BUG_L1TF)) { /* Limit the swap file size to MAX_PA/2 for L1TF workaround */ - pages = min_t(unsigned long, l1tf_pfn_limit() + 1, pages); + unsigned long l1tf_limit = l1tf_pfn_limit() + 1; + /* + * We encode swap offsets also with 3 bits below those for pfn + * which makes the usable limit higher. + */ +#ifdef CONFIG_X86_64 + l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT; +#endif + pages = min_t(unsigned long, l1tf_limit, pages); } return pages; } From dc48c1a2f45b628d3128ad4bb31d1bcd342c059d Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 20 Jun 2018 16:42:58 -0400 Subject: [PATCH 49/59] x86/cpufeatures: Add detection of L1D cache flush support. commit 11e34e64e4103955fc4568750914c75d65ea87ee upstream 336996-Speculative-Execution-Side-Channel-Mitigations.pdf defines a new MSR (IA32_FLUSH_CMD) which is detected by CPUID.7.EDX[28]=1 bit being set. This new MSR "gives software a way to invalidate structures with finer granularity than other architectual methods like WBINVD." A copy of this document is available at https://bugzilla.kernel.org/show_bug.cgi?id=199511 Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index f3a8479c0d87..123a7105db1b 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -310,6 +310,7 @@ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ From b55b06bd3b3c977da2c938d1a73d38674cb88086 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 22 Jun 2018 17:39:33 +0200 Subject: [PATCH 50/59] x86/speculation/l1tf: Protect PAE swap entries against L1TF commit 0d0f6249058834ffe1ceaad0bb31464af66f6e7a upstream The PAE 3-level paging code currently doesn't mitigate L1TF by flipping the offset bits, and uses the high PTE word, thus bits 32-36 for type, 37-63 for offset. The lower word is zeroed, thus systems with less than 4GB memory are safe. With 4GB to 128GB the swap type selects the memory locations vulnerable to L1TF; with even more memory, also the swap offfset influences the address. This might be a problem with 32bit PAE guests running on large 64bit hosts. By continuing to keep the whole swap entry in either high or low 32bit word of PTE we would limit the swap size too much. Thus this patch uses the whole PAE PTE with the same layout as the 64bit version does. The macros just become a bit tricky since they assume the arch-dependent swp_entry_t to be 32bit. Signed-off-by: Vlastimil Babka Signed-off-by: Thomas Gleixner Acked-by: Michal Hocko Signed-off-by: David Woodhouse Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pgtable-3level.h | 35 +++++++++++++++++++++++++-- arch/x86/mm/init.c | 2 +- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 0c89891c7b44..5c686382d84b 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -177,12 +177,43 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) #endif /* Encode and de-code a swap entry */ +#define SWP_TYPE_BITS 5 + +#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) + +/* We always extract/encode the offset by shifting it all the way up, and then down again */ +#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS) + #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(x) (((x).val) & 0x1f) #define __swp_offset(x) ((x).val >> 5) #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) -#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) -#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) + +/* + * Normally, __swp_entry() converts from arch-independent swp_entry_t to + * arch-dependent swp_entry_t, and __swp_entry_to_pte() just stores the result + * to pte. But here we have 32bit swp_entry_t and 64bit pte, and need to use the + * whole 64 bits. Thus, we shift the "real" arch-dependent conversion to + * __swp_entry_to_pte() through the following helper macro based on 64bit + * __swp_entry(). + */ +#define __swp_pteval_entry(type, offset) ((pteval_t) { \ + (~(pteval_t)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \ + | ((pteval_t)(type) << (64 - SWP_TYPE_BITS)) }) + +#define __swp_entry_to_pte(x) ((pte_t){ .pte = \ + __swp_pteval_entry(__swp_type(x), __swp_offset(x)) }) +/* + * Analogically, __pte_to_swp_entry() doesn't just extract the arch-dependent + * swp_entry_t, but also has to convert it from 64bit to the 32bit + * intermediate representation, using the following macros based on 64bit + * __swp_type() and __swp_offset(). + */ +#define __pteval_swp_type(x) ((unsigned long)((x).pte >> (64 - SWP_TYPE_BITS))) +#define __pteval_swp_offset(x) ((unsigned long)(~((x).pte) << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)) + +#define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \ + __pteval_swp_offset(pte))) #include diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 8de904926d7f..3a8e9abe0667 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -783,7 +783,7 @@ unsigned long max_swapfile_size(void) * We encode swap offsets also with 3 bits below those for pfn * which makes the usable limit higher. */ -#ifdef CONFIG_X86_64 +#if CONFIG_PGTABLE_LEVELS > 2 l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT; #endif pages = min_t(unsigned long, l1tf_limit, pages); From 09049f022a9b96b0d09d90023d4f0a097a61a767 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 27 Jun 2018 17:46:50 +0200 Subject: [PATCH 51/59] x86/speculation/l1tf: Fix up pte->pfn conversion for PAE commit e14d7dfb41f5807a0c1c26a13f2b8ef16af24935 upstream Jan has noticed that pte_pfn and co. resp. pfn_pte are incorrect for CONFIG_PAE because phys_addr_t is wider than unsigned long and so the pte_val reps. shift left would get truncated. Fix this up by using proper types. [dwmw2: Backport to 4.9] Fixes: 6b28baca9b1f ("x86/speculation/l1tf: Protect PROT_NONE PTEs against speculation") Reported-by: Jan Beulich Signed-off-by: Michal Hocko Signed-off-by: Thomas Gleixner Acked-by: Vlastimil Babka Signed-off-by: David Woodhouse Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pgtable.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 70e2248353cb..16c6886a1ece 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -154,21 +154,21 @@ static inline u64 protnone_mask(u64 val); static inline unsigned long pte_pfn(pte_t pte) { - unsigned long pfn = pte_val(pte); + phys_addr_t pfn = pte_val(pte); pfn ^= protnone_mask(pfn); return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT; } static inline unsigned long pmd_pfn(pmd_t pmd) { - unsigned long pfn = pmd_val(pmd); + phys_addr_t pfn = pmd_val(pmd); pfn ^= protnone_mask(pfn); return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; } static inline unsigned long pud_pfn(pud_t pud) { - unsigned long pfn = pud_val(pud); + phys_addr_t pfn = pud_val(pud); pfn ^= protnone_mask(pfn); return (pfn & pud_pfn_mask(pud)) >> PAGE_SHIFT; } @@ -369,7 +369,7 @@ static inline pgprotval_t massage_pgprot(pgprot_t pgprot) static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { - phys_addr_t pfn = page_nr << PAGE_SHIFT; + phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; pfn ^= protnone_mask(pgprot_val(pgprot)); pfn &= PTE_PFN_MASK; return __pte(pfn | massage_pgprot(pgprot)); @@ -377,7 +377,7 @@ static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) { - phys_addr_t pfn = page_nr << PAGE_SHIFT; + phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; pfn ^= protnone_mask(pgprot_val(pgprot)); pfn &= PHYSICAL_PMD_PAGE_MASK; return __pmd(pfn | massage_pgprot(pgprot)); From 0aae5fe8413dfcd949d0df1c7d6b835efecd5b3b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 7 Aug 2018 15:09:36 -0700 Subject: [PATCH 52/59] x86/speculation/l1tf: Invert all not present mappings commit f22cc87f6c1f771b57c407555cfefd811cdd9507 upstream For kernel mappings PAGE_PROTNONE is not necessarily set for a non present mapping, but the inversion logic explicitely checks for !PRESENT and PROT_NONE. Remove the PROT_NONE check and make the inversion unconditional for all not present mappings. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pgtable-invert.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/pgtable-invert.h b/arch/x86/include/asm/pgtable-invert.h index 177564187fc0..44b1203ece12 100644 --- a/arch/x86/include/asm/pgtable-invert.h +++ b/arch/x86/include/asm/pgtable-invert.h @@ -6,7 +6,7 @@ static inline bool __pte_needs_invert(u64 val) { - return (val & (_PAGE_PRESENT|_PAGE_PROTNONE)) == _PAGE_PROTNONE; + return !(val & _PAGE_PRESENT); } /* Get a mask to xor with the page table entry to get the correct pfn. */ From 9feecdb6cb73feaa55b0135aee8777eaac848c78 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 7 Aug 2018 15:09:37 -0700 Subject: [PATCH 53/59] x86/speculation/l1tf: Make pmd/pud_mknotpresent() invert commit 0768f91530ff46683e0b372df14fd79fe8d156e5 upstream Some cases in THP like: - MADV_FREE - mprotect - split mark the PMD non present for temporarily to prevent races. The window for an L1TF attack in these contexts is very small, but it wants to be fixed for correctness sake. Use the proper low level functions for pmd/pud_mknotpresent() to address this. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pgtable.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 16c6886a1ece..b5e157c065ae 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -315,11 +315,6 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_RW); } -static inline pmd_t pmd_mknotpresent(pmd_t pmd) -{ - return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE); -} - #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline int pte_soft_dirty(pte_t pte) { @@ -383,6 +378,12 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) return __pmd(pfn | massage_pgprot(pgprot)); } +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return pfn_pmd(pmd_pfn(pmd), + __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); +} + static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask); static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) From 02ff2769edbce2261e981effbc3c4b98fae4faf0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 7 Aug 2018 15:09:39 -0700 Subject: [PATCH 54/59] x86/mm/pat: Make set_memory_np() L1TF safe commit 958f79b9ee55dfaf00c8106ed1c22a2919e0028b upstream set_memory_np() is used to mark kernel mappings not present, but it has it's own open coded mechanism which does not have the L1TF protection of inverting the address bits. Replace the open coded PTE manipulation with the L1TF protecting low level PTE routines. Passes the CPA self test. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner [ dwmw2: Pull in pud_mkhuge() from commit a00cc7d9dd, and pfn_pud() ] Signed-off-by: David Woodhouse [groeck: port to 4.4] Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pgtable.h | 27 +++++++++++++++++++++++++++ arch/x86/mm/pageattr.c | 8 ++++---- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index b5e157c065ae..4de6c282c02a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -378,12 +378,39 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) return __pmd(pfn | massage_pgprot(pgprot)); } +static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot) +{ + phys_addr_t pfn = page_nr << PAGE_SHIFT; + pfn ^= protnone_mask(pgprot_val(pgprot)); + pfn &= PHYSICAL_PUD_PAGE_MASK; + return __pud(pfn | massage_pgprot(pgprot)); +} + static inline pmd_t pmd_mknotpresent(pmd_t pmd) { return pfn_pmd(pmd_pfn(pmd), __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); } +static inline pud_t pud_set_flags(pud_t pud, pudval_t set) +{ + pudval_t v = native_pud_val(pud); + + return __pud(v | set); +} + +static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) +{ + pudval_t v = native_pud_val(pud); + + return __pud(v & ~clear); +} + +static inline pud_t pud_mkhuge(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_PSE); +} + static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask); static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 79377e2a7bcd..27610c2d1821 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1006,8 +1006,8 @@ static int populate_pmd(struct cpa_data *cpa, pmd = pmd_offset(pud, start); - set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | - massage_pgprot(pmd_pgprot))); + set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn, + canon_pgprot(pmd_pgprot)))); start += PMD_SIZE; cpa->pfn += PMD_SIZE; @@ -1079,8 +1079,8 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, * Map everything starting from the Gb boundary, possibly with 1G pages */ while (end - start >= PUD_SIZE) { - set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | - massage_pgprot(pud_pgprot))); + set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn, + canon_pgprot(pud_pgprot)))); start += PUD_SIZE; cpa->pfn += PUD_SIZE; From 6b06f36f07e2c91ad0126f17d0fc8f933c827da8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 7 Aug 2018 15:09:38 -0700 Subject: [PATCH 55/59] x86/mm/kmmio: Make the tracer robust against L1TF commit 1063711b57393c1999248cccb57bebfaf16739e7 upstream The mmio tracer sets io mapping PTEs and PMDs to non present when enabled without inverting the address bits, which makes the PTE entry vulnerable for L1TF. Make it use the right low level macros to actually invert the address bits to protect against L1TF. In principle this could be avoided because MMIO tracing is not likely to be enabled on production machines, but the fix is straigt forward and for consistency sake it's better to get rid of the open coded PTE manipulation. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kmmio.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 76604c8a2a48..7bf14e74fc8f 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -125,24 +125,29 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr) static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old) { + pmd_t new_pmd; pmdval_t v = pmd_val(*pmd); if (clear) { - *old = v & _PAGE_PRESENT; - v &= ~_PAGE_PRESENT; - } else /* presume this has been called with clear==true previously */ - v |= *old; - set_pmd(pmd, __pmd(v)); + *old = v; + new_pmd = pmd_mknotpresent(*pmd); + } else { + /* Presume this has been called with clear==true previously */ + new_pmd = __pmd(*old); + } + set_pmd(pmd, new_pmd); } static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old) { pteval_t v = pte_val(*pte); if (clear) { - *old = v & _PAGE_PRESENT; - v &= ~_PAGE_PRESENT; - } else /* presume this has been called with clear==true previously */ - v |= *old; - set_pte_atomic(pte, __pte(v)); + *old = v; + /* Nothing should care about address */ + pte_clear(&init_mm, 0, pte); + } else { + /* Presume this has been called with clear==true previously */ + set_pte_atomic(pte, __pte(*old)); + } } static int clear_page_presence(struct kmmio_fault_page *f, bool clear) From eb993211b9d7856a9ab8c487c701c84103842713 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 13 Aug 2018 10:15:16 -0700 Subject: [PATCH 56/59] x86/speculation/l1tf: Fix up CPU feature flags In linux-4.4.y, the definition of X86_FEATURE_RETPOLINE and X86_FEATURE_RETPOLINE_AMD is different from the upstream definition. Result is an overlap with the newly introduced X86_FEATURE_L1TF_PTEINV. Update RETPOLINE definitions to match upstream definitions to improve alignment with upstream code. Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeatures.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 123a7105db1b..dd2269dcbc47 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -193,12 +193,12 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ + #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -#define X86_FEATURE_RETPOLINE ( 7*32+29) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_AMD ( 7*32+30) /* "" AMD Retpoline mitigation for Spectre variant 2 */ - #define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ #define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ From 4b90ff885c6cc88795b678414aaf5d7b0153a5dc Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 14 Aug 2018 20:50:47 +0200 Subject: [PATCH 57/59] x86/init: fix build with CONFIG_SWAP=n commit 792adb90fa724ce07c0171cbc96b9215af4b1045 upstream. The introduction of generic_max_swapfile_size and arch-specific versions has broken linking on x86 with CONFIG_SWAP=n due to undefined reference to 'generic_max_swapfile_size'. Fix it by compiling the x86-specific max_swapfile_size() only with CONFIG_SWAP=y. Reported-by: Tomas Pruzina Fixes: 377eeaa8e11f ("x86/speculation/l1tf: Limit swap file size to MAX_PA/2") Signed-off-by: Vlastimil Babka Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/init.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 3a8e9abe0667..4954a6cef50a 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -770,6 +770,7 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) __pte2cachemode_tbl[entry] = cache; } +#ifdef CONFIG_SWAP unsigned long max_swapfile_size(void) { unsigned long pages; @@ -790,3 +791,4 @@ unsigned long max_swapfile_size(void) } return pages; } +#endif From 8f2adf3d2118cc0822b83a7bb43475f9149a1d26 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Sat, 14 Jul 2018 21:56:13 +0200 Subject: [PATCH 58/59] x86/speculation/l1tf: Unbreak !__HAVE_ARCH_PFN_MODIFY_ALLOWED architectures commit 6c26fcd2abfe0a56bbd95271fce02df2896cfd24 upstream. pfn_modify_allowed() and arch_has_pfn_modify_check() are outside of the !__ASSEMBLY__ section in include/asm-generic/pgtable.h, which confuses assembler on archs that don't have __HAVE_ARCH_PFN_MODIFY_ALLOWED (e.g. ia64) and breaks build: include/asm-generic/pgtable.h: Assembler messages: include/asm-generic/pgtable.h:538: Error: Unknown opcode `static inline bool pfn_modify_allowed(unsigned long pfn,pgprot_t prot)' include/asm-generic/pgtable.h:540: Error: Unknown opcode `return true' include/asm-generic/pgtable.h:543: Error: Unknown opcode `static inline bool arch_has_pfn_modify_check(void)' include/asm-generic/pgtable.h:545: Error: Unknown opcode `return false' arch/ia64/kernel/entry.S:69: Error: `mov' does not fit into bundle Move those two static inlines into the !__ASSEMBLY__ section so that they don't confuse the asm build pass. Fixes: 42e4089c7890 ("x86/speculation/l1tf: Disallow non privileged high MMIO PROT_NONE mappings") Signed-off-by: Jiri Kosina Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman [groeck: Context changes] Signed-off-by: Guenter Roeck --- include/asm-generic/pgtable.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 976f749099ef..dabecb661264 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -799,12 +799,6 @@ static inline int pmd_free_pte_page(pmd_t *pmd) } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ -#endif /* !__ASSEMBLY__ */ - -#ifndef io_remap_pfn_range -#define io_remap_pfn_range remap_pfn_range -#endif - #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) { @@ -815,6 +809,12 @@ static inline bool arch_has_pfn_modify_check(void) { return false; } +#endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */ + +#endif /* !__ASSEMBLY__ */ + +#ifndef io_remap_pfn_range +#define io_remap_pfn_range remap_pfn_range #endif #endif /* _ASM_GENERIC_PGTABLE_H */ From 30a97c1e2dc39f45d9deeeccc2733278fc285d5e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 15 Aug 2018 17:42:11 +0200 Subject: [PATCH 59/59] Linux 4.4.148 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ee92a12e3a4b..9b795164122e 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 147 +SUBLEVEL = 148 EXTRAVERSION = NAME = Blurry Fish Butt