From 896d0a62196d2fcbe95c311a0beaff6e33d7b61f Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Mon, 29 Feb 2016 17:38:34 -0800 Subject: [PATCH 001/607] ANDROID: net: fix 'const' warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See the following build log splats. The sock_i_uid() helper doesn't quite treat the parameter as 'const' (it acquires a member lock), but this cast is the same approach taken by other callers in this file, so I don't feel too bad about the fix. CC net/ipv4/inet_connection_sock.o CC net/ipv6/inet6_connection_sock.o net/ipv6/inet6_connection_sock.c: In function ‘inet6_csk_route_req’: net/ipv6/inet6_connection_sock.c:89:2: warning: passing argument 1 of ‘sock_i_uid’ discards ‘const’ qualifier from pointer target type [enabled by default] In file included from include/linux/tcp.h:22:0, from include/linux/ipv6.h:73, from net/ipv6/inet6_connection_sock.c:18: include/net/sock.h:1689:8: note: expected ‘struct sock *’ but argument is of type ‘const struct sock *’ net/ipv4/inet_connection_sock.c: In function ‘inet_csk_route_req’: net/ipv4/inet_connection_sock.c:423:7: warning: passing argument 1 of ‘sock_i_uid’ discards ‘const’ qualifier from pointer target type [enabled by default] In file included from include/net/inet_sock.h:27:0, from include/net/inet_connection_sock.h:23, from net/ipv4/inet_connection_sock.c:19: include/net/sock.h:1689:8: note: expected ‘struct sock *’ but argument is of type ‘const struct sock *’ net/ipv4/inet_connection_sock.c: In function ‘inet_csk_route_child_sock’: net/ipv4/inet_connection_sock.c:460:7: warning: passing argument 1 of ‘sock_i_uid’ discards ‘const’ qualifier from pointer target type [enabled by default] In file included from include/net/inet_sock.h:27:0, from include/net/inet_connection_sock.h:23, from net/ipv4/inet_connection_sock.c:19: include/net/sock.h:1689:8: note: expected ‘struct sock *’ but argument is of type ‘const struct sock *’ Change-Id: I5c156fc1a81f90323717bffd93c31d205b85620c Signed-off-by: Brian Norris --- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv6/inet6_connection_sock.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a4cfa14c73ee..728414dcea3b 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -420,7 +420,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, - htons(ireq->ir_num), sock_i_uid(sk)); + htons(ireq->ir_num), sock_i_uid((struct sock *)sk)); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) @@ -457,7 +457,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? 
opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, - htons(ireq->ir_num), sock_i_uid(sk)); + htons(ireq->ir_num), sock_i_uid((struct sock *)sk)); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 7b214652768e..897bb6eb5751 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -86,7 +86,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, fl6->flowi6_mark = ireq->ir_mark; fl6->fl6_dport = ireq->ir_rmt_port; fl6->fl6_sport = htons(ireq->ir_num); - fl6->flowi6_uid = sock_i_uid(sk); + fl6->flowi6_uid = sock_i_uid((struct sock *)sk); security_req_classify_flow(req, flowi6_to_flowi(fl6)); dst = ip6_dst_lookup_flow(sk, fl6, final_p); From bc340f81f5849bad721feafe43f7737eb6ec4904 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Mon, 29 Feb 2016 17:40:05 -0800 Subject: [PATCH 002/607] ANDROID: lowmemorykiller: fix declaration order warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drivers/staging/android/lowmemorykiller.c: In function ‘lowmem_scan’: drivers/staging/android/lowmemorykiller.c:174:3: warning: ISO C90 forbids mixed declarations and code [-Wdeclaration-after-statement] Change-Id: I9de6cf2c374bc43131725a7ed666a033a4449ea9 Signed-off-by: Brian Norris --- drivers/staging/android/lowmemorykiller.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index 4cc3d02eacfb..af49af0cca01 100644 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c @@ -161,6 +161,10 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc) p->comm, p->pid, oom_score_adj, tasksize); } if (selected) { + long cache_size = other_file * (long)(PAGE_SIZE / 1024); + long cache_limit = minfree * (long)(PAGE_SIZE / 1024); + long free = other_free * (long)(PAGE_SIZE / 1024); + task_lock(selected); send_sig(SIGKILL, selected, 0); /* @@ -171,9 +175,6 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc) if (selected->mm) mark_oom_victim(selected); task_unlock(selected); - long cache_size = other_file * (long)(PAGE_SIZE / 1024); - long cache_limit = minfree * (long)(PAGE_SIZE / 1024); - long free = other_free * (long)(PAGE_SIZE / 1024); trace_lowmemory_kill(selected, cache_size, cache_limit, free); lowmem_print(1, "Killing '%s' (%d), adj %hd,\n" \ " to free %ldkB on behalf of '%s' (%d) because\n" \ From e3893608d32b23ef35299f137b953b65c9a7c4ba Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Mon, 29 Feb 2016 17:44:51 -0800 Subject: [PATCH 003/607] ANDROID: usb: gadget: f_mtp: don't use le16 for u8 field The 'bCount' field is u8. 
Noticed by this warning: drivers/usb/gadget/function/f_mtp.c:264:3: warning: large integer implicitly truncated to unsigned type [-Woverflow] Change-Id: Ie82dfd1a8986ecd3acf143e41c46822f0d1aca4f Signed-off-by: Brian Norris --- drivers/usb/gadget/function/f_mtp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/f_mtp.c b/drivers/usb/gadget/function/f_mtp.c index aec7b8d61fe7..8f80a7e91314 100644 --- a/drivers/usb/gadget/function/f_mtp.c +++ b/drivers/usb/gadget/function/f_mtp.c @@ -261,7 +261,7 @@ struct { .dwLength = __constant_cpu_to_le32(sizeof(mtp_ext_config_desc)), .bcdVersion = __constant_cpu_to_le16(0x0100), .wIndex = __constant_cpu_to_le16(4), - .bCount = __constant_cpu_to_le16(1), + .bCount = 1, }, .function = { .bFirstInterfaceNumber = 0, From 4861be8f9e494d76b571657a067c0e525230391a Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Mon, 29 Feb 2016 17:42:29 -0800 Subject: [PATCH 004/607] ANDROID: kernel/watchdog: fix unused variable warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kernel/watchdog.c:122:22: warning: ‘hardlockup_allcpu_dumped’ defined but not used [-Wunused-variable] Change-Id: I99e97e7cc31b589cd674fd4495832c9ef036d0b9 Signed-off-by: Brian Norris --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 25955235ecdd..e864906af3fc 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -119,7 +119,7 @@ static unsigned long soft_lockup_nmi_warn; #ifdef CONFIG_HARDLOCKUP_DETECTOR unsigned int __read_mostly hardlockup_panic = CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; -static unsigned long hardlockup_allcpu_dumped; +static unsigned long __maybe_unused hardlockup_allcpu_dumped; /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these From 4658d931f6072f6998c2ea0f43d87111a1fc8a2e Mon Sep 17 00:00:00 2001 From: Dylan Reid Date: Mon, 2 Mar 2015 17:09:07 -0800 Subject: [PATCH 005/607] ANDROID: mmc: Move tracepoint creation and export symbols Move the tracepoint creation to core from card, as core shouldn't depend on card. Also add EXPORT_SYMBOL_GPL calls to enable module build. 
Change-Id: Ie39fcdadc0516df99600d0963efe09b6cd7a9bf8 Signed-off-by: Dylan Reid (cherry picked from commit da5fbd1e7e50fee3a8271f50d25c848d0ede64b3, from android-3.14) Signed-off-by: Brian Norris --- drivers/mmc/card/block.c | 1 - drivers/mmc/core/core.c | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index 21aa3244029b..90e9738a129a 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -36,7 +36,6 @@ #include #include -#define CREATE_TRACE_POINTS #include #include diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 3e54185bc985..9fab52559a8c 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -30,6 +30,7 @@ #include #include +#define CREATE_TRACE_POINTS #include #include @@ -48,6 +49,11 @@ #include "sd_ops.h" #include "sdio_ops.h" +EXPORT_TRACEPOINT_SYMBOL_GPL(mmc_blk_erase_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(mmc_blk_erase_end); +EXPORT_TRACEPOINT_SYMBOL_GPL(mmc_blk_rw_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(mmc_blk_rw_end); + /* If the device is not responding */ #define MMC_CORE_TIMEOUT_MS (10 * 60 * 1000) /* 10 minute timeout */ From ee1a8e0fc3cb6ac300a47abc4beb211b0db47389 Mon Sep 17 00:00:00 2001 From: Andrew Bresticker Date: Tue, 10 Nov 2015 14:11:46 -0800 Subject: [PATCH 006/607] ANDROID: mmc: sdio: Disable retuning in sdio_reset_comm() Since sdio_reset_comm() re-initializes the SDIO card, disable retuning before idling and shutting down the card. Tuning will be re-enabled (if necessary) in mmc_sdio_init_card(). BUG=chrome-os-partner:46444 TEST=With CL:311815, toggle WiFi on/off on Smaug and observe that the WiFi card comes back up and is able to tune successfully. Change-Id: Ib4a5cfd4d75fc9e3ed7bb3f1e2ffd30de16c5d28 Signed-off-by: Andrew Bresticker Reviewed-on: https://chromium-review.googlesource.com/311797 Reviewed-by: Derek Basehore [briannorris: brought from Chromium kernel in 3.18 -> 4.4 rebase] Signed-off-by: Brian Norris --- drivers/mmc/core/sdio.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c index cc93152f82c6..b5a1c0220493 100644 --- a/drivers/mmc/core/sdio.c +++ b/drivers/mmc/core/sdio.c @@ -22,6 +22,7 @@ #include "core.h" #include "bus.h" +#include "host.h" #include "sd.h" #include "sdio_bus.h" #include "mmc_ops.h" @@ -1228,6 +1229,8 @@ int sdio_reset_comm(struct mmc_card *card) printk("%s():\n", __func__); mmc_claim_host(host); + mmc_retune_disable(host); + mmc_go_idle(host); mmc_set_clock(host, host->f_min); From 26fc40a09221330c574fa9cc06d0175273022e44 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 1 Mar 2016 09:44:17 -0800 Subject: [PATCH 007/607] net: pppolac/pppopns: Replace msg.msg_iov with iov_iter_kvec() Commit 1af89c1ef3b6 ("Hack: net: PPPoPNS and PPPoLAC build fixes for 4.1") fixed the build for PPPoPNS and PPPoLAC by re-introducing a field in struct msghdr which was removed upstream. Re-introducing the field doesn't get it used, so it is quite likely that the code never worked. Fix it up for good. 
Fixes: 1af89c1ef3b6 ("Hack: net: PPPoPNS and PPPoLAC build fixes for 4.1") Signed-off-by: Guenter Roeck --- drivers/net/ppp/pppolac.c | 9 ++++----- drivers/net/ppp/pppopns.c | 9 ++++----- include/linux/socket.h | 4 ---- 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/net/ppp/pppolac.c b/drivers/net/ppp/pppolac.c index 1b8180cc1d4d..0184c96579e9 100644 --- a/drivers/net/ppp/pppolac.c +++ b/drivers/net/ppp/pppolac.c @@ -206,11 +206,10 @@ static void pppolac_xmit_core(struct work_struct *delivery_work) while ((skb = skb_dequeue(&delivery_queue))) { struct sock *sk_udp = skb->sk; struct kvec iov = {.iov_base = skb->data, .iov_len = skb->len}; - struct msghdr msg = { - .msg_iov = (struct iovec *)&iov, - .msg_iovlen = 1, - .msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT, - }; + struct msghdr msg = { 0 }; + + iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iov, 1, + skb->len); sk_udp->sk_prot->sendmsg(sk_udp, &msg, skb->len); kfree_skb(skb); } diff --git a/drivers/net/ppp/pppopns.c b/drivers/net/ppp/pppopns.c index 568bb45cfeac..d9e06039794e 100644 --- a/drivers/net/ppp/pppopns.c +++ b/drivers/net/ppp/pppopns.c @@ -189,11 +189,10 @@ static void pppopns_xmit_core(struct work_struct *delivery_work) while ((skb = skb_dequeue(&delivery_queue))) { struct sock *sk_raw = skb->sk; struct kvec iov = {.iov_base = skb->data, .iov_len = skb->len}; - struct msghdr msg = { - .msg_iov = (struct iovec *)&iov, - .msg_iovlen = 1, - .msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT, - }; + struct msghdr msg = { 0 }; + + iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iov, 1, + skb->len); sk_raw->sk_prot->sendmsg(sk_raw, &msg, skb->len); kfree_skb(skb); } diff --git a/include/linux/socket.h b/include/linux/socket.h index 18a8337c8959..5bf59c8493b7 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -47,10 +47,6 @@ struct linger { struct msghdr { void *msg_name; /* ptr to socket address structure */ int msg_namelen; /* size of socket address structure */ -#if defined(CONFIG_PPPOLAC) || defined(CONFIG_PPPOPNS) - struct iovec *msg_iov; /* scatter/gather array */ - __kernel_size_t msg_iovlen; /* # elements in msg_iov */ -#endif struct iov_iter msg_iter; /* data */ void *msg_control; /* ancillary data */ __kernel_size_t msg_controllen; /* ancillary data buffer length */ From 52f2987faf143a1edd3892b6356fdae135d30a6a Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 1 Mar 2016 09:47:32 -0800 Subject: [PATCH 008/607] net: ppp: Fix modular build for PPPOLAC and PPPOPNS Unlike other configurations in net/ppp, PPPOLAC and PPPOPNS are defined as boolean configuration options. In allmodconfig builds (or, specifically, if PPP and some of the other PPP protocols were built as modules), this resulted in build errors such as the following, since pppox was built both as module and into the kernel. ERROR: "pppox_ioctl" [net/l2tp/l2tp_ppp.ko] undefined! ERROR: "unregister_pppox_proto" [net/l2tp/l2tp_ppp.ko] undefined! ERROR: "register_pppox_proto" [net/l2tp/l2tp_ppp.ko] undefined! ERROR: "pppox_unbind_sock" [net/l2tp/l2tp_ppp.ko] undefined! Fix the problem by defining PPPOLAC and PPPOPNS tristate. 
Signed-off-by: Guenter Roeck --- drivers/net/ppp/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ppp/Kconfig b/drivers/net/ppp/Kconfig index e4cf44b1e815..282aec4860eb 100644 --- a/drivers/net/ppp/Kconfig +++ b/drivers/net/ppp/Kconfig @@ -150,7 +150,7 @@ config PPPOL2TP if TTY config PPPOLAC - bool "PPP on L2TP Access Concentrator" + tristate "PPP on L2TP Access Concentrator" depends on PPP && INET help L2TP (RFC 2661) is a tunneling protocol widely used in virtual private @@ -159,7 +159,7 @@ config PPPOLAC fairly simple and suited for clients. config PPPOPNS - bool "PPP on PPTP Network Server" + tristate "PPP on PPTP Network Server" depends on PPP && INET help PPTP (RFC 2637) is a tunneling protocol widely used in virtual private From bb485f9d7aac6f4f2a75e0c3bd48d3b55966afb6 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 1 Mar 2016 09:52:27 -0800 Subject: [PATCH 009/607] video: adf: Fix modular build Builds with ADF configured as module fail the following errors. ERROR: "adf_fops" [drivers/video/adf/adf_sysfs.ko] undefined! ERROR: "adf_obj_sysfs_find" [drivers/video/adf/adf_fops.ko] undefined! ERROR: "adf_buffer_cleanup" [drivers/video/adf/adf_fops.ko] undefined! ERROR: "adf_attachment_validate" [drivers/video/adf/adf_client.ko] undefined! ERROR: "adf_attachment_find" [drivers/video/adf/adf_client.ko] undefined! ERROR: "adf_buffer_mapping_cleanup" [drivers/video/adf/adf_client.ko] undefined! ERROR: "adf_attachment_free" [drivers/video/adf/adf_client.ko] undefined! ERROR: "adf_obj_find_event_refcount" [drivers/video/adf/adf_client.ko] undefined! ERROR: "adf_file_queue_event" [drivers/video/adf/adf.ko] undefined! ERROR: "adf_interface_sysfs_init" [drivers/video/adf/adf.ko] undefined! ERROR: "adf_interface_sysfs_destroy" [drivers/video/adf/adf.ko] undefined! ERROR: "adf_device_sysfs_init" [drivers/video/adf/adf.ko] undefined! ERROR: "adf_device_sysfs_destroy" [drivers/video/adf/adf.ko] undefined! ERROR: "adf_sysfs_destroy" [drivers/video/adf/adf.ko] undefined! ERROR: "adf_overlay_engine_sysfs_init" [drivers/video/adf/adf.ko] undefined! ERROR: "adf_overlay_engine_sysfs_destroy" [drivers/video/adf/adf.ko] undefined! ERROR: "adf_sysfs_init" [drivers/video/adf/adf.ko] undefined! If ADF is configured as module, each of the object files ends up being a separate module. Since the functions are used across the various files but not exported, this results in the observed build errors. Modify the Makefile to create a single module instead. Fixes: 066a50cee536 ("video: add atomic display framework") Signed-off-by: Guenter Roeck --- drivers/video/adf/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/video/adf/Makefile b/drivers/video/adf/Makefile index 78d0915122f4..cdf34a666dc7 100644 --- a/drivers/video/adf/Makefile +++ b/drivers/video/adf/Makefile @@ -2,13 +2,15 @@ ccflags-y := -Idrivers/staging/android CFLAGS_adf.o := -I$(src) -obj-$(CONFIG_ADF) += adf.o \ +obj-$(CONFIG_ADF) += adf_core.o + +adf_core-y := adf.o \ adf_client.o \ adf_fops.o \ adf_format.o \ adf_sysfs.o -obj-$(CONFIG_COMPAT) += adf_fops32.o +adf_core-$(CONFIG_COMPAT) += adf_fops32.o obj-$(CONFIG_ADF_FBDEV) += adf_fbdev.o From 077be8a752875772589c70299a291070807c8046 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 3 Mar 2016 09:30:33 -0800 Subject: [PATCH 010/607] video: adf: Set ADF_MEMBLOCK to boolean Attempts to build with CONFIG_ADF_MEMBLOCK=m result in the following build error. 
ERROR: "memblock_free" [drivers/video/adf/adf_memblock.ko] undefined! memblock_free() is marked as __init_memblock, so exporting it seems to be a bad idea. All other callers are only configurable into the kernel, so do the same with ADF_MEMBLOCK. Signed-off-by: Guenter Roeck --- drivers/video/adf/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/adf/Kconfig b/drivers/video/adf/Kconfig index 33858b73d8bb..2777db48fae0 100644 --- a/drivers/video/adf/Kconfig +++ b/drivers/video/adf/Kconfig @@ -11,4 +11,4 @@ menuconfig ADF_FBDEV menuconfig ADF_MEMBLOCK depends on ADF depends on HAVE_MEMBLOCK - tristate "Helper for using memblocks as buffers in ADF drivers" + bool "Helper for using memblocks as buffers in ADF drivers" From e1b5d103894d097fb630aebc3c1fdaf257f7c9bb Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 3 Mar 2016 09:44:44 -0800 Subject: [PATCH 011/607] drivers: power: use 'current' instead of 'get_current()' get_current() to get the current thread pointer is not defined for all architectures. This results in the following build error for several architectures (s390, powerpc, and possibly others). drivers/base/power/main.c: In function '__device_suspend': drivers/base/power/main.c:1415:2: error: implicit declaration of function 'get_current' Use 'current' instead. Also include asm/current.h instead of depending on an implicit include. Fixes: ad86cc8ad632 ("drivers: power: Add watchdog timer to catch drivers which lockup during suspend." Signed-off-by: Guenter Roeck --- drivers/base/power/main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index a54d810f2966..6ed8b9326629 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -35,6 +35,8 @@ #include #include +#include + #include "../base.h" #include "power.h" @@ -1412,7 +1414,7 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) goto Complete; data.dev = dev; - data.tsk = get_current(); + data.tsk = current; init_timer_on_stack(&timer); timer.expires = jiffies + HZ * 12; timer.function = dpm_drv_timeout; From 57b21d0a63f9968a147188410c0cfc266e4be934 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 3 Mar 2016 10:33:40 -0800 Subject: [PATCH 012/607] PM / suspend: Add dependency on RTC_LIB Commit 1eff8f99f9f9 ("PM / Suspend: Print wall time at suspend entry and exit") calls rtc_time_to_tm(), which in turn calls rtc_time64_to_tm(). Since RTC_LIB is not mandatory for all architetures, this can result in the following build error. suspend.c:(.text+0x2f36c): undefined reference to `rtc_time64_to_tm' rtc_time64_to_tm() is implemented in rtc-lib, so SUSPEND now needs to select RTC_LIB. 
Fixes: 1eff8f99f9f9 ("PM / Suspend: Print wall time at suspend entry and exit") Signed-off-by: Guenter Roeck --- kernel/power/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 84c480946fb2..6d6f63be1f9b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -1,6 +1,7 @@ config SUSPEND bool "Suspend to RAM and standby" depends on ARCH_SUSPEND_POSSIBLE + select RTC_LIB default y ---help--- Allow the system to enter sleep states in which main memory is From 6b7afd3c5e1af8db9e933bf9a7438e3d3a5035f5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Nov 2015 15:03:57 +0100 Subject: [PATCH 013/607] UPSTREAM: ARM: 8457/1: psci-smp is built only for SMP The PSCI SMP implementation is built only when both CONFIG_SMP and CONFIG_ARM_PSCI are set, so a configuration that has the latter but not the former can get a link error when it tries to call psci_smp_available(). arch/arm/mach-tegra/built-in.o: In function `tegra114_cpuidle_init': cpuidle-tegra114.c:(.init.text+0x52a): undefined reference to `psci_smp_available' This corrects the #ifdef in the psci.h header file to match the Makefile conditional we have for building that function. Signed-off-by: Arnd Bergmann Signed-off-by: Russell King (cherry picked from commit be95485a0b8288a93402705730d3ea32f9f812b9) Signed-off-by: Olof Johansson --- arch/arm/include/asm/psci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/include/asm/psci.h b/arch/arm/include/asm/psci.h index 68ee3ce17b82..b4c6d99364f1 100644 --- a/arch/arm/include/asm/psci.h +++ b/arch/arm/include/asm/psci.h @@ -16,7 +16,7 @@ extern struct smp_operations psci_smp_ops; -#ifdef CONFIG_ARM_PSCI +#if defined(CONFIG_SMP) && defined(CONFIG_ARM_PSCI) bool psci_smp_available(void); #else static inline bool psci_smp_available(void) { return false; } From c2c4c4ecd27d69807906cf6f0df5a176e1498e71 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 4 Mar 2016 07:22:27 -0800 Subject: [PATCH 014/607] power: Provide dummy log_suspend_abort_reason() if SUSPEND is disabled The API to log the suspend reason was introduced with commit 57caa2ad5ce3 ("power: Adds functionality to log the last suspend abort reason."). It is called from functions enabled with PM_SLEEP and from functions enabled with SUSPEND, but only available if SUSPEND is enabled. This can result in build failures such as the following if PM_SLEEP is enabled, but SUSPEND is not. 
kernel/built-in.o: In function `try_to_freeze_tasks': process.c:(.text+0x30928): undefined reference to `log_suspend_abort_reason' drivers/built-in.o: In function `syscore_suspend': (.text+0x6e250): undefined reference to `log_suspend_abort_reason' drivers/built-in.o: In function `__device_suspend': main.c:(.text+0x7a528): undefined reference to `log_suspend_abort_reason' Fixes: 57caa2ad5ce3 ("power: Adds functionality to log the last suspend abort reason.") Signed-off-by: Guenter Roeck --- include/linux/wakeup_reason.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/wakeup_reason.h b/include/linux/wakeup_reason.h index ad8b76936c7f..d84d8c301546 100644 --- a/include/linux/wakeup_reason.h +++ b/include/linux/wakeup_reason.h @@ -21,7 +21,12 @@ #define MAX_SUSPEND_ABORT_LEN 256 void log_wakeup_reason(int irq); -void log_suspend_abort_reason(const char *fmt, ...); int check_wakeup_reason(int irq); +#ifdef CONFIG_SUSPEND +void log_suspend_abort_reason(const char *fmt, ...); +#else +static inline void log_suspend_abort_reason(const char *fmt, ...) { } +#endif + #endif /* _LINUX_WAKEUP_REASON_H */ From 59650041fed46209791e68242407b0198f51a146 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 3 Nov 2015 17:01:46 -0500 Subject: [PATCH 015/607] hid-sensor-hub.c: fix wrong do_div() usage do_div() must only be used with a u64 dividend. Signed-off-by: Nicolas Pitre (cherry picked from commit 8d43b49e7e0070f96ac46d30659a336c0224fa0b) Signed-off-by: Guenter Roeck --- drivers/hid/hid-sensor-hub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/hid/hid-sensor-hub.c b/drivers/hid/hid-sensor-hub.c index 92870cdb52d9..8efaa88329aa 100644 --- a/drivers/hid/hid-sensor-hub.c +++ b/drivers/hid/hid-sensor-hub.c @@ -218,7 +218,8 @@ int sensor_hub_set_feature(struct hid_sensor_hub_device *hsdev, u32 report_id, goto done_proc; } - remaining_bytes = do_div(buffer_size, sizeof(__s32)); + remaining_bytes = buffer_size % sizeof(__s32); + buffer_size = buffer_size / sizeof(__s32); if (buffer_size) { for (i = 0; i < buffer_size; ++i) { hid_set_field(report->field[field_index], i, From 8d3a6c1538fb021448c4f6381f6191061f947ba1 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 4 Mar 2016 12:44:33 -0800 Subject: [PATCH 016/607] misc: uid_stat: Include linux/atomic.h instead of asm/atomic.h Building the uid_stat driver on sparc32 fails with the following errors. 
include/linux/atomic.h: In function 'atomic_add_unless': include/linux/atomic.h:437:2: error: implicit declaration of function '__atomic_add_unless' include/linux/atomic.h: In function 'atomic_andnot': include/linux/atomic.h:454:2: error: implicit declaration of function 'atomic_and' include/linux/atomic.h: In function 'atomic_set_mask': include/linux/atomic.h:465:2: error: implicit declaration of function 'atomic_or' include/linux/atomic.h: In function 'atomic_inc_not_zero_hint': include/linux/atomic.h:490:3: error: implicit declaration of function 'atomic_cmpxchg' include/linux/atomic.h: In function 'atomic_dec_if_positive': include/linux/atomic.h:537:2: error: implicit declaration of function 'atomic_read' Fixes: 6b6d5fbf9ae5 ("misc: uidstat: Adding uid stat driver to collect network statistics.") Signed-off-by: Guenter Roeck --- drivers/misc/uid_stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/uid_stat.c b/drivers/misc/uid_stat.c index 8b8c9a22360b..185c69c9738a 100644 --- a/drivers/misc/uid_stat.c +++ b/drivers/misc/uid_stat.c @@ -13,7 +13,7 @@ * */ -#include +#include #include #include From 46050a93ff9e13fddcdcaff35e2b0b863b3af69b Mon Sep 17 00:00:00 2001 From: Tim Murray Date: Tue, 19 Jan 2016 16:33:27 -0800 Subject: [PATCH 017/607] ANDROID: dm-crypt: run in a WQ_HIGHPRI workqueue (cherry pick from commit ad3ac5180979e5dd1f84e4a807f76fb9fb19f814) Running dm-crypt in a standard workqueue results in IO competing for CPU time with standard user apps, which can lead to pipeline bubbles and seriously degraded performance. Move to a WQ_HIGHPRI workqueue to protect against that. Signed-off-by: Tim Murray Bug: 25392275 Change-Id: I589149a31c7b5d322fe2ed5b2476b1f6e3d5ee6f --- drivers/md/dm-crypt.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 3147c8d09ea8..855600410064 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1864,16 +1864,25 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ret = -ENOMEM; - cc->io_queue = alloc_workqueue("kcryptd_io", WQ_MEM_RECLAIM, 1); + cc->io_queue = alloc_workqueue("kcryptd_io", + WQ_HIGHPRI | + WQ_NON_REENTRANT| + WQ_MEM_RECLAIM, + 1); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; goto bad; } if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) - cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); + cc->crypt_queue = alloc_workqueue("kcryptd", + WQ_HIGHPRI | + WQ_MEM_RECLAIM, 1); else - cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, + cc->crypt_queue = alloc_workqueue("kcryptd", + WQ_HIGHPRI | + WQ_MEM_RECLAIM | + WQ_UNBOUND, num_online_cpus()); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; From bf519936d5925b859232cdb26b03c664dd59eb82 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 14 Jan 2016 16:30:58 +0100 Subject: [PATCH 018/607] UPSTREAM: ALSA: timer: Harden slave timer list handling (cherry pick from commit b5a663aa426f4884c71cd8580adae73f33570f0d) A slave timer instance might be still accessible in a racy way while operating the master instance as it lacks of locking. Since the master operation is mostly protected with timer->lock, we should cope with it while changing the slave instance, too. Also, some linked lists (active_list and ack_list) of slave instances aren't unlinked immediately at stopping or closing, and this may lead to unexpected accesses. 
This patch tries to address these issues. It adds spin lock of timer->lock (either from master or slave, which is equivalent) in a few places. For avoiding a deadlock, we ensure that the global slave_active_lock is always locked at first before each timer lock. Also, ack and active_list of slave instances are properly unlinked at snd_timer_stop() and snd_timer_close(). Last but not least, remove the superfluous call of _snd_timer_stop() at removing slave links. This is a noop, and calling it may confuse readers wrt locking. Further cleanup will follow in a later patch. Actually we've got reports of use-after-free by syzkaller fuzzer, and this hopefully fixes these issues. Reported-by: Dmitry Vyukov Cc: Signed-off-by: Takashi Iwai Bug: 26636060 --- sound/core/timer.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/sound/core/timer.c b/sound/core/timer.c index 31f40f03e5b7..8b06413e967e 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -215,11 +215,13 @@ static void snd_timer_check_master(struct snd_timer_instance *master) slave->slave_id == master->slave_id) { list_move_tail(&slave->open_list, &master->slave_list_head); spin_lock_irq(&slave_active_lock); + spin_lock(&master->timer->lock); slave->master = master; slave->timer = master->timer; if (slave->flags & SNDRV_TIMER_IFLG_RUNNING) list_add_tail(&slave->active_list, &master->slave_active_head); + spin_unlock(&master->timer->lock); spin_unlock_irq(&slave_active_lock); } } @@ -346,15 +348,18 @@ int snd_timer_close(struct snd_timer_instance *timeri) timer->hw.close) timer->hw.close(timer); /* remove slave links */ + spin_lock_irq(&slave_active_lock); + spin_lock(&timer->lock); list_for_each_entry_safe(slave, tmp, &timeri->slave_list_head, open_list) { - spin_lock_irq(&slave_active_lock); - _snd_timer_stop(slave, 1, SNDRV_TIMER_EVENT_RESOLUTION); list_move_tail(&slave->open_list, &snd_timer_slave_list); slave->master = NULL; slave->timer = NULL; - spin_unlock_irq(&slave_active_lock); + list_del_init(&slave->ack_list); + list_del_init(&slave->active_list); } + spin_unlock(&timer->lock); + spin_unlock_irq(&slave_active_lock); mutex_unlock(®ister_mutex); } out: @@ -441,9 +446,12 @@ static int snd_timer_start_slave(struct snd_timer_instance *timeri) spin_lock_irqsave(&slave_active_lock, flags); timeri->flags |= SNDRV_TIMER_IFLG_RUNNING; - if (timeri->master) + if (timeri->master && timeri->timer) { + spin_lock(&timeri->timer->lock); list_add_tail(&timeri->active_list, &timeri->master->slave_active_head); + spin_unlock(&timeri->timer->lock); + } spin_unlock_irqrestore(&slave_active_lock, flags); return 1; /* delayed start */ } @@ -489,6 +497,8 @@ static int _snd_timer_stop(struct snd_timer_instance * timeri, if (!keep_flag) { spin_lock_irqsave(&slave_active_lock, flags); timeri->flags &= ~SNDRV_TIMER_IFLG_RUNNING; + list_del_init(&timeri->ack_list); + list_del_init(&timeri->active_list); spin_unlock_irqrestore(&slave_active_lock, flags); } goto __end; From 9f5ef4b3ad75cb7bda0967b3e37eca2d0b634a02 Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Tue, 8 Mar 2016 21:19:12 +0000 Subject: [PATCH 019/607] Revert "ANDROID: dm-crypt: run in a WQ_HIGHPRI workqueue" This reverts commit 46050a93ff9e13fddcdcaff35e2b0b863b3af69b. 
Change-Id: I3e37cf37c9ea0270737608dcd92ab5c311ea5ab8 --- drivers/md/dm-crypt.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 855600410064..3147c8d09ea8 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1864,25 +1864,16 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ret = -ENOMEM; - cc->io_queue = alloc_workqueue("kcryptd_io", - WQ_HIGHPRI | - WQ_NON_REENTRANT| - WQ_MEM_RECLAIM, - 1); + cc->io_queue = alloc_workqueue("kcryptd_io", WQ_MEM_RECLAIM, 1); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; goto bad; } if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) - cc->crypt_queue = alloc_workqueue("kcryptd", - WQ_HIGHPRI | - WQ_MEM_RECLAIM, 1); + cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); else - cc->crypt_queue = alloc_workqueue("kcryptd", - WQ_HIGHPRI | - WQ_MEM_RECLAIM | - WQ_UNBOUND, + cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus()); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; From dc874623048d1ffae1ba50188f623dbe7551c124 Mon Sep 17 00:00:00 2001 From: Tim Murray Date: Tue, 19 Jan 2016 16:33:27 -0800 Subject: [PATCH 020/607] ANDROID: dm-crypt: run in a WQ_HIGHPRI workqueue (cherry pick from commit ad3ac5180979e5dd1f84e4a807f76fb9fb19f814) Running dm-crypt in a standard workqueue results in IO competing for CPU time with standard user apps, which can lead to pipeline bubbles and seriously degraded performance. Move to a WQ_HIGHPRI workqueue to protect against that. Signed-off-by: Tim Murray Bug: 25392275 Change-Id: I2828587c754a7c2cafdd78b3323b9896cb8cd4e7 --- drivers/md/dm-crypt.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 3147c8d09ea8..e85bcae50f65 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1864,16 +1864,24 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ret = -ENOMEM; - cc->io_queue = alloc_workqueue("kcryptd_io", WQ_MEM_RECLAIM, 1); + cc->io_queue = alloc_workqueue("kcryptd_io", + WQ_HIGHPRI | + WQ_MEM_RECLAIM, + 1); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; goto bad; } if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) - cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); + cc->crypt_queue = alloc_workqueue("kcryptd", + WQ_HIGHPRI | + WQ_MEM_RECLAIM, 1); else - cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, + cc->crypt_queue = alloc_workqueue("kcryptd", + WQ_HIGHPRI | + WQ_MEM_RECLAIM | + WQ_UNBOUND, num_online_cpus()); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; From fafb89f95a059c26223956bd035dcb0d90a0d523 Mon Sep 17 00:00:00 2001 From: Matthew Moeller Date: Wed, 9 Mar 2016 20:19:25 -0600 Subject: [PATCH 021/607] usb: u_ether: Add missing rx_work init commit 398a708ed5f3ef771d96dfb9b95b5d5170d17eb7 usb: u_ether: Add workqueue as bottom half handler for rx data path set up a worker for the rx data path but missed a case where the work_struct needed to be initialized. 
This patch adds the missing 'INIT_WORK' Change-Id: I2daabd39d35b3e17a3054837282d649d9c78a0aa Signed-off-by: Matthew Moeller --- drivers/usb/gadget/function/u_ether.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index 76b25445c6ad..dd73dfe5dcab 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -1014,6 +1014,7 @@ struct net_device *gether_setup_name_default(const char *netname) spin_lock_init(&dev->lock); spin_lock_init(&dev->req_lock); INIT_WORK(&dev->work, eth_work); + INIT_WORK(&dev->rx_work, process_rx_w); INIT_LIST_HEAD(&dev->tx_reqs); INIT_LIST_HEAD(&dev->rx_reqs); From 157aa28fd7bf66bf3d9d1774398fddc7dc255c52 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Mon, 14 Mar 2016 13:34:44 -0700 Subject: [PATCH 022/607] FROMLIST: pstore-ram: fix NULL reference when used with pdata When using platform-data (not DT), we get an OOPS, because drvdata is only initialized after we try to use it. This addresses my comments made on the upstream submission here: https://patchwork.kernel.org/patch/7980651/ Fixes boot on Chrome OS systems, including the Pixel 2. Change-Id: I97360edf2ce61c83dc543cb6c169f3287e2dae4b Fixes: b1d1b7187c11 ("FROMLIST: pstore-ram: add Device Tree bindings") Signed-off-by: Brian Norris --- fs/pstore/ram.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 2429c804cf78..414041342a99 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -554,7 +554,7 @@ static int ramoops_parse_dt(struct platform_device *pdev, static int ramoops_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct ramoops_platform_data *pdata = platform_get_drvdata(pdev); + struct ramoops_platform_data *pdata = pdev->dev.platform_data; struct ramoops_context *cxt = &oops_cxt; size_t dump_mem_sz; phys_addr_t paddr; @@ -666,7 +666,6 @@ static int ramoops_probe(struct platform_device *pdev) cxt->size, (unsigned long long)cxt->phys_addr, cxt->ecc_info.ecc_size, cxt->ecc_info.block_size); - platform_set_drvdata(pdev, pdata); return 0; fail_buf: From 9a3fe392e468775da2b63a813f60b923eb6476c5 Mon Sep 17 00:00:00 2001 From: dcashman Date: Wed, 24 Feb 2016 13:27:06 -0800 Subject: [PATCH 023/607] FROMLIST: drivers: char: random: add get_random_long() (cherry picked from commit https://lkml.org/lkml/2016/2/4/831) d07e22597d1d355 ("mm: mmap: add new /proc tunable for mmap_base ASLR") added the ability to choose from a range of values to use for entropy count in generating the random offset to the mmap_base address. The maximum value on this range was set to 32 bits for 64-bit x86 systems, but this value could be increased further, requiring more than the 32 bits of randomness provided by get_random_int(), as is already possible for arm64. Add a new function: get_random_long() which more naturally fits with the mmap usage of get_random_int() but operates exactly the same as get_random_int(). Also, fix the shifting constant in mmap_rnd() to be an unsigned long so that values greater than 31 bits generate an appropriate mask without overflow. This is especially important on x86, as its shift instruction uses a 5-bit mask for the shift operand, which meant that any value for mmap_rnd_bits over 31 acts as a no-op and effectively disables mmap_base randomization. Finally, replace calls to get_random_int() with get_random_long() where appropriate. 
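To make the shift problem concrete, a small illustration (mmap_rnd_bits == 32 is used purely as an example value):

    /* old pattern: the mask is computed with a 32-bit shift */
    rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
    /* for mmap_rnd_bits == 32, the x86 shift instruction masks the count
     * to 5 bits, so (1 << 32) evaluates as (1 << 0) and the entropy mask
     * collapses to 0, disabling randomization */

    /* new pattern: 64-bit entropy and a 64-bit shift */
    rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);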
Bug: 26963541 Signed-off-by: Daniel Cashman Signed-off-by: Daniel Cashman Change-Id: I5b45621088666d5d1dfbf43952f25ea0798b10ba --- drivers/char/random.c | 22 ++++++++++++++++++++++ include/linux/random.h | 1 + 2 files changed, 23 insertions(+) diff --git a/drivers/char/random.c b/drivers/char/random.c index d0da5d852d41..b583e5336630 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1818,6 +1818,28 @@ unsigned int get_random_int(void) } EXPORT_SYMBOL(get_random_int); +/* + * Same as get_random_int(), but returns unsigned long. + */ +unsigned long get_random_long(void) +{ + __u32 *hash; + unsigned long ret; + + if (arch_get_random_long(&ret)) + return ret; + + hash = get_cpu_var(get_random_int_hash); + + hash[0] += current->pid + jiffies + random_get_entropy(); + md5_transform(hash, random_int_secret); + ret = *(unsigned long *)hash; + put_cpu_var(get_random_int_hash); + + return ret; +} +EXPORT_SYMBOL(get_random_long); + /* * randomize_range() returns a start address such that * diff --git a/include/linux/random.h b/include/linux/random.h index a75840c1aa71..9c29122037f9 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -34,6 +34,7 @@ extern const struct file_operations random_fops, urandom_fops; #endif unsigned int get_random_int(void); +unsigned long get_random_long(void); unsigned long randomize_range(unsigned long start, unsigned long end, unsigned long len); u32 prandom_u32(void); From b471fcdaec3d419ddcbe416568f3f9ce7f1e9ef8 Mon Sep 17 00:00:00 2001 From: dcashman Date: Wed, 24 Feb 2016 13:31:22 -0800 Subject: [PATCH 024/607] FROMLIST: mm: ASLR: use get_random_long() (cherry picked from commit https://lkml.org/lkml/2016/2/4/833) Replace calls to get_random_int() followed by a cast to (unsigned long) with calls to get_random_long(). Also address shifting bug which, in case of x86 removed entropy mask for mmap_rnd_bits values > 31 bits. 
Bug: 26963541 Signed-off-by: Daniel Cashman Signed-off-by: Daniel Cashman Change-Id: I36c156c9b8d7d157134895fddd4cd6efddcbee86 --- arch/arm/mm/mmap.c | 2 +- arch/arm64/mm/mmap.c | 4 ++-- arch/mips/mm/mmap.c | 4 ++-- arch/powerpc/kernel/process.c | 4 ++-- arch/powerpc/mm/mmap.c | 4 ++-- arch/sparc/kernel/sys_sparc_64.c | 2 +- arch/x86/mm/mmap.c | 6 +++--- fs/binfmt_elf.c | 2 +- 8 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index 4b4058db0781..66353caa35b9 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -173,7 +173,7 @@ unsigned long arch_mmap_rnd(void) { unsigned long rnd; - rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1); + rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); return rnd << PAGE_SHIFT; } diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 4c893b5189dd..232f787a088a 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -53,10 +53,10 @@ unsigned long arch_mmap_rnd(void) #ifdef CONFIG_COMPAT if (test_thread_flag(TIF_32BIT)) - rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_compat_bits) - 1); + rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); else #endif - rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1); + rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); return rnd << PAGE_SHIFT; } diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index 5c81fdd032c3..353037699512 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -146,7 +146,7 @@ unsigned long arch_mmap_rnd(void) { unsigned long rnd; - rnd = (unsigned long)get_random_int(); + rnd = get_random_long(); rnd <<= PAGE_SHIFT; if (TASK_IS_32BIT_ADDR) rnd &= 0xfffffful; @@ -174,7 +174,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) static inline unsigned long brk_rnd(void) { - unsigned long rnd = get_random_int(); + unsigned long rnd = get_random_long(); rnd = rnd << PAGE_SHIFT; /* 8MB for 32bit, 256MB for 64bit */ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 646bf4d222c1..a7b91b54c813 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1641,9 +1641,9 @@ static inline unsigned long brk_rnd(void) /* 8MB for 32bit, 1GB for 64bit */ if (is_32bit_task()) - rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT))); + rnd = (get_random_long() % (1UL<<(23-PAGE_SHIFT))); else - rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT))); + rnd = (get_random_long() % (1UL<<(30-PAGE_SHIFT))); return rnd << PAGE_SHIFT; } diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index 0f0502e12f6c..4087705ba90f 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -59,9 +59,9 @@ unsigned long arch_mmap_rnd(void) /* 8MB for 32bit, 1GB for 64bit */ if (is_32bit_task()) - rnd = (unsigned long)get_random_int() % (1<<(23-PAGE_SHIFT)); + rnd = get_random_long() % (1<<(23-PAGE_SHIFT)); else - rnd = (unsigned long)get_random_int() % (1<<(30-PAGE_SHIFT)); + rnd = get_random_long() % (1UL<<(30-PAGE_SHIFT)); return rnd << PAGE_SHIFT; } diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 30e7ddb27a3a..c48d93b60afe 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -264,7 +264,7 @@ static unsigned long mmap_rnd(void) unsigned long rnd = 0UL; if (current->flags & PF_RANDOMIZE) { - unsigned long val = get_random_int(); + unsigned long val = get_random_long(); if (test_thread_flag(TIF_32BIT)) rnd = (val % (1UL << (23UL-PAGE_SHIFT))); else 
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 96bd1e2bffaf..72bb52f93c3d 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -71,12 +71,12 @@ unsigned long arch_mmap_rnd(void) if (mmap_is_ia32()) #ifdef CONFIG_COMPAT - rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_compat_bits) - 1); + rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); #else - rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1); + rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); #endif else - rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1); + rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); return rnd << PAGE_SHIFT; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3a93755e880f..0c52941dd62c 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -651,7 +651,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top) if ((current->flags & PF_RANDOMIZE) && !(current->personality & ADDR_NO_RANDOMIZE)) { - random_variable = (unsigned long) get_random_int(); + random_variable = get_random_long(); random_variable &= STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } From 13c44727fb66fc1d57eb716a2225daaa4bd200e0 Mon Sep 17 00:00:00 2001 From: Shawn Lin Date: Wed, 16 Mar 2016 18:15:47 +0800 Subject: [PATCH 025/607] FROMLIST: mmc: block: fix ABI regression of mmc_blk_ioctl If mmc_blk_ioctl returns -EINVAL, blkdev_ioctl continues to work without returning err to user-space. But now we check CAP_SYS_RAWIO firstly, so we return -EPERM to blkdev_ioctl, which make blkdev_ioctl return -EPERM to user-space directly. So this will break all the ioctl with BLKROSET. Now we find Android-adb suffer it for the following log: remount of /system failed; couldn't make block device writable: Operation not permitted openat(AT_FDCWD, "/dev/block/platform/ff420000.dwmmc/by-name/system", O_RDONLY) = 3 ioctl(3, BLKROSET, 0) = -1 EPERM (Operation not permitted) Fixes: a5f5774c55a2 ("mmc: block: Add new ioctl to send multi commands") Change-Id: Ie9ba728e366abf4ab73fd6102d2a2aa0d4ee5c66 Signed-off-by: Shawn Lin Signed-off-by: John Stultz --- drivers/mmc/card/block.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index 90e9738a129a..f2ce13ab7ae6 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -593,6 +593,14 @@ static int mmc_blk_ioctl_cmd(struct block_device *bdev, struct mmc_card *card; int err = 0, ioc_err = 0; + /* + * The caller must have CAP_SYS_RAWIO, and must be calling this on the + * whole block device, not on a partition. This prevents overspray + * between sibling partitions. + */ + if ((!capable(CAP_SYS_RAWIO)) || (bdev != bdev->bd_contains)) + return -EPERM; + idata = mmc_blk_ioctl_copy_from_user(ic_ptr); if (IS_ERR(idata)) return PTR_ERR(idata); @@ -635,6 +643,14 @@ static int mmc_blk_ioctl_multi_cmd(struct block_device *bdev, int i, err = 0, ioc_err = 0; __u64 num_of_cmds; + /* + * The caller must have CAP_SYS_RAWIO, and must be calling this on the + * whole block device, not on a partition. This prevents overspray + * between sibling partitions. 
+ */ + if ((!capable(CAP_SYS_RAWIO)) || (bdev != bdev->bd_contains)) + return -EPERM; + if (copy_from_user(&num_of_cmds, &user->num_of_cmds, sizeof(num_of_cmds))) return -EFAULT; @@ -690,14 +706,6 @@ cmd_err: static int mmc_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - /* - * The caller must have CAP_SYS_RAWIO, and must be calling this on the - * whole block device, not on a partition. This prevents overspray - * between sibling partitions. - */ - if ((!capable(CAP_SYS_RAWIO)) || (bdev != bdev->bd_contains)) - return -EPERM; - switch (cmd) { case MMC_IOC_CMD: return mmc_blk_ioctl_cmd(bdev, From 17fe2ea84ddda272ab4a382186916213965201db Mon Sep 17 00:00:00 2001 From: Mark Kuo Date: Thu, 20 Aug 2015 13:01:46 +0800 Subject: [PATCH 026/607] CHROMIUM: usb: gadget: f_mtp: Add SuperSpeed support Add SuperSpeed endpoint and companion descriptors. BUG=chrome-os-partner:43682 TEST=Smaug enumerates as a SuperSpeed device. Change-Id: I2bf3125d180fcb07222a5740fa67f3526cf3e95c Signed-off-by: Hui Fu Signed-off-by: Henry Lin Signed-off-by: Mark Kuo Signed-off-by: Andrew Bresticker Reviewed-on: https://chromium-review.googlesource.com/294950 --- drivers/usb/gadget/function/f_mtp.c | 76 ++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/function/f_mtp.c b/drivers/usb/gadget/function/f_mtp.c index 8f80a7e91314..148f8fcecc80 100644 --- a/drivers/usb/gadget/function/f_mtp.c +++ b/drivers/usb/gadget/function/f_mtp.c @@ -135,6 +135,34 @@ static struct usb_interface_descriptor ptp_interface_desc = { .bInterfaceProtocol = 1, }; +static struct usb_endpoint_descriptor mtp_ss_in_desc = { + .bLength = USB_DT_ENDPOINT_SIZE, + .bDescriptorType = USB_DT_ENDPOINT, + .bEndpointAddress = USB_DIR_IN, + .bmAttributes = USB_ENDPOINT_XFER_BULK, + .wMaxPacketSize = __constant_cpu_to_le16(1024), +}; + +static struct usb_ss_ep_comp_descriptor mtp_ss_in_comp_desc = { + .bLength = sizeof(mtp_ss_in_comp_desc), + .bDescriptorType = USB_DT_SS_ENDPOINT_COMP, + /* .bMaxBurst = DYNAMIC, */ +}; + +static struct usb_endpoint_descriptor mtp_ss_out_desc = { + .bLength = USB_DT_ENDPOINT_SIZE, + .bDescriptorType = USB_DT_ENDPOINT, + .bEndpointAddress = USB_DIR_OUT, + .bmAttributes = USB_ENDPOINT_XFER_BULK, + .wMaxPacketSize = __constant_cpu_to_le16(1024), +}; + +static struct usb_ss_ep_comp_descriptor mtp_ss_out_comp_desc = { + .bLength = sizeof(mtp_ss_out_comp_desc), + .bDescriptorType = USB_DT_SS_ENDPOINT_COMP, + /* .bMaxBurst = DYNAMIC, */ +}; + static struct usb_endpoint_descriptor mtp_highspeed_in_desc = { .bLength = USB_DT_ENDPOINT_SIZE, .bDescriptorType = USB_DT_ENDPOINT, @@ -174,6 +202,12 @@ static struct usb_endpoint_descriptor mtp_intr_desc = { .bInterval = 6, }; +static struct usb_ss_ep_comp_descriptor mtp_intr_ss_comp_desc = { + .bLength = sizeof(mtp_intr_ss_comp_desc), + .bDescriptorType = USB_DT_SS_ENDPOINT_COMP, + .wBytesPerInterval = cpu_to_le16(2), +}; + static struct usb_descriptor_header *fs_mtp_descs[] = { (struct usb_descriptor_header *) &mtp_interface_desc, (struct usb_descriptor_header *) &mtp_fullspeed_in_desc, @@ -190,6 +224,17 @@ static struct usb_descriptor_header *hs_mtp_descs[] = { NULL, }; +static struct usb_descriptor_header *ss_mtp_descs[] = { + (struct usb_descriptor_header *) &mtp_interface_desc, + (struct usb_descriptor_header *) &mtp_ss_in_desc, + (struct usb_descriptor_header *) &mtp_ss_in_comp_desc, + (struct usb_descriptor_header *) &mtp_ss_out_desc, + (struct usb_descriptor_header *) 
&mtp_ss_out_comp_desc, + (struct usb_descriptor_header *) &mtp_intr_desc, + (struct usb_descriptor_header *) &mtp_intr_ss_comp_desc, + NULL, +}; + static struct usb_descriptor_header *fs_ptp_descs[] = { (struct usb_descriptor_header *) &ptp_interface_desc, (struct usb_descriptor_header *) &mtp_fullspeed_in_desc, @@ -206,6 +251,17 @@ static struct usb_descriptor_header *hs_ptp_descs[] = { NULL, }; +static struct usb_descriptor_header *ss_ptp_descs[] = { + (struct usb_descriptor_header *) &ptp_interface_desc, + (struct usb_descriptor_header *) &mtp_ss_in_desc, + (struct usb_descriptor_header *) &mtp_ss_in_comp_desc, + (struct usb_descriptor_header *) &mtp_ss_out_desc, + (struct usb_descriptor_header *) &mtp_ss_out_comp_desc, + (struct usb_descriptor_header *) &mtp_intr_desc, + (struct usb_descriptor_header *) &mtp_intr_ss_comp_desc, + NULL, +}; + static struct usb_string mtp_string_defs[] = { /* Naming interface "MTP" so libmtp will recognize us */ [INTERFACE_STRING_INDEX].s = "MTP", @@ -1131,10 +1187,24 @@ mtp_function_bind(struct usb_configuration *c, struct usb_function *f) mtp_highspeed_out_desc.bEndpointAddress = mtp_fullspeed_out_desc.bEndpointAddress; } + /* support super speed hardware */ + if (gadget_is_superspeed(c->cdev->gadget)) { + unsigned max_burst; + + /* Calculate bMaxBurst, we know packet size is 1024 */ + max_burst = min_t(unsigned, MTP_BULK_BUFFER_SIZE / 1024, 15); + mtp_ss_in_desc.bEndpointAddress = + mtp_fullspeed_in_desc.bEndpointAddress; + mtp_ss_in_comp_desc.bMaxBurst = max_burst; + mtp_ss_out_desc.bEndpointAddress = + mtp_fullspeed_out_desc.bEndpointAddress; + mtp_ss_out_comp_desc.bMaxBurst = max_burst; + } DBG(cdev, "%s speed %s: IN/%s, OUT/%s\n", - gadget_is_dualspeed(c->cdev->gadget) ? "dual" : "full", - f->name, dev->ep_in->name, dev->ep_out->name); + gadget_is_superspeed(c->cdev->gadget) ? "super" : + (gadget_is_dualspeed(c->cdev->gadget) ? "dual" : "full"), + f->name, dev->ep_in->name, dev->ep_out->name); return 0; } @@ -1410,9 +1480,11 @@ struct usb_function *function_alloc_mtp_ptp(struct usb_function_instance *fi, if (mtp_config) { dev->function.fs_descriptors = fs_mtp_descs; dev->function.hs_descriptors = hs_mtp_descs; + dev->function.ss_descriptors = ss_mtp_descs; } else { dev->function.fs_descriptors = fs_ptp_descs; dev->function.hs_descriptors = hs_ptp_descs; + dev->function.ss_descriptors = ss_ptp_descs; } dev->function.bind = mtp_function_bind; dev->function.unbind = mtp_function_unbind; From 588dc2e55280d5601e65ade2826dd606a0bbff7a Mon Sep 17 00:00:00 2001 From: Mark Kuo Date: Fri, 11 Sep 2015 16:12:59 +0800 Subject: [PATCH 027/607] CHROMIUM: usb: gadget: f_mtp: fix usb_ss_ep_comp_descriptor wBytesPerInterval in SuperSpeed Endpoint Companion Descriptor needs to be set large enough to reserve enough bus time for associated periodic endpoint. Originally, wBytesPerInterval for mtp's interrupt IN endpoint is set to 2 and its single interrupt transfer will be split into many 2 bytes interrupt transfers. So, we change wBytesPerInterval to INTR_BUFFER_SIZE to ensure interrupt transfer will not be split. 
BUG=none TEST=Smaug works as a MTP device Change-Id: I49c0df892b2d9e0193a684eef23f73664ced9f91 Signed-off-by: Henry Lin Signed-off-by: Mark Kuo Reviewed-on: https://chromium-review.googlesource.com/299091 Reviewed-by: Andrew Bresticker --- drivers/usb/gadget/function/f_mtp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/f_mtp.c b/drivers/usb/gadget/function/f_mtp.c index 148f8fcecc80..7f5c390885fe 100644 --- a/drivers/usb/gadget/function/f_mtp.c +++ b/drivers/usb/gadget/function/f_mtp.c @@ -205,7 +205,7 @@ static struct usb_endpoint_descriptor mtp_intr_desc = { static struct usb_ss_ep_comp_descriptor mtp_intr_ss_comp_desc = { .bLength = sizeof(mtp_intr_ss_comp_desc), .bDescriptorType = USB_DT_SS_ENDPOINT_COMP, - .wBytesPerInterval = cpu_to_le16(2), + .wBytesPerInterval = cpu_to_le16(INTR_BUFFER_SIZE), }; static struct usb_descriptor_header *fs_mtp_descs[] = { From 555f387a43cad7e922254a2e06e3a670bb536957 Mon Sep 17 00:00:00 2001 From: Mark Kuo Date: Mon, 11 Jan 2016 17:49:16 +0800 Subject: [PATCH 028/607] CHROMIUM: usb: gadget: audio_source: add .free_func callback When userspace unbinds gadget functions through configfs, the .free_func() callback is always invoked. (in config_usb_cfg_unlink()) Implement it as a no-op to avoid the following crash: [ 68.125679] configfs-gadget gadget: unbind function 'accessory'/ffffffc0720bf000 [ 68.133202] configfs-gadget gadget: unbind function 'audio_source'/ffffffc0012ca3c0 [ 68.142668] tegra-xudc 700d0000.usb-device: ep 0 disabled [ 68.148186] Bad mode in Synchronous Abort handler detected, code 0x86000006 [ 68.155144] CPU: 2 PID: 1 Comm: init Tainted: G U W 3.18.0-09419-g87296c3-dirty #561 [ 68.163743] Hardware name: Google Tegra210 Smaug Rev 1,3+ (DT) [ 68.169566] task: ffffffc0bc8d0000 ti: ffffffc0bc8bc000 task.ti: ffffffc0bc8bc000 [ 68.177039] PC is at 0x0 [ 68.179577] LR is at usb_put_function+0x14/0x1c .... BUG=chrome-os-partner:49140 TEST="setprop sys.usb.config accessory,audio_source" on A44 and then switch back to default: "setprop sys.usb.config mtp,adb", no crash will be seen. 
Change-Id: I5b6141964aab861e86e3afb139ded02d4d122dab Signed-off-by: Mark Kuo Reviewed-on: https://chromium-review.googlesource.com/321013 Commit-Ready: Andrew Bresticker Tested-by: Andrew Bresticker Reviewed-by: Andrew Bresticker --- drivers/usb/gadget/function/f_audio_source.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/gadget/function/f_audio_source.c b/drivers/usb/gadget/function/f_audio_source.c index 39645be93502..bcd817439dbf 100644 --- a/drivers/usb/gadget/function/f_audio_source.c +++ b/drivers/usb/gadget/function/f_audio_source.c @@ -583,6 +583,11 @@ static void audio_disable(struct usb_function *f) usb_ep_disable(audio->in_ep); } +static void audio_free_func(struct usb_function *f) +{ + /* no-op */ +} + /*-------------------------------------------------------------------------*/ static void audio_build_desc(struct audio_dev *audio) @@ -827,6 +832,7 @@ static struct audio_dev _audio_dev = { .set_alt = audio_set_alt, .setup = audio_setup, .disable = audio_disable, + .free_func = audio_free_func, }, .lock = __SPIN_LOCK_UNLOCKED(_audio_dev.lock), .idle_reqs = LIST_HEAD_INIT(_audio_dev.idle_reqs), From aa1d916a3b462b0dc797b506b0a257935c1cef3f Mon Sep 17 00:00:00 2001 From: Mark Kuo Date: Mon, 11 Jan 2016 19:07:12 +0800 Subject: [PATCH 029/607] CHROMIUM: usb: gadget: f_accessory: add .raw_request callback After this upstream commit: 3c86726cfe38952f0366f86acfbbb025813ec1c2, .raw_request is mandatory in hid_ll_driver structure, hence add an empty raw_request() function. BUG=chrome-os-partner:49140 TEST=none Change-Id: Idd0bbe6960aad2c557376e4a24827d7e1df8e023 Signed-off-by: Mark Kuo Reviewed-on: https://chromium-review.googlesource.com/321038 Commit-Ready: Andrew Bresticker Tested-by: Andrew Bresticker Reviewed-by: Andrew Bresticker --- drivers/usb/gadget/function/f_accessory.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c index 1be93a7ca4a1..c62123560143 100644 --- a/drivers/usb/gadget/function/f_accessory.c +++ b/drivers/usb/gadget/function/f_accessory.c @@ -404,12 +404,19 @@ static void acc_hid_close(struct hid_device *hid) { } +static int acc_hid_raw_request(struct hid_device *hid, unsigned char reportnum, + __u8 *buf, size_t len, unsigned char rtype, int reqtype) +{ + return 0; +} + static struct hid_ll_driver acc_hid_ll_driver = { .parse = acc_hid_parse, .start = acc_hid_start, .stop = acc_hid_stop, .open = acc_hid_open, .close = acc_hid_close, + .raw_request = acc_hid_raw_request, }; static struct acc_hid_dev *acc_hid_new(struct acc_dev *dev, From 80780775bfb28ca293e674157bb0278b94adc754 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 6 Oct 2015 20:32:01 -0700 Subject: [PATCH 030/607] ANDROID: usb: gadget: Add support for MTP OS desc Windows requires OS specific descriptors for automatic install of drivers for MTP devices. 
https://msdn.microsoft.com/en-us/library/windows/ hardware/gg463179.aspx BUG=24583401 BUG=chrome-os-partner:43409 Change-Id: I9397072ca3d183efbc9571c6cde3790f10d8851e Signed-off-by: Badhri Jagan Sridharan Reviewed-on: https://chromium-review.googlesource.com/304346 Commit-Ready: Andrew Bresticker Tested-by: Andrew Bresticker Reviewed-by: Andrew Bresticker Signed-off-by: Amit Pundir --- drivers/usb/gadget/function/f_mtp.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/usb/gadget/function/f_mtp.c b/drivers/usb/gadget/function/f_mtp.c index 7f5c390885fe..74195be9b054 100644 --- a/drivers/usb/gadget/function/f_mtp.c +++ b/drivers/usb/gadget/function/f_mtp.c @@ -346,6 +346,8 @@ struct mtp_instance { struct usb_function_instance func_inst; const char *name; struct mtp_dev *dev; + char mtp_ext_compat_id[16]; + struct usb_os_desc mtp_os_desc; }; /* temporary variable used between mtp_open() and mtp_gadget_bind() */ @@ -1157,6 +1159,7 @@ mtp_function_bind(struct usb_configuration *c, struct usb_function *f) struct mtp_dev *dev = func_to_mtp(f); int id; int ret; + struct mtp_instance *fi_mtp; dev->cdev = cdev; DBG(cdev, "mtp_function_bind dev: %p\n", dev); @@ -1174,6 +1177,18 @@ mtp_function_bind(struct usb_configuration *c, struct usb_function *f) mtp_string_defs[INTERFACE_STRING_INDEX].id = ret; mtp_interface_desc.iInterface = ret; } + + fi_mtp = container_of(f->fi, struct mtp_instance, func_inst); + + if (cdev->use_os_string) { + f->os_desc_table = kzalloc(sizeof(*f->os_desc_table), + GFP_KERNEL); + if (!f->os_desc_table) + return -ENOMEM; + f->os_desc_n = 1; + f->os_desc_table[0].os_desc = &fi_mtp->mtp_os_desc; + } + /* allocate endpoints */ ret = mtp_create_bulk_endpoints(dev, &mtp_fullspeed_in_desc, &mtp_fullspeed_out_desc, &mtp_intr_desc); @@ -1223,6 +1238,8 @@ mtp_function_unbind(struct usb_configuration *c, struct usb_function *f) while ((req = mtp_req_get(dev, &dev->intr_idle))) mtp_request_free(req, dev->ep_intr); dev->state = STATE_OFFLINE; + kfree(f->os_desc_table); + f->os_desc_n = 0; } static int mtp_function_set_alt(struct usb_function *f, @@ -1406,6 +1423,7 @@ static void mtp_free_inst(struct usb_function_instance *fi) fi_mtp = to_fi_mtp(fi); kfree(fi_mtp->name); mtp_cleanup(); + kfree(fi_mtp->mtp_os_desc.group.default_groups); kfree(fi_mtp); } @@ -1413,6 +1431,8 @@ struct usb_function_instance *alloc_inst_mtp_ptp(bool mtp_config) { struct mtp_instance *fi_mtp; int ret = 0; + struct usb_os_desc *descs[1]; + char *names[1]; fi_mtp = kzalloc(sizeof(*fi_mtp), GFP_KERNEL); if (!fi_mtp) @@ -1420,6 +1440,13 @@ struct usb_function_instance *alloc_inst_mtp_ptp(bool mtp_config) fi_mtp->func_inst.set_inst_name = mtp_set_inst_name; fi_mtp->func_inst.free_func_inst = mtp_free_inst; + fi_mtp->mtp_os_desc.ext_compat_id = fi_mtp->mtp_ext_compat_id; + INIT_LIST_HEAD(&fi_mtp->mtp_os_desc.ext_prop); + descs[0] = &fi_mtp->mtp_os_desc; + names[0] = "MTP"; + usb_os_desc_prepare_interf_dir(&fi_mtp->func_inst.group, 1, + descs, names, THIS_MODULE); + if (mtp_config) { ret = mtp_setup_configfs(fi_mtp); if (ret) { From 84a1b7d3d3121e86fac74ae570311556c10b9c31 Mon Sep 17 00:00:00 2001 From: Daniel Campello Date: Mon, 20 Jul 2015 16:23:50 -0700 Subject: [PATCH 031/607] Included sdcardfs source code for kernel 3.0 Only included the source code as is for kernel 3.0. Following patches take care of porting this file system to version 3.10. 
Change-Id: I09e76db77cd98a059053ba5b6fd88572a4b75b5b Signed-off-by: Daniel Campello --- fs/Kconfig | 1 + fs/Makefile | 5 +- fs/sdcardfs/Kconfig | 18 + fs/sdcardfs/Makefile | 7 + fs/sdcardfs/dentry.c | 182 ++++++++ fs/sdcardfs/derived_perm.c | 290 ++++++++++++ fs/sdcardfs/file.c | 357 +++++++++++++++ fs/sdcardfs/hashtable.h | 190 ++++++++ fs/sdcardfs/inode.c | 886 +++++++++++++++++++++++++++++++++++++ fs/sdcardfs/lookup.c | 386 ++++++++++++++++ fs/sdcardfs/main.c | 425 ++++++++++++++++++ fs/sdcardfs/mmap.c | 82 ++++ fs/sdcardfs/multiuser.h | 37 ++ fs/sdcardfs/packagelist.c | 458 +++++++++++++++++++ fs/sdcardfs/sdcardfs.h | 493 +++++++++++++++++++++ fs/sdcardfs/strtok.h | 75 ++++ fs/sdcardfs/super.c | 229 ++++++++++ include/linux/namei.h | 3 + include/uapi/linux/magic.h | 2 + 19 files changed, 4124 insertions(+), 2 deletions(-) create mode 100644 fs/sdcardfs/Kconfig create mode 100644 fs/sdcardfs/Makefile create mode 100644 fs/sdcardfs/dentry.c create mode 100644 fs/sdcardfs/derived_perm.c create mode 100644 fs/sdcardfs/file.c create mode 100644 fs/sdcardfs/hashtable.h create mode 100644 fs/sdcardfs/inode.c create mode 100644 fs/sdcardfs/lookup.c create mode 100644 fs/sdcardfs/main.c create mode 100644 fs/sdcardfs/mmap.c create mode 100644 fs/sdcardfs/multiuser.h create mode 100644 fs/sdcardfs/packagelist.c create mode 100644 fs/sdcardfs/sdcardfs.h create mode 100644 fs/sdcardfs/strtok.h create mode 100644 fs/sdcardfs/super.c diff --git a/fs/Kconfig b/fs/Kconfig index 6ce72d8d1ee1..a5d2dc39ba07 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -199,6 +199,7 @@ if MISC_FILESYSTEMS source "fs/adfs/Kconfig" source "fs/affs/Kconfig" source "fs/ecryptfs/Kconfig" +source "fs/sdcardfs/Kconfig" source "fs/hfs/Kconfig" source "fs/hfsplus/Kconfig" source "fs/befs/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index 79f522575cba..3b54070cd629 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -3,7 +3,7 @@ # # 14 Sep 2000, Christoph Hellwig # Rewritten to use lists instead of if-statements. -# +# obj-y := open.o read_write.o file_table.o super.o \ char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ @@ -59,7 +59,7 @@ obj-y += devpts/ obj-$(CONFIG_PROFILING) += dcookies.o obj-$(CONFIG_DLM) += dlm/ - + # Do not add any filesystems before this line obj-$(CONFIG_FSCACHE) += fscache/ obj-$(CONFIG_REISERFS_FS) += reiserfs/ @@ -81,6 +81,7 @@ obj-$(CONFIG_ISO9660_FS) += isofs/ obj-$(CONFIG_HFSPLUS_FS) += hfsplus/ # Before hfs to find wrapped HFS+ obj-$(CONFIG_HFS_FS) += hfs/ obj-$(CONFIG_ECRYPT_FS) += ecryptfs/ +obj-$(CONFIG_SDCARD_FS) += sdcardfs/ obj-$(CONFIG_VXFS_FS) += freevxfs/ obj-$(CONFIG_NFS_FS) += nfs/ obj-$(CONFIG_EXPORTFS) += exportfs/ diff --git a/fs/sdcardfs/Kconfig b/fs/sdcardfs/Kconfig new file mode 100644 index 000000000000..657f4958e8d6 --- /dev/null +++ b/fs/sdcardfs/Kconfig @@ -0,0 +1,18 @@ +config SDCARD_FS + tristate "sdcard file system" + depends on EXPERIMENTAL + default n + help + Sdcardfs is based on Wrapfs file system. + +config SDCARD_FS_FADV_NOACTIVE + bool "sdcardfs fadvise noactive support" + depends on FADV_NOACTIVE + default y + help + Sdcardfs supports fadvise noactive mode. 
+ +config SDCARD_FS_CI_SEARCH + tristate "sdcardfs case-insensitive search support" + depends on SDCARD_FS + default y diff --git a/fs/sdcardfs/Makefile b/fs/sdcardfs/Makefile new file mode 100644 index 000000000000..b84fbb2b45a4 --- /dev/null +++ b/fs/sdcardfs/Makefile @@ -0,0 +1,7 @@ +SDCARDFS_VERSION="0.1" + +EXTRA_CFLAGS += -DSDCARDFS_VERSION=\"$(SDCARDFS_VERSION)\" + +obj-$(CONFIG_SDCARD_FS) += sdcardfs.o + +sdcardfs-y := dentry.o file.o inode.o main.o super.o lookup.o mmap.o packagelist.o derived_perm.o diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c new file mode 100644 index 000000000000..4572a5403bb2 --- /dev/null +++ b/fs/sdcardfs/dentry.c @@ -0,0 +1,182 @@ +/* + * fs/sdcardfs/dentry.c + * + * Copyright (c) 2013 Samsung Electronics Co. Ltd + * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun, + * Sunghwan Yun, Sungjong Seo + * + * This program has been developed as a stackable file system based on + * the WrapFS which written by + * + * Copyright (c) 1998-2011 Erez Zadok + * Copyright (c) 2009 Shrikar Archak + * Copyright (c) 2003-2011 Stony Brook University + * Copyright (c) 2003-2011 The Research Foundation of SUNY + * + * This file is dual licensed. It may be redistributed and/or modified + * under the terms of the Apache 2.0 License OR version 2 of the GNU + * General Public License. + */ + +#include "sdcardfs.h" +#include "linux/ctype.h" + +/* + * returns: -ERRNO if error (returned to user) + * 0: tell VFS to invalidate dentry + * 1: dentry is valid + */ +static int sdcardfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + int err = 1; + struct path parent_lower_path, lower_path; + struct dentry *parent_dentry = NULL; + struct dentry *parent_lower_dentry = NULL; + struct dentry *lower_cur_parent_dentry = NULL; + struct dentry *lower_dentry = NULL; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + spin_lock(&dentry->d_lock); + if (IS_ROOT(dentry)) { + spin_unlock(&dentry->d_lock); + return 1; + } + spin_unlock(&dentry->d_lock); + + /* check uninitialized obb_dentry and + * whether the base obbpath has been changed or not */ + if (is_obbpath_invalid(dentry)) { + d_drop(dentry); + return 0; + } + + parent_dentry = dget_parent(dentry); + sdcardfs_get_lower_path(parent_dentry, &parent_lower_path); + sdcardfs_get_real_lower(dentry, &lower_path); + parent_lower_dentry = parent_lower_path.dentry; + lower_dentry = lower_path.dentry; + lower_cur_parent_dentry = dget_parent(lower_dentry); + + spin_lock(&lower_dentry->d_lock); + if (d_unhashed(lower_dentry)) { + spin_unlock(&lower_dentry->d_lock); + d_drop(dentry); + err = 0; + goto out; + } + spin_unlock(&lower_dentry->d_lock); + + if (parent_lower_dentry != lower_cur_parent_dentry) { + d_drop(dentry); + err = 0; + goto out; + } + + if (dentry < lower_dentry) { + spin_lock(&dentry->d_lock); + spin_lock(&lower_dentry->d_lock); + } else { + spin_lock(&lower_dentry->d_lock); + spin_lock(&dentry->d_lock); + } + + if (dentry->d_name.len != lower_dentry->d_name.len) { + __d_drop(dentry); + err = 0; + } else if (strncasecmp(dentry->d_name.name, lower_dentry->d_name.name, + dentry->d_name.len) != 0) { + __d_drop(dentry); + err = 0; + } + + if (dentry < lower_dentry) { + spin_unlock(&lower_dentry->d_lock); + spin_unlock(&dentry->d_lock); + } else { + spin_unlock(&dentry->d_lock); + spin_unlock(&lower_dentry->d_lock); + } + +out: + dput(parent_dentry); + dput(lower_cur_parent_dentry); + sdcardfs_put_lower_path(parent_dentry, &parent_lower_path); + sdcardfs_put_real_lower(dentry, &lower_path); + return err; +} + 
+static void sdcardfs_d_release(struct dentry *dentry) +{ + /* release and reset the lower paths */ + if(has_graft_path(dentry)) { + sdcardfs_put_reset_orig_path(dentry); + } + sdcardfs_put_reset_lower_path(dentry); + free_dentry_private_data(dentry); + return; +} + +static int sdcardfs_hash_ci(const struct dentry *dentry, + const struct inode *inode, struct qstr *qstr) +{ + /* + * This function is copy of vfat_hashi. + * FIXME Should we support national language? + * Refer to vfat_hashi() + * struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io; + */ + const unsigned char *name; + unsigned int len; + unsigned long hash; + + name = qstr->name; + //len = vfat_striptail_len(qstr); + len = qstr->len; + + hash = init_name_hash(); + while (len--) + //hash = partial_name_hash(nls_tolower(t, *name++), hash); + hash = partial_name_hash(tolower(*name++), hash); + qstr->hash = end_name_hash(hash); + + return 0; +} + +/* + * Case insensitive compare of two vfat names. + */ +static int sdcardfs_cmp_ci(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + /* This function is copy of vfat_cmpi */ + // FIXME Should we support national language? + //struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io; + //unsigned int alen, blen; + + /* A filename cannot end in '.' or we treat it like it has none */ + /* + alen = vfat_striptail_len(name); + blen = __vfat_striptail_len(len, str); + if (alen == blen) { + if (nls_strnicmp(t, name->name, str, alen) == 0) + return 0; + } + */ + if (name->len == len) { + if (strncasecmp(name->name, str, len) == 0) + return 0; + } + return 1; +} + +const struct dentry_operations sdcardfs_ci_dops = { + .d_revalidate = sdcardfs_d_revalidate, + .d_release = sdcardfs_d_release, + .d_hash = sdcardfs_hash_ci, + .d_compare = sdcardfs_cmp_ci, +}; + diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c new file mode 100644 index 000000000000..00c33a471dcc --- /dev/null +++ b/fs/sdcardfs/derived_perm.c @@ -0,0 +1,290 @@ +/* + * fs/sdcardfs/derived_perm.c + * + * Copyright (c) 2013 Samsung Electronics Co. Ltd + * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun, + * Sunghwan Yun, Sungjong Seo + * + * This program has been developed as a stackable file system based on + * the WrapFS which written by + * + * Copyright (c) 1998-2011 Erez Zadok + * Copyright (c) 2009 Shrikar Archak + * Copyright (c) 2003-2011 Stony Brook University + * Copyright (c) 2003-2011 The Research Foundation of SUNY + * + * This file is dual licensed. It may be redistributed and/or modified + * under the terms of the Apache 2.0 License OR version 2 of the GNU + * General Public License. 
+ */ + +#include "sdcardfs.h" + +/* copy derived state from parent inode */ +static void inherit_derived_state(struct inode *parent, struct inode *child) +{ + struct sdcardfs_inode_info *pi = SDCARDFS_I(parent); + struct sdcardfs_inode_info *ci = SDCARDFS_I(child); + + ci->perm = PERM_INHERIT; + ci->userid = pi->userid; + ci->d_uid = pi->d_uid; + ci->d_gid = pi->d_gid; + ci->d_mode = pi->d_mode; +} + +/* helper function for derived state */ +void setup_derived_state(struct inode *inode, perm_t perm, + userid_t userid, uid_t uid, gid_t gid, mode_t mode) +{ + struct sdcardfs_inode_info *info = SDCARDFS_I(inode); + + info->perm = perm; + info->userid = userid; + info->d_uid = uid; + info->d_gid = gid; + info->d_mode = mode; +} + +void get_derived_permission(struct dentry *parent, struct dentry *dentry) +{ + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); + struct sdcardfs_inode_info *info = SDCARDFS_I(dentry->d_inode); + struct sdcardfs_inode_info *parent_info= SDCARDFS_I(parent->d_inode); + appid_t appid; + + /* By default, each inode inherits from its parent. + * the properties are maintained on its private fields + * because the inode attributes will be modified with that of + * its lower inode. + * The derived state will be updated on the last + * stage of each system call by fix_derived_permission(inode). + */ + + inherit_derived_state(parent->d_inode, dentry->d_inode); + + //printk(KERN_INFO "sdcardfs: derived: %s, %s, %d\n", parent->d_name.name, + // dentry->d_name.name, parent_info->perm); + + if (sbi->options.derive == DERIVE_NONE) { + return; + } + + /* Derive custom permissions based on parent and current node */ + switch (parent_info->perm) { + case PERM_INHERIT: + /* Already inherited above */ + break; + case PERM_LEGACY_PRE_ROOT: + /* Legacy internal layout places users at top level */ + info->perm = PERM_ROOT; + info->userid = simple_strtoul(dentry->d_name.name, NULL, 10); + break; + case PERM_ROOT: + /* Assume masked off by default. */ + info->d_mode = 00770; + if (!strcasecmp(dentry->d_name.name, "Android")) { + /* App-specific directories inside; let anyone traverse */ + info->perm = PERM_ANDROID; + info->d_mode = 00771; + } else if (sbi->options.split_perms) { + if (!strcasecmp(dentry->d_name.name, "DCIM") + || !strcasecmp(dentry->d_name.name, "Pictures")) { + info->d_gid = AID_SDCARD_PICS; + } else if (!strcasecmp(dentry->d_name.name, "Alarms") + || !strcasecmp(dentry->d_name.name, "Movies") + || !strcasecmp(dentry->d_name.name, "Music") + || !strcasecmp(dentry->d_name.name, "Notifications") + || !strcasecmp(dentry->d_name.name, "Podcasts") + || !strcasecmp(dentry->d_name.name, "Ringtones")) { + info->d_gid = AID_SDCARD_AV; + } + } + break; + case PERM_ANDROID: + if (!strcasecmp(dentry->d_name.name, "data")) { + /* App-specific directories inside; let anyone traverse */ + info->perm = PERM_ANDROID_DATA; + info->d_mode = 00771; + } else if (!strcasecmp(dentry->d_name.name, "obb")) { + /* App-specific directories inside; let anyone traverse */ + info->perm = PERM_ANDROID_OBB; + info->d_mode = 00771; + // FIXME : this feature will be implemented later. + /* Single OBB directory is always shared */ + } else if (!strcasecmp(dentry->d_name.name, "user")) { + /* User directories must only be accessible to system, protected + * by sdcard_all. Zygote will bind mount the appropriate user- + * specific path. 
*/ + info->perm = PERM_ANDROID_USER; + info->d_gid = AID_SDCARD_ALL; + info->d_mode = 00770; + } + break; + /* same policy will be applied on PERM_ANDROID_DATA + * and PERM_ANDROID_OBB */ + case PERM_ANDROID_DATA: + case PERM_ANDROID_OBB: + appid = get_appid(sbi->pkgl_id, dentry->d_name.name); + if (appid != 0) { + info->d_uid = multiuser_get_uid(parent_info->userid, appid); + } + info->d_mode = 00770; + break; + case PERM_ANDROID_USER: + /* Root of a secondary user */ + info->perm = PERM_ROOT; + info->userid = simple_strtoul(dentry->d_name.name, NULL, 10); + info->d_gid = AID_SDCARD_R; + info->d_mode = 00771; + break; + } +} + +/* main function for updating derived permission */ +inline void update_derived_permission(struct dentry *dentry) +{ + struct dentry *parent; + + if(!dentry || !dentry->d_inode) { + printk(KERN_ERR "sdcardfs: %s: invalid dentry\n", __func__); + return; + } + /* FIXME: + * 1. need to check whether the dentry is updated or not + * 2. remove the root dentry update + */ + if(IS_ROOT(dentry)) { + //setup_default_pre_root_state(dentry->d_inode); + } else { + parent = dget_parent(dentry); + if(parent) { + get_derived_permission(parent, dentry); + dput(parent); + } + } + fix_derived_permission(dentry->d_inode); +} + +int need_graft_path(struct dentry *dentry) +{ + int ret = 0; + struct dentry *parent = dget_parent(dentry); + struct sdcardfs_inode_info *parent_info= SDCARDFS_I(parent->d_inode); + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); + + if(parent_info->perm == PERM_ANDROID && + !strcasecmp(dentry->d_name.name, "obb")) { + + /* /Android/obb is the base obbpath of DERIVED_UNIFIED */ + if(!(sbi->options.derive == DERIVE_UNIFIED + && parent_info->userid == 0)) { + ret = 1; + } + } + dput(parent); + return ret; +} + +int is_obbpath_invalid(struct dentry *dent) +{ + int ret = 0; + struct sdcardfs_dentry_info *di = SDCARDFS_D(dent); + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dent->d_sb); + char *path_buf, *obbpath_s; + + /* check the base obbpath has been changed. + * this routine can check an uninitialized obb dentry as well. 
+ * regarding the uninitialized obb, refer to the sdcardfs_mkdir() */ + spin_lock(&di->lock); + if(di->orig_path.dentry) { + if(!di->lower_path.dentry) { + ret = 1; + } else { + path_get(&di->lower_path); + //lower_parent = lock_parent(lower_path->dentry); + + path_buf = kmalloc(PATH_MAX, GFP_ATOMIC); + if(!path_buf) { + ret = 1; + printk(KERN_ERR "sdcardfs: " + "fail to allocate path_buf in %s.\n", __func__); + } else { + obbpath_s = d_path(&di->lower_path, path_buf, PATH_MAX); + if (d_unhashed(di->lower_path.dentry) || + strcasecmp(sbi->obbpath_s, obbpath_s)) { + ret = 1; + } + kfree(path_buf); + } + + //unlock_dir(lower_parent); + path_put(&di->lower_path); + } + } + spin_unlock(&di->lock); + return ret; +} + +int is_base_obbpath(struct dentry *dentry) +{ + int ret = 0; + struct dentry *parent = dget_parent(dentry); + struct sdcardfs_inode_info *parent_info= SDCARDFS_I(parent->d_inode); + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); + + spin_lock(&SDCARDFS_D(dentry)->lock); + /* DERIVED_LEGACY */ + if(parent_info->perm == PERM_LEGACY_PRE_ROOT && + !strcasecmp(dentry->d_name.name, "obb")) { + ret = 1; + } + /* DERIVED_UNIFIED :/Android/obb is the base obbpath */ + else if (parent_info->perm == PERM_ANDROID && + !strcasecmp(dentry->d_name.name, "obb")) { + if((sbi->options.derive == DERIVE_UNIFIED + && parent_info->userid == 0)) { + ret = 1; + } + } + spin_unlock(&SDCARDFS_D(dentry)->lock); + dput(parent); + return ret; +} + +/* The lower_path will be stored to the dentry's orig_path + * and the base obbpath will be copyed to the lower_path variable. + * if an error returned, there's no change in the lower_path + * returns: -ERRNO if error (0: no error) */ +int setup_obb_dentry(struct dentry *dentry, struct path *lower_path) +{ + int err = 0; + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); + struct path obbpath; + + /* A local obb dentry must have its own orig_path to support rmdir + * and mkdir of itself. Usually, we expect that the sbi->obbpath + * is avaiable on this stage. */ + sdcardfs_set_orig_path(dentry, lower_path); + + err = kern_path(sbi->obbpath_s, + LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &obbpath); + + if(!err) { + /* the obbpath base has been found */ + printk(KERN_INFO "sdcardfs: " + "the sbi->obbpath is found\n"); + pathcpy(lower_path, &obbpath); + } else { + /* if the sbi->obbpath is not available, we can optionally + * setup the lower_path with its orig_path. + * but, the current implementation just returns an error + * because the sdcard daemon also regards this case as + * a lookup fail. */ + printk(KERN_INFO "sdcardfs: " + "the sbi->obbpath is not available\n"); + } + return err; +} + + diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c new file mode 100644 index 000000000000..bcacb947c874 --- /dev/null +++ b/fs/sdcardfs/file.c @@ -0,0 +1,357 @@ +/* + * fs/sdcardfs/file.c + * + * Copyright (c) 2013 Samsung Electronics Co. Ltd + * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun, + * Sunghwan Yun, Sungjong Seo + * + * This program has been developed as a stackable file system based on + * the WrapFS which written by + * + * Copyright (c) 1998-2011 Erez Zadok + * Copyright (c) 2009 Shrikar Archak + * Copyright (c) 2003-2011 Stony Brook University + * Copyright (c) 2003-2011 The Research Foundation of SUNY + * + * This file is dual licensed. It may be redistributed and/or modified + * under the terms of the Apache 2.0 License OR version 2 of the GNU + * General Public License. 
+ */ + +#include "sdcardfs.h" +#ifdef CONFIG_SDCARD_FS_FADV_NOACTIVE +#include +#endif + +static ssize_t sdcardfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + int err; + struct file *lower_file; + struct dentry *dentry = file->f_path.dentry; +#ifdef CONFIG_SDCARD_FS_FADV_NOACTIVE + struct backing_dev_info *bdi; +#endif + + lower_file = sdcardfs_lower_file(file); + +#ifdef CONFIG_SDCARD_FS_FADV_NOACTIVE + if (file->f_mode & FMODE_NOACTIVE) { + if (!(lower_file->f_mode & FMODE_NOACTIVE)) { + bdi = lower_file->f_mapping->backing_dev_info; + lower_file->f_ra.ra_pages = bdi->ra_pages * 2; + spin_lock(&lower_file->f_lock); + lower_file->f_mode |= FMODE_NOACTIVE; + spin_unlock(&lower_file->f_lock); + } + } +#endif + + err = vfs_read(lower_file, buf, count, ppos); + /* update our inode atime upon a successful lower read */ + if (err >= 0) + fsstack_copy_attr_atime(dentry->d_inode, + lower_file->f_path.dentry->d_inode); + + return err; +} + +static ssize_t sdcardfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + int err = 0; + struct file *lower_file; + struct dentry *dentry = file->f_path.dentry; + + /* check disk space */ + if (!check_min_free_space(dentry, count, 0)) { + printk(KERN_INFO "No minimum free space.\n"); + return -ENOSPC; + } + + lower_file = sdcardfs_lower_file(file); + err = vfs_write(lower_file, buf, count, ppos); + /* update our inode times+sizes upon a successful lower write */ + if (err >= 0) { + fsstack_copy_inode_size(dentry->d_inode, + lower_file->f_path.dentry->d_inode); + fsstack_copy_attr_times(dentry->d_inode, + lower_file->f_path.dentry->d_inode); + } + + return err; +} + +static int sdcardfs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + int err = 0; + struct file *lower_file = NULL; + struct dentry *dentry = file->f_path.dentry; + + lower_file = sdcardfs_lower_file(file); + + lower_file->f_pos = file->f_pos; + err = vfs_readdir(lower_file, filldir, dirent); + file->f_pos = lower_file->f_pos; + if (err >= 0) /* copy the atime */ + fsstack_copy_attr_atime(dentry->d_inode, + lower_file->f_path.dentry->d_inode); + return err; +} + +static long sdcardfs_unlocked_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + long err = -ENOTTY; + struct file *lower_file; + + lower_file = sdcardfs_lower_file(file); + + /* XXX: use vfs_ioctl if/when VFS exports it */ + if (!lower_file || !lower_file->f_op) + goto out; + if (lower_file->f_op->unlocked_ioctl) + err = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); + +out: + return err; +} + +#ifdef CONFIG_COMPAT +static long sdcardfs_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + long err = -ENOTTY; + struct file *lower_file; + + lower_file = sdcardfs_lower_file(file); + + /* XXX: use vfs_ioctl if/when VFS exports it */ + if (!lower_file || !lower_file->f_op) + goto out; + if (lower_file->f_op->compat_ioctl) + err = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); + +out: + return err; +} +#endif + +static int sdcardfs_mmap(struct file *file, struct vm_area_struct *vma) +{ + int err = 0; + bool willwrite; + struct file *lower_file; + const struct vm_operations_struct *saved_vm_ops = NULL; + + /* this might be deferred to mmap's writepage */ + willwrite = ((vma->vm_flags | VM_SHARED | VM_WRITE) == vma->vm_flags); + + /* + * File systems which do not implement ->writepage may use + * generic_file_readonly_mmap as their ->mmap op. 
If you call + * generic_file_readonly_mmap with VM_WRITE, you'd get an -EINVAL. + * But we cannot call the lower ->mmap op, so we can't tell that + * writeable mappings won't work. Therefore, our only choice is to + * check if the lower file system supports the ->writepage, and if + * not, return EINVAL (the same error that + * generic_file_readonly_mmap returns in that case). + */ + lower_file = sdcardfs_lower_file(file); + if (willwrite && !lower_file->f_mapping->a_ops->writepage) { + err = -EINVAL; + printk(KERN_ERR "sdcardfs: lower file system does not " + "support writeable mmap\n"); + goto out; + } + + /* + * find and save lower vm_ops. + * + * XXX: the VFS should have a cleaner way of finding the lower vm_ops + */ + if (!SDCARDFS_F(file)->lower_vm_ops) { + err = lower_file->f_op->mmap(lower_file, vma); + if (err) { + printk(KERN_ERR "sdcardfs: lower mmap failed %d\n", err); + goto out; + } + saved_vm_ops = vma->vm_ops; /* save: came from lower ->mmap */ + err = do_munmap(current->mm, vma->vm_start, + vma->vm_end - vma->vm_start); + if (err) { + printk(KERN_ERR "sdcardfs: do_munmap failed %d\n", err); + goto out; + } + } + + /* + * Next 3 lines are all I need from generic_file_mmap. I definitely + * don't want its test for ->readpage which returns -ENOEXEC. + */ + file_accessed(file); + vma->vm_ops = &sdcardfs_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + + file->f_mapping->a_ops = &sdcardfs_aops; /* set our aops */ + if (!SDCARDFS_F(file)->lower_vm_ops) /* save for our ->fault */ + SDCARDFS_F(file)->lower_vm_ops = saved_vm_ops; + +out: + return err; +} + +static int sdcardfs_open(struct inode *inode, struct file *file) +{ + int err = 0; + struct file *lower_file = NULL; + struct path lower_path; + struct dentry *dentry = file->f_path.dentry; + struct dentry *parent = dget_parent(dentry); + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); + const struct cred *saved_cred = NULL; + int has_rw; + + /* don't open unhashed/deleted files */ + if (d_unhashed(dentry)) { + err = -ENOENT; + goto out_err; + } + + has_rw = get_caller_has_rw_locked(sbi->pkgl_id, sbi->options.derive); + + if(!check_caller_access_to_name(parent->d_inode, dentry->d_name.name, + sbi->options.derive, + open_flags_to_access_mode(file->f_flags), has_rw)) { + printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" + " dentry: %s, task:%s\n", + __func__, dentry->d_name.name, current->comm); + err = -EACCES; + goto out_err; + } + + /* save current_cred and override it */ + OVERRIDE_CRED(sbi, saved_cred); + + file->private_data = + kzalloc(sizeof(struct sdcardfs_file_info), GFP_KERNEL); + if (!SDCARDFS_F(file)) { + err = -ENOMEM; + goto out_revert_cred; + } + + /* open lower object and link sdcardfs's file struct to lower's */ + sdcardfs_get_lower_path(file->f_path.dentry, &lower_path); + lower_file = dentry_open(lower_path.dentry, lower_path.mnt, + file->f_flags, current_cred()); + if (IS_ERR(lower_file)) { + err = PTR_ERR(lower_file); + lower_file = sdcardfs_lower_file(file); + if (lower_file) { + sdcardfs_set_lower_file(file, NULL); + fput(lower_file); /* fput calls dput for lower_dentry */ + } + } else { + sdcardfs_set_lower_file(file, lower_file); + } + + if (err) + kfree(SDCARDFS_F(file)); + else { + fsstack_copy_attr_all(inode, sdcardfs_lower_inode(inode)); + fix_derived_permission(inode); + } + +out_revert_cred: + REVERT_CRED(saved_cred); +out_err: + dput(parent); + return err; +} + +static int sdcardfs_flush(struct file *file, fl_owner_t id) +{ + int err = 0; + struct file *lower_file = 
NULL; + + lower_file = sdcardfs_lower_file(file); + if (lower_file && lower_file->f_op && lower_file->f_op->flush) + err = lower_file->f_op->flush(lower_file, id); + + return err; +} + +/* release all lower object references & free the file info structure */ +static int sdcardfs_file_release(struct inode *inode, struct file *file) +{ + struct file *lower_file; + + lower_file = sdcardfs_lower_file(file); + if (lower_file) { + sdcardfs_set_lower_file(file, NULL); + fput(lower_file); + } + + kfree(SDCARDFS_F(file)); + return 0; +} + +static int +sdcardfs_fsync(struct file *file, int datasync) +{ + int err; + struct file *lower_file; + struct path lower_path; + struct dentry *dentry = file->f_path.dentry; + + lower_file = sdcardfs_lower_file(file); + sdcardfs_get_lower_path(dentry, &lower_path); + err = vfs_fsync(lower_file, datasync); + sdcardfs_put_lower_path(dentry, &lower_path); + + return err; +} + +static int sdcardfs_fasync(int fd, struct file *file, int flag) +{ + int err = 0; + struct file *lower_file = NULL; + + lower_file = sdcardfs_lower_file(file); + if (lower_file->f_op && lower_file->f_op->fasync) + err = lower_file->f_op->fasync(fd, lower_file, flag); + + return err; +} + +const struct file_operations sdcardfs_main_fops = { + .llseek = generic_file_llseek, + .read = sdcardfs_read, + .write = sdcardfs_write, + .unlocked_ioctl = sdcardfs_unlocked_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = sdcardfs_compat_ioctl, +#endif + .mmap = sdcardfs_mmap, + .open = sdcardfs_open, + .flush = sdcardfs_flush, + .release = sdcardfs_file_release, + .fsync = sdcardfs_fsync, + .fasync = sdcardfs_fasync, +}; + +/* trimmed directory options */ +const struct file_operations sdcardfs_dir_fops = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = sdcardfs_readdir, + .unlocked_ioctl = sdcardfs_unlocked_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = sdcardfs_compat_ioctl, +#endif + .open = sdcardfs_open, + .release = sdcardfs_file_release, + .flush = sdcardfs_flush, + .fsync = sdcardfs_fsync, + .fasync = sdcardfs_fasync, +}; diff --git a/fs/sdcardfs/hashtable.h b/fs/sdcardfs/hashtable.h new file mode 100644 index 000000000000..1e770f3df148 --- /dev/null +++ b/fs/sdcardfs/hashtable.h @@ -0,0 +1,190 @@ +/* + * Statically sized hash table implementation + * (C) 2012 Sasha Levin + */ + +#ifndef _LINUX_HASHTABLE_H +#define _LINUX_HASHTABLE_H + +#include +#include +#include +#include +#include + +#define DEFINE_HASHTABLE(name, bits) \ + struct hlist_head name[1 << (bits)] = \ + { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } + +#define DECLARE_HASHTABLE(name, bits) \ + struct hlist_head name[1 << (bits)] + +#define HASH_SIZE(name) (ARRAY_SIZE(name)) +#define HASH_BITS(name) ilog2(HASH_SIZE(name)) + +/* Use hash_32 when possible to allow for fast 32bit hashing in 64bit kernels. */ +#define hash_min(val, bits) \ + (sizeof(val) <= 4 ? hash_32(val, bits) : hash_long(val, bits)) + +static inline void __hash_init(struct hlist_head *ht, unsigned int sz) +{ + unsigned int i; + + for (i = 0; i < sz; i++) + INIT_HLIST_HEAD(&ht[i]); +} + +/** + * hash_init - initialize a hash table + * @hashtable: hashtable to be initialized + * + * Calculates the size of the hashtable from the given parameter, otherwise + * same as hash_init_size. + * + * This has to be a macro since HASH_BITS() will not work on pointers since + * it calculates the size during preprocessing. 
+ */ +#define hash_init(hashtable) __hash_init(hashtable, HASH_SIZE(hashtable)) + +/** + * hash_add - add an object to a hashtable + * @hashtable: hashtable to add to + * @node: the &struct hlist_node of the object to be added + * @key: the key of the object to be added + */ +#define hash_add(hashtable, node, key) \ + hlist_add_head(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) + +/** + * hash_add_rcu - add an object to a rcu enabled hashtable + * @hashtable: hashtable to add to + * @node: the &struct hlist_node of the object to be added + * @key: the key of the object to be added + */ +#define hash_add_rcu(hashtable, node, key) \ + hlist_add_head_rcu(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) + +/** + * hash_hashed - check whether an object is in any hashtable + * @node: the &struct hlist_node of the object to be checked + */ +static inline bool hash_hashed(struct hlist_node *node) +{ + return !hlist_unhashed(node); +} + +static inline bool __hash_empty(struct hlist_head *ht, unsigned int sz) +{ + unsigned int i; + + for (i = 0; i < sz; i++) + if (!hlist_empty(&ht[i])) + return false; + + return true; +} + +/** + * hash_empty - check whether a hashtable is empty + * @hashtable: hashtable to check + * + * This has to be a macro since HASH_BITS() will not work on pointers since + * it calculates the size during preprocessing. + */ +#define hash_empty(hashtable) __hash_empty(hashtable, HASH_SIZE(hashtable)) + +/** + * hash_del - remove an object from a hashtable + * @node: &struct hlist_node of the object to remove + */ +static inline void hash_del(struct hlist_node *node) +{ + hlist_del_init(node); +} + +/** + * hash_del_rcu - remove an object from a rcu enabled hashtable + * @node: &struct hlist_node of the object to remove + */ +static inline void hash_del_rcu(struct hlist_node *node) +{ + hlist_del_init_rcu(node); +} + +/** + * hash_for_each - iterate over a hashtable + * @name: hashtable to iterate + * @bkt: integer to use as bucket loop cursor + * @obj: the type * to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + */ +#define hash_for_each(name, bkt, obj, member, pos) \ + for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ + (bkt)++)\ + hlist_for_each_entry(obj, pos, &name[bkt], member) + +/** + * hash_for_each_rcu - iterate over a rcu enabled hashtable + * @name: hashtable to iterate + * @bkt: integer to use as bucket loop cursor + * @obj: the type * to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + */ +#define hash_for_each_rcu(name, bkt, obj, member) \ + for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ + (bkt)++)\ + hlist_for_each_entry_rcu(obj, &name[bkt], member) + +/** + * hash_for_each_safe - iterate over a hashtable safe against removal of + * hash entry + * @name: hashtable to iterate + * @bkt: integer to use as bucket loop cursor + * @tmp: a &struct used for temporary storage + * @obj: the type * to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + */ +#define hash_for_each_safe(name, bkt, tmp, obj, member, pos) \ + for ((bkt) = 0, obj = NULL; (bkt) < HASH_SIZE(name);\ + (bkt)++)\ + hlist_for_each_entry_safe(obj, pos, tmp, &name[bkt], member) + +/** + * hash_for_each_possible - iterate over all possible objects hashing to the + * same bucket + * @name: hashtable to iterate + * @obj: the type * to use as a loop cursor for each entry + * @member: the name of the hlist_node within 
the struct + * @key: the key of the objects to iterate over + */ +#define hash_for_each_possible(name, obj, member, key, pos) \ + hlist_for_each_entry(obj, pos, &name[hash_min(key, HASH_BITS(name))], member) + +/** + * hash_for_each_possible_rcu - iterate over all possible objects hashing to the + * same bucket in an rcu enabled hashtable + * in a rcu enabled hashtable + * @name: hashtable to iterate + * @obj: the type * to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + * @key: the key of the objects to iterate over + */ +#define hash_for_each_possible_rcu(name, obj, member, key) \ + hlist_for_each_entry_rcu(obj, &name[hash_min(key, HASH_BITS(name))],\ + member) + +/** + * hash_for_each_possible_safe - iterate over all possible objects hashing to the + * same bucket safe against removals + * @name: hashtable to iterate + * @obj: the type * to use as a loop cursor for each entry + * @tmp: a &struct used for temporary storage + * @member: the name of the hlist_node within the struct + * @key: the key of the objects to iterate over + */ +#define hash_for_each_possible_safe(name, obj, tmp, member, key) \ + hlist_for_each_entry_safe(obj, tmp,\ + &name[hash_min(key, HASH_BITS(name))], member) + + +#endif diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c new file mode 100644 index 000000000000..e8ed04250ed1 --- /dev/null +++ b/fs/sdcardfs/inode.c @@ -0,0 +1,886 @@ +/* + * fs/sdcardfs/inode.c + * + * Copyright (c) 2013 Samsung Electronics Co. Ltd + * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun, + * Sunghwan Yun, Sungjong Seo + * + * This program has been developed as a stackable file system based on + * the WrapFS which written by + * + * Copyright (c) 1998-2011 Erez Zadok + * Copyright (c) 2009 Shrikar Archak + * Copyright (c) 2003-2011 Stony Brook University + * Copyright (c) 2003-2011 The Research Foundation of SUNY + * + * This file is dual licensed. It may be redistributed and/or modified + * under the terms of the Apache 2.0 License OR version 2 of the GNU + * General Public License. + */ + +#include "sdcardfs.h" + +/* Do not directly use this function. Use OVERRIDE_CRED() instead. */ +const struct cred * override_fsids(struct sdcardfs_sb_info* sbi) +{ + struct cred * cred; + const struct cred * old_cred; + + cred = prepare_creds(); + if (!cred) + return NULL; + + cred->fsuid = sbi->options.fs_low_uid; + cred->fsgid = sbi->options.fs_low_gid; + + old_cred = override_creds(cred); + + return old_cred; +} + +/* Do not directly use this function, use REVERT_CRED() instead. 
*/ +void revert_fsids(const struct cred * old_cred) +{ + const struct cred * cur_cred; + + cur_cred = current->cred; + revert_creds(old_cred); + put_cred(cur_cred); +} + +static int sdcardfs_create(struct inode *dir, struct dentry *dentry, + int mode, struct nameidata *nd) +{ + int err = 0; + struct dentry *lower_dentry; + struct dentry *lower_parent_dentry = NULL; + struct path lower_path, saved_path; + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); + const struct cred *saved_cred = NULL; + + int has_rw = get_caller_has_rw_locked(sbi->pkgl_id, sbi->options.derive); + if(!check_caller_access_to_name(dir, dentry->d_name.name, sbi->options.derive, 1, has_rw)) { + printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" + " dentry: %s, task:%s\n", + __func__, dentry->d_name.name, current->comm); + err = -EACCES; + goto out_eacces; + } + + /* save current_cred and override it */ + OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred); + + sdcardfs_get_lower_path(dentry, &lower_path); + lower_dentry = lower_path.dentry; + lower_parent_dentry = lock_parent(lower_dentry); + + err = mnt_want_write(lower_path.mnt); + if (err) + goto out_unlock; + + pathcpy(&saved_path, &nd->path); + pathcpy(&nd->path, &lower_path); + + /* set last 16bytes of mode field to 0664 */ + mode = (mode & S_IFMT) | 00664; + err = vfs_create(lower_parent_dentry->d_inode, lower_dentry, mode, nd); + + pathcpy(&nd->path, &saved_path); + if (err) + goto out; + + err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path); + if (err) + goto out; + fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir)); + fsstack_copy_inode_size(dir, lower_parent_dentry->d_inode); + +out: + mnt_drop_write(lower_path.mnt); +out_unlock: + unlock_dir(lower_parent_dentry); + sdcardfs_put_lower_path(dentry, &lower_path); + REVERT_CRED(saved_cred); +out_eacces: + return err; +} + +#if 0 +static int sdcardfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + struct dentry *lower_old_dentry; + struct dentry *lower_new_dentry; + struct dentry *lower_dir_dentry; + u64 file_size_save; + int err; + struct path lower_old_path, lower_new_path; + + OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb)); + + file_size_save = i_size_read(old_dentry->d_inode); + sdcardfs_get_lower_path(old_dentry, &lower_old_path); + sdcardfs_get_lower_path(new_dentry, &lower_new_path); + lower_old_dentry = lower_old_path.dentry; + lower_new_dentry = lower_new_path.dentry; + lower_dir_dentry = lock_parent(lower_new_dentry); + + err = mnt_want_write(lower_new_path.mnt); + if (err) + goto out_unlock; + + err = vfs_link(lower_old_dentry, lower_dir_dentry->d_inode, + lower_new_dentry); + if (err || !lower_new_dentry->d_inode) + goto out; + + err = sdcardfs_interpose(new_dentry, dir->i_sb, &lower_new_path); + if (err) + goto out; + fsstack_copy_attr_times(dir, lower_new_dentry->d_inode); + fsstack_copy_inode_size(dir, lower_new_dentry->d_inode); + old_dentry->d_inode->i_nlink = + sdcardfs_lower_inode(old_dentry->d_inode)->i_nlink; + i_size_write(new_dentry->d_inode, file_size_save); +out: + mnt_drop_write(lower_new_path.mnt); +out_unlock: + unlock_dir(lower_dir_dentry); + sdcardfs_put_lower_path(old_dentry, &lower_old_path); + sdcardfs_put_lower_path(new_dentry, &lower_new_path); + REVERT_CRED(); + return err; +} +#endif + +static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int err; + struct dentry *lower_dentry; + struct inode *lower_dir_inode = sdcardfs_lower_inode(dir); + struct dentry *lower_dir_dentry; + struct path 
lower_path; + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); + const struct cred *saved_cred = NULL; + + int has_rw = get_caller_has_rw_locked(sbi->pkgl_id, sbi->options.derive); + if(!check_caller_access_to_name(dir, dentry->d_name.name, sbi->options.derive, 1, has_rw)) { + printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" + " dentry: %s, task:%s\n", + __func__, dentry->d_name.name, current->comm); + err = -EACCES; + goto out_eacces; + } + + /* save current_cred and override it */ + OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred); + + sdcardfs_get_lower_path(dentry, &lower_path); + lower_dentry = lower_path.dentry; + dget(lower_dentry); + lower_dir_dentry = lock_parent(lower_dentry); + + err = mnt_want_write(lower_path.mnt); + if (err) + goto out_unlock; + err = vfs_unlink(lower_dir_inode, lower_dentry); + + /* + * Note: unlinking on top of NFS can cause silly-renamed files. + * Trying to delete such files results in EBUSY from NFS + * below. Silly-renamed files will get deleted by NFS later on, so + * we just need to detect them here and treat such EBUSY errors as + * if the upper file was successfully deleted. + */ + if (err == -EBUSY && lower_dentry->d_flags & DCACHE_NFSFS_RENAMED) + err = 0; + if (err) + goto out; + fsstack_copy_attr_times(dir, lower_dir_inode); + fsstack_copy_inode_size(dir, lower_dir_inode); + dentry->d_inode->i_nlink = + sdcardfs_lower_inode(dentry->d_inode)->i_nlink; + dentry->d_inode->i_ctime = dir->i_ctime; + d_drop(dentry); /* this is needed, else LTP fails (VFS won't do it) */ +out: + mnt_drop_write(lower_path.mnt); +out_unlock: + unlock_dir(lower_dir_dentry); + dput(lower_dentry); + sdcardfs_put_lower_path(dentry, &lower_path); + REVERT_CRED(saved_cred); +out_eacces: + return err; +} + +#if 0 +static int sdcardfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + int err = 0; + struct dentry *lower_dentry; + struct dentry *lower_parent_dentry = NULL; + struct path lower_path; + + OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb)); + + sdcardfs_get_lower_path(dentry, &lower_path); + lower_dentry = lower_path.dentry; + lower_parent_dentry = lock_parent(lower_dentry); + + err = mnt_want_write(lower_path.mnt); + if (err) + goto out_unlock; + err = vfs_symlink(lower_parent_dentry->d_inode, lower_dentry, symname); + if (err) + goto out; + err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path); + if (err) + goto out; + fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir)); + fsstack_copy_inode_size(dir, lower_parent_dentry->d_inode); + +out: + mnt_drop_write(lower_path.mnt); +out_unlock: + unlock_dir(lower_parent_dentry); + sdcardfs_put_lower_path(dentry, &lower_path); + REVERT_CRED(); + return err; +} +#endif + +static int touch(char *abs_path, mode_t mode) { + struct file *filp = filp_open(abs_path, O_RDWR|O_CREAT|O_EXCL|O_NOFOLLOW, mode); + if (IS_ERR(filp)) { + if (PTR_ERR(filp) == -EEXIST) { + return 0; + } + else { + printk(KERN_ERR "sdcardfs: failed to open(%s): %ld\n", + abs_path, PTR_ERR(filp)); + return PTR_ERR(filp); + } + } + filp_close(filp, current->files); + return 0; +} + +static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int err = 0; + int make_nomedia_in_obb = 0; + struct dentry *lower_dentry; + struct dentry *lower_parent_dentry = NULL; + struct path lower_path; + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); + const struct cred *saved_cred = NULL; + struct sdcardfs_inode_info *pi = SDCARDFS_I(dir); + char *page_buf; + char *nomedia_dir_name; + char 
*nomedia_fullpath; + int fullpath_namelen; + int touch_err = 0; + + int has_rw = get_caller_has_rw_locked(sbi->pkgl_id, sbi->options.derive); + if(!check_caller_access_to_name(dir, dentry->d_name.name, sbi->options.derive, 1, has_rw)) { + printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" + " dentry: %s, task:%s\n", + __func__, dentry->d_name.name, current->comm); + err = -EACCES; + goto out_eacces; + } + + /* save current_cred and override it */ + OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred); + + /* check disk space */ + if (!check_min_free_space(dentry, 0, 1)) { + printk(KERN_INFO "sdcardfs: No minimum free space.\n"); + err = -ENOSPC; + goto out_revert; + } + + /* the lower_dentry is negative here */ + sdcardfs_get_lower_path(dentry, &lower_path); + lower_dentry = lower_path.dentry; + lower_parent_dentry = lock_parent(lower_dentry); + + err = mnt_want_write(lower_path.mnt); + if (err) + goto out_unlock; + + /* set last 16bytes of mode field to 0775 */ + mode = (mode & S_IFMT) | 00775; + err = vfs_mkdir(lower_parent_dentry->d_inode, lower_dentry, mode); + + if (err) + goto out; + + /* if it is a local obb dentry, setup it with the base obbpath */ + if(need_graft_path(dentry)) { + + err = setup_obb_dentry(dentry, &lower_path); + if(err) { + /* if the sbi->obbpath is not available, the lower_path won't be + * changed by setup_obb_dentry() but the lower path is saved to + * its orig_path. this dentry will be revalidated later. + * but now, the lower_path should be NULL */ + sdcardfs_put_reset_lower_path(dentry); + + /* the newly created lower path which saved to its orig_path or + * the lower_path is the base obbpath. + * therefore, an additional path_get is required */ + path_get(&lower_path); + } else + make_nomedia_in_obb = 1; + } + + err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path); + if (err) + goto out; + + fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir)); + fsstack_copy_inode_size(dir, lower_parent_dentry->d_inode); + /* update number of links on parent directory */ + dir->i_nlink = sdcardfs_lower_inode(dir)->i_nlink; + + if ((sbi->options.derive == DERIVE_UNIFIED) && (!strcasecmp(dentry->d_name.name, "obb")) + && (pi->perm == PERM_ANDROID) && (pi->userid == 0)) + make_nomedia_in_obb = 1; + + /* When creating /Android/data and /Android/obb, mark them as .nomedia */ + if (make_nomedia_in_obb || + ((pi->perm == PERM_ANDROID) && (!strcasecmp(dentry->d_name.name, "data")))) { + + page_buf = (char *)__get_free_page(GFP_KERNEL); + if (!page_buf) { + printk(KERN_ERR "sdcardfs: failed to allocate page buf\n"); + goto out; + } + + nomedia_dir_name = d_absolute_path(&lower_path, page_buf, PAGE_SIZE); + if (IS_ERR(nomedia_dir_name)) { + free_page((unsigned long)page_buf); + printk(KERN_ERR "sdcardfs: failed to get .nomedia dir name\n"); + goto out; + } + + fullpath_namelen = page_buf + PAGE_SIZE - nomedia_dir_name - 1; + fullpath_namelen += strlen("/.nomedia"); + nomedia_fullpath = kzalloc(fullpath_namelen + 1, GFP_KERNEL); + if (!nomedia_fullpath) { + free_page((unsigned long)page_buf); + printk(KERN_ERR "sdcardfs: failed to allocate .nomedia fullpath buf\n"); + goto out; + } + + strcpy(nomedia_fullpath, nomedia_dir_name); + free_page((unsigned long)page_buf); + strcat(nomedia_fullpath, "/.nomedia"); + touch_err = touch(nomedia_fullpath, 0664); + if (touch_err) { + printk(KERN_ERR "sdcardfs: failed to touch(%s): %d\n", + nomedia_fullpath, touch_err); + kfree(nomedia_fullpath); + goto out; + } + kfree(nomedia_fullpath); + } +out: + 
mnt_drop_write(lower_path.mnt); +out_unlock: + unlock_dir(lower_parent_dentry); + sdcardfs_put_lower_path(dentry, &lower_path); +out_revert: + REVERT_CRED(saved_cred); +out_eacces: + return err; +} + +static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; + int err; + struct path lower_path; + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); + const struct cred *saved_cred = NULL; + //char *path_s = NULL; + + int has_rw = get_caller_has_rw_locked(sbi->pkgl_id, sbi->options.derive); + if(!check_caller_access_to_name(dir, dentry->d_name.name, sbi->options.derive, 1, has_rw)) { + printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" + " dentry: %s, task:%s\n", + __func__, dentry->d_name.name, current->comm); + err = -EACCES; + goto out_eacces; + } + + /* save current_cred and override it */ + OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred); + + /* sdcardfs_get_real_lower(): in case of remove an user's obb dentry + * the dentry on the original path should be deleted. */ + sdcardfs_get_real_lower(dentry, &lower_path); + + lower_dentry = lower_path.dentry; + lower_dir_dentry = lock_parent(lower_dentry); + + err = mnt_want_write(lower_path.mnt); + if (err) + goto out_unlock; + err = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry); + if (err) + goto out; + + d_drop(dentry); /* drop our dentry on success (why not VFS's job?) */ + if (dentry->d_inode) + clear_nlink(dentry->d_inode); + fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); + fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); + dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; + +out: + mnt_drop_write(lower_path.mnt); +out_unlock: + unlock_dir(lower_dir_dentry); + sdcardfs_put_real_lower(dentry, &lower_path); + REVERT_CRED(saved_cred); +out_eacces: + return err; +} + +#if 0 +static int sdcardfs_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t dev) +{ + int err = 0; + struct dentry *lower_dentry; + struct dentry *lower_parent_dentry = NULL; + struct path lower_path; + + OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb)); + + sdcardfs_get_lower_path(dentry, &lower_path); + lower_dentry = lower_path.dentry; + lower_parent_dentry = lock_parent(lower_dentry); + + err = mnt_want_write(lower_path.mnt); + if (err) + goto out_unlock; + err = vfs_mknod(lower_parent_dentry->d_inode, lower_dentry, mode, dev); + if (err) + goto out; + + err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path); + if (err) + goto out; + fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir)); + fsstack_copy_inode_size(dir, lower_parent_dentry->d_inode); + +out: + mnt_drop_write(lower_path.mnt); +out_unlock: + unlock_dir(lower_parent_dentry); + sdcardfs_put_lower_path(dentry, &lower_path); + REVERT_CRED(); + return err; +} +#endif + +/* + * The locking rules in sdcardfs_rename are complex. We could use a simpler + * superblock-level name-space lock for renames and copy-ups. 
+ */ +static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int err = 0; + struct dentry *lower_old_dentry = NULL; + struct dentry *lower_new_dentry = NULL; + struct dentry *lower_old_dir_dentry = NULL; + struct dentry *lower_new_dir_dentry = NULL; + struct dentry *trap = NULL; + struct dentry *new_parent = NULL; + struct path lower_old_path, lower_new_path; + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(old_dentry->d_sb); + const struct cred *saved_cred = NULL; + + int has_rw = get_caller_has_rw_locked(sbi->pkgl_id, sbi->options.derive); + if(!check_caller_access_to_name(old_dir, old_dentry->d_name.name, + sbi->options.derive, 1, has_rw) || + !check_caller_access_to_name(new_dir, new_dentry->d_name.name, + sbi->options.derive, 1, has_rw)) { + printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" + " new_dentry: %s, task:%s\n", + __func__, new_dentry->d_name.name, current->comm); + err = -EACCES; + goto out_eacces; + } + + /* save current_cred and override it */ + OVERRIDE_CRED(SDCARDFS_SB(old_dir->i_sb), saved_cred); + + sdcardfs_get_real_lower(old_dentry, &lower_old_path); + sdcardfs_get_lower_path(new_dentry, &lower_new_path); + lower_old_dentry = lower_old_path.dentry; + lower_new_dentry = lower_new_path.dentry; + lower_old_dir_dentry = dget_parent(lower_old_dentry); + lower_new_dir_dentry = dget_parent(lower_new_dentry); + + trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + /* source should not be ancestor of target */ + if (trap == lower_old_dentry) { + err = -EINVAL; + goto out; + } + /* target should not be ancestor of source */ + if (trap == lower_new_dentry) { + err = -ENOTEMPTY; + goto out; + } + + err = mnt_want_write(lower_old_path.mnt); + if (err) + goto out; + err = mnt_want_write(lower_new_path.mnt); + if (err) + goto out_drop_old_write; + + err = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, + lower_new_dir_dentry->d_inode, lower_new_dentry); + if (err) + goto out_err; + + /* Copy attrs from lower dir, but i_uid/i_gid */ + fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode); + fsstack_copy_inode_size(new_dir, lower_new_dir_dentry->d_inode); + fix_derived_permission(new_dir); + if (new_dir != old_dir) { + fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode); + fsstack_copy_inode_size(old_dir, lower_old_dir_dentry->d_inode); + fix_derived_permission(old_dir); + /* update the derived permission of the old_dentry + * with its new parent + */ + new_parent = dget_parent(new_dentry); + if(new_parent) { + if(old_dentry->d_inode) { + get_derived_permission(new_parent, old_dentry); + fix_derived_permission(old_dentry->d_inode); + } + dput(new_parent); + } + } + +out_err: + mnt_drop_write(lower_new_path.mnt); +out_drop_old_write: + mnt_drop_write(lower_old_path.mnt); +out: + unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + dput(lower_old_dir_dentry); + dput(lower_new_dir_dentry); + sdcardfs_put_real_lower(old_dentry, &lower_old_path); + sdcardfs_put_lower_path(new_dentry, &lower_new_path); + REVERT_CRED(saved_cred); +out_eacces: + return err; +} + +#if 0 +static int sdcardfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) +{ + int err; + struct dentry *lower_dentry; + struct path lower_path; + /* XXX readlink does not requires overriding credential */ + + sdcardfs_get_lower_path(dentry, &lower_path); + lower_dentry = lower_path.dentry; + if (!lower_dentry->d_inode->i_op || + !lower_dentry->d_inode->i_op->readlink) 
{ + err = -EINVAL; + goto out; + } + + err = lower_dentry->d_inode->i_op->readlink(lower_dentry, + buf, bufsiz); + if (err < 0) + goto out; + fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode); + +out: + sdcardfs_put_lower_path(dentry, &lower_path); + return err; +} +#endif + +#if 0 +static void *sdcardfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + char *buf; + int len = PAGE_SIZE, err; + mm_segment_t old_fs; + + /* This is freed by the put_link method assuming a successful call. */ + buf = kmalloc(len, GFP_KERNEL); + if (!buf) { + buf = ERR_PTR(-ENOMEM); + goto out; + } + + /* read the symlink, and then we will follow it */ + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = sdcardfs_readlink(dentry, buf, len); + set_fs(old_fs); + if (err < 0) { + kfree(buf); + buf = ERR_PTR(err); + } else { + buf[err] = '\0'; + } +out: + nd_set_link(nd, buf); + return NULL; +} +#endif + +#if 0 +/* this @nd *IS* still used */ +static void sdcardfs_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *buf = nd_get_link(nd); + if (!IS_ERR(buf)) /* free the char* */ + kfree(buf); +} +#endif + +static int sdcardfs_permission(struct inode *inode, int mask, unsigned int flags) +{ + int err; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + /* + * Permission check on sdcardfs inode. + * Calling process should have AID_SDCARD_RW permission + */ + err = generic_permission(inode, mask, 0, inode->i_op->check_acl); + + /* XXX + * Original sdcardfs code calls inode_permission(lower_inode,.. ) + * for checking inode permission. But doing such things here seems + * duplicated work, because the functions called after this func, + * such as vfs_create, vfs_unlink, vfs_rename, and etc, + * does exactly same thing, i.e., they calls inode_permission(). + * So we just let they do the things. + * If there are any security hole, just uncomment following if block. + */ +#if 0 + if (!err) { + /* + * Permission check on lower_inode(=EXT4). + * we check it with AID_MEDIA_RW permission + */ + struct inode *lower_inode; + OVERRIDE_CRED(SDCARDFS_SB(inode->sb)); + + lower_inode = sdcardfs_lower_inode(inode); + err = inode_permission(lower_inode, mask); + + REVERT_CRED(); + } +#endif + return err; + +} + +static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct dentry *lower_dentry; + struct inode *inode; + struct inode *lower_inode; + struct path lower_path; + struct dentry *parent; + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); + + parent = dget_parent(dentry); + if(!check_caller_access_to_name(parent->d_inode, dentry->d_name.name, + sbi->options.derive, 0, 0)) { + printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" + " dentry: %s, task:%s\n", + __func__, dentry->d_name.name, current->comm); + dput(parent); + return -EACCES; + } + dput(parent); + + inode = dentry->d_inode; + + sdcardfs_get_lower_path(dentry, &lower_path); + lower_dentry = lower_path.dentry; + lower_inode = sdcardfs_lower_inode(inode); + + fsstack_copy_attr_all(inode, lower_inode); + fsstack_copy_inode_size(inode, lower_inode); + /* if the dentry has been moved from other location + * so, on this stage, its derived permission must be + * rechecked from its private field. 
+ */
+	fix_derived_permission(inode);
+
+	generic_fillattr(inode, stat);
+	sdcardfs_put_lower_path(dentry, &lower_path);
+	return 0;
+}
+
+static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia)
+{
+	int err = 0;
+	struct dentry *lower_dentry;
+	struct inode *inode;
+	struct inode *lower_inode;
+	struct path lower_path;
+	struct iattr lower_ia;
+	struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+	struct dentry *parent;
+	int has_rw;
+
+	inode = dentry->d_inode;
+
+	/*
+	 * Check if user has permission to change inode. We don't check if
+	 * this user can change the lower inode: that should happen when
+	 * calling notify_change on the lower inode.
+	 */
+	err = inode_change_ok(inode, ia);
+
+	/* no vfs_XXX operations required, cred overriding will be skipped. wj*/
+	if (!err) {
+		/* check the Android group ID */
+		has_rw = get_caller_has_rw_locked(sbi->pkgl_id, sbi->options.derive);
+		parent = dget_parent(dentry);
+		if(!check_caller_access_to_name(parent->d_inode, dentry->d_name.name,
+						sbi->options.derive, 1, has_rw)) {
+			printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n"
+					 " dentry: %s, task:%s\n",
+					 __func__, dentry->d_name.name, current->comm);
+			err = -EACCES;
+		}
+		dput(parent);
+	}
+
+	if (err)
+		goto out_err;
+
+	sdcardfs_get_lower_path(dentry, &lower_path);
+	lower_dentry = lower_path.dentry;
+	lower_inode = sdcardfs_lower_inode(inode);
+
+	/* prepare our own lower struct iattr (with the lower file) */
+	memcpy(&lower_ia, ia, sizeof(lower_ia));
+	if (ia->ia_valid & ATTR_FILE)
+		lower_ia.ia_file = sdcardfs_lower_file(ia->ia_file);
+
+	lower_ia.ia_valid &= ~(ATTR_UID | ATTR_GID | ATTR_MODE);
+
+	/*
+	 * If shrinking, first truncate upper level to cancel writing dirty
+	 * pages beyond the new eof; and also if its maxbytes is more
+	 * limiting (fail with -EFBIG before making any change to the lower
+	 * level). There is no need to vmtruncate the upper level
+	 * afterwards in the other cases: we fsstack_copy_inode_size from
+	 * the lower level.
+	 */
+	if (current->mm)
+		down_write(&current->mm->mmap_sem);
+	if (ia->ia_valid & ATTR_SIZE) {
+		err = inode_newsize_ok(inode, ia->ia_size);
+		if (err) {
+			if (current->mm)
+				up_write(&current->mm->mmap_sem);
+			goto out;
+		}
+		truncate_setsize(inode, ia->ia_size);
+	}
+
+	/*
+	 * mode change is for clearing setuid/setgid bits. Allow lower fs
+	 * to interpret this in its own way.
+	 */
+	if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
+		lower_ia.ia_valid &= ~ATTR_MODE;
+
+	/* notify the (possibly copied-up) lower inode */
+	/*
+	 * Note: we use lower_dentry->d_inode, because lower_inode may be
+	 * unlinked (no inode->i_sb and i_ino==0). This happens if someone
+	 * tries to open(), unlink(), then ftruncate() a file.
+	 */
+	mutex_lock(&lower_dentry->d_inode->i_mutex);
+	err = notify_change(lower_dentry, &lower_ia); /* note: lower_ia */
+	mutex_unlock(&lower_dentry->d_inode->i_mutex);
+	if (current->mm)
+		up_write(&current->mm->mmap_sem);
+	if (err)
+		goto out;
+
+	/* get attributes from the lower inode */
+	fsstack_copy_attr_all(inode, lower_inode);
+	/* update derived permission of the upper inode */
+	fix_derived_permission(inode);
+
+	/*
+	 * Not running fsstack_copy_inode_size(inode, lower_inode), because
+	 * VFS should update our inode size, and notify_change on
+	 * lower_inode should update its size.
+ */ + +out: + sdcardfs_put_lower_path(dentry, &lower_path); +out_err: + return err; +} + +const struct inode_operations sdcardfs_symlink_iops = { + .permission = sdcardfs_permission, + .setattr = sdcardfs_setattr, + /* XXX Following operations are implemented, + * but FUSE(sdcard) or FAT does not support them + * These methods are *NOT* perfectly tested. + .readlink = sdcardfs_readlink, + .follow_link = sdcardfs_follow_link, + .put_link = sdcardfs_put_link, + */ +}; + +const struct inode_operations sdcardfs_dir_iops = { + .create = sdcardfs_create, + .lookup = sdcardfs_lookup, + .permission = sdcardfs_permission, + .unlink = sdcardfs_unlink, + .mkdir = sdcardfs_mkdir, + .rmdir = sdcardfs_rmdir, + .rename = sdcardfs_rename, + .setattr = sdcardfs_setattr, + .getattr = sdcardfs_getattr, + /* XXX Following operations are implemented, + * but FUSE(sdcard) or FAT does not support them + * These methods are *NOT* perfectly tested. + .symlink = sdcardfs_symlink, + .link = sdcardfs_link, + .mknod = sdcardfs_mknod, + */ +}; + +const struct inode_operations sdcardfs_main_iops = { + .permission = sdcardfs_permission, + .setattr = sdcardfs_setattr, + .getattr = sdcardfs_getattr, +}; diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c new file mode 100644 index 000000000000..c0b12375b1bf --- /dev/null +++ b/fs/sdcardfs/lookup.c @@ -0,0 +1,386 @@ +/* + * fs/sdcardfs/lookup.c + * + * Copyright (c) 2013 Samsung Electronics Co. Ltd + * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun, + * Sunghwan Yun, Sungjong Seo + * + * This program has been developed as a stackable file system based on + * the WrapFS which written by + * + * Copyright (c) 1998-2011 Erez Zadok + * Copyright (c) 2009 Shrikar Archak + * Copyright (c) 2003-2011 Stony Brook University + * Copyright (c) 2003-2011 The Research Foundation of SUNY + * + * This file is dual licensed. It may be redistributed and/or modified + * under the terms of the Apache 2.0 License OR version 2 of the GNU + * General Public License. + */ + +#include "sdcardfs.h" +#include "linux/delay.h" + +/* The dentry cache is just so we have properly sized dentries */ +static struct kmem_cache *sdcardfs_dentry_cachep; + +int sdcardfs_init_dentry_cache(void) +{ + sdcardfs_dentry_cachep = + kmem_cache_create("sdcardfs_dentry", + sizeof(struct sdcardfs_dentry_info), + 0, SLAB_RECLAIM_ACCOUNT, NULL); + + return sdcardfs_dentry_cachep ? 
0 : -ENOMEM; +} + +void sdcardfs_destroy_dentry_cache(void) +{ + if (sdcardfs_dentry_cachep) + kmem_cache_destroy(sdcardfs_dentry_cachep); +} + +void free_dentry_private_data(struct dentry *dentry) +{ + if (!dentry || !dentry->d_fsdata) + return; + kmem_cache_free(sdcardfs_dentry_cachep, dentry->d_fsdata); + dentry->d_fsdata = NULL; +} + +/* allocate new dentry private data */ +int new_dentry_private_data(struct dentry *dentry) +{ + struct sdcardfs_dentry_info *info = SDCARDFS_D(dentry); + + /* use zalloc to init dentry_info.lower_path */ + info = kmem_cache_zalloc(sdcardfs_dentry_cachep, GFP_ATOMIC); + if (!info) + return -ENOMEM; + + spin_lock_init(&info->lock); + dentry->d_fsdata = info; + + return 0; +} + +static int sdcardfs_inode_test(struct inode *inode, void *candidate_lower_inode) +{ + struct inode *current_lower_inode = sdcardfs_lower_inode(inode); + if (current_lower_inode == (struct inode *)candidate_lower_inode) + return 1; /* found a match */ + else + return 0; /* no match */ +} + +static int sdcardfs_inode_set(struct inode *inode, void *lower_inode) +{ + /* we do actual inode initialization in sdcardfs_iget */ + return 0; +} + +static struct inode *sdcardfs_iget(struct super_block *sb, + struct inode *lower_inode) +{ + struct sdcardfs_inode_info *info; + struct inode *inode; /* the new inode to return */ + int err; + + inode = iget5_locked(sb, /* our superblock */ + /* + * hashval: we use inode number, but we can + * also use "(unsigned long)lower_inode" + * instead. + */ + lower_inode->i_ino, /* hashval */ + sdcardfs_inode_test, /* inode comparison function */ + sdcardfs_inode_set, /* inode init function */ + lower_inode); /* data passed to test+set fxns */ + if (!inode) { + err = -EACCES; + iput(lower_inode); + return ERR_PTR(err); + } + /* if found a cached inode, then just return it */ + if (!(inode->i_state & I_NEW)) + return inode; + + /* initialize new inode */ + info = SDCARDFS_I(inode); + + inode->i_ino = lower_inode->i_ino; + if (!igrab(lower_inode)) { + err = -ESTALE; + return ERR_PTR(err); + } + sdcardfs_set_lower_inode(inode, lower_inode); + + inode->i_version++; + + /* use different set of inode ops for symlinks & directories */ + if (S_ISDIR(lower_inode->i_mode)) + inode->i_op = &sdcardfs_dir_iops; + else if (S_ISLNK(lower_inode->i_mode)) + inode->i_op = &sdcardfs_symlink_iops; + else + inode->i_op = &sdcardfs_main_iops; + + /* use different set of file ops for directories */ + if (S_ISDIR(lower_inode->i_mode)) + inode->i_fop = &sdcardfs_dir_fops; + else + inode->i_fop = &sdcardfs_main_fops; + + inode->i_mapping->a_ops = &sdcardfs_aops; + + inode->i_atime.tv_sec = 0; + inode->i_atime.tv_nsec = 0; + inode->i_mtime.tv_sec = 0; + inode->i_mtime.tv_nsec = 0; + inode->i_ctime.tv_sec = 0; + inode->i_ctime.tv_nsec = 0; + + /* properly initialize special inodes */ + if (S_ISBLK(lower_inode->i_mode) || S_ISCHR(lower_inode->i_mode) || + S_ISFIFO(lower_inode->i_mode) || S_ISSOCK(lower_inode->i_mode)) + init_special_inode(inode, lower_inode->i_mode, + lower_inode->i_rdev); + + /* all well, copy inode attributes */ + fsstack_copy_attr_all(inode, lower_inode); + fsstack_copy_inode_size(inode, lower_inode); + + fix_derived_permission(inode); + + unlock_new_inode(inode); + return inode; +} + +/* + * Connect a sdcardfs inode dentry/inode with several lower ones. This is + * the classic stackable file system "vnode interposition" action. 
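+ * Roughly: __sdcardfs_lookup() resolves the name on the lower filesystem,
+ * sdcardfs_iget() above finds or allocates the upper inode (hashed by the
+ * lower inode number) that points at the lower inode, and d_add() below
+ * binds that inode to the sdcardfs dentry.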
+ * + * @dentry: sdcardfs's dentry which interposes on lower one + * @sb: sdcardfs's super_block + * @lower_path: the lower path (caller does path_get/put) + */ +int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb, + struct path *lower_path) +{ + int err = 0; + struct inode *inode; + struct inode *lower_inode; + struct super_block *lower_sb; + + lower_inode = lower_path->dentry->d_inode; + lower_sb = sdcardfs_lower_super(sb); + + /* check that the lower file system didn't cross a mount point */ + if (lower_inode->i_sb != lower_sb) { + err = -EXDEV; + goto out; + } + + /* + * We allocate our new inode below by calling sdcardfs_iget, + * which will initialize some of the new inode's fields + */ + + /* inherit lower inode number for sdcardfs's inode */ + inode = sdcardfs_iget(sb, lower_inode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + + d_add(dentry, inode); + update_derived_permission(dentry); +out: + return err; +} + +/* + * Main driver function for sdcardfs's lookup. + * + * Returns: NULL (ok), ERR_PTR if an error occurred. + * Fills in lower_parent_path with on success. + */ +static struct dentry *__sdcardfs_lookup(struct dentry *dentry, + struct nameidata *nd, struct path *lower_parent_path) +{ + int err = 0; + struct vfsmount *lower_dir_mnt; + struct dentry *lower_dir_dentry = NULL; + struct dentry *lower_dentry; + const char *name; + struct nameidata lower_nd; + struct path lower_path; + struct qstr this; + struct sdcardfs_sb_info *sbi; + + sbi = SDCARDFS_SB(dentry->d_sb); + /* must initialize dentry operations */ + d_set_d_op(dentry, &sdcardfs_ci_dops); + + if (IS_ROOT(dentry)) + goto out; + + name = dentry->d_name.name; + + /* now start the actual lookup procedure */ + lower_dir_dentry = lower_parent_path->dentry; + lower_dir_mnt = lower_parent_path->mnt; + + /* Use vfs_path_lookup to check if the dentry exists or not */ + if (sbi->options.lower_fs == LOWER_FS_EXT4) { + err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name, + LOOKUP_CASE_INSENSITIVE, &lower_nd); + } else if (sbi->options.lower_fs == LOWER_FS_FAT) { + err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name, 0, + &lower_nd); + } + + /* no error: handle positive dentries */ + if (!err) { + /* check if the dentry is an obb dentry + * if true, the lower_inode must be replaced with + * the inode of the graft path */ + + if(need_graft_path(dentry)) { + + /* setup_obb_dentry() + * The lower_path will be stored to the dentry's orig_path + * and the base obbpath will be copyed to the lower_path variable. + * if an error returned, there's no change in the lower_path + * returns: -ERRNO if error (0: no error) */ + err = setup_obb_dentry(dentry, &lower_nd.path); + + if(err) { + /* if the sbi->obbpath is not available, we can optionally + * setup the lower_path with its orig_path. + * but, the current implementation just returns an error + * because the sdcard daemon also regards this case as + * a lookup fail. */ + printk(KERN_INFO "sdcardfs: base obbpath is not available\n"); + sdcardfs_put_reset_orig_path(dentry); + goto out; + } + } + + sdcardfs_set_lower_path(dentry, &lower_nd.path); + err = sdcardfs_interpose(dentry, dentry->d_sb, &lower_nd.path); + if (err) /* path_put underlying path on error */ + sdcardfs_put_reset_lower_path(dentry); + goto out; + } + + /* + * We don't consider ENOENT an error, and we want to return a + * negative dentry. 
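+	 * (The negative dentry set up here is what later lets an O_CREAT open
+	 * or a rename target proceed; see the nd->flags check at the end of
+	 * this function.)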
+ */ + if (err && err != -ENOENT) + goto out; + + /* instatiate a new negative dentry */ + this.name = name; + this.len = strlen(name); + this.hash = full_name_hash(this.name, this.len); + lower_dentry = d_lookup(lower_dir_dentry, &this); + if (lower_dentry) + goto setup_lower; + + lower_dentry = d_alloc(lower_dir_dentry, &this); + if (!lower_dentry) { + err = -ENOMEM; + goto out; + } + d_add(lower_dentry, NULL); /* instantiate and hash */ + +setup_lower: + lower_path.dentry = lower_dentry; + lower_path.mnt = mntget(lower_dir_mnt); + sdcardfs_set_lower_path(dentry, &lower_path); + + /* + * If the intent is to create a file, then don't return an error, so + * the VFS will continue the process of making this negative dentry + * into a positive one. + */ + if (nd) { + if (nd->flags & (LOOKUP_CREATE|LOOKUP_RENAME_TARGET)) + err = 0; + } else + err = 0; + +out: + return ERR_PTR(err); +} + +/* + * On success: + * fills dentry object appropriate values and returns NULL. + * On fail (== error) + * returns error ptr + * + * @dir : Parent inode. It is locked (dir->i_mutex) + * @dentry : Target dentry to lookup. we should set each of fields. + * (dentry->d_name is initialized already) + * @nd : nameidata of parent inode + */ +struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct dentry *ret = NULL, *parent; + struct path lower_parent_path; + int err = 0; + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); + const struct cred *saved_cred = NULL; + + parent = dget_parent(dentry); + + if(!check_caller_access_to_name(parent->d_inode, dentry->d_name.name, + sbi->options.derive, 0, 0)) { + ret = ERR_PTR(-EACCES); + printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" + " dentry: %s, task:%s\n", + __func__, dentry->d_name.name, current->comm); + goto out_err; + } + + /* save current_cred and override it */ + OVERRIDE_CRED_PTR(SDCARDFS_SB(dir->i_sb), saved_cred); + + sdcardfs_get_lower_path(parent, &lower_parent_path); + + /* allocate dentry private data. We free it in ->d_release */ + err = new_dentry_private_data(dentry); + if (err) { + ret = ERR_PTR(err); + goto out; + } + + ret = __sdcardfs_lookup(dentry, nd, &lower_parent_path); + if (IS_ERR(ret)) + { + goto out; + } + if (ret) + dentry = ret; + if (dentry->d_inode) { + fsstack_copy_attr_times(dentry->d_inode, + sdcardfs_lower_inode(dentry->d_inode)); + /* get drived permission */ + get_derived_permission(parent, dentry); + fix_derived_permission(dentry->d_inode); + } + /* update parent directory's atime */ + fsstack_copy_attr_atime(parent->d_inode, + sdcardfs_lower_inode(parent->d_inode)); + +out: + sdcardfs_put_lower_path(parent, &lower_parent_path); + REVERT_CRED(saved_cred); +out_err: + dput(parent); + return ret; +} diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c new file mode 100644 index 000000000000..1fdceffec72c --- /dev/null +++ b/fs/sdcardfs/main.c @@ -0,0 +1,425 @@ +/* + * fs/sdcardfs/main.c + * + * Copyright (c) 2013 Samsung Electronics Co. Ltd + * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun, + * Sunghwan Yun, Sungjong Seo + * + * This program has been developed as a stackable file system based on + * the WrapFS which written by + * + * Copyright (c) 1998-2011 Erez Zadok + * Copyright (c) 2009 Shrikar Archak + * Copyright (c) 2003-2011 Stony Brook University + * Copyright (c) 2003-2011 The Research Foundation of SUNY + * + * This file is dual licensed. 
It may be redistributed and/or modified + * under the terms of the Apache 2.0 License OR version 2 of the GNU + * General Public License. + */ + +#include "sdcardfs.h" +#include +#include +#include + +enum { + Opt_uid, + Opt_gid, + Opt_wgid, + Opt_debug, + Opt_split, + Opt_derive, + Opt_lower_fs, + Opt_reserved_mb, + Opt_err, +}; + +static const match_table_t sdcardfs_tokens = { + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_wgid, "wgid=%u"}, + {Opt_debug, "debug"}, + {Opt_split, "split"}, + {Opt_derive, "derive=%s"}, + {Opt_lower_fs, "lower_fs=%s"}, + {Opt_reserved_mb, "reserved_mb=%u"}, + {Opt_err, NULL} +}; + +static int parse_options(struct super_block *sb, char *options, int silent, + int *debug, struct sdcardfs_mount_options *opts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + char *string_option; + + /* by default, we use AID_MEDIA_RW as uid, gid */ + opts->fs_low_uid = AID_MEDIA_RW; + opts->fs_low_gid = AID_MEDIA_RW; + /* by default, we use AID_SDCARD_RW as write_gid */ + opts->write_gid = AID_SDCARD_RW; + /* default permission policy + * (DERIVE_NONE | DERIVE_LEGACY | DERIVE_UNIFIED) */ + opts->derive = DERIVE_NONE; + opts->split_perms = 0; + /* by default, we use LOWER_FS_EXT4 as lower fs type */ + opts->lower_fs = LOWER_FS_EXT4; + /* by default, 0MB is reserved */ + opts->reserved_mb = 0; + + *debug = 0; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, sdcardfs_tokens, args); + + switch (token) { + case Opt_debug: + *debug = 1; + break; + case Opt_uid: + if (match_int(&args[0], &option)) + return 0; + opts->fs_low_uid = option; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + opts->fs_low_gid = option; + break; + case Opt_wgid: + if (match_int(&args[0], &option)) + return 0; + opts->write_gid = option; + break; + case Opt_split: + opts->split_perms=1; + break; + case Opt_derive: + string_option = match_strdup(&args[0]); + if (!strcmp("none", string_option)) { + opts->derive = DERIVE_NONE; + } else if (!strcmp("legacy", string_option)) { + opts->derive = DERIVE_LEGACY; + } else if (!strcmp("unified", string_option)) { + opts->derive = DERIVE_UNIFIED; + } else { + kfree(string_option); + goto invalid_option; + } + kfree(string_option); + break; + case Opt_lower_fs: + string_option = match_strdup(&args[0]); + if (!strcmp("ext4", string_option)) { + opts->lower_fs = LOWER_FS_EXT4; + } else if (!strcmp("fat", string_option)) { + opts->lower_fs = LOWER_FS_FAT; + } else { + kfree(string_option); + goto invalid_option; + } + kfree(string_option); + break; + case Opt_reserved_mb: + if (match_int(&args[0], &option)) + return 0; + opts->reserved_mb = option; + break; + /* unknown option */ + default: +invalid_option: + if (!silent) { + printk( KERN_ERR "Unrecognized mount option \"%s\" " + "or missing value", p); + } + return -EINVAL; + } + } + + if (*debug) { + printk( KERN_INFO "sdcardfs : options - debug:%d\n", *debug); + printk( KERN_INFO "sdcardfs : options - uid:%d\n", + opts->fs_low_uid); + printk( KERN_INFO "sdcardfs : options - gid:%d\n", + opts->fs_low_gid); + } + + return 0; +} + +/* + * our custom d_alloc_root work-alike + * + * we can't use d_alloc_root if we want to use our own interpose function + * unchanged, so we simply call our own "fake" d_alloc_root + */ +static struct dentry *sdcardfs_d_alloc_root(struct super_block *sb) +{ + struct dentry *ret = NULL; + + if (sb) { + static const struct qstr name = { + .name = "/", + .len = 1 
+ }; + + ret = d_alloc(NULL, &name); + if (ret) { + d_set_d_op(ret, &sdcardfs_ci_dops); + ret->d_sb = sb; + ret->d_parent = ret; + } + } + return ret; +} + +/* + * There is no need to lock the sdcardfs_super_info's rwsem as there is no + * way anyone can have a reference to the superblock at this point in time. + */ +static int sdcardfs_read_super(struct super_block *sb, const char *dev_name, + void *raw_data, int silent) +{ + int err = 0; + int debug; + struct super_block *lower_sb; + struct path lower_path; + struct sdcardfs_sb_info *sb_info; + void *pkgl_id; + + printk(KERN_INFO "sdcardfs version 2.0\n"); + + if (!dev_name) { + printk(KERN_ERR + "sdcardfs: read_super: missing dev_name argument\n"); + err = -EINVAL; + goto out; + } + + printk(KERN_INFO "sdcardfs: dev_name -> %s\n", dev_name); + printk(KERN_INFO "sdcardfs: options -> %s\n", (char *)raw_data); + + /* parse lower path */ + err = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, + &lower_path); + if (err) { + printk(KERN_ERR "sdcardfs: error accessing " + "lower directory '%s'\n", dev_name); + goto out; + } + + /* allocate superblock private data */ + sb->s_fs_info = kzalloc(sizeof(struct sdcardfs_sb_info), GFP_KERNEL); + if (!SDCARDFS_SB(sb)) { + printk(KERN_CRIT "sdcardfs: read_super: out of memory\n"); + err = -ENOMEM; + goto out_free; + } + + sb_info = sb->s_fs_info; + + /* parse options */ + err = parse_options(sb, raw_data, silent, &debug, &sb_info->options); + if (err) { + printk(KERN_ERR "sdcardfs: invalid options\n"); + goto out_freesbi; + } + + if (sb_info->options.derive != DERIVE_NONE) { + pkgl_id = packagelist_create(sb_info->options.write_gid); + if(IS_ERR(pkgl_id)) + goto out_freesbi; + else + sb_info->pkgl_id = pkgl_id; + } + + /* set the lower superblock field of upper superblock */ + lower_sb = lower_path.dentry->d_sb; + atomic_inc(&lower_sb->s_active); + sdcardfs_set_lower_super(sb, lower_sb); + + /* inherit maxbytes from lower file system */ + sb->s_maxbytes = lower_sb->s_maxbytes; + + /* + * Our c/m/atime granularity is 1 ns because we may stack on file + * systems whose granularity is as good. + */ + sb->s_time_gran = 1; + + sb->s_magic = SDCARDFS_SUPER_MAGIC; + sb->s_op = &sdcardfs_sops; + + /* see comment next to the definition of sdcardfs_d_alloc_root */ + sb->s_root = sdcardfs_d_alloc_root(sb); + if (!sb->s_root) { + err = -ENOMEM; + goto out_sput; + } + + /* link the upper and lower dentries */ + sb->s_root->d_fsdata = NULL; + err = new_dentry_private_data(sb->s_root); + if (err) + goto out_freeroot; + + /* set the lower dentries for s_root */ + sdcardfs_set_lower_path(sb->s_root, &lower_path); + + /* call interpose to create the upper level inode */ + err = sdcardfs_interpose(sb->s_root, sb, &lower_path); + if (!err) { + /* setup permission policy */ + switch(sb_info->options.derive) { + case DERIVE_NONE: + setup_derived_state(sb->s_root->d_inode, + PERM_ROOT, 0, AID_ROOT, AID_SDCARD_RW, 00775); + sb_info->obbpath_s = NULL; + break; + case DERIVE_LEGACY: + /* Legacy behavior used to support internal multiuser layout which + * places user_id at the top directory level, with the actual roots + * just below that. Shared OBB path is also at top level. 
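+			 * For illustration only (directory names are an assumption,
+			 * not something this code mandates): <lower>/0/... is user 0's
+			 * root, <lower>/10/... user 10's, and <lower>/obb is the
+			 * shared OBB tree whose path is built just below.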
*/ + setup_derived_state(sb->s_root->d_inode, + PERM_LEGACY_PRE_ROOT, 0, AID_ROOT, AID_SDCARD_R, 00771); + /* initialize the obbpath string and lookup the path + * sb_info->obb_path will be deactivated by path_put + * on sdcardfs_put_super */ + sb_info->obbpath_s = kzalloc(PATH_MAX, GFP_KERNEL); + snprintf(sb_info->obbpath_s, PATH_MAX, "%s/obb", dev_name); + err = prepare_dir(sb_info->obbpath_s, + sb_info->options.fs_low_uid, + sb_info->options.fs_low_gid, 00755); + if(err) + printk(KERN_ERR "sdcardfs: %s: %d, error on creating %s\n", + __func__,__LINE__, sb_info->obbpath_s); + break; + case DERIVE_UNIFIED: + /* Unified multiuser layout which places secondary user_id under + * /Android/user and shared OBB path under /Android/obb. */ + setup_derived_state(sb->s_root->d_inode, + PERM_ROOT, 0, AID_ROOT, AID_SDCARD_R, 00771); + + sb_info->obbpath_s = kzalloc(PATH_MAX, GFP_KERNEL); + snprintf(sb_info->obbpath_s, PATH_MAX, "%s/Android/obb", dev_name); + break; + } + fix_derived_permission(sb->s_root->d_inode); + + if (!silent) + printk(KERN_INFO "sdcardfs: mounted on top of %s type %s\n", + dev_name, lower_sb->s_type->name); + goto out; + } + /* else error: fall through */ + + free_dentry_private_data(sb->s_root); +out_freeroot: + dput(sb->s_root); +out_sput: + /* drop refs we took earlier */ + atomic_dec(&lower_sb->s_active); + packagelist_destroy(sb_info->pkgl_id); +out_freesbi: + kfree(SDCARDFS_SB(sb)); + sb->s_fs_info = NULL; +out_free: + path_put(&lower_path); + +out: + return err; +} + +/* A feature which supports mount_nodev() with options */ +static struct dentry *mount_nodev_with_options(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + int (*fill_super)(struct super_block *, const char *, void *, int)) + +{ + int error; + struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); + + if (IS_ERR(s)) + return ERR_CAST(s); + + s->s_flags = flags; + + error = fill_super(s, dev_name, data, flags & MS_SILENT ? 1 : 0); + if (error) { + deactivate_locked_super(s); + return ERR_PTR(error); + } + s->s_flags |= MS_ACTIVE; + return dget(s->s_root); +} + +struct dentry *sdcardfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + /* + * dev_name is a lower_path_name, + * raw_data is a option string. 
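+ * A typical invocation (paths and values here are only an example, not
+ * required by this code) would be:
+ *
+ *   mount -t sdcardfs -o uid=1023,gid=1023,derive=legacy,reserved_mb=20 \
+ *         /data/media /mnt/shell/emulated
+ *
+ * where the lower directory comes in as dev_name and the option string is
+ * parsed by parse_options() above.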
+ */ + return mount_nodev_with_options(fs_type, flags, dev_name, + raw_data, sdcardfs_read_super); +} + +static struct file_system_type sdcardfs_fs_type = { + .owner = THIS_MODULE, + .name = SDCARDFS_NAME, + .mount = sdcardfs_mount, + .kill_sb = generic_shutdown_super, + .fs_flags = FS_REVAL_DOT, +}; + +static int __init init_sdcardfs_fs(void) +{ + int err; + + pr_info("Registering sdcardfs " SDCARDFS_VERSION "\n"); + + err = sdcardfs_init_inode_cache(); + if (err) + goto out; + err = sdcardfs_init_dentry_cache(); + if (err) + goto out; + err = packagelist_init(); + if (err) + goto out; + err = register_filesystem(&sdcardfs_fs_type); +out: + if (err) { + sdcardfs_destroy_inode_cache(); + sdcardfs_destroy_dentry_cache(); + packagelist_exit(); + } + return err; +} + +static void __exit exit_sdcardfs_fs(void) +{ + sdcardfs_destroy_inode_cache(); + sdcardfs_destroy_dentry_cache(); + packagelist_exit(); + unregister_filesystem(&sdcardfs_fs_type); + pr_info("Completed sdcardfs module unload\n"); +} + +MODULE_AUTHOR("Erez Zadok, Filesystems and Storage Lab, Stony Brook University" + " (http://www.fsl.cs.sunysb.edu/)"); +MODULE_DESCRIPTION("Wrapfs " SDCARDFS_VERSION + " (http://wrapfs.filesystems.org/)"); +MODULE_LICENSE("GPL"); + +module_init(init_sdcardfs_fs); +module_exit(exit_sdcardfs_fs); diff --git a/fs/sdcardfs/mmap.c b/fs/sdcardfs/mmap.c new file mode 100644 index 000000000000..c807d7f18f8b --- /dev/null +++ b/fs/sdcardfs/mmap.c @@ -0,0 +1,82 @@ +/* + * fs/sdcardfs/mmap.c + * + * Copyright (c) 2013 Samsung Electronics Co. Ltd + * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun, + * Sunghwan Yun, Sungjong Seo + * + * This program has been developed as a stackable file system based on + * the WrapFS which written by + * + * Copyright (c) 1998-2011 Erez Zadok + * Copyright (c) 2009 Shrikar Archak + * Copyright (c) 2003-2011 Stony Brook University + * Copyright (c) 2003-2011 The Research Foundation of SUNY + * + * This file is dual licensed. It may be redistributed and/or modified + * under the terms of the Apache 2.0 License OR version 2 of the GNU + * General Public License. + */ + +#include "sdcardfs.h" + +static int sdcardfs_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + int err; + struct file *file, *lower_file; + const struct vm_operations_struct *lower_vm_ops; + struct vm_area_struct lower_vma; + + memcpy(&lower_vma, vma, sizeof(struct vm_area_struct)); + file = lower_vma.vm_file; + lower_vm_ops = SDCARDFS_F(file)->lower_vm_ops; + BUG_ON(!lower_vm_ops); + + lower_file = sdcardfs_lower_file(file); + /* + * XXX: vm_ops->fault may be called in parallel. Because we have to + * resort to temporarily changing the vma->vm_file to point to the + * lower file, a concurrent invocation of sdcardfs_fault could see a + * different value. In this workaround, we keep a different copy of + * the vma structure in our stack, so we never expose a different + * value of the vma->vm_file called to us, even temporarily. A + * better fix would be to change the calling semantics of ->fault to + * take an explicit file pointer. + */ + lower_vma.vm_file = lower_file; + err = lower_vm_ops->fault(&lower_vma, vmf); + return err; +} + +static ssize_t sdcardfs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + /* + * This function returns zero on purpose in order to support direct IO. + * __dentry_open checks a_ops->direct_IO and returns EINVAL if it is null. 
+ * + * However, this function won't be called by certain file operations + * including generic fs functions. * reads and writes are delivered to + * the lower file systems and the direct IOs will be handled by them. + * + * NOTE: exceptionally, on the recent kernels (since Linux 3.8.x), + * swap_writepage invokes this function directly. + */ + printk(KERN_INFO "%s, operation is not supported\n", __func__); + return 0; +} + +/* + * XXX: the default address_space_ops for sdcardfs is empty. We cannot set + * our inode->i_mapping->a_ops to NULL because too many code paths expect + * the a_ops vector to be non-NULL. + */ +const struct address_space_operations sdcardfs_aops = { + /* empty on purpose */ + .direct_IO = sdcardfs_direct_IO, +}; + +const struct vm_operations_struct sdcardfs_vm_ops = { + .fault = sdcardfs_fault, +}; diff --git a/fs/sdcardfs/multiuser.h b/fs/sdcardfs/multiuser.h new file mode 100644 index 000000000000..923ba101dfa9 --- /dev/null +++ b/fs/sdcardfs/multiuser.h @@ -0,0 +1,37 @@ +/* + * fs/sdcardfs/multiuser.h + * + * Copyright (c) 2013 Samsung Electronics Co. Ltd + * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun, + * Sunghwan Yun, Sungjong Seo + * + * This program has been developed as a stackable file system based on + * the WrapFS which written by + * + * Copyright (c) 1998-2011 Erez Zadok + * Copyright (c) 2009 Shrikar Archak + * Copyright (c) 2003-2011 Stony Brook University + * Copyright (c) 2003-2011 The Research Foundation of SUNY + * + * This file is dual licensed. It may be redistributed and/or modified + * under the terms of the Apache 2.0 License OR version 2 of the GNU + * General Public License. + */ + +#define MULTIUSER_APP_PER_USER_RANGE 100000 + +typedef uid_t userid_t; +typedef uid_t appid_t; + +static inline userid_t multiuser_get_user_id(uid_t uid) { + return uid / MULTIUSER_APP_PER_USER_RANGE; +} + +static inline appid_t multiuser_get_app_id(uid_t uid) { + return uid % MULTIUSER_APP_PER_USER_RANGE; +} + +static inline uid_t multiuser_get_uid(userid_t userId, appid_t appId) { + return userId * MULTIUSER_APP_PER_USER_RANGE + (appId % MULTIUSER_APP_PER_USER_RANGE); +} + diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c new file mode 100644 index 000000000000..c786d8f92203 --- /dev/null +++ b/fs/sdcardfs/packagelist.c @@ -0,0 +1,458 @@ +/* + * fs/sdcardfs/packagelist.c + * + * Copyright (c) 2013 Samsung Electronics Co. Ltd + * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun, + * Sunghwan Yun, Sungjong Seo + * + * This program has been developed as a stackable file system based on + * the WrapFS which written by + * + * Copyright (c) 1998-2011 Erez Zadok + * Copyright (c) 2009 Shrikar Archak + * Copyright (c) 2003-2011 Stony Brook University + * Copyright (c) 2003-2011 The Research Foundation of SUNY + * + * This file is dual licensed. It may be redistributed and/or modified + * under the terms of the Apache 2.0 License OR version 2 of the GNU + * General Public License. 
+ */ + +#include "sdcardfs.h" +#include "strtok.h" +#include "hashtable.h" +#include +#include +#include +#include + +#define STRING_BUF_SIZE (512) + +struct hashtable_entry { + struct hlist_node hlist; + void *key; + int value; +}; + +struct packagelist_data { + DECLARE_HASHTABLE(package_to_appid,8); + DECLARE_HASHTABLE(appid_with_rw,7); + struct mutex hashtable_lock; + struct task_struct *thread_id; + gid_t write_gid; + char *strtok_last; + char read_buf[STRING_BUF_SIZE]; + char event_buf[STRING_BUF_SIZE]; + char app_name_buf[STRING_BUF_SIZE]; + char gids_buf[STRING_BUF_SIZE]; +}; + +static struct kmem_cache *hashtable_entry_cachep; + +/* Path to system-provided mapping of package name to appIds */ +static const char* const kpackageslist_file = "/data/system/packages.list"; +/* Supplementary groups to execute with */ +static const gid_t kgroups[1] = { AID_PACKAGE_INFO }; + +static unsigned int str_hash(void *key) { + int i; + unsigned int h = strlen(key); + char *data = (char *)key; + + for (i = 0; i < strlen(key); i++) { + h = h * 31 + *data; + data++; + } + return h; +} + +static int contain_appid_key(struct packagelist_data *pkgl_dat, void *appid) { + struct hashtable_entry *hash_cur; + struct hlist_node *h_n; + + hash_for_each_possible(pkgl_dat->appid_with_rw, hash_cur, hlist, (unsigned int)appid, h_n) + if (appid == hash_cur->key) + return 1; + return 0; +} + +/* Return if the calling UID holds sdcard_rw. */ +int get_caller_has_rw_locked(void *pkgl_id, derive_t derive) { + struct packagelist_data *pkgl_dat = (struct packagelist_data *)pkgl_id; + appid_t appid; + int ret; + + /* No additional permissions enforcement */ + if (derive == DERIVE_NONE) { + return 1; + } + + appid = multiuser_get_app_id(current_fsuid()); + mutex_lock(&pkgl_dat->hashtable_lock); + ret = contain_appid_key(pkgl_dat, (void *)appid); + mutex_unlock(&pkgl_dat->hashtable_lock); + return ret; +} + +appid_t get_appid(void *pkgl_id, const char *app_name) +{ + struct packagelist_data *pkgl_dat = (struct packagelist_data *)pkgl_id; + struct hashtable_entry *hash_cur; + struct hlist_node *h_n; + unsigned int hash = str_hash((void *)app_name); + appid_t ret_id; + + //printk(KERN_INFO "sdcardfs: %s: %s, %u\n", __func__, (char *)app_name, hash); + mutex_lock(&pkgl_dat->hashtable_lock); + hash_for_each_possible(pkgl_dat->package_to_appid, hash_cur, hlist, hash, h_n) { + //printk(KERN_INFO "sdcardfs: %s: %s\n", __func__, (char *)hash_cur->key); + if (!strcasecmp(app_name, hash_cur->key)) { + ret_id = (appid_t)hash_cur->value; + mutex_unlock(&pkgl_dat->hashtable_lock); + //printk(KERN_INFO "=> app_id: %d\n", (int)ret_id); + return ret_id; + } + } + mutex_unlock(&pkgl_dat->hashtable_lock); + //printk(KERN_INFO "=> app_id: %d\n", 0); + return 0; +} + +/* Kernel has already enforced everything we returned through + * derive_permissions_locked(), so this is used to lock down access + * even further, such as enforcing that apps hold sdcard_rw. 
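+ * For example: creating "autorun.inf" at the root is always refused; and,
+ * when derive != DERIVE_NONE, a write request (w_ok) from a non-root caller
+ * that neither owns the parent (d_uid) nor holds sdcard_rw (has_rw == 0) is
+ * refused, which the callers translate into -EACCES.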
*/ +int check_caller_access_to_name(struct inode *parent_node, const char* name, + derive_t derive, int w_ok, int has_rw) { + + /* Always block security-sensitive files at root */ + if (parent_node && SDCARDFS_I(parent_node)->perm == PERM_ROOT) { + if (!strcasecmp(name, "autorun.inf") + || !strcasecmp(name, ".android_secure") + || !strcasecmp(name, "android_secure")) { + return 0; + } + } + + /* No additional permissions enforcement */ + if (derive == DERIVE_NONE) { + return 1; + } + + /* Root always has access; access for any other UIDs should always + * be controlled through packages.list. */ + if (current_fsuid() == 0) { + return 1; + } + + /* If asking to write, verify that caller either owns the + * parent or holds sdcard_rw. */ + if (w_ok) { + if (parent_node && + (current_fsuid() == SDCARDFS_I(parent_node)->d_uid)) { + return 1; + } + return has_rw; + } + + /* No extra permissions to enforce */ + return 1; +} + +/* This function is used when file opening. The open flags must be + * checked before calling check_caller_access_to_name() */ +int open_flags_to_access_mode(int open_flags) { + if((open_flags & O_ACCMODE) == O_RDONLY) { + return 0; /* R_OK */ + } else if ((open_flags & O_ACCMODE) == O_WRONLY) { + return 1; /* W_OK */ + } else { + /* Probably O_RDRW, but treat as default to be safe */ + return 1; /* R_OK | W_OK */ + } +} + +static int insert_str_to_int(struct packagelist_data *pkgl_dat, void *key, int value) { + struct hashtable_entry *hash_cur; + struct hashtable_entry *new_entry; + struct hlist_node *h_n; + unsigned int hash = str_hash(key); + + //printk(KERN_INFO "sdcardfs: %s: %s: %d, %u\n", __func__, (char *)key, value, hash); + hash_for_each_possible(pkgl_dat->package_to_appid, hash_cur, hlist, hash, h_n) { + if (!strcasecmp(key, hash_cur->key)) { + hash_cur->value = value; + return 0; + } + } + new_entry = kmem_cache_alloc(hashtable_entry_cachep, GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + new_entry->key = kstrdup(key, GFP_KERNEL); + new_entry->value = value; + hash_add(pkgl_dat->package_to_appid, &new_entry->hlist, hash); + return 0; +} + +static void remove_str_to_int(struct hashtable_entry *h_entry) { + //printk(KERN_INFO "sdcardfs: %s: %s: %d\n", __func__, (char *)h_entry->key, h_entry->value); + kfree(h_entry->key); + kmem_cache_free(hashtable_entry_cachep, h_entry); +} + +static int insert_int_to_null(struct packagelist_data *pkgl_dat, void *key, int value) { + struct hashtable_entry *hash_cur; + struct hashtable_entry *new_entry; + struct hlist_node *h_n; + + //printk(KERN_INFO "sdcardfs: %s: %d: %d\n", __func__, (int)key, value); + hash_for_each_possible(pkgl_dat->appid_with_rw, hash_cur, hlist, + (unsigned int)key, h_n) { + if (key == hash_cur->key) { + hash_cur->value = value; + return 0; + } + } + new_entry = kmem_cache_alloc(hashtable_entry_cachep, GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + new_entry->key = key; + new_entry->value = value; + hash_add(pkgl_dat->appid_with_rw, &new_entry->hlist, + (unsigned int)new_entry->key); + return 0; +} + +static void remove_int_to_null(struct hashtable_entry *h_entry) { + //printk(KERN_INFO "sdcardfs: %s: %d: %d\n", __func__, (int)h_entry->key, h_entry->value); + kmem_cache_free(hashtable_entry_cachep, h_entry); +} + +static void remove_all_hashentrys(struct packagelist_data *pkgl_dat) +{ + struct hashtable_entry *hash_cur; + struct hlist_node *h_n; + struct hlist_node *h_t; + int i; + + hash_for_each_safe(pkgl_dat->package_to_appid, i, h_t, hash_cur, hlist, h_n) + remove_str_to_int(hash_cur); + 
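+	/* appid_with_rw keys are plain appid values, so only the entry is freed */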
hash_for_each_safe(pkgl_dat->appid_with_rw, i, h_t, hash_cur, hlist, h_n) + remove_int_to_null(hash_cur); + + hash_init(pkgl_dat->package_to_appid); + hash_init(pkgl_dat->appid_with_rw); +} + +static int read_package_list(struct packagelist_data *pkgl_dat) { + int ret; + int fd; + int read_amount; + + printk(KERN_INFO "sdcardfs: read_package_list\n"); + + mutex_lock(&pkgl_dat->hashtable_lock); + + remove_all_hashentrys(pkgl_dat); + + fd = sys_open(kpackageslist_file, O_RDONLY, 0); + if (fd < 0) { + printk(KERN_ERR "sdcardfs: failed to open package list\n"); + mutex_unlock(&pkgl_dat->hashtable_lock); + return fd; + } + + while ((read_amount = sys_read(fd, pkgl_dat->read_buf, + sizeof(pkgl_dat->read_buf))) > 0) { + int appid; + char *token; + int one_line_len = 0; + int additional_read; + unsigned long ret_gid; + + while (one_line_len < read_amount) { + if (pkgl_dat->read_buf[one_line_len] == '\n') { + one_line_len++; + break; + } + one_line_len++; + } + additional_read = read_amount - one_line_len; + if (additional_read > 0) + sys_lseek(fd, -additional_read, SEEK_CUR); + + if (sscanf(pkgl_dat->read_buf, "%s %d %*d %*s %*s %s", + pkgl_dat->app_name_buf, &appid, + pkgl_dat->gids_buf) == 3) { + ret = insert_str_to_int(pkgl_dat, pkgl_dat->app_name_buf, appid); + if (ret) { + sys_close(fd); + mutex_unlock(&pkgl_dat->hashtable_lock); + return ret; + } + + token = strtok_r(pkgl_dat->gids_buf, ",", &pkgl_dat->strtok_last); + while (token != NULL) { + if (!kstrtoul(token, 10, &ret_gid) && + (ret_gid == pkgl_dat->write_gid)) { + ret = insert_int_to_null(pkgl_dat, (void *)appid, 1); + if (ret) { + sys_close(fd); + mutex_unlock(&pkgl_dat->hashtable_lock); + return ret; + } + break; + } + token = strtok_r(NULL, ",", &pkgl_dat->strtok_last); + } + } + } + + sys_close(fd); + mutex_unlock(&pkgl_dat->hashtable_lock); + return 0; +} + +static int packagelist_reader(void *thread_data) +{ + struct packagelist_data *pkgl_dat = (struct packagelist_data *)thread_data; + struct inotify_event *event; + bool active = false; + int event_pos; + int event_size; + int res = 0; + int nfd; + + allow_signal(SIGINT); + + nfd = sys_inotify_init(); + if (nfd < 0) { + printk(KERN_ERR "sdcardfs: inotify_init failed: %d\n", nfd); + return nfd; + } + + while (!kthread_should_stop()) { + if (signal_pending(current)) { + ssleep(1); + continue; + } + + if (!active) { + res = sys_inotify_add_watch(nfd, kpackageslist_file, IN_DELETE_SELF); + if (res < 0) { + if (res == -ENOENT || res == -EACCES) { + /* Framework may not have created yet, sleep and retry */ + printk(KERN_ERR "sdcardfs: missing packages.list; retrying\n"); + ssleep(2); + printk(KERN_ERR "sdcardfs: missing packages.list_end; retrying\n"); + continue; + } else { + printk(KERN_ERR "sdcardfs: inotify_add_watch failed: %d\n", res); + goto interruptable_sleep; + } + } + /* Watch above will tell us about any future changes, so + * read the current state. 
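+			 * A packages.list line is expected to look roughly like
+			 *   "com.example.app 10023 0 /data/data/com.example.app default 1015,3003"
+			 * (example values; read_package_list() keeps only the package
+			 * name, the appid and the trailing gid list).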
*/ + res = read_package_list(pkgl_dat); + if (res) { + printk(KERN_ERR "sdcardfs: read_package_list failed: %d\n", res); + goto interruptable_sleep; + } + active = true; + } + + event_pos = 0; + res = sys_read(nfd, pkgl_dat->event_buf, sizeof(pkgl_dat->event_buf)); + if (res < (int) sizeof(*event)) { + if (res == -EINTR) + continue; + printk(KERN_ERR "sdcardfs: failed to read inotify event: %d\n", res); + goto interruptable_sleep; + } + + while (res >= (int) sizeof(*event)) { + event = (struct inotify_event *) (pkgl_dat->event_buf + event_pos); + + printk(KERN_INFO "sdcardfs: inotify event: %08x\n", event->mask); + if ((event->mask & IN_IGNORED) == IN_IGNORED) { + /* Previously watched file was deleted, probably due to move + * that swapped in new data; re-arm the watch and read. */ + active = false; + } + + event_size = sizeof(*event) + event->len; + res -= event_size; + event_pos += event_size; + } + continue; + +interruptable_sleep: + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } + flush_signals(current); + sys_close(nfd); + return res; +} + +void * packagelist_create(gid_t write_gid) +{ + struct packagelist_data *pkgl_dat; + struct task_struct *packagelist_thread; + + pkgl_dat = kmalloc(sizeof(*pkgl_dat), GFP_KERNEL | __GFP_ZERO); + if (!pkgl_dat) { + printk(KERN_ERR "sdcardfs: creating kthread failed\n"); + return ERR_PTR(-ENOMEM); + } + + mutex_init(&pkgl_dat->hashtable_lock); + hash_init(pkgl_dat->package_to_appid); + hash_init(pkgl_dat->appid_with_rw); + pkgl_dat->write_gid = write_gid; + + packagelist_thread = kthread_run(packagelist_reader, (void *)pkgl_dat, "pkgld"); + if (IS_ERR(packagelist_thread)) { + printk(KERN_ERR "sdcardfs: creating kthread failed\n"); + kfree(pkgl_dat); + return packagelist_thread; + } + pkgl_dat->thread_id = packagelist_thread; + + printk(KERN_INFO "sdcardfs: created packagelist pkgld/%d\n", + (int)pkgl_dat->thread_id->pid); + + return (void *)pkgl_dat; +} + +void packagelist_destroy(void *pkgl_id) +{ + struct packagelist_data *pkgl_dat = (struct packagelist_data *)pkgl_id; + pid_t pkgl_pid = pkgl_dat->thread_id->pid; + + force_sig_info(SIGINT, SEND_SIG_PRIV, pkgl_dat->thread_id); + kthread_stop(pkgl_dat->thread_id); + remove_all_hashentrys(pkgl_dat); + printk(KERN_INFO "sdcardfs: destroyed packagelist pkgld/%d\n", (int)pkgl_pid); + kfree(pkgl_dat); +} + +int packagelist_init(void) +{ + hashtable_entry_cachep = + kmem_cache_create("packagelist_hashtable_entry", + sizeof(struct hashtable_entry), 0, 0, NULL); + if (!hashtable_entry_cachep) { + printk(KERN_ERR "sdcardfs: failed creating pkgl_hashtable entry slab cache\n"); + return -ENOMEM; + } + + return 0; +} + +void packagelist_exit(void) +{ + if (hashtable_entry_cachep) + kmem_cache_destroy(hashtable_entry_cachep); +} + + diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h new file mode 100644 index 000000000000..90f8b24e4a52 --- /dev/null +++ b/fs/sdcardfs/sdcardfs.h @@ -0,0 +1,493 @@ +/* + * fs/sdcardfs/sdcardfs.h + * + * The sdcardfs v2.0 + * This file system replaces the sdcard daemon on Android + * On version 2.0, some of the daemon functions have been ported + * to support the multi-user concepts of Android 4.4 + * + * Copyright (c) 2013 Samsung Electronics Co. 
Ltd + * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun, + * Sunghwan Yun, Sungjong Seo + * + * This program has been developed as a stackable file system based on + * the WrapFS which written by + * + * Copyright (c) 1998-2011 Erez Zadok + * Copyright (c) 2009 Shrikar Archak + * Copyright (c) 2003-2011 Stony Brook University + * Copyright (c) 2003-2011 The Research Foundation of SUNY + * + * This file is dual licensed. It may be redistributed and/or modified + * under the terms of the Apache 2.0 License OR version 2 of the GNU + * General Public License. + */ + +#ifndef _SDCARDFS_H_ +#define _SDCARDFS_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "multiuser.h" + +/* the file system name */ +#define SDCARDFS_NAME "sdcardfs" + +/* sdcardfs root inode number */ +#define SDCARDFS_ROOT_INO 1 + +/* useful for tracking code reachability */ +#define UDBG printk(KERN_DEFAULT "DBG:%s:%s:%d\n", __FILE__, __func__, __LINE__) + +#define SDCARDFS_DIRENT_SIZE 256 + +/* temporary static uid settings for development */ +#define AID_ROOT 0 /* uid for accessing /mnt/sdcard & extSdcard */ +#define AID_MEDIA_RW 1023 /* internal media storage write access */ + +#define AID_SDCARD_RW 1015 /* external storage write access */ +#define AID_SDCARD_R 1028 /* external storage read access */ +#define AID_SDCARD_PICS 1033 /* external storage photos access */ +#define AID_SDCARD_AV 1034 /* external storage audio/video access */ +#define AID_SDCARD_ALL 1035 /* access all users external storage */ + +#define AID_PACKAGE_INFO 1027 + +#define fix_derived_permission(x) \ + do { \ + (x)->i_uid = SDCARDFS_I(x)->d_uid; \ + (x)->i_gid = SDCARDFS_I(x)->d_gid; \ + (x)->i_mode = ((x)->i_mode & S_IFMT) | SDCARDFS_I(x)->d_mode;\ + } while (0) + +/* OVERRIDE_CRED() and REVERT_CRED() + * OVERRID_CRED() + * backup original task->cred + * and modifies task->cred->fsuid/fsgid to specified value. + * REVERT_CRED() + * restore original task->cred->fsuid/fsgid. + * These two macro should be used in pair, and OVERRIDE_CRED() should be + * placed at the beginning of a function, right after variable declaration. + */ +#define OVERRIDE_CRED(sdcardfs_sbi, saved_cred) \ + saved_cred = override_fsids(sdcardfs_sbi); \ + if (!saved_cred) { return -ENOMEM; } + +#define OVERRIDE_CRED_PTR(sdcardfs_sbi, saved_cred) \ + saved_cred = override_fsids(sdcardfs_sbi); \ + if (!saved_cred) { return ERR_PTR(-ENOMEM); } + +#define REVERT_CRED(saved_cred) revert_fsids(saved_cred) + +#define DEBUG_CRED() \ + printk("KAKJAGI: %s:%d fsuid %d fsgid %d\n", \ + __FUNCTION__, __LINE__, \ + (int)current->cred->fsuid, \ + (int)current->cred->fsgid); + +/* Android 4.4 support */ + +/* Permission mode for a specific node. Controls how file permissions + * are derived for children nodes. */ +typedef enum { + /* Nothing special; this node should just inherit from its parent. */ + PERM_INHERIT, + /* This node is one level above a normal root; used for legacy layouts + * which use the first level to represent user_id. 
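+	 * (e.g. the leading "0" or "10" path component of a derive=legacy
+	 * mount, one per Android user)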
*/ + PERM_LEGACY_PRE_ROOT, + /* This node is "/" */ + PERM_ROOT, + /* This node is "/Android" */ + PERM_ANDROID, + /* This node is "/Android/data" */ + PERM_ANDROID_DATA, + /* This node is "/Android/obb" */ + PERM_ANDROID_OBB, + /* This node is "/Android/user" */ + PERM_ANDROID_USER, +} perm_t; + +/* Permissions structure to derive */ +typedef enum { + DERIVE_NONE, + DERIVE_LEGACY, + DERIVE_UNIFIED, +} derive_t; + +typedef enum { + LOWER_FS_EXT4, + LOWER_FS_FAT, +} lower_fs_t; + +struct sdcardfs_sb_info; +struct sdcardfs_mount_options; + +/* Do not directly use this function. Use OVERRIDE_CRED() instead. */ +const struct cred * override_fsids(struct sdcardfs_sb_info* sbi); +/* Do not directly use this function, use REVERT_CRED() instead. */ +void revert_fsids(const struct cred * old_cred); + +/* operations vectors defined in specific files */ +extern const struct file_operations sdcardfs_main_fops; +extern const struct file_operations sdcardfs_dir_fops; +extern const struct inode_operations sdcardfs_main_iops; +extern const struct inode_operations sdcardfs_dir_iops; +extern const struct inode_operations sdcardfs_symlink_iops; +extern const struct super_operations sdcardfs_sops; +extern const struct dentry_operations sdcardfs_ci_dops; +extern const struct address_space_operations sdcardfs_aops, sdcardfs_dummy_aops; +extern const struct vm_operations_struct sdcardfs_vm_ops; + +extern int sdcardfs_init_inode_cache(void); +extern void sdcardfs_destroy_inode_cache(void); +extern int sdcardfs_init_dentry_cache(void); +extern void sdcardfs_destroy_dentry_cache(void); +extern int new_dentry_private_data(struct dentry *dentry); +extern void free_dentry_private_data(struct dentry *dentry); +extern struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd); +extern int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb, + struct path *lower_path); + +/* file private data */ +struct sdcardfs_file_info { + struct file *lower_file; + const struct vm_operations_struct *lower_vm_ops; +}; + +/* sdcardfs inode data in memory */ +struct sdcardfs_inode_info { + struct inode *lower_inode; + /* state derived based on current position in hierachy + * caution: d_mode does not include file types + */ + perm_t perm; + userid_t userid; + uid_t d_uid; + gid_t d_gid; + mode_t d_mode; + + struct inode vfs_inode; +}; + +/* sdcardfs dentry data in memory */ +struct sdcardfs_dentry_info { + spinlock_t lock; /* protects lower_path */ + struct path lower_path; + struct path orig_path; +}; + +struct sdcardfs_mount_options { + uid_t fs_low_uid; + gid_t fs_low_gid; + gid_t write_gid; + int split_perms; + derive_t derive; + lower_fs_t lower_fs; + unsigned int reserved_mb; +}; + +/* sdcardfs super-block data in memory */ +struct sdcardfs_sb_info { + struct super_block *lower_sb; + /* derived perm policy : some of options have been added + * to sdcardfs_mount_options (Android 4.4 support) */ + struct sdcardfs_mount_options options; + spinlock_t lock; /* protects obbpath */ + char *obbpath_s; + struct path obbpath; + void *pkgl_id; +}; + +/* + * inode to private data + * + * Since we use containers and the struct inode is _inside_ the + * sdcardfs_inode_info structure, SDCARDFS_I will always (given a non-NULL + * inode pointer), return a valid non-NULL pointer. 
+ */ +static inline struct sdcardfs_inode_info *SDCARDFS_I(const struct inode *inode) +{ + return container_of(inode, struct sdcardfs_inode_info, vfs_inode); +} + +/* dentry to private data */ +#define SDCARDFS_D(dent) ((struct sdcardfs_dentry_info *)(dent)->d_fsdata) + +/* superblock to private data */ +#define SDCARDFS_SB(super) ((struct sdcardfs_sb_info *)(super)->s_fs_info) + +/* file to private Data */ +#define SDCARDFS_F(file) ((struct sdcardfs_file_info *)((file)->private_data)) + +/* file to lower file */ +static inline struct file *sdcardfs_lower_file(const struct file *f) +{ + return SDCARDFS_F(f)->lower_file; +} + +static inline void sdcardfs_set_lower_file(struct file *f, struct file *val) +{ + SDCARDFS_F(f)->lower_file = val; +} + +/* inode to lower inode. */ +static inline struct inode *sdcardfs_lower_inode(const struct inode *i) +{ + return SDCARDFS_I(i)->lower_inode; +} + +static inline void sdcardfs_set_lower_inode(struct inode *i, struct inode *val) +{ + SDCARDFS_I(i)->lower_inode = val; +} + +/* superblock to lower superblock */ +static inline struct super_block *sdcardfs_lower_super( + const struct super_block *sb) +{ + return SDCARDFS_SB(sb)->lower_sb; +} + +static inline void sdcardfs_set_lower_super(struct super_block *sb, + struct super_block *val) +{ + SDCARDFS_SB(sb)->lower_sb = val; +} + +/* path based (dentry/mnt) macros */ +static inline void pathcpy(struct path *dst, const struct path *src) +{ + dst->dentry = src->dentry; + dst->mnt = src->mnt; +} + +/* sdcardfs_get_pname functions calls path_get() + * therefore, the caller must call "proper" path_put functions + */ +#define SDCARDFS_DENT_FUNC(pname) \ +static inline void sdcardfs_get_##pname(const struct dentry *dent, \ + struct path *pname) \ +{ \ + spin_lock(&SDCARDFS_D(dent)->lock); \ + pathcpy(pname, &SDCARDFS_D(dent)->pname); \ + path_get(pname); \ + spin_unlock(&SDCARDFS_D(dent)->lock); \ + return; \ +} \ +static inline void sdcardfs_put_##pname(const struct dentry *dent, \ + struct path *pname) \ +{ \ + path_put(pname); \ + return; \ +} \ +static inline void sdcardfs_set_##pname(const struct dentry *dent, \ + struct path *pname) \ +{ \ + spin_lock(&SDCARDFS_D(dent)->lock); \ + pathcpy(&SDCARDFS_D(dent)->pname, pname); \ + spin_unlock(&SDCARDFS_D(dent)->lock); \ + return; \ +} \ +static inline void sdcardfs_reset_##pname(const struct dentry *dent) \ +{ \ + spin_lock(&SDCARDFS_D(dent)->lock); \ + SDCARDFS_D(dent)->pname.dentry = NULL; \ + SDCARDFS_D(dent)->pname.mnt = NULL; \ + spin_unlock(&SDCARDFS_D(dent)->lock); \ + return; \ +} \ +static inline void sdcardfs_put_reset_##pname(const struct dentry *dent) \ +{ \ + struct path pname; \ + spin_lock(&SDCARDFS_D(dent)->lock); \ + if(SDCARDFS_D(dent)->pname.dentry) { \ + pathcpy(&pname, &SDCARDFS_D(dent)->pname); \ + SDCARDFS_D(dent)->pname.dentry = NULL; \ + SDCARDFS_D(dent)->pname.mnt = NULL; \ + spin_unlock(&SDCARDFS_D(dent)->lock); \ + path_put(&pname); \ + } else \ + spin_unlock(&SDCARDFS_D(dent)->lock); \ + return; \ +} + +SDCARDFS_DENT_FUNC(lower_path) +SDCARDFS_DENT_FUNC(orig_path) + +static inline int has_graft_path(const struct dentry *dent) +{ + int ret = 0; + + spin_lock(&SDCARDFS_D(dent)->lock); + if (SDCARDFS_D(dent)->orig_path.dentry != NULL) + ret = 1; + spin_unlock(&SDCARDFS_D(dent)->lock); + + return ret; +} + +static inline void sdcardfs_get_real_lower(const struct dentry *dent, + struct path *real_lower) +{ + /* in case of a local obb dentry + * the orig_path should be returned + */ + if(has_graft_path(dent)) + 
sdcardfs_get_orig_path(dent, real_lower); + else + sdcardfs_get_lower_path(dent, real_lower); +} + +static inline void sdcardfs_put_real_lower(const struct dentry *dent, + struct path *real_lower) +{ + if(has_graft_path(dent)) + sdcardfs_put_orig_path(dent, real_lower); + else + sdcardfs_put_lower_path(dent, real_lower); +} + +/* for packagelist.c */ +extern int get_caller_has_rw_locked(void *pkgl_id, derive_t derive); +extern appid_t get_appid(void *pkgl_id, const char *app_name); +extern int check_caller_access_to_name(struct inode *parent_node, const char* name, + derive_t derive, int w_ok, int has_rw); +extern int open_flags_to_access_mode(int open_flags); +extern void * packagelist_create(gid_t write_gid); +extern void packagelist_destroy(void *pkgl_id); +extern int packagelist_init(void); +extern void packagelist_exit(void); + +/* for derived_perm.c */ +extern void setup_derived_state(struct inode *inode, perm_t perm, + userid_t userid, uid_t uid, gid_t gid, mode_t mode); +extern void get_derived_permission(struct dentry *parent, struct dentry *dentry); +extern void update_derived_permission(struct dentry *dentry); +extern int need_graft_path(struct dentry *dentry); +extern int is_base_obbpath(struct dentry *dentry); +extern int is_obbpath_invalid(struct dentry *dentry); +extern int setup_obb_dentry(struct dentry *dentry, struct path *lower_path); + +/* locking helpers */ +static inline struct dentry *lock_parent(struct dentry *dentry) +{ + struct dentry *dir = dget_parent(dentry); + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + return dir; +} + +static inline void unlock_dir(struct dentry *dir) +{ + mutex_unlock(&dir->d_inode->i_mutex); + dput(dir); +} + +static inline int prepare_dir(const char *path_s, uid_t uid, gid_t gid, mode_t mode) +{ + int err; + struct dentry *dent; + struct iattr attrs; + struct nameidata nd; + + err = kern_path_parent(path_s, &nd); + if (err) { + if (err == -EEXIST) + err = 0; + goto out; + } + + dent = lookup_create(&nd, 1); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + if (err == -EEXIST) + err = 0; + goto out_unlock; + } + + err = vfs_mkdir(nd.path.dentry->d_inode, dent, mode); + if (err) { + if (err == -EEXIST) + err = 0; + goto out_dput; + } + + attrs.ia_uid = uid; + attrs.ia_gid = gid; + attrs.ia_valid = ATTR_UID | ATTR_GID; + mutex_lock(&dent->d_inode->i_mutex); + notify_change(dent, &attrs); + mutex_unlock(&dent->d_inode->i_mutex); + +out_dput: + dput(dent); + +out_unlock: + /* parent dentry locked by lookup_create */ + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + path_put(&nd.path); + +out: + return err; +} + +/* + * Return 1, if a disk has enough free space, otherwise 0. + * We assume that any files can not be overwritten. + */ +static inline int check_min_free_space(struct dentry *dentry, size_t size, int dir) +{ + int err; + struct path lower_path; + struct kstatfs statfs; + u64 avail; + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); + + if (sbi->options.reserved_mb) { + /* Get fs stat of lower filesystem. */ + sdcardfs_get_lower_path(dentry, &lower_path); + err = vfs_statfs(&lower_path, &statfs); + sdcardfs_put_lower_path(dentry, &lower_path); + + if (unlikely(err)) + return 0; + + /* Invalid statfs informations. */ + if (unlikely(statfs.f_bsize == 0)) + return 0; + + /* if you are checking directory, set size to f_bsize. 
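+		 * Worked example (illustrative numbers): with reserved_mb=50,
+		 * f_bsize=4096 and f_bavail=100000, avail is 409,600,000 bytes,
+		 * so a 1 MiB request succeeds only because avail - size still
+		 * exceeds 50 * 1024 * 1024 = 52,428,800 bytes.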
*/ + if (unlikely(dir)) + size = statfs.f_bsize; + + /* available size */ + avail = statfs.f_bavail * statfs.f_bsize; + + /* not enough space */ + if ((u64)size > avail) + return 0; + + /* enough space */ + if ((avail - size) > (sbi->options.reserved_mb * 1024 * 1024)) + return 1; + + return 0; + } else + return 1; +} + +#endif /* not _SDCARDFS_H_ */ diff --git a/fs/sdcardfs/strtok.h b/fs/sdcardfs/strtok.h new file mode 100644 index 000000000000..50ab25aa0bc4 --- /dev/null +++ b/fs/sdcardfs/strtok.h @@ -0,0 +1,75 @@ +/* + * fs/sdcardfs/strtok.h + * + * Copyright (c) 2013 Samsung Electronics Co. Ltd + * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun, + * Sunghwan Yun, Sungjong Seo + * + * This program has been developed as a stackable file system based on + * the WrapFS which written by + * + * Copyright (c) 1998-2011 Erez Zadok + * Copyright (c) 2009 Shrikar Archak + * Copyright (c) 2003-2011 Stony Brook University + * Copyright (c) 2003-2011 The Research Foundation of SUNY + * + * This file is dual licensed. It may be redistributed and/or modified + * under the terms of the Apache 2.0 License OR version 2 of the GNU + * General Public License. + */ + +static char * +strtok_r(char *s, const char *delim, char **last) +{ + char *spanp; + int c, sc; + char *tok; + + + /* if (s == NULL && (s = *last) == NULL) + return NULL; */ + if (s == NULL) { + s = *last; + if (s == NULL) + return NULL; + } + + /* + * Skip (span) leading delimiters (s += strspn(s, delim), sort of). + */ +cont: + c = *s++; + for (spanp = (char *)delim; (sc = *spanp++) != 0;) { + if (c == sc) + goto cont; + } + + if (c == 0) { /* no non-delimiter characters */ + *last = NULL; + return NULL; + } + tok = s - 1; + + /* + * Scan token (scan for delimiters: s += strcspn(s, delim), sort of). + * Note that delim must have one NUL; we stop if we see that, too. + */ + for (;;) { + c = *s++; + spanp = (char *)delim; + do { + sc = *spanp++; + if (sc == c) { + if (c == 0) + s = NULL; + else + s[-1] = 0; + *last = s; + return tok; + } + } while (sc != 0); + } + + /* NOTREACHED */ +} + diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c new file mode 100644 index 000000000000..1d206c82dfdf --- /dev/null +++ b/fs/sdcardfs/super.c @@ -0,0 +1,229 @@ +/* + * fs/sdcardfs/super.c + * + * Copyright (c) 2013 Samsung Electronics Co. Ltd + * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun, + * Sunghwan Yun, Sungjong Seo + * + * This program has been developed as a stackable file system based on + * the WrapFS which written by + * + * Copyright (c) 1998-2011 Erez Zadok + * Copyright (c) 2009 Shrikar Archak + * Copyright (c) 2003-2011 Stony Brook University + * Copyright (c) 2003-2011 The Research Foundation of SUNY + * + * This file is dual licensed. It may be redistributed and/or modified + * under the terms of the Apache 2.0 License OR version 2 of the GNU + * General Public License. + */ + +#include "sdcardfs.h" + +/* + * The inode cache is used with alloc_inode for both our inode info and the + * vfs inode. 
+ */ +static struct kmem_cache *sdcardfs_inode_cachep; + +/* final actions when unmounting a file system */ +static void sdcardfs_put_super(struct super_block *sb) +{ + struct sdcardfs_sb_info *spd; + struct super_block *s; + + spd = SDCARDFS_SB(sb); + if (!spd) + return; + + if(spd->obbpath_s) { + kfree(spd->obbpath_s); + path_put(&spd->obbpath); + } + + /* decrement lower super references */ + s = sdcardfs_lower_super(sb); + sdcardfs_set_lower_super(sb, NULL); + atomic_dec(&s->s_active); + + if(spd->pkgl_id) + packagelist_destroy(spd->pkgl_id); + + kfree(spd); + sb->s_fs_info = NULL; +} + +static int sdcardfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + int err; + struct path lower_path; + u32 min_blocks; + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); + + sdcardfs_get_lower_path(dentry, &lower_path); + err = vfs_statfs(&lower_path, buf); + sdcardfs_put_lower_path(dentry, &lower_path); + + if (sbi->options.reserved_mb) { + /* Invalid statfs informations. */ + if (buf->f_bsize == 0) { + printk(KERN_ERR "Returned block size is zero.\n"); + return -EINVAL; + } + + min_blocks = ((sbi->options.reserved_mb * 1024 * 1024)/buf->f_bsize); + buf->f_blocks -= min_blocks; + + if (buf->f_bavail > min_blocks) + buf->f_bavail -= min_blocks; + else + buf->f_bavail = 0; + + /* Make reserved blocks invisiable to media storage */ + buf->f_bfree = buf->f_bavail; + } + + /* set return buf to our f/s to avoid confusing user-level utils */ + buf->f_type = SDCARDFS_SUPER_MAGIC; + + return err; +} + +/* + * @flags: numeric mount options + * @options: mount options string + */ +static int sdcardfs_remount_fs(struct super_block *sb, int *flags, char *options) +{ + int err = 0; + + /* + * The VFS will take care of "ro" and "rw" flags among others. We + * can safely accept a few flags (RDONLY, MANDLOCK), and honor + * SILENT, but anything else left over is an error. + */ + if ((*flags & ~(MS_RDONLY | MS_MANDLOCK | MS_SILENT)) != 0) { + printk(KERN_ERR + "sdcardfs: remount flags 0x%x unsupported\n", *flags); + err = -EINVAL; + } + + return err; +} + +/* + * Called by iput() when the inode reference count reached zero + * and the inode is not hashed anywhere. Used to clear anything + * that needs to be, before the inode is completely destroyed and put + * on the inode free list. + */ +static void sdcardfs_evict_inode(struct inode *inode) +{ + struct inode *lower_inode; + + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + /* + * Decrement a reference to a lower_inode, which was incremented + * by our read_inode when it was created initially. 
+ */ + lower_inode = sdcardfs_lower_inode(inode); + sdcardfs_set_lower_inode(inode, NULL); + iput(lower_inode); +} + +static struct inode *sdcardfs_alloc_inode(struct super_block *sb) +{ + struct sdcardfs_inode_info *i; + + i = kmem_cache_alloc(sdcardfs_inode_cachep, GFP_KERNEL); + if (!i) + return NULL; + + /* memset everything up to the inode to 0 */ + memset(i, 0, offsetof(struct sdcardfs_inode_info, vfs_inode)); + + i->vfs_inode.i_version = 1; + return &i->vfs_inode; +} + +static void sdcardfs_destroy_inode(struct inode *inode) +{ + kmem_cache_free(sdcardfs_inode_cachep, SDCARDFS_I(inode)); +} + +/* sdcardfs inode cache constructor */ +static void init_once(void *obj) +{ + struct sdcardfs_inode_info *i = obj; + + inode_init_once(&i->vfs_inode); +} + +int sdcardfs_init_inode_cache(void) +{ + int err = 0; + + sdcardfs_inode_cachep = + kmem_cache_create("sdcardfs_inode_cache", + sizeof(struct sdcardfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT, init_once); + if (!sdcardfs_inode_cachep) + err = -ENOMEM; + return err; +} + +/* sdcardfs inode cache destructor */ +void sdcardfs_destroy_inode_cache(void) +{ + if (sdcardfs_inode_cachep) + kmem_cache_destroy(sdcardfs_inode_cachep); +} + +/* + * Used only in nfs, to kill any pending RPC tasks, so that subsequent + * code can actually succeed and won't leave tasks that need handling. + */ +static void sdcardfs_umount_begin(struct super_block *sb) +{ + struct super_block *lower_sb; + + lower_sb = sdcardfs_lower_super(sb); + if (lower_sb && lower_sb->s_op && lower_sb->s_op->umount_begin) + lower_sb->s_op->umount_begin(lower_sb); +} + +static int sdcardfs_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(mnt->mnt_sb); + struct sdcardfs_mount_options *opts = &sbi->options; + + if (opts->fs_low_uid != 0) + seq_printf(m, ",uid=%u", opts->fs_low_uid); + if (opts->fs_low_gid != 0) + seq_printf(m, ",gid=%u", opts->fs_low_gid); + + if (opts->derive == DERIVE_NONE) + seq_printf(m, ",derive=none"); + else if (opts->derive == DERIVE_LEGACY) + seq_printf(m, ",derive=legacy"); + else if (opts->derive == DERIVE_UNIFIED) + seq_printf(m, ",derive=unified"); + + if (opts->reserved_mb != 0) + seq_printf(m, ",reserved=%uMB", opts->reserved_mb); + + return 0; +}; + +const struct super_operations sdcardfs_sops = { + .put_super = sdcardfs_put_super, + .statfs = sdcardfs_statfs, + .remount_fs = sdcardfs_remount_fs, + .evict_inode = sdcardfs_evict_inode, + .umount_begin = sdcardfs_umount_begin, + .show_options = sdcardfs_show_options, + .alloc_inode = sdcardfs_alloc_inode, + .destroy_inode = sdcardfs_destroy_inode, + .drop_inode = generic_delete_inode, +}; diff --git a/include/linux/namei.h b/include/linux/namei.h index d8c6334cd150..ef3b4f74eaf0 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -43,6 +43,9 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_JUMPED 0x1000 #define LOOKUP_ROOT 0x2000 #define LOOKUP_EMPTY 0x4000 +#ifdef CONFIG_SDCARD_FS_CI_SEARCH +#define LOOKUP_CASE_INSENSITIVE 0x8000 +#endif extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty); diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index accb036bbc9c..cfb5c406f344 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -52,6 +52,8 @@ #define REISER2FS_SUPER_MAGIC_STRING "ReIsEr2Fs" #define REISER2FS_JR_SUPER_MAGIC_STRING "ReIsEr3Fs" +#define SDCARDFS_SUPER_MAGIC 0xb550ca10 + #define SMB_SUPER_MAGIC 0x517B #define 
CGROUP_SUPER_MAGIC 0x27e0eb From e76fbcd41aba68de594ec481a7d847982da81388 Mon Sep 17 00:00:00 2001 From: Daniel Campello Date: Mon, 20 Jul 2015 16:27:37 -0700 Subject: [PATCH 032/607] Port of sdcardfs to 4.4 Change-Id: I25b99ecf214e72ebf6a57ec3085972542a8d7951 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/Kconfig | 1 - fs/sdcardfs/dentry.c | 9 +- fs/sdcardfs/file.c | 47 ++++--- fs/sdcardfs/hashtable.h | 190 -------------------------- fs/sdcardfs/inode.c | 280 +++++++++++++++----------------------- fs/sdcardfs/lookup.c | 25 ++-- fs/sdcardfs/main.c | 113 ++++++++------- fs/sdcardfs/mmap.c | 5 +- fs/sdcardfs/packagelist.c | 39 +++--- fs/sdcardfs/sdcardfs.h | 41 +++--- fs/sdcardfs/super.c | 6 +- include/linux/namei.h | 2 + 12 files changed, 252 insertions(+), 506 deletions(-) delete mode 100644 fs/sdcardfs/hashtable.h diff --git a/fs/sdcardfs/Kconfig b/fs/sdcardfs/Kconfig index 657f4958e8d6..d995f3eaae6d 100644 --- a/fs/sdcardfs/Kconfig +++ b/fs/sdcardfs/Kconfig @@ -1,6 +1,5 @@ config SDCARD_FS tristate "sdcard file system" - depends on EXPERIMENTAL default n help Sdcardfs is based on Wrapfs file system. diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index 4572a5403bb2..dbbcfd091fc7 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -26,7 +26,7 @@ * 0: tell VFS to invalidate dentry * 1: dentry is valid */ -static int sdcardfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) +static int sdcardfs_d_revalidate(struct dentry *dentry, unsigned int flags) { int err = 1; struct path parent_lower_path, lower_path; @@ -35,7 +35,7 @@ static int sdcardfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) struct dentry *lower_cur_parent_dentry = NULL; struct dentry *lower_dentry = NULL; - if (nd && nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; spin_lock(&dentry->d_lock); @@ -119,7 +119,7 @@ static void sdcardfs_d_release(struct dentry *dentry) } static int sdcardfs_hash_ci(const struct dentry *dentry, - const struct inode *inode, struct qstr *qstr) + struct qstr *qstr) { /* * This function is copy of vfat_hashi. @@ -148,8 +148,7 @@ static int sdcardfs_hash_ci(const struct dentry *dentry, * Case insensitive compare of two vfat names. 
*/ static int sdcardfs_cmp_ci(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, + const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { /* This function is copy of vfat_cmpi */ diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index bcacb947c874..f9c5eaafc619 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -50,8 +50,8 @@ static ssize_t sdcardfs_read(struct file *file, char __user *buf, err = vfs_read(lower_file, buf, count, ppos); /* update our inode atime upon a successful lower read */ if (err >= 0) - fsstack_copy_attr_atime(dentry->d_inode, - lower_file->f_path.dentry->d_inode); + fsstack_copy_attr_atime(d_inode(dentry), + file_inode(lower_file)); return err; } @@ -59,7 +59,7 @@ static ssize_t sdcardfs_read(struct file *file, char __user *buf, static ssize_t sdcardfs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - int err = 0; + int err; struct file *lower_file; struct dentry *dentry = file->f_path.dentry; @@ -73,29 +73,29 @@ static ssize_t sdcardfs_write(struct file *file, const char __user *buf, err = vfs_write(lower_file, buf, count, ppos); /* update our inode times+sizes upon a successful lower write */ if (err >= 0) { - fsstack_copy_inode_size(dentry->d_inode, - lower_file->f_path.dentry->d_inode); - fsstack_copy_attr_times(dentry->d_inode, - lower_file->f_path.dentry->d_inode); + fsstack_copy_inode_size(d_inode(dentry), + file_inode(lower_file)); + fsstack_copy_attr_times(d_inode(dentry), + file_inode(lower_file)); } return err; } -static int sdcardfs_readdir(struct file *file, void *dirent, filldir_t filldir) +static int sdcardfs_readdir(struct file *file, struct dir_context *ctx) { - int err = 0; + int err; struct file *lower_file = NULL; struct dentry *dentry = file->f_path.dentry; lower_file = sdcardfs_lower_file(file); lower_file->f_pos = file->f_pos; - err = vfs_readdir(lower_file, filldir, dirent); + err = iterate_dir(lower_file, ctx); file->f_pos = lower_file->f_pos; if (err >= 0) /* copy the atime */ - fsstack_copy_attr_atime(dentry->d_inode, - lower_file->f_path.dentry->d_inode); + fsstack_copy_attr_atime(d_inode(dentry), + file_inode(lower_file)); return err; } @@ -191,7 +191,6 @@ static int sdcardfs_mmap(struct file *file, struct vm_area_struct *vma) */ file_accessed(file); vma->vm_ops = &sdcardfs_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; file->f_mapping->a_ops = &sdcardfs_aops; /* set our aops */ if (!SDCARDFS_F(file)->lower_vm_ops) /* save for our ->fault */ @@ -242,8 +241,8 @@ static int sdcardfs_open(struct inode *inode, struct file *file) /* open lower object and link sdcardfs's file struct to lower's */ sdcardfs_get_lower_path(file->f_path.dentry, &lower_path); - lower_file = dentry_open(lower_path.dentry, lower_path.mnt, - file->f_flags, current_cred()); + lower_file = dentry_open(&lower_path, file->f_flags, current_cred()); + path_put(&lower_path); if (IS_ERR(lower_file)) { err = PTR_ERR(lower_file); lower_file = sdcardfs_lower_file(file); @@ -275,8 +274,10 @@ static int sdcardfs_flush(struct file *file, fl_owner_t id) struct file *lower_file = NULL; lower_file = sdcardfs_lower_file(file); - if (lower_file && lower_file->f_op && lower_file->f_op->flush) + if (lower_file && lower_file->f_op && lower_file->f_op->flush) { + filemap_write_and_wait(file->f_mapping); err = lower_file->f_op->flush(lower_file, id); + } return err; } @@ -296,19 +297,23 @@ static int sdcardfs_file_release(struct inode *inode, struct 
file *file) return 0; } -static int -sdcardfs_fsync(struct file *file, int datasync) +static int sdcardfs_fsync(struct file *file, loff_t start, loff_t end, + int datasync) { int err; struct file *lower_file; struct path lower_path; struct dentry *dentry = file->f_path.dentry; + err = __generic_file_fsync(file, start, end, datasync); + if (err) + goto out; + lower_file = sdcardfs_lower_file(file); sdcardfs_get_lower_path(dentry, &lower_path); - err = vfs_fsync(lower_file, datasync); + err = vfs_fsync_range(lower_file, start, end, datasync); sdcardfs_put_lower_path(dentry, &lower_path); - +out: return err; } @@ -344,7 +349,7 @@ const struct file_operations sdcardfs_main_fops = { const struct file_operations sdcardfs_dir_fops = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = sdcardfs_readdir, + .iterate = sdcardfs_readdir, .unlocked_ioctl = sdcardfs_unlocked_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = sdcardfs_compat_ioctl, diff --git a/fs/sdcardfs/hashtable.h b/fs/sdcardfs/hashtable.h deleted file mode 100644 index 1e770f3df148..000000000000 --- a/fs/sdcardfs/hashtable.h +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Statically sized hash table implementation - * (C) 2012 Sasha Levin - */ - -#ifndef _LINUX_HASHTABLE_H -#define _LINUX_HASHTABLE_H - -#include -#include -#include -#include -#include - -#define DEFINE_HASHTABLE(name, bits) \ - struct hlist_head name[1 << (bits)] = \ - { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } - -#define DECLARE_HASHTABLE(name, bits) \ - struct hlist_head name[1 << (bits)] - -#define HASH_SIZE(name) (ARRAY_SIZE(name)) -#define HASH_BITS(name) ilog2(HASH_SIZE(name)) - -/* Use hash_32 when possible to allow for fast 32bit hashing in 64bit kernels. */ -#define hash_min(val, bits) \ - (sizeof(val) <= 4 ? hash_32(val, bits) : hash_long(val, bits)) - -static inline void __hash_init(struct hlist_head *ht, unsigned int sz) -{ - unsigned int i; - - for (i = 0; i < sz; i++) - INIT_HLIST_HEAD(&ht[i]); -} - -/** - * hash_init - initialize a hash table - * @hashtable: hashtable to be initialized - * - * Calculates the size of the hashtable from the given parameter, otherwise - * same as hash_init_size. - * - * This has to be a macro since HASH_BITS() will not work on pointers since - * it calculates the size during preprocessing. 
- */ -#define hash_init(hashtable) __hash_init(hashtable, HASH_SIZE(hashtable)) - -/** - * hash_add - add an object to a hashtable - * @hashtable: hashtable to add to - * @node: the &struct hlist_node of the object to be added - * @key: the key of the object to be added - */ -#define hash_add(hashtable, node, key) \ - hlist_add_head(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) - -/** - * hash_add_rcu - add an object to a rcu enabled hashtable - * @hashtable: hashtable to add to - * @node: the &struct hlist_node of the object to be added - * @key: the key of the object to be added - */ -#define hash_add_rcu(hashtable, node, key) \ - hlist_add_head_rcu(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) - -/** - * hash_hashed - check whether an object is in any hashtable - * @node: the &struct hlist_node of the object to be checked - */ -static inline bool hash_hashed(struct hlist_node *node) -{ - return !hlist_unhashed(node); -} - -static inline bool __hash_empty(struct hlist_head *ht, unsigned int sz) -{ - unsigned int i; - - for (i = 0; i < sz; i++) - if (!hlist_empty(&ht[i])) - return false; - - return true; -} - -/** - * hash_empty - check whether a hashtable is empty - * @hashtable: hashtable to check - * - * This has to be a macro since HASH_BITS() will not work on pointers since - * it calculates the size during preprocessing. - */ -#define hash_empty(hashtable) __hash_empty(hashtable, HASH_SIZE(hashtable)) - -/** - * hash_del - remove an object from a hashtable - * @node: &struct hlist_node of the object to remove - */ -static inline void hash_del(struct hlist_node *node) -{ - hlist_del_init(node); -} - -/** - * hash_del_rcu - remove an object from a rcu enabled hashtable - * @node: &struct hlist_node of the object to remove - */ -static inline void hash_del_rcu(struct hlist_node *node) -{ - hlist_del_init_rcu(node); -} - -/** - * hash_for_each - iterate over a hashtable - * @name: hashtable to iterate - * @bkt: integer to use as bucket loop cursor - * @obj: the type * to use as a loop cursor for each entry - * @member: the name of the hlist_node within the struct - */ -#define hash_for_each(name, bkt, obj, member, pos) \ - for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ - (bkt)++)\ - hlist_for_each_entry(obj, pos, &name[bkt], member) - -/** - * hash_for_each_rcu - iterate over a rcu enabled hashtable - * @name: hashtable to iterate - * @bkt: integer to use as bucket loop cursor - * @obj: the type * to use as a loop cursor for each entry - * @member: the name of the hlist_node within the struct - */ -#define hash_for_each_rcu(name, bkt, obj, member) \ - for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ - (bkt)++)\ - hlist_for_each_entry_rcu(obj, &name[bkt], member) - -/** - * hash_for_each_safe - iterate over a hashtable safe against removal of - * hash entry - * @name: hashtable to iterate - * @bkt: integer to use as bucket loop cursor - * @tmp: a &struct used for temporary storage - * @obj: the type * to use as a loop cursor for each entry - * @member: the name of the hlist_node within the struct - */ -#define hash_for_each_safe(name, bkt, tmp, obj, member, pos) \ - for ((bkt) = 0, obj = NULL; (bkt) < HASH_SIZE(name);\ - (bkt)++)\ - hlist_for_each_entry_safe(obj, pos, tmp, &name[bkt], member) - -/** - * hash_for_each_possible - iterate over all possible objects hashing to the - * same bucket - * @name: hashtable to iterate - * @obj: the type * to use as a loop cursor for each entry - * @member: the name of the hlist_node within 
the struct - * @key: the key of the objects to iterate over - */ -#define hash_for_each_possible(name, obj, member, key, pos) \ - hlist_for_each_entry(obj, pos, &name[hash_min(key, HASH_BITS(name))], member) - -/** - * hash_for_each_possible_rcu - iterate over all possible objects hashing to the - * same bucket in an rcu enabled hashtable - * in a rcu enabled hashtable - * @name: hashtable to iterate - * @obj: the type * to use as a loop cursor for each entry - * @member: the name of the hlist_node within the struct - * @key: the key of the objects to iterate over - */ -#define hash_for_each_possible_rcu(name, obj, member, key) \ - hlist_for_each_entry_rcu(obj, &name[hash_min(key, HASH_BITS(name))],\ - member) - -/** - * hash_for_each_possible_safe - iterate over all possible objects hashing to the - * same bucket safe against removals - * @name: hashtable to iterate - * @obj: the type * to use as a loop cursor for each entry - * @tmp: a &struct used for temporary storage - * @member: the name of the hlist_node within the struct - * @key: the key of the objects to iterate over - */ -#define hash_for_each_possible_safe(name, obj, tmp, member, key) \ - hlist_for_each_entry_safe(obj, tmp,\ - &name[hash_min(key, HASH_BITS(name))], member) - - -#endif diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index e8ed04250ed1..75c622bac2f5 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -30,8 +30,8 @@ const struct cred * override_fsids(struct sdcardfs_sb_info* sbi) if (!cred) return NULL; - cred->fsuid = sbi->options.fs_low_uid; - cred->fsgid = sbi->options.fs_low_gid; + cred->fsuid = make_kuid(&init_user_ns, sbi->options.fs_low_uid); + cred->fsgid = make_kgid(&init_user_ns, sbi->options.fs_low_gid); old_cred = override_creds(cred); @@ -49,12 +49,12 @@ void revert_fsids(const struct cred * old_cred) } static int sdcardfs_create(struct inode *dir, struct dentry *dentry, - int mode, struct nameidata *nd) + umode_t mode, bool want_excl) { - int err = 0; + int err; struct dentry *lower_dentry; struct dentry *lower_parent_dentry = NULL; - struct path lower_path, saved_path; + struct path lower_path; struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); const struct cred *saved_cred = NULL; @@ -74,18 +74,9 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, lower_dentry = lower_path.dentry; lower_parent_dentry = lock_parent(lower_dentry); - err = mnt_want_write(lower_path.mnt); - if (err) - goto out_unlock; - - pathcpy(&saved_path, &nd->path); - pathcpy(&nd->path, &lower_path); - /* set last 16bytes of mode field to 0664 */ mode = (mode & S_IFMT) | 00664; - err = vfs_create(lower_parent_dentry->d_inode, lower_dentry, mode, nd); - - pathcpy(&nd->path, &saved_path); + err = vfs_create(d_inode(lower_parent_dentry), lower_dentry, mode, want_excl); if (err) goto out; @@ -93,11 +84,9 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, if (err) goto out; fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir)); - fsstack_copy_inode_size(dir, lower_parent_dentry->d_inode); + fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry)); out: - mnt_drop_write(lower_path.mnt); -out_unlock: unlock_dir(lower_parent_dentry); sdcardfs_put_lower_path(dentry, &lower_path); REVERT_CRED(saved_cred); @@ -118,33 +107,27 @@ static int sdcardfs_link(struct dentry *old_dentry, struct inode *dir, OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb)); - file_size_save = i_size_read(old_dentry->d_inode); + file_size_save = i_size_read(d_inode(old_dentry)); 
sdcardfs_get_lower_path(old_dentry, &lower_old_path); sdcardfs_get_lower_path(new_dentry, &lower_new_path); lower_old_dentry = lower_old_path.dentry; lower_new_dentry = lower_new_path.dentry; lower_dir_dentry = lock_parent(lower_new_dentry); - err = mnt_want_write(lower_new_path.mnt); - if (err) - goto out_unlock; - - err = vfs_link(lower_old_dentry, lower_dir_dentry->d_inode, - lower_new_dentry); - if (err || !lower_new_dentry->d_inode) + err = vfs_link(lower_old_dentry, d_inode(lower_dir_dentry), + lower_new_dentry, NULL); + if (err || !d_inode(lower_new_dentry)) goto out; err = sdcardfs_interpose(new_dentry, dir->i_sb, &lower_new_path); if (err) goto out; - fsstack_copy_attr_times(dir, lower_new_dentry->d_inode); - fsstack_copy_inode_size(dir, lower_new_dentry->d_inode); - old_dentry->d_inode->i_nlink = - sdcardfs_lower_inode(old_dentry->d_inode)->i_nlink; - i_size_write(new_dentry->d_inode, file_size_save); + fsstack_copy_attr_times(dir, d_inode(lower_new_dentry)); + fsstack_copy_inode_size(dir, d_inode(lower_new_dentry)); + set_nlink(d_inode(old_dentry), + sdcardfs_lower_inode(d_inode(old_dentry))->i_nlink); + i_size_write(d_inode(new_dentry), file_size_save); out: - mnt_drop_write(lower_new_path.mnt); -out_unlock: unlock_dir(lower_dir_dentry); sdcardfs_put_lower_path(old_dentry, &lower_old_path); sdcardfs_put_lower_path(new_dentry, &lower_new_path); @@ -180,10 +163,7 @@ static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry) dget(lower_dentry); lower_dir_dentry = lock_parent(lower_dentry); - err = mnt_want_write(lower_path.mnt); - if (err) - goto out_unlock; - err = vfs_unlink(lower_dir_inode, lower_dentry); + err = vfs_unlink(lower_dir_inode, lower_dentry, NULL); /* * Note: unlinking on top of NFS can cause silly-renamed files. @@ -198,13 +178,11 @@ static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry) goto out; fsstack_copy_attr_times(dir, lower_dir_inode); fsstack_copy_inode_size(dir, lower_dir_inode); - dentry->d_inode->i_nlink = - sdcardfs_lower_inode(dentry->d_inode)->i_nlink; - dentry->d_inode->i_ctime = dir->i_ctime; + set_nlink(d_inode(dentry), + sdcardfs_lower_inode(d_inode(dentry))->i_nlink); + d_inode(dentry)->i_ctime = dir->i_ctime; d_drop(dentry); /* this is needed, else LTP fails (VFS won't do it) */ out: - mnt_drop_write(lower_path.mnt); -out_unlock: unlock_dir(lower_dir_dentry); dput(lower_dentry); sdcardfs_put_lower_path(dentry, &lower_path); @@ -217,7 +195,7 @@ out_eacces: static int sdcardfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - int err = 0; + int err; struct dentry *lower_dentry; struct dentry *lower_parent_dentry = NULL; struct path lower_path; @@ -228,21 +206,16 @@ static int sdcardfs_symlink(struct inode *dir, struct dentry *dentry, lower_dentry = lower_path.dentry; lower_parent_dentry = lock_parent(lower_dentry); - err = mnt_want_write(lower_path.mnt); - if (err) - goto out_unlock; - err = vfs_symlink(lower_parent_dentry->d_inode, lower_dentry, symname); + err = vfs_symlink(d_inode(lower_parent_dentry), lower_dentry, symname); if (err) goto out; err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path); if (err) goto out; fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir)); - fsstack_copy_inode_size(dir, lower_parent_dentry->d_inode); + fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry)); out: - mnt_drop_write(lower_path.mnt); -out_unlock: unlock_dir(lower_parent_dentry); sdcardfs_put_lower_path(dentry, &lower_path); REVERT_CRED(); @@ -266,9 +239,9 @@ static int touch(char *abs_path, 
mode_t mode) { return 0; } -static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - int err = 0; + int err; int make_nomedia_in_obb = 0; struct dentry *lower_dentry; struct dentry *lower_parent_dentry = NULL; @@ -306,13 +279,9 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) lower_dentry = lower_path.dentry; lower_parent_dentry = lock_parent(lower_dentry); - err = mnt_want_write(lower_path.mnt); - if (err) - goto out_unlock; - /* set last 16bytes of mode field to 0775 */ mode = (mode & S_IFMT) | 00775; - err = vfs_mkdir(lower_parent_dentry->d_inode, lower_dentry, mode); + err = vfs_mkdir(d_inode(lower_parent_dentry), lower_dentry, mode); if (err) goto out; @@ -341,9 +310,9 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) goto out; fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir)); - fsstack_copy_inode_size(dir, lower_parent_dentry->d_inode); + fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry)); /* update number of links on parent directory */ - dir->i_nlink = sdcardfs_lower_inode(dir)->i_nlink; + set_nlink(dir, sdcardfs_lower_inode(dir)->i_nlink); if ((sbi->options.derive == DERIVE_UNIFIED) && (!strcasecmp(dentry->d_name.name, "obb")) && (pi->perm == PERM_ANDROID) && (pi->userid == 0)) @@ -388,8 +357,6 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) kfree(nomedia_fullpath); } out: - mnt_drop_write(lower_path.mnt); -out_unlock: unlock_dir(lower_parent_dentry); sdcardfs_put_lower_path(dentry, &lower_path); out_revert: @@ -427,23 +394,18 @@ static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry) lower_dentry = lower_path.dentry; lower_dir_dentry = lock_parent(lower_dentry); - err = mnt_want_write(lower_path.mnt); - if (err) - goto out_unlock; - err = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry); + err = vfs_rmdir(d_inode(lower_dir_dentry), lower_dentry); if (err) goto out; d_drop(dentry); /* drop our dentry on success (why not VFS's job?) 
*/ - if (dentry->d_inode) - clear_nlink(dentry->d_inode); - fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); - dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; + if (d_inode(dentry)) + clear_nlink(d_inode(dentry)); + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); + set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink); out: - mnt_drop_write(lower_path.mnt); -out_unlock: unlock_dir(lower_dir_dentry); sdcardfs_put_real_lower(dentry, &lower_path); REVERT_CRED(saved_cred); @@ -452,10 +414,10 @@ out_eacces: } #if 0 -static int sdcardfs_mknod(struct inode *dir, struct dentry *dentry, int mode, +static int sdcardfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { - int err = 0; + int err; struct dentry *lower_dentry; struct dentry *lower_parent_dentry = NULL; struct path lower_path; @@ -466,10 +428,7 @@ static int sdcardfs_mknod(struct inode *dir, struct dentry *dentry, int mode, lower_dentry = lower_path.dentry; lower_parent_dentry = lock_parent(lower_dentry); - err = mnt_want_write(lower_path.mnt); - if (err) - goto out_unlock; - err = vfs_mknod(lower_parent_dentry->d_inode, lower_dentry, mode, dev); + err = vfs_mknod(d_inode(lower_parent_dentry), lower_dentry, mode, dev); if (err) goto out; @@ -477,11 +436,9 @@ static int sdcardfs_mknod(struct inode *dir, struct dentry *dentry, int mode, if (err) goto out; fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir)); - fsstack_copy_inode_size(dir, lower_parent_dentry->d_inode); + fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry)); out: - mnt_drop_write(lower_path.mnt); -out_unlock: unlock_dir(lower_parent_dentry); sdcardfs_put_lower_path(dentry, &lower_path); REVERT_CRED(); @@ -541,43 +498,33 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } - err = mnt_want_write(lower_old_path.mnt); + err = vfs_rename(d_inode(lower_old_dir_dentry), lower_old_dentry, + d_inode(lower_new_dir_dentry), lower_new_dentry, + NULL, 0); if (err) goto out; - err = mnt_want_write(lower_new_path.mnt); - if (err) - goto out_drop_old_write; - - err = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, - lower_new_dir_dentry->d_inode, lower_new_dentry); - if (err) - goto out_err; /* Copy attrs from lower dir, but i_uid/i_gid */ - fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode); - fsstack_copy_inode_size(new_dir, lower_new_dir_dentry->d_inode); + fsstack_copy_attr_all(new_dir, d_inode(lower_new_dir_dentry)); + fsstack_copy_inode_size(new_dir, d_inode(lower_new_dir_dentry)); fix_derived_permission(new_dir); if (new_dir != old_dir) { - fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode); - fsstack_copy_inode_size(old_dir, lower_old_dir_dentry->d_inode); + fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry)); + fsstack_copy_inode_size(old_dir, d_inode(lower_old_dir_dentry)); fix_derived_permission(old_dir); /* update the derived permission of the old_dentry * with its new parent */ new_parent = dget_parent(new_dentry); if(new_parent) { - if(old_dentry->d_inode) { + if(d_inode(old_dentry)) { get_derived_permission(new_parent, old_dentry); - fix_derived_permission(old_dentry->d_inode); + fix_derived_permission(d_inode(old_dentry)); } dput(new_parent); } } -out_err: - mnt_drop_write(lower_new_path.mnt); -out_drop_old_write: - mnt_drop_write(lower_old_path.mnt); out: unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); 
dput(lower_old_dir_dentry); @@ -599,17 +546,17 @@ static int sdcardfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz sdcardfs_get_lower_path(dentry, &lower_path); lower_dentry = lower_path.dentry; - if (!lower_dentry->d_inode->i_op || - !lower_dentry->d_inode->i_op->readlink) { + if (!d_inode(lower_dentry)->i_op || + !d_inode(lower_dentry)->i_op->readlink) { err = -EINVAL; goto out; } - err = lower_dentry->d_inode->i_op->readlink(lower_dentry, + err = d_inode(lower_dentry)->i_op->readlink(lower_dentry, buf, bufsiz); if (err < 0) goto out; - fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode); + fsstack_copy_attr_atime(d_inode(dentry), d_inode(lower_dentry)); out: sdcardfs_put_lower_path(dentry, &lower_path); @@ -618,7 +565,7 @@ out: #endif #if 0 -static void *sdcardfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *sdcardfs_follow_link(struct dentry *dentry, void **cookie) { char *buf; int len = PAGE_SIZE, err; @@ -628,7 +575,7 @@ static void *sdcardfs_follow_link(struct dentry *dentry, struct nameidata *nd) buf = kmalloc(len, GFP_KERNEL); if (!buf) { buf = ERR_PTR(-ENOMEM); - goto out; + return buf; } /* read the symlink, and then we will follow it */ @@ -642,35 +589,19 @@ static void *sdcardfs_follow_link(struct dentry *dentry, struct nameidata *nd) } else { buf[err] = '\0'; } -out: - nd_set_link(nd, buf); - return NULL; + return *cookie = buf; } #endif -#if 0 -/* this @nd *IS* still used */ -static void sdcardfs_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) -{ - char *buf = nd_get_link(nd); - if (!IS_ERR(buf)) /* free the char* */ - kfree(buf); -} -#endif - -static int sdcardfs_permission(struct inode *inode, int mask, unsigned int flags) +static int sdcardfs_permission(struct inode *inode, int mask) { int err; - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - /* * Permission check on sdcardfs inode. * Calling process should have AID_SDCARD_RW permission */ - err = generic_permission(inode, mask, 0, inode->i_op->check_acl); + err = generic_permission(inode, mask); /* XXX * Original sdcardfs code calls inode_permission(lower_inode,.. ) @@ -700,49 +631,9 @@ static int sdcardfs_permission(struct inode *inode, int mask, unsigned int flags } -static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) -{ - struct dentry *lower_dentry; - struct inode *inode; - struct inode *lower_inode; - struct path lower_path; - struct dentry *parent; - struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); - - parent = dget_parent(dentry); - if(!check_caller_access_to_name(parent->d_inode, dentry->d_name.name, - sbi->options.derive, 0, 0)) { - printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" - " dentry: %s, task:%s\n", - __func__, dentry->d_name.name, current->comm); - dput(parent); - return -EACCES; - } - dput(parent); - - inode = dentry->d_inode; - - sdcardfs_get_lower_path(dentry, &lower_path); - lower_dentry = lower_path.dentry; - lower_inode = sdcardfs_lower_inode(inode); - - fsstack_copy_attr_all(inode, lower_inode); - fsstack_copy_inode_size(inode, lower_inode); - /* if the dentry has been moved from other location - * so, on this stage, its derived permission must be - * rechecked from its private field. 
- */ - fix_derived_permission(inode); - - generic_fillattr(inode, stat); - sdcardfs_put_lower_path(dentry, &lower_path); - return 0; -} - static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) { - int err = 0; + int err; struct dentry *lower_dentry; struct inode *inode; struct inode *lower_inode; @@ -752,7 +643,7 @@ static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) struct dentry *parent; int has_rw; - inode = dentry->d_inode; + inode = d_inode(dentry); /* * Check if user has permission to change inode. We don't check if @@ -766,7 +657,7 @@ static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) /* check the Android group ID */ has_rw = get_caller_has_rw_locked(sbi->pkgl_id, sbi->options.derive); parent = dget_parent(dentry); - if(!check_caller_access_to_name(parent->d_inode, dentry->d_name.name, + if(!check_caller_access_to_name(d_inode(parent), dentry->d_name.name, sbi->options.derive, 1, has_rw)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", @@ -819,13 +710,14 @@ static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) /* notify the (possibly copied-up) lower inode */ /* - * Note: we use lower_dentry->d_inode, because lower_inode may be + * Note: we use d_inode(lower_dentry), because lower_inode may be * unlinked (no inode->i_sb and i_ino==0. This happens if someone * tries to open(), unlink(), then ftruncate() a file. */ - mutex_lock(&lower_dentry->d_inode->i_mutex); - err = notify_change(lower_dentry, &lower_ia); /* note: lower_ia */ - mutex_unlock(&lower_dentry->d_inode->i_mutex); + mutex_lock(&d_inode(lower_dentry)->i_mutex); + err = notify_change(lower_dentry, &lower_ia, /* note: lower_ia */ + NULL); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); if (current->mm) up_write(¤t->mm->mmap_sem); if (err) @@ -848,6 +740,46 @@ out_err: return err; } +static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct dentry *lower_dentry; + struct inode *inode; + struct inode *lower_inode; + struct path lower_path; + struct dentry *parent; + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); + + parent = dget_parent(dentry); + if(!check_caller_access_to_name(d_inode(parent), dentry->d_name.name, + sbi->options.derive, 0, 0)) { + printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" + " dentry: %s, task:%s\n", + __func__, dentry->d_name.name, current->comm); + dput(parent); + return -EACCES; + } + dput(parent); + + inode = d_inode(dentry); + + sdcardfs_get_lower_path(dentry, &lower_path); + lower_dentry = lower_path.dentry; + lower_inode = sdcardfs_lower_inode(inode); + + fsstack_copy_attr_all(inode, lower_inode); + fsstack_copy_inode_size(inode, lower_inode); + /* if the dentry has been moved from other location + * so, on this stage, its derived permission must be + * rechecked from its private field. + */ + fix_derived_permission(inode); + + generic_fillattr(inode, stat); + sdcardfs_put_lower_path(dentry, &lower_path); + return 0; +} + const struct inode_operations sdcardfs_symlink_iops = { .permission = sdcardfs_permission, .setattr = sdcardfs_setattr, @@ -856,14 +788,16 @@ const struct inode_operations sdcardfs_symlink_iops = { * These methods are *NOT* perfectly tested. 
.readlink = sdcardfs_readlink, .follow_link = sdcardfs_follow_link, - .put_link = sdcardfs_put_link, + .put_link = kfree_put_link, */ }; const struct inode_operations sdcardfs_dir_iops = { .create = sdcardfs_create, .lookup = sdcardfs_lookup, +#if 0 .permission = sdcardfs_permission, +#endif .unlink = sdcardfs_unlink, .mkdir = sdcardfs_mkdir, .rmdir = sdcardfs_rmdir, diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index c0b12375b1bf..a4b94df99f32 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -79,8 +79,7 @@ static int sdcardfs_inode_set(struct inode *inode, void *lower_inode) return 0; } -static struct inode *sdcardfs_iget(struct super_block *sb, - struct inode *lower_inode) +struct inode *sdcardfs_iget(struct super_block *sb, struct inode *lower_inode) { struct sdcardfs_inode_info *info; struct inode *inode; /* the new inode to return */ @@ -206,14 +205,13 @@ out: * Fills in lower_parent_path with on success. */ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, - struct nameidata *nd, struct path *lower_parent_path) + unsigned int flags, struct path *lower_parent_path) { int err = 0; struct vfsmount *lower_dir_mnt; struct dentry *lower_dir_dentry = NULL; struct dentry *lower_dentry; const char *name; - struct nameidata lower_nd; struct path lower_path; struct qstr this; struct sdcardfs_sb_info *sbi; @@ -234,10 +232,10 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, /* Use vfs_path_lookup to check if the dentry exists or not */ if (sbi->options.lower_fs == LOWER_FS_EXT4) { err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name, - LOOKUP_CASE_INSENSITIVE, &lower_nd); + LOOKUP_CASE_INSENSITIVE, &lower_path); } else if (sbi->options.lower_fs == LOWER_FS_FAT) { err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name, 0, - &lower_nd); + &lower_path); } /* no error: handle positive dentries */ @@ -253,7 +251,7 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, * and the base obbpath will be copyed to the lower_path variable. * if an error returned, there's no change in the lower_path * returns: -ERRNO if error (0: no error) */ - err = setup_obb_dentry(dentry, &lower_nd.path); + err = setup_obb_dentry(dentry, &lower_path); if(err) { /* if the sbi->obbpath is not available, we can optionally @@ -267,8 +265,8 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, } } - sdcardfs_set_lower_path(dentry, &lower_nd.path); - err = sdcardfs_interpose(dentry, dentry->d_sb, &lower_nd.path); + sdcardfs_set_lower_path(dentry, &lower_path); + err = sdcardfs_interpose(dentry, dentry->d_sb, &lower_path); if (err) /* path_put underlying path on error */ sdcardfs_put_reset_lower_path(dentry); goto out; @@ -306,10 +304,7 @@ setup_lower: * the VFS will continue the process of making this negative dentry * into a positive one. 
*/ - if (nd) { - if (nd->flags & (LOOKUP_CREATE|LOOKUP_RENAME_TARGET)) - err = 0; - } else + if (flags & (LOOKUP_CREATE|LOOKUP_RENAME_TARGET)) err = 0; out: @@ -328,7 +323,7 @@ out: * @nd : nameidata of parent inode */ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct dentry *ret = NULL, *parent; struct path lower_parent_path; @@ -359,7 +354,7 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, goto out; } - ret = __sdcardfs_lookup(dentry, nd, &lower_parent_path); + ret = __sdcardfs_lookup(dentry, flags, &lower_parent_path); if (IS_ERR(ret)) { goto out; diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 1fdceffec72c..9d04ae8ceb46 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -156,6 +156,7 @@ invalid_option: return 0; } +#if 0 /* * our custom d_alloc_root work-alike * @@ -181,6 +182,7 @@ static struct dentry *sdcardfs_d_alloc_root(struct super_block *sb) } return ret; } +#endif /* * There is no need to lock the sdcardfs_super_info's rwsem as there is no @@ -195,6 +197,7 @@ static int sdcardfs_read_super(struct super_block *sb, const char *dev_name, struct path lower_path; struct sdcardfs_sb_info *sb_info; void *pkgl_id; + struct inode *inode; printk(KERN_INFO "sdcardfs version 2.0\n"); @@ -259,12 +262,18 @@ static int sdcardfs_read_super(struct super_block *sb, const char *dev_name, sb->s_magic = SDCARDFS_SUPER_MAGIC; sb->s_op = &sdcardfs_sops; - /* see comment next to the definition of sdcardfs_d_alloc_root */ - sb->s_root = sdcardfs_d_alloc_root(sb); - if (!sb->s_root) { - err = -ENOMEM; + /* get a new inode and allocate our root dentry */ + inode = sdcardfs_iget(sb, lower_path.dentry->d_inode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); goto out_sput; } + sb->s_root = d_make_root(inode); + if (!sb->s_root) { + err = -ENOMEM; + goto out_iput; + } + d_set_d_op(sb->s_root, &sdcardfs_ci_dops); /* link the upper and lower dentries */ sb->s_root->d_fsdata = NULL; @@ -275,56 +284,60 @@ static int sdcardfs_read_super(struct super_block *sb, const char *dev_name, /* set the lower dentries for s_root */ sdcardfs_set_lower_path(sb->s_root, &lower_path); - /* call interpose to create the upper level inode */ - err = sdcardfs_interpose(sb->s_root, sb, &lower_path); - if (!err) { - /* setup permission policy */ - switch(sb_info->options.derive) { - case DERIVE_NONE: - setup_derived_state(sb->s_root->d_inode, + /* + * No need to call interpose because we already have a positive + * dentry, which was instantiated by d_make_root. Just need to + * d_rehash it. + */ + d_rehash(sb->s_root); + + /* setup permission policy */ + switch(sb_info->options.derive) { + case DERIVE_NONE: + setup_derived_state(sb->s_root->d_inode, PERM_ROOT, 0, AID_ROOT, AID_SDCARD_RW, 00775); - sb_info->obbpath_s = NULL; - break; - case DERIVE_LEGACY: - /* Legacy behavior used to support internal multiuser layout which - * places user_id at the top directory level, with the actual roots - * just below that. Shared OBB path is also at top level. 
*/ - setup_derived_state(sb->s_root->d_inode, - PERM_LEGACY_PRE_ROOT, 0, AID_ROOT, AID_SDCARD_R, 00771); - /* initialize the obbpath string and lookup the path - * sb_info->obb_path will be deactivated by path_put - * on sdcardfs_put_super */ - sb_info->obbpath_s = kzalloc(PATH_MAX, GFP_KERNEL); - snprintf(sb_info->obbpath_s, PATH_MAX, "%s/obb", dev_name); - err = prepare_dir(sb_info->obbpath_s, - sb_info->options.fs_low_uid, - sb_info->options.fs_low_gid, 00755); - if(err) - printk(KERN_ERR "sdcardfs: %s: %d, error on creating %s\n", - __func__,__LINE__, sb_info->obbpath_s); - break; - case DERIVE_UNIFIED: - /* Unified multiuser layout which places secondary user_id under - * /Android/user and shared OBB path under /Android/obb. */ - setup_derived_state(sb->s_root->d_inode, - PERM_ROOT, 0, AID_ROOT, AID_SDCARD_R, 00771); + sb_info->obbpath_s = NULL; + break; + case DERIVE_LEGACY: + /* Legacy behavior used to support internal multiuser layout which + * places user_id at the top directory level, with the actual roots + * just below that. Shared OBB path is also at top level. */ + setup_derived_state(sb->s_root->d_inode, + PERM_LEGACY_PRE_ROOT, 0, AID_ROOT, AID_SDCARD_R, 00771); + /* initialize the obbpath string and lookup the path + * sb_info->obb_path will be deactivated by path_put + * on sdcardfs_put_super */ + sb_info->obbpath_s = kzalloc(PATH_MAX, GFP_KERNEL); + snprintf(sb_info->obbpath_s, PATH_MAX, "%s/obb", dev_name); + err = prepare_dir(sb_info->obbpath_s, + sb_info->options.fs_low_uid, + sb_info->options.fs_low_gid, 00755); + if(err) + printk(KERN_ERR "sdcardfs: %s: %d, error on creating %s\n", + __func__,__LINE__, sb_info->obbpath_s); + break; + case DERIVE_UNIFIED: + /* Unified multiuser layout which places secondary user_id under + * /Android/user and shared OBB path under /Android/obb. 
*/ + setup_derived_state(sb->s_root->d_inode, + PERM_ROOT, 0, AID_ROOT, AID_SDCARD_R, 00771); - sb_info->obbpath_s = kzalloc(PATH_MAX, GFP_KERNEL); - snprintf(sb_info->obbpath_s, PATH_MAX, "%s/Android/obb", dev_name); - break; - } - fix_derived_permission(sb->s_root->d_inode); - - if (!silent) - printk(KERN_INFO "sdcardfs: mounted on top of %s type %s\n", - dev_name, lower_sb->s_type->name); - goto out; + sb_info->obbpath_s = kzalloc(PATH_MAX, GFP_KERNEL); + snprintf(sb_info->obbpath_s, PATH_MAX, "%s/Android/obb", dev_name); + break; } - /* else error: fall through */ + fix_derived_permission(sb->s_root->d_inode); - free_dentry_private_data(sb->s_root); + if (!silent) + printk(KERN_INFO "sdcardfs: mounted on top of %s type %s\n", + dev_name, lower_sb->s_type->name); + goto out; /* all is well */ + + /* no longer needed: free_dentry_private_data(sb->s_root); */ out_freeroot: dput(sb->s_root); +out_iput: + iput(inode); out_sput: /* drop refs we took earlier */ atomic_dec(&lower_sb->s_active); @@ -346,7 +359,7 @@ static struct dentry *mount_nodev_with_options(struct file_system_type *fs_type, { int error; - struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); + struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); @@ -378,7 +391,7 @@ static struct file_system_type sdcardfs_fs_type = { .name = SDCARDFS_NAME, .mount = sdcardfs_mount, .kill_sb = generic_shutdown_super, - .fs_flags = FS_REVAL_DOT, + .fs_flags = 0, }; static int __init init_sdcardfs_fs(void) diff --git a/fs/sdcardfs/mmap.c b/fs/sdcardfs/mmap.c index c807d7f18f8b..e21f64675a80 100644 --- a/fs/sdcardfs/mmap.c +++ b/fs/sdcardfs/mmap.c @@ -48,9 +48,8 @@ static int sdcardfs_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return err; } -static ssize_t sdcardfs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) +static ssize_t sdcardfs_direct_IO(struct kiocb *iocb, + struct iov_iter *iter, loff_t pos) { /* * This function returns zero on purpose in order to support direct IO. 
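For reference, the root-dentry setup that replaces sdcardfs_d_alloc_root() in the main.c hunk above boils down to the following flow; this is only a condensed sketch of that hunk (error handling, option parsing and the derive-mode switch are elided), not a separate implementation:

	/* wrap the lower root inode and build our own root dentry */
	inode = sdcardfs_iget(sb, lower_path.dentry->d_inode);
	if (IS_ERR(inode))
		goto out_sput;

	sb->s_root = d_make_root(inode);	/* consumes the inode on success */
	if (!sb->s_root)
		goto out_iput;
	d_set_d_op(sb->s_root, &sdcardfs_ci_dops);

	/* link the upper and lower dentries */
	sb->s_root->d_fsdata = NULL;
	err = new_dentry_private_data(sb->s_root);
	if (err)
		goto out_freeroot;
	sdcardfs_set_lower_path(sb->s_root, &lower_path);

	/* already positive thanks to d_make_root(), so no sdcardfs_interpose() */
	d_rehash(sb->s_root);

The net effect is that the upper inode exists before the root dentry is hashed, which is why the old interpose-on-root call and its error fallback could be dropped from sdcardfs_read_super().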
diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index c786d8f92203..d7ba8d4a423e 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -20,7 +20,7 @@ #include "sdcardfs.h" #include "strtok.h" -#include "hashtable.h" +#include #include #include #include @@ -29,8 +29,8 @@ #define STRING_BUF_SIZE (512) struct hashtable_entry { - struct hlist_node hlist; - void *key; + struct hlist_node hlist; + void *key; int value; }; @@ -67,12 +67,12 @@ static unsigned int str_hash(void *key) { } static int contain_appid_key(struct packagelist_data *pkgl_dat, void *appid) { - struct hashtable_entry *hash_cur; - struct hlist_node *h_n; + struct hashtable_entry *hash_cur; - hash_for_each_possible(pkgl_dat->appid_with_rw, hash_cur, hlist, (unsigned int)appid, h_n) - if (appid == hash_cur->key) - return 1; + hash_for_each_possible(pkgl_dat->appid_with_rw, hash_cur, hlist, (unsigned int)appid) + + if (appid == hash_cur->key) + return 1; return 0; } @@ -87,7 +87,7 @@ int get_caller_has_rw_locked(void *pkgl_id, derive_t derive) { return 1; } - appid = multiuser_get_app_id(current_fsuid()); + appid = multiuser_get_app_id(from_kuid(&init_user_ns, current_fsuid())); mutex_lock(&pkgl_dat->hashtable_lock); ret = contain_appid_key(pkgl_dat, (void *)appid); mutex_unlock(&pkgl_dat->hashtable_lock); @@ -98,13 +98,12 @@ appid_t get_appid(void *pkgl_id, const char *app_name) { struct packagelist_data *pkgl_dat = (struct packagelist_data *)pkgl_id; struct hashtable_entry *hash_cur; - struct hlist_node *h_n; unsigned int hash = str_hash((void *)app_name); appid_t ret_id; //printk(KERN_INFO "sdcardfs: %s: %s, %u\n", __func__, (char *)app_name, hash); mutex_lock(&pkgl_dat->hashtable_lock); - hash_for_each_possible(pkgl_dat->package_to_appid, hash_cur, hlist, hash, h_n) { + hash_for_each_possible(pkgl_dat->package_to_appid, hash_cur, hlist, hash) { //printk(KERN_INFO "sdcardfs: %s: %s\n", __func__, (char *)hash_cur->key); if (!strcasecmp(app_name, hash_cur->key)) { ret_id = (appid_t)hash_cur->value; @@ -140,7 +139,7 @@ int check_caller_access_to_name(struct inode *parent_node, const char* name, /* Root always has access; access for any other UIDs should always * be controlled through packages.list. */ - if (current_fsuid() == 0) { + if (from_kuid(&init_user_ns, current_fsuid()) == 0) { return 1; } @@ -148,7 +147,8 @@ int check_caller_access_to_name(struct inode *parent_node, const char* name, * parent or holds sdcard_rw. 
*/ if (w_ok) { if (parent_node && - (current_fsuid() == SDCARDFS_I(parent_node)->d_uid)) { + (from_kuid(&init_user_ns, current_fsuid()) == + SDCARDFS_I(parent_node)->d_uid)) { return 1; } return has_rw; @@ -174,11 +174,10 @@ int open_flags_to_access_mode(int open_flags) { static int insert_str_to_int(struct packagelist_data *pkgl_dat, void *key, int value) { struct hashtable_entry *hash_cur; struct hashtable_entry *new_entry; - struct hlist_node *h_n; unsigned int hash = str_hash(key); //printk(KERN_INFO "sdcardfs: %s: %s: %d, %u\n", __func__, (char *)key, value, hash); - hash_for_each_possible(pkgl_dat->package_to_appid, hash_cur, hlist, hash, h_n) { + hash_for_each_possible(pkgl_dat->package_to_appid, hash_cur, hlist, hash) { if (!strcasecmp(key, hash_cur->key)) { hash_cur->value = value; return 0; @@ -202,11 +201,10 @@ static void remove_str_to_int(struct hashtable_entry *h_entry) { static int insert_int_to_null(struct packagelist_data *pkgl_dat, void *key, int value) { struct hashtable_entry *hash_cur; struct hashtable_entry *new_entry; - struct hlist_node *h_n; //printk(KERN_INFO "sdcardfs: %s: %d: %d\n", __func__, (int)key, value); hash_for_each_possible(pkgl_dat->appid_with_rw, hash_cur, hlist, - (unsigned int)key, h_n) { + (unsigned int)key) { if (key == hash_cur->key) { hash_cur->value = value; return 0; @@ -230,14 +228,13 @@ static void remove_int_to_null(struct hashtable_entry *h_entry) { static void remove_all_hashentrys(struct packagelist_data *pkgl_dat) { struct hashtable_entry *hash_cur; - struct hlist_node *h_n; struct hlist_node *h_t; int i; - hash_for_each_safe(pkgl_dat->package_to_appid, i, h_t, hash_cur, hlist, h_n) + hash_for_each_safe(pkgl_dat->package_to_appid, i, h_t, hash_cur, hlist) remove_str_to_int(hash_cur); - hash_for_each_safe(pkgl_dat->appid_with_rw, i, h_t, hash_cur, hlist, h_n) - remove_int_to_null(hash_cur); + hash_for_each_safe(pkgl_dat->appid_with_rw, i, h_t, hash_cur, hlist) + remove_int_to_null(hash_cur); hash_init(pkgl_dat->package_to_appid); hash_init(pkgl_dat->appid_with_rw); diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 90f8b24e4a52..51f6c7912584 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -69,8 +69,8 @@ #define fix_derived_permission(x) \ do { \ - (x)->i_uid = SDCARDFS_I(x)->d_uid; \ - (x)->i_gid = SDCARDFS_I(x)->d_gid; \ + (x)->i_uid = make_kuid(&init_user_ns, SDCARDFS_I(x)->d_uid); \ + (x)->i_gid = make_kgid(&init_user_ns, SDCARDFS_I(x)->d_gid); \ (x)->i_mode = ((x)->i_mode & S_IFMT) | SDCARDFS_I(x)->d_mode;\ } while (0) @@ -159,7 +159,9 @@ extern void sdcardfs_destroy_dentry_cache(void); extern int new_dentry_private_data(struct dentry *dentry); extern void free_dentry_private_data(struct dentry *dentry); extern struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd); + unsigned int flags); +extern struct inode *sdcardfs_iget(struct super_block *sb, + struct inode *lower_inode); extern int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb, struct path *lower_path); @@ -387,13 +389,13 @@ extern int setup_obb_dentry(struct dentry *dentry, struct path *lower_path); static inline struct dentry *lock_parent(struct dentry *dentry) { struct dentry *dir = dget_parent(dentry); - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); return dir; } static inline void unlock_dir(struct dentry *dir) { - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(dir); } 
@@ -402,16 +404,9 @@ static inline int prepare_dir(const char *path_s, uid_t uid, gid_t gid, mode_t m int err; struct dentry *dent; struct iattr attrs; - struct nameidata nd; + struct path parent; - err = kern_path_parent(path_s, &nd); - if (err) { - if (err == -EEXIST) - err = 0; - goto out; - } - - dent = lookup_create(&nd, 1); + dent = kern_path_locked(path_s, &parent); if (IS_ERR(dent)) { err = PTR_ERR(dent); if (err == -EEXIST) @@ -419,29 +414,27 @@ static inline int prepare_dir(const char *path_s, uid_t uid, gid_t gid, mode_t m goto out_unlock; } - err = vfs_mkdir(nd.path.dentry->d_inode, dent, mode); + err = vfs_mkdir(d_inode(parent.dentry), dent, mode); if (err) { if (err == -EEXIST) err = 0; goto out_dput; } - attrs.ia_uid = uid; - attrs.ia_gid = gid; + attrs.ia_uid = make_kuid(&init_user_ns, uid); + attrs.ia_gid = make_kgid(&init_user_ns, gid); attrs.ia_valid = ATTR_UID | ATTR_GID; - mutex_lock(&dent->d_inode->i_mutex); - notify_change(dent, &attrs); - mutex_unlock(&dent->d_inode->i_mutex); + mutex_lock(&d_inode(dent)->i_mutex); + notify_change(dent, &attrs, NULL); + mutex_unlock(&d_inode(dent)->i_mutex); out_dput: dput(dent); out_unlock: /* parent dentry locked by lookup_create */ - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - path_put(&nd.path); - -out: + mutex_unlock(&d_inode(parent.dentry)->i_mutex); + path_put(&parent); return err; } diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index 1d206c82dfdf..f153ce1b8cf3 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -122,7 +122,7 @@ static void sdcardfs_evict_inode(struct inode *inode) struct inode *lower_inode; truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); /* * Decrement a reference to a lower_inode, which was incremented * by our read_inode when it was created initially. @@ -193,9 +193,9 @@ static void sdcardfs_umount_begin(struct super_block *sb) lower_sb->s_op->umount_begin(lower_sb); } -static int sdcardfs_show_options(struct seq_file *m, struct vfsmount *mnt) +static int sdcardfs_show_options(struct seq_file *m, struct dentry *root) { - struct sdcardfs_sb_info *sbi = SDCARDFS_SB(mnt->mnt_sb); + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(root->d_sb); struct sdcardfs_mount_options *opts = &sbi->options; if (opts->fs_low_uid != 0) diff --git a/include/linux/namei.h b/include/linux/namei.h index ef3b4f74eaf0..f2b8acbdb928 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -78,6 +78,8 @@ extern struct dentry *user_path_create(int, const char __user *, struct path *, extern void done_path_create(struct path *, struct dentry *); extern struct dentry *kern_path_locked(const char *, struct path *); extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int); +extern int vfs_path_lookup(struct dentry *, struct vfsmount *, + const char *, unsigned int, struct path *); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); From a1590fbeed5f259e5dc75f3b08f58eb5409815b2 Mon Sep 17 00:00:00 2001 From: Daniel Campello Date: Mon, 20 Jul 2015 16:33:46 -0700 Subject: [PATCH 033/607] Changed type-casting in packagelist management Fixed existing type-casting in packagelist management code. All warnings at compile time were taken care of. 
Change-Id: I1ea97786d1d1325f31b9f09ae966af1f896a2af5 Signed-off-by: Daniel Campello --- fs/sdcardfs/packagelist.c | 40 ++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index d7ba8d4a423e..f11591da141d 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -31,7 +31,7 @@ struct hashtable_entry { struct hlist_node hlist; void *key; - int value; + unsigned int value; }; struct packagelist_data { @@ -54,7 +54,7 @@ static const char* const kpackageslist_file = "/data/system/packages.list"; /* Supplementary groups to execute with */ static const gid_t kgroups[1] = { AID_PACKAGE_INFO }; -static unsigned int str_hash(void *key) { +static unsigned int str_hash(const char *key) { int i; unsigned int h = strlen(key); char *data = (char *)key; @@ -66,13 +66,13 @@ static unsigned int str_hash(void *key) { return h; } -static int contain_appid_key(struct packagelist_data *pkgl_dat, void *appid) { +static int contain_appid_key(struct packagelist_data *pkgl_dat, unsigned int appid) { struct hashtable_entry *hash_cur; - hash_for_each_possible(pkgl_dat->appid_with_rw, hash_cur, hlist, (unsigned int)appid) - - if (appid == hash_cur->key) + hash_for_each_possible(pkgl_dat->appid_with_rw, hash_cur, hlist, appid) + if ((void *)(uintptr_t)appid == hash_cur->key) return 1; + return 0; } @@ -89,7 +89,7 @@ int get_caller_has_rw_locked(void *pkgl_id, derive_t derive) { appid = multiuser_get_app_id(from_kuid(&init_user_ns, current_fsuid())); mutex_lock(&pkgl_dat->hashtable_lock); - ret = contain_appid_key(pkgl_dat, (void *)appid); + ret = contain_appid_key(pkgl_dat, appid); mutex_unlock(&pkgl_dat->hashtable_lock); return ret; } @@ -98,7 +98,7 @@ appid_t get_appid(void *pkgl_id, const char *app_name) { struct packagelist_data *pkgl_dat = (struct packagelist_data *)pkgl_id; struct hashtable_entry *hash_cur; - unsigned int hash = str_hash((void *)app_name); + unsigned int hash = str_hash(app_name); appid_t ret_id; //printk(KERN_INFO "sdcardfs: %s: %s, %u\n", __func__, (char *)app_name, hash); @@ -171,7 +171,9 @@ int open_flags_to_access_mode(int open_flags) { } } -static int insert_str_to_int(struct packagelist_data *pkgl_dat, void *key, int value) { +static int insert_str_to_int(struct packagelist_data *pkgl_dat, char *key, + unsigned int value) +{ struct hashtable_entry *hash_cur; struct hashtable_entry *new_entry; unsigned int hash = str_hash(key); @@ -198,14 +200,15 @@ static void remove_str_to_int(struct hashtable_entry *h_entry) { kmem_cache_free(hashtable_entry_cachep, h_entry); } -static int insert_int_to_null(struct packagelist_data *pkgl_dat, void *key, int value) { +static int insert_int_to_null(struct packagelist_data *pkgl_dat, unsigned int key, + unsigned int value) +{ struct hashtable_entry *hash_cur; struct hashtable_entry *new_entry; //printk(KERN_INFO "sdcardfs: %s: %d: %d\n", __func__, (int)key, value); - hash_for_each_possible(pkgl_dat->appid_with_rw, hash_cur, hlist, - (unsigned int)key) { - if (key == hash_cur->key) { + hash_for_each_possible(pkgl_dat->appid_with_rw, hash_cur, hlist, key) { + if ((void *)(uintptr_t)key == hash_cur->key) { hash_cur->value = value; return 0; } @@ -213,10 +216,9 @@ static int insert_int_to_null(struct packagelist_data *pkgl_dat, void *key, int new_entry = kmem_cache_alloc(hashtable_entry_cachep, GFP_KERNEL); if (!new_entry) return -ENOMEM; - new_entry->key = key; + new_entry->key = (void *)(uintptr_t)key; new_entry->value = value; - 
hash_add(pkgl_dat->appid_with_rw, &new_entry->hlist, - (unsigned int)new_entry->key); + hash_add(pkgl_dat->appid_with_rw, &new_entry->hlist, key); return 0; } @@ -260,7 +262,7 @@ static int read_package_list(struct packagelist_data *pkgl_dat) { while ((read_amount = sys_read(fd, pkgl_dat->read_buf, sizeof(pkgl_dat->read_buf))) > 0) { - int appid; + unsigned int appid; char *token; int one_line_len = 0; int additional_read; @@ -277,7 +279,7 @@ static int read_package_list(struct packagelist_data *pkgl_dat) { if (additional_read > 0) sys_lseek(fd, -additional_read, SEEK_CUR); - if (sscanf(pkgl_dat->read_buf, "%s %d %*d %*s %*s %s", + if (sscanf(pkgl_dat->read_buf, "%s %u %*d %*s %*s %s", pkgl_dat->app_name_buf, &appid, pkgl_dat->gids_buf) == 3) { ret = insert_str_to_int(pkgl_dat, pkgl_dat->app_name_buf, appid); @@ -291,7 +293,7 @@ static int read_package_list(struct packagelist_data *pkgl_dat) { while (token != NULL) { if (!kstrtoul(token, 10, &ret_gid) && (ret_gid == pkgl_dat->write_gid)) { - ret = insert_int_to_null(pkgl_dat, (void *)appid, 1); + ret = insert_int_to_null(pkgl_dat, appid, 1); if (ret) { sys_close(fd); mutex_unlock(&pkgl_dat->hashtable_lock); From 1e2d3bbcf3f5a603e62dfa0514a43e13ee679d4f Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 3 Feb 2016 21:08:21 -0800 Subject: [PATCH 034/607] sdcardfs: Bring up to date with Android M permissions: In M, the workings of sdcardfs were changed significantly. This brings sdcardfs into line with the changes. Change-Id: I10e91a84a884c838feef7aa26c0a2b21f02e052e --- fs/sdcardfs/Kconfig | 1 + fs/sdcardfs/derived_perm.c | 119 ++++---- fs/sdcardfs/file.c | 10 +- fs/sdcardfs/inode.c | 78 +++--- fs/sdcardfs/lookup.c | 40 +-- fs/sdcardfs/main.c | 141 +++++----- fs/sdcardfs/packagelist.c | 538 ++++++++++++++++++------------------- fs/sdcardfs/sdcardfs.h | 134 ++++++--- fs/sdcardfs/strtok.h | 75 ------ fs/sdcardfs/super.c | 11 +- 10 files changed, 518 insertions(+), 629 deletions(-) delete mode 100644 fs/sdcardfs/strtok.h diff --git a/fs/sdcardfs/Kconfig b/fs/sdcardfs/Kconfig index d995f3eaae6d..ab25f88ebb37 100644 --- a/fs/sdcardfs/Kconfig +++ b/fs/sdcardfs/Kconfig @@ -1,5 +1,6 @@ config SDCARD_FS tristate "sdcard file system" + depends on CONFIGFS_FS default n help Sdcardfs is based on Wrapfs file system. 
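Note on the derived_perm.c changes below: ownership of /Android/{data,obb,media}/<pkg>
entries is handed to the owning app by combining the node's userid with the appid looked
up from the package list via multiuser_get_uid(). As a reference point only (the constant
below is the usual Android per-user UID range convention, assumed here rather than defined
by this patch), the packing works roughly like this:

    #include <stdio.h>

    /* Assumed Android convention: each user owns a 100000-wide UID range. */
    #define PER_USER_RANGE 100000u

    static unsigned int pack_uid(unsigned int userid, unsigned int appid)
    {
            return userid * PER_USER_RANGE + appid % PER_USER_RANGE;
    }

    int main(void)
    {
            /* user 0, app 10045 -> 10045; user 10, app 10045 -> 1010045 */
            printf("%u\n", pack_uid(10, 10045));
            return 0;
    }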
diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 00c33a471dcc..128b3e56851f 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -29,24 +29,23 @@ static void inherit_derived_state(struct inode *parent, struct inode *child) ci->perm = PERM_INHERIT; ci->userid = pi->userid; ci->d_uid = pi->d_uid; - ci->d_gid = pi->d_gid; - ci->d_mode = pi->d_mode; + ci->under_android = pi->under_android; } /* helper function for derived state */ void setup_derived_state(struct inode *inode, perm_t perm, - userid_t userid, uid_t uid, gid_t gid, mode_t mode) + userid_t userid, uid_t uid, bool under_android) { struct sdcardfs_inode_info *info = SDCARDFS_I(inode); info->perm = perm; info->userid = userid; info->d_uid = uid; - info->d_gid = gid; - info->d_mode = mode; + info->under_android = under_android; } -void get_derived_permission(struct dentry *parent, struct dentry *dentry) +/* While renaming, there is a point where we want the path from dentry, but the name from newdentry */ +void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, struct dentry *newdentry) { struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); struct sdcardfs_inode_info *info = SDCARDFS_I(dentry->d_inode); @@ -63,86 +62,68 @@ void get_derived_permission(struct dentry *parent, struct dentry *dentry) inherit_derived_state(parent->d_inode, dentry->d_inode); - //printk(KERN_INFO "sdcardfs: derived: %s, %s, %d\n", parent->d_name.name, - // dentry->d_name.name, parent_info->perm); - - if (sbi->options.derive == DERIVE_NONE) { - return; - } - /* Derive custom permissions based on parent and current node */ switch (parent_info->perm) { case PERM_INHERIT: /* Already inherited above */ break; - case PERM_LEGACY_PRE_ROOT: + case PERM_PRE_ROOT: /* Legacy internal layout places users at top level */ info->perm = PERM_ROOT; - info->userid = simple_strtoul(dentry->d_name.name, NULL, 10); + info->userid = simple_strtoul(newdentry->d_name.name, NULL, 10); break; case PERM_ROOT: /* Assume masked off by default. */ - info->d_mode = 00770; - if (!strcasecmp(dentry->d_name.name, "Android")) { + if (!strcasecmp(newdentry->d_name.name, "Android")) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID; - info->d_mode = 00771; - } else if (sbi->options.split_perms) { - if (!strcasecmp(dentry->d_name.name, "DCIM") - || !strcasecmp(dentry->d_name.name, "Pictures")) { - info->d_gid = AID_SDCARD_PICS; - } else if (!strcasecmp(dentry->d_name.name, "Alarms") - || !strcasecmp(dentry->d_name.name, "Movies") - || !strcasecmp(dentry->d_name.name, "Music") - || !strcasecmp(dentry->d_name.name, "Notifications") - || !strcasecmp(dentry->d_name.name, "Podcasts") - || !strcasecmp(dentry->d_name.name, "Ringtones")) { - info->d_gid = AID_SDCARD_AV; - } + info->under_android = true; } break; case PERM_ANDROID: - if (!strcasecmp(dentry->d_name.name, "data")) { + if (!strcasecmp(newdentry->d_name.name, "data")) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID_DATA; - info->d_mode = 00771; - } else if (!strcasecmp(dentry->d_name.name, "obb")) { + } else if (!strcasecmp(newdentry->d_name.name, "obb")) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID_OBB; - info->d_mode = 00771; - // FIXME : this feature will be implemented later. 
/* Single OBB directory is always shared */ - } else if (!strcasecmp(dentry->d_name.name, "user")) { - /* User directories must only be accessible to system, protected - * by sdcard_all. Zygote will bind mount the appropriate user- - * specific path. */ - info->perm = PERM_ANDROID_USER; - info->d_gid = AID_SDCARD_ALL; - info->d_mode = 00770; + } else if (!strcasecmp(newdentry->d_name.name, "media")) { + /* App-specific directories inside; let anyone traverse */ + info->perm = PERM_ANDROID_MEDIA; } break; - /* same policy will be applied on PERM_ANDROID_DATA - * and PERM_ANDROID_OBB */ case PERM_ANDROID_DATA: case PERM_ANDROID_OBB: - appid = get_appid(sbi->pkgl_id, dentry->d_name.name); + case PERM_ANDROID_MEDIA: + appid = get_appid(sbi->pkgl_id, newdentry->d_name.name); if (appid != 0) { info->d_uid = multiuser_get_uid(parent_info->userid, appid); } - info->d_mode = 00770; - break; - case PERM_ANDROID_USER: - /* Root of a secondary user */ - info->perm = PERM_ROOT; - info->userid = simple_strtoul(dentry->d_name.name, NULL, 10); - info->d_gid = AID_SDCARD_R; - info->d_mode = 00771; break; } } +void get_derived_permission(struct dentry *parent, struct dentry *dentry) +{ + get_derived_permission_new(parent, dentry, dentry); +} + +void get_derive_permissions_recursive(struct dentry *parent) { + struct dentry *dentry; + list_for_each_entry(dentry, &parent->d_subdirs, d_child) { + if (dentry && dentry->d_inode) { + mutex_lock(&dentry->d_inode->i_mutex); + get_derived_permission(parent, dentry); + fix_derived_permission(dentry->d_inode); + get_derive_permissions_recursive(dentry); + mutex_unlock(&dentry->d_inode->i_mutex); + } + } +} + /* main function for updating derived permission */ -inline void update_derived_permission(struct dentry *dentry) +inline void update_derived_permission_lock(struct dentry *dentry) { struct dentry *parent; @@ -154,6 +135,7 @@ inline void update_derived_permission(struct dentry *dentry) * 1. need to check whether the dentry is updated or not * 2. 
remove the root dentry update */ + mutex_lock(&dentry->d_inode->i_mutex); if(IS_ROOT(dentry)) { //setup_default_pre_root_state(dentry->d_inode); } else { @@ -164,6 +146,7 @@ inline void update_derived_permission(struct dentry *dentry) } } fix_derived_permission(dentry->d_inode); + mutex_unlock(&dentry->d_inode->i_mutex); } int need_graft_path(struct dentry *dentry) @@ -177,7 +160,7 @@ int need_graft_path(struct dentry *dentry) !strcasecmp(dentry->d_name.name, "obb")) { /* /Android/obb is the base obbpath of DERIVED_UNIFIED */ - if(!(sbi->options.derive == DERIVE_UNIFIED + if(!(sbi->options.multiuser == false && parent_info->userid == 0)) { ret = 1; } @@ -207,8 +190,7 @@ int is_obbpath_invalid(struct dentry *dent) path_buf = kmalloc(PATH_MAX, GFP_ATOMIC); if(!path_buf) { ret = 1; - printk(KERN_ERR "sdcardfs: " - "fail to allocate path_buf in %s.\n", __func__); + printk(KERN_ERR "sdcardfs: fail to allocate path_buf in %s.\n", __func__); } else { obbpath_s = d_path(&di->lower_path, path_buf, PATH_MAX); if (d_unhashed(di->lower_path.dentry) || @@ -234,21 +216,16 @@ int is_base_obbpath(struct dentry *dentry) struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); spin_lock(&SDCARDFS_D(dentry)->lock); - /* DERIVED_LEGACY */ - if(parent_info->perm == PERM_LEGACY_PRE_ROOT && + if (sbi->options.multiuser) { + if(parent_info->perm == PERM_PRE_ROOT && + !strcasecmp(dentry->d_name.name, "obb")) { + ret = 1; + } + } else if (parent_info->perm == PERM_ANDROID && !strcasecmp(dentry->d_name.name, "obb")) { ret = 1; } - /* DERIVED_UNIFIED :/Android/obb is the base obbpath */ - else if (parent_info->perm == PERM_ANDROID && - !strcasecmp(dentry->d_name.name, "obb")) { - if((sbi->options.derive == DERIVE_UNIFIED - && parent_info->userid == 0)) { - ret = 1; - } - } spin_unlock(&SDCARDFS_D(dentry)->lock); - dput(parent); return ret; } @@ -272,8 +249,7 @@ int setup_obb_dentry(struct dentry *dentry, struct path *lower_path) if(!err) { /* the obbpath base has been found */ - printk(KERN_INFO "sdcardfs: " - "the sbi->obbpath is found\n"); + printk(KERN_INFO "sdcardfs: the sbi->obbpath is found\n"); pathcpy(lower_path, &obbpath); } else { /* if the sbi->obbpath is not available, we can optionally @@ -281,8 +257,7 @@ int setup_obb_dentry(struct dentry *dentry, struct path *lower_path) * but, the current implementation just returns an error * because the sdcard daemon also regards this case as * a lookup fail. 
*/ - printk(KERN_INFO "sdcardfs: " - "the sbi->obbpath is not available\n"); + printk(KERN_INFO "sdcardfs: the sbi->obbpath is not available\n"); } return err; } diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index f9c5eaafc619..c249fa982d3c 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -209,7 +209,6 @@ static int sdcardfs_open(struct inode *inode, struct file *file) struct dentry *parent = dget_parent(dentry); struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); const struct cred *saved_cred = NULL; - int has_rw; /* don't open unhashed/deleted files */ if (d_unhashed(dentry)) { @@ -217,11 +216,7 @@ static int sdcardfs_open(struct inode *inode, struct file *file) goto out_err; } - has_rw = get_caller_has_rw_locked(sbi->pkgl_id, sbi->options.derive); - - if(!check_caller_access_to_name(parent->d_inode, dentry->d_name.name, - sbi->options.derive, - open_flags_to_access_mode(file->f_flags), has_rw)) { + if(!check_caller_access_to_name(parent->d_inode, dentry->d_name.name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); @@ -257,8 +252,7 @@ static int sdcardfs_open(struct inode *inode, struct file *file) if (err) kfree(SDCARDFS_F(file)); else { - fsstack_copy_attr_all(inode, sdcardfs_lower_inode(inode)); - fix_derived_permission(inode); + sdcardfs_copy_and_fix_attrs(inode, sdcardfs_lower_inode(inode)); } out_revert_cred: diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 75c622bac2f5..2528da0d3ae1 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -55,11 +55,9 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, struct dentry *lower_dentry; struct dentry *lower_parent_dentry = NULL; struct path lower_path; - struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); const struct cred *saved_cred = NULL; - int has_rw = get_caller_has_rw_locked(sbi->pkgl_id, sbi->options.derive); - if(!check_caller_access_to_name(dir, dentry->d_name.name, sbi->options.derive, 1, has_rw)) { + if(!check_caller_access_to_name(dir, dentry->d_name.name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); @@ -80,7 +78,7 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, if (err) goto out; - err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path); + err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path, SDCARDFS_I(dir)->userid); if (err) goto out; fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir)); @@ -143,11 +141,9 @@ static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry) struct inode *lower_dir_inode = sdcardfs_lower_inode(dir); struct dentry *lower_dir_dentry; struct path lower_path; - struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); const struct cred *saved_cred = NULL; - int has_rw = get_caller_has_rw_locked(sbi->pkgl_id, sbi->options.derive); - if(!check_caller_access_to_name(dir, dentry->d_name.name, sbi->options.derive, 1, has_rw)) { + if(!check_caller_access_to_name(dir, dentry->d_name.name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); @@ -255,8 +251,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode int fullpath_namelen; int touch_err = 0; - int has_rw = get_caller_has_rw_locked(sbi->pkgl_id, sbi->options.derive); - if(!check_caller_access_to_name(dir, 
dentry->d_name.name, sbi->options.derive, 1, has_rw)) { + if(!check_caller_access_to_name(dir, dentry->d_name.name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); @@ -293,19 +288,19 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode if(err) { /* if the sbi->obbpath is not available, the lower_path won't be * changed by setup_obb_dentry() but the lower path is saved to - * its orig_path. this dentry will be revalidated later. + * its orig_path. this dentry will be revalidated later. * but now, the lower_path should be NULL */ sdcardfs_put_reset_lower_path(dentry); /* the newly created lower path which saved to its orig_path or * the lower_path is the base obbpath. - * therefore, an additional path_get is required */ + * therefore, an additional path_get is required */ path_get(&lower_path); } else make_nomedia_in_obb = 1; } - err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path); + err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path, pi->userid); if (err) goto out; @@ -314,7 +309,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode /* update number of links on parent directory */ set_nlink(dir, sdcardfs_lower_inode(dir)->i_nlink); - if ((sbi->options.derive == DERIVE_UNIFIED) && (!strcasecmp(dentry->d_name.name, "obb")) + if ((!sbi->options.multiuser) && (!strcasecmp(dentry->d_name.name, "obb")) && (pi->perm == PERM_ANDROID) && (pi->userid == 0)) make_nomedia_in_obb = 1; @@ -371,12 +366,9 @@ static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry) struct dentry *lower_dir_dentry; int err; struct path lower_path; - struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); const struct cred *saved_cred = NULL; - //char *path_s = NULL; - int has_rw = get_caller_has_rw_locked(sbi->pkgl_id, sbi->options.derive); - if(!check_caller_access_to_name(dir, dentry->d_name.name, sbi->options.derive, 1, has_rw)) { + if(!check_caller_access_to_name(dir, dentry->d_name.name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); @@ -461,14 +453,10 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct dentry *trap = NULL; struct dentry *new_parent = NULL; struct path lower_old_path, lower_new_path; - struct sdcardfs_sb_info *sbi = SDCARDFS_SB(old_dentry->d_sb); const struct cred *saved_cred = NULL; - int has_rw = get_caller_has_rw_locked(sbi->pkgl_id, sbi->options.derive); - if(!check_caller_access_to_name(old_dir, old_dentry->d_name.name, - sbi->options.derive, 1, has_rw) || - !check_caller_access_to_name(new_dir, new_dentry->d_name.name, - sbi->options.derive, 1, has_rw)) { + if(!check_caller_access_to_name(old_dir, old_dentry->d_name.name) || + !check_caller_access_to_name(new_dir, new_dentry->d_name.name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " new_dentry: %s, task:%s\n", __func__, new_dentry->d_name.name, current->comm); @@ -505,26 +493,31 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; /* Copy attrs from lower dir, but i_uid/i_gid */ - fsstack_copy_attr_all(new_dir, d_inode(lower_new_dir_dentry)); + sdcardfs_copy_and_fix_attrs(new_dir, d_inode(lower_new_dir_dentry)); fsstack_copy_inode_size(new_dir, d_inode(lower_new_dir_dentry)); - fix_derived_permission(new_dir); + if (new_dir != old_dir) { - 
fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry)); + sdcardfs_copy_and_fix_attrs(old_dir, d_inode(lower_old_dir_dentry)); fsstack_copy_inode_size(old_dir, d_inode(lower_old_dir_dentry)); - fix_derived_permission(old_dir); + /* update the derived permission of the old_dentry * with its new parent */ new_parent = dget_parent(new_dentry); if(new_parent) { if(d_inode(old_dentry)) { - get_derived_permission(new_parent, old_dentry); - fix_derived_permission(d_inode(old_dentry)); + update_derived_permission_lock(old_dentry); } dput(new_parent); } } - + /* At this point, not all dentry information has been moved, so + * we pass along new_dentry for the name.*/ + mutex_lock(&d_inode(old_dentry)->i_mutex); + get_derived_permission_new(new_dentry->d_parent, old_dentry, new_dentry); + fix_derived_permission(d_inode(old_dentry)); + get_derive_permissions_recursive(old_dentry); + mutex_unlock(&d_inode(old_dentry)->i_mutex); out: unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); dput(lower_old_dir_dentry); @@ -639,9 +632,7 @@ static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) struct inode *lower_inode; struct path lower_path; struct iattr lower_ia; - struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); struct dentry *parent; - int has_rw; inode = d_inode(dentry); @@ -655,10 +646,8 @@ static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) /* no vfs_XXX operations required, cred overriding will be skipped. wj*/ if (!err) { /* check the Android group ID */ - has_rw = get_caller_has_rw_locked(sbi->pkgl_id, sbi->options.derive); parent = dget_parent(dentry); - if(!check_caller_access_to_name(d_inode(parent), dentry->d_name.name, - sbi->options.derive, 1, has_rw)) { + if(!check_caller_access_to_name(d_inode(parent), dentry->d_name.name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); @@ -723,10 +712,8 @@ static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) if (err) goto out; - /* get attributes from the lower inode */ - fsstack_copy_attr_all(inode, lower_inode); - /* update derived permission of the upper inode */ - fix_derived_permission(inode); + /* get attributes from the lower inode and update derived permissions */ + sdcardfs_copy_and_fix_attrs(inode, lower_inode); /* * Not running fsstack_copy_inode_size(inode, lower_inode), because @@ -748,11 +735,9 @@ static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct inode *lower_inode; struct path lower_path; struct dentry *parent; - struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); parent = dget_parent(dentry); - if(!check_caller_access_to_name(d_inode(parent), dentry->d_name.name, - sbi->options.derive, 0, 0)) { + if(!check_caller_access_to_name(d_inode(parent), dentry->d_name.name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); @@ -767,13 +752,10 @@ static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, lower_dentry = lower_path.dentry; lower_inode = sdcardfs_lower_inode(inode); - fsstack_copy_attr_all(inode, lower_inode); + + sdcardfs_copy_and_fix_attrs(inode, lower_inode); fsstack_copy_inode_size(inode, lower_inode); - /* if the dentry has been moved from other location - * so, on this stage, its derived permission must be - * rechecked from its private field. 
- */ - fix_derived_permission(inode); + generic_fillattr(inode, stat); sdcardfs_put_lower_path(dentry, &lower_path); diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index a4b94df99f32..f80abcb6b467 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -64,10 +64,17 @@ int new_dentry_private_data(struct dentry *dentry) return 0; } -static int sdcardfs_inode_test(struct inode *inode, void *candidate_lower_inode) +struct inode_data { + struct inode *lower_inode; + userid_t id; +}; + +static int sdcardfs_inode_test(struct inode *inode, void *candidate_data/*void *candidate_lower_inode*/) { struct inode *current_lower_inode = sdcardfs_lower_inode(inode); - if (current_lower_inode == (struct inode *)candidate_lower_inode) + userid_t current_userid = SDCARDFS_I(inode)->userid; + if (current_lower_inode == ((struct inode_data *)candidate_data)->lower_inode && + current_userid == ((struct inode_data *)candidate_data)->id) return 1; /* found a match */ else return 0; /* no match */ @@ -79,12 +86,15 @@ static int sdcardfs_inode_set(struct inode *inode, void *lower_inode) return 0; } -struct inode *sdcardfs_iget(struct super_block *sb, struct inode *lower_inode) +struct inode *sdcardfs_iget(struct super_block *sb, struct inode *lower_inode, userid_t id) { struct sdcardfs_inode_info *info; + struct inode_data data; struct inode *inode; /* the new inode to return */ int err; + data.id = id; + data.lower_inode = lower_inode; inode = iget5_locked(sb, /* our superblock */ /* * hashval: we use inode number, but we can @@ -94,7 +104,7 @@ struct inode *sdcardfs_iget(struct super_block *sb, struct inode *lower_inode) lower_inode->i_ino, /* hashval */ sdcardfs_inode_test, /* inode comparison function */ sdcardfs_inode_set, /* inode init function */ - lower_inode); /* data passed to test+set fxns */ + &data); /* data passed to test+set fxns */ if (!inode) { err = -EACCES; iput(lower_inode); @@ -146,11 +156,9 @@ struct inode *sdcardfs_iget(struct super_block *sb, struct inode *lower_inode) lower_inode->i_rdev); /* all well, copy inode attributes */ - fsstack_copy_attr_all(inode, lower_inode); + sdcardfs_copy_and_fix_attrs(inode, lower_inode); fsstack_copy_inode_size(inode, lower_inode); - fix_derived_permission(inode); - unlock_new_inode(inode); return inode; } @@ -164,7 +172,7 @@ struct inode *sdcardfs_iget(struct super_block *sb, struct inode *lower_inode) * @lower_path: the lower path (caller does path_get/put) */ int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb, - struct path *lower_path) + struct path *lower_path, userid_t id) { int err = 0; struct inode *inode; @@ -186,14 +194,14 @@ int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb, */ /* inherit lower inode number for sdcardfs's inode */ - inode = sdcardfs_iget(sb, lower_inode); + inode = sdcardfs_iget(sb, lower_inode, id); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out; } d_add(dentry, inode); - update_derived_permission(dentry); + update_derived_permission_lock(dentry); out: return err; } @@ -205,7 +213,7 @@ out: * Fills in lower_parent_path with on success. 
*/ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, - unsigned int flags, struct path *lower_parent_path) + unsigned int flags, struct path *lower_parent_path, userid_t id) { int err = 0; struct vfsmount *lower_dir_mnt; @@ -266,7 +274,7 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, } sdcardfs_set_lower_path(dentry, &lower_path); - err = sdcardfs_interpose(dentry, dentry->d_sb, &lower_path); + err = sdcardfs_interpose(dentry, dentry->d_sb, &lower_path, id); if (err) /* path_put underlying path on error */ sdcardfs_put_reset_lower_path(dentry); goto out; @@ -328,13 +336,11 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, struct dentry *ret = NULL, *parent; struct path lower_parent_path; int err = 0; - struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); const struct cred *saved_cred = NULL; parent = dget_parent(dentry); - if(!check_caller_access_to_name(parent->d_inode, dentry->d_name.name, - sbi->options.derive, 0, 0)) { + if(!check_caller_access_to_name(parent->d_inode, dentry->d_name.name)) { ret = ERR_PTR(-EACCES); printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", @@ -354,7 +360,7 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, goto out; } - ret = __sdcardfs_lookup(dentry, flags, &lower_parent_path); + ret = __sdcardfs_lookup(dentry, flags, &lower_parent_path, SDCARDFS_I(dir)->userid); if (IS_ERR(ret)) { goto out; @@ -365,8 +371,10 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, fsstack_copy_attr_times(dentry->d_inode, sdcardfs_lower_inode(dentry->d_inode)); /* get drived permission */ + mutex_lock(&dentry->d_inode->i_mutex); get_derived_permission(parent, dentry); fix_derived_permission(dentry->d_inode); + mutex_unlock(&dentry->d_inode->i_mutex); } /* update parent directory's atime */ fsstack_copy_attr_atime(parent->d_inode, diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 9d04ae8ceb46..80aa355d801e 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -24,25 +24,27 @@ #include enum { - Opt_uid, + Opt_fsuid, + Opt_fsgid, Opt_gid, - Opt_wgid, Opt_debug, - Opt_split, - Opt_derive, Opt_lower_fs, + Opt_mask, + Opt_multiuser, // May need? 
+ Opt_userid, Opt_reserved_mb, Opt_err, }; static const match_table_t sdcardfs_tokens = { - {Opt_uid, "uid=%u"}, + {Opt_fsuid, "fsuid=%u"}, + {Opt_fsgid, "fsgid=%u"}, {Opt_gid, "gid=%u"}, - {Opt_wgid, "wgid=%u"}, {Opt_debug, "debug"}, - {Opt_split, "split"}, - {Opt_derive, "derive=%s"}, {Opt_lower_fs, "lower_fs=%s"}, + {Opt_mask, "mask=%u"}, + {Opt_userid, "userid=%d"}, + {Opt_multiuser, "multiuser"}, {Opt_reserved_mb, "reserved_mb=%u"}, {Opt_err, NULL} }; @@ -58,12 +60,10 @@ static int parse_options(struct super_block *sb, char *options, int silent, /* by default, we use AID_MEDIA_RW as uid, gid */ opts->fs_low_uid = AID_MEDIA_RW; opts->fs_low_gid = AID_MEDIA_RW; - /* by default, we use AID_SDCARD_RW as write_gid */ - opts->write_gid = AID_SDCARD_RW; - /* default permission policy - * (DERIVE_NONE | DERIVE_LEGACY | DERIVE_UNIFIED) */ - opts->derive = DERIVE_NONE; - opts->split_perms = 0; + opts->mask = 0; + opts->multiuser = false; + opts->fs_user_id = 0; + opts->gid = 0; /* by default, we use LOWER_FS_EXT4 as lower fs type */ opts->lower_fs = LOWER_FS_EXT4; /* by default, 0MB is reserved */ @@ -85,37 +85,33 @@ static int parse_options(struct super_block *sb, char *options, int silent, case Opt_debug: *debug = 1; break; - case Opt_uid: + case Opt_fsuid: if (match_int(&args[0], &option)) return 0; opts->fs_low_uid = option; break; - case Opt_gid: + case Opt_fsgid: if (match_int(&args[0], &option)) return 0; opts->fs_low_gid = option; break; - case Opt_wgid: + case Opt_gid: if (match_int(&args[0], &option)) return 0; - opts->write_gid = option; + opts->gid = option; break; - case Opt_split: - opts->split_perms=1; + case Opt_userid: + if (match_int(&args[0], &option)) + return 0; + opts->fs_user_id = option; break; - case Opt_derive: - string_option = match_strdup(&args[0]); - if (!strcmp("none", string_option)) { - opts->derive = DERIVE_NONE; - } else if (!strcmp("legacy", string_option)) { - opts->derive = DERIVE_LEGACY; - } else if (!strcmp("unified", string_option)) { - opts->derive = DERIVE_UNIFIED; - } else { - kfree(string_option); - goto invalid_option; - } - kfree(string_option); + case Opt_mask: + if (match_int(&args[0], &option)) + return 0; + opts->mask = option; + break; + case Opt_multiuser: + opts->multiuser = true; break; case Opt_lower_fs: string_option = match_strdup(&args[0]); @@ -184,6 +180,11 @@ static struct dentry *sdcardfs_d_alloc_root(struct super_block *sb) } #endif +DEFINE_MUTEX(sdcardfs_super_list_lock); +LIST_HEAD(sdcardfs_super_list); +EXPORT_SYMBOL_GPL(sdcardfs_super_list_lock); +EXPORT_SYMBOL_GPL(sdcardfs_super_list); + /* * There is no need to lock the sdcardfs_super_info's rwsem as there is no * way anyone can have a reference to the superblock at this point in time. 
@@ -196,7 +197,6 @@ static int sdcardfs_read_super(struct super_block *sb, const char *dev_name, struct super_block *lower_sb; struct path lower_path; struct sdcardfs_sb_info *sb_info; - void *pkgl_id; struct inode *inode; printk(KERN_INFO "sdcardfs version 2.0\n"); @@ -215,8 +215,7 @@ static int sdcardfs_read_super(struct super_block *sb, const char *dev_name, err = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &lower_path); if (err) { - printk(KERN_ERR "sdcardfs: error accessing " - "lower directory '%s'\n", dev_name); + printk(KERN_ERR "sdcardfs: error accessing lower directory '%s'\n", dev_name); goto out; } @@ -229,7 +228,6 @@ static int sdcardfs_read_super(struct super_block *sb, const char *dev_name, } sb_info = sb->s_fs_info; - /* parse options */ err = parse_options(sb, raw_data, silent, &debug, &sb_info->options); if (err) { @@ -237,14 +235,6 @@ static int sdcardfs_read_super(struct super_block *sb, const char *dev_name, goto out_freesbi; } - if (sb_info->options.derive != DERIVE_NONE) { - pkgl_id = packagelist_create(sb_info->options.write_gid); - if(IS_ERR(pkgl_id)) - goto out_freesbi; - else - sb_info->pkgl_id = pkgl_id; - } - /* set the lower superblock field of upper superblock */ lower_sb = lower_path.dentry->d_sb; atomic_inc(&lower_sb->s_active); @@ -263,7 +253,7 @@ static int sdcardfs_read_super(struct super_block *sb, const char *dev_name, sb->s_op = &sdcardfs_sops; /* get a new inode and allocate our root dentry */ - inode = sdcardfs_iget(sb, lower_path.dentry->d_inode); + inode = sdcardfs_iget(sb, lower_path.dentry->d_inode, 0); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_sput; @@ -292,41 +282,22 @@ static int sdcardfs_read_super(struct super_block *sb, const char *dev_name, d_rehash(sb->s_root); /* setup permission policy */ - switch(sb_info->options.derive) { - case DERIVE_NONE: - setup_derived_state(sb->s_root->d_inode, - PERM_ROOT, 0, AID_ROOT, AID_SDCARD_RW, 00775); - sb_info->obbpath_s = NULL; - break; - case DERIVE_LEGACY: - /* Legacy behavior used to support internal multiuser layout which - * places user_id at the top directory level, with the actual roots - * just below that. Shared OBB path is also at top level. */ - setup_derived_state(sb->s_root->d_inode, - PERM_LEGACY_PRE_ROOT, 0, AID_ROOT, AID_SDCARD_R, 00771); - /* initialize the obbpath string and lookup the path - * sb_info->obb_path will be deactivated by path_put - * on sdcardfs_put_super */ - sb_info->obbpath_s = kzalloc(PATH_MAX, GFP_KERNEL); - snprintf(sb_info->obbpath_s, PATH_MAX, "%s/obb", dev_name); - err = prepare_dir(sb_info->obbpath_s, + sb_info->obbpath_s = kzalloc(PATH_MAX, GFP_KERNEL); + mutex_lock(&sdcardfs_super_list_lock); + if(sb_info->options.multiuser) { + setup_derived_state(sb->s_root->d_inode, PERM_PRE_ROOT, sb_info->options.fs_user_id, AID_ROOT, false); + snprintf(sb_info->obbpath_s, PATH_MAX, "%s/obb", dev_name); + /*err = prepare_dir(sb_info->obbpath_s, sb_info->options.fs_low_uid, - sb_info->options.fs_low_gid, 00755); - if(err) - printk(KERN_ERR "sdcardfs: %s: %d, error on creating %s\n", - __func__,__LINE__, sb_info->obbpath_s); - break; - case DERIVE_UNIFIED: - /* Unified multiuser layout which places secondary user_id under - * /Android/user and shared OBB path under /Android/obb. 
*/ - setup_derived_state(sb->s_root->d_inode, - PERM_ROOT, 0, AID_ROOT, AID_SDCARD_R, 00771); - - sb_info->obbpath_s = kzalloc(PATH_MAX, GFP_KERNEL); - snprintf(sb_info->obbpath_s, PATH_MAX, "%s/Android/obb", dev_name); - break; + sb_info->options.fs_low_gid, 00755);*/ + } else { + setup_derived_state(sb->s_root->d_inode, PERM_ROOT, sb_info->options.fs_low_uid, AID_ROOT, false); + snprintf(sb_info->obbpath_s, PATH_MAX, "%s/Android/obb", dev_name); } fix_derived_permission(sb->s_root->d_inode); + sb_info->sb = sb; + list_add(&sb_info->list, &sdcardfs_super_list); + mutex_unlock(&sdcardfs_super_list_lock); if (!silent) printk(KERN_INFO "sdcardfs: mounted on top of %s type %s\n", @@ -341,7 +312,6 @@ out_iput: out_sput: /* drop refs we took earlier */ atomic_dec(&lower_sb->s_active); - packagelist_destroy(sb_info->pkgl_id); out_freesbi: kfree(SDCARDFS_SB(sb)); sb->s_fs_info = NULL; @@ -386,11 +356,22 @@ struct dentry *sdcardfs_mount(struct file_system_type *fs_type, int flags, raw_data, sdcardfs_read_super); } +void sdcardfs_kill_sb(struct super_block *sb) { + struct sdcardfs_sb_info *sbi; + if (sb->s_magic == SDCARDFS_SUPER_MAGIC) { + sbi = SDCARDFS_SB(sb); + mutex_lock(&sdcardfs_super_list_lock); + list_del(&sbi->list); + mutex_unlock(&sdcardfs_super_list_lock); + } + generic_shutdown_super(sb); +} + static struct file_system_type sdcardfs_fs_type = { .owner = THIS_MODULE, .name = SDCARDFS_NAME, .mount = sdcardfs_mount, - .kill_sb = generic_shutdown_super, + .kill_sb = sdcardfs_kill_sb, .fs_flags = 0, }; diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index f11591da141d..ba3478d94107 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -19,13 +19,16 @@ */ #include "sdcardfs.h" -#include "strtok.h" #include -#include -#include -#include #include + +#include +#include +#include + +#include + #define STRING_BUF_SIZE (512) struct hashtable_entry { @@ -34,25 +37,20 @@ struct hashtable_entry { unsigned int value; }; -struct packagelist_data { - DECLARE_HASHTABLE(package_to_appid,8); - DECLARE_HASHTABLE(appid_with_rw,7); - struct mutex hashtable_lock; - struct task_struct *thread_id; - gid_t write_gid; - char *strtok_last; - char read_buf[STRING_BUF_SIZE]; - char event_buf[STRING_BUF_SIZE]; - char app_name_buf[STRING_BUF_SIZE]; - char gids_buf[STRING_BUF_SIZE]; +struct sb_list { + struct super_block *sb; + struct list_head list; }; -static struct kmem_cache *hashtable_entry_cachep; +struct packagelist_data { + DECLARE_HASHTABLE(package_to_appid,8); + struct mutex hashtable_lock; -/* Path to system-provided mapping of package name to appIds */ -static const char* const kpackageslist_file = "/data/system/packages.list"; -/* Supplementary groups to execute with */ -static const gid_t kgroups[1] = { AID_PACKAGE_INFO }; +}; + +static struct packagelist_data *pkgl_data_all; + +static struct kmem_cache *hashtable_entry_cachep; static unsigned int str_hash(const char *key) { int i; @@ -66,62 +64,29 @@ static unsigned int str_hash(const char *key) { return h; } -static int contain_appid_key(struct packagelist_data *pkgl_dat, unsigned int appid) { - struct hashtable_entry *hash_cur; - - hash_for_each_possible(pkgl_dat->appid_with_rw, hash_cur, hlist, appid) - if ((void *)(uintptr_t)appid == hash_cur->key) - return 1; - - return 0; -} - -/* Return if the calling UID holds sdcard_rw. 
*/ -int get_caller_has_rw_locked(void *pkgl_id, derive_t derive) { - struct packagelist_data *pkgl_dat = (struct packagelist_data *)pkgl_id; - appid_t appid; - int ret; - - /* No additional permissions enforcement */ - if (derive == DERIVE_NONE) { - return 1; - } - - appid = multiuser_get_app_id(from_kuid(&init_user_ns, current_fsuid())); - mutex_lock(&pkgl_dat->hashtable_lock); - ret = contain_appid_key(pkgl_dat, appid); - mutex_unlock(&pkgl_dat->hashtable_lock); - return ret; -} - appid_t get_appid(void *pkgl_id, const char *app_name) { - struct packagelist_data *pkgl_dat = (struct packagelist_data *)pkgl_id; + struct packagelist_data *pkgl_dat = pkgl_data_all; struct hashtable_entry *hash_cur; unsigned int hash = str_hash(app_name); appid_t ret_id; - //printk(KERN_INFO "sdcardfs: %s: %s, %u\n", __func__, (char *)app_name, hash); mutex_lock(&pkgl_dat->hashtable_lock); hash_for_each_possible(pkgl_dat->package_to_appid, hash_cur, hlist, hash) { - //printk(KERN_INFO "sdcardfs: %s: %s\n", __func__, (char *)hash_cur->key); if (!strcasecmp(app_name, hash_cur->key)) { ret_id = (appid_t)hash_cur->value; mutex_unlock(&pkgl_dat->hashtable_lock); - //printk(KERN_INFO "=> app_id: %d\n", (int)ret_id); return ret_id; } } mutex_unlock(&pkgl_dat->hashtable_lock); - //printk(KERN_INFO "=> app_id: %d\n", 0); return 0; } /* Kernel has already enforced everything we returned through * derive_permissions_locked(), so this is used to lock down access * even further, such as enforcing that apps hold sdcard_rw. */ -int check_caller_access_to_name(struct inode *parent_node, const char* name, - derive_t derive, int w_ok, int has_rw) { +int check_caller_access_to_name(struct inode *parent_node, const char* name) { /* Always block security-sensitive files at root */ if (parent_node && SDCARDFS_I(parent_node)->perm == PERM_ROOT) { @@ -132,28 +97,12 @@ int check_caller_access_to_name(struct inode *parent_node, const char* name, } } - /* No additional permissions enforcement */ - if (derive == DERIVE_NONE) { - return 1; - } - /* Root always has access; access for any other UIDs should always * be controlled through packages.list. */ if (from_kuid(&init_user_ns, current_fsuid()) == 0) { return 1; } - /* If asking to write, verify that caller either owns the - * parent or holds sdcard_rw. 
*/ - if (w_ok) { - if (parent_node && - (from_kuid(&init_user_ns, current_fsuid()) == - SDCARDFS_I(parent_node)->d_uid)) { - return 1; - } - return has_rw; - } - /* No extra permissions to enforce */ return 1; } @@ -171,14 +120,13 @@ int open_flags_to_access_mode(int open_flags) { } } -static int insert_str_to_int(struct packagelist_data *pkgl_dat, char *key, +static int insert_str_to_int_lock(struct packagelist_data *pkgl_dat, char *key, unsigned int value) { struct hashtable_entry *hash_cur; struct hashtable_entry *new_entry; unsigned int hash = str_hash(key); - //printk(KERN_INFO "sdcardfs: %s: %s: %d, %u\n", __func__, (char *)key, value, hash); hash_for_each_possible(pkgl_dat->package_to_appid, hash_cur, hlist, hash) { if (!strcasecmp(key, hash_cur->key)) { hash_cur->value = value; @@ -194,247 +142,277 @@ static int insert_str_to_int(struct packagelist_data *pkgl_dat, char *key, return 0; } -static void remove_str_to_int(struct hashtable_entry *h_entry) { - //printk(KERN_INFO "sdcardfs: %s: %s: %d\n", __func__, (char *)h_entry->key, h_entry->value); - kfree(h_entry->key); - kmem_cache_free(hashtable_entry_cachep, h_entry); +static void fixup_perms(struct super_block *sb) { + if (sb && sb->s_magic == SDCARDFS_SUPER_MAGIC) { + mutex_lock(&sb->s_root->d_inode->i_mutex); + get_derive_permissions_recursive(sb->s_root); + mutex_unlock(&sb->s_root->d_inode->i_mutex); + } } -static int insert_int_to_null(struct packagelist_data *pkgl_dat, unsigned int key, - unsigned int value) -{ - struct hashtable_entry *hash_cur; - struct hashtable_entry *new_entry; +static int insert_str_to_int(struct packagelist_data *pkgl_dat, char *key, + unsigned int value) { + int ret; + struct sdcardfs_sb_info *sbinfo; + mutex_lock(&sdcardfs_super_list_lock); + mutex_lock(&pkgl_dat->hashtable_lock); + ret = insert_str_to_int_lock(pkgl_dat, key, value); + mutex_unlock(&pkgl_dat->hashtable_lock); - //printk(KERN_INFO "sdcardfs: %s: %d: %d\n", __func__, (int)key, value); - hash_for_each_possible(pkgl_dat->appid_with_rw, hash_cur, hlist, key) { - if ((void *)(uintptr_t)key == hash_cur->key) { - hash_cur->value = value; - return 0; + list_for_each_entry(sbinfo, &sdcardfs_super_list, list) { + if (sbinfo) { + fixup_perms(sbinfo->sb); } } - new_entry = kmem_cache_alloc(hashtable_entry_cachep, GFP_KERNEL); - if (!new_entry) - return -ENOMEM; - new_entry->key = (void *)(uintptr_t)key; - new_entry->value = value; - hash_add(pkgl_dat->appid_with_rw, &new_entry->hlist, key); - return 0; + mutex_unlock(&sdcardfs_super_list_lock); + return ret; } -static void remove_int_to_null(struct hashtable_entry *h_entry) { - //printk(KERN_INFO "sdcardfs: %s: %d: %d\n", __func__, (int)h_entry->key, h_entry->value); +static void remove_str_to_int_lock(struct hashtable_entry *h_entry) { + kfree(h_entry->key); + hash_del(&h_entry->hlist); kmem_cache_free(hashtable_entry_cachep, h_entry); } +static void remove_str_to_int(struct packagelist_data *pkgl_dat, const char *key) +{ + struct sdcardfs_sb_info *sbinfo; + struct hashtable_entry *hash_cur; + unsigned int hash = str_hash(key); + mutex_lock(&sdcardfs_super_list_lock); + mutex_lock(&pkgl_dat->hashtable_lock); + hash_for_each_possible(pkgl_dat->package_to_appid, hash_cur, hlist, hash) { + if (!strcasecmp(key, hash_cur->key)) { + remove_str_to_int_lock(hash_cur); + break; + } + } + mutex_unlock(&pkgl_dat->hashtable_lock); + list_for_each_entry(sbinfo, &sdcardfs_super_list, list) { + if (sbinfo) { + fixup_perms(sbinfo->sb); + } + } + mutex_unlock(&sdcardfs_super_list_lock); + return; +} + static 
void remove_all_hashentrys(struct packagelist_data *pkgl_dat) { struct hashtable_entry *hash_cur; struct hlist_node *h_t; int i; - - hash_for_each_safe(pkgl_dat->package_to_appid, i, h_t, hash_cur, hlist) - remove_str_to_int(hash_cur); - hash_for_each_safe(pkgl_dat->appid_with_rw, i, h_t, hash_cur, hlist) - remove_int_to_null(hash_cur); - - hash_init(pkgl_dat->package_to_appid); - hash_init(pkgl_dat->appid_with_rw); -} - -static int read_package_list(struct packagelist_data *pkgl_dat) { - int ret; - int fd; - int read_amount; - - printk(KERN_INFO "sdcardfs: read_package_list\n"); - mutex_lock(&pkgl_dat->hashtable_lock); - - remove_all_hashentrys(pkgl_dat); - - fd = sys_open(kpackageslist_file, O_RDONLY, 0); - if (fd < 0) { - printk(KERN_ERR "sdcardfs: failed to open package list\n"); - mutex_unlock(&pkgl_dat->hashtable_lock); - return fd; - } - - while ((read_amount = sys_read(fd, pkgl_dat->read_buf, - sizeof(pkgl_dat->read_buf))) > 0) { - unsigned int appid; - char *token; - int one_line_len = 0; - int additional_read; - unsigned long ret_gid; - - while (one_line_len < read_amount) { - if (pkgl_dat->read_buf[one_line_len] == '\n') { - one_line_len++; - break; - } - one_line_len++; - } - additional_read = read_amount - one_line_len; - if (additional_read > 0) - sys_lseek(fd, -additional_read, SEEK_CUR); - - if (sscanf(pkgl_dat->read_buf, "%s %u %*d %*s %*s %s", - pkgl_dat->app_name_buf, &appid, - pkgl_dat->gids_buf) == 3) { - ret = insert_str_to_int(pkgl_dat, pkgl_dat->app_name_buf, appid); - if (ret) { - sys_close(fd); - mutex_unlock(&pkgl_dat->hashtable_lock); - return ret; - } - - token = strtok_r(pkgl_dat->gids_buf, ",", &pkgl_dat->strtok_last); - while (token != NULL) { - if (!kstrtoul(token, 10, &ret_gid) && - (ret_gid == pkgl_dat->write_gid)) { - ret = insert_int_to_null(pkgl_dat, appid, 1); - if (ret) { - sys_close(fd); - mutex_unlock(&pkgl_dat->hashtable_lock); - return ret; - } - break; - } - token = strtok_r(NULL, ",", &pkgl_dat->strtok_last); - } - } - } - - sys_close(fd); + hash_for_each_safe(pkgl_dat->package_to_appid, i, h_t, hash_cur, hlist) + remove_str_to_int_lock(hash_cur); mutex_unlock(&pkgl_dat->hashtable_lock); - return 0; + hash_init(pkgl_dat->package_to_appid); } -static int packagelist_reader(void *thread_data) -{ - struct packagelist_data *pkgl_dat = (struct packagelist_data *)thread_data; - struct inotify_event *event; - bool active = false; - int event_pos; - int event_size; - int res = 0; - int nfd; - - allow_signal(SIGINT); - - nfd = sys_inotify_init(); - if (nfd < 0) { - printk(KERN_ERR "sdcardfs: inotify_init failed: %d\n", nfd); - return nfd; - } - - while (!kthread_should_stop()) { - if (signal_pending(current)) { - ssleep(1); - continue; - } - - if (!active) { - res = sys_inotify_add_watch(nfd, kpackageslist_file, IN_DELETE_SELF); - if (res < 0) { - if (res == -ENOENT || res == -EACCES) { - /* Framework may not have created yet, sleep and retry */ - printk(KERN_ERR "sdcardfs: missing packages.list; retrying\n"); - ssleep(2); - printk(KERN_ERR "sdcardfs: missing packages.list_end; retrying\n"); - continue; - } else { - printk(KERN_ERR "sdcardfs: inotify_add_watch failed: %d\n", res); - goto interruptable_sleep; - } - } - /* Watch above will tell us about any future changes, so - * read the current state. 
*/ - res = read_package_list(pkgl_dat); - if (res) { - printk(KERN_ERR "sdcardfs: read_package_list failed: %d\n", res); - goto interruptable_sleep; - } - active = true; - } - - event_pos = 0; - res = sys_read(nfd, pkgl_dat->event_buf, sizeof(pkgl_dat->event_buf)); - if (res < (int) sizeof(*event)) { - if (res == -EINTR) - continue; - printk(KERN_ERR "sdcardfs: failed to read inotify event: %d\n", res); - goto interruptable_sleep; - } - - while (res >= (int) sizeof(*event)) { - event = (struct inotify_event *) (pkgl_dat->event_buf + event_pos); - - printk(KERN_INFO "sdcardfs: inotify event: %08x\n", event->mask); - if ((event->mask & IN_IGNORED) == IN_IGNORED) { - /* Previously watched file was deleted, probably due to move - * that swapped in new data; re-arm the watch and read. */ - active = false; - } - - event_size = sizeof(*event) + event->len; - res -= event_size; - event_pos += event_size; - } - continue; - -interruptable_sleep: - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - } - flush_signals(current); - sys_close(nfd); - return res; -} - -void * packagelist_create(gid_t write_gid) +static struct packagelist_data * packagelist_create(void) { struct packagelist_data *pkgl_dat; - struct task_struct *packagelist_thread; pkgl_dat = kmalloc(sizeof(*pkgl_dat), GFP_KERNEL | __GFP_ZERO); if (!pkgl_dat) { - printk(KERN_ERR "sdcardfs: creating kthread failed\n"); + printk(KERN_ERR "sdcardfs: Failed to create hash\n"); return ERR_PTR(-ENOMEM); } mutex_init(&pkgl_dat->hashtable_lock); hash_init(pkgl_dat->package_to_appid); - hash_init(pkgl_dat->appid_with_rw); - pkgl_dat->write_gid = write_gid; - packagelist_thread = kthread_run(packagelist_reader, (void *)pkgl_dat, "pkgld"); - if (IS_ERR(packagelist_thread)) { - printk(KERN_ERR "sdcardfs: creating kthread failed\n"); - kfree(pkgl_dat); - return packagelist_thread; - } - pkgl_dat->thread_id = packagelist_thread; - - printk(KERN_INFO "sdcardfs: created packagelist pkgld/%d\n", - (int)pkgl_dat->thread_id->pid); - - return (void *)pkgl_dat; + return pkgl_dat; } -void packagelist_destroy(void *pkgl_id) +static void packagelist_destroy(struct packagelist_data *pkgl_dat) { - struct packagelist_data *pkgl_dat = (struct packagelist_data *)pkgl_id; - pid_t pkgl_pid = pkgl_dat->thread_id->pid; - - force_sig_info(SIGINT, SEND_SIG_PRIV, pkgl_dat->thread_id); - kthread_stop(pkgl_dat->thread_id); remove_all_hashentrys(pkgl_dat); - printk(KERN_INFO "sdcardfs: destroyed packagelist pkgld/%d\n", (int)pkgl_pid); + printk(KERN_INFO "sdcardfs: destroyed packagelist pkgld\n"); kfree(pkgl_dat); } +struct package_appid { + struct config_item item; + int add_pid; +}; + +static inline struct package_appid *to_package_appid(struct config_item *item) +{ + return item ? 
container_of(item, struct package_appid, item) : NULL; +} + +static ssize_t package_appid_attr_show(struct config_item *item, + char *page) +{ + ssize_t count; + count = sprintf(page, "%d\n", get_appid(pkgl_data_all, item->ci_name)); + return count; +} + +static ssize_t package_appid_attr_store(struct config_item *item, + const char *page, size_t count) +{ + struct package_appid *package_appid = to_package_appid(item); + unsigned long tmp; + char *p = (char *) page; + int ret; + + tmp = simple_strtoul(p, &p, 10); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp > INT_MAX) + return -ERANGE; + ret = insert_str_to_int(pkgl_data_all, item->ci_name, (unsigned int)tmp); + package_appid->add_pid = tmp; + if (ret) + return ret; + + return count; +} + +static struct configfs_attribute package_appid_attr_add_pid = { + .ca_owner = THIS_MODULE, + .ca_name = "appid", + .ca_mode = S_IRUGO | S_IWUGO, + .show = package_appid_attr_show, + .store = package_appid_attr_store, +}; + +static struct configfs_attribute *package_appid_attrs[] = { + &package_appid_attr_add_pid, + NULL, +}; + +static void package_appid_release(struct config_item *item) +{ + printk(KERN_INFO "sdcardfs: removing %s\n", item->ci_dentry->d_name.name); + /* item->ci_name is freed already, so we rely on the dentry */ + remove_str_to_int(pkgl_data_all, item->ci_dentry->d_name.name); + kfree(to_package_appid(item)); +} + +static struct configfs_item_operations package_appid_item_ops = { + .release = package_appid_release, +}; + +static struct config_item_type package_appid_type = { + .ct_item_ops = &package_appid_item_ops, + .ct_attrs = package_appid_attrs, + .ct_owner = THIS_MODULE, +}; + + +struct sdcardfs_packages { + struct config_group group; +}; + +static inline struct sdcardfs_packages *to_sdcardfs_packages(struct config_item *item) +{ + return item ? container_of(to_config_group(item), struct sdcardfs_packages, group) : NULL; +} + +static struct config_item *sdcardfs_packages_make_item(struct config_group *group, const char *name) +{ + struct package_appid *package_appid; + + package_appid = kzalloc(sizeof(struct package_appid), GFP_KERNEL); + if (!package_appid) + return ERR_PTR(-ENOMEM); + + config_item_init_type_name(&package_appid->item, name, + &package_appid_type); + + package_appid->add_pid = 0; + + return &package_appid->item; +} + +static ssize_t packages_attr_show(struct config_item *item, + char *page) +{ + struct hashtable_entry *hash_cur; + struct hlist_node *h_t; + int i; + int count = 0; + mutex_lock(&pkgl_data_all->hashtable_lock); + hash_for_each_safe(pkgl_data_all->package_to_appid, i, h_t, hash_cur, hlist) + count += snprintf(page + count, PAGE_SIZE - count, "%s %d\n", (char *)hash_cur->key, hash_cur->value); + mutex_unlock(&pkgl_data_all->hashtable_lock); + + + return count; +} + +static struct configfs_attribute sdcardfs_packages_attr_description = { + .ca_owner = THIS_MODULE, + .ca_name = "packages_gid.list", + .ca_mode = S_IRUGO, + .show = packages_attr_show, +}; + +static struct configfs_attribute *sdcardfs_packages_attrs[] = { + &sdcardfs_packages_attr_description, + NULL, +}; + +static void sdcardfs_packages_release(struct config_item *item) +{ + + printk(KERN_INFO "sdcardfs: destroyed something?\n"); + kfree(to_sdcardfs_packages(item)); +} + +static struct configfs_item_operations sdcardfs_packages_item_ops = { + .release = sdcardfs_packages_release, +}; + +/* + * Note that, since no extra work is required on ->drop_item(), + * no ->drop_item() is provided. 
+ */ +static struct configfs_group_operations sdcardfs_packages_group_ops = { + .make_item = sdcardfs_packages_make_item, +}; + +static struct config_item_type sdcardfs_packages_type = { + .ct_item_ops = &sdcardfs_packages_item_ops, + .ct_group_ops = &sdcardfs_packages_group_ops, + .ct_attrs = sdcardfs_packages_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem sdcardfs_packages_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "sdcardfs", + .ci_type = &sdcardfs_packages_type, + }, + }, +}; + +static int __init configfs_sdcardfs_init(void) +{ + int ret; + struct configfs_subsystem *subsys = &sdcardfs_packages_subsys; + + config_group_init(&subsys->su_group); + mutex_init(&subsys->su_mutex); + ret = configfs_register_subsystem(subsys); + if (ret) { + printk(KERN_ERR "Error %d while registering subsystem %s\n", + ret, + subsys->su_group.cg_item.ci_namebuf); + } + return ret; +} + +static void __exit configfs_sdcardfs_exit(void) +{ + configfs_unregister_subsystem(&sdcardfs_packages_subsys); +} + int packagelist_init(void) { hashtable_entry_cachep = @@ -445,13 +423,15 @@ int packagelist_init(void) return -ENOMEM; } + pkgl_data_all = packagelist_create(); + configfs_sdcardfs_init(); return 0; } void packagelist_exit(void) { + configfs_sdcardfs_exit(); + packagelist_destroy(pkgl_data_all); if (hashtable_entry_cachep) kmem_cache_destroy(hashtable_entry_cachep); } - - diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 51f6c7912584..1b85f4e70324 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -42,6 +42,7 @@ #include #include #include +#include #include "multiuser.h" /* the file system name */ @@ -70,10 +71,11 @@ #define fix_derived_permission(x) \ do { \ (x)->i_uid = make_kuid(&init_user_ns, SDCARDFS_I(x)->d_uid); \ - (x)->i_gid = make_kgid(&init_user_ns, SDCARDFS_I(x)->d_gid); \ - (x)->i_mode = ((x)->i_mode & S_IFMT) | SDCARDFS_I(x)->d_mode;\ + (x)->i_gid = make_kgid(&init_user_ns, get_gid(SDCARDFS_I(x))); \ + (x)->i_mode = ((x)->i_mode & S_IFMT) | get_mode(SDCARDFS_I(x));\ } while (0) + /* OVERRIDE_CRED() and REVERT_CRED() * OVERRID_CRED() * backup original task->cred @@ -99,35 +101,28 @@ (int)current->cred->fsuid, \ (int)current->cred->fsgid); -/* Android 4.4 support */ +/* Android 5.0 support */ /* Permission mode for a specific node. Controls how file permissions * are derived for children nodes. */ typedef enum { - /* Nothing special; this node should just inherit from its parent. */ - PERM_INHERIT, - /* This node is one level above a normal root; used for legacy layouts - * which use the first level to represent user_id. */ - PERM_LEGACY_PRE_ROOT, - /* This node is "/" */ - PERM_ROOT, - /* This node is "/Android" */ - PERM_ANDROID, - /* This node is "/Android/data" */ - PERM_ANDROID_DATA, - /* This node is "/Android/obb" */ - PERM_ANDROID_OBB, - /* This node is "/Android/user" */ - PERM_ANDROID_USER, + /* Nothing special; this node should just inherit from its parent. */ + PERM_INHERIT, + /* This node is one level above a normal root; used for legacy layouts + * which use the first level to represent user_id. 
*/ + PERM_PRE_ROOT, + /* This node is "/" */ + PERM_ROOT, + /* This node is "/Android" */ + PERM_ANDROID, + /* This node is "/Android/data" */ + PERM_ANDROID_DATA, + /* This node is "/Android/obb" */ + PERM_ANDROID_OBB, + /* This node is "/Android/media" */ + PERM_ANDROID_MEDIA, } perm_t; -/* Permissions structure to derive */ -typedef enum { - DERIVE_NONE, - DERIVE_LEGACY, - DERIVE_UNIFIED, -} derive_t; - typedef enum { LOWER_FS_EXT4, LOWER_FS_FAT, @@ -161,9 +156,9 @@ extern void free_dentry_private_data(struct dentry *dentry); extern struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); extern struct inode *sdcardfs_iget(struct super_block *sb, - struct inode *lower_inode); + struct inode *lower_inode, userid_t id); extern int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb, - struct path *lower_path); + struct path *lower_path, userid_t id); /* file private data */ struct sdcardfs_file_info { @@ -174,18 +169,16 @@ struct sdcardfs_file_info { /* sdcardfs inode data in memory */ struct sdcardfs_inode_info { struct inode *lower_inode; - /* state derived based on current position in hierachy - * caution: d_mode does not include file types - */ + /* state derived based on current position in hierachy */ perm_t perm; userid_t userid; uid_t d_uid; - gid_t d_gid; - mode_t d_mode; + bool under_android; struct inode vfs_inode; }; + /* sdcardfs dentry data in memory */ struct sdcardfs_dentry_info { spinlock_t lock; /* protects lower_path */ @@ -196,15 +189,17 @@ struct sdcardfs_dentry_info { struct sdcardfs_mount_options { uid_t fs_low_uid; gid_t fs_low_gid; - gid_t write_gid; - int split_perms; - derive_t derive; + userid_t fs_user_id; + gid_t gid; lower_fs_t lower_fs; + mode_t mask; + bool multiuser; unsigned int reserved_mb; }; /* sdcardfs super-block data in memory */ struct sdcardfs_sb_info { + struct super_block *sb; struct super_block *lower_sb; /* derived perm policy : some of options have been added * to sdcardfs_mount_options (Android 4.4 support) */ @@ -213,6 +208,7 @@ struct sdcardfs_sb_info { char *obbpath_s; struct path obbpath; void *pkgl_id; + struct list_head list; }; /* @@ -331,6 +327,44 @@ static inline void sdcardfs_put_reset_##pname(const struct dentry *dent) \ SDCARDFS_DENT_FUNC(lower_path) SDCARDFS_DENT_FUNC(orig_path) +static inline int get_gid(struct sdcardfs_inode_info *info) { + struct sdcardfs_sb_info *sb_info = SDCARDFS_SB(info->vfs_inode.i_sb); + if (sb_info->options.gid == AID_SDCARD_RW) { + /* As an optimization, certain trusted system components only run + * as owner but operate across all users. Since we're now handing + * out the sdcard_rw GID only to trusted apps, we're okay relaxing + * the user boundary enforcement for the default view. The UIDs + * assigned to app directories are still multiuser aware. */ + return AID_SDCARD_RW; + } else { + return multiuser_get_uid(info->userid, sb_info->options.gid); + } +} +static inline int get_mode(struct sdcardfs_inode_info *info) { + int owner_mode; + int filtered_mode; + struct sdcardfs_sb_info *sb_info = SDCARDFS_SB(info->vfs_inode.i_sb); + int visible_mode = 0775 & ~sb_info->options.mask; + + if (info->perm == PERM_PRE_ROOT) { + /* Top of multi-user view should always be visible to ensure + * secondary users can traverse inside. */ + visible_mode = 0711; + } else if (info->under_android) { + /* Block "other" access to Android directories, since only apps + * belonging to a specific user should be in there; we still + * leave +x open for the default view. 
*/ + if (sb_info->options.gid == AID_SDCARD_RW) { + visible_mode = visible_mode & ~0006; + } else { + visible_mode = visible_mode & ~0007; + } + } + owner_mode = info->lower_inode->i_mode & 0700; + filtered_mode = visible_mode & (owner_mode | (owner_mode >> 3) | (owner_mode >> 6)); + return filtered_mode; +} + static inline int has_graft_path(const struct dentry *dent) { int ret = 0; @@ -364,22 +398,24 @@ static inline void sdcardfs_put_real_lower(const struct dentry *dent, sdcardfs_put_lower_path(dent, real_lower); } +extern struct mutex sdcardfs_super_list_lock; +extern struct list_head sdcardfs_super_list; + /* for packagelist.c */ -extern int get_caller_has_rw_locked(void *pkgl_id, derive_t derive); extern appid_t get_appid(void *pkgl_id, const char *app_name); -extern int check_caller_access_to_name(struct inode *parent_node, const char* name, - derive_t derive, int w_ok, int has_rw); +extern int check_caller_access_to_name(struct inode *parent_node, const char* name); extern int open_flags_to_access_mode(int open_flags); -extern void * packagelist_create(gid_t write_gid); -extern void packagelist_destroy(void *pkgl_id); extern int packagelist_init(void); extern void packagelist_exit(void); /* for derived_perm.c */ extern void setup_derived_state(struct inode *inode, perm_t perm, - userid_t userid, uid_t uid, gid_t gid, mode_t mode); + userid_t userid, uid_t uid, bool under_android); extern void get_derived_permission(struct dentry *parent, struct dentry *dentry); -extern void update_derived_permission(struct dentry *dentry); +extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, struct dentry *newdentry); +extern void get_derive_permissions_recursive(struct dentry *parent); + +extern void update_derived_permission_lock(struct dentry *dentry); extern int need_graft_path(struct dentry *dentry); extern int is_base_obbpath(struct dentry *dentry); extern int is_obbpath_invalid(struct dentry *dentry); @@ -483,4 +519,18 @@ static inline int check_min_free_space(struct dentry *dentry, size_t size, int d return 1; } +/* Copies attrs and maintains sdcardfs managed attrs */ +static inline void sdcardfs_copy_and_fix_attrs(struct inode *dest, const struct inode *src) +{ + dest->i_mode = (src->i_mode & S_IFMT) | get_mode(SDCARDFS_I(dest)); + dest->i_uid = make_kuid(&init_user_ns, SDCARDFS_I(dest)->d_uid); + dest->i_gid = make_kgid(&init_user_ns, get_gid(SDCARDFS_I(dest))); + dest->i_rdev = src->i_rdev; + dest->i_atime = src->i_atime; + dest->i_mtime = src->i_mtime; + dest->i_ctime = src->i_ctime; + dest->i_blkbits = src->i_blkbits; + dest->i_flags = src->i_flags; + set_nlink(dest, src->i_nlink); +} #endif /* not _SDCARDFS_H_ */ diff --git a/fs/sdcardfs/strtok.h b/fs/sdcardfs/strtok.h deleted file mode 100644 index 50ab25aa0bc4..000000000000 --- a/fs/sdcardfs/strtok.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * fs/sdcardfs/strtok.h - * - * Copyright (c) 2013 Samsung Electronics Co. Ltd - * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun, - * Sunghwan Yun, Sungjong Seo - * - * This program has been developed as a stackable file system based on - * the WrapFS which written by - * - * Copyright (c) 1998-2011 Erez Zadok - * Copyright (c) 2009 Shrikar Archak - * Copyright (c) 2003-2011 Stony Brook University - * Copyright (c) 2003-2011 The Research Foundation of SUNY - * - * This file is dual licensed. It may be redistributed and/or modified - * under the terms of the Apache 2.0 License OR version 2 of the GNU - * General Public License. 
- */ - -static char * -strtok_r(char *s, const char *delim, char **last) -{ - char *spanp; - int c, sc; - char *tok; - - - /* if (s == NULL && (s = *last) == NULL) - return NULL; */ - if (s == NULL) { - s = *last; - if (s == NULL) - return NULL; - } - - /* - * Skip (span) leading delimiters (s += strspn(s, delim), sort of). - */ -cont: - c = *s++; - for (spanp = (char *)delim; (sc = *spanp++) != 0;) { - if (c == sc) - goto cont; - } - - if (c == 0) { /* no non-delimiter characters */ - *last = NULL; - return NULL; - } - tok = s - 1; - - /* - * Scan token (scan for delimiters: s += strcspn(s, delim), sort of). - * Note that delim must have one NUL; we stop if we see that, too. - */ - for (;;) { - c = *s++; - spanp = (char *)delim; - do { - sc = *spanp++; - if (sc == c) { - if (c == 0) - s = NULL; - else - s[-1] = 0; - *last = s; - return tok; - } - } while (sc != 0); - } - - /* NOTREACHED */ -} - diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index f153ce1b8cf3..1d6490128c99 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -46,9 +46,6 @@ static void sdcardfs_put_super(struct super_block *sb) sdcardfs_set_lower_super(sb, NULL); atomic_dec(&s->s_active); - if(spd->pkgl_id) - packagelist_destroy(spd->pkgl_id); - kfree(spd); sb->s_fs_info = NULL; } @@ -203,12 +200,8 @@ static int sdcardfs_show_options(struct seq_file *m, struct dentry *root) if (opts->fs_low_gid != 0) seq_printf(m, ",gid=%u", opts->fs_low_gid); - if (opts->derive == DERIVE_NONE) - seq_printf(m, ",derive=none"); - else if (opts->derive == DERIVE_LEGACY) - seq_printf(m, ",derive=legacy"); - else if (opts->derive == DERIVE_UNIFIED) - seq_printf(m, ",derive=unified"); + if (opts->multiuser) + seq_printf(m, ",multiuser"); if (opts->reserved_mb != 0) seq_printf(m, ",reserved=%uMB", opts->reserved_mb); From 530d9c5fc268b0c90ffbca40384f9ee657076724 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 11 Feb 2016 16:44:15 -0800 Subject: [PATCH 035/607] vfs: add d_canonical_path for stacked filesystem support Inotify does not currently know when a filesystem is acting as a wrapper around another fs. This means that inotify watchers will miss any modifications to the base file, as well as any made in a separate stacked fs that points to the same file. d_canonical_path solves this problem by allowing the fs to map a dentry to a path in the lower fs. Inotify can use it to find the appropriate place to watch to be informed of all changes to a file. 
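As a rough illustration of how the new hook is meant to be wired up (this sketch is
not part of the patch): a stacked filesystem simply hands inotify its lower path.
The "examplefs" names and the examplefs_get_lower_path() helper below are made up;
it is assumed to return a referenced copy of the dentry's lower path, the way
sdcardfs does when a follow-up patch in this series points d_canonical_path at its
existing sdcardfs_get_real_lower() helper.

/* Sketch for a hypothetical wrapper filesystem ("examplefs"). */
static void examplefs_canonical_path(const struct dentry *dentry,
                                     struct path *canonical_path)
{
        /* Point inotify at the lower file so a watch catches writes made
         * through any view of the same backing file. */
        examplefs_get_lower_path(dentry, canonical_path);
}

static const struct dentry_operations examplefs_dops = {
        .d_canonical_path = examplefs_canonical_path,
        /* ... the usual d_revalidate/d_release/etc. would go here ... */
};
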
Change-Id: I09563baffad1711a045e45c1bd0bd8713c2cc0b6 Signed-off-by: Daniel Rosenberg --- fs/notify/inotify/inotify_user.c | 17 ++++++++++++++--- include/linux/dcache.h | 1 + 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index b8d08d0d0a4d..9a0cd02cf3dc 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -702,6 +702,8 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, struct fsnotify_group *group; struct inode *inode; struct path path; + struct path alteredpath; + struct path *canonical_path = &path; struct fd f; int ret; unsigned flags = 0; @@ -741,13 +743,22 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, if (ret) goto fput_and_out; + /* support stacked filesystems */ + if(path.dentry && path.dentry->d_op) { + if (path.dentry->d_op->d_canonical_path) { + path.dentry->d_op->d_canonical_path(path.dentry, &alteredpath); + canonical_path = &alteredpath; + path_put(&path); + } + } + /* inode held in place by reference to path; group by fget on fd */ - inode = path.dentry->d_inode; + inode = canonical_path->dentry->d_inode; group = f.file->private_data; /* create/update an inode mark */ ret = inotify_update_watch(group, inode, mask); - path_put(&path); + path_put(canonical_path); fput_and_out: fdput(f); return ret; @@ -814,7 +825,7 @@ static int __init inotify_user_setup(void) BUILD_BUG_ON(IN_ISDIR != FS_ISDIR); BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); - BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); + BUG_ON(hweight32(ALL_INOTIFY_BITS) != 22); inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index d67ae119cf4e..ebad56e31a8c 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -161,6 +161,7 @@ struct dentry_operations { struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(struct dentry *, bool); struct inode *(*d_select_inode)(struct dentry *, unsigned); + void (*d_canonical_path)(const struct dentry *, struct path *); } ____cacheline_aligned; /* From f06e869f936e548fd3fd78ddcbebe171f0defadb Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 11 Feb 2016 16:53:36 -0800 Subject: [PATCH 036/607] sdcardfs: Add support for d_canonical_path Change-Id: I5d6f0e71b8ca99aec4b0894412f1dfd1cfe12add Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/dentry.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index dbbcfd091fc7..ba165ef11e27 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -177,5 +177,6 @@ const struct dentry_operations sdcardfs_ci_dops = { .d_release = sdcardfs_d_release, .d_hash = sdcardfs_hash_ci, .d_compare = sdcardfs_cmp_ci, + .d_canonical_path = sdcardfs_get_real_lower, }; From 87977cb3223256f62b18c31bc580715332ddce86 Mon Sep 17 00:00:00 2001 From: Thierry Strudel Date: Wed, 23 Mar 2016 10:02:15 -0700 Subject: [PATCH 037/607] trace: cpufreq: fix typo in min/max cpufreq Change-Id: Ieed402d3a912b7a318826e101efe2c24b07ebfe4 Signed-off-by: Thierry Strudel --- include/trace/events/power.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/power.h b/include/trace/events/power.h index de77035567c4..9af0d898016a 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -135,7 +135,7 @@ TRACE_EVENT(cpu_frequency_limits, TP_fast_assign( __entry->min_freq = min_freq; - __entry->max_freq = 
min_freq; + __entry->max_freq = max_freq; __entry->cpu_id = cpu_id; ), From 3f3b4c799d2c9c706d68c2db31b332e5a548cf53 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 23 Mar 2016 08:32:23 -0700 Subject: [PATCH 038/607] fs: sdcardfs: Declare LOOKUP_CASE_INSENSITIVE unconditionally Attempts to build sdcardfs as module fail with fs/sdcardfs/lookup.c: In function '__sdcardfs_lookup': fs/sdcardfs/lookup.c:243:5: error: 'LOOKUP_CASE_INSENSITIVE' undeclared This occurs because the define is enclosed with #ifdef CONFIG_SDCARD_FS_CI_SEARCH. If SDCARD_FS_CI_SEARCH is configured to be built as module, this does not work. Alternatives would be to use #if IS_ENABLED(CONFIG_SDCARD_FS_CI_SEARCH), or to declare SDCARD_FS_CI_SEARCH as bool, but that does not work because the define is used unconditionally in the source. Note that LOOKUP_CASE_INSENSITIVE is only set but not evaluated in the current source code, so setting the flag has no real effect. Fixes: 84a1b7d3d312 ("Included sdcardfs source code for kernel 3.0") Cc: Daniel Rosenberg Signed-off-by: Guenter Roeck --- include/linux/namei.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/namei.h b/include/linux/namei.h index f2b8acbdb928..47b53673ec61 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -43,9 +43,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_JUMPED 0x1000 #define LOOKUP_ROOT 0x2000 #define LOOKUP_EMPTY 0x4000 -#ifdef CONFIG_SDCARD_FS_CI_SEARCH #define LOOKUP_CASE_INSENSITIVE 0x8000 -#endif extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty); From 2dea38519013642e4cdba450151f8328a0763fad Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 23 Mar 2016 12:09:25 -0700 Subject: [PATCH 039/607] inotify: Fix erroneous update of bit count Patch "vfs: add d_canonical_path for stacked filesystem support" erroneously updated the ALL_INOTIFY_BITS count. This changes it back Change-Id: Idb04edc736da276159d30f04c40cff9d6b1e070f --- fs/notify/inotify/inotify_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 9a0cd02cf3dc..f72f3b25b3f2 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -825,7 +825,7 @@ static int __init inotify_user_setup(void) BUILD_BUG_ON(IN_ISDIR != FS_ISDIR); BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); - BUG_ON(hweight32(ALL_INOTIFY_BITS) != 22); + BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); From 7581abb2ceff034995e098de33282f4c7c8dd0c7 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 23 Mar 2016 16:39:30 -0700 Subject: [PATCH 040/607] sdcardfs: remove effectless config option CONFIG_SDCARD_FS_CI_SEARCH only guards a define for LOOKUP_CASE_INSENSITIVE, which is never used in the kernel. Remove both, along with the option matching that supports it. Change-Id: I363a8f31de8ee7a7a934d75300cc9ba8176e2edf Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/Kconfig | 5 ----- fs/sdcardfs/lookup.c | 7 +------ fs/sdcardfs/main.c | 15 --------------- fs/sdcardfs/sdcardfs.h | 6 ------ include/linux/namei.h | 1 - 5 files changed, 1 insertion(+), 33 deletions(-) diff --git a/fs/sdcardfs/Kconfig b/fs/sdcardfs/Kconfig index ab25f88ebb37..a1c103316ac7 100644 --- a/fs/sdcardfs/Kconfig +++ b/fs/sdcardfs/Kconfig @@ -11,8 +11,3 @@ config SDCARD_FS_FADV_NOACTIVE default y help Sdcardfs supports fadvise noactive mode. 
- -config SDCARD_FS_CI_SEARCH - tristate "sdcardfs case-insensitive search support" - depends on SDCARD_FS - default y diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index f80abcb6b467..a01b06a514fd 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -238,13 +238,8 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, lower_dir_mnt = lower_parent_path->mnt; /* Use vfs_path_lookup to check if the dentry exists or not */ - if (sbi->options.lower_fs == LOWER_FS_EXT4) { - err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name, - LOOKUP_CASE_INSENSITIVE, &lower_path); - } else if (sbi->options.lower_fs == LOWER_FS_FAT) { - err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name, 0, + err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name, 0, &lower_path); - } /* no error: handle positive dentries */ if (!err) { diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 80aa355d801e..fa11a0458b84 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -41,7 +41,6 @@ static const match_table_t sdcardfs_tokens = { {Opt_fsgid, "fsgid=%u"}, {Opt_gid, "gid=%u"}, {Opt_debug, "debug"}, - {Opt_lower_fs, "lower_fs=%s"}, {Opt_mask, "mask=%u"}, {Opt_userid, "userid=%d"}, {Opt_multiuser, "multiuser"}, @@ -64,8 +63,6 @@ static int parse_options(struct super_block *sb, char *options, int silent, opts->multiuser = false; opts->fs_user_id = 0; opts->gid = 0; - /* by default, we use LOWER_FS_EXT4 as lower fs type */ - opts->lower_fs = LOWER_FS_EXT4; /* by default, 0MB is reserved */ opts->reserved_mb = 0; @@ -113,18 +110,6 @@ static int parse_options(struct super_block *sb, char *options, int silent, case Opt_multiuser: opts->multiuser = true; break; - case Opt_lower_fs: - string_option = match_strdup(&args[0]); - if (!strcmp("ext4", string_option)) { - opts->lower_fs = LOWER_FS_EXT4; - } else if (!strcmp("fat", string_option)) { - opts->lower_fs = LOWER_FS_FAT; - } else { - kfree(string_option); - goto invalid_option; - } - kfree(string_option); - break; case Opt_reserved_mb: if (match_int(&args[0], &option)) return 0; diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 1b85f4e70324..f111f898b630 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -123,11 +123,6 @@ typedef enum { PERM_ANDROID_MEDIA, } perm_t; -typedef enum { - LOWER_FS_EXT4, - LOWER_FS_FAT, -} lower_fs_t; - struct sdcardfs_sb_info; struct sdcardfs_mount_options; @@ -191,7 +186,6 @@ struct sdcardfs_mount_options { gid_t fs_low_gid; userid_t fs_user_id; gid_t gid; - lower_fs_t lower_fs; mode_t mask; bool multiuser; unsigned int reserved_mb; diff --git a/include/linux/namei.h b/include/linux/namei.h index 47b53673ec61..d53c25453aca 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -43,7 +43,6 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_JUMPED 0x1000 #define LOOKUP_ROOT 0x2000 #define LOOKUP_EMPTY 0x4000 -#define LOOKUP_CASE_INSENSITIVE 0x8000 extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty); From cdaa7dcb05f7955e855fa816c21c3210088ca2b8 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 24 Mar 2016 10:32:35 -0700 Subject: [PATCH 041/607] fs: Export d_absolute_path The 0-day build bot reports the following build error, seen if SDCARD_FS is built as module. ERROR: "d_absolute_path" undefined! 
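For context (illustration only, not part of this patch), the kind of module-side
caller that trips this link error looks roughly like the following; the function
name and message are made up, only the d_absolute_path() call matters.

#include <linux/dcache.h>
#include <linux/err.h>
#include <linux/limits.h>
#include <linux/printk.h>
#include <linux/slab.h>

/* Hypothetical module code: resolve and log an absolute path. */
static int example_log_abs_path(const struct path *path)
{
        char *buf, *name;
        int ret = 0;

        buf = kmalloc(PATH_MAX, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        /* Links fine when built in, but fails for a modular caller until
         * d_absolute_path() carries an EXPORT_SYMBOL(). */
        name = d_absolute_path(path, buf, PATH_MAX);
        if (IS_ERR(name))
                ret = PTR_ERR(name);
        else
                pr_info("examplefs: %s\n", name);

        kfree(buf);
        return ret;
}
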
Fixes: 84a1b7d3d312 ("Included sdcardfs source code for kernel 3.0") Reported-by: Fengguang Wu Signed-off-by: Guenter Roeck --- fs/dcache.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/dcache.c b/fs/dcache.c index 5c33aeb0f68f..065da6950eb4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3027,6 +3027,7 @@ char *d_absolute_path(const struct path *path, return ERR_PTR(error); return res; } +EXPORT_SYMBOL(d_absolute_path); /* * same as __d_path but appends "(deleted)" for unlinked files. From a5aa95121d126ce2bde0d53d47c5d6641c900948 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 28 Mar 2016 15:00:20 -0700 Subject: [PATCH 042/607] sdcardfs: Remove unused code Change-Id: Ie97cba27ce44818ac56cfe40954f164ad44eccf6 --- fs/sdcardfs/main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index fa11a0458b84..a6522286d731 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -54,7 +54,6 @@ static int parse_options(struct super_block *sb, char *options, int silent, char *p; substring_t args[MAX_OPT_ARGS]; int option; - char *string_option; /* by default, we use AID_MEDIA_RW as uid, gid */ opts->fs_low_uid = AID_MEDIA_RW; @@ -117,7 +116,6 @@ static int parse_options(struct super_block *sb, char *options, int silent, break; /* unknown option */ default: -invalid_option: if (!silent) { printk( KERN_ERR "Unrecognized mount option \"%s\" " "or missing value", p); From 73fbdac21118062150d729c8a93e80c09f8a3521 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 28 Mar 2016 16:00:34 -0700 Subject: [PATCH 043/607] sdcardfs: remove unneeded __init and __exit Change-Id: I2a2d45d52f891332174c3000e8681c5167c1564f --- fs/sdcardfs/packagelist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index ba3478d94107..10f0d6be718b 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -392,7 +392,7 @@ static struct configfs_subsystem sdcardfs_packages_subsys = { }, }; -static int __init configfs_sdcardfs_init(void) +static int configfs_sdcardfs_init(void) { int ret; struct configfs_subsystem *subsys = &sdcardfs_packages_subsys; @@ -408,7 +408,7 @@ static int __init configfs_sdcardfs_init(void) return ret; } -static void __exit configfs_sdcardfs_exit(void) +static void configfs_sdcardfs_exit(void) { configfs_unregister_subsystem(&sdcardfs_packages_subsys); } From de679b3bbe8b810b01929b1f27665ede5eb54f37 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 24 Mar 2016 10:39:14 -0700 Subject: [PATCH 044/607] mm: Export do_munmap The 0-day build bot reports the following build error, seen if SDCARD_FS is built as module. ERROR: "do_munmap" undefined! Fixes: 84a1b7d3d312 ("Included sdcardfs source code for kernel 3.0") Reported-by: Fengguang Wu Signed-off-by: Guenter Roeck --- mm/mmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/mmap.c b/mm/mmap.c index 2b1143008793..ee856266cc61 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2639,6 +2639,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) return 0; } +EXPORT_SYMBOL(do_munmap); int vm_munmap(unsigned long start, size_t len) { From a9c7ddcf0347675f8bc681830d21d27df2ed8cb1 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 25 Nov 2015 16:03:31 -0500 Subject: [PATCH 045/607] UPSTREAM: dm: don't save and restore bi_private Device mapper used the field bi_private to point to dm_target_io. 
However, since kernel 3.15, the bi_private field is unused, and so the targets do not need to save and restore this field. This patch removes code that saves and restores bi_private from dm-cache, dm-snapshot and dm-verity. Change-Id: Ic72905ccb6d58ff94eafaa47ba54b2688d92d3d1 Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer (cherry picked from commit fe3265b180d6282648f03bc6ac3958c733df01c2) --- drivers/md/dm-cache-target.c | 3 --- drivers/md/dm-snap.c | 6 +----- drivers/md/dm-verity.c | 5 +---- 3 files changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 2fd4c8296144..5780accffa30 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -118,14 +118,12 @@ static void iot_io_end(struct io_tracker *iot, sector_t len) */ struct dm_hook_info { bio_end_io_t *bi_end_io; - void *bi_private; }; static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, bio_end_io_t *bi_end_io, void *bi_private) { h->bi_end_io = bio->bi_end_io; - h->bi_private = bio->bi_private; bio->bi_end_io = bi_end_io; bio->bi_private = bi_private; @@ -134,7 +132,6 @@ static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) { bio->bi_end_io = h->bi_end_io; - bio->bi_private = h->bi_private; } /*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index c06b74e91cd6..f68d0ae5b198 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -207,7 +207,6 @@ struct dm_snap_pending_exception { */ struct bio *full_bio; bio_end_io_t *full_bio_end_io; - void *full_bio_private; }; /* @@ -1485,10 +1484,8 @@ out: snapshot_bios = bio_list_get(&pe->snapshot_bios); origin_bios = bio_list_get(&pe->origin_bios); full_bio = pe->full_bio; - if (full_bio) { + if (full_bio) full_bio->bi_end_io = pe->full_bio_end_io; - full_bio->bi_private = pe->full_bio_private; - } increment_pending_exceptions_done_count(); up_write(&s->lock); @@ -1605,7 +1602,6 @@ static void start_full_bio(struct dm_snap_pending_exception *pe, pe->full_bio = bio; pe->full_bio_end_io = bio->bi_end_io; - pe->full_bio_private = bio->bi_private; callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, copy_callback, pe); diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index ccf41886ebcf..9e8891507c1c 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -83,9 +83,8 @@ struct dm_verity { struct dm_verity_io { struct dm_verity *v; - /* original values of bio->bi_end_io and bio->bi_private */ + /* original value of bio->bi_end_io */ bio_end_io_t *orig_bi_end_io; - void *orig_bi_private; sector_t block; unsigned n_blocks; @@ -453,7 +452,6 @@ static void verity_finish_io(struct dm_verity_io *io, int error) struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size); bio->bi_end_io = io->orig_bi_end_io; - bio->bi_private = io->orig_bi_private; bio->bi_error = error; bio_endio(bio); @@ -566,7 +564,6 @@ static int verity_map(struct dm_target *ti, struct bio *bio) io = dm_per_bio_data(bio, ti->per_bio_data_size); io->v = v; io->orig_bi_end_io = bio->bi_end_io; - io->orig_bi_private = bio->bi_private; io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); io->n_blocks = bio->bi_iter.bi_size >> v->data_dev_block_bits; From ab14138e109c1d292c2eb28bdb8f701cf34a86a0 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 5 Nov 2015 02:02:31 +0000 Subject: [PATCH 046/607] 
UPSTREAM: dm verity: clean up duplicate hashing code Handle dm-verity salting in one place to simplify the code. Change-Id: If923a01dc63ae5123af13ba1b0863b73e33ddf46 Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Signed-off-by: Mike Snitzer (cherry picked from commit 6dbeda3469ced777bc3138ed5918f7ae79670b7b) --- drivers/md/dm-verity.c | 266 +++++++++++++++++++++++------------------ 1 file changed, 149 insertions(+), 117 deletions(-) diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 9e8891507c1c..24517055bd8e 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -172,6 +172,84 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, return block >> (level * v->hash_per_block_bits); } +/* + * Wrapper for crypto_shash_init, which handles verity salting. + */ +static int verity_hash_init(struct dm_verity *v, struct shash_desc *desc) +{ + int r; + + desc->tfm = v->tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + r = crypto_shash_init(desc); + + if (unlikely(r < 0)) { + DMERR("crypto_shash_init failed: %d", r); + return r; + } + + if (likely(v->version >= 1)) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + + if (unlikely(r < 0)) { + DMERR("crypto_shash_update failed: %d", r); + return r; + } + } + + return 0; +} + +static int verity_hash_update(struct dm_verity *v, struct shash_desc *desc, + const u8 *data, size_t len) +{ + int r = crypto_shash_update(desc, data, len); + + if (unlikely(r < 0)) + DMERR("crypto_shash_update failed: %d", r); + + return r; +} + +static int verity_hash_final(struct dm_verity *v, struct shash_desc *desc, + u8 *digest) +{ + int r; + + if (unlikely(!v->version)) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + return r; + } + } + + r = crypto_shash_final(desc, digest); + + if (unlikely(r < 0)) + DMERR("crypto_shash_final failed: %d", r); + + return r; +} + +static int verity_hash(struct dm_verity *v, struct shash_desc *desc, + const u8 *data, size_t len, u8 *digest) +{ + int r; + + r = verity_hash_init(v, desc); + if (unlikely(r < 0)) + return r; + + r = verity_hash_update(v, desc, data, len); + if (unlikely(r < 0)) + return r; + + return verity_hash_final(v, desc, digest); +} + static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level, sector_t *hash_block, unsigned *offset) { @@ -252,10 +330,10 @@ out: * If "skip_unverified" is false, unverified buffer is hashed and verified * against current value of io_want_digest(v, io). 
*/ -static int verity_verify_level(struct dm_verity_io *io, sector_t block, - int level, bool skip_unverified) +static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, int level, bool skip_unverified, + u8 *want_digest) { - struct dm_verity *v = io->v; struct dm_buffer *buf; struct buffer_aux *aux; u8 *data; @@ -272,74 +350,71 @@ static int verity_verify_level(struct dm_verity_io *io, sector_t block, aux = dm_bufio_get_aux_data(buf); if (!aux->hash_verified) { - struct shash_desc *desc; - u8 *result; - if (skip_unverified) { r = 1; goto release_ret_r; } - desc = io_hash_desc(v, io); - desc->tfm = v->tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - r = crypto_shash_init(desc); - if (r < 0) { - DMERR("crypto_shash_init failed: %d", r); + r = verity_hash(v, io_hash_desc(v, io), + data, 1 << v->hash_dev_block_bits, + io_real_digest(v, io)); + if (unlikely(r < 0)) goto release_ret_r; - } - if (likely(v->version >= 1)) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - goto release_ret_r; - } - } - - r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - goto release_ret_r; - } - - if (!v->version) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - goto release_ret_r; - } - } - - result = io_real_digest(v, io); - r = crypto_shash_final(desc, result); - if (r < 0) { - DMERR("crypto_shash_final failed: %d", r); - goto release_ret_r; - } - if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { - if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA, - hash_block)) { - r = -EIO; - goto release_ret_r; - } - } else + if (likely(memcmp(io_real_digest(v, io), want_digest, + v->digest_size) == 0)) aux->hash_verified = 1; + else if (verity_handle_err(v, + DM_VERITY_BLOCK_TYPE_METADATA, + hash_block)) { + r = -EIO; + goto release_ret_r; + } } data += offset; - - memcpy(io_want_digest(v, io), data, v->digest_size); - - dm_bufio_release(buf); - return 0; + memcpy(want_digest, data, v->digest_size); + r = 0; release_ret_r: dm_bufio_release(buf); - return r; } +/* + * Find a hash for a given block, write it to digest and verify the integrity + * of the hash tree if necessary. + */ +static int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, u8 *digest) +{ + int i; + int r; + + if (likely(v->levels)) { + /* + * First, we try to get the requested hash for + * the current block. If the hash block itself is + * verified, zero is returned. If it isn't, this + * function returns 1 and we fall back to whole + * chain verification. + */ + r = verity_verify_level(v, io, block, 0, true, digest); + if (likely(r <= 0)) + return r; + } + + memcpy(digest, v->root_digest, v->digest_size); + + for (i = v->levels - 1; i >= 0; i--) { + r = verity_verify_level(v, io, block, i, false, digest); + if (unlikely(r)) + return r; + } + + return 0; +} + /* * Verify one "dm_verity_io" structure. */ @@ -349,54 +424,21 @@ static int verity_verify_io(struct dm_verity_io *io) struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size); unsigned b; - int i; for (b = 0; b < io->n_blocks; b++) { - struct shash_desc *desc; - u8 *result; int r; unsigned todo; + struct shash_desc *desc = io_hash_desc(v, io); - if (likely(v->levels)) { - /* - * First, we try to get the requested hash for - * the current block. 
If the hash block itself is - * verified, zero is returned. If it isn't, this - * function returns 0 and we fall back to whole - * chain verification. - */ - int r = verity_verify_level(io, io->block + b, 0, true); - if (likely(!r)) - goto test_block_hash; - if (r < 0) - return r; - } - - memcpy(io_want_digest(v, io), v->root_digest, v->digest_size); - - for (i = v->levels - 1; i >= 0; i--) { - int r = verity_verify_level(io, io->block + b, i, false); - if (unlikely(r)) - return r; - } - -test_block_hash: - desc = io_hash_desc(v, io); - desc->tfm = v->tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - r = crypto_shash_init(desc); - if (r < 0) { - DMERR("crypto_shash_init failed: %d", r); + r = verity_hash_for_block(v, io, io->block + b, + io_want_digest(v, io)); + if (unlikely(r < 0)) + return r; + + r = verity_hash_init(v, desc); + if (unlikely(r < 0)) return r; - } - if (likely(v->version >= 1)) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - return r; - } - } todo = 1 << v->data_dev_block_bits; do { u8 *page; @@ -407,37 +449,27 @@ test_block_hash: len = bv.bv_len; if (likely(len >= todo)) len = todo; - r = crypto_shash_update(desc, page + bv.bv_offset, len); + r = verity_hash_update(v, desc, page + bv.bv_offset, + len); kunmap_atomic(page); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); + if (unlikely(r < 0)) return r; - } bio_advance_iter(bio, &io->iter, len); todo -= len; } while (todo); - if (!v->version) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - return r; - } - } - - result = io_real_digest(v, io); - r = crypto_shash_final(desc, result); - if (r < 0) { - DMERR("crypto_shash_final failed: %d", r); + r = verity_hash_final(v, desc, io_real_digest(v, io)); + if (unlikely(r < 0)) return r; - } - if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { - if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, - io->block + b)) - return -EIO; - } + + if (likely(memcmp(io_real_digest(v, io), + io_want_digest(v, io), v->digest_size) == 0)) + continue; + else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, + io->block + b)) + return -EIO; } return 0; From 7475611ceceb9eb0cfd31f44d2c820444e1276d0 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 5 Nov 2015 02:02:32 +0000 Subject: [PATCH 047/607] UPSTREAM: dm verity: separate function for parsing opt args Move optional argument parsing into a separate function to make it easier to add more of them without making verity_ctr even longer. 
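To illustrate the extensibility point (sketch only, not part of the patch): a
target that follows the same pattern needs just a small parse helper like the one
below, and each future optional argument becomes one more strcasecmp() branch in
it. The example_target structure and the "example_opt" argument name are made up.

#include <linux/device-mapper.h>
#include <linux/string.h>

struct example_target {
        bool example_opt;
};

static int example_parse_opt_args(struct dm_arg_set *as,
                                  struct example_target *t,
                                  struct dm_target *ti)
{
        static struct dm_arg _args[] = {
                {0, 1, "Invalid number of feature args"},
        };
        const char *arg_name;
        unsigned argc;
        int r;

        /* Consume the leading count of optional args, then walk them. */
        r = dm_read_arg_group(_args, as, &argc, &ti->error);
        if (r)
                return -EINVAL;

        while (argc--) {
                arg_name = dm_shift_arg(as);

                if (!strcasecmp(arg_name, "example_opt")) {
                        t->example_opt = true;
                        continue;
                }

                ti->error = "Unrecognized feature request";
                return -EINVAL;
        }

        return 0;
}
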
Change-Id: I9cd9df41c3326824f8cca5764075501987e78a52 Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Signed-off-by: Mike Snitzer (cherry picked from commit 753c1fd02807cb43a1c5d01d75d454054d46bdad) --- drivers/md/dm-verity.c | 71 +++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 28 deletions(-) diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 24517055bd8e..b0a53c3b926d 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -34,6 +34,8 @@ #define DM_VERITY_OPT_LOGGING "ignore_corruption" #define DM_VERITY_OPT_RESTART "restart_on_corruption" +#define DM_VERITY_OPTS_MAX 1 + static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); @@ -721,6 +723,44 @@ static void verity_dtr(struct dm_target *ti) kfree(v); } +static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) +{ + int r; + unsigned argc; + struct dm_target *ti = v->ti; + const char *arg_name; + + static struct dm_arg _args[] = { + {0, DM_VERITY_OPTS_MAX, "Invalid number of feature args"}, + }; + + r = dm_read_arg_group(_args, as, &argc, &ti->error); + if (r) + return -EINVAL; + + if (!argc) + return 0; + + do { + arg_name = dm_shift_arg(as); + argc--; + + if (!strcasecmp(arg_name, DM_VERITY_OPT_LOGGING)) { + v->mode = DM_VERITY_MODE_LOGGING; + continue; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_RESTART)) { + v->mode = DM_VERITY_MODE_RESTART; + continue; + } + + ti->error = "Unrecognized verity feature request"; + return -EINVAL; + } while (argc && !r); + + return r; +} + /* * Target parameters: * The current format is version 1. @@ -739,18 +779,13 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) { struct dm_verity *v; struct dm_arg_set as; - const char *opt_string; - unsigned int num, opt_params; + unsigned int num; unsigned long long num_ll; int r; int i; sector_t hash_position; char dummy; - static struct dm_arg _args[] = { - {0, 1, "Invalid number of feature args"}, - }; - v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL); if (!v) { ti->error = "Cannot allocate verity structure"; @@ -895,29 +930,9 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) as.argc = argc; as.argv = argv; - r = dm_read_arg_group(_args, &as, &opt_params, &ti->error); - if (r) + r = verity_parse_opt_args(&as, v); + if (r < 0) goto bad; - - while (opt_params) { - opt_params--; - opt_string = dm_shift_arg(&as); - if (!opt_string) { - ti->error = "Not enough feature arguments"; - r = -EINVAL; - goto bad; - } - - if (!strcasecmp(opt_string, DM_VERITY_OPT_LOGGING)) - v->mode = DM_VERITY_MODE_LOGGING; - else if (!strcasecmp(opt_string, DM_VERITY_OPT_RESTART)) - v->mode = DM_VERITY_MODE_RESTART; - else { - ti->error = "Invalid feature arguments"; - r = -EINVAL; - goto bad; - } - } } v->hash_per_block_bits = From 6558fe3cbe58107b887fb449faf7cf0295f65932 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 3 Dec 2015 15:36:00 -0500 Subject: [PATCH 048/607] UPSTREAM: dm verity: move dm-verity.c to dm-verity-target.c Prepare for extending dm-verity with an optional object. Follows the naming convention used by other DM targets (e.g. dm-cache and dm-era). 
Change-Id: If6d2f27b290adf14fa77f3745fdc13aaa417c8dc Signed-off-by: Sami Tolvanen Signed-off-by: Mike Snitzer (cherry picked from commit 03045cbafa2d663ad8d0a583ac219d202d824344) --- drivers/md/Makefile | 1 + drivers/md/{dm-verity.c => dm-verity-target.c} | 0 2 files changed, 1 insertion(+) rename drivers/md/{dm-verity.c => dm-verity-target.c} (100%) diff --git a/drivers/md/Makefile b/drivers/md/Makefile index f34979cd141a..94e9f6bb33d1 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -16,6 +16,7 @@ dm-cache-mq-y += dm-cache-policy-mq.o dm-cache-smq-y += dm-cache-policy-smq.o dm-cache-cleaner-y += dm-cache-policy-cleaner.o dm-era-y += dm-era-target.o +dm-verity-y += dm-verity-target.o md-mod-y += md.o bitmap.o raid456-y += raid5.o raid5-cache.o diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity-target.c similarity index 100% rename from drivers/md/dm-verity.c rename to drivers/md/dm-verity-target.c From 3ec912f8ef374d0805eaf496535aaf51b159e355 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 3 Dec 2015 16:01:51 -0500 Subject: [PATCH 049/607] UPSTREAM: dm verity: factor out structures and functions useful to separate object Prepare for an optional verity object to make use of existing dm-verity structures and functions. Change-Id: Ib14c3834bfed222b33e068908fb5f71a53e1187b Signed-off-by: Sami Tolvanen Signed-off-by: Mike Snitzer (cherry picked from commit ffa393807cd69656d5b6bc9d9622e205071cbab8) --- drivers/md/dm-verity-target.c | 116 +++++----------------------------- drivers/md/dm-verity.h | 112 ++++++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+), 100 deletions(-) create mode 100644 drivers/md/dm-verity.h diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index b0a53c3b926d..7e200ba631fb 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -14,12 +14,10 @@ * access behavior. 
*/ -#include "dm-bufio.h" +#include "dm-verity.h" #include -#include #include -#include #define DM_MSG_PREFIX "verity" @@ -28,7 +26,6 @@ #define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 -#define DM_VERITY_MAX_LEVELS 63 #define DM_VERITY_MAX_CORRUPTED_ERRS 100 #define DM_VERITY_OPT_LOGGING "ignore_corruption" @@ -40,72 +37,6 @@ static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); -enum verity_mode { - DM_VERITY_MODE_EIO, - DM_VERITY_MODE_LOGGING, - DM_VERITY_MODE_RESTART -}; - -enum verity_block_type { - DM_VERITY_BLOCK_TYPE_DATA, - DM_VERITY_BLOCK_TYPE_METADATA -}; - -struct dm_verity { - struct dm_dev *data_dev; - struct dm_dev *hash_dev; - struct dm_target *ti; - struct dm_bufio_client *bufio; - char *alg_name; - struct crypto_shash *tfm; - u8 *root_digest; /* digest of the root block */ - u8 *salt; /* salt: its size is salt_size */ - unsigned salt_size; - sector_t data_start; /* data offset in 512-byte sectors */ - sector_t hash_start; /* hash start in blocks */ - sector_t data_blocks; /* the number of data blocks */ - sector_t hash_blocks; /* the number of hash blocks */ - unsigned char data_dev_block_bits; /* log2(data blocksize) */ - unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ - unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ - unsigned char levels; /* the number of tree levels */ - unsigned char version; - unsigned digest_size; /* digest size for the current hash algorithm */ - unsigned shash_descsize;/* the size of temporary space for crypto */ - int hash_failed; /* set to 1 if hash of any block failed */ - enum verity_mode mode; /* mode for handling verification errors */ - unsigned corrupted_errs;/* Number of errors for corrupted blocks */ - - struct workqueue_struct *verify_wq; - - /* starting blocks for each tree level. 0 is the lowest level. */ - sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; -}; - -struct dm_verity_io { - struct dm_verity *v; - - /* original value of bio->bi_end_io */ - bio_end_io_t *orig_bi_end_io; - - sector_t block; - unsigned n_blocks; - - struct bvec_iter iter; - - struct work_struct work; - - /* - * Three variably-size fields follow this struct: - * - * u8 hash_desc[v->shash_descsize]; - * u8 real_digest[v->digest_size]; - * u8 want_digest[v->digest_size]; - * - * To access them use: io_hash_desc(), io_real_digest() and io_want_digest(). - */ -}; - struct dm_verity_prefetch_work { struct work_struct work; struct dm_verity *v; @@ -113,21 +44,6 @@ struct dm_verity_prefetch_work { unsigned n_blocks; }; -static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io) -{ - return (struct shash_desc *)(io + 1); -} - -static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io) -{ - return (u8 *)(io + 1) + v->shash_descsize; -} - -static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io) -{ - return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; -} - /* * Auxiliary structure appended to each dm-bufio buffer. If the value * hash_verified is nonzero, hash of the block has been verified. 
@@ -236,8 +152,8 @@ static int verity_hash_final(struct dm_verity *v, struct shash_desc *desc, return r; } -static int verity_hash(struct dm_verity *v, struct shash_desc *desc, - const u8 *data, size_t len, u8 *digest) +int verity_hash(struct dm_verity *v, struct shash_desc *desc, + const u8 *data, size_t len, u8 *digest) { int r; @@ -325,12 +241,12 @@ out: * Verify hash of a metadata block pertaining to the specified data block * ("block" argument) at a specified level ("level" argument). * - * On successful return, io_want_digest(v, io) contains the hash value for - * a lower tree level or for the data block (if we're at the lowest leve). + * On successful return, verity_io_want_digest(v, io) contains the hash value + * for a lower tree level or for the data block (if we're at the lowest level). * * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned. * If "skip_unverified" is false, unverified buffer is hashed and verified - * against current value of io_want_digest(v, io). + * against current value of verity_io_want_digest(v, io). */ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, sector_t block, int level, bool skip_unverified, @@ -357,13 +273,13 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, goto release_ret_r; } - r = verity_hash(v, io_hash_desc(v, io), + r = verity_hash(v, verity_io_hash_desc(v, io), data, 1 << v->hash_dev_block_bits, - io_real_digest(v, io)); + verity_io_real_digest(v, io)); if (unlikely(r < 0)) goto release_ret_r; - if (likely(memcmp(io_real_digest(v, io), want_digest, + if (likely(memcmp(verity_io_real_digest(v, io), want_digest, v->digest_size) == 0)) aux->hash_verified = 1; else if (verity_handle_err(v, @@ -387,8 +303,8 @@ release_ret_r: * Find a hash for a given block, write it to digest and verify the integrity * of the hash tree if necessary. */ -static int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, - sector_t block, u8 *digest) +int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, u8 *digest) { int i; int r; @@ -430,10 +346,10 @@ static int verity_verify_io(struct dm_verity_io *io) for (b = 0; b < io->n_blocks; b++) { int r; unsigned todo; - struct shash_desc *desc = io_hash_desc(v, io); + struct shash_desc *desc = verity_io_hash_desc(v, io); r = verity_hash_for_block(v, io, io->block + b, - io_want_digest(v, io)); + verity_io_want_digest(v, io)); if (unlikely(r < 0)) return r; @@ -462,12 +378,12 @@ static int verity_verify_io(struct dm_verity_io *io) todo -= len; } while (todo); - r = verity_hash_final(v, desc, io_real_digest(v, io)); + r = verity_hash_final(v, desc, verity_io_real_digest(v, io)); if (unlikely(r < 0)) return r; - if (likely(memcmp(io_real_digest(v, io), - io_want_digest(v, io), v->digest_size) == 0)) + if (likely(memcmp(verity_io_real_digest(v, io), + verity_io_want_digest(v, io), v->digest_size) == 0)) continue; else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, io->block + b)) diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h new file mode 100644 index 000000000000..c7ad4fd05188 --- /dev/null +++ b/drivers/md/dm-verity.h @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2015 Google, Inc. + * + * Author: Mikulas Patocka + * + * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors + * + * This file is released under the GPLv2. 
+ */ + +#ifndef DM_VERITY_H +#define DM_VERITY_H + +#include "dm-bufio.h" +#include +#include + +#define DM_VERITY_MAX_LEVELS 63 + +enum verity_mode { + DM_VERITY_MODE_EIO, + DM_VERITY_MODE_LOGGING, + DM_VERITY_MODE_RESTART +}; + +enum verity_block_type { + DM_VERITY_BLOCK_TYPE_DATA, + DM_VERITY_BLOCK_TYPE_METADATA +}; + +struct dm_verity { + struct dm_dev *data_dev; + struct dm_dev *hash_dev; + struct dm_target *ti; + struct dm_bufio_client *bufio; + char *alg_name; + struct crypto_shash *tfm; + u8 *root_digest; /* digest of the root block */ + u8 *salt; /* salt: its size is salt_size */ + unsigned salt_size; + sector_t data_start; /* data offset in 512-byte sectors */ + sector_t hash_start; /* hash start in blocks */ + sector_t data_blocks; /* the number of data blocks */ + sector_t hash_blocks; /* the number of hash blocks */ + unsigned char data_dev_block_bits; /* log2(data blocksize) */ + unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ + unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ + unsigned char levels; /* the number of tree levels */ + unsigned char version; + unsigned digest_size; /* digest size for the current hash algorithm */ + unsigned shash_descsize;/* the size of temporary space for crypto */ + int hash_failed; /* set to 1 if hash of any block failed */ + enum verity_mode mode; /* mode for handling verification errors */ + unsigned corrupted_errs;/* Number of errors for corrupted blocks */ + + struct workqueue_struct *verify_wq; + + /* starting blocks for each tree level. 0 is the lowest level. */ + sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; +}; + +struct dm_verity_io { + struct dm_verity *v; + + /* original value of bio->bi_end_io */ + bio_end_io_t *orig_bi_end_io; + + sector_t block; + unsigned n_blocks; + + struct bvec_iter iter; + + struct work_struct work; + + /* + * Three variably-size fields follow this struct: + * + * u8 hash_desc[v->shash_descsize]; + * u8 real_digest[v->digest_size]; + * u8 want_digest[v->digest_size]; + * + * To access them use: verity_io_hash_desc(), verity_io_real_digest() + * and verity_io_want_digest(). + */ +}; + +static inline struct shash_desc *verity_io_hash_desc(struct dm_verity *v, + struct dm_verity_io *io) +{ + return (struct shash_desc *)(io + 1); +} + +static inline u8 *verity_io_real_digest(struct dm_verity *v, + struct dm_verity_io *io) +{ + return (u8 *)(io + 1) + v->shash_descsize; +} + +static inline u8 *verity_io_want_digest(struct dm_verity *v, + struct dm_verity_io *io) +{ + return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; +} + +extern int verity_hash(struct dm_verity *v, struct shash_desc *desc, + const u8 *data, size_t len, u8 *digest); + +extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, u8 *digest); + +#endif /* DM_VERITY_H */ From 890b7865c5f5e7e5b5bc6acaa4193d967fb8bea7 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 3 Dec 2015 16:30:36 -0500 Subject: [PATCH 050/607] UPSTREAM: dm verity: factor out verity_for_bv_block() verity_for_bv_block() will be re-used by optional dm-verity object. 
Change-Id: I80e0f8e7c9f234fce3fbdf21cb05aba3041d7f98 Signed-off-by: Sami Tolvanen Signed-off-by: Mike Snitzer (cherry picked from commit bb4d73ac5e4f0a6c4853f35824f6cb2d396a2f9c) --- drivers/md/dm-verity-target.c | 72 ++++++++++++++++++++++++----------- drivers/md/dm-verity.h | 6 +++ 2 files changed, 55 insertions(+), 23 deletions(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 7e200ba631fb..2b0ee52d1ad8 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -333,19 +333,61 @@ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, return 0; } +/* + * Calls function process for 1 << v->data_dev_block_bits bytes in the bio_vec + * starting from iter. + */ +int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, + struct bvec_iter *iter, + int (*process)(struct dm_verity *v, + struct dm_verity_io *io, u8 *data, + size_t len)) +{ + unsigned todo = 1 << v->data_dev_block_bits; + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size); + + do { + int r; + u8 *page; + unsigned len; + struct bio_vec bv = bio_iter_iovec(bio, *iter); + + page = kmap_atomic(bv.bv_page); + len = bv.bv_len; + + if (likely(len >= todo)) + len = todo; + + r = process(v, io, page + bv.bv_offset, len); + kunmap_atomic(page); + + if (r < 0) + return r; + + bio_advance_iter(bio, iter, len); + todo -= len; + } while (todo); + + return 0; +} + +static int verity_bv_hash_update(struct dm_verity *v, struct dm_verity_io *io, + u8 *data, size_t len) +{ + return verity_hash_update(v, verity_io_hash_desc(v, io), data, len); +} + /* * Verify one "dm_verity_io" structure. */ static int verity_verify_io(struct dm_verity_io *io) { struct dm_verity *v = io->v; - struct bio *bio = dm_bio_from_per_bio_data(io, - v->ti->per_bio_data_size); + struct bvec_iter start; unsigned b; for (b = 0; b < io->n_blocks; b++) { int r; - unsigned todo; struct shash_desc *desc = verity_io_hash_desc(v, io); r = verity_hash_for_block(v, io, io->block + b, @@ -357,26 +399,10 @@ static int verity_verify_io(struct dm_verity_io *io) if (unlikely(r < 0)) return r; - todo = 1 << v->data_dev_block_bits; - do { - u8 *page; - unsigned len; - struct bio_vec bv = bio_iter_iovec(bio, io->iter); - - page = kmap_atomic(bv.bv_page); - len = bv.bv_len; - if (likely(len >= todo)) - len = todo; - r = verity_hash_update(v, desc, page + bv.bv_offset, - len); - kunmap_atomic(page); - - if (unlikely(r < 0)) - return r; - - bio_advance_iter(bio, &io->iter, len); - todo -= len; - } while (todo); + start = io->iter; + r = verity_for_bv_block(v, io, &io->iter, verity_bv_hash_update); + if (unlikely(r < 0)) + return r; r = verity_hash_final(v, desc, verity_io_real_digest(v, io)); if (unlikely(r < 0)) diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index c7ad4fd05188..f5af52df8e38 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -103,6 +103,12 @@ static inline u8 *verity_io_want_digest(struct dm_verity *v, return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; } +extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, + struct bvec_iter *iter, + int (*process)(struct dm_verity *v, + struct dm_verity_io *io, + u8 *data, size_t len)); + extern int verity_hash(struct dm_verity *v, struct shash_desc *desc, const u8 *data, size_t len, u8 *digest); From 83c1827e5887c9288e4ecb7962c6dc63596ad922 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 3 Dec 2015 14:26:30 +0000 Subject: [PATCH 051/607] UPSTREAM: dm verity: add 
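To show the kind of reuse this enables (illustration only, not part of the patch):
a new per-block consumer just supplies a callback with the process() signature and
lets verity_for_bv_block() do the bio iteration. The callback below is made up;
the FEC support added later in this series plugs in a callback of the same shape
when it writes corrected data back. It assumes it lives in a dm-verity source file
where DM_MSG_PREFIX is already defined.

/* Made-up callback: count zero bytes in each chunk of one data block. */
static int example_bv_count_zeroes(struct dm_verity *v,
                                   struct dm_verity_io *io,
                                   u8 *data, size_t len)
{
        size_t i;
        unsigned int zeroes = 0;

        for (i = 0; i < len; i++)
                if (!data[i])
                        zeroes++;

        DMDEBUG("%s: %u zero bytes in a %zu byte chunk",
                v->data_dev->name, zeroes, len);
        return 0;
}

/* Walks exactly one data block's worth of bio data starting at *iter. */
static int example_scan_block(struct dm_verity *v, struct dm_verity_io *io,
                              struct bvec_iter *iter)
{
        return verity_for_bv_block(v, io, iter, example_bv_count_zeroes);
}
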
support for forward error correction Add support for correcting corrupted blocks using Reed-Solomon. This code uses RS(255, N) interleaved across data and hash blocks. Each error-correcting block covers N bytes evenly distributed across the combined total data, so that each byte is a maximum distance away from the others. This makes it possible to recover from several consecutive corrupted blocks with relatively small space overhead. In addition, using verity hashes to locate erasures nearly doubles the effectiveness of error correction. Being able to detect corrupted blocks also improves performance, because only corrupted blocks need to corrected. For a 2 GiB partition, RS(255, 253) (two parity bytes for each 253-byte block) can correct up to 16 MiB of consecutive corrupted blocks if erasures can be located, and 8 MiB if they cannot, with 16 MiB space overhead. Change-Id: Ife4f8889f7fbf0974bf3ed4be6d3322ae9b4cb0e Signed-off-by: Sami Tolvanen Signed-off-by: Mike Snitzer (cherry picked from commit a739ff3f543afbb4a041c16cd0182c8e8d366e70) --- Documentation/device-mapper/verity.txt | 35 +- drivers/md/Kconfig | 12 + drivers/md/Makefile | 4 + drivers/md/dm-verity-fec.c | 812 +++++++++++++++++++++++++ drivers/md/dm-verity-fec.h | 152 +++++ drivers/md/dm-verity-target.c | 55 +- drivers/md/dm-verity.h | 10 + 7 files changed, 1071 insertions(+), 9 deletions(-) create mode 100644 drivers/md/dm-verity-fec.c create mode 100644 drivers/md/dm-verity-fec.h diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt index e15bc1a0fb98..d602c801ff59 100644 --- a/Documentation/device-mapper/verity.txt +++ b/Documentation/device-mapper/verity.txt @@ -18,11 +18,11 @@ Construction Parameters 0 is the original format used in the Chromium OS. The salt is appended when hashing, digests are stored continuously and - the rest of the block is padded with zeros. + the rest of the block is padded with zeroes. 1 is the current format that should be used for new devices. The salt is prepended when hashing and each digest is - padded with zeros to the power of two. + padded with zeroes to the power of two. This is the device containing data, the integrity of which needs to be @@ -79,6 +79,32 @@ restart_on_corruption not compatible with ignore_corruption and requires user space support to avoid restart loops. +use_fec_from_device + Use forward error correction (FEC) to recover from corruption if hash + verification fails. Use encoding data from the specified device. This + may be the same device where data and hash blocks reside, in which case + fec_start must be outside data and hash areas. + + If the encoding data covers additional metadata, it must be accessible + on the hash device after the hash blocks. + + Note: block sizes for data and hash devices must match. Also, if the + verity is encrypted the should be too. + +fec_roots + Number of generator roots. This equals to the number of parity bytes in + the encoding data. For example, in RS(M, N) encoding, the number of roots + is M-N. + +fec_blocks + The number of encoding data blocks on the FEC device. The block size for + the FEC device is . + +fec_start + This is the offset, in blocks, from the start of the + FEC device to the beginning of the encoding data. + + Theory of operation =================== @@ -98,6 +124,11 @@ per-block basis. This allows for a lightweight hash computation on first read into the page cache. Block hashes are stored linearly, aligned to the nearest block size. 
+If forward error correction (FEC) support is enabled any recovery of +corrupted data will be verified using the cryptographic hash of the +corresponding data. This is why combining error correction with +integrity checking is essential. + Hash Tree --------- diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 7913fdcfc849..d8b0ab6f3753 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -458,6 +458,18 @@ config DM_VERITY If unsure, say N. +config DM_VERITY_FEC + bool "Verity forward error correction support" + depends on DM_VERITY + select REED_SOLOMON + select REED_SOLOMON_DEC8 + ---help--- + Add forward error correction support to dm-verity. This option + makes it possible to use pre-generated error correction data to + recover from corrupted blocks. + + If unsure, say N. + config DM_SWITCH tristate "Switch target support (EXPERIMENTAL)" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 94e9f6bb33d1..62a65764e8e0 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -64,3 +64,7 @@ obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o endif + +ifeq ($(CONFIG_DM_VERITY_FEC),y) +dm-verity-objs += dm-verity-fec.o +endif diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c new file mode 100644 index 000000000000..88143d36a1d2 --- /dev/null +++ b/drivers/md/dm-verity-fec.c @@ -0,0 +1,812 @@ +/* + * Copyright (C) 2015 Google, Inc. + * + * Author: Sami Tolvanen + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include "dm-verity-fec.h" +#include + +#define DM_MSG_PREFIX "verity-fec" + +/* + * If error correction has been configured, returns true. + */ +bool verity_fec_is_enabled(struct dm_verity *v) +{ + return v->fec && v->fec->dev; +} + +/* + * Return a pointer to dm_verity_fec_io after dm_verity_io and its variable + * length fields. + */ +static inline struct dm_verity_fec_io *fec_io(struct dm_verity_io *io) +{ + return (struct dm_verity_fec_io *) verity_io_digest_end(io->v, io); +} + +/* + * Return an interleaved offset for a byte in RS block. + */ +static inline u64 fec_interleave(struct dm_verity *v, u64 offset) +{ + u32 mod; + + mod = do_div(offset, v->fec->rsn); + return offset + mod * (v->fec->rounds << v->data_dev_block_bits); +} + +/* + * Decode an RS block using Reed-Solomon. + */ +static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio, + u8 *data, u8 *fec, int neras) +{ + int i; + uint16_t par[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; + + for (i = 0; i < v->fec->roots; i++) + par[i] = fec[i]; + + return decode_rs8(fio->rs, data, par, v->fec->rsn, NULL, neras, + fio->erasures, 0, NULL); +} + +/* + * Read error-correcting codes for the requested RS block. Returns a pointer + * to the data block. Caller is responsible for releasing buf. 
+ */ +static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index, + unsigned *offset, struct dm_buffer **buf) +{ + u64 position, block; + u8 *res; + + position = (index + rsb) * v->fec->roots; + block = position >> v->data_dev_block_bits; + *offset = (unsigned)(position - (block << v->data_dev_block_bits)); + + res = dm_bufio_read(v->fec->bufio, v->fec->start + block, buf); + if (unlikely(IS_ERR(res))) { + DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", + v->data_dev->name, (unsigned long long)rsb, + (unsigned long long)(v->fec->start + block), + PTR_ERR(res)); + *buf = NULL; + } + + return res; +} + +/* Loop over each preallocated buffer slot. */ +#define fec_for_each_prealloc_buffer(__i) \ + for (__i = 0; __i < DM_VERITY_FEC_BUF_PREALLOC; __i++) + +/* Loop over each extra buffer slot. */ +#define fec_for_each_extra_buffer(io, __i) \ + for (__i = DM_VERITY_FEC_BUF_PREALLOC; __i < DM_VERITY_FEC_BUF_MAX; __i++) + +/* Loop over each allocated buffer. */ +#define fec_for_each_buffer(io, __i) \ + for (__i = 0; __i < (io)->nbufs; __i++) + +/* Loop over each RS block in each allocated buffer. */ +#define fec_for_each_buffer_rs_block(io, __i, __j) \ + fec_for_each_buffer(io, __i) \ + for (__j = 0; __j < 1 << DM_VERITY_FEC_BUF_RS_BITS; __j++) + +/* + * Return a pointer to the current RS block when called inside + * fec_for_each_buffer_rs_block. + */ +static inline u8 *fec_buffer_rs_block(struct dm_verity *v, + struct dm_verity_fec_io *fio, + unsigned i, unsigned j) +{ + return &fio->bufs[i][j * v->fec->rsn]; +} + +/* + * Return an index to the current RS block when called inside + * fec_for_each_buffer_rs_block. + */ +static inline unsigned fec_buffer_rs_index(unsigned i, unsigned j) +{ + return (i << DM_VERITY_FEC_BUF_RS_BITS) + j; +} + +/* + * Decode all RS blocks from buffers and copy corrected bytes into fio->output + * starting from block_offset. + */ +static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio, + u64 rsb, int byte_index, unsigned block_offset, + int neras) +{ + int r, corrected = 0, res; + struct dm_buffer *buf; + unsigned n, i, offset; + u8 *par, *block; + + par = fec_read_parity(v, rsb, block_offset, &offset, &buf); + if (IS_ERR(par)) + return PTR_ERR(par); + + /* + * Decode the RS blocks we have in bufs. Each RS block results in + * one corrected target byte and consumes fec->roots parity bytes. + */ + fec_for_each_buffer_rs_block(fio, n, i) { + block = fec_buffer_rs_block(v, fio, n, i); + res = fec_decode_rs8(v, fio, block, &par[offset], neras); + if (res < 0) { + dm_bufio_release(buf); + + r = res; + goto error; + } + + corrected += res; + fio->output[block_offset] = block[byte_index]; + + block_offset++; + if (block_offset >= 1 << v->data_dev_block_bits) + goto done; + + /* read the next block when we run out of parity bytes */ + offset += v->fec->roots; + if (offset >= 1 << v->data_dev_block_bits) { + dm_bufio_release(buf); + + par = fec_read_parity(v, rsb, block_offset, &offset, &buf); + if (unlikely(IS_ERR(par))) + return PTR_ERR(par); + } + } +done: + r = corrected; +error: + if (r < 0 && neras) + DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", + v->data_dev->name, (unsigned long long)rsb, r); + else if (r > 0) + DMWARN_LIMIT("%s: FEC %llu: corrected %d errors", + v->data_dev->name, (unsigned long long)rsb, r); + + return r; +} + +/* + * Locate data block erasures using verity hashes. 
+ */ +static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, + u8 *want_digest, u8 *data) +{ + if (unlikely(verity_hash(v, verity_io_hash_desc(v, io), + data, 1 << v->data_dev_block_bits, + verity_io_real_digest(v, io)))) + return 0; + + return memcmp(verity_io_real_digest(v, io), want_digest, + v->digest_size) != 0; +} + +/* + * Read data blocks that are part of the RS block and deinterleave as much as + * fits into buffers. Check for erasure locations if @neras is non-NULL. + */ +static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, + u64 rsb, u64 target, unsigned block_offset, + int *neras) +{ + int i, j, target_index = -1; + struct dm_buffer *buf; + struct dm_bufio_client *bufio; + struct dm_verity_fec_io *fio = fec_io(io); + u64 block, ileaved; + u8 *bbuf, *rs_block; + u8 want_digest[v->digest_size]; + unsigned n, k; + + if (neras) + *neras = 0; + + /* + * read each of the rsn data blocks that are part of the RS block, and + * interleave contents to available bufs + */ + for (i = 0; i < v->fec->rsn; i++) { + ileaved = fec_interleave(v, rsb * v->fec->rsn + i); + + /* + * target is the data block we want to correct, target_index is + * the index of this block within the rsn RS blocks + */ + if (ileaved == target) + target_index = i; + + block = ileaved >> v->data_dev_block_bits; + bufio = v->fec->data_bufio; + + if (block >= v->data_blocks) { + block -= v->data_blocks; + + /* + * blocks outside the area were assumed to contain + * zeros when encoding data was generated + */ + if (unlikely(block >= v->fec->hash_blocks)) + continue; + + block += v->hash_start; + bufio = v->bufio; + } + + bbuf = dm_bufio_read(bufio, block, &buf); + if (unlikely(IS_ERR(bbuf))) { + DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld", + v->data_dev->name, + (unsigned long long)rsb, + (unsigned long long)block, PTR_ERR(bbuf)); + + /* assume the block is corrupted */ + if (neras && *neras <= v->fec->roots) + fio->erasures[(*neras)++] = i; + + continue; + } + + /* locate erasures if the block is on the data device */ + if (bufio == v->fec->data_bufio && + verity_hash_for_block(v, io, block, want_digest) == 0) { + /* + * skip if we have already found the theoretical + * maximum number (i.e. fec->roots) of erasures + */ + if (neras && *neras <= v->fec->roots && + fec_is_erasure(v, io, want_digest, bbuf)) + fio->erasures[(*neras)++] = i; + } + + /* + * deinterleave and copy the bytes that fit into bufs, + * starting from block_offset + */ + fec_for_each_buffer_rs_block(fio, n, j) { + k = fec_buffer_rs_index(n, j) + block_offset; + + if (k >= 1 << v->data_dev_block_bits) + goto done; + + rs_block = fec_buffer_rs_block(v, fio, n, j); + rs_block[i] = bbuf[k]; + } +done: + dm_bufio_release(buf); + } + + return target_index; +} + +/* + * Allocate RS control structure and FEC buffers from preallocated mempools, + * and attempt to allocate as many extra buffers as available. 
+ */ +static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) +{ + unsigned n; + + if (!fio->rs) { + fio->rs = mempool_alloc(v->fec->rs_pool, 0); + if (unlikely(!fio->rs)) { + DMERR("failed to allocate RS"); + return -ENOMEM; + } + } + + fec_for_each_prealloc_buffer(n) { + if (fio->bufs[n]) + continue; + + fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOIO); + if (unlikely(!fio->bufs[n])) { + DMERR("failed to allocate FEC buffer"); + return -ENOMEM; + } + } + + /* try to allocate the maximum number of buffers */ + fec_for_each_extra_buffer(fio, n) { + if (fio->bufs[n]) + continue; + + fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOIO); + /* we can manage with even one buffer if necessary */ + if (unlikely(!fio->bufs[n])) + break; + } + fio->nbufs = n; + + if (!fio->output) { + fio->output = mempool_alloc(v->fec->output_pool, GFP_NOIO); + + if (!fio->output) { + DMERR("failed to allocate FEC page"); + return -ENOMEM; + } + } + + return 0; +} + +/* + * Initialize buffers and clear erasures. fec_read_bufs() assumes buffers are + * zeroed before deinterleaving. + */ +static void fec_init_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) +{ + unsigned n; + + fec_for_each_buffer(fio, n) + memset(fio->bufs[n], 0, v->fec->rsn << DM_VERITY_FEC_BUF_RS_BITS); + + memset(fio->erasures, 0, sizeof(fio->erasures)); +} + +/* + * Decode all RS blocks in a single data block and return the target block + * (indicated by @offset) in fio->output. If @use_erasures is non-zero, uses + * hashes to locate erasures. + */ +static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, + struct dm_verity_fec_io *fio, u64 rsb, u64 offset, + bool use_erasures) +{ + int r, neras = 0; + unsigned pos; + + r = fec_alloc_bufs(v, fio); + if (unlikely(r < 0)) + return r; + + for (pos = 0; pos < 1 << v->data_dev_block_bits; ) { + fec_init_bufs(v, fio); + + r = fec_read_bufs(v, io, rsb, offset, pos, + use_erasures ? &neras : NULL); + if (unlikely(r < 0)) + return r; + + r = fec_decode_bufs(v, fio, rsb, r, pos, neras); + if (r < 0) + return r; + + pos += fio->nbufs << DM_VERITY_FEC_BUF_RS_BITS; + } + + /* Always re-validate the corrected block against the expected hash */ + r = verity_hash(v, verity_io_hash_desc(v, io), fio->output, + 1 << v->data_dev_block_bits, + verity_io_real_digest(v, io)); + if (unlikely(r < 0)) + return r; + + if (memcmp(verity_io_real_digest(v, io), verity_io_want_digest(v, io), + v->digest_size)) { + DMERR_LIMIT("%s: FEC %llu: failed to correct (%d erasures)", + v->data_dev->name, (unsigned long long)rsb, neras); + return -EILSEQ; + } + + return 0; +} + +static int fec_bv_copy(struct dm_verity *v, struct dm_verity_io *io, u8 *data, + size_t len) +{ + struct dm_verity_fec_io *fio = fec_io(io); + + memcpy(data, &fio->output[fio->output_pos], len); + fio->output_pos += len; + + return 0; +} + +/* + * Correct errors in a block. Copies corrected block to dest if non-NULL, + * otherwise to a bio_vec starting from iter. + */ +int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, + enum verity_block_type type, sector_t block, u8 *dest, + struct bvec_iter *iter) +{ + int r; + struct dm_verity_fec_io *fio = fec_io(io); + u64 offset, res, rsb; + + if (!verity_fec_is_enabled(v)) + return -EOPNOTSUPP; + + if (type == DM_VERITY_BLOCK_TYPE_METADATA) + block += v->data_blocks; + + /* + * For RS(M, N), the continuous FEC data is divided into blocks of N + * bytes. 
Since block size may not be divisible by N, the last block + * is zero padded when decoding. + * + * Each byte of the block is covered by a different RS(M, N) code, + * and each code is interleaved over N blocks to make it less likely + * that bursty corruption will leave us in unrecoverable state. + */ + + offset = block << v->data_dev_block_bits; + + res = offset; + div64_u64(res, v->fec->rounds << v->data_dev_block_bits); + + /* + * The base RS block we can feed to the interleaver to find out all + * blocks required for decoding. + */ + rsb = offset - res * (v->fec->rounds << v->data_dev_block_bits); + + /* + * Locating erasures is slow, so attempt to recover the block without + * them first. Do a second attempt with erasures if the corruption is + * bad enough. + */ + r = fec_decode_rsb(v, io, fio, rsb, offset, false); + if (r < 0) { + r = fec_decode_rsb(v, io, fio, rsb, offset, true); + if (r < 0) + return r; + } + + if (dest) + memcpy(dest, fio->output, 1 << v->data_dev_block_bits); + else if (iter) { + fio->output_pos = 0; + r = verity_for_bv_block(v, io, iter, fec_bv_copy); + } + + return r; +} + +/* + * Clean up per-bio data. + */ +void verity_fec_finish_io(struct dm_verity_io *io) +{ + unsigned n; + struct dm_verity_fec *f = io->v->fec; + struct dm_verity_fec_io *fio = fec_io(io); + + if (!verity_fec_is_enabled(io->v)) + return; + + mempool_free(fio->rs, f->rs_pool); + + fec_for_each_prealloc_buffer(n) + mempool_free(fio->bufs[n], f->prealloc_pool); + + fec_for_each_extra_buffer(fio, n) + mempool_free(fio->bufs[n], f->extra_pool); + + mempool_free(fio->output, f->output_pool); +} + +/* + * Initialize per-bio data. + */ +void verity_fec_init_io(struct dm_verity_io *io) +{ + struct dm_verity_fec_io *fio = fec_io(io); + + if (!verity_fec_is_enabled(io->v)) + return; + + fio->rs = NULL; + memset(fio->bufs, 0, sizeof(fio->bufs)); + fio->nbufs = 0; + fio->output = NULL; +} + +/* + * Append feature arguments and values to the status table. 
+ */ +unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz, + char *result, unsigned maxlen) +{ + if (!verity_fec_is_enabled(v)) + return sz; + + DMEMIT(" " DM_VERITY_OPT_FEC_DEV " %s " + DM_VERITY_OPT_FEC_BLOCKS " %llu " + DM_VERITY_OPT_FEC_START " %llu " + DM_VERITY_OPT_FEC_ROOTS " %d", + v->fec->dev->name, + (unsigned long long)v->fec->blocks, + (unsigned long long)v->fec->start, + v->fec->roots); + + return sz; +} + +void verity_fec_dtr(struct dm_verity *v) +{ + struct dm_verity_fec *f = v->fec; + + if (!verity_fec_is_enabled(v)) + goto out; + + mempool_destroy(f->rs_pool); + mempool_destroy(f->prealloc_pool); + mempool_destroy(f->extra_pool); + kmem_cache_destroy(f->cache); + + if (f->data_bufio) + dm_bufio_client_destroy(f->data_bufio); + if (f->bufio) + dm_bufio_client_destroy(f->bufio); + + if (f->dev) + dm_put_device(v->ti, f->dev); +out: + kfree(f); + v->fec = NULL; +} + +static void *fec_rs_alloc(gfp_t gfp_mask, void *pool_data) +{ + struct dm_verity *v = (struct dm_verity *)pool_data; + + return init_rs(8, 0x11d, 0, 1, v->fec->roots); +} + +static void fec_rs_free(void *element, void *pool_data) +{ + struct rs_control *rs = (struct rs_control *)element; + + if (rs) + free_rs(rs); +} + +bool verity_is_fec_opt_arg(const char *arg_name) +{ + return (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV) || + !strcasecmp(arg_name, DM_VERITY_OPT_FEC_BLOCKS) || + !strcasecmp(arg_name, DM_VERITY_OPT_FEC_START) || + !strcasecmp(arg_name, DM_VERITY_OPT_FEC_ROOTS)); +} + +int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, + unsigned *argc, const char *arg_name) +{ + int r; + struct dm_target *ti = v->ti; + const char *arg_value; + unsigned long long num_ll; + unsigned char num_c; + char dummy; + + if (!*argc) { + ti->error = "FEC feature arguments require a value"; + return -EINVAL; + } + + arg_value = dm_shift_arg(as); + (*argc)--; + + if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV)) { + r = dm_get_device(ti, arg_value, FMODE_READ, &v->fec->dev); + if (r) { + ti->error = "FEC device lookup failed"; + return r; + } + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_BLOCKS)) { + if (sscanf(arg_value, "%llu%c", &num_ll, &dummy) != 1 || + ((sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) + >> (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll)) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_BLOCKS; + return -EINVAL; + } + v->fec->blocks = num_ll; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_START)) { + if (sscanf(arg_value, "%llu%c", &num_ll, &dummy) != 1 || + ((sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) >> + (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll)) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_START; + return -EINVAL; + } + v->fec->start = num_ll; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_ROOTS)) { + if (sscanf(arg_value, "%hhu%c", &num_c, &dummy) != 1 || !num_c || + num_c < (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MAX_RSN) || + num_c > (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN)) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_ROOTS; + return -EINVAL; + } + v->fec->roots = num_c; + + } else { + ti->error = "Unrecognized verity FEC feature request"; + return -EINVAL; + } + + return 0; +} + +/* + * Allocate dm_verity_fec for v->fec. Must be called before verity_fec_ctr. 
+ */ +int verity_fec_ctr_alloc(struct dm_verity *v) +{ + struct dm_verity_fec *f; + + f = kzalloc(sizeof(struct dm_verity_fec), GFP_KERNEL); + if (!f) { + v->ti->error = "Cannot allocate FEC structure"; + return -ENOMEM; + } + v->fec = f; + + return 0; +} + +/* + * Validate arguments and preallocate memory. Must be called after arguments + * have been parsed using verity_fec_parse_opt_args. + */ +int verity_fec_ctr(struct dm_verity *v) +{ + struct dm_verity_fec *f = v->fec; + struct dm_target *ti = v->ti; + u64 hash_blocks; + + if (!verity_fec_is_enabled(v)) { + verity_fec_dtr(v); + return 0; + } + + /* + * FEC is computed over data blocks, possible metadata, and + * hash blocks. In other words, FEC covers total of fec_blocks + * blocks consisting of the following: + * + * data blocks | hash blocks | metadata (optional) + * + * We allow metadata after hash blocks to support a use case + * where all data is stored on the same device and FEC covers + * the entire area. + * + * If metadata is included, we require it to be available on the + * hash device after the hash blocks. + */ + + hash_blocks = v->hash_blocks - v->hash_start; + + /* + * Require matching block sizes for data and hash devices for + * simplicity. + */ + if (v->data_dev_block_bits != v->hash_dev_block_bits) { + ti->error = "Block sizes must match to use FEC"; + return -EINVAL; + } + + if (!f->roots) { + ti->error = "Missing " DM_VERITY_OPT_FEC_ROOTS; + return -EINVAL; + } + f->rsn = DM_VERITY_FEC_RSM - f->roots; + + if (!f->blocks) { + ti->error = "Missing " DM_VERITY_OPT_FEC_BLOCKS; + return -EINVAL; + } + + f->rounds = f->blocks; + if (sector_div(f->rounds, f->rsn)) + f->rounds++; + + /* + * Due to optional metadata, f->blocks can be larger than + * data_blocks and hash_blocks combined. + */ + if (f->blocks < v->data_blocks + hash_blocks || !f->rounds) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_BLOCKS; + return -EINVAL; + } + + /* + * Metadata is accessed through the hash device, so we require + * it to be large enough. 
+ */ + f->hash_blocks = f->blocks - v->data_blocks; + if (dm_bufio_get_device_size(v->bufio) < f->hash_blocks) { + ti->error = "Hash device is too small for " + DM_VERITY_OPT_FEC_BLOCKS; + return -E2BIG; + } + + f->bufio = dm_bufio_client_create(f->dev->bdev, + 1 << v->data_dev_block_bits, + 1, 0, NULL, NULL); + if (IS_ERR(f->bufio)) { + ti->error = "Cannot initialize FEC bufio client"; + return PTR_ERR(f->bufio); + } + + if (dm_bufio_get_device_size(f->bufio) < + ((f->start + f->rounds * f->roots) >> v->data_dev_block_bits)) { + ti->error = "FEC device is too small"; + return -E2BIG; + } + + f->data_bufio = dm_bufio_client_create(v->data_dev->bdev, + 1 << v->data_dev_block_bits, + 1, 0, NULL, NULL); + if (IS_ERR(f->data_bufio)) { + ti->error = "Cannot initialize FEC data bufio client"; + return PTR_ERR(f->data_bufio); + } + + if (dm_bufio_get_device_size(f->data_bufio) < v->data_blocks) { + ti->error = "Data device is too small"; + return -E2BIG; + } + + /* Preallocate an rs_control structure for each worker thread */ + f->rs_pool = mempool_create(num_online_cpus(), fec_rs_alloc, + fec_rs_free, (void *) v); + if (!f->rs_pool) { + ti->error = "Cannot allocate RS pool"; + return -ENOMEM; + } + + f->cache = kmem_cache_create("dm_verity_fec_buffers", + f->rsn << DM_VERITY_FEC_BUF_RS_BITS, + 0, 0, NULL); + if (!f->cache) { + ti->error = "Cannot create FEC buffer cache"; + return -ENOMEM; + } + + /* Preallocate DM_VERITY_FEC_BUF_PREALLOC buffers for each thread */ + f->prealloc_pool = mempool_create_slab_pool(num_online_cpus() * + DM_VERITY_FEC_BUF_PREALLOC, + f->cache); + if (!f->prealloc_pool) { + ti->error = "Cannot allocate FEC buffer prealloc pool"; + return -ENOMEM; + } + + f->extra_pool = mempool_create_slab_pool(0, f->cache); + if (!f->extra_pool) { + ti->error = "Cannot allocate FEC buffer extra pool"; + return -ENOMEM; + } + + /* Preallocate an output buffer for each thread */ + f->output_pool = mempool_create_kmalloc_pool(num_online_cpus(), + 1 << v->data_dev_block_bits); + if (!f->output_pool) { + ti->error = "Cannot allocate FEC output pool"; + return -ENOMEM; + } + + /* Reserve space for our per-bio data */ + ti->per_bio_data_size += sizeof(struct dm_verity_fec_io); + + return 0; +} diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h new file mode 100644 index 000000000000..7fa0298b995e --- /dev/null +++ b/drivers/md/dm-verity-fec.h @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2015 Google, Inc. + * + * Author: Sami Tolvanen + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ */ + +#ifndef DM_VERITY_FEC_H +#define DM_VERITY_FEC_H + +#include "dm-verity.h" +#include + +/* Reed-Solomon(M, N) parameters */ +#define DM_VERITY_FEC_RSM 255 +#define DM_VERITY_FEC_MAX_RSN 253 +#define DM_VERITY_FEC_MIN_RSN 231 /* ~10% space overhead */ + +/* buffers for deinterleaving and decoding */ +#define DM_VERITY_FEC_BUF_PREALLOC 1 /* buffers to preallocate */ +#define DM_VERITY_FEC_BUF_RS_BITS 4 /* 1 << RS blocks per buffer */ +/* we need buffers for at most 1 << block size RS blocks */ +#define DM_VERITY_FEC_BUF_MAX \ + (1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS)) + +#define DM_VERITY_OPT_FEC_DEV "use_fec_from_device" +#define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks" +#define DM_VERITY_OPT_FEC_START "fec_start" +#define DM_VERITY_OPT_FEC_ROOTS "fec_roots" + +/* configuration */ +struct dm_verity_fec { + struct dm_dev *dev; /* parity data device */ + struct dm_bufio_client *data_bufio; /* for data dev access */ + struct dm_bufio_client *bufio; /* for parity data access */ + sector_t start; /* parity data start in blocks */ + sector_t blocks; /* number of blocks covered */ + sector_t rounds; /* number of interleaving rounds */ + sector_t hash_blocks; /* blocks covered after v->hash_start */ + unsigned char roots; /* number of parity bytes, M-N of RS(M, N) */ + unsigned char rsn; /* N of RS(M, N) */ + mempool_t *rs_pool; /* mempool for fio->rs */ + mempool_t *prealloc_pool; /* mempool for preallocated buffers */ + mempool_t *extra_pool; /* mempool for extra buffers */ + mempool_t *output_pool; /* mempool for output */ + struct kmem_cache *cache; /* cache for buffers */ +}; + +/* per-bio data */ +struct dm_verity_fec_io { + struct rs_control *rs; /* Reed-Solomon state */ + int erasures[DM_VERITY_FEC_MAX_RSN]; /* erasures for decode_rs8 */ + u8 *bufs[DM_VERITY_FEC_BUF_MAX]; /* bufs for deinterleaving */ + unsigned nbufs; /* number of buffers allocated */ + u8 *output; /* buffer for corrected output */ + size_t output_pos; +}; + +#ifdef CONFIG_DM_VERITY_FEC + +/* each feature parameter requires a value */ +#define DM_VERITY_OPTS_FEC 8 + +extern bool verity_fec_is_enabled(struct dm_verity *v); + +extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, + enum verity_block_type type, sector_t block, + u8 *dest, struct bvec_iter *iter); + +extern unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz, + char *result, unsigned maxlen); + +extern void verity_fec_finish_io(struct dm_verity_io *io); +extern void verity_fec_init_io(struct dm_verity_io *io); + +extern bool verity_is_fec_opt_arg(const char *arg_name); +extern int verity_fec_parse_opt_args(struct dm_arg_set *as, + struct dm_verity *v, unsigned *argc, + const char *arg_name); + +extern void verity_fec_dtr(struct dm_verity *v); + +extern int verity_fec_ctr_alloc(struct dm_verity *v); +extern int verity_fec_ctr(struct dm_verity *v); + +#else /* !CONFIG_DM_VERITY_FEC */ + +#define DM_VERITY_OPTS_FEC 0 + +static inline bool verity_fec_is_enabled(struct dm_verity *v) +{ + return false; +} + +static inline int verity_fec_decode(struct dm_verity *v, + struct dm_verity_io *io, + enum verity_block_type type, + sector_t block, u8 *dest, + struct bvec_iter *iter) +{ + return -EOPNOTSUPP; +} + +static inline unsigned verity_fec_status_table(struct dm_verity *v, + unsigned sz, char *result, + unsigned maxlen) +{ + return sz; +} + +static inline void verity_fec_finish_io(struct dm_verity_io *io) +{ +} + +static inline void verity_fec_init_io(struct dm_verity_io *io) +{ +} + +static inline bool 
verity_is_fec_opt_arg(const char *arg_name) +{ + return false; +} + +static inline int verity_fec_parse_opt_args(struct dm_arg_set *as, + struct dm_verity *v, + unsigned *argc, + const char *arg_name) +{ + return -EINVAL; +} + +static inline void verity_fec_dtr(struct dm_verity *v) +{ +} + +static inline int verity_fec_ctr_alloc(struct dm_verity *v) +{ + return 0; +} + +static inline int verity_fec_ctr(struct dm_verity *v) +{ + return 0; +} + +#endif /* CONFIG_DM_VERITY_FEC */ + +#endif /* DM_VERITY_FEC_H */ diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 2b0ee52d1ad8..4f90ec2c6b7a 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -15,6 +15,7 @@ */ #include "dm-verity.h" +#include "dm-verity-fec.h" #include #include @@ -31,7 +32,7 @@ #define DM_VERITY_OPT_LOGGING "ignore_corruption" #define DM_VERITY_OPT_RESTART "restart_on_corruption" -#define DM_VERITY_OPTS_MAX 1 +#define DM_VERITY_OPTS_MAX (1 + DM_VERITY_OPTS_FEC) static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; @@ -282,6 +283,10 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, if (likely(memcmp(verity_io_real_digest(v, io), want_digest, v->digest_size) == 0)) aux->hash_verified = 1; + else if (verity_fec_decode(v, io, + DM_VERITY_BLOCK_TYPE_METADATA, + hash_block, data, NULL) == 0) + aux->hash_verified = 1; else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA, hash_block)) { @@ -411,8 +416,11 @@ static int verity_verify_io(struct dm_verity_io *io) if (likely(memcmp(verity_io_real_digest(v, io), verity_io_want_digest(v, io), v->digest_size) == 0)) continue; + else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, + io->block + b, NULL, &start) == 0) + continue; else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, - io->block + b)) + io->block + b)) return -EIO; } @@ -430,6 +438,8 @@ static void verity_finish_io(struct dm_verity_io *io, int error) bio->bi_end_io = io->orig_bi_end_io; bio->bi_error = error; + verity_fec_finish_io(io); + bio_endio(bio); } @@ -444,7 +454,7 @@ static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_error && !verity_fec_is_enabled(io->v)) { verity_finish_io(io, bio->bi_error); return; } @@ -547,6 +557,8 @@ static int verity_map(struct dm_target *ti, struct bio *bio) bio->bi_private = io; io->iter = bio->bi_iter; + verity_fec_init_io(io); + verity_submit_prefetch(v, io); generic_make_request(bio); @@ -561,6 +573,7 @@ static void verity_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { struct dm_verity *v = ti->private; + unsigned args = 0; unsigned sz = 0; unsigned x; @@ -587,8 +600,15 @@ static void verity_status(struct dm_target *ti, status_type_t type, else for (x = 0; x < v->salt_size; x++) DMEMIT("%02x", v->salt[x]); + if (v->mode != DM_VERITY_MODE_EIO) + args++; + if (verity_fec_is_enabled(v)) + args += DM_VERITY_OPTS_FEC; + if (!args) + return; + DMEMIT(" %u", args); if (v->mode != DM_VERITY_MODE_EIO) { - DMEMIT(" 1 "); + DMEMIT(" "); switch (v->mode) { case DM_VERITY_MODE_LOGGING: DMEMIT(DM_VERITY_OPT_LOGGING); @@ -600,6 +620,7 @@ static void verity_status(struct dm_target *ti, status_type_t type, BUG(); } } + sz = verity_fec_status_table(v, sz, result, maxlen); break; } } @@ -662,6 +683,8 @@ static void verity_dtr(struct dm_target *ti) if (v->data_dev) dm_put_device(ti, v->data_dev); + verity_fec_dtr(v); + kfree(v); } @@ -694,6 +717,12 @@ 
static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) } else if (!strcasecmp(arg_name, DM_VERITY_OPT_RESTART)) { v->mode = DM_VERITY_MODE_RESTART; continue; + + } else if (verity_is_fec_opt_arg(arg_name)) { + r = verity_fec_parse_opt_args(as, v, &argc, arg_name); + if (r) + return r; + continue; } ti->error = "Unrecognized verity feature request"; @@ -736,6 +765,10 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->private = v; v->ti = ti; + r = verity_fec_ctr_alloc(v); + if (r) + goto bad; + if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) { ti->error = "Device must be readonly"; r = -EINVAL; @@ -924,8 +957,6 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } - ti->per_bio_data_size = roundup(sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2, __alignof__(struct dm_verity_io)); - /* WQ_UNBOUND greatly improves performance when running on ramdisk */ v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus()); if (!v->verify_wq) { @@ -934,6 +965,16 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } + ti->per_bio_data_size = sizeof(struct dm_verity_io) + + v->shash_descsize + v->digest_size * 2; + + r = verity_fec_ctr(v); + if (r) + goto bad; + + ti->per_bio_data_size = roundup(ti->per_bio_data_size, + __alignof__(struct dm_verity_io)); + return 0; bad: @@ -944,7 +985,7 @@ bad: static struct target_type verity_target = { .name = "verity", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index f5af52df8e38..8e853722f6c6 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -29,6 +29,8 @@ enum verity_block_type { DM_VERITY_BLOCK_TYPE_METADATA }; +struct dm_verity_fec; + struct dm_verity { struct dm_dev *data_dev; struct dm_dev *hash_dev; @@ -58,6 +60,8 @@ struct dm_verity { /* starting blocks for each tree level. 0 is the lowest level. */ sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; + + struct dm_verity_fec *fec; /* forward error correction */ }; struct dm_verity_io { @@ -103,6 +107,12 @@ static inline u8 *verity_io_want_digest(struct dm_verity *v, return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; } +static inline u8 *verity_io_digest_end(struct dm_verity *v, + struct dm_verity_io *io) +{ + return verity_io_want_digest(v, io) + v->digest_size; +} + extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, struct bvec_iter *iter, int (*process)(struct dm_verity *v, From f75998fe8443541f599455aba93871be1e124b9a Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 3 Dec 2015 14:26:31 +0000 Subject: [PATCH 052/607] UPSTREAM: dm verity: add ignore_zero_blocks feature If ignore_zero_blocks is enabled dm-verity will return zeroes for blocks matching a zero hash without validating the content. 
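As a rough illustration of that flow (an editor's sketch, not part of the patch): the target hashes one all-zero block when it is constructed, and at read time compares each block's expected digest from the hash tree against that precomputed value; on a match it skips the data read and hands back zeroes. The stand-alone C sketch below uses FNV-1a and a fixed 4 KiB block purely as stand-ins for the real hash algorithm and block size configured in the verity table.

/*
 * Sketch of the ignore_zero_blocks idea, not the kernel code: hash an
 * all-zero block once, then compare each block's *expected* digest against
 * that value and return zeroes without touching the data device on a match.
 * fnv1a() is only a stand-in for the real (cryptographic) hash.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BLOCK_SIZE 4096

static uint64_t fnv1a(const unsigned char *p, size_t len)
{
	uint64_t h = 1469598103934665603ULL;

	while (len--) {
		h ^= *p++;
		h *= 1099511628211ULL;
	}
	return h;
}

static uint64_t zero_digest;

/* done once at table construction time */
static void precompute_zero_digest(void)
{
	static const unsigned char zero_block[BLOCK_SIZE];

	zero_digest = fnv1a(zero_block, sizeof(zero_block));
}

/* per-block read path: true means "skip verification, buffer is all zeroes" */
static bool short_circuit_zero_block(uint64_t want_digest, unsigned char *out)
{
	if (want_digest != zero_digest)
		return false;
	memset(out, 0, BLOCK_SIZE);
	return true;
}

int main(void)
{
	unsigned char buf[BLOCK_SIZE];

	precompute_zero_digest();
	if (short_circuit_zero_block(zero_digest, buf))
		printf("expected digest matched the zero digest: returned zeroes\n");
	return 0;
}

In the actual diff below, the precomputed digest lives in v->zero_digest, the comparison is done in verity_hash_for_block() through the new is_zero flag, and verity_bv_zero() fills the bio data with zeroes.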
Change-Id: I728fa4b2586b29f2793ea5cb014289892819d249 Signed-off-by: Sami Tolvanen Signed-off-by: Mike Snitzer (cherry picked from commit 0cc37c2df4fa0aa702f9662edce4b7ce12c86b7a) --- Documentation/device-mapper/verity.txt | 5 ++ drivers/md/dm-verity-fec.c | 8 ++- drivers/md/dm-verity-target.c | 87 +++++++++++++++++++++++--- drivers/md/dm-verity.h | 3 +- 4 files changed, 93 insertions(+), 10 deletions(-) diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt index d602c801ff59..89fd8f9a259f 100644 --- a/Documentation/device-mapper/verity.txt +++ b/Documentation/device-mapper/verity.txt @@ -79,6 +79,11 @@ restart_on_corruption not compatible with ignore_corruption and requires user space support to avoid restart loops. +ignore_zero_blocks + Do not verify blocks that are expected to contain zeroes and always return + zeroes instead. This may be useful if the partition contains unused blocks + that are not guaranteed to contain zeroes. + use_fec_from_device Use forward error correction (FEC) to recover from corruption if hash verification fails. Use encoding data from the specified device. This diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 88143d36a1d2..1cc10c4de701 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -205,6 +205,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, u64 rsb, u64 target, unsigned block_offset, int *neras) { + bool is_zero; int i, j, target_index = -1; struct dm_buffer *buf; struct dm_bufio_client *bufio; @@ -264,7 +265,12 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, /* locate erasures if the block is on the data device */ if (bufio == v->fec->data_bufio && - verity_hash_for_block(v, io, block, want_digest) == 0) { + verity_hash_for_block(v, io, block, want_digest, + &is_zero) == 0) { + /* skip known zero blocks entirely */ + if (is_zero) + continue; + /* * skip if we have already found the theoretical * maximum number (i.e. fec->roots) of erasures diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 4f90ec2c6b7a..5c5d30cb6ec5 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -31,8 +31,9 @@ #define DM_VERITY_OPT_LOGGING "ignore_corruption" #define DM_VERITY_OPT_RESTART "restart_on_corruption" +#define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks" -#define DM_VERITY_OPTS_MAX (1 + DM_VERITY_OPTS_FEC) +#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC) static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; @@ -309,10 +310,9 @@ release_ret_r: * of the hash tree if necessary. 
*/ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, - sector_t block, u8 *digest) + sector_t block, u8 *digest, bool *is_zero) { - int i; - int r; + int r = 0, i; if (likely(v->levels)) { /* @@ -324,7 +324,7 @@ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, */ r = verity_verify_level(v, io, block, 0, true, digest); if (likely(r <= 0)) - return r; + goto out; } memcpy(digest, v->root_digest, v->digest_size); @@ -332,10 +332,15 @@ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, for (i = v->levels - 1; i >= 0; i--) { r = verity_verify_level(v, io, block, i, false, digest); if (unlikely(r)) - return r; + goto out; } +out: + if (!r && v->zero_digest) + *is_zero = !memcmp(v->zero_digest, digest, v->digest_size); + else + *is_zero = false; - return 0; + return r; } /* @@ -382,11 +387,19 @@ static int verity_bv_hash_update(struct dm_verity *v, struct dm_verity_io *io, return verity_hash_update(v, verity_io_hash_desc(v, io), data, len); } +static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io, + u8 *data, size_t len) +{ + memset(data, 0, len); + return 0; +} + /* * Verify one "dm_verity_io" structure. */ static int verity_verify_io(struct dm_verity_io *io) { + bool is_zero; struct dm_verity *v = io->v; struct bvec_iter start; unsigned b; @@ -396,10 +409,24 @@ static int verity_verify_io(struct dm_verity_io *io) struct shash_desc *desc = verity_io_hash_desc(v, io); r = verity_hash_for_block(v, io, io->block + b, - verity_io_want_digest(v, io)); + verity_io_want_digest(v, io), + &is_zero); if (unlikely(r < 0)) return r; + if (is_zero) { + /* + * If we expect a zero block, don't validate, just + * return zeros. + */ + r = verity_for_bv_block(v, io, &io->iter, + verity_bv_zero); + if (unlikely(r < 0)) + return r; + + continue; + } + r = verity_hash_init(v, desc); if (unlikely(r < 0)) return r; @@ -604,6 +631,8 @@ static void verity_status(struct dm_target *ti, status_type_t type, args++; if (verity_fec_is_enabled(v)) args += DM_VERITY_OPTS_FEC; + if (v->zero_digest) + args++; if (!args) return; DMEMIT(" %u", args); @@ -620,6 +649,8 @@ static void verity_status(struct dm_target *ti, status_type_t type, BUG(); } } + if (v->zero_digest) + DMEMIT(" " DM_VERITY_OPT_IGN_ZEROES); sz = verity_fec_status_table(v, sz, result, maxlen); break; } @@ -671,6 +702,7 @@ static void verity_dtr(struct dm_target *ti) kfree(v->salt); kfree(v->root_digest); + kfree(v->zero_digest); if (v->tfm) crypto_free_shash(v->tfm); @@ -688,6 +720,37 @@ static void verity_dtr(struct dm_target *ti) kfree(v); } +static int verity_alloc_zero_digest(struct dm_verity *v) +{ + int r = -ENOMEM; + struct shash_desc *desc; + u8 *zero_data; + + v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL); + + if (!v->zero_digest) + return r; + + desc = kmalloc(v->shash_descsize, GFP_KERNEL); + + if (!desc) + return r; /* verity_dtr will free zero_digest */ + + zero_data = kzalloc(1 << v->data_dev_block_bits, GFP_KERNEL); + + if (!zero_data) + goto out; + + r = verity_hash(v, desc, zero_data, 1 << v->data_dev_block_bits, + v->zero_digest); + +out: + kfree(desc); + kfree(zero_data); + + return r; +} + static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) { int r; @@ -718,6 +781,14 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) v->mode = DM_VERITY_MODE_RESTART; continue; + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_IGN_ZEROES)) { + r = verity_alloc_zero_digest(v); + if (r) { + ti->error = "Cannot allocate zero 
digest"; + return r; + } + continue; + } else if (verity_is_fec_opt_arg(arg_name)) { r = verity_fec_parse_opt_args(as, v, &argc, arg_name); if (r) diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 8e853722f6c6..fb419f422d73 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -40,6 +40,7 @@ struct dm_verity { struct crypto_shash *tfm; u8 *root_digest; /* digest of the root block */ u8 *salt; /* salt: its size is salt_size */ + u8 *zero_digest; /* digest for a zero block */ unsigned salt_size; sector_t data_start; /* data offset in 512-byte sectors */ sector_t hash_start; /* hash start in blocks */ @@ -123,6 +124,6 @@ extern int verity_hash(struct dm_verity *v, struct shash_desc *desc, const u8 *data, size_t len, u8 *digest); extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, - sector_t block, u8 *digest); + sector_t block, u8 *digest, bool *is_zero); #endif /* DM_VERITY_H */ From fcd614efb6c4dd4c4f543feefa6be7f8e49dcc09 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 16 Dec 2015 16:23:49 +0000 Subject: [PATCH 053/607] ANDROID: android: base-cfg: enable CONFIG_DM_VERITY_FEC Bug: 21893453 Change-Id: Idd0dfe4e3e527df2eff2f0d734effc40dce294c7 Signed-off-by: Sami Tolvanen (cherry picked from commit 9408350ed80005174918ce5147490035b2cf451b) --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index a4cffb1840d7..220ddb5304df 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -21,6 +21,7 @@ CONFIG_CGROUP_SCHED=y CONFIG_CP15_BARRIER_EMULATION=y CONFIG_DM_CRYPT=y CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y CONFIG_EMBEDDED=y CONFIG_FB=y CONFIG_HIGH_RES_TIMERS=y From e7d40d2bf8b2991f051734dd45a95385644f8147 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 30 Mar 2016 14:10:13 -0700 Subject: [PATCH 054/607] ANDROID: dm verity fec: add sysfs attribute fec/corrected Add a sysfs entry that allows user space to determine whether dm-verity has come across correctable errors on the underlying block device. 
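From user space the counter can simply be polled; a minimal sketch follows. The path /sys/block/dm-0/fec/corrected is an assumption based on the "fec" kobject being registered under the mapped device's disk in this patch, so substitute the dm node that actually backs the verity target.

/* Poll the (assumed) /sys/block/dm-<N>/fec/corrected attribute once. */
#include <stdio.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/sys/block/dm-0/fec/corrected";
	FILE *f = fopen(path, "r");
	long corrected;

	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%ld", &corrected) != 1) {
		fprintf(stderr, "unexpected contents in %s\n", path);
		fclose(f);
		return 1;
	}
	fclose(f);

	if (corrected > 0)
		printf("dm-verity corrected %ld error(s); check the device\n",
		       corrected);
	else
		printf("no corrected errors reported\n");
	return 0;
}

Because the patch only increments the counter up to INT_MAX (atomic_add_unless), a monitor mostly needs to notice that the value became non-zero rather than track exact deltas.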
Bug: 22655252 Bug: 27928374 Change-Id: I80547a2aa944af2fb9ffde002650482877ade31b Signed-off-by: Sami Tolvanen (cherry picked from commit 7911fad5f0a2cf5afc2215657219a21e6630e001) --- drivers/md/dm-verity-fec.c | 45 +++++++++++++++++++++++++++++++++++++- drivers/md/dm-verity-fec.h | 3 +++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 1cc10c4de701..ad10d6d8ed28 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -11,6 +11,7 @@ #include "dm-verity-fec.h" #include +#include #define DM_MSG_PREFIX "verity-fec" @@ -175,9 +176,11 @@ error: if (r < 0 && neras) DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", v->data_dev->name, (unsigned long long)rsb, r); - else if (r > 0) + else if (r > 0) { DMWARN_LIMIT("%s: FEC %llu: corrected %d errors", v->data_dev->name, (unsigned long long)rsb, r); + atomic_add_unless(&v->fec->corrected, 1, INT_MAX); + } return r; } @@ -548,6 +551,7 @@ unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz, void verity_fec_dtr(struct dm_verity *v) { struct dm_verity_fec *f = v->fec; + struct kobject *kobj = &f->kobj_holder.kobj; if (!verity_fec_is_enabled(v)) goto out; @@ -564,6 +568,12 @@ void verity_fec_dtr(struct dm_verity *v) if (f->dev) dm_put_device(v->ti, f->dev); + + if (kobj->state_initialized) { + kobject_put(kobj); + wait_for_completion(dm_get_completion_from_kobject(kobj)); + } + out: kfree(f); v->fec = NULL; @@ -652,6 +662,27 @@ int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, return 0; } +static ssize_t corrected_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct dm_verity_fec *f = container_of(kobj, struct dm_verity_fec, + kobj_holder.kobj); + + return sprintf(buf, "%d\n", atomic_read(&f->corrected)); +} + +static struct kobj_attribute attr_corrected = __ATTR_RO(corrected); + +static struct attribute *fec_attrs[] = { + &attr_corrected.attr, + NULL +}; + +static struct kobj_type fec_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = fec_attrs +}; + /* * Allocate dm_verity_fec for v->fec. Must be called before verity_fec_ctr. */ @@ -675,8 +706,10 @@ int verity_fec_ctr_alloc(struct dm_verity *v) */ int verity_fec_ctr(struct dm_verity *v) { + int r; struct dm_verity_fec *f = v->fec; struct dm_target *ti = v->ti; + struct mapped_device *md = dm_table_get_md(ti->table); u64 hash_blocks; if (!verity_fec_is_enabled(v)) { @@ -684,6 +717,16 @@ int verity_fec_ctr(struct dm_verity *v) return 0; } + /* Create a kobject and sysfs attributes */ + init_completion(&f->kobj_holder.completion); + + r = kobject_init_and_add(&f->kobj_holder.kobj, &fec_ktype, + &disk_to_dev(dm_disk(md))->kobj, "%s", "fec"); + if (r) { + ti->error = "Cannot create kobject"; + return r; + } + /* * FEC is computed over data blocks, possible metadata, and * hash blocks. 
In other words, FEC covers total of fec_blocks diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 7fa0298b995e..8c4bee052a73 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -12,6 +12,7 @@ #ifndef DM_VERITY_FEC_H #define DM_VERITY_FEC_H +#include "dm.h" #include "dm-verity.h" #include @@ -48,6 +49,8 @@ struct dm_verity_fec { mempool_t *extra_pool; /* mempool for extra buffers */ mempool_t *output_pool; /* mempool for output */ struct kmem_cache *cache; /* cache for buffers */ + atomic_t corrected; /* corrected errors */ + struct dm_kobject_holder kobj_holder; /* for sysfs attributes */ }; /* per-bio data */ From 2ebb4d97e0c9afeaa45c92f6df2d169d25b6db0b Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 4 Feb 2015 13:54:48 -0800 Subject: [PATCH 055/607] cpufreq: interactive: fix policy locking cpufreq_interactive_speedchange_task() is running as a separate kernel thread and is calling __cpufreq_driver_target(), which requires callers to hold policy->rwsem for writing to prevent racing with other parts of the kernel trying to adjust the frequency, for example kernel thermal throttling. Let's change the code to take policy->rwsem and while at it refactor the code a bit. This was originally 2 changes reviewed at: https://chromium-review.googlesource.com/246273 https://chromium-review.googlesource.com/256120 Change-Id: Icc2d97c6c1b929acd2ee32e8c81d81fd2af778ab Signed-off-by: Dmitry Torokhov Reviewed-by: Dylan Reid Reviewed-by: Douglas Anderson Signed-off-by: Dmitry Torokhov --- drivers/cpufreq/cpufreq_interactive.c | 99 ++++++++++++++++----------- 1 file changed, 60 insertions(+), 39 deletions(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index 0be66df4a6e6..5224e897d460 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -511,6 +511,58 @@ static void cpufreq_interactive_idle_end(void) up_read(&pcpu->enable_sem); } +static void cpufreq_interactive_get_policy_info(struct cpufreq_policy *policy, + unsigned int *pmax_freq, + u64 *phvt, u64 *pfvt) +{ + struct cpufreq_interactive_cpuinfo *pcpu; + unsigned int max_freq = 0; + u64 hvt = ~0ULL, fvt = 0; + unsigned int i; + + for_each_cpu(i, policy->cpus) { + pcpu = &per_cpu(cpuinfo, i); + + fvt = max(fvt, pcpu->loc_floor_val_time); + if (pcpu->target_freq > max_freq) { + max_freq = pcpu->target_freq; + hvt = pcpu->loc_hispeed_val_time; + } else if (pcpu->target_freq == max_freq) { + hvt = min(hvt, pcpu->loc_hispeed_val_time); + } + } + + *pmax_freq = max_freq; + *phvt = hvt; + *pfvt = fvt; +} + +static void cpufreq_interactive_adjust_cpu(unsigned int cpu, + struct cpufreq_policy *policy) +{ + struct cpufreq_interactive_cpuinfo *pcpu; + u64 hvt, fvt; + unsigned int max_freq; + int i; + + cpufreq_interactive_get_policy_info(policy, &max_freq, &hvt, &fvt); + + for_each_cpu(i, policy->cpus) { + pcpu = &per_cpu(cpuinfo, i); + pcpu->pol_floor_val_time = fvt; + } + + if (max_freq != policy->cur) { + __cpufreq_driver_target(policy, max_freq, CPUFREQ_RELATION_H); + for_each_cpu(i, policy->cpus) { + pcpu = &per_cpu(cpuinfo, i); + pcpu->pol_hispeed_val_time = hvt; + } + } + + trace_cpufreq_interactive_setspeed(cpu, max_freq, policy->cur); +} + static int cpufreq_interactive_speedchange_task(void *data) { unsigned int cpu; @@ -539,49 +591,18 @@ static int cpufreq_interactive_speedchange_task(void *data) spin_unlock_irqrestore(&speedchange_cpumask_lock, flags); for_each_cpu(cpu, &tmp_mask) { - unsigned int j; - 
unsigned int max_freq = 0; - struct cpufreq_interactive_cpuinfo *pjcpu; - u64 hvt = ~0ULL, fvt = 0; - pcpu = &per_cpu(cpuinfo, cpu); - if (!down_read_trylock(&pcpu->enable_sem)) - continue; - if (!pcpu->governor_enabled) { + + down_write(&pcpu->policy->rwsem); + + if (likely(down_read_trylock(&pcpu->enable_sem))) { + if (likely(pcpu->governor_enabled)) + cpufreq_interactive_adjust_cpu(cpu, + pcpu->policy); up_read(&pcpu->enable_sem); - continue; } - for_each_cpu(j, pcpu->policy->cpus) { - pjcpu = &per_cpu(cpuinfo, j); - - fvt = max(fvt, pjcpu->loc_floor_val_time); - if (pjcpu->target_freq > max_freq) { - max_freq = pjcpu->target_freq; - hvt = pjcpu->loc_hispeed_val_time; - } else if (pjcpu->target_freq == max_freq) { - hvt = min(hvt, pjcpu->loc_hispeed_val_time); - } - } - for_each_cpu(j, pcpu->policy->cpus) { - pjcpu = &per_cpu(cpuinfo, j); - pjcpu->pol_floor_val_time = fvt; - } - - if (max_freq != pcpu->policy->cur) { - __cpufreq_driver_target(pcpu->policy, - max_freq, - CPUFREQ_RELATION_H); - for_each_cpu(j, pcpu->policy->cpus) { - pjcpu = &per_cpu(cpuinfo, j); - pjcpu->pol_hispeed_val_time = hvt; - } - } - trace_cpufreq_interactive_setspeed(cpu, - pcpu->target_freq, - pcpu->policy->cur); - - up_read(&pcpu->enable_sem); + up_write(&pcpu->policy->rwsem); } } From 440a57717bcc2a86a5e9800bce62e14d8115a001 Mon Sep 17 00:00:00 2001 From: Daniel Kurtz Date: Thu, 28 May 2015 12:08:11 +0800 Subject: [PATCH 056/607] cpufreq: interactive: only apply interactive boost when enabled Only apply the interactive boost when the interactive governor is enabled. This seems like the right thing to do. This was originally reviewed on https://chromium-review.googlesource.com/273501 Change-Id: I5f4a7320683eada099f9a4253e3d6b0f03057fe8 Signed-off-by: Daniel Kurtz Reviewed-by: Douglas Anderson Signed-off-by: Dmitry Torokhov --- drivers/cpufreq/cpufreq_interactive.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index 5224e897d460..bd83be39f17b 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -622,9 +622,20 @@ static void cpufreq_interactive_boost(struct cpufreq_interactive_tunables *tunab for_each_online_cpu(i) { pcpu = &per_cpu(cpuinfo, i); - if (tunables != pcpu->policy->governor_data) + + if (!down_read_trylock(&pcpu->enable_sem)) continue; + if (!pcpu->governor_enabled) { + up_read(&pcpu->enable_sem); + continue; + } + + if (tunables != pcpu->policy->governor_data) { + up_read(&pcpu->enable_sem); + continue; + } + spin_lock_irqsave(&pcpu->target_freq_lock, flags[1]); if (pcpu->target_freq < tunables->hispeed_freq) { pcpu->target_freq = tunables->hispeed_freq; @@ -634,6 +645,8 @@ static void cpufreq_interactive_boost(struct cpufreq_interactive_tunables *tunab anyboost = 1; } spin_unlock_irqrestore(&pcpu->target_freq_lock, flags[1]); + + up_read(&pcpu->enable_sem); } spin_unlock_irqrestore(&speedchange_cpumask_lock, flags[0]); From 03fbd079bac71e15a414082cb5aee980ce2935be Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Thu, 31 Mar 2016 13:21:09 -0700 Subject: [PATCH 057/607] android: base-cfg: Add CONFIG_INET_DIAG_DESTROY Change-Id: I67430b05eca8fd520d2795d3db60faf2ec0fab9e Signed-off-by: Dmitry Shmidt --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 220ddb5304df..fa53af0c37ad 100644 --- a/android/configs/android-base.cfg +++ 
b/android/configs/android-base.cfg @@ -29,6 +29,7 @@ CONFIG_INET6_AH=y CONFIG_INET6_ESP=y CONFIG_INET6_IPCOMP=y CONFIG_INET=y +CONFIG_INET_DIAG_DESTROY=y CONFIG_INET_ESP=y CONFIG_INET_XFRM_MODE_TUNNEL=y CONFIG_IP6_NF_FILTER=y From d68354b3b7ef8b8a1d9cf0ba071123ed62fe567f Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Thu, 28 Jan 2016 11:12:25 -0800 Subject: [PATCH 058/607] ANDROID: mmc: Add CONFIG_MMC_SIMULATE_MAX_SPEED When CONFIG_MMC_SIMULATE_MAX_SPEED is enabled, Expose max_read_speed, max_write_speed and cache_size default module parameters and sysfs controls to simulate a slow eMMC device. Default values are 0 (off), 0 (off) and 4 MB respectively. Signed-off-by: Mark Salyzyn Bug: 26976972 Change-Id: I342bfbd8b85f9b790e3f0e1e4e51a900ae07e05d --- Documentation/block/00-INDEX | 6 + Documentation/block/mmc-max-speed.txt | 38 ++++ drivers/mmc/card/Kconfig | 12 ++ drivers/mmc/card/block.c | 300 ++++++++++++++++++++++++++ drivers/mmc/card/queue.h | 8 + 5 files changed, 364 insertions(+) create mode 100644 Documentation/block/mmc-max-speed.txt diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX index e840b47613f7..bc5148757edb 100644 --- a/Documentation/block/00-INDEX +++ b/Documentation/block/00-INDEX @@ -26,3 +26,9 @@ switching-sched.txt - Switching I/O schedulers at runtime writeback_cache_control.txt - Control of volatile write back caches +mmc-max-speed.txt + - eMMC layer speed simulation, related to /sys/block/mmcblk*/ + attributes: + max_read_speed + max_write_speed + cache_size diff --git a/Documentation/block/mmc-max-speed.txt b/Documentation/block/mmc-max-speed.txt new file mode 100644 index 000000000000..3f052b9fb999 --- /dev/null +++ b/Documentation/block/mmc-max-speed.txt @@ -0,0 +1,38 @@ +eMMC Block layer simulation speed controls in /sys/block/mmcblk*/ +=============================================== + +Turned on with CONFIG_MMC_SIMULATE_MAX_SPEED which enables MMC device speed +limiting. Used to test and simulate the behavior of the system when +confronted with a slow MMC. + +Enables max_read_speed, max_write_speed and cache_size attributes and module +default parameters to control the write or read maximum KB/second speed +behaviors. + +NB: There is room for improving the algorithm for aspects tied directly to +eMMC specific behavior. For instance, wear leveling and stalls from an +exhausted erase pool. We would expect that if there was a need to provide +similar speed simulation controls to other types of block devices, aspects of +their behavior are modelled separately (e.g. head seek times, heat assist, +shingling and rotational latency). + +/sys/block/mmcblk0/max_read_speed: + +Number of KB/second reads allowed to the block device. Used to test and +simulate the behavior of the system when confronted with a slow reading MMC. +Set to 0 or "off" to place no speed limit. + +/sys/block/mmcblk0/max_write_speed: + +Number of KB/second writes allowed to the block device. Used to test and +simulate the behavior of the system when confronted with a slow writing MMC. +Set to 0 or "off" to place no speed limit. + +/sys/block/mmcblk0/cache_size: + +Number of MB of high speed memory or high speed SLC cache expected on the +eMMC device being simulated. Used to help simulate the write-back behavior +more accurately. The assumption is the cache has no delay, but draws down +in the background to the MLC/TLC primary store at the max_write_speed rate. +Any write speed delays will show up when the cache is full, or when an I/O +request to flush is issued. 
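A quick way to exercise the attributes documented above from user space (an editor's sketch, not part of the patch; it assumes the paths exactly as documented, CONFIG_MMC_SIMULATE_MAX_SPEED=y, and root permissions):

/* Cap simulated read speed to 1024 KB/s and read the value back. */
#include <stdio.h>

int main(void)
{
	const char *attr = "/sys/block/mmcblk0/max_read_speed";
	char buf[32];
	FILE *f = fopen(attr, "w");

	if (!f) {
		perror(attr);	/* attribute missing or insufficient permissions */
		return 1;
	}
	fprintf(f, "1024\n");	/* 1024 KB/s */
	fclose(f);

	f = fopen(attr, "r");
	if (f && fgets(buf, sizeof(buf), f))
		printf("%s: %s", attr, buf);	/* expect "1024KB/s" */
	if (f)
		fclose(f);
	return 0;
}

Writing 0 or "off" to the same attribute removes the limit again, as described above.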
diff --git a/drivers/mmc/card/Kconfig b/drivers/mmc/card/Kconfig index 5562308699bc..6142ec1b9dfb 100644 --- a/drivers/mmc/card/Kconfig +++ b/drivers/mmc/card/Kconfig @@ -68,3 +68,15 @@ config MMC_TEST This driver is only of interest to those developing or testing a host driver. Most people should say N here. + +config MMC_SIMULATE_MAX_SPEED + bool "Turn on maximum speed control per block device" + depends on MMC_BLOCK + help + Say Y here to enable MMC device speed limiting. Used to test and + simulate the behavior of the system when confronted with a slow MMC. + + Enables max_read_speed, max_write_speed and cache_size attributes to + control the write or read maximum KB/second speed behaviors. + + If unsure, say N here. diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index f2ce13ab7ae6..c15d3f380524 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -288,6 +288,250 @@ out: return ret; } +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + +static int max_read_speed, max_write_speed, cache_size = 4; + +module_param(max_read_speed, int, S_IRUSR | S_IRGRP); +MODULE_PARM_DESC(max_read_speed, "maximum KB/s read speed 0=off"); +module_param(max_write_speed, int, S_IRUSR | S_IRGRP); +MODULE_PARM_DESC(max_write_speed, "maximum KB/s write speed 0=off"); +module_param(cache_size, int, S_IRUSR | S_IRGRP); +MODULE_PARM_DESC(cache_size, "MB high speed memory or SLC cache"); + +/* + * helper macros and expectations: + * size - unsigned long number of bytes + * jiffies - unsigned long HZ timestamp difference + * speed - unsigned KB/s transfer rate + */ +#define size_and_speed_to_jiffies(size, speed) \ + ((size) * HZ / (speed) / 1024UL) +#define jiffies_and_speed_to_size(jiffies, speed) \ + (((speed) * (jiffies) * 1024UL) / HZ) +#define jiffies_and_size_to_speed(jiffies, size) \ + ((size) * HZ / (jiffies) / 1024UL) + +/* Limits to report warning */ +/* jiffies_and_size_to_speed(10*HZ, queue_max_hw_sectors(q) * 512UL) ~ 25 */ +#define MIN_SPEED(q) 250 /* 10 times faster than a floppy disk */ +#define MAX_SPEED(q) jiffies_and_size_to_speed(1, queue_max_sectors(q) * 512UL) + +#define speed_valid(speed) ((speed) > 0) + +static const char off[] = "off\n"; + +static int max_speed_show(int speed, char *buf) +{ + if (speed) + return scnprintf(buf, PAGE_SIZE, "%uKB/s\n", speed); + else + return scnprintf(buf, PAGE_SIZE, off); +} + +static int max_speed_store(const char *buf, struct request_queue *q) +{ + unsigned int limit, set = 0; + + if (!strncasecmp(off, buf, sizeof(off) - 2)) + return set; + if (kstrtouint(buf, 0, &set) || (set > INT_MAX)) + return -EINVAL; + if (set == 0) + return set; + limit = MAX_SPEED(q); + if (set > limit) + pr_warn("max speed %u ineffective above %u\n", set, limit); + limit = MIN_SPEED(q); + if (set < limit) + pr_warn("max speed %u painful below %u\n", set, limit); + return set; +} + +static ssize_t max_write_speed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + int ret = max_speed_show(atomic_read(&md->queue.max_write_speed), buf); + + mmc_blk_put(md); + return ret; +} + +static ssize_t max_write_speed_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + int set = max_speed_store(buf, md->queue.queue); + + if (set < 0) { + mmc_blk_put(md); + return set; + } + + atomic_set(&md->queue.max_write_speed, set); + mmc_blk_put(md); + return count; +} + +static const 
DEVICE_ATTR(max_write_speed, S_IRUGO | S_IWUSR, + max_write_speed_show, max_write_speed_store); + +static ssize_t max_read_speed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + int ret = max_speed_show(atomic_read(&md->queue.max_read_speed), buf); + + mmc_blk_put(md); + return ret; +} + +static ssize_t max_read_speed_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + int set = max_speed_store(buf, md->queue.queue); + + if (set < 0) { + mmc_blk_put(md); + return set; + } + + atomic_set(&md->queue.max_read_speed, set); + mmc_blk_put(md); + return count; +} + +static const DEVICE_ATTR(max_read_speed, S_IRUGO | S_IWUSR, + max_read_speed_show, max_read_speed_store); + +static ssize_t cache_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + struct mmc_queue *mq = &md->queue; + int cache_size = atomic_read(&mq->cache_size); + int ret; + + if (!cache_size) + ret = scnprintf(buf, PAGE_SIZE, off); + else { + int speed = atomic_read(&mq->max_write_speed); + + if (!speed_valid(speed)) + ret = scnprintf(buf, PAGE_SIZE, "%uMB\n", cache_size); + else { /* We accept race between cache_jiffies and cache_used */ + unsigned long size = jiffies_and_speed_to_size( + jiffies - mq->cache_jiffies, speed); + long used = atomic_long_read(&mq->cache_used); + + if (size >= used) + size = 0; + else + size = (used - size) * 100 / cache_size + / 1024UL / 1024UL; + + ret = scnprintf(buf, PAGE_SIZE, "%uMB %lu%% used\n", + cache_size, size); + } + } + + mmc_blk_put(md); + return ret; +} + +static ssize_t cache_size_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_blk_data *md; + unsigned int set = 0; + + if (strncasecmp(off, buf, sizeof(off) - 2) + && (kstrtouint(buf, 0, &set) || (set > INT_MAX))) + return -EINVAL; + + md = mmc_blk_get(dev_to_disk(dev)); + atomic_set(&md->queue.cache_size, set); + mmc_blk_put(md); + return count; +} + +static const DEVICE_ATTR(cache_size, S_IRUGO | S_IWUSR, + cache_size_show, cache_size_store); + +/* correct for write-back */ +static long mmc_blk_cache_used(struct mmc_queue *mq, unsigned long waitfor) +{ + long used = 0; + int speed = atomic_read(&mq->max_write_speed); + + if (speed_valid(speed)) { + unsigned long size = jiffies_and_speed_to_size( + waitfor - mq->cache_jiffies, speed); + used = atomic_long_read(&mq->cache_used); + + if (size >= used) + used = 0; + else + used -= size; + } + + atomic_long_set(&mq->cache_used, used); + mq->cache_jiffies = waitfor; + + return used; +} + +static void mmc_blk_simulate_delay( + struct mmc_queue *mq, + struct request *req, + unsigned long waitfor) +{ + int max_speed; + + if (!req) + return; + + max_speed = (rq_data_dir(req) == READ) + ? 
atomic_read(&mq->max_read_speed) + : atomic_read(&mq->max_write_speed); + if (speed_valid(max_speed)) { + unsigned long bytes = blk_rq_bytes(req); + + if (rq_data_dir(req) != READ) { + int cache_size = atomic_read(&mq->cache_size); + + if (cache_size) { + unsigned long size = cache_size * 1024L * 1024L; + long used = mmc_blk_cache_used(mq, waitfor); + + used += bytes; + atomic_long_set(&mq->cache_used, used); + bytes = 0; + if (used > size) + bytes = used - size; + } + } + waitfor += size_and_speed_to_jiffies(bytes, max_speed); + if (time_is_after_jiffies(waitfor)) { + long msecs = jiffies_to_msecs(waitfor - jiffies); + + if (likely(msecs > 0)) + msleep(msecs); + } + } +} + +#else + +#define mmc_blk_simulate_delay(mq, req, waitfor) + +#endif + static int mmc_blk_open(struct block_device *bdev, fmode_t mode) { struct mmc_blk_data *md = mmc_blk_get(bdev->bd_disk); @@ -1264,6 +1508,23 @@ static int mmc_blk_issue_flush(struct mmc_queue *mq, struct request *req) if (ret) ret = -EIO; +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + else if (atomic_read(&mq->cache_size)) { + long used = mmc_blk_cache_used(mq, jiffies); + + if (used) { + int speed = atomic_read(&mq->max_write_speed); + + if (speed_valid(speed)) { + unsigned long msecs = jiffies_to_msecs( + size_and_speed_to_jiffies( + used, speed)); + if (msecs) + msleep(msecs); + } + } + } +#endif blk_end_request_all(req, ret); return ret ? 0 : 1; @@ -1943,6 +2204,9 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc) struct mmc_async_req *areq; const u8 packed_nr = 2; u8 reqs = 0; +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + unsigned long waitfor = jiffies; +#endif if (!rqc && !mq->mqrq_prev->req) return 0; @@ -1993,6 +2257,8 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc) */ mmc_blk_reset_success(md, type); + mmc_blk_simulate_delay(mq, rqc, waitfor); + if (mmc_packed_cmd(mq_rq->cmd_type)) { ret = mmc_blk_end_packed_req(mq_rq); break; @@ -2411,6 +2677,14 @@ static void mmc_blk_remove_req(struct mmc_blk_data *md) card->ext_csd.boot_ro_lockable) device_remove_file(disk_to_dev(md->disk), &md->power_ro_lock); +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + device_remove_file(disk_to_dev(md->disk), + &dev_attr_max_write_speed); + device_remove_file(disk_to_dev(md->disk), + &dev_attr_max_read_speed); + device_remove_file(disk_to_dev(md->disk), + &dev_attr_cache_size); +#endif del_gendisk(md->disk); } @@ -2446,6 +2720,24 @@ static int mmc_add_disk(struct mmc_blk_data *md) ret = device_create_file(disk_to_dev(md->disk), &md->force_ro); if (ret) goto force_ro_fail; +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + atomic_set(&md->queue.max_write_speed, max_write_speed); + ret = device_create_file(disk_to_dev(md->disk), + &dev_attr_max_write_speed); + if (ret) + goto max_write_speed_fail; + atomic_set(&md->queue.max_read_speed, max_read_speed); + ret = device_create_file(disk_to_dev(md->disk), + &dev_attr_max_read_speed); + if (ret) + goto max_read_speed_fail; + atomic_set(&md->queue.cache_size, cache_size); + atomic_long_set(&md->queue.cache_used, 0); + md->queue.cache_jiffies = jiffies; + ret = device_create_file(disk_to_dev(md->disk), &dev_attr_cache_size); + if (ret) + goto cache_size_fail; +#endif if ((md->area_type & MMC_BLK_DATA_AREA_BOOT) && card->ext_csd.boot_ro_lockable) { @@ -2470,6 +2762,14 @@ static int mmc_add_disk(struct mmc_blk_data *md) return ret; power_ro_lock_fail: +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + device_remove_file(disk_to_dev(md->disk), &dev_attr_cache_size); +cache_size_fail: + 
device_remove_file(disk_to_dev(md->disk), &dev_attr_max_read_speed); +max_read_speed_fail: + device_remove_file(disk_to_dev(md->disk), &dev_attr_max_write_speed); +max_write_speed_fail: +#endif device_remove_file(disk_to_dev(md->disk), &md->force_ro); force_ro_fail: del_gendisk(md->disk); diff --git a/drivers/mmc/card/queue.h b/drivers/mmc/card/queue.h index 36cddab57d77..d890d8832e21 100644 --- a/drivers/mmc/card/queue.h +++ b/drivers/mmc/card/queue.h @@ -58,6 +58,14 @@ struct mmc_queue { struct mmc_queue_req mqrq[2]; struct mmc_queue_req *mqrq_cur; struct mmc_queue_req *mqrq_prev; +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + atomic_t max_write_speed; + atomic_t max_read_speed; + atomic_t cache_size; + /* i/o tracking */ + atomic_long_t cache_used; + unsigned long cache_jiffies; +#endif }; extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *, From a354cef9805bef94bb6fb6bd7887447846fee737 Mon Sep 17 00:00:00 2001 From: Rom Lemarchand Date: Thu, 7 Apr 2016 07:19:34 -0700 Subject: [PATCH 059/607] android: base-cfg: enable CONFIG_QUOTA Bug: 28032718 Change-Id: I7cb6b641f72085e69b90dca11d2ea68adcd02390 (cherry picked from commit e1b53a388e9cfcf870520a6899a37456cf1ae2c6) --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index fa53af0c37ad..3f82e9339dad 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -138,6 +138,7 @@ CONFIG_PPP_BSDCOMP=y CONFIG_PPP_DEFLATE=y CONFIG_PPP_MPPE=y CONFIG_PREEMPT=y +CONFIG_QUOTA=y CONFIG_RESOURCE_COUNTERS=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y From 00526cde6b4a4154e4ea528f8a0cea1017abbc1c Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Tue, 5 Apr 2016 13:06:27 -0700 Subject: [PATCH 060/607] BACKPORT: selinux: restrict kernel module loading Backport notes: Backport uses kernel_module_from_file not kernel_read_file hook. kernel_read_file replaced kernel_module_from_file in the 4.6 kernel. There are no inode_security_() helper functions (also introduced in 4.6) so the inode lookup is done using the file_inode() helper which is standard for kernel version < 4.6. (Cherry picked from commit 61d612ea731e57dc510472fb746b55cdc017f371) Utilize existing kernel_read_file hook on kernel module load. Add module_load permission to the system class. Enforces restrictions on kernel module origin when calling the finit_module syscall. The hook checks that source type has permission module_load for the target type. Example for finit_module: allow foo bar_file:system module_load; Similarly restrictions are enforced on kernel module loading when calling the init_module syscall. The hook checks that source type has permission module_load with itself as the target object because the kernel module is sourced from the calling process. 
Example for init_module: allow foo foo:system module_load; Bug: 27824855 Change-Id: I64bf3bd1ab2dc735321160642dc6bbfa996f8068 Signed-off-by: Jeff Vander Stoep Signed-off-by: Paul Moore --- security/selinux/hooks.c | 33 +++++++++++++++++++++++++++++ security/selinux/include/classmap.h | 2 +- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 7c22a15c7e4b..78d06ff4eeb7 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3660,6 +3660,38 @@ static int selinux_kernel_module_request(char *kmod_name) SYSTEM__MODULE_REQUEST, &ad); } +static int selinux_kernel_module_from_file(struct file *file) +{ + struct common_audit_data ad; + struct inode_security_struct *isec; + struct file_security_struct *fsec; + struct inode *inode; + u32 sid = current_sid(); + int rc; + + /* init_module */ + if (file == NULL) + return avc_has_perm(sid, sid, SECCLASS_SYSTEM, + SYSTEM__MODULE_LOAD, NULL); + + /* finit_module */ + ad.type = LSM_AUDIT_DATA_PATH; + ad.u.path = file->f_path; + + inode = file_inode(file); + isec = inode->i_security; + fsec = file->f_security; + + if (sid != fsec->sid) { + rc = avc_has_perm(sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); + if (rc) + return rc; + } + + return avc_has_perm(sid, isec->sid, SECCLASS_SYSTEM, + SYSTEM__MODULE_LOAD, &ad); +} + static int selinux_task_setpgid(struct task_struct *p, pid_t pgid) { return current_has_perm(p, PROCESS__SETPGID); @@ -5950,6 +5982,7 @@ static struct security_hook_list selinux_hooks[] = { LSM_HOOK_INIT(kernel_act_as, selinux_kernel_act_as), LSM_HOOK_INIT(kernel_create_files_as, selinux_kernel_create_files_as), LSM_HOOK_INIT(kernel_module_request, selinux_kernel_module_request), + LSM_HOOK_INIT(kernel_module_from_file, selinux_kernel_module_from_file), LSM_HOOK_INIT(task_setpgid, selinux_task_setpgid), LSM_HOOK_INIT(task_getpgid, selinux_task_getpgid), LSM_HOOK_INIT(task_getsid, selinux_task_getsid), diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 5a4eef59aeff..b393d29ae857 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -32,7 +32,7 @@ struct security_class_mapping secclass_map[] = { "setsockcreate", NULL } }, { "system", { "ipc_info", "syslog_read", "syslog_mod", - "syslog_console", "module_request", NULL } }, + "syslog_console", "module_request", "module_load", NULL } }, { "capability", { "chown", "dac_override", "dac_read_search", "fowner", "fsetid", "kill", "setgid", "setuid", "setpcap", From 9f6bbb427fc67e5caceec70741def34234078f97 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 12 Apr 2016 01:19:24 +0530 Subject: [PATCH 061/607] ANDROID: base-cfg: enable CONFIG_IP_NF_NAT IP_NF_TARGET_{MASQUERADE,NETMAP,REDIRECT} configs, already enabled in android-base.cfg for tethering, are of no use if CONFIG_IP_NF_NAT is not enabled. Don't rely on platform config for that and enable CONFIG_IP_NF_NAT in android-base.cfg as well. 
Change-Id: Ic72bcebbd925b142b09539466bf963188c83108a Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 3f82e9339dad..a8b529cae6eb 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -57,6 +57,7 @@ CONFIG_IP_NF_MANGLE=y CONFIG_IP_NF_MATCH_AH=y CONFIG_IP_NF_MATCH_ECN=y CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_NAT=y CONFIG_IP_NF_RAW=y CONFIG_IP_NF_SECURITY=y CONFIG_IP_NF_TARGET_MASQUERADE=y From ea20f758a0f688b713920e1b6c455ab3f0983c9e Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Tue, 19 Apr 2016 12:37:31 -0700 Subject: [PATCH 062/607] Revert "drivers: switch: remove S_IWUSR from dev_attr" This reverts commit dc66dee02dcd6ea774e3ed4ae32e88b0f3b4bee7. --- drivers/switch/switch_class.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/switch/switch_class.c b/drivers/switch/switch_class.c index e373b625806e..e05fc2591147 100644 --- a/drivers/switch/switch_class.c +++ b/drivers/switch/switch_class.c @@ -54,8 +54,8 @@ static ssize_t name_show(struct device *dev, struct device_attribute *attr, return sprintf(buf, "%s\n", sdev->name); } -static DEVICE_ATTR(state, S_IRUGO, state_show, NULL); -static DEVICE_ATTR(name, S_IRUGO, name_show, NULL); +static DEVICE_ATTR(state, S_IRUGO | S_IWUSR, state_show, NULL); +static DEVICE_ATTR(name, S_IRUGO | S_IWUSR, name_show, NULL); void switch_set_state(struct switch_dev *sdev, int state) { From 00c13787f8275d7ff3e232a0e06a3c26478811e0 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Tue, 19 Apr 2016 12:37:47 -0700 Subject: [PATCH 063/607] Revert "switch: switch class and GPIO drivers." Drivers should use extcon moving forward. Documentation/extcon/porting-android-switch-class describes how to port existing switch class drivers to extcon. This reverts commit e4b8e66e0ae2e78e913d7b86f2507fdb0aa731b4. Change-Id: I5b622c7ab4c0cb9670f8903f259a99888f503c1a --- drivers/Kconfig | 2 - drivers/Makefile | 1 - drivers/switch/Kconfig | 15 --- drivers/switch/Makefile | 4 - drivers/switch/switch_class.c | 174 ---------------------------------- drivers/switch/switch_gpio.c | 172 --------------------------------- include/linux/switch.h | 53 ----------- 7 files changed, 421 deletions(-) delete mode 100644 drivers/switch/Kconfig delete mode 100644 drivers/switch/Makefile delete mode 100644 drivers/switch/switch_class.c delete mode 100644 drivers/switch/switch_gpio.c delete mode 100644 include/linux/switch.h diff --git a/drivers/Kconfig b/drivers/Kconfig index 3c363e559864..d2ac339de85f 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -104,8 +104,6 @@ source "drivers/memstick/Kconfig" source "drivers/leds/Kconfig" -source "drivers/switch/Kconfig" - source "drivers/accessibility/Kconfig" source "drivers/infiniband/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index 8acfb530993d..795d0ca714bf 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -122,7 +122,6 @@ obj-$(CONFIG_CPU_IDLE) += cpuidle/ obj-y += mmc/ obj-$(CONFIG_MEMSTICK) += memstick/ obj-y += leds/ -obj-$(CONFIG_SWITCH) += switch/ obj-$(CONFIG_INFINIBAND) += infiniband/ obj-$(CONFIG_SGI_SN) += sn/ obj-y += firmware/ diff --git a/drivers/switch/Kconfig b/drivers/switch/Kconfig deleted file mode 100644 index 19404b6f7778..000000000000 --- a/drivers/switch/Kconfig +++ /dev/null @@ -1,15 +0,0 @@ -menuconfig SWITCH - tristate "Switch class support" - help - Say Y here to enable switch class support. 
This allows - monitoring switches by userspace via sysfs and uevent. - -if SWITCH - -config SWITCH_GPIO - tristate "GPIO Swith support" - depends on GPIOLIB - help - Say Y here to enable GPIO based switch support. - -endif # SWITCH diff --git a/drivers/switch/Makefile b/drivers/switch/Makefile deleted file mode 100644 index f7606ed4a719..000000000000 --- a/drivers/switch/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# Switch Class Driver -obj-$(CONFIG_SWITCH) += switch_class.o -obj-$(CONFIG_SWITCH_GPIO) += switch_gpio.o - diff --git a/drivers/switch/switch_class.c b/drivers/switch/switch_class.c deleted file mode 100644 index e05fc2591147..000000000000 --- a/drivers/switch/switch_class.c +++ /dev/null @@ -1,174 +0,0 @@ -/* - * drivers/switch/switch_class.c - * - * Copyright (C) 2008 Google, Inc. - * Author: Mike Lockwood - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * -*/ - -#include -#include -#include -#include -#include -#include -#include - -struct class *switch_class; -static atomic_t device_count; - -static ssize_t state_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct switch_dev *sdev = (struct switch_dev *) - dev_get_drvdata(dev); - - if (sdev->print_state) { - int ret = sdev->print_state(sdev, buf); - if (ret >= 0) - return ret; - } - return sprintf(buf, "%d\n", sdev->state); -} - -static ssize_t name_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct switch_dev *sdev = (struct switch_dev *) - dev_get_drvdata(dev); - - if (sdev->print_name) { - int ret = sdev->print_name(sdev, buf); - if (ret >= 0) - return ret; - } - return sprintf(buf, "%s\n", sdev->name); -} - -static DEVICE_ATTR(state, S_IRUGO | S_IWUSR, state_show, NULL); -static DEVICE_ATTR(name, S_IRUGO | S_IWUSR, name_show, NULL); - -void switch_set_state(struct switch_dev *sdev, int state) -{ - char name_buf[120]; - char state_buf[120]; - char *prop_buf; - char *envp[3]; - int env_offset = 0; - int length; - - if (sdev->state != state) { - sdev->state = state; - - prop_buf = (char *)get_zeroed_page(GFP_KERNEL); - if (prop_buf) { - length = name_show(sdev->dev, NULL, prop_buf); - if (length > 0) { - if (prop_buf[length - 1] == '\n') - prop_buf[length - 1] = 0; - snprintf(name_buf, sizeof(name_buf), - "SWITCH_NAME=%s", prop_buf); - envp[env_offset++] = name_buf; - } - length = state_show(sdev->dev, NULL, prop_buf); - if (length > 0) { - if (prop_buf[length - 1] == '\n') - prop_buf[length - 1] = 0; - snprintf(state_buf, sizeof(state_buf), - "SWITCH_STATE=%s", prop_buf); - envp[env_offset++] = state_buf; - } - envp[env_offset] = NULL; - kobject_uevent_env(&sdev->dev->kobj, KOBJ_CHANGE, envp); - free_page((unsigned long)prop_buf); - } else { - printk(KERN_ERR "out of memory in switch_set_state\n"); - kobject_uevent(&sdev->dev->kobj, KOBJ_CHANGE); - } - } -} -EXPORT_SYMBOL_GPL(switch_set_state); - -static int create_switch_class(void) -{ - if (!switch_class) { - switch_class = class_create(THIS_MODULE, "switch"); - if (IS_ERR(switch_class)) - return PTR_ERR(switch_class); - atomic_set(&device_count, 0); - } - - return 0; -} - -int switch_dev_register(struct 
switch_dev *sdev) -{ - int ret; - - if (!switch_class) { - ret = create_switch_class(); - if (ret < 0) - return ret; - } - - sdev->index = atomic_inc_return(&device_count); - sdev->dev = device_create(switch_class, NULL, - MKDEV(0, sdev->index), NULL, sdev->name); - if (IS_ERR(sdev->dev)) - return PTR_ERR(sdev->dev); - - ret = device_create_file(sdev->dev, &dev_attr_state); - if (ret < 0) - goto err_create_file_1; - ret = device_create_file(sdev->dev, &dev_attr_name); - if (ret < 0) - goto err_create_file_2; - - dev_set_drvdata(sdev->dev, sdev); - sdev->state = 0; - return 0; - -err_create_file_2: - device_remove_file(sdev->dev, &dev_attr_state); -err_create_file_1: - device_destroy(switch_class, MKDEV(0, sdev->index)); - printk(KERN_ERR "switch: Failed to register driver %s\n", sdev->name); - - return ret; -} -EXPORT_SYMBOL_GPL(switch_dev_register); - -void switch_dev_unregister(struct switch_dev *sdev) -{ - device_remove_file(sdev->dev, &dev_attr_name); - device_remove_file(sdev->dev, &dev_attr_state); - device_destroy(switch_class, MKDEV(0, sdev->index)); - dev_set_drvdata(sdev->dev, NULL); -} -EXPORT_SYMBOL_GPL(switch_dev_unregister); - -static int __init switch_class_init(void) -{ - return create_switch_class(); -} - -static void __exit switch_class_exit(void) -{ - class_destroy(switch_class); -} - -module_init(switch_class_init); -module_exit(switch_class_exit); - -MODULE_AUTHOR("Mike Lockwood "); -MODULE_DESCRIPTION("Switch class driver"); -MODULE_LICENSE("GPL"); diff --git a/drivers/switch/switch_gpio.c b/drivers/switch/switch_gpio.c deleted file mode 100644 index 621d62d20c99..000000000000 --- a/drivers/switch/switch_gpio.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - * drivers/switch/switch_gpio.c - * - * Copyright (C) 2008 Google, Inc. - * Author: Mike Lockwood - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct gpio_switch_data { - struct switch_dev sdev; - unsigned gpio; - const char *name_on; - const char *name_off; - const char *state_on; - const char *state_off; - int irq; - struct work_struct work; -}; - -static void gpio_switch_work(struct work_struct *work) -{ - int state; - struct gpio_switch_data *data = - container_of(work, struct gpio_switch_data, work); - - state = gpio_get_value(data->gpio); - switch_set_state(&data->sdev, state); -} - -static irqreturn_t gpio_irq_handler(int irq, void *dev_id) -{ - struct gpio_switch_data *switch_data = - (struct gpio_switch_data *)dev_id; - - schedule_work(&switch_data->work); - return IRQ_HANDLED; -} - -static ssize_t switch_gpio_print_state(struct switch_dev *sdev, char *buf) -{ - struct gpio_switch_data *switch_data = - container_of(sdev, struct gpio_switch_data, sdev); - const char *state; - if (switch_get_state(sdev)) - state = switch_data->state_on; - else - state = switch_data->state_off; - - if (state) - return sprintf(buf, "%s\n", state); - return -1; -} - -static int gpio_switch_probe(struct platform_device *pdev) -{ - struct gpio_switch_platform_data *pdata = pdev->dev.platform_data; - struct gpio_switch_data *switch_data; - int ret = 0; - - if (!pdata) - return -EBUSY; - - switch_data = kzalloc(sizeof(struct gpio_switch_data), GFP_KERNEL); - if (!switch_data) - return -ENOMEM; - - switch_data->sdev.name = pdata->name; - switch_data->gpio = pdata->gpio; - switch_data->name_on = pdata->name_on; - switch_data->name_off = pdata->name_off; - switch_data->state_on = pdata->state_on; - switch_data->state_off = pdata->state_off; - switch_data->sdev.print_state = switch_gpio_print_state; - - ret = switch_dev_register(&switch_data->sdev); - if (ret < 0) - goto err_switch_dev_register; - - ret = gpio_request(switch_data->gpio, pdev->name); - if (ret < 0) - goto err_request_gpio; - - ret = gpio_direction_input(switch_data->gpio); - if (ret < 0) - goto err_set_gpio_input; - - INIT_WORK(&switch_data->work, gpio_switch_work); - - switch_data->irq = gpio_to_irq(switch_data->gpio); - if (switch_data->irq < 0) { - ret = switch_data->irq; - goto err_detect_irq_num_failed; - } - - ret = request_irq(switch_data->irq, gpio_irq_handler, - IRQF_TRIGGER_LOW, pdev->name, switch_data); - if (ret < 0) - goto err_request_irq; - - /* Perform initial detection */ - gpio_switch_work(&switch_data->work); - - return 0; - -err_request_irq: -err_detect_irq_num_failed: -err_set_gpio_input: - gpio_free(switch_data->gpio); -err_request_gpio: - switch_dev_unregister(&switch_data->sdev); -err_switch_dev_register: - kfree(switch_data); - - return ret; -} - -static int gpio_switch_remove(struct platform_device *pdev) -{ - struct gpio_switch_data *switch_data = platform_get_drvdata(pdev); - - cancel_work_sync(&switch_data->work); - gpio_free(switch_data->gpio); - switch_dev_unregister(&switch_data->sdev); - kfree(switch_data); - - return 0; -} - -static struct platform_driver gpio_switch_driver = { - .probe = gpio_switch_probe, - .remove = gpio_switch_remove, - .driver = { - .name = "switch-gpio", - .owner = THIS_MODULE, - }, -}; - -static int __init gpio_switch_init(void) -{ - return platform_driver_register(&gpio_switch_driver); -} - -static void __exit gpio_switch_exit(void) -{ - platform_driver_unregister(&gpio_switch_driver); -} - -module_init(gpio_switch_init); -module_exit(gpio_switch_exit); - -MODULE_AUTHOR("Mike Lockwood "); -MODULE_DESCRIPTION("GPIO Switch 
driver"); -MODULE_LICENSE("GPL"); diff --git a/include/linux/switch.h b/include/linux/switch.h deleted file mode 100644 index 3e4c748e343a..000000000000 --- a/include/linux/switch.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Switch class driver - * - * Copyright (C) 2008 Google, Inc. - * Author: Mike Lockwood - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * -*/ - -#ifndef __LINUX_SWITCH_H__ -#define __LINUX_SWITCH_H__ - -struct switch_dev { - const char *name; - struct device *dev; - int index; - int state; - - ssize_t (*print_name)(struct switch_dev *sdev, char *buf); - ssize_t (*print_state)(struct switch_dev *sdev, char *buf); -}; - -struct gpio_switch_platform_data { - const char *name; - unsigned gpio; - - /* if NULL, switch_dev.name will be printed */ - const char *name_on; - const char *name_off; - /* if NULL, "0" or "1" will be printed */ - const char *state_on; - const char *state_off; -}; - -extern int switch_dev_register(struct switch_dev *sdev); -extern void switch_dev_unregister(struct switch_dev *sdev); - -static inline int switch_get_state(struct switch_dev *sdev) -{ - return sdev->state; -} - -extern void switch_set_state(struct switch_dev *sdev, int state); - -#endif /* __LINUX_SWITCH_H__ */ From a9de5129787db0d135eab61359b820c783fb5676 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Tue, 19 Apr 2016 12:44:42 -0700 Subject: [PATCH 064/607] android: base-cfg: remove CONFIG_SWITCH Change-Id: I3fd1aa7a54fe3a8d3ad5537cbc61386e52f41ea0 Signed-off-by: Dmitry Shmidt --- android/configs/android-base.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index a8b529cae6eb..304f1d4fd7c4 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -148,7 +148,6 @@ CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y CONFIG_SETEND_EMULATION=y CONFIG_STAGING=y -CONFIG_SWITCH=y CONFIG_SWP_EMULATION=y CONFIG_SYNC=y CONFIG_TUN=y From 24c4a0f75c51c1f42cc0125a024b7c262e664ff7 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Thu, 21 Apr 2016 15:43:29 -0700 Subject: [PATCH 065/607] Revert "tcp: Fix IPV6 module build errors" This reverts commit 3823c8136f2170b3ac5e6a5f8b857746a786e845. 
--- net/ipv4/tcp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b799f20387c6..c0e0fd2dd378 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3260,7 +3260,7 @@ static int tcp_is_local(struct net *net, __be32 addr) { return rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK); } -#if defined(CONFIG_IPV6) +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) static int tcp_is_local6(struct net *net, struct in6_addr *addr) { struct rt6_info *rt6 = rt6_lookup(net, addr, addr, 0, 0); return rt6 && rt6->dst.dev && (rt6->dst.dev->flags & IFF_LOOPBACK); @@ -3328,7 +3328,7 @@ restart: continue; } -#if defined(CONFIG_IPV6) +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (family == AF_INET6) { struct in6_addr *s6; if (!inet->pinet6) @@ -3365,4 +3365,3 @@ restart: return 0; } -EXPORT_SYMBOL_GPL(tcp_nuke_addr); From 215856823c282d0339acdad8c7f154abc30d1027 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Thu, 21 Apr 2016 15:43:58 -0700 Subject: [PATCH 066/607] Revert "Don't kill IPv4 sockets when killing IPv6 sockets was requested." This reverts commit 8bf4413b4f54e24120b90ecbfee426beeddc3ff0. --- net/ipv4/tcp.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c0e0fd2dd378..9819762c4a92 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3331,8 +3331,6 @@ restart: #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (family == AF_INET6) { struct in6_addr *s6; - if (!inet->pinet6) - continue; s6 = &sk->sk_v6_rcv_saddr; if (ipv6_addr_type(s6) == IPV6_ADDR_MAPPED) From 9826b2ec8383b14a82f9d1d545a8573986c16abe Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Thu, 21 Apr 2016 15:44:11 -0700 Subject: [PATCH 067/607] Revert "net: fix crash in tcp_nuke_addr()" This reverts commit 08f7c4280cd5efe9e274240c42177f459431bac2. --- net/ipv4/tcp.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9819762c4a92..9b19742cbf42 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3301,19 +3301,8 @@ restart: sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[bucket].chain) { struct inet_sock *inet = inet_sk(sk); - if (sk->sk_state == TCP_TIME_WAIT) { - /* - * Sockets that are in TIME_WAIT state are - * instances of lightweight inet_timewait_sock, - * we should simply skip them (or we'll try to - * access non-existing fields and crash). - */ - continue; - } - if (sysctl_ip_dynaddr && sk->sk_state == TCP_SYN_SENT) continue; - if (sock_flag(sk, SOCK_DEAD)) continue; From 20de1c6bc4d1b4636314fd1a372533b45af809b5 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Thu, 21 Apr 2016 15:44:25 -0700 Subject: [PATCH 068/607] Revert "net: fix iterating over hashtable in tcp_nuke_addr()" This reverts commit 4747299b2c8e8778927b3df0501023d76fe4f2d5. 
--- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9b19742cbf42..854e606f3bce 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3291,7 +3291,7 @@ int tcp_nuke_addr(struct net *net, struct sockaddr *addr) return -EAFNOSUPPORT; } - for (bucket = 0; bucket <= tcp_hashinfo.ehash_mask; bucket++) { + for (bucket = 0; bucket < tcp_hashinfo.ehash_mask; bucket++) { struct hlist_nulls_node *node; struct sock *sk; spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, bucket); From 8591a83caea440173feca9008d2c37898bd7dcba Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Thu, 21 Apr 2016 15:47:01 -0700 Subject: [PATCH 069/607] Revert "net: socket ioctl to reset connections matching local address" Use SOCK_DESTROY from now instead of SIOCKILLADDR This reverts commit 38f0ec724f5306c81130ca9343c856aa37a76d54. Change-Id: I2dcd833b66c88a48de8978dce9d72ab78f9af549 --- include/net/tcp.h | 2 - include/uapi/linux/sockios.h | 1 - net/ipv4/af_inet.c | 1 - net/ipv4/devinet.c | 8 +-- net/ipv4/tcp.c | 105 ----------------------------------- net/ipv6/af_inet6.c | 17 ------ 6 files changed, 1 insertion(+), 133 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 984e59d99deb..bcce99a951f8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1681,8 +1681,6 @@ static inline bool tcp_stream_memory_free(const struct sock *sk) return notsent_bytes < tcp_notsent_lowat(tp); } -extern int tcp_nuke_addr(struct net *net, struct sockaddr *addr); - #ifdef CONFIG_PROC_FS int tcp4_proc_init(void); void tcp4_proc_exit(void); diff --git a/include/uapi/linux/sockios.h b/include/uapi/linux/sockios.h index 623e9aab645e..e888b1aed69f 100644 --- a/include/uapi/linux/sockios.h +++ b/include/uapi/linux/sockios.h @@ -65,7 +65,6 @@ #define SIOCDIFADDR 0x8936 /* delete PA address */ #define SIOCSIFHWBROADCAST 0x8937 /* set hardware broadcast addr */ #define SIOCGIFCOUNT 0x8938 /* get number of devices */ -#define SIOCKILLADDR 0x8939 /* kill sockets with this local addr */ #define SIOCGIFBR 0x8940 /* Bridging support */ #define SIOCSIFBR 0x8941 /* Set bridging options */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 671eb0092915..eb12bd0ff9d3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -886,7 +886,6 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFPFLAGS: case SIOCGIFPFLAGS: case SIOCSIFFLAGS: - case SIOCKILLADDR: err = devinet_ioctl(net, cmd, (void __user *)arg); break; default: diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 3792624af316..cebd9d31e65a 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -59,7 +59,6 @@ #include #include -#include #include #include #include @@ -965,7 +964,6 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCSIFBRDADDR: /* Set the broadcast address */ case SIOCSIFDSTADDR: /* Set the destination address */ case SIOCSIFNETMASK: /* Set the netmask for the interface */ - case SIOCKILLADDR: /* Nuke all sockets on this address */ ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; @@ -1017,8 +1015,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) } ret = -EADDRNOTAVAIL; - if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS - && cmd != SIOCKILLADDR) + if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) goto done; switch (cmd) { @@ -1145,9 +1142,6 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) inet_insert_ifa(ifa); } break; - case 
SIOCKILLADDR: /* Nuke all connections on this address */ - ret = tcp_nuke_addr(net, (struct sockaddr *) sin); - break; } done: rtnl_unlock(); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 854e606f3bce..e0921748b2c0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -276,9 +276,6 @@ #include #include #include -#include -#include -#include #include #include @@ -3250,105 +3247,3 @@ void __init tcp_init(void) BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); tcp_tasklet_init(); } - -static int tcp_is_local(struct net *net, __be32 addr) { - struct rtable *rt; - struct flowi4 fl4 = { .daddr = addr }; - rt = ip_route_output_key(net, &fl4); - if (IS_ERR_OR_NULL(rt)) - return 0; - return rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK); -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static int tcp_is_local6(struct net *net, struct in6_addr *addr) { - struct rt6_info *rt6 = rt6_lookup(net, addr, addr, 0, 0); - return rt6 && rt6->dst.dev && (rt6->dst.dev->flags & IFF_LOOPBACK); -} -#endif - -/* - * tcp_nuke_addr - destroy all sockets on the given local address - * if local address is the unspecified address (0.0.0.0 or ::), destroy all - * sockets with local addresses that are not configured. - */ -int tcp_nuke_addr(struct net *net, struct sockaddr *addr) -{ - int family = addr->sa_family; - unsigned int bucket; - - struct in_addr *in; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - struct in6_addr *in6 = NULL; -#endif - if (family == AF_INET) { - in = &((struct sockaddr_in *)addr)->sin_addr; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - } else if (family == AF_INET6) { - in6 = &((struct sockaddr_in6 *)addr)->sin6_addr; -#endif - } else { - return -EAFNOSUPPORT; - } - - for (bucket = 0; bucket < tcp_hashinfo.ehash_mask; bucket++) { - struct hlist_nulls_node *node; - struct sock *sk; - spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, bucket); - -restart: - spin_lock_bh(lock); - sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[bucket].chain) { - struct inet_sock *inet = inet_sk(sk); - - if (sysctl_ip_dynaddr && sk->sk_state == TCP_SYN_SENT) - continue; - if (sock_flag(sk, SOCK_DEAD)) - continue; - - if (family == AF_INET) { - __be32 s4 = inet->inet_rcv_saddr; - if (s4 == LOOPBACK4_IPV6) - continue; - - if (in->s_addr != s4 && - !(in->s_addr == INADDR_ANY && - !tcp_is_local(net, s4))) - continue; - } - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - if (family == AF_INET6) { - struct in6_addr *s6; - - s6 = &sk->sk_v6_rcv_saddr; - if (ipv6_addr_type(s6) == IPV6_ADDR_MAPPED) - continue; - - if (!ipv6_addr_equal(in6, s6) && - !(ipv6_addr_equal(in6, &in6addr_any) && - !tcp_is_local6(net, s6))) - continue; - } -#endif - - sock_hold(sk); - spin_unlock_bh(lock); - - local_bh_disable(); - bh_lock_sock(sk); - sk->sk_err = ETIMEDOUT; - sk->sk_error_report(sk); - - tcp_done(sk); - bh_unlock_sock(sk); - local_bh_enable(); - sock_put(sk); - - goto restart; - } - spin_unlock_bh(lock); - } - - return 0; -} diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 99fccad391e0..d9b25bd17bf1 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -495,21 +495,6 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, } EXPORT_SYMBOL(inet6_getname); -int inet6_killaddr_ioctl(struct net *net, void __user *arg) { - struct in6_ifreq ireq; - struct sockaddr_in6 sin6; - - if (!capable(CAP_NET_ADMIN)) - return -EACCES; - - if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) - return -EFAULT; - - sin6.sin6_family = AF_INET6; - 
sin6.sin6_addr = ireq.ifr6_addr; - return tcp_nuke_addr(net, (struct sockaddr *) &sin6); -} - int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; @@ -533,8 +518,6 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return addrconf_del_ifaddr(net, (void __user *) arg); case SIOCSIFDSTADDR: return addrconf_set_dstaddr(net, (void __user *) arg); - case SIOCKILLADDR: - return inet6_killaddr_ioctl(net, (void __user *) arg); default: if (!sk->sk_prot->ioctl) return -ENOIOCTLCMD; From a7eabf90aea68842b735193f0df703f659b45715 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 16:58:20 +0530 Subject: [PATCH 070/607] Revert "misc: uid_stat: Include linux/atomic.h instead of asm/atomic.h" This reverts commit 8d3a6c1538fb021448c4f6381f6191061f947ba1. This series of patches revert AOSP UID_STAT and NET_ACTIVITY_STATS drivers. I could not find any meaningful usage of these interfaces in AOSP master. UID_STAT driver expose "/proc/uid_stat/*" interfaces but it is only used in AOSP master as in what appears be an out of date bandwidth test in frameworks/base and in somewhat recent battery utils test in external/chromium-trace project. NET_ACTIVITY_STATS driver expose "/proc/net/stat/activity" interface but I can not track its usage anywhere in AOSP at all. Signed-off-by: Amit Pundir --- drivers/misc/uid_stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/uid_stat.c b/drivers/misc/uid_stat.c index 185c69c9738a..8b8c9a22360b 100644 --- a/drivers/misc/uid_stat.c +++ b/drivers/misc/uid_stat.c @@ -13,7 +13,7 @@ * */ -#include +#include #include #include From cdb6973ae15c5f62947ea7afc29fc1175ecb3172 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 17:00:08 +0530 Subject: [PATCH 071/607] Revert "misc seq_printf fixes for 4.4" This reverts commit 5c7566a29bff14166d952f2ea525d5231546f821. This patch revert some changes in net/netfilter/xt_qtaguid.c as well. I'll submit another patch to restore those changes. Signed-off-by: Amit Pundir --- drivers/misc/uid_stat.c | 3 +-- net/activity_stats.c | 7 ++++--- net/netfilter/xt_qtaguid.c | 5 +++-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/misc/uid_stat.c b/drivers/misc/uid_stat.c index 8b8c9a22360b..4766c1f83b94 100644 --- a/drivers/misc/uid_stat.c +++ b/drivers/misc/uid_stat.c @@ -55,8 +55,7 @@ static int uid_stat_atomic_int_show(struct seq_file *m, void *v) atomic_t *counter = m->private; bytes = (unsigned int) (atomic_read(counter) + INT_MIN); - seq_printf(m, "%u\n", bytes); - return seq_has_overflowed(m) ? 
-ENOSPC : 0; + return seq_printf(m, "%u\n", bytes); } static int uid_stat_read_atomic_int_open(struct inode *inode, struct file *file) diff --git a/net/activity_stats.c b/net/activity_stats.c index 3bf92d80b8b9..4609ce2043eb 100644 --- a/net/activity_stats.c +++ b/net/activity_stats.c @@ -63,13 +63,14 @@ void activity_stats_update(void) static int activity_stats_show(struct seq_file *m, void *v) { int i; + int ret; seq_printf(m, "Min Bucket(sec) Count\n"); for (i = 0; i < BUCKET_MAX; i++) { - seq_printf(m, "%15d %lu\n", 1 << i, activity_stats[i]); - if (seq_has_overflowed(m)) - return -ENOSPC; + ret = seq_printf(m, "%15d %lu\n", 1 << i, activity_stats[i]); + if (ret) + return ret; } return 0; diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index e1442bfb668d..04bb081adde8 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -2543,6 +2543,7 @@ static void pp_stats_header(struct seq_file *m) static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, int cnt_set) { + int ret; struct data_counters *cnts; tag_t tag = ts_entry->tn.tag; uid_t stat_uid = get_uid_from_tag(tag); @@ -2561,7 +2562,7 @@ static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, } ppi->item_index++; cnts = &ts_entry->counters; - seq_printf(m, "%d %s 0x%llx %u %u " + ret = seq_printf(m, "%d %s 0x%llx %u %u " "%llu %llu " "%llu %llu " "%llu %llu " @@ -2591,7 +2592,7 @@ static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, cnts->bpc[cnt_set][IFS_TX][IFS_UDP].packets, cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].bytes, cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].packets); - return seq_has_overflowed(m) ? -ENOSPC : 1; + return ret ?: 1; } static bool pp_sets(struct seq_file *m, struct tag_stat *ts_entry) From 1ada37dc073ab6e49fd71d2fdc2be2e4f87c56cd Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 17:00:31 +0530 Subject: [PATCH 072/607] Revert "misc: uidstat: Remove use of obsolete create_proc_read_entry api" This reverts commit fccab646d33381af63e4f4c0d4f309a1d2b4b0c3. Signed-off-by: Amit Pundir --- drivers/misc/uid_stat.c | 50 ++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/drivers/misc/uid_stat.c b/drivers/misc/uid_stat.c index 4766c1f83b94..509822c81e97 100644 --- a/drivers/misc/uid_stat.c +++ b/drivers/misc/uid_stat.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -49,26 +48,41 @@ static struct uid_stat *find_uid_stat(uid_t uid) { return NULL; } -static int uid_stat_atomic_int_show(struct seq_file *m, void *v) +static int tcp_snd_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) { + int len; unsigned int bytes; - atomic_t *counter = m->private; + char *p = page; + struct uid_stat *uid_entry = (struct uid_stat *) data; + if (!data) + return 0; - bytes = (unsigned int) (atomic_read(counter) + INT_MIN); - return seq_printf(m, "%u\n", bytes); + bytes = (unsigned int) (atomic_read(&uid_entry->tcp_snd) + INT_MIN); + p += sprintf(p, "%u\n", bytes); + len = (p - page) - off; + *eof = (len <= count) ? 
1 : 0; + *start = page + off; + return len; } -static int uid_stat_read_atomic_int_open(struct inode *inode, struct file *file) +static int tcp_rcv_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) { - return single_open(file, uid_stat_atomic_int_show, PDE_DATA(inode)); -} + int len; + unsigned int bytes; + char *p = page; + struct uid_stat *uid_entry = (struct uid_stat *) data; + if (!data) + return 0; -static const struct file_operations uid_stat_read_atomic_int_fops = { - .open = uid_stat_read_atomic_int_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; + bytes = (unsigned int) (atomic_read(&uid_entry->tcp_rcv) + INT_MIN); + p += sprintf(p, "%u\n", bytes); + len = (p - page) - off; + *eof = (len <= count) ? 1 : 0; + *start = page + off; + return len; +} /* Create a new entry for tracking the specified uid. */ static struct uid_stat *create_stat(uid_t uid) { @@ -95,11 +109,11 @@ static void create_stat_proc(struct uid_stat *new_uid) entry = proc_mkdir(uid_s, parent); /* Keep reference to uid_stat so we know what uid to read stats from. */ - proc_create_data("tcp_snd", S_IRUGO, entry, - &uid_stat_read_atomic_int_fops, &new_uid->tcp_snd); + create_proc_read_entry("tcp_snd", S_IRUGO, entry , tcp_snd_read_proc, + (void *) new_uid); - proc_create_data("tcp_rcv", S_IRUGO, entry, - &uid_stat_read_atomic_int_fops, &new_uid->tcp_rcv); + create_proc_read_entry("tcp_rcv", S_IRUGO, entry, tcp_rcv_read_proc, + (void *) new_uid); } static struct uid_stat *find_or_create_uid_stat(uid_t uid) From cded5596a31db1b1e7fa914b4b47fb488dbd819c Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 17:00:43 +0530 Subject: [PATCH 073/607] Revert "misc: uidstat: avoid create_stat() race and blockage." This reverts commit f7a812174033fe620509e6e8ca7022abd924b1c4. Signed-off-by: Amit Pundir --- drivers/misc/uid_stat.c | 52 +++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/drivers/misc/uid_stat.c b/drivers/misc/uid_stat.c index 509822c81e97..2141124a6c12 100644 --- a/drivers/misc/uid_stat.c +++ b/drivers/misc/uid_stat.c @@ -38,13 +38,17 @@ struct uid_stat { }; static struct uid_stat *find_uid_stat(uid_t uid) { + unsigned long flags; struct uid_stat *entry; + spin_lock_irqsave(&uid_lock, flags); list_for_each_entry(entry, &uid_list, link) { if (entry->uid == uid) { + spin_unlock_irqrestore(&uid_lock, flags); return entry; } } + spin_unlock_irqrestore(&uid_lock, flags); return NULL; } @@ -86,10 +90,13 @@ static int tcp_rcv_read_proc(char *page, char **start, off_t off, /* Create a new entry for tracking the specified uid. */ static struct uid_stat *create_stat(uid_t uid) { + unsigned long flags; + char uid_s[32]; struct uid_stat *new_uid; + struct proc_dir_entry *entry; + /* Create the uid stat struct and append it to the list. 
*/ - new_uid = kmalloc(sizeof(struct uid_stat), GFP_ATOMIC); - if (!new_uid) + if ((new_uid = kmalloc(sizeof(struct uid_stat), GFP_KERNEL)) == NULL) return NULL; new_uid->uid = uid; @@ -97,15 +104,11 @@ static struct uid_stat *create_stat(uid_t uid) { atomic_set(&new_uid->tcp_rcv, INT_MIN); atomic_set(&new_uid->tcp_snd, INT_MIN); + spin_lock_irqsave(&uid_lock, flags); list_add_tail(&new_uid->link, &uid_list); - return new_uid; -} + spin_unlock_irqrestore(&uid_lock, flags); -static void create_stat_proc(struct uid_stat *new_uid) -{ - char uid_s[32]; - struct proc_dir_entry *entry; - sprintf(uid_s, "%d", new_uid->uid); + sprintf(uid_s, "%d", uid); entry = proc_mkdir(uid_s, parent); /* Keep reference to uid_stat so we know what uid to read stats from. */ @@ -114,31 +117,17 @@ static void create_stat_proc(struct uid_stat *new_uid) create_proc_read_entry("tcp_rcv", S_IRUGO, entry, tcp_rcv_read_proc, (void *) new_uid); -} -static struct uid_stat *find_or_create_uid_stat(uid_t uid) -{ - struct uid_stat *entry; - unsigned long flags; - spin_lock_irqsave(&uid_lock, flags); - entry = find_uid_stat(uid); - if (entry) { - spin_unlock_irqrestore(&uid_lock, flags); - return entry; - } - entry = create_stat(uid); - spin_unlock_irqrestore(&uid_lock, flags); - if (entry) - create_stat_proc(entry); - return entry; + return new_uid; } int uid_stat_tcp_snd(uid_t uid, int size) { struct uid_stat *entry; activity_stats_update(); - entry = find_or_create_uid_stat(uid); - if (!entry) - return -1; + if ((entry = find_uid_stat(uid)) == NULL && + ((entry = create_stat(uid)) == NULL)) { + return -1; + } atomic_add(size, &entry->tcp_snd); return 0; } @@ -146,9 +135,10 @@ int uid_stat_tcp_snd(uid_t uid, int size) { int uid_stat_tcp_rcv(uid_t uid, int size) { struct uid_stat *entry; activity_stats_update(); - entry = find_or_create_uid_stat(uid); - if (!entry) - return -1; + if ((entry = find_uid_stat(uid)) == NULL && + ((entry = create_stat(uid)) == NULL)) { + return -1; + } atomic_add(size, &entry->tcp_rcv); return 0; } From 5c0d8ae10ab777417eefc39cd5f84d64c6f34622 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 17:00:57 +0530 Subject: [PATCH 074/607] Revert "net: activity_stats: Stop using obsolete create_proc_read_entry api" This reverts commit 7c121720fa14889d59e933aad0a8b9ce948a39ae. 
Signed-off-by: Amit Pundir --- net/activity_stats.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/net/activity_stats.c b/net/activity_stats.c index 4609ce2043eb..8a3e93470069 100644 --- a/net/activity_stats.c +++ b/net/activity_stats.c @@ -15,7 +15,6 @@ */ #include -#include #include #include @@ -60,20 +59,29 @@ void activity_stats_update(void) spin_unlock_irqrestore(&activity_lock, flags); } -static int activity_stats_show(struct seq_file *m, void *v) +static int activity_stats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) { int i; - int ret; + int len; + char *p = page; - seq_printf(m, "Min Bucket(sec) Count\n"); + /* Only print if offset is 0, or we have enough buffer space */ + if (off || count < (30 * BUCKET_MAX + 22)) + return -ENOMEM; + + len = snprintf(p, count, "Min Bucket(sec) Count\n"); + count -= len; + p += len; for (i = 0; i < BUCKET_MAX; i++) { - ret = seq_printf(m, "%15d %lu\n", 1 << i, activity_stats[i]); - if (ret) - return ret; + len = snprintf(p, count, "%15d %lu\n", 1 << i, activity_stats[i]); + count -= len; + p += len; } + *eof = 1; - return 0; + return p - page; } static int activity_stats_notifier(struct notifier_block *nb, @@ -92,26 +100,14 @@ static int activity_stats_notifier(struct notifier_block *nb, return 0; } -static int activity_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, activity_stats_show, PDE_DATA(inode)); -} - -static const struct file_operations activity_stats_fops = { - .open = activity_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static struct notifier_block activity_stats_notifier_block = { .notifier_call = activity_stats_notifier, }; static int __init activity_stats_init(void) { - proc_create("activity", S_IRUGO, - init_net.proc_net_stat, &activity_stats_fops); + create_proc_read_entry("activity", S_IRUGO, + init_net.proc_net_stat, activity_stats_read_proc, NULL); return register_pm_notifier(&activity_stats_notifier_block); } From 42d9422a803afc7c1db3502787fdbe75577bc14d Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 17:01:08 +0530 Subject: [PATCH 075/607] Revert "net: activity_stats: Add statistics for network transmission activity" This reverts commit afedd7beba14385fd797166751fde39e0f52cf72. 
Signed-off-by: Amit Pundir --- drivers/misc/uid_stat.c | 3 - include/net/activity_stats.h | 25 -------- net/Kconfig | 8 --- net/Makefile | 1 - net/activity_stats.c | 115 ----------------------------------- 5 files changed, 152 deletions(-) delete mode 100644 include/net/activity_stats.h delete mode 100644 net/activity_stats.c diff --git a/drivers/misc/uid_stat.c b/drivers/misc/uid_stat.c index 2141124a6c12..e6760b56083c 100644 --- a/drivers/misc/uid_stat.c +++ b/drivers/misc/uid_stat.c @@ -24,7 +24,6 @@ #include #include #include -#include static DEFINE_SPINLOCK(uid_lock); static LIST_HEAD(uid_list); @@ -123,7 +122,6 @@ static struct uid_stat *create_stat(uid_t uid) { int uid_stat_tcp_snd(uid_t uid, int size) { struct uid_stat *entry; - activity_stats_update(); if ((entry = find_uid_stat(uid)) == NULL && ((entry = create_stat(uid)) == NULL)) { return -1; @@ -134,7 +132,6 @@ int uid_stat_tcp_snd(uid_t uid, int size) { int uid_stat_tcp_rcv(uid_t uid, int size) { struct uid_stat *entry; - activity_stats_update(); if ((entry = find_uid_stat(uid)) == NULL && ((entry = create_stat(uid)) == NULL)) { return -1; diff --git a/include/net/activity_stats.h b/include/net/activity_stats.h deleted file mode 100644 index 10e4c1506eeb..000000000000 --- a/include/net/activity_stats.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (C) 2010 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * Author: Mike Chan (mike@android.com) - */ - -#ifndef __activity_stats_h -#define __activity_stats_h - -#ifdef CONFIG_NET_ACTIVITY_STATS -void activity_stats_update(void); -#else -#define activity_stats_update(void) {} -#endif - -#endif /* _NET_ACTIVITY_STATS_H */ diff --git a/net/Kconfig b/net/Kconfig index 043fe1dc0860..ce9585cf343a 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -92,14 +92,6 @@ config ANDROID_PARANOID_NETWORK help none -config NET_ACTIVITY_STATS - bool "Network activity statistics tracking" - default y - help - Network activity statistics are useful for tracking wireless - modem activity on 2G, 3G, 4G wireless networks. Counts number of - transmissions and groups them in specified time buckets. - config NETWORK_SECMARK bool "Security Marking" help diff --git a/net/Makefile b/net/Makefile index eeb9d5db454f..a5d04098dfce 100644 --- a/net/Makefile +++ b/net/Makefile @@ -77,4 +77,3 @@ endif ifneq ($(CONFIG_NET_L3_MASTER_DEV),) obj-y += l3mdev/ endif -obj-$(CONFIG_NET_ACTIVITY_STATS) += activity_stats.o diff --git a/net/activity_stats.c b/net/activity_stats.c deleted file mode 100644 index 8a3e93470069..000000000000 --- a/net/activity_stats.c +++ /dev/null @@ -1,115 +0,0 @@ -/* net/activity_stats.c - * - * Copyright (C) 2010 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * Author: Mike Chan (mike@android.com) - */ - -#include -#include -#include - -/* - * Track transmission rates in buckets (power of 2). - * 1,2,4,8...512 seconds. - * - * Buckets represent the count of network transmissions at least - * N seconds apart, where N is 1 << bucket index. - */ -#define BUCKET_MAX 10 - -/* Track network activity frequency */ -static unsigned long activity_stats[BUCKET_MAX]; -static ktime_t last_transmit; -static ktime_t suspend_time; -static DEFINE_SPINLOCK(activity_lock); - -void activity_stats_update(void) -{ - int i; - unsigned long flags; - ktime_t now; - s64 delta; - - spin_lock_irqsave(&activity_lock, flags); - now = ktime_get(); - delta = ktime_to_ns(ktime_sub(now, last_transmit)); - - for (i = BUCKET_MAX - 1; i >= 0; i--) { - /* - * Check if the time delta between network activity is within the - * minimum bucket range. - */ - if (delta < (1000000000ULL << i)) - continue; - - activity_stats[i]++; - last_transmit = now; - break; - } - spin_unlock_irqrestore(&activity_lock, flags); -} - -static int activity_stats_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int i; - int len; - char *p = page; - - /* Only print if offset is 0, or we have enough buffer space */ - if (off || count < (30 * BUCKET_MAX + 22)) - return -ENOMEM; - - len = snprintf(p, count, "Min Bucket(sec) Count\n"); - count -= len; - p += len; - - for (i = 0; i < BUCKET_MAX; i++) { - len = snprintf(p, count, "%15d %lu\n", 1 << i, activity_stats[i]); - count -= len; - p += len; - } - *eof = 1; - - return p - page; -} - -static int activity_stats_notifier(struct notifier_block *nb, - unsigned long event, void *dummy) -{ - switch (event) { - case PM_SUSPEND_PREPARE: - suspend_time = ktime_get_real(); - break; - - case PM_POST_SUSPEND: - suspend_time = ktime_sub(ktime_get_real(), suspend_time); - last_transmit = ktime_sub(last_transmit, suspend_time); - } - - return 0; -} - -static struct notifier_block activity_stats_notifier_block = { - .notifier_call = activity_stats_notifier, -}; - -static int __init activity_stats_init(void) -{ - create_proc_read_entry("activity", S_IRUGO, - init_net.proc_net_stat, activity_stats_read_proc, NULL); - return register_pm_notifier(&activity_stats_notifier_block); -} - -subsys_initcall(activity_stats_init); - From ece28ad441409646dc6330b06d347465d2730feb Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 17:08:15 +0530 Subject: [PATCH 076/607] Revert "misc: uidstat: Adding uid stat driver to collect network statistics." This reverts commit 6b6d5fbf9ae567aefb58099a30bbb6d25fa8925b. Signed-off-by: Amit Pundir --- drivers/misc/Kconfig | 4 - drivers/misc/Makefile | 1 - drivers/misc/uid_stat.c | 153 --------------------------------------- include/linux/uid_stat.h | 29 -------- net/ipv4/tcp.c | 14 ---- 5 files changed, 201 deletions(-) delete mode 100644 drivers/misc/uid_stat.c delete mode 100644 include/linux/uid_stat.h diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 98c020b560ac..2763d6b3b063 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -412,10 +412,6 @@ config TI_DAC7512 This driver can also be built as a module. If so, the module will be called ti_dac7512. 
-config UID_STAT - bool "UID based statistics tracking exported to /proc/uid_stat" - default n - config VMWARE_BALLOON tristate "VMware Balloon Driver" depends on VMWARE_VMCI && X86 && HYPERVISOR_GUEST diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 24483a6caa6b..e5142b836aee 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -35,7 +35,6 @@ obj-$(CONFIG_ISL29020) += isl29020.o obj-$(CONFIG_SENSORS_TSL2550) += tsl2550.o obj-$(CONFIG_DS1682) += ds1682.o obj-$(CONFIG_TI_DAC7512) += ti_dac7512.o -obj-$(CONFIG_UID_STAT) += uid_stat.o obj-$(CONFIG_C2PORT) += c2port/ obj-$(CONFIG_HMC6352) += hmc6352.o obj-y += eeprom/ diff --git a/drivers/misc/uid_stat.c b/drivers/misc/uid_stat.c deleted file mode 100644 index e6760b56083c..000000000000 --- a/drivers/misc/uid_stat.c +++ /dev/null @@ -1,153 +0,0 @@ -/* drivers/misc/uid_stat.c - * - * Copyright (C) 2008 - 2009 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static DEFINE_SPINLOCK(uid_lock); -static LIST_HEAD(uid_list); -static struct proc_dir_entry *parent; - -struct uid_stat { - struct list_head link; - uid_t uid; - atomic_t tcp_rcv; - atomic_t tcp_snd; -}; - -static struct uid_stat *find_uid_stat(uid_t uid) { - unsigned long flags; - struct uid_stat *entry; - - spin_lock_irqsave(&uid_lock, flags); - list_for_each_entry(entry, &uid_list, link) { - if (entry->uid == uid) { - spin_unlock_irqrestore(&uid_lock, flags); - return entry; - } - } - spin_unlock_irqrestore(&uid_lock, flags); - return NULL; -} - -static int tcp_snd_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len; - unsigned int bytes; - char *p = page; - struct uid_stat *uid_entry = (struct uid_stat *) data; - if (!data) - return 0; - - bytes = (unsigned int) (atomic_read(&uid_entry->tcp_snd) + INT_MIN); - p += sprintf(p, "%u\n", bytes); - len = (p - page) - off; - *eof = (len <= count) ? 1 : 0; - *start = page + off; - return len; -} - -static int tcp_rcv_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len; - unsigned int bytes; - char *p = page; - struct uid_stat *uid_entry = (struct uid_stat *) data; - if (!data) - return 0; - - bytes = (unsigned int) (atomic_read(&uid_entry->tcp_rcv) + INT_MIN); - p += sprintf(p, "%u\n", bytes); - len = (p - page) - off; - *eof = (len <= count) ? 1 : 0; - *start = page + off; - return len; -} - -/* Create a new entry for tracking the specified uid. */ -static struct uid_stat *create_stat(uid_t uid) { - unsigned long flags; - char uid_s[32]; - struct uid_stat *new_uid; - struct proc_dir_entry *entry; - - /* Create the uid stat struct and append it to the list. */ - if ((new_uid = kmalloc(sizeof(struct uid_stat), GFP_KERNEL)) == NULL) - return NULL; - - new_uid->uid = uid; - /* Counters start at INT_MIN, so we can track 4GB of network traffic. 
*/ - atomic_set(&new_uid->tcp_rcv, INT_MIN); - atomic_set(&new_uid->tcp_snd, INT_MIN); - - spin_lock_irqsave(&uid_lock, flags); - list_add_tail(&new_uid->link, &uid_list); - spin_unlock_irqrestore(&uid_lock, flags); - - sprintf(uid_s, "%d", uid); - entry = proc_mkdir(uid_s, parent); - - /* Keep reference to uid_stat so we know what uid to read stats from. */ - create_proc_read_entry("tcp_snd", S_IRUGO, entry , tcp_snd_read_proc, - (void *) new_uid); - - create_proc_read_entry("tcp_rcv", S_IRUGO, entry, tcp_rcv_read_proc, - (void *) new_uid); - - return new_uid; -} - -int uid_stat_tcp_snd(uid_t uid, int size) { - struct uid_stat *entry; - if ((entry = find_uid_stat(uid)) == NULL && - ((entry = create_stat(uid)) == NULL)) { - return -1; - } - atomic_add(size, &entry->tcp_snd); - return 0; -} - -int uid_stat_tcp_rcv(uid_t uid, int size) { - struct uid_stat *entry; - if ((entry = find_uid_stat(uid)) == NULL && - ((entry = create_stat(uid)) == NULL)) { - return -1; - } - atomic_add(size, &entry->tcp_rcv); - return 0; -} - -static int __init uid_stat_init(void) -{ - parent = proc_mkdir("uid_stat", NULL); - if (!parent) { - pr_err("uid_stat: failed to create proc entry\n"); - return -1; - } - return 0; -} - -__initcall(uid_stat_init); diff --git a/include/linux/uid_stat.h b/include/linux/uid_stat.h deleted file mode 100644 index 6bd6c4e52d17..000000000000 --- a/include/linux/uid_stat.h +++ /dev/null @@ -1,29 +0,0 @@ -/* include/linux/uid_stat.h - * - * Copyright (C) 2008-2009 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#ifndef __uid_stat_h -#define __uid_stat_h - -/* Contains definitions for resource tracking per uid. 
*/ - -#ifdef CONFIG_UID_STAT -int uid_stat_tcp_snd(uid_t uid, int size); -int uid_stat_tcp_rcv(uid_t uid, int size); -#else -#define uid_stat_tcp_snd(uid, size) do {} while (0); -#define uid_stat_tcp_rcv(uid, size) do {} while (0); -#endif - -#endif /* _LINUX_UID_STAT_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e0921748b2c0..a1d5c67e251d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -269,7 +269,6 @@ #include #include #include -#include #include #include @@ -1284,10 +1283,6 @@ out: tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); out_nopush: release_sock(sk); - - if (copied + copied_syn) - uid_stat_tcp_snd(from_kuid(&init_user_ns, current_uid()), - copied + copied_syn); return copied + copied_syn; do_fault: @@ -1562,8 +1557,6 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, if (copied > 0) { tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, copied); - uid_stat_tcp_rcv(from_kuid(&init_user_ns, current_uid()), - copied); } return copied; } @@ -1897,10 +1890,6 @@ skip_copy: tcp_cleanup_rbuf(sk, copied); release_sock(sk); - - if (copied > 0) - uid_stat_tcp_rcv(from_kuid(&init_user_ns, current_uid()), - copied); return copied; out: @@ -1909,9 +1898,6 @@ out: recv_urg: err = tcp_recv_urg(sk, msg, len, flags); - if (err > 0) - uid_stat_tcp_rcv(from_kuid(&init_user_ns, current_uid()), - err); goto out; recv_sndq: From 7c79aca516a51401a66b28ca68bd370ed7e9db0b Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 1 Oct 2015 10:44:36 +0530 Subject: [PATCH 077/607] netfilter: xt_qtaguid: seq_printf fixes Update seq_printf() usage in xt_qtaguid to align with changes from mainline commit 6798a8caaf64 "fs/seq_file: convert int seq_vprint/seq_printf/etc... returns to void". Signed-off-by: Amit Pundir --- net/netfilter/xt_qtaguid.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index 04bb081adde8..e1442bfb668d 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -2543,7 +2543,6 @@ static void pp_stats_header(struct seq_file *m) static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, int cnt_set) { - int ret; struct data_counters *cnts; tag_t tag = ts_entry->tn.tag; uid_t stat_uid = get_uid_from_tag(tag); @@ -2562,7 +2561,7 @@ static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, } ppi->item_index++; cnts = &ts_entry->counters; - ret = seq_printf(m, "%d %s 0x%llx %u %u " + seq_printf(m, "%d %s 0x%llx %u %u " "%llu %llu " "%llu %llu " "%llu %llu " @@ -2592,7 +2591,7 @@ static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, cnts->bpc[cnt_set][IFS_TX][IFS_UDP].packets, cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].bytes, cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].packets); - return ret ?: 1; + return seq_has_overflowed(m) ? -ENOSPC : 1; } static bool pp_sets(struct seq_file *m, struct tag_stat *ts_entry) From 5e00c3098bbfbf22086e7ce350cc63ea913a1c15 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 23:55:44 +0530 Subject: [PATCH 078/607] android: recommended.cfg: remove CONFIG_UID_STAT Remove UID Stat driver. 
Change-Id: Ifc9d2c6fe27900f30e6407398f5b24222518bffc Signed-off-by: Amit Pundir --- android/configs/android-recommended.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index e4c8aaade197..35936afdcae4 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -119,7 +119,6 @@ CONFIG_TIMER_STATS=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_UHID=y -CONFIG_UID_STAT=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_HIDDEV=y From 4b7de8375299c9f0e330ce521a956bc35bf1d12c Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 22 Apr 2016 00:00:14 -0700 Subject: [PATCH 079/607] vfs: change d_canonical_path to take two paths bug: 23904372 Change-Id: I4a686d64b6de37decf60019be1718e1d820193e6 Signed-off-by: Daniel Rosenberg --- fs/notify/inotify/inotify_user.c | 2 +- fs/sdcardfs/dentry.c | 6 +++++- include/linux/dcache.h | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index f72f3b25b3f2..e2893f17dde2 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -746,7 +746,7 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, /* support stacked filesystems */ if(path.dentry && path.dentry->d_op) { if (path.dentry->d_op->d_canonical_path) { - path.dentry->d_op->d_canonical_path(path.dentry, &alteredpath); + path.dentry->d_op->d_canonical_path(&path, &alteredpath); canonical_path = &alteredpath; path_put(&path); } diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index ba165ef11e27..971928ab6c21 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -172,11 +172,15 @@ static int sdcardfs_cmp_ci(const struct dentry *parent, return 1; } +static void sdcardfs_canonical_path(const struct path *path, struct path *actual_path) { + sdcardfs_get_real_lower(path->dentry, actual_path); +} + const struct dentry_operations sdcardfs_ci_dops = { .d_revalidate = sdcardfs_d_revalidate, .d_release = sdcardfs_d_release, .d_hash = sdcardfs_hash_ci, .d_compare = sdcardfs_cmp_ci, - .d_canonical_path = sdcardfs_get_real_lower, + .d_canonical_path = sdcardfs_canonical_path, }; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ebad56e31a8c..5819e9b0ee22 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -161,7 +161,7 @@ struct dentry_operations { struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(struct dentry *, bool); struct inode *(*d_select_inode)(struct dentry *, unsigned); - void (*d_canonical_path)(const struct dentry *, struct path *); + void (*d_canonical_path)(const struct path *, struct path *); } ____cacheline_aligned; /* From 00ee836d89c5756b4e937eae58c9d0d1d7ffe560 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 22 Apr 2016 00:00:48 -0700 Subject: [PATCH 080/607] fuse: Add support for d_canonical_path Allows FUSE to report to inotify that it is acting as a layered filesystem. The userspace component returns a string representing the location of the underlying file. If the string cannot be resolved into a path, the top level path is returned instead. 
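As a concrete illustration (the paths here are hypothetical, not part of this change): with a FUSE daemon exporting Android emulated storage, an inotify watch added on /storage/emulated/0/DCIM/img.jpg can be answered with a FUSE_CANONICAL_PATH reply of "/data/media/0/DCIM/img.jpg"; the kernel resolves that string with kern_path() and places the watch on the underlying file. If the reply cannot be resolved, the watch simply stays on the /storage path, as described above.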
bug: 23904372 Change-Id: Iabdca0bbedfbff59e9c820c58636a68ef9683d9f Signed-off-by: Daniel Rosenberg --- fs/fuse/dev.c | 5 +++++ fs/fuse/dir.c | 45 +++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_i.h | 3 +++ include/uapi/linux/fuse.h | 1 + 4 files changed, 54 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 175fcdeabe4c..8932c06e40c1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -1935,6 +1936,10 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, cs->move_pages = 0; err = copy_out_args(cs, &req->out, nbytes); + if (req->in.h.opcode == FUSE_CANONICAL_PATH) { + req->out.h.error = kern_path((char *)req->out.args[0].value, 0, + req->canonical_path); + } fuse_copy_finish(cs); spin_lock(&fpq->lock); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5e2e08712d3b..24df0a5df675 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -267,6 +267,50 @@ invalid: goto out; } +/* + * Get the canonical path. Since we must translate to a path, this must be done + * in the context of the userspace daemon, however, the userspace daemon cannot + * look up paths on its own. Instead, we handle the lookup as a special case + * inside of the write request. + */ +static void fuse_dentry_canonical_path(const struct path *path, struct path *canonical_path) { + struct inode *inode = path->dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + int err; + char *path_name; + + req = fuse_get_req(fc, 1); + err = PTR_ERR(req); + if (IS_ERR(req)) + goto default_path; + + path_name = (char*)__get_free_page(GFP_KERNEL); + if (!path_name) { + fuse_put_request(fc, req); + goto default_path; + } + + req->in.h.opcode = FUSE_CANONICAL_PATH; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 0; + req->out.numargs = 1; + req->out.args[0].size = PATH_MAX; + req->out.args[0].value = path_name; + req->canonical_path = canonical_path; + req->out.argvar = 1; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + free_page((unsigned long)path_name); + if (!err) + return; +default_path: + canonical_path->dentry = path->dentry; + canonical_path->mnt = path->mnt; + path_get(canonical_path); +} + static int invalid_nodeid(u64 nodeid) { return !nodeid || nodeid == FUSE_ROOT_ID; @@ -274,6 +318,7 @@ static int invalid_nodeid(u64 nodeid) const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, + .d_canonical_path = fuse_dentry_canonical_path, }; int fuse_valid_type(int m) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 405113101db8..6b4f8dd8e9c4 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -363,6 +363,9 @@ struct fuse_req { /** Inode used in the request or NULL */ struct inode *inode; + /** Path used for completing d_canonical_path */ + struct path *canonical_path; + /** AIO control block */ struct fuse_io_priv *io; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index c9aca042e61d..8485dd4300b8 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -358,6 +358,7 @@ enum fuse_opcode { FUSE_FALLOCATE = 43, FUSE_READDIRPLUS = 44, FUSE_RENAME2 = 45, + FUSE_CANONICAL_PATH= 2016, /* CUSE specific operations */ CUSE_INIT = 4096, From 5a0725f49f3ab0ebada002e1d4b13943c3a35d18 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 26 Apr 2016 15:51:06 +0530 Subject: [PATCH 081/607] Revert "SELinux: build fix for 4.1" This reverts commit 43e1b4f528e1654fadd1097f7cc5c50be6e45b77. 
This patch is part of code which is already upstreamed in v4.4. Commits 5c73fceb8c70 (SELinux: Enable setting security contexts on rootfs inodes.), 12f348b9dcf6 (SELinux: rename SE_SBLABELSUPP to SBLABEL_MNT), and b43e725d8d38 (SELinux: use a helper function to determine seclabel). for reference. Signed-off-by: Amit Pundir --- security/selinux/hooks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 78d06ff4eeb7..d4d12949de09 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -469,7 +469,7 @@ static int sb_finish_set_opts(struct super_block *sb) * setting SELinux context on in-core inodes. */ if (strncmp(sb->s_type->name, "rootfs", sizeof("rootfs")) == 0) - sbsec->flags |= SBLABEL_MNT; + sbsec->flags |= SE_SBLABELSUPP; /* Initialize the root inode. */ rc = inode_doinit_with_dentry(root_inode, root); From 66daa5f5e2b9078a7cecf37e896385aeee520005 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 26 Apr 2016 15:51:20 +0530 Subject: [PATCH 082/607] Revert "SELinux: Enable setting security contexts on rootfs inodes." This reverts commit 78d36d2111cd4ca722a602846f7db8f54a0b074c. Drop this duplicate patch. This patch is already upstreamed in v4.4. Commits 5c73fceb8c70 (SELinux: Enable setting security contexts on rootfs inodes.), 12f348b9dcf6 (SELinux: rename SE_SBLABELSUPP to SBLABEL_MNT), and b43e725d8d38 (SELinux: use a helper function to determine seclabel), for reference. Signed-off-by: Amit Pundir --- security/selinux/hooks.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index d4d12949de09..9b0586d0f3c4 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -464,13 +464,6 @@ static int sb_finish_set_opts(struct super_block *sb) if (selinux_is_sblabel_mnt(sb)) sbsec->flags |= SBLABEL_MNT; - /* - * Special handling for rootfs. Is genfs but supports - * setting SELinux context on in-core inodes. - */ - if (strncmp(sb->s_type->name, "rootfs", sizeof("rootfs")) == 0) - sbsec->flags |= SE_SBLABELSUPP; - /* Initialize the root inode. */ rc = inode_doinit_with_dentry(root_inode, root); From ad95c12f66df9efae04b15d5c4d0d0ba56ab2620 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Mon, 25 Apr 2016 14:28:30 -0700 Subject: [PATCH 083/607] Revert "mm: vmscan: Add a debug file for shrinkers" Kernel panic when type "cat /sys/kernel/debug/shrinker" Unable to handle kernel paging request at virtual address 0af37d40 pgd = d4dec000 [0af37d40] *pgd=00000000 Internal error: Oops: 5 [#1] PREEMPT SMP ARM [] (_raw_spin_lock) from [] (list_lru_count_one+0x14/0x28) [] (list_lru_count_one) from [] (super_cache_count+0x40/0xa0) [] (super_cache_count) from [] (debug_shrinker_show+0x50/0x90) [] (debug_shrinker_show) from [] (seq_read+0x1ec/0x48c) [] (seq_read) from [] (__vfs_read+0x20/0xd0) [] (__vfs_read) from [] (vfs_read+0x7c/0x104) [] (vfs_read) from [] (SyS_read+0x44/0x9c) [] (SyS_read) from [] (ret_fast_syscall+0x0/0x3c) Code: e1a04000 e3a00001 ebd66b39 f594f000 (e1943f9f) ---[ end trace 60c74014a63a9688 ]--- Kernel panic - not syncing: Fatal exception shrink_control.nid is used but not initialzed, same for shrink_control.memcg. This reverts commit b0e7a582b2264cdf75874dcd8df915b6b4427755. 
Change-Id: I108de88fa4baaef99a53c4e4c6a1d8c4b4804157 Reported-by: Xiaowen Liu Signed-off-by: Dmitry Shmidt --- mm/vmscan.c | 43 ------------------------------------------- 1 file changed, 43 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 0c56f6a812bc..2aec4241b42a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include @@ -221,39 +220,6 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru); } -struct dentry *debug_file; - -static int debug_shrinker_show(struct seq_file *s, void *unused) -{ - struct shrinker *shrinker; - struct shrink_control sc; - - sc.gfp_mask = -1; - sc.nr_to_scan = 0; - - down_read(&shrinker_rwsem); - list_for_each_entry(shrinker, &shrinker_list, list) { - int num_objs; - - num_objs = shrinker->count_objects(shrinker, &sc); - seq_printf(s, "%pf %d\n", shrinker->scan_objects, num_objs); - } - up_read(&shrinker_rwsem); - return 0; -} - -static int debug_shrinker_open(struct inode *inode, struct file *file) -{ - return single_open(file, debug_shrinker_show, inode->i_private); -} - -static const struct file_operations debug_shrinker_fops = { - .open = debug_shrinker_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - /* * Add a shrinker callback to be called from the vm. */ @@ -283,15 +249,6 @@ int register_shrinker(struct shrinker *shrinker) } EXPORT_SYMBOL(register_shrinker); -static int __init add_shrinker_debug(void) -{ - debugfs_create_file("shrinker", 0644, NULL, NULL, - &debug_shrinker_fops); - return 0; -} - -late_initcall(add_shrinker_debug); - /* * Remove one */ From 4e461c777e345727aa2988377774c996d303ac46 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Apr 2016 17:12:57 -0700 Subject: [PATCH 084/607] xt_qtaguid: Fix panic caused by synack processing In upstream commit ca6fb06518836ef9b65dc0aac02ff97704d52a05 (tcp: attach SYNACK messages to request sockets instead of listener) http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=ca6fb0651883 The building of synack messages was changed, which made it so the skb->sk points to a casted request_sock. This is problematic, as there is no sk_socket in a request_sock. So when the qtaguid_mt function tries to access the sk->sk_socket, it accesses uninitialized memory. After looking at how other netfilter implementations handle this, I realized there was a skb_to_full_sk() helper added, which the xt_qtaguid code isn't yet using. This patch adds its use, and resovles panics seen when accessing uninitialzed memory when processing synack packets. Reported-by: YongQin Liu Signed-off-by: John Stultz --- net/netfilter/xt_qtaguid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index e1442bfb668d..822dc3c3bce1 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1689,7 +1689,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) /* default: Fall through and do UID releated work */ } - sk = skb->sk; + sk = skb_to_full_sk(skb); /* * When in TCP_TIME_WAIT the sk is not a "struct sock" but * "struct inet_timewait_sock" which is missing fields. 
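As an aside, a sketch of what the helper does (behaviour summarized here, not quoted from the patch): when skb->sk is really a cast request_sock in TCP_NEW_SYN_RECV state, skb_to_full_sk() returns the listener socket that owns the request, so reads such as sk->sk_socket touch a fully initialized struct sock; for ordinary full sockets it just returns skb->sk unchanged:

	sk = skb_to_full_sk(skb);	/* listener instead of the cast request_sock */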
From 379cf6e8ddb187ad202bf05d22251e280fee26d1 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 26 Apr 2016 15:14:35 +0530 Subject: [PATCH 085/607] Revert "HID: steelseries: validate output report details" This reverts commit 90037b2720acffa6da2269a10ecf24ec2dace89b. Remove duplicate code. This patch is already upstreamed in v4.4, commit 41df7f6d4372 (HID: steelseries: validate output report details). Signed-off-by: Amit Pundir --- drivers/hid/hid-steelseries.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/hid/hid-steelseries.c b/drivers/hid/hid-steelseries.c index 93ddc1c65b4c..3edd4ac36494 100644 --- a/drivers/hid/hid-steelseries.c +++ b/drivers/hid/hid-steelseries.c @@ -253,11 +253,6 @@ static int steelseries_srws1_probe(struct hid_device *hdev, goto err_free; } - if (!hid_validate_values(hdev, HID_OUTPUT_REPORT, 0, 0, 16)) { - ret = -ENODEV; - goto err_free; - } - ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (ret) { hid_err(hdev, "hw start failed\n"); From 9fc7b46c3d3ffdd135c601e5c28a6bb316f3a011 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 26 Apr 2016 14:47:53 +0530 Subject: [PATCH 086/607] Revert "hid-multitouch: Filter collections by application usage." This reverts commit 0840b80cb9626906b57df54e7229db60f9aea4f2. This patch is already upstreamed in v4.4, commit 658d4aed59b3 (HID: hid-multitouch: Filter collections by application usage.), and further fixed/cleaned up afterwards in commits c2ef8f21ea8f (HID: multitouch: add support for trackpads), 76f5902aebda (HID: hid-multitouch: Simplify setup and frame synchronization) et al. By having this duplicate patch in AOSP we are doing redundant checks for Touchscreen and Touchpad devices. Signed-off-by: Amit Pundir --- drivers/hid/hid-multitouch.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 96759b270360..3d664d01305e 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -434,16 +434,6 @@ static int mt_touch_input_mapping(struct hid_device *hdev, struct hid_input *hi, if ((usage->hid & HID_USAGE_PAGE) == HID_UP_BUTTON) td->buttons_count++; - /* Only map fields from TouchScreen or TouchPad collections. - * We need to ignore fields that belong to other collections - * such as Mouse that might have the same GenericDesktop usages. */ - if (field->application == HID_DG_TOUCHSCREEN) - set_bit(INPUT_PROP_DIRECT, hi->input->propbit); - else if (field->application == HID_DG_TOUCHPAD) - set_bit(INPUT_PROP_POINTER, hi->input->propbit); - else - return 0; - if (usage->usage_index) prev_usage = &field->usage[usage->usage_index - 1]; From 3a343a1540d4376d838c0a29bd5462d4e961e766 Mon Sep 17 00:00:00 2001 From: Yongqin Liu Date: Thu, 28 Apr 2016 13:53:36 +0800 Subject: [PATCH 087/607] quick selinux support for tracefs Here is just the quick fix for tracefs with selinux. 
just add tracefs to the list of whitelisted filesystem types in selinux_is_sblabel_mnt(), but the right fix would be to generalize this logic as described in the last item on the todo list, https://bitbucket.org/seandroid/wiki/wiki/ToDo Change-Id: I2aa803ccffbcd2802a7287514da7648e6b364157 Signed-off-by: Yongqin Liu --- security/selinux/hooks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 9b0586d0f3c4..94a0bfc748d1 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -420,6 +420,7 @@ static int selinux_is_sblabel_mnt(struct super_block *sb) !strcmp(sb->s_type->name, "sysfs") || !strcmp(sb->s_type->name, "pstore") || !strcmp(sb->s_type->name, "debugfs") || + !strcmp(sb->s_type->name, "tracefs") || !strcmp(sb->s_type->name, "rootfs"); } From f24888af18176246638498325fb1ecd49e0ef00e Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 2 May 2016 15:32:15 +0530 Subject: [PATCH 088/607] Revert "mmc: Add status IRQ and status callback function to mmc platform data" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 91fa97e1e5c001d52f6c993d37be08d1e84f47b7. This patch is no longer valid. There are no users for this status irq and callback in android-4.x. The Qcom platform (mach-msm/qsd8x50, HTC Dream..) and SDCC controller (msm_sdcc) using this status IRQ and callback are dropped from mainline sometime back. 27842bb18b00 (mmc: Remove msm_sdcc driver) c0c89fafa289 (ARM: Remove mach-msm and associated ARM architecture code) Change-Id: Ia38e42a06dc184395f79c1ec1d306bf9775704d5 Signed-off-by: Amit Pundir --- include/linux/amba/mmci.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/amba/mmci.h b/include/linux/amba/mmci.h index 8bfd21c13d59..eff56cb0016a 100644 --- a/include/linux/amba/mmci.h +++ b/include/linux/amba/mmci.h @@ -40,10 +40,7 @@ struct mmci_platform_data { int gpio_wp; int gpio_cd; bool cd_invert; - unsigned int status_irq; struct embedded_sdio_data *embedded_sdio; - int (*register_status_notify)(void (*callback)(int card_present, void *dev_id), void *dev_id); - }; #endif From d114f2c3570fc63019656b6870e4fcc506852334 Mon Sep 17 00:00:00 2001 From: Jack Pham Date: Wed, 23 Mar 2016 13:18:03 -0700 Subject: [PATCH 089/607] usb: dual-role: make stub functions inline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_DUAL_ROLE_USB_INTF is disabled but the exported functions are referenced, the build will result in warnings such as: In file included from include/linux/usb/class-dual-role.h:112:13: warning: ‘dual_role_instance_changed’ defined but not used [-Wunused-function] These stub functions should be static inline. 
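For illustration (hypothetical names, not taken from this header): a plain static stub defined in a header is emitted, and warned about, in every translation unit that includes the header without calling it, while a static inline stub is only instantiated where it is actually used:

	/* example_stubs.h */
	static void foo_stub(void) {}		/* "defined but not used" in each TU that never calls it */
	static inline void bar_stub(void) {}	/* no warning */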
Change-Id: I5a9ef58dca32306fac5a4c7f28cdaa36fa8ae078 Signed-off-by: Jack Pham (cherry picked from commit 2d152dbb0743526b21d6bbefe097f874c027f860) (cherry picked from commit 8ad66cafaa10e6ba94ff79a8dbc2cc437c6bfe93) --- include/linux/usb/class-dual-role.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/usb/class-dual-role.h b/include/linux/usb/class-dual-role.h index af42ed34944a..c6df2238012e 100644 --- a/include/linux/usb/class-dual-role.h +++ b/include/linux/usb/class-dual-role.h @@ -109,18 +109,19 @@ extern int dual_role_property_is_writeable(struct dual_role_phy_instance enum dual_role_property prop); extern void *dual_role_get_drvdata(struct dual_role_phy_instance *dual_role); #else /* CONFIG_DUAL_ROLE_USB_INTF */ -static void dual_role_instance_changed(struct dual_role_phy_instance +static inline void dual_role_instance_changed(struct dual_role_phy_instance *dual_role){} -static struct dual_role_phy_instance *__must_check +static inline struct dual_role_phy_instance *__must_check devm_dual_role_instance_register(struct device *parent, const struct dual_role_phy_desc *desc) { return ERR_PTR(-ENOSYS); } -static void devm_dual_role_instance_unregister(struct device *dev, +static inline void devm_dual_role_instance_unregister(struct device *dev, struct dual_role_phy_instance *dual_role){} -static void *dual_role_get_drvdata(struct dual_role_phy_instance *dual_role) +static inline void *dual_role_get_drvdata(struct dual_role_phy_instance + *dual_role) { return ERR_PTR(-ENOSYS); } From 8b43e279a24727794cf6dd61fcad956f532002d3 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Wed, 4 May 2016 11:05:15 +0530 Subject: [PATCH 090/607] Revert "ARM: Add 'card_present' state to mmc_platfrom_data" This reverts commit 541632275e983573b8250fcd4402f772d7bd1e6f. mmc_platform_data (or arch/arm/include/asm/mach/mmc.h in general) has no active user in android-4.x kernels. Also all the necessary bits are already moved to include/linux/amba/mmci.h. 6ef297f86b62 (ARM: 5720/1: Move MMCI header to amba include dir) Change-Id: Iff384eb527327bf88543408e0257241c1fd99a43 Signed-off-by: Amit Pundir --- arch/arm/include/asm/mach/mmc.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/include/asm/mach/mmc.h b/arch/arm/include/asm/mach/mmc.h index bca864ac945f..f8d391ad9203 100644 --- a/arch/arm/include/asm/mach/mmc.h +++ b/arch/arm/include/asm/mach/mmc.h @@ -18,7 +18,6 @@ struct embedded_sdio_data { struct mmc_platform_data { unsigned int ocr_mask; /* available voltages */ int built_in; /* built-in device flag */ - int card_present; /* card detect state */ u32 (*translate_vdd)(struct device *, unsigned int); unsigned int (*status)(struct device *); struct embedded_sdio_data *embedded_sdio; From b2cee99f2bada8a9da66cd29b71a79aaf52227ce Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Wed, 4 May 2016 11:14:16 +0530 Subject: [PATCH 091/607] Revert "Recreate asm/mach/mmc.h include file" This reverts commit 5b42ae3edab6c39c1337d36881d29350bb36dcff. This recereated arch/arm/include/asm/mach/mmc.h include file has no active user in android-4.x kernels. Also all the necessary bits are already moved to include/linux/amba/mmci.h. 
6ef297f86b62 (ARM: 5720/1: Move MMCI header to amba include dir) Change-Id: Ibf258b355d17f54f49b777a8f6e0089e9b59a3a5 Signed-off-by: Amit Pundir --- arch/arm/include/asm/mach/mmc.h | 27 --------------------------- 1 file changed, 27 deletions(-) delete mode 100644 arch/arm/include/asm/mach/mmc.h diff --git a/arch/arm/include/asm/mach/mmc.h b/arch/arm/include/asm/mach/mmc.h deleted file mode 100644 index f8d391ad9203..000000000000 --- a/arch/arm/include/asm/mach/mmc.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * arch/arm/include/asm/mach/mmc.h - */ -#ifndef ASMARM_MACH_MMC_H -#define ASMARM_MACH_MMC_H - -#include -#include -#include - -struct embedded_sdio_data { - struct sdio_cis cis; - struct sdio_cccr cccr; - struct sdio_embedded_func *funcs; - int num_funcs; -}; - -struct mmc_platform_data { - unsigned int ocr_mask; /* available voltages */ - int built_in; /* built-in device flag */ - u32 (*translate_vdd)(struct device *, unsigned int); - unsigned int (*status)(struct device *); - struct embedded_sdio_data *embedded_sdio; - int (*register_status_notify)(void (*callback)(int card_present, void *dev_id), void *dev_id); -}; - -#endif From f355b21139b26c70f38ae3cc19388c513a75242c Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Wed, 4 May 2016 13:51:38 -0700 Subject: [PATCH 092/607] fiq_debugger: Add option to apply uart overlay by FIQ_DEBUGGER_UART_OVERLAY fiq_debugger is taking over uart, so it is necessary to disable original uart in DT file. It can be done manually or by overlay. Change-Id: I9f50ec15b0e22e602d73b9f745fc8666f8925d09 Signed-off-by: Dmitry Shmidt --- drivers/staging/android/fiq_debugger/Kconfig | 9 ++++++ .../android/fiq_debugger/fiq_debugger.c | 30 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/drivers/staging/android/fiq_debugger/Kconfig b/drivers/staging/android/fiq_debugger/Kconfig index 56f7f999377e..60fc224d4efc 100644 --- a/drivers/staging/android/fiq_debugger/Kconfig +++ b/drivers/staging/android/fiq_debugger/Kconfig @@ -42,6 +42,15 @@ config FIQ_DEBUGGER_CONSOLE_DEFAULT_ENABLE If enabled, this puts the fiq debugger into console mode by default. Otherwise, the fiq debugger will start out in debug mode. +config FIQ_DEBUGGER_UART_OVERLAY + bool "Install uart DT overlay" + depends on FIQ_DEBUGGER + select OF_OVERLAY + default n + help + If enabled, fiq debugger is calling fiq_debugger_uart_overlay() + that will apply overlay uart_overlay@0 to disable proper uart. 
+ config FIQ_WATCHDOG bool select FIQ_DEBUGGER diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger.c b/drivers/staging/android/fiq_debugger/fiq_debugger.c index 7f056831dbff..0c558331a3d2 100644 --- a/drivers/staging/android/fiq_debugger/fiq_debugger.c +++ b/drivers/staging/android/fiq_debugger/fiq_debugger.c @@ -39,6 +39,10 @@ #include #endif +#ifdef CONFIG_FIQ_DEBUGGER_UART_OVERLAY +#include +#endif + #include #include "fiq_debugger.h" @@ -1201,10 +1205,36 @@ static struct platform_driver fiq_debugger_driver = { }, }; +#if defined(CONFIG_FIQ_DEBUGGER_UART_OVERLAY) +int fiq_debugger_uart_overlay(void) +{ + struct device_node *onp = of_find_node_by_path("/uart_overlay@0"); + int ret; + + if (!onp) { + pr_err("serial_debugger: uart overlay not found\n"); + return -ENODEV; + } + + ret = of_overlay_create(onp); + if (ret < 0) { + pr_err("serial_debugger: fail to create overlay: %d\n", ret); + of_node_put(onp); + return ret; + } + + pr_info("serial_debugger: uart overlay applied\n"); + return 0; +} +#endif + static int __init fiq_debugger_init(void) { #if defined(CONFIG_FIQ_DEBUGGER_CONSOLE) fiq_debugger_tty_init(); +#endif +#if defined(CONFIG_FIQ_DEBUGGER_UART_OVERLAY) + fiq_debugger_uart_overlay(); #endif return platform_driver_register(&fiq_debugger_driver); } From 0ecbd590a7175818fcbccc9d5b1dc819aae25c86 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Thu, 17 Dec 2015 07:34:27 +0800 Subject: [PATCH 093/607] sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task() If a newly created task is selected to go to a different CPU in fork balance when it wakes up the first time, its load averages should not be removed from the source CPU since they are never added to it before. The same is also applicable to a never used group entity. Fix it in remove_entity_load_avg(): when entity's last_update_time is 0, simply return. This should precisely identify the case in question, because in other migrations, the last_update_time is set to 0 after remove_entity_load_avg(). Reported-by: Steve Muckle Signed-off-by: Yuyang Du [peterz: cfs_rq_last_update_time] Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20151216233427.GJ28098@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cfdc0e61066c..6e7d36f2ed8f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2809,6 +2809,27 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } +#ifndef CONFIG_64BIT +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + u64 last_update_time_copy; + u64 last_update_time; + + do { + last_update_time_copy = cfs_rq->load_last_update_time_copy; + smp_rmb(); + last_update_time = cfs_rq->avg.last_update_time; + } while (last_update_time != last_update_time_copy); + + return last_update_time; +} +#else +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.last_update_time; +} +#endif + /* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). 
@@ -2818,17 +2839,14 @@ void remove_entity_load_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 last_update_time; -#ifndef CONFIG_64BIT - u64 last_update_time_copy; + /* + * Newly created task or never used group entity should not be removed + * from its (source) cfs_rq + */ + if (se->avg.last_update_time == 0) + return; - do { - last_update_time_copy = cfs_rq->load_last_update_time_copy; - smp_rmb(); - last_update_time = cfs_rq->avg.last_update_time; - } while (last_update_time != last_update_time_copy); -#else - last_update_time = cfs_rq->avg.last_update_time; -#endif + last_update_time = cfs_rq_last_update_time(cfs_rq); __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); From 259bd4cd8a4c5776e86ae1a86f1f8e1168e84212 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 17 Sep 2015 16:10:56 +0100 Subject: [PATCH 094/607] cpufreq: Frequency invariant scheduler load-tracking support Implements cpufreq_scale_freq_capacity() to provide the scheduler with a frequency scaling correction factor for more accurate load-tracking. The factor is: current_freq(cpu) << SCHED_CAPACITY_SHIFT / max_freq(cpu) In fact, freq_scale should be a struct cpufreq_policy data member. But this would require that the scheduler hot path (__update_load_avg()) would have to grab the cpufreq lock. This can be avoided by using per-cpu data initialized to SCHED_CAPACITY_SCALE for freq_scale. Signed-off-by: Dietmar Eggemann --- drivers/cpufreq/cpufreq.c | 29 +++++++++++++++++++++++++++++ include/linux/cpufreq.h | 3 +++ 2 files changed, 32 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 8412ce5f93a7..46a7d62c8174 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -347,6 +347,31 @@ static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) #endif } +/********************************************************************* + * FREQUENCY INVARIANT CPU CAPACITY * + *********************************************************************/ + +static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; + +static void +scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) +{ + unsigned long cur = freqs ? 
freqs->new : policy->cur; + unsigned long scale = (cur << SCHED_CAPACITY_SHIFT) / policy->max; + int cpu; + + pr_debug("cpus %*pbl cur/cur max freq %lu/%u kHz freq scale %lu\n", + cpumask_pr_args(policy->cpus), cur, policy->max, scale); + + for_each_cpu(cpu, policy->cpus) + per_cpu(freq_scale, cpu) = scale; +} + +unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu) +{ + return per_cpu(freq_scale, cpu); +} + static void __cpufreq_notify_transition(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, unsigned int state) { @@ -450,6 +475,8 @@ wait: spin_unlock(&policy->transition_lock); + scale_freq_capacity(policy, freqs); + cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE); } EXPORT_SYMBOL_GPL(cpufreq_freq_transition_begin); @@ -2126,6 +2153,8 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_NOTIFY, new_policy); + scale_freq_capacity(new_policy, NULL); + policy->min = new_policy->min; policy->max = new_policy->max; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 177c7680c1a8..11c65f536b6c 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -616,4 +616,7 @@ unsigned int cpufreq_generic_get(unsigned int cpu); int cpufreq_generic_init(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table, unsigned int transition_latency); + +struct sched_domain; +unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); #endif /* _LINUX_CPUFREQ_H */ From 372b62344c227283acd23d6ca63543a4ceb27abb Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 23 Sep 2015 12:47:48 +0100 Subject: [PATCH 095/607] arm: Enable frequency invariant scheduler load-tracking support Defines arch_scale_freq_capacity() to use cpufreq implementation. Signed-off-by: Dietmar Eggemann --- arch/arm/include/asm/topology.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 370f7a732900..a69917b7d2c9 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -24,6 +24,11 @@ void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +#ifdef CONFIG_CPU_FREQ +#include +#define arch_scale_freq_capacity cpufreq_scale_freq_capacity +#endif + #else static inline void init_cpu_topology(void) { } From 8411396e9c5cd1badfe05e08624d90cbbb7b1126 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 25 Sep 2015 17:15:11 +0100 Subject: [PATCH 096/607] arm64: Enable frequency invariant scheduler load-tracking support Defines arch_scale_freq_capacity() to use cpufreq implementation. Including in topology.h like for the arm arch doesn't work because of CONFIG_COMPAT=y (Kernel support for 32-bit EL0). That's why cpufreq_scale_freq_capacity() has to be declared extern in topology.h. 
Signed-off-by: Dietmar Eggemann --- arch/arm64/include/asm/topology.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index a3e9d6fdbf21..72c47c3fa7b3 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -22,6 +22,12 @@ void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +#ifdef CONFIG_CPU_FREQ +#define arch_scale_freq_capacity cpufreq_scale_freq_capacity +struct sched_domain; +extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); +#endif + #include #endif /* _ASM_ARM_TOPOLOGY_H */ From 0f8edc82743fd749a3536c68386c89112e760792 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 14 Apr 2015 16:25:31 +0100 Subject: [PATCH 097/607] arm: Update arch_scale_cpu_capacity() to reflect change to define arch_scale_cpu_capacity() is no longer a weak function but a #define instead. Include the #define in topology.h. cc: Russell King Signed-off-by: Morten Rasmussen --- arch/arm/include/asm/topology.h | 2 ++ arch/arm/kernel/topology.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index a69917b7d2c9..e3e596cbb1a7 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -28,6 +28,8 @@ const struct cpumask *cpu_coregroup_mask(int cpu); #include #define arch_scale_freq_capacity cpufreq_scale_freq_capacity #endif +#define arch_scale_cpu_capacity scale_cpu_capacity +extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu); #else diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 08b7847bf912..614554765e44 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -42,7 +42,7 @@ */ static DEFINE_PER_CPU(unsigned long, cpu_scale); -unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) +unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) { return per_cpu(cpu_scale, cpu); } From 1eb2b8aa45840b50cd953052315a8b4f6bc6416d Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 7 May 2015 18:46:15 +0100 Subject: [PATCH 098/607] sched: Store system-wide maximum cpu capacity in root domain To be able to compare the capacity of the target cpu with the highest cpu capacity of the system in the wakeup path, store the system-wide maximum cpu capacity in the root domain. 
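For context, the consumer added later in this series (task_fits_max() in the wakeup path) reads this value back as cpu_rq(cpu)->rd->max_cpu_capacity and, roughly, treats a cpu as big enough when

	capacity_of(cpu) * capacity_margin > max_cpu_capacity * 1024

(see the wakeup balancing patch that follows).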
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 8 ++++++++ kernel/sched/sched.h | 3 +++ 2 files changed, 11 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 732e993b564b..7c6e2b326349 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6971,6 +6971,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; + struct rq *rq = NULL; int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); @@ -7021,11 +7022,18 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { + rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); + + if (rq->cpu_capacity_orig > rq->rd->max_cpu_capacity) + rq->rd->max_cpu_capacity = rq->cpu_capacity_orig; } rcu_read_unlock(); + if (rq) + pr_info("max cpu_capacity %lu\n", rq->rd->max_cpu_capacity); + ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b242775bf670..2b7ffa5a20ad 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -543,6 +543,9 @@ struct root_domain { */ cpumask_var_t rto_mask; struct cpupri cpupri; + + /* Maximum cpu capacity in the system. */ + unsigned long max_cpu_capacity; }; extern struct root_domain def_root_domain; From e8bcb272bc920d875c26ca2e06c872b9f844f6f1 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Sat, 9 May 2015 19:53:49 +0100 Subject: [PATCH 099/607] sched: Add cpu capacity awareness to wakeup balancing Wakeup balancing is completely unaware of cpu capacity, cpu utilization and task utilization. The task is preferably placed on a cpu which is idle in the instant the wakeup happens. New tasks (SD_BALANCE_{FORK,EXEC} are placed on an idle cpu in the idlest group if such can be found, otherwise it goes on the least loaded one. Existing tasks (SD_BALANCE_WAKE) are placed on the previous cpu or an idle cpu sharing the same last level cache unless the wakee_flips heuristic in wake_wide() decides to fallback to considering cpus outside SD_LLC. Hence existing tasks are not guaranteed to get a chance to migrate to a different group at wakeup in case the current one has reduced cpu capacity (due RT/IRQ pressure or different uarch e.g. ARM big.LITTLE). They may eventually get pulled by other cpus doing periodic/idle/nohz_idle balance, but it may take quite a while before it happens. This patch adds capacity awareness to find_idlest_{group,queue} (used by SD_BALANCE_{FORK,EXEC} and SD_BALANCE_WAKE under certain circumstances) such that groups/cpus that can accommodate the waking task based on task utilization are preferred. In addition, wakeup of existing tasks (SD_BALANCE_WAKE) is sent through find_idlest_{group,queue} also if the task doesn't fit the capacity of the previous cpu to allow it to escape (override wake_affine) when necessary instead of relying on periodic/idle/nohz_idle balance to eventually sort it out. 
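As a rough worked example (the cpu and task numbers below are invented; the 1024 scale and the ~20% capacity_margin of 1280 are the values used in the patch): a cpu is considered to fit a waking task when

	capacity_of(cpu) * 1024  >  (cpu_util(cpu) + task_util(p)) * capacity_margin

so a cpu with capacity 640 and utilization 200 still fits a task of utilization 300 (640 * 1024 = 655360 > 500 * 1280 = 640000), but not one of utilization 320 (655360 < 520 * 1280 = 665600), in which case the wakeup path is allowed to look beyond the previous cpu for a higher-capacity target.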
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e7d36f2ed8f..5e112bba1b5a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4742,6 +4742,43 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) return 1; } +static inline unsigned long task_util(struct task_struct *p) +{ + return p->se.avg.util_avg; +} + +static unsigned int capacity_margin = 1280; /* ~20% margin */ + +static inline bool __task_fits(struct task_struct *p, int cpu, int util) +{ + unsigned long capacity = capacity_of(cpu); + + util += task_util(p); + + return (capacity * 1024) > (util * capacity_margin); +} + +static inline bool task_fits_max(struct task_struct *p, int cpu) +{ + unsigned long capacity = capacity_of(cpu); + unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity; + + if (capacity == max_capacity) + return true; + + if (capacity * capacity_margin > max_capacity * 1024) + return true; + + return __task_fits(p, cpu, 0); +} + +static int cpu_util(int cpu); + +static inline bool task_fits_spare(struct task_struct *p, int cpu) +{ + return __task_fits(p, cpu, cpu_util(cpu)); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -4751,7 +4788,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; + struct sched_group *fit_group = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; + unsigned long fit_capacity = ULONG_MAX; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -4782,6 +4821,15 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load = target_load(i, load_idx); avg_load += load; + + /* + * Look for most energy-efficient group that can fit + * that can fit the task. 
+ */ + if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) { + fit_capacity = capacity_of(i); + fit_group = group; + } } /* Adjust by relative CPU capacity of the group */ @@ -4795,6 +4843,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } } while (group = group->next, group != sd->groups); + if (fit_group) + return fit_group; + if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; @@ -4815,7 +4866,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { - if (idle_cpu(i)) { + if (task_fits_spare(p, i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { @@ -4827,7 +4878,8 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) min_exit_latency = idle->exit_latency; latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; - } else if ((!idle || idle->exit_latency == min_exit_latency) && + } else if (idle_cpu(i) && + (!idle || idle->exit_latency == min_exit_latency) && rq->idle_stamp > latest_idle_timestamp) { /* * If equal or no active idle state, then @@ -4836,6 +4888,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) */ latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; + } else if (shallowest_idle_cpu == -1) { + /* + * If we haven't found an idle CPU yet + * pick a non-idle one that can fit the task as + * fallback. + */ + shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(i); @@ -4950,7 +5009,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = !wake_wide(p) && task_fits_max(p, cpu) && + cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); rcu_read_lock(); for_each_domain(cpu, tmp) { From 8a5c0339bb7b7eedd2e75b62ebe4144a195f7ef3 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 6 Jul 2015 15:01:10 +0100 Subject: [PATCH 100/607] sched: Consider spare cpu capacity at task wake-up find_idlest_group() selects the wake-up target group purely based on group load which leads to suboptimal choices in low load scenarios. An idle group with reduced capacity (due to RT tasks or different cpu type) isn't necessarily a better target than a lightly loaded group with higher capacity. The patch adds spare capacity as an additional group selection parameter. The target group is now selected based on the following criteria: 1. Return the group with the cpu with most spare capacity and this capacity is significant if such group exists. Significant spare capacity is currently at least 20% to spare. 2. Return the group with the lowest load, unless it is the local group in which case NULL is returned and the search is continued at the next (lower) level. 
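A small worked example of criterion 1 (numbers invented; the constants are the ones used below, capacity_margin = 1280 and SCHED_LOAD_SCALE = 1024, i.e. a threshold of 256): a cpu with capacity 1024 and utilization 700 has spare capacity 1024 - 700 = 324 > 256, so its group is a candidate spare-capacity target, whereas at utilization 800 the spare capacity is only 224 and selection falls back to criterion 2.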
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5e112bba1b5a..bebc8367edee 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4788,9 +4788,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; - struct sched_group *fit_group = NULL; + struct sched_group *fit_group = NULL, *spare_group = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; unsigned long fit_capacity = ULONG_MAX; + unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -4798,7 +4799,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load_idx = sd->wake_idx; do { - unsigned long load, avg_load; + unsigned long load, avg_load, spare_capacity; int local_group; int i; @@ -4830,6 +4831,16 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, fit_capacity = capacity_of(i); fit_group = group; } + + /* + * Look for group which has most spare capacity on a + * single cpu. + */ + spare_capacity = capacity_of(i) - cpu_util(i); + if (spare_capacity > max_spare_capacity) { + max_spare_capacity = spare_capacity; + spare_group = group; + } } /* Adjust by relative CPU capacity of the group */ @@ -4846,6 +4857,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (fit_group) return fit_group; + if (spare_group) + return spare_group; + if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; From cda2bd333be6e5e8c38fa072c9c4c296312dfd8d Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 26 Jan 2015 19:47:28 +0000 Subject: [PATCH 101/607] sched: Enable idle balance to pull single task towards cpu with higher capacity We do not want to miss out on the ability to pull a single remaining task from a potential source cpu towards an idle destination cpu. Add an extra criteria to need_active_balance() to kick off active load balance if the source cpu is over-utilized and has lower capacity than the destination cpu. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bebc8367edee..fe40cdb8e640 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4779,6 +4779,11 @@ static inline bool task_fits_spare(struct task_struct *p, int cpu) return __task_fits(p, cpu, cpu_util(cpu)); } +static bool cpu_overutilized(int cpu) +{ + return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. 
@@ -6995,6 +7000,13 @@ static int need_active_balance(struct lb_env *env) return 1; } + if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) && + env->src_rq->cfs.h_nr_running == 1 && + cpu_overutilized(env->src_cpu) && + !cpu_overutilized(env->dst_cpu)) { + return 1; + } + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } From 681fa1413b869ca73f685b9c8769cdd645879804 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 2 Jul 2015 17:16:34 +0100 Subject: [PATCH 102/607] sched: Prevent unnecessary active balance of single task in sched group Scenarios with the busiest group having just one task and the local being idle on topologies with sched groups with different numbers of cpus manage to dodge all load-balance bailout conditions resulting the nr_balance_failed counter to be incremented. This eventually causes a pointless active migration of the task. This patch prevents this by not incrementing the counter when the busiest group only has one task. ASYM_PACKING migrations and migrations due to reduced capacity should still take place as these are explicitly captured by need_active_balance(). A better solution would be to not attempt the load-balance in the first place, but that requires significant changes to the order of bailout conditions and statistics gathering. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe40cdb8e640..632163ef7628 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5629,6 +5629,7 @@ struct lb_env { int new_dst_cpu; enum cpu_idle_type idle; long imbalance; + unsigned int src_grp_nr_running; /* The set of CPUs under consideration for load-balancing */ struct cpumask *cpus; @@ -6591,6 +6592,8 @@ next_group: if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); + env->src_grp_nr_running = sds->busiest_stat.sum_nr_running; + if (!env->sd->parent) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) @@ -7219,7 +7222,8 @@ more_balance: * excessive cache_hot migrations and active balances. */ if (idle != CPU_NEWLY_IDLE) - sd->nr_balance_failed++; + if (env.src_grp_nr_running > 1) + sd->nr_balance_failed++; if (need_active_balance(&env)) { raw_spin_lock_irqsave(&busiest->lock, flags); From d8c8088f41033b1cea9d88149046e1f8bdb9ade0 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 13:43:28 +0000 Subject: [PATCH 103/607] sched: Documentation for scheduler energy cost model This documentation patch provides an overview of the experimental scheduler energy costing model, associated data structures, and a reference recipe on how platforms can be characterized to derive energy models. Signed-off-by: Morten Rasmussen --- Documentation/scheduler/sched-energy.txt | 362 +++++++++++++++++++++++ 1 file changed, 362 insertions(+) create mode 100644 Documentation/scheduler/sched-energy.txt diff --git a/Documentation/scheduler/sched-energy.txt b/Documentation/scheduler/sched-energy.txt new file mode 100644 index 000000000000..dab2f9088b33 --- /dev/null +++ b/Documentation/scheduler/sched-energy.txt @@ -0,0 +1,362 @@ +Energy cost model for energy-aware scheduling (EXPERIMENTAL) + +Introduction +============= + +The basic energy model uses platform energy data stored in sched_group_energy +data structures attached to the sched_groups in the sched_domain hierarchy. 
The +energy cost model offers two functions that can be used to guide scheduling +decisions: + +1. static unsigned int sched_group_energy(struct energy_env *eenv) +2. static int energy_diff(struct energy_env *eenv) + +sched_group_energy() estimates the energy consumed by all cpus in a specific +sched_group including any shared resources owned exclusively by this group of +cpus. Resources shared with other cpus are excluded (e.g. later level caches). + +energy_diff() estimates the total energy impact of a utilization change. That +is, adding, removing, or migrating utilization (tasks). + +Both functions use a struct energy_env to specify the scenario to be evaluated: + + struct energy_env { + struct sched_group *sg_top; + struct sched_group *sg_cap; + int cap_idx; + int util_delta; + int src_cpu; + int dst_cpu; + int energy; + }; + +sg_top: sched_group to be evaluated. Not used by energy_diff(). + +sg_cap: sched_group covering the cpus in the same frequency domain. Set by +sched_group_energy(). + +cap_idx: Capacity state to be used for energy calculations. Set by +find_new_capacity(). + +util_delta: Amount of utilization to be added, removed, or migrated. + +src_cpu: Source cpu from where 'util_delta' utilization is removed. Should be +-1 if no source (e.g. task wake-up). + +dst_cpu: Destination cpu where 'util_delta' utilization is added. Should be -1 +if utilization is removed (e.g. terminating tasks). + +energy: Result of sched_group_energy(). + +The metric used to represent utilization is the actual per-entity running time +averaged over time using a geometric series. Very similar to the existing +per-entity load-tracking, but _not_ scaled by task priority and capped by the +capacity of the cpu. The latter property does mean that utilization may +underestimate the compute requirements for task on fully/over utilized cpus. +The greatest potential for energy savings without affecting performance too much +is scenarios where the system isn't fully utilized. If the system is deemed +fully utilized load-balancing should be done with task load (includes task +priority) instead in the interest of fairness and performance. + + +Background and Terminology +=========================== + +To make it clear from the start: + +energy = [joule] (resource like a battery on powered devices) +power = energy/time = [joule/second] = [watt] + +The goal of energy-aware scheduling is to minimize energy, while still getting +the job done. That is, we want to maximize: + + performance [inst/s] + -------------------- + power [W] + +which is equivalent to minimizing: + + energy [J] + ----------- + instruction + +while still getting 'good' performance. It is essentially an alternative +optimization objective to the current performance-only objective for the +scheduler. This alternative considers two objectives: energy-efficiency and +performance. Hence, there needs to be a user controllable knob to switch the +objective. Since it is early days, this is currently a sched_feature +(ENERGY_AWARE). + +The idea behind introducing an energy cost model is to allow the scheduler to +evaluate the implications of its decisions rather than applying energy-saving +techniques blindly that may only have positive effects on some platforms. At +the same time, the energy cost model must be as simple as possible to minimize +the scheduler latency impact. 
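
To make the interface concrete, here is a minimal usage sketch (not part of the
model itself; the cpu numbers and the utilization amount are made up) of a
caller asking energy_diff() about migrating roughly 25% of a cpu's capacity
worth of utilization from cpu 1 to cpu 4:

	struct energy_env eenv = {
		.util_delta	= 256,	/* ~25% of SCHED_CAPACITY_SCALE (1024) */
		.src_cpu	= 1,
		.dst_cpu	= 4,
	};
	int nrg_diff = energy_diff(&eenv);
	/* nrg_diff < 0: the migration is estimated to save energy */

sg_top, sg_cap, and cap_idx do not need to be set by the caller; they are
filled in internally while the sched_domain hierarchy is walked, as described
above.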
+ +Platform topology +------------------ + +The system topology (cpus, caches, and NUMA information, not peripherals) is +represented in the scheduler by the sched_domain hierarchy which has +sched_groups attached at each level that covers one or more cpus (see +sched-domains.txt for more details). To add energy awareness to the scheduler +we need to consider power and frequency domains. + +Power domain: + +A power domain is a part of the system that can be powered on/off +independently. Power domains are typically organized in a hierarchy where you +may be able to power down just a cpu or a group of cpus along with any +associated resources (e.g. shared caches). Powering up a cpu means that all +power domains it is a part of in the hierarchy must be powered up. Hence, it is +more expensive to power up the first cpu that belongs to a higher level power +domain than powering up additional cpus in the same high level domain. Two +level power domain hierarchy example: + + Power source + +-------------------------------+----... +per group PD G G + | +----------+ | + +--------+-------| Shared | (other groups) +per-cpu PD G G | resource | + | | +----------+ + +-------+ +-------+ + | CPU 0 | | CPU 1 | + +-------+ +-------+ + +Frequency domain: + +Frequency domains (P-states) typically cover the same group of cpus as one of +the power domain levels. That is, there might be several smaller power domains +sharing the same frequency (P-state) or there might be a power domain spanning +multiple frequency domains. + +From a scheduling point of view there is no need to know the actual frequencies +[Hz]. All the scheduler cares about is the compute capacity available at the +current state (P-state) the cpu is in and any other available states. For that +reason, and to also factor in any cpu micro-architecture differences, compute +capacity scaling states are called 'capacity states' in this document. For SMP +systems this is equivalent to P-states. For mixed micro-architecture systems +(like ARM big.LITTLE) it is P-states scaled according to the micro-architecture +performance relative to the other cpus in the system. + +Energy modelling: +------------------ + +Due to the hierarchical nature of the power domains, the most obvious way to +model energy costs is therefore to associate power and energy costs with +domains (groups of cpus). Energy costs of shared resources are associated with +the group of cpus that share the resources, only the cost of powering the +cpu itself and any private resources (e.g. private L1 caches) is associated +with the per-cpu groups (lowest level). + +For example, for an SMP system with per-cpu power domains and a cluster level +(group of cpus) power domain we get the overall energy costs to be: + + energy = energy_cluster + n * energy_cpu + +where 'n' is the number of cpus powered up and energy_cluster is the cost paid +as soon as any cpu in the cluster is powered up. + +The power and frequency domains can naturally be mapped onto the existing +sched_domain hierarchy and sched_groups by adding the necessary data to the +existing data structures. + +The energy model considers energy consumption from two contributors (shown in +the illustration below): + +1. Busy energy: Energy consumed while a cpu and the higher level groups that it +belongs to are busy running tasks. Busy energy is associated with the state of +the cpu, not an event. The time the cpu spends in this state varies. Thus, the +most obvious platform parameter for this contribution is busy power +(energy/time). 
+ +2. Idle energy: Energy consumed while a cpu and higher level groups that it +belongs to are idle (in a C-state). Like busy energy, idle energy is associated +with the state of the cpu. Thus, the platform parameter for this contribution +is idle power (energy/time). + +Energy consumed during transitions from an idle-state (C-state) to a busy state +(P-state) or going the other way is ignored by the model to simplify the energy +model calculations. + + + Power + ^ + | busy->idle idle->busy + | transition transition + | + | _ __ + | / \ / \__________________ + |______________/ \ / + | \ / + | Busy \ Idle / Busy + | low P-state \____________/ high P-state + | + +------------------------------------------------------------> time + +Busy |--------------| |-----------------| + +Wakeup |------| |------| + +Idle |------------| + + +The basic algorithm +==================== + +The basic idea is to determine the total energy impact when utilization is +added or removed by estimating the impact at each level in the sched_domain +hierarchy starting from the bottom (sched_group contains just a single cpu). +The energy cost comes from busy time (sched_group is awake because one or more +cpus are busy) and idle time (in an idle-state). Energy model numbers account +for energy costs associated with all cpus in the sched_group as a group. + + for_each_domain(cpu, sd) { + sg = sched_group_of(cpu) + energy_before = curr_util(sg) * busy_power(sg) + + (1-curr_util(sg)) * idle_power(sg) + energy_after = new_util(sg) * busy_power(sg) + + (1-new_util(sg)) * idle_power(sg) + energy_diff += energy_before - energy_after + + } + + return energy_diff + +{curr, new}_util: The cpu utilization at the lowest level and the overall +non-idle time for the entire group for higher levels. Utilization is in the +range 0.0 to 1.0 in the pseudo-code. + +busy_power: The power consumption of the sched_group. + +idle_power: The power consumption of the sched_group when idle. + +Note: It is a fundamental assumption that the utilization is (roughly) scale +invariant. Task utilization tracking factors in any frequency scaling and +performance scaling differences due to difference cpu microarchitectures such +that task utilization can be used across the entire system. + + +Platform energy data +===================== + +struct sched_group_energy can be attached to sched_groups in the sched_domain +hierarchy and has the following members: + +cap_states: + List of struct capacity_state representing the supported capacity states + (P-states). struct capacity_state has two members: cap and power, which + represents the compute capacity and the busy_power of the state. The + list must be ordered by capacity low->high. + +nr_cap_states: + Number of capacity states in cap_states list. + +idle_states: + List of struct idle_state containing idle_state power cost for each + idle-state supported by the system orderd by shallowest state first. + All states must be included at all level in the hierarchy, i.e. a + sched_group spanning just a single cpu must also include coupled + idle-states (cluster states). In addition to the cpuidle idle-states, + the list must also contain an entry for the idling using the arch + default idle (arch_idle_cpu()). Despite this state may not be a true + hardware idle-state it is considered the shallowest idle-state in the + energy model and must be the first entry. cpus may enter this state + (possibly 'active idling') if cpuidle decides not enter a cpuidle + idle-state. 
Default idle may not be used when cpuidle is enabled. + In this case, it should just be a copy of the first cpuidle idle-state. + +nr_idle_states: + Number of idle states in idle_states list. + +There are no unit requirements for the energy cost data. Data can be normalized +with any reference, however, the normalization must be consistent across all +energy cost data. That is, one bogo-joule/watt must be the same quantity for +data, but we don't care what it is. + +A recipe for platform characterization +======================================= + +Obtaining the actual model data for a particular platform requires some way of +measuring power/energy. There isn't a tool to help with this (yet). This +section provides a recipe for use as reference. It covers the steps used to +characterize the ARM TC2 development platform. This sort of measurements is +expected to be done anyway when tuning cpuidle and cpufreq for a given +platform. + +The energy model needs two types of data (struct sched_group_energy holds +these) for each sched_group where energy costs should be taken into account: + +1. Capacity state information + +A list containing the compute capacity and power consumption when fully +utilized attributed to the group as a whole for each available capacity state. +At the lowest level (group contains just a single cpu) this is the power of the +cpu alone without including power consumed by resources shared with other cpus. +It basically needs to fit the basic modelling approach described in "Background +and Terminology" section: + + energy_system = energy_shared + n * energy_cpu + +for a system containing 'n' busy cpus. Only 'energy_cpu' should be included at +the lowest level. 'energy_shared' is included at the next level which +represents the group of cpus among which the resources are shared. + +This model is, of course, a simplification of reality. Thus, power/energy +attributions might not always exactly represent how the hardware is designed. +Also, busy power is likely to depend on the workload. It is therefore +recommended to use a representative mix of workloads when characterizing the +capacity states. + +If the group has no capacity scaling support, the list will contain a single +state where power is the busy power attributed to the group. The capacity +should be set to a default value (1024). + +When frequency domains include multiple power domains, the group representing +the frequency domain and all child groups share capacity states. This must be +indicated by setting the SD_SHARE_CAP_STATES sched_domain flag. All groups at +all levels that share the capacity state must have the list of capacity states +with the power set to the contribution of the individual group. + +2. Idle power information + +Stored in the idle_states list. The power number is the group idle power +consumption in each idle state as well when the group is idle but has not +entered an idle-state ('active idle' as mentioned earlier). Due to the way the +energy model is defined, the idle power of the deepest group idle state can +alternatively be accounted for in the parent group busy power. In that case the +group idle state power values are offset such that the idle power of the +deepest state is zero. It is less intuitive, but it is easier to measure as +idle power consumed by the group and the busy/idle power of the parent group +cannot be distinguished without per group measurement points. 
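
Before turning to how the numbers are obtained, a purely illustrative example
(made-up values, not measured on any platform) of the data a platform would end
up providing for one per-cpu group:

	/* Illustrative values only */
	static struct idle_state idle_states_cpu[] = {
		{ .power = 10 },	/* arch default idle ('active idle'), must be first */
		{ .power =  0 },	/* per-cpu idle-state (cpu off) */
	};

	static struct capacity_state cap_states_cpu[] = {
		/* ordered low -> high */
		{ .cap =  256, .power =  80 },
		{ .cap =  512, .power = 200 },
		{ .cap = 1024, .power = 616 },
	};

	static struct sched_group_energy energy_cpu = {
		.nr_idle_states	= ARRAY_SIZE(idle_states_cpu),
		.idle_states	= idle_states_cpu,
		.nr_cap_states	= ARRAY_SIZE(cap_states_cpu),
		.cap_states	= cap_states_cpu,
	};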
+ +Measuring capacity states and idle power: + +The capacity states' capacity and power can be estimated by running a benchmark +workload at each available capacity state. By restricting the benchmark to run +on subsets of cpus it is possible to extrapolate the power consumption of +shared resources. + +ARM TC2 has two clusters of two and three cpus respectively. Each cluster has a +shared L2 cache. TC2 has on-chip energy counters per cluster. Running a +benchmark workload on just one cpu in a cluster means that power is consumed in +the cluster (higher level group) and a single cpu (lowest level group). Adding +another benchmark task to another cpu increases the power consumption by the +amount consumed by the additional cpu. Hence, it is possible to extrapolate the +cluster busy power. + +For platforms that don't have energy counters or equivalent instrumentation +built-in, it may be possible to use an external DAQ to acquire similar data. + +If the benchmark includes some performance score (for example sysbench cpu +benchmark), this can be used to record the compute capacity. + +Measuring idle power requires insight into the idle state implementation on the +particular platform. Specifically, if the platform has coupled idle-states (or +package states). To measure non-coupled per-cpu idle-states it is necessary to +keep one cpu busy to keep any shared resources alive to isolate the idle power +of the cpu from idle/busy power of the shared resources. The cpu can be tricked +into different per-cpu idle states by disabling the other states. Based on +various combinations of measurements with specific cpus busy and disabling +idle-states it is possible to extrapolate the idle-state power. From e496f328c87173fe188e91c6cd60197fb8e7e305 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 13:45:51 +0000 Subject: [PATCH 104/607] sched: Make energy awareness a sched feature This patch introduces the ENERGY_AWARE sched feature, which is implemented using jump labels when SCHED_DEBUG is defined. It is statically set false when SCHED_DEBUG is not defined. Hence this doesn't allow energy awareness to be enabled without SCHED_DEBUG. This sched_feature knob will be replaced later with a more appropriate control knob when things have matured a bit. ENERGY_AWARE is based on per-entity load-tracking hence FAIR_GROUP_SCHED must be enable. This dependency isn't checked at compile time yet. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 5 +++++ kernel/sched/features.h | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 632163ef7628..efc65c3db6b5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4651,6 +4651,11 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +static inline bool energy_aware(void) +{ + return sched_feat(ENERGY_AWARE); +} + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * A waker of many should wake a different task than the one last awakened diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 69631fa46c2f..b634151ce286 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -69,3 +69,8 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) +/* + * Energy aware scheduling. Use platform energy model to guide scheduling + * decisions optimizing for energy efficiency. 
+ */ +SCHED_FEAT(ENERGY_AWARE, false) From 0b3bda54d245ebcfeefc61f0ff763e76ca2a8784 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 16:08:45 +0000 Subject: [PATCH 105/607] sched: Introduce energy data structures The struct sched_group_energy represents the per sched_group related data which is needed for energy aware scheduling. It contains: (1) number of elements of the idle state array (2) pointer to the idle state array which comprises 'power consumption' for each idle state (3) number of elements of the capacity state array (4) pointer to the capacity state array which comprises 'compute capacity and power consumption' tuples for each capacity state The struct sched_group obtains a pointer to a struct sched_group_energy. The function pointer sched_domain_energy_f is introduced into struct sched_domain_topology_level which will allow the arch to pass a particular struct sched_group_energy from the topology shim layer into the scheduler core. The function pointer sched_domain_energy_f has an 'int cpu' parameter since the folding of two adjacent sd levels via sd degenerate doesn't work for all sd levels. I.e. it is not possible for example to use this feature to provide per-cpu energy in sd level DIE on ARM's TC2 platform. It was discussed that the folding of sd levels approach is preferable over the cpu parameter approach, simply because the user (the arch specifying the sd topology table) can introduce less errors. But since it is not working, the 'int cpu' parameter is the only way out. It's possible to use the folding of sd levels approach for sched_domain_flags_f and the cpu parameter approach for the sched_domain_energy_f at the same time though. With the use of the 'int cpu' parameter, an extra check function has to be provided to make sure that all cpus spanned by a sched group are provisioned with the same energy data. 
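
A hypothetical sketch of how an arch topology table could pass the energy data
down (the helpers cpu_is_big(), cpu_cluster_energy() and the energy tables are
placeholders, not part of this patch):

	static const struct sched_group_energy * const cpu_core_energy(int cpu)
	{
		/* return the MC-level table matching this cpu's micro-architecture */
		return cpu_is_big(cpu) ? &energy_core_big : &energy_core_little;
	}

	static struct sched_domain_topology_level example_topology[] = {
		{ cpu_coregroup_mask, cpu_core_flags, cpu_core_energy, SD_INIT_NAME(MC) },
		{ cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) },
		{ NULL, },
	};

The arm and arm64 patches later in this series wire this up in essentially
this way.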
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Dietmar Eggemann --- include/linux/sched.h | 19 +++++++++++++++++++ kernel/sched/sched.h | 1 + 2 files changed, 20 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index fa39434e3fdd..5bee272a86d8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1021,6 +1021,22 @@ struct sched_domain_attr { extern int sched_domain_level_max; +struct capacity_state { + unsigned long cap; /* compute capacity */ + unsigned long power; /* power consumption at this compute capacity */ +}; + +struct idle_state { + unsigned long power; /* power consumption in this idle state */ +}; + +struct sched_group_energy { + unsigned int nr_idle_states; /* number of idle states */ + struct idle_state *idle_states; /* ptr to idle state array */ + unsigned int nr_cap_states; /* number of capacity states */ + struct capacity_state *cap_states; /* ptr to capacity state array */ +}; + struct sched_group; struct sched_domain { @@ -1119,6 +1135,8 @@ bool cpus_share_cache(int this_cpu, int that_cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); +typedef +const struct sched_group_energy * const(*sched_domain_energy_f)(int cpu); #define SDTL_OVERLAP 0x01 @@ -1131,6 +1149,7 @@ struct sd_data { struct sched_domain_topology_level { sched_domain_mask_f mask; sched_domain_flags_f sd_flags; + sched_domain_energy_f energy; int flags; int numa_level; struct sd_data data; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2b7ffa5a20ad..1813cba2995d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -863,6 +863,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; + const struct sched_group_energy const *sge; /* * The CPUs this group covers. From 4ce990ec65231af6a148c2c0e6b4745fc1d12368 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 16:20:20 +0000 Subject: [PATCH 106/607] sched: Initialize energy data structures The sched_group_energy (sge) pointer of the first sched_group (sg) in the sched_domain (sd) is initialized to point to the appropriate (in terms of sd level and cpu) sge data defined in the arch and so to the correct part of the Energy Model (EM). Energy-aware scheduling allows that a system has only EM data up to a certain sd level (so called highest energy aware balancing sd level). A check in init_sched_energy() enforces that all sd's below this sd level contain EM data. The 'int cpu' parameter of sched_domain_energy_f requires that check_sched_energy_data() makes sure that all cpus spanned by a sg are provisioned with the same EM data. This patch has also been tested with feature FORCE_SD_OVERLAP enabled. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 65 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7c6e2b326349..f1b1c9eaeb10 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6304,6 +6304,66 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); } +/* + * Check that the per-cpu provided sd energy data is consistent for all cpus + * within the mask. 
+ */ +static inline void check_sched_energy_data(int cpu, sched_domain_energy_f fn, + const struct cpumask *cpumask) +{ + const struct sched_group_energy * const sge = fn(cpu); + struct cpumask mask; + int i; + + if (cpumask_weight(cpumask) <= 1) + return; + + cpumask_xor(&mask, cpumask, get_cpu_mask(cpu)); + + for_each_cpu(i, &mask) { + const struct sched_group_energy * const e = fn(i); + int y; + + BUG_ON(e->nr_idle_states != sge->nr_idle_states); + + for (y = 0; y < (e->nr_idle_states); y++) { + BUG_ON(e->idle_states[y].power != + sge->idle_states[y].power); + } + + BUG_ON(e->nr_cap_states != sge->nr_cap_states); + + for (y = 0; y < (e->nr_cap_states); y++) { + BUG_ON(e->cap_states[y].cap != sge->cap_states[y].cap); + BUG_ON(e->cap_states[y].power != + sge->cap_states[y].power); + } + } +} + +static void init_sched_energy(int cpu, struct sched_domain *sd, + sched_domain_energy_f fn) +{ + if (!(fn && fn(cpu))) + return; + + if (cpu != group_balance_cpu(sd->groups)) + return; + + if (sd->child && !sd->child->groups->sge) { + pr_err("BUG: EAS setup broken for CPU%d\n", cpu); +#ifdef CONFIG_SCHED_DEBUG + pr_err(" energy data on %s but not on %s domain\n", + sd->name, sd->child->name); +#endif + return; + } + + check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups)); + + sd->groups->sge = fn(cpu); +} + /* * Initializers for schedule domains * Non-inlined to reduce accumulated stack pressure in build_sched_domains() @@ -7010,10 +7070,13 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { + struct sched_domain_topology_level *tl = sched_domain_topology; + if (!cpumask_test_cpu(i, cpu_map)) continue; - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) { + init_sched_energy(i, sd, tl->energy); claim_allocations(i, sd); init_sched_groups_capacity(i, sd); } From f0f739d887a4f144ab4937e619288d4cede2cc91 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 13:50:46 +0000 Subject: [PATCH 107/607] sched: Introduce SD_SHARE_CAP_STATES sched_domain flag cpufreq is currently keeping it a secret which cpus are sharing clock source. The scheduler needs to know about clock domains as well to become more energy aware. The SD_SHARE_CAP_STATES domain flag indicates whether cpus belonging to the sched_domain share capacity states (P-states). There is no connection with cpufreq (yet). The flag must be set by the arch specific topology code. 
cc: Russell King cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- arch/arm/kernel/topology.c | 3 ++- include/linux/sched.h | 1 + kernel/sched/core.c | 10 +++++++--- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 614554765e44..38e7be162b79 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -277,7 +277,8 @@ void store_cpu_topology(unsigned int cpuid) static inline int cpu_corepower_flags(void) { - return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN; + return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \ + SD_SHARE_CAP_STATES; } static struct sched_domain_topology_level arm_topology[] = { diff --git a/include/linux/sched.h b/include/linux/sched.h index 5bee272a86d8..09e2d43406eb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -989,6 +989,7 @@ extern void wake_up_q(struct wake_q_head *head); #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ #define SD_NUMA 0x4000 /* cross-node balancing */ +#define SD_SHARE_CAP_STATES 0x8000 /* Domain members share capacity state */ #ifdef CONFIG_SCHED_SMT static inline int cpu_smt_flags(void) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f1b1c9eaeb10..2863d223c07e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5775,7 +5775,8 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | - SD_SHARE_POWERDOMAIN)) { + SD_SHARE_POWERDOMAIN | + SD_SHARE_CAP_STATES)) { if (sd->groups != sd->groups->next) return 0; } @@ -5807,7 +5808,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING | - SD_SHARE_POWERDOMAIN); + SD_SHARE_POWERDOMAIN | + SD_SHARE_CAP_STATES); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -6472,6 +6474,7 @@ static int sched_domains_curr_level; * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies * SD_SHARE_POWERDOMAIN - describes shared power domain + * SD_SHARE_CAP_STATES - describes shared capacity states * * Odd one out: * SD_ASYM_PACKING - describes SMT quirks @@ -6481,7 +6484,8 @@ static int sched_domains_curr_level; SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ - SD_SHARE_POWERDOMAIN) + SD_SHARE_POWERDOMAIN | \ + SD_SHARE_CAP_STATES) static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, int cpu) From c7aeeb88c7370af2fc4e2e8bd156e001f7a96d11 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 10 Jul 2015 13:57:19 +0100 Subject: [PATCH 108/607] arm: Cpu invariant scheduler load-tracking and capacity support Provides the scheduler with a cpu scaling correction factor for more accurate load-tracking and cpu capacity handling. The Energy Model (EM) (in fact the capacity value of the last element of the capacity states vector of the core (MC) level sched_group_energy structure) is used instead of the arm arch specific cpu_efficiency and dtb property 'clock-frequency' values as the source for this cpu scaling factor. The cpu capacity value depends on the micro-architecture and the maximum frequency of the cpu. The maximum frequency part should not be confused with the frequency invariant scheduler load-tracking support which deals with frequency related scaling due to DFVS functionality. 
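
In other words (an approximation, not code from this patch), per-entity
utilization ends up scaled by both factors:

	util ~ running_time/time
	       * arch_scale_freq_capacity(NULL, cpu) / 1024	/* frequency invariance, current OPP */
	       * arch_scale_cpu_capacity(NULL, cpu)  / 1024	/* cpu invariance, set from the EM here */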
Signed-off-by: Juri Lelli Signed-off-by: Dietmar Eggemann --- arch/arm/kernel/topology.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 38e7be162b79..da1c611a3b5e 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -153,6 +153,8 @@ static void __init parse_dt_topology(void) } +static const struct sched_group_energy * const cpu_core_energy(int cpu); + /* * Look for a customed capacity of a CPU in the cpu_capacity table during the * boot. The update of all CPUs is in O(n^2) for heteregeneous system but the @@ -160,10 +162,14 @@ static void __init parse_dt_topology(void) */ static void update_cpu_capacity(unsigned int cpu) { - if (!cpu_capacity(cpu)) - return; + unsigned long capacity = SCHED_CAPACITY_SCALE; - set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity); + if (cpu_core_energy(cpu)) { + int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1; + capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap; + } + + set_capacity_scale(cpu, capacity); pr_info("CPU%u: update cpu_capacity %lu\n", cpu, arch_scale_cpu_capacity(NULL, cpu)); From 7c791bf110c2c1884f02f0c4ceaff886fad2a907 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 30 Apr 2015 11:53:48 +0100 Subject: [PATCH 109/607] arm64: Cpu invariant scheduler load-tracking and capacity support Provides the scheduler with a cpu scaling correction factor for more accurate load-tracking and cpu capacity handling. The Energy Model (EM) (in fact the capacity value of the last element of the capacity states vector of the core (MC) level sched_group_energy structure) is used as the source for this cpu scaling factor. The cpu capacity value depends on the micro-architecture and the maximum frequency of the cpu. The maximum frequency part should not be confused with the frequency invariant scheduler load-tracking support which deals with frequency related scaling due to DFVS functionality. 
Signed-off-by: Juri Lelli Signed-off-by: Dietmar Eggemann --- arch/arm64/include/asm/topology.h | 4 ++- arch/arm64/kernel/topology.c | 42 +++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 72c47c3fa7b3..9370a336c934 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -22,11 +22,13 @@ void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +struct sched_domain; #ifdef CONFIG_CPU_FREQ #define arch_scale_freq_capacity cpufreq_scale_freq_capacity -struct sched_domain; extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); #endif +#define arch_scale_cpu_capacity scale_cpu_capacity +extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu); #include diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 694f6deedbab..fb99a6735fd4 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -23,6 +23,18 @@ #include #include +static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; + +unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) +{ + return per_cpu(cpu_scale, cpu); +} + +static void set_capacity_scale(unsigned int cpu, unsigned long capacity) +{ + per_cpu(cpu_scale, cpu) = capacity; +} + static int __init get_cpu_for_node(struct device_node *node) { struct device_node *cpu_node; @@ -211,6 +223,35 @@ const struct cpumask *cpu_coregroup_mask(int cpu) return &cpu_topology[cpu].core_sibling; } +static inline int cpu_corepower_flags(void) +{ + return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \ + SD_SHARE_CAP_STATES; +} + +static struct sched_domain_topology_level arm64_topology[] = { +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + +static void update_cpu_capacity(unsigned int cpu) +{ + unsigned long capacity = SCHED_CAPACITY_SCALE; + + if (cpu_core_energy(cpu)) { + int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1; + capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap; + } + + set_capacity_scale(cpu, capacity); + + pr_info("CPU%d: update cpu_capacity %lu\n", + cpu, arch_scale_cpu_capacity(NULL, cpu)); +} + static void update_siblings_masks(unsigned int cpuid) { struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; @@ -272,6 +313,7 @@ void store_cpu_topology(unsigned int cpuid) topology_populated: update_siblings_masks(cpuid); + update_cpu_capacity(cpuid); } static void __init reset_cpu_topology(void) From 3e55d2f2fcaf44ac19eb143d895924ff402e375c Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 14:11:28 +0000 Subject: [PATCH 110/607] sched: Compute cpu capacity available at current frequency capacity_orig_of() returns the max available compute capacity of a cpu. For scale-invariant utilization tracking and energy-aware scheduling decisions it is useful to know the compute capacity available at the current OPP of a cpu. 
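
For instance (made-up numbers): on a cpu whose capacity_orig is 1024 and which
currently runs at half of its maximum frequency, arch_scale_freq_capacity()
returns about 512, giving:

	capacity_curr = (1024 * 512) >> SCHED_CAPACITY_SHIFT;	/* = 512 */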
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index efc65c3db6b5..ea966d9193cd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4651,6 +4651,17 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +static unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); From b6c0399e0f340887f369c6870ba6a865afe0b0ec Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 11 Dec 2014 15:25:29 +0000 Subject: [PATCH 111/607] sched: Relocated cpu_util() and change return type Move cpu_util() to an earlier position in fair.c and change return type to unsigned long as negative usage doesn't make much sense. All other load and capacity related functions use unsigned long including the caller of cpu_util(). cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 70 ++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ea966d9193cd..318141042a3e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4662,6 +4662,40 @@ static unsigned long capacity_curr_of(int cpu) >> SCHED_CAPACITY_SHIFT; } +/* + * cpu_util returns the amount of capacity of a CPU that is used by CFS + * tasks. The unit of the return value must be the one of capacity so we can + * compare the utilization with the capacity of the CPU that is available for + * CFS task (ie cpu_capacity). + * + * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on a CPU. It represents + * the amount of utilization of a CPU in the range [0..capacity_orig] where + * capacity_orig is the cpu_capacity available at the highest frequency + * (arch_scale_freq_capacity()). + * The utilization of a CPU converges towards a sum equal to or less than the + * current capacity (capacity_curr <= capacity_orig) of the CPU because it is + * the running time on this CPU scaled by capacity_curr. + * + * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even + * higher than capacity_orig because of unfortunate rounding in + * cfs.avg.util_avg or just after migrating tasks and new task wakeups until + * the average stabilizes with the new running time. We need to check that the + * utilization stays within the range of [0..capacity_orig] and cap it if + * necessary. Without utilization capping, a group could be seen as overloaded + * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. We allow utilization to overshoot capacity_curr (but not + * capacity_orig) as it useful for predicting the capacity required after task + * migrations (scheduler-driven DVFS). + */ +static unsigned long cpu_util(int cpu) +{ + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long capacity = capacity_orig_of(cpu); + + return (util >= capacity) ? 
capacity : util; +} + static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); @@ -4788,8 +4822,6 @@ static inline bool task_fits_max(struct task_struct *p, int cpu) return __task_fits(p, cpu, 0); } -static int cpu_util(int cpu); - static inline bool task_fits_spare(struct task_struct *p, int cpu) { return __task_fits(p, cpu, cpu_util(cpu)); @@ -4988,40 +5020,6 @@ done: return target; } -/* - * cpu_util returns the amount of capacity of a CPU that is used by CFS - * tasks. The unit of the return value must be the one of capacity so we can - * compare the utilization with the capacity of the CPU that is available for - * CFS task (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). - */ -static int cpu_util(int cpu) -{ - unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; - unsigned long capacity = capacity_orig_of(cpu); - - return (util >= capacity) ? capacity : util; -} - /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, From 5ec8ccabfef3989f4514f22c034a15aeab691110 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 2 Jan 2015 17:08:52 +0000 Subject: [PATCH 112/607] sched: Highest energy aware balancing sched_domain level pointer Add another member to the family of per-cpu sched_domain shortcut pointers. This one, sd_ea, points to the highest level at which energy model is provided. At this level and all levels below all sched_groups have energy model data attached. Partial energy model information is possible but restricted to providing energy model data for lower level sched_domains (sd_ea and below) and leaving load-balancing on levels above to non-energy-aware load-balancing. For example, it is possible to apply energy-aware scheduling within each socket on a multi-socket system and let normal scheduling handle load-balancing between sockets. 
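
A sketch of how a consumer could use the new shortcut (the helper name is
hypothetical; later patches in this series look up sd_ea in essentially this
way):

	static bool cpu_has_energy_model(int cpu)
	{
		struct sched_domain *sd;
		bool ret;

		rcu_read_lock();
		sd = rcu_dereference(per_cpu(sd_ea, cpu));
		/* sd->groups and all child levels carry energy model data */
		ret = sd != NULL;
		rcu_read_unlock();

		return ret;
	}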
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/core.c | 11 ++++++++++- kernel/sched/sched.h | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2863d223c07e..45e54b0e3669 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5993,11 +5993,12 @@ DEFINE_PER_CPU(int, sd_llc_id); DEFINE_PER_CPU(struct sched_domain *, sd_numa); DEFINE_PER_CPU(struct sched_domain *, sd_busy); DEFINE_PER_CPU(struct sched_domain *, sd_asym); +DEFINE_PER_CPU(struct sched_domain *, sd_ea); static void update_top_cache_domain(int cpu) { struct sched_domain *sd; - struct sched_domain *busy_sd = NULL; + struct sched_domain *busy_sd = NULL, *ea_sd = NULL; int id = cpu; int size = 1; @@ -6018,6 +6019,14 @@ static void update_top_cache_domain(int cpu) sd = highest_flag_domain(cpu, SD_ASYM_PACKING); rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); + + for_each_domain(cpu, sd) { + if (sd->groups->sge) + ea_sd = sd; + else + break; + } + rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1813cba2995d..a77516a15ba0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -839,6 +839,7 @@ DECLARE_PER_CPU(int, sd_llc_id); DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_busy); DECLARE_PER_CPU(struct sched_domain *, sd_asym); +DECLARE_PER_CPU(struct sched_domain *, sd_ea); struct sched_group_capacity { atomic_t ref; From c1770a521398102b6f2447502035887297b17ea0 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 18 Dec 2014 14:47:18 +0000 Subject: [PATCH 113/607] sched: Calculate energy consumption of sched_group For energy-aware load-balancing decisions it is necessary to know the energy consumption estimates of groups of cpus. This patch introduces a basic function, sched_group_energy(), which estimates the energy consumption of the cpus in the group and any resources shared by the members of the group. NOTE: The function has five levels of identation and breaks the 80 character limit. Refactoring is necessary. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/core.c | 4 ++ kernel/sched/fair.c | 156 +++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 1 + 3 files changed, 161 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 45e54b0e3669..bc5bb26ccf3b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5994,6 +5994,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_numa); DEFINE_PER_CPU(struct sched_domain *, sd_busy); DEFINE_PER_CPU(struct sched_domain *, sd_asym); DEFINE_PER_CPU(struct sched_domain *, sd_ea); +DEFINE_PER_CPU(struct sched_domain *, sd_scs); static void update_top_cache_domain(int cpu) { @@ -6027,6 +6028,9 @@ static void update_top_cache_domain(int cpu) break; } rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd); + + sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES); + rcu_assign_pointer(per_cpu(sd_scs, cpu), sd); } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 318141042a3e..49f7567c6b07 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4701,6 +4701,162 @@ static inline bool energy_aware(void) return sched_feat(ENERGY_AWARE); } +/* + * cpu_norm_util() returns the cpu util relative to a specific capacity, + * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for + * energy calculations. 
Using the scale-invariant util returned by + * cpu_util() and approximating scale-invariant util by: + * + * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time + * + * the normalized util can be found using the specific capacity. + * + * capacity = capacity_orig * curr_freq/max_freq + * + * norm_util = running_time/time ~ util/capacity + */ +static unsigned long cpu_norm_util(int cpu, unsigned long capacity) +{ + int util = cpu_util(cpu); + + if (util >= capacity) + return SCHED_CAPACITY_SCALE; + + return (util << SCHED_CAPACITY_SHIFT)/capacity; +} + +static unsigned long group_max_util(struct sched_group *sg) +{ + int i; + unsigned long max_util = 0; + + for_each_cpu(i, sched_group_cpus(sg)) + max_util = max(max_util, cpu_util(i)); + + return max_util; +} + +/* + * group_norm_util() returns the approximated group util relative to it's + * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in + * energy calculations. Since task executions may or may not overlap in time in + * the group the true normalized util is between max(cpu_norm_util(i)) and + * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The + * latter is used as the estimate as it leads to a more pessimistic energy + * estimate (more busy). + */ +static unsigned long group_norm_util(struct sched_group *sg, int cap_idx) +{ + int i; + unsigned long util_sum = 0; + unsigned long capacity = sg->sge->cap_states[cap_idx].cap; + + for_each_cpu(i, sched_group_cpus(sg)) + util_sum += cpu_norm_util(i, capacity); + + if (util_sum > SCHED_CAPACITY_SCALE) + return SCHED_CAPACITY_SCALE; + return util_sum; +} + +static int find_new_capacity(struct sched_group *sg, + const struct sched_group_energy const *sge) +{ + int idx; + unsigned long util = group_max_util(sg); + + for (idx = 0; idx < sge->nr_cap_states; idx++) { + if (sge->cap_states[idx].cap >= util) + return idx; + } + + return idx; +} + +/* + * sched_group_energy(): Computes the absolute energy consumption of cpus + * belonging to the sched_group including shared resources shared only by + * members of the group. Iterates over all cpus in the hierarchy below the + * sched_group starting from the bottom working it's way up before going to + * the next cpu until all cpus are covered at all levels. The current + * implementation is likely to gather the same util statistics multiple times. + * This can probably be done in a faster but more complex way. + * Note: sched_group_energy() may fail when racing with sched_domain updates. + */ +static int sched_group_energy(struct sched_group *sg_top) +{ + struct sched_domain *sd; + int cpu, total_energy = 0; + struct cpumask visit_cpus; + struct sched_group *sg; + + WARN_ON(!sg_top->sge); + + cpumask_copy(&visit_cpus, sched_group_cpus(sg_top)); + + while (!cpumask_empty(&visit_cpus)) { + struct sched_group *sg_shared_cap = NULL; + + cpu = cpumask_first(&visit_cpus); + + /* + * Is the group utilization affected by cpus outside this + * sched_group? + */ + sd = rcu_dereference(per_cpu(sd_scs, cpu)); + + if (!sd) + /* + * We most probably raced with hotplug; returning a + * wrong energy estimation is better than entering an + * infinite loop. + */ + return -EINVAL; + + if (sd->parent) + sg_shared_cap = sd->parent->groups; + + for_each_domain(cpu, sd) { + sg = sd->groups; + + /* Has this sched_domain already been visited? 
*/ + if (sd->child && group_first_cpu(sg) != cpu) + break; + + do { + struct sched_group *sg_cap_util; + unsigned long group_util; + int sg_busy_energy, sg_idle_energy, cap_idx; + + if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) + sg_cap_util = sg_shared_cap; + else + sg_cap_util = sg; + + cap_idx = find_new_capacity(sg_cap_util, sg->sge); + group_util = group_norm_util(sg, cap_idx); + sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) + >> SCHED_CAPACITY_SHIFT; + sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power) + >> SCHED_CAPACITY_SHIFT; + + total_energy += sg_busy_energy + sg_idle_energy; + + if (!sd->child) + cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg)); + + if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(sg_top))) + goto next_cpu; + + } while (sg = sg->next, sg != sd->groups); + } +next_cpu: + continue; + } + + return total_energy; +} + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * A waker of many should wake a different task than the one last awakened diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a77516a15ba0..a618db2936d4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -840,6 +840,7 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_busy); DECLARE_PER_CPU(struct sched_domain *, sd_asym); DECLARE_PER_CPU(struct sched_domain *, sd_ea); +DECLARE_PER_CPU(struct sched_domain *, sd_scs); struct sched_group_capacity { atomic_t ref; From df2030c8412e6acdb3f7ec62ed195cc7967c30e2 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 2 Jan 2015 14:21:56 +0000 Subject: [PATCH 114/607] sched: Extend sched_group_energy to test load-balancing decisions Extended sched_group_energy() to support energy prediction with usage (tasks) added/removed from a specific cpu or migrated between a pair of cpus. Useful for load-balancing decision making. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 90 +++++++++++++++++++++++++++++++-------------- 1 file changed, 63 insertions(+), 27 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 49f7567c6b07..f3de37414a66 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4688,12 +4688,21 @@ static unsigned long capacity_curr_of(int cpu) * capacity_orig) as it useful for predicting the capacity required after task * migrations (scheduler-driven DVFS). */ -static unsigned long cpu_util(int cpu) +static unsigned long __cpu_util(int cpu, int delta) { unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); - return (util >= capacity) ? capacity : util; + delta += util; + if (delta < 0) + return 0; + + return (delta >= capacity) ? capacity : delta; +} + +static unsigned long cpu_util(int cpu) +{ + return __cpu_util(cpu, 0); } static inline bool energy_aware(void) @@ -4701,8 +4710,18 @@ static inline bool energy_aware(void) return sched_feat(ENERGY_AWARE); } +struct energy_env { + struct sched_group *sg_top; + struct sched_group *sg_cap; + int cap_idx; + int util_delta; + int src_cpu; + int dst_cpu; + int energy; +}; + /* - * cpu_norm_util() returns the cpu util relative to a specific capacity, + * __cpu_norm_util() returns the cpu util relative to a specific capacity, * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for * energy calculations. 
Using the scale-invariant util returned by * cpu_util() and approximating scale-invariant util by: @@ -4715,9 +4734,9 @@ static inline bool energy_aware(void) * * norm_util = running_time/time ~ util/capacity */ -static unsigned long cpu_norm_util(int cpu, unsigned long capacity) +static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta) { - int util = cpu_util(cpu); + int util = __cpu_util(cpu, delta); if (util >= capacity) return SCHED_CAPACITY_SCALE; @@ -4725,13 +4744,25 @@ static unsigned long cpu_norm_util(int cpu, unsigned long capacity) return (util << SCHED_CAPACITY_SHIFT)/capacity; } -static unsigned long group_max_util(struct sched_group *sg) +static int calc_util_delta(struct energy_env *eenv, int cpu) { - int i; + if (cpu == eenv->src_cpu) + return -eenv->util_delta; + if (cpu == eenv->dst_cpu) + return eenv->util_delta; + return 0; +} + +static +unsigned long group_max_util(struct energy_env *eenv) +{ + int i, delta; unsigned long max_util = 0; - for_each_cpu(i, sched_group_cpus(sg)) - max_util = max(max_util, cpu_util(i)); + for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) { + delta = calc_util_delta(eenv, i); + max_util = max(max_util, __cpu_util(i, delta)); + } return max_util; } @@ -4745,31 +4776,36 @@ static unsigned long group_max_util(struct sched_group *sg) * latter is used as the estimate as it leads to a more pessimistic energy * estimate (more busy). */ -static unsigned long group_norm_util(struct sched_group *sg, int cap_idx) +static unsigned +long group_norm_util(struct energy_env *eenv, struct sched_group *sg) { - int i; + int i, delta; unsigned long util_sum = 0; - unsigned long capacity = sg->sge->cap_states[cap_idx].cap; + unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap; - for_each_cpu(i, sched_group_cpus(sg)) - util_sum += cpu_norm_util(i, capacity); + for_each_cpu(i, sched_group_cpus(sg)) { + delta = calc_util_delta(eenv, i); + util_sum += __cpu_norm_util(i, capacity, delta); + } if (util_sum > SCHED_CAPACITY_SCALE) return SCHED_CAPACITY_SCALE; return util_sum; } -static int find_new_capacity(struct sched_group *sg, +static int find_new_capacity(struct energy_env *eenv, const struct sched_group_energy const *sge) { int idx; - unsigned long util = group_max_util(sg); + unsigned long util = group_max_util(eenv); for (idx = 0; idx < sge->nr_cap_states; idx++) { if (sge->cap_states[idx].cap >= util) - return idx; + break; } + eenv->cap_idx = idx; + return idx; } @@ -4783,16 +4819,16 @@ static int find_new_capacity(struct sched_group *sg, * This can probably be done in a faster but more complex way. * Note: sched_group_energy() may fail when racing with sched_domain updates. 
*/ -static int sched_group_energy(struct sched_group *sg_top) +static int sched_group_energy(struct energy_env *eenv) { struct sched_domain *sd; int cpu, total_energy = 0; struct cpumask visit_cpus; struct sched_group *sg; - WARN_ON(!sg_top->sge); + WARN_ON(!eenv->sg_top->sge); - cpumask_copy(&visit_cpus, sched_group_cpus(sg_top)); + cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top)); while (!cpumask_empty(&visit_cpus)) { struct sched_group *sg_shared_cap = NULL; @@ -4824,17 +4860,16 @@ static int sched_group_energy(struct sched_group *sg_top) break; do { - struct sched_group *sg_cap_util; unsigned long group_util; int sg_busy_energy, sg_idle_energy, cap_idx; if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) - sg_cap_util = sg_shared_cap; + eenv->sg_cap = sg_shared_cap; else - sg_cap_util = sg; + eenv->sg_cap = sg; - cap_idx = find_new_capacity(sg_cap_util, sg->sge); - group_util = group_norm_util(sg, cap_idx); + cap_idx = find_new_capacity(eenv, sg->sge); + group_util = group_norm_util(eenv, sg); sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) >> SCHED_CAPACITY_SHIFT; sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power) @@ -4845,7 +4880,7 @@ static int sched_group_energy(struct sched_group *sg_top) if (!sd->child) cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg)); - if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(sg_top))) + if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top))) goto next_cpu; } while (sg = sg->next, sg != sd->groups); @@ -4854,7 +4889,8 @@ next_cpu: continue; } - return total_energy; + eenv->energy = total_energy; + return 0; } /* From 2c6a8a48a701ca5d767f249eacb776f80858cbaa Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 6 Jan 2015 17:34:05 +0000 Subject: [PATCH 115/607] sched: Estimate energy impact of scheduling decisions Adds a generic energy-aware helper function, energy_diff(), that calculates energy impact of adding, removing, and migrating utilization in the system. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f3de37414a66..043715612227 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4893,6 +4893,58 @@ next_cpu: return 0; } +static inline bool cpu_in_sg(struct sched_group *sg, int cpu) +{ + return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); +} + +/* + * energy_diff(): Estimate the energy impact of changing the utilization + * distribution. eenv specifies the change: utilisation amount, source, and + * destination cpu. Source or destination cpu may be -1 in which case the + * utilization is removed from or added to the system (e.g. task wake-up). If + * both are specified, the utilization is migrated. + */ +static int energy_diff(struct energy_env *eenv) +{ + struct sched_domain *sd; + struct sched_group *sg; + int sd_cpu = -1, energy_before = 0, energy_after = 0; + + struct energy_env eenv_before = { + .util_delta = 0, + .src_cpu = eenv->src_cpu, + .dst_cpu = eenv->dst_cpu, + }; + + if (eenv->src_cpu == eenv->dst_cpu) + return 0; + + sd_cpu = (eenv->src_cpu != -1) ? 
eenv->src_cpu : eenv->dst_cpu; + sd = rcu_dereference(per_cpu(sd_ea, sd_cpu)); + + if (!sd) + return 0; /* Error */ + + sg = sd->groups; + + do { + if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) { + eenv_before.sg_top = eenv->sg_top = sg; + + if (sched_group_energy(&eenv_before)) + return 0; /* Invalid result abort */ + energy_before += eenv_before.energy; + + if (sched_group_energy(eenv)) + return 0; /* Invalid result abort */ + energy_after += eenv->energy; + } + } while (sg = sg->next, sg != sd->groups); + + return energy_after-energy_before; +} + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * A waker of many should wake a different task than the one last awakened From 1b5ec5d8ab918a62446f91ddfdb3eba88ea8c8dd Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Sat, 9 May 2015 16:49:57 +0100 Subject: [PATCH 116/607] sched: Add over-utilization/tipping point indicator Energy-aware scheduling is only meant to be active while the system is _not_ over-utilized. That is, there are spare cycles available to shift tasks around based on their actual utilization to get a more energy-efficient task distribution without depriving any tasks. When above the tipping point task placement is done the traditional way based on load_avg, spreading the tasks across as many cpus as possible based on priority scaled load to preserve smp_nice. Below the tipping point we want to use util_avg instead. We need to define a criteria for when we make the switch. The util_avg for each cpu converges towards 100% (1024) regardless of how many task additional task we may put on it. If we define over-utilized as: sum_{cpus}(rq.cfs.avg.util_avg) + margin > sum_{cpus}(rq.capacity) some individual cpus may be over-utilized running multiple tasks even when the above condition is false. That should be okay as long as we try to spread the tasks out to avoid per-cpu over-utilization as much as possible and if all tasks have the _same_ priority. If the latter isn't true, we have to consider priority to preserve smp_nice. For example, we could have n_cpus nice=-10 util_avg=55% tasks and n_cpus/2 nice=0 util_avg=60% tasks. Balancing based on util_avg we are likely to end up with nice=-10 tasks sharing cpus and nice=0 tasks getting their own as we 1.5*n_cpus tasks in total and 55%+55% is less over-utilized than 55%+60% for those cpus that have to be shared. The system utilization is only 85% of the system capacity, but we are breaking smp_nice. To be sure not to break smp_nice, we have defined over-utilization conservatively as when any cpu in the system is fully utilized at it's highest frequency instead: cpu_rq(any).cfs.avg.util_avg + margin > cpu_rq(any).capacity IOW, as soon as one cpu is (nearly) 100% utilized, we switch to load_avg to factor in priority to preserve smp_nice. With this definition, we can skip periodic load-balance as no cpu has an always-running task when the system is not over-utilized. All tasks will be periodic and we can balance them at wake-up. This conservative condition does however mean that some scenarios that could benefit from energy-aware decisions even if one cpu is fully utilized would not get those benefits. For system where some cpus might have reduced capacity on some cpus (RT-pressure and/or big.LITTLE), we want periodic load-balance checks as soon a just a single cpu is fully utilized as it might one of those with reduced capacity and in that case we want to migrate it. 
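
To put numbers on the tipping point, assuming capacity_margin is 1280 (a 1.25
factor; the actual value is defined elsewhere in the series): a cpu with
capacity 1024 trips the cpu_overutilized() test introduced earlier in the
series once its util_avg reaches about 820, i.e. roughly 80% of its capacity:

	capacity_of(cpu) * 1024  <  cpu_util(cpu) * capacity_margin
	1024 * 1024 = 1048576    <  820 * 1280 = 1049600	-> over-utilized
	1024 * 1024 = 1048576    >  819 * 1280 = 1048320	-> not (yet) over-utilized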
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 31 +++++++++++++++++++++++++------ kernel/sched/sched.h | 3 +++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 043715612227..3cfb4be3d88f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4144,6 +4144,8 @@ static inline void hrtick_update(struct rq *rq) } #endif +static bool cpu_overutilized(int cpu); + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -4154,6 +4156,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int task_new = !(flags & ENQUEUE_WAKEUP); for_each_sched_entity(se) { if (se->on_rq) @@ -4185,9 +4188,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) + if (!se) { add_nr_running(rq, 1); - + if (!task_new && !rq->rd->overutilized && + cpu_overutilized(rq->cpu)) + rq->rd->overutilized = true; + } hrtick_update(rq); } @@ -6651,11 +6657,12 @@ group_type group_classify(struct sched_group *group, * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. * @overload: Indicate more than one runnable task for any CPU. + * @overutilized: Indicate overutilization for any CPU. */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs, - bool *overload) + bool *overload, bool *overutilized) { unsigned long load; int i; @@ -6685,6 +6692,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->sum_weighted_load += weighted_cpuload(i); if (idle_cpu(i)) sgs->idle_cpus++; + + if (cpu_overutilized(i)) + *overutilized = true; } /* Adjust by relative CPU capacity of the group */ @@ -6790,7 +6800,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; - bool overload = false; + bool overload = false, overutilized = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; @@ -6812,7 +6822,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd } update_sg_lb_stats(env, sg, load_idx, local_group, sgs, - &overload); + &overload, &overutilized); if (local_group) goto next_group; @@ -6856,8 +6866,14 @@ next_group: /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; - } + /* Update over-utilization (tipping point, U >= 0) indicator */ + if (env->dst_rq->rd->overutilized != overutilized) + env->dst_rq->rd->overutilized = overutilized; + } else { + if (!env->dst_rq->rd->overutilized && overutilized) + env->dst_rq->rd->overutilized = true; + } } /** @@ -8250,6 +8266,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + + if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) + rq->rd->overutilized = true; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a618db2936d4..5323d4fc1109 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -528,6 +528,9 @@ struct root_domain { /* Indicate more than one runnable task for any CPU */ bool overload; + /* Indicate one or more cpus over-utilized 
(tipping point) */ + bool overutilized; + /* * The bit corresponding to a CPU gets set here if such CPU has more * than one runnable -deadline task (as it is below for RT tasks). From 19a5ebe13dded57ab2b04defd2e379579da05197 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 27 Jan 2015 13:48:07 +0000 Subject: [PATCH 117/607] sched, cpuidle: Track cpuidle state index in the scheduler The idle-state of each cpu is currently pointed to by rq->idle_state but there isn't any information in the struct cpuidle_state that can used to look up the idle-state energy model data stored in struct sched_group_energy. For this purpose is necessary to store the idle state index as well. Ideally, the idle-state data should be unified. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- drivers/cpuidle/cpuidle.c | 4 ++-- include/linux/cpuidle.h | 2 +- kernel/sched/idle.c | 3 ++- kernel/sched/sched.h | 21 +++++++++++++++++++++ 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 17a6dc0e2111..b9a6cceb7bac 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -192,7 +192,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, } /* Take note of the planned idle state. */ - sched_idle_set_state(target_state); + sched_idle_set_state(target_state, index); trace_cpu_idle_rcuidle(index, dev->cpu); time_start = ktime_get(); @@ -205,7 +205,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); /* The cpu is no longer idle or about to enter idle. */ - sched_idle_set_state(NULL); + sched_idle_set_state(NULL, -1); if (broadcast) { if (WARN_ON_ONCE(!irqs_disabled())) diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 786ad32631a6..6eae1576499e 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -204,7 +204,7 @@ static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv, #endif /* kernel/sched/idle.c */ -extern void sched_idle_set_state(struct cpuidle_state *idle_state); +extern void sched_idle_set_state(struct cpuidle_state *idle_state, int index); extern void default_idle_call(void); #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 4a2ef5a02fd3..cbc130efbc5b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -19,9 +19,10 @@ * sched_idle_set_state - Record idle state for the current CPU. * @idle_state: State to record. 
*/ -void sched_idle_set_state(struct cpuidle_state *idle_state) +void sched_idle_set_state(struct cpuidle_state *idle_state, int index) { idle_set_state(this_rq(), idle_state); + idle_set_state_idx(this_rq(), index); } static int __read_mostly cpu_idle_force_poll; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5323d4fc1109..2301b0476b90 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -693,6 +693,7 @@ struct rq { #ifdef CONFIG_CPU_IDLE /* Must be inspected within a rcu lock section */ struct cpuidle_state *idle_state; + int idle_state_idx; #endif }; @@ -1285,6 +1286,17 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) WARN_ON(!rcu_read_lock_held()); return rq->idle_state; } + +static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx) +{ + rq->idle_state_idx = idle_state_idx; +} + +static inline int idle_get_state_idx(struct rq *rq) +{ + WARN_ON(!rcu_read_lock_held()); + return rq->idle_state_idx; +} #else static inline void idle_set_state(struct rq *rq, struct cpuidle_state *idle_state) @@ -1295,6 +1307,15 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) { return NULL; } + +static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx) +{ +} + +static inline int idle_get_state_idx(struct rq *rq) +{ + return -1; +} #endif extern void sysrq_sched_debug_show(void); From ec055b978e851118a2b9d42f61ecad619cd1ce77 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 27 Jan 2015 14:04:17 +0000 Subject: [PATCH 118/607] sched: Determine the current sched_group idle-state To estimate the energy consumption of a sched_group in sched_group_energy() it is necessary to know which idle-state the group is in when it is idle. For now, it is assumed that this is the current idle-state (though it might be wrong). Based on the individual cpu idle-states group_idle_state() finds the group idle-state. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3cfb4be3d88f..c11bc7392916 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4815,6 +4815,20 @@ static int find_new_capacity(struct energy_env *eenv, return idx; } +static int group_idle_state(struct sched_group *sg) +{ + int i, state = INT_MAX; + + /* Find the shallowest idle state in the sched group. 
*/ + for_each_cpu(i, sched_group_cpus(sg)) + state = min(state, idle_get_state_idx(cpu_rq(i))); + + /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */ + state++; + + return state; +} + /* * sched_group_energy(): Computes the absolute energy consumption of cpus * belonging to the sched_group including shared resources shared only by @@ -4867,7 +4881,8 @@ static int sched_group_energy(struct energy_env *eenv) do { unsigned long group_util; - int sg_busy_energy, sg_idle_energy, cap_idx; + int sg_busy_energy, sg_idle_energy; + int cap_idx, idle_idx; if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) eenv->sg_cap = sg_shared_cap; @@ -4875,11 +4890,13 @@ static int sched_group_energy(struct energy_env *eenv) eenv->sg_cap = sg; cap_idx = find_new_capacity(eenv, sg->sge); + idle_idx = group_idle_state(sg); group_util = group_norm_util(eenv, sg); sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) - >> SCHED_CAPACITY_SHIFT; - sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power) - >> SCHED_CAPACITY_SHIFT; + >> SCHED_CAPACITY_SHIFT; + sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) + * sg->sge->idle_states[idle_idx].power) + >> SCHED_CAPACITY_SHIFT; total_energy += sg_busy_energy + sg_idle_energy; From e38982ba3289974385bd8fead9bf767d8cdde386 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Sat, 9 May 2015 20:03:19 +0100 Subject: [PATCH 119/607] sched: Energy-aware wake-up task placement Let available compute capacity and estimated energy impact select wake-up target cpu when energy-aware scheduling is enabled and the system in not over-utilized (above the tipping point). energy_aware_wake_cpu() attempts to find group of cpus with sufficient compute capacity to accommodate the task and find a cpu with enough spare capacity to handle the task within that group. Preference is given to cpus with enough spare capacity at the current OPP. Finally, the energy impact of the new target and the previous task cpu is compared to select the wake-up target cpu. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 89 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c11bc7392916..682b4ae9ebd7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5287,6 +5287,86 @@ done: return target; } +static int energy_aware_wake_cpu(struct task_struct *p, int target) +{ + struct sched_domain *sd; + struct sched_group *sg, *sg_target; + int target_max_cap = INT_MAX; + int target_cpu = task_cpu(p); + int i; + + sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p))); + + if (!sd) + return target; + + sg = sd->groups; + sg_target = sg; + + /* + * Find group with sufficient capacity. We only get here if no cpu is + * overutilized. We may end up overutilizing a cpu by adding the task, + * but that should not be any worse than select_idle_sibling(). + * load_balance() should sort it out later as we get above the tipping + * point. + */ + do { + /* Assuming all cpus are the same in group */ + int max_cap_cpu = group_first_cpu(sg); + + /* + * Assume smaller max capacity means more energy-efficient. + * Ideally we should query the energy model for the right + * answer but it easily ends up in an exhaustive search. 
+ */ + if (capacity_of(max_cap_cpu) < target_max_cap && + task_fits_max(p, max_cap_cpu)) { + sg_target = sg; + target_max_cap = capacity_of(max_cap_cpu); + } + } while (sg = sg->next, sg != sd->groups); + + /* Find cpu with sufficient capacity */ + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + int new_util = cpu_util(i) + task_util(p); + + if (new_util > capacity_orig_of(i)) + continue; + + if (new_util < capacity_curr_of(i)) { + target_cpu = i; + if (cpu_rq(i)->nr_running) + break; + } + + /* cpu has capacity at higher OPP, keep it as fallback */ + if (target_cpu == task_cpu(p)) + target_cpu = i; + } + + if (target_cpu != task_cpu(p)) { + struct energy_env eenv = { + .util_delta = task_util(p), + .src_cpu = task_cpu(p), + .dst_cpu = target_cpu, + }; + + /* Not enough spare capacity on previous cpu */ + if (cpu_overutilized(task_cpu(p))) + return target_cpu; + + if (energy_diff(&eenv) >= 0) + return task_cpu(p); + } + + return target_cpu; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -5309,8 +5389,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = !wake_wide(p) && task_fits_max(p, cpu) && - cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = (!wake_wide(p) && task_fits_max(p, cpu) && + cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || + energy_aware(); rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -5340,7 +5421,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } if (!sd) { - if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ + if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) + new_cpu = energy_aware_wake_cpu(p, prev_cpu); + else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, new_cpu); } else while (sd) { From 52b7b8a37c3da55885a9b7e60b2c4db6cf2dbeff Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sun, 10 May 2015 15:17:32 +0100 Subject: [PATCH 120/607] sched: Consider a not over-utilized energy-aware system as balanced In case the system operates below the tipping point indicator, introduced in ("sched: Add over-utilization/tipping point indicator"), bail out in find_busiest_group after the dst and src group statistics have been checked. There is simply no need to move usage around because all involved cpus still have spare cycles available. For an energy-aware system below its tipping point, we rely on the task placement of the wakeup path. This works well for short running tasks. The existence of long running tasks on one of the involved cpus lets the system operate over its tipping point. To be able to move such a task (whose load can't be used to average the load among the cpus) from a src cpu with lower capacity than the dst_cpu, an additional rule has to be implemented in need_active_balance. Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 682b4ae9ebd7..ae4f1ee9cf0c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7194,6 +7194,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * this level. 
*/ update_sd_lb_stats(env, &sds); + + if (energy_aware() && !env->dst_rq->rd->overutilized) + goto out_balanced; + local = &sds.local_stat; busiest = &sds.busiest_stat; From b71188bd2c52b6af022d687347eb4c7c29901580 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 3 Feb 2015 13:54:11 +0000 Subject: [PATCH 121/607] sched: Disable energy-unfriendly nohz kicks With energy-aware scheduling enabled nohz_kick_needed() generates many nohz idle-balance kicks which lead to nothing when multiple tasks get packed on a single cpu to save energy. This causes unnecessary wake-ups and hence wastes energy. Make these conditions depend on !energy_aware() for now until the energy-aware nohz story gets sorted out. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ae4f1ee9cf0c..7327221c991e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8259,12 +8259,13 @@ static inline bool nohz_kick_needed(struct rq *rq) if (time_before(now, nohz.next_balance)) return false; - if (rq->nr_running >= 2) + if (rq->nr_running >= 2 && + (!energy_aware() || cpu_overutilized(cpu))) return true; rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd) { + if (sd && !energy_aware()) { sgc = sd->groups->sgc; nr_busy = atomic_read(&sgc->nr_busy_cpus); From 1b35232614f555cbeed30a357ecf77a37632c347 Mon Sep 17 00:00:00 2001 From: Robin Randhawa Date: Mon, 29 Jun 2015 17:56:20 +0100 Subject: [PATCH 122/607] Documentation: DT bindings for energy model cost data required by EAS EAS (energy aware scheduling) provides the scheduler with an alternative objective - energy efficiency - as opposed to it's current performance oriented objectives. EAS relies on a simple platform energy cost model to guide scheduling decisions. The model only considers the CPU subsystem. This patch adds documentation describing DT bindings that should be used to supply the scheduler with an energy cost model. Signed-off-by: Robin Randhawa --- .../bindings/scheduler/sched-energy-costs.txt | 360 ++++++++++++++++++ 1 file changed, 360 insertions(+) create mode 100644 Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt diff --git a/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt b/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt new file mode 100644 index 000000000000..11216f09e596 --- /dev/null +++ b/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt @@ -0,0 +1,360 @@ +=========================================================== +Energy cost bindings for Energy Aware Scheduling +=========================================================== + +=========================================================== +1 - Introduction +=========================================================== + +This note specifies bindings required for energy-aware scheduling +(EAS)[1]. Historically, the scheduler's primary objective has been +performance. EAS aims to provide an alternative objective - energy +efficiency. EAS relies on a simple platform energy cost model to +guide scheduling decisions. The model only considers the CPU +subsystem. + +This note is aligned with the definition of the layout of physical +CPUs in the system as described in the ARM topology binding +description [2]. 
The concept is applicable to any system so long as +the cost model data is provided for those processing elements in +that system's topology that EAS is required to service. + +Processing elements refer to hardware threads, CPUs and clusters of +related CPUs in increasing order of hierarchy. + +EAS requires two key cost metrics - busy costs and idle costs. Busy +costs comprise of a list of compute capacities for the processing +element in question and the corresponding power consumption at that +capacity. Idle costs comprise of a list of power consumption values +for each idle state [C-state] that the processing element supports. +For a detailed description of these metrics, their derivation and +their use see [3]. + +These cost metrics are required for processing elements in all +scheduling domain levels that EAS is required to service. + +=========================================================== +2 - energy-costs node +=========================================================== + +Energy costs for the processing elements in scheduling domains that +EAS is required to service are defined in the energy-costs node +which acts as a container for the actual per processing element cost +nodes. A single energy-costs node is required for a given system. + +- energy-costs node + + Usage: Required + + Description: The energy-costs node is a container node and + it's sub-nodes describe costs for each processing element at + all scheduling domain levels that EAS is required to + service. + + Node name must be "energy-costs". + + The energy-costs node's parent node must be the cpus node. + + The energy-costs node's child nodes can be: + + - one or more cost nodes. + + Any other configuration is considered invalid. + +The energy-costs node can only contain a single type of child node +whose bindings are described in paragraph 4. + +=========================================================== +3 - energy-costs node child nodes naming convention +=========================================================== + +energy-costs child nodes must follow a naming convention where the +node name must be "thread-costN", "core-costN", "cluster-costN" +depending on whether the costs in the node are for a thread, core or +cluster. N (where N = {0, 1, ...}) is the node number and has no +bearing to the OS' logical thread, core or cluster index. + +=========================================================== +4 - cost node bindings +=========================================================== + +Bindings for cost nodes are defined as follows: + +- cluster-cost node + + Description: must be declared within an energy-costs node. A + system can contain multiple clusters and each cluster + serviced by EAS must have a corresponding cluster-costs + node. + + The cluster-cost node name must be "cluster-costN" as + described in 3 above. + + A cluster-cost node must be a leaf node with no children. + + Properties for cluster-cost nodes are described in paragraph + 5 below. + + Any other configuration is considered invalid. + +- core-cost node + + Description: must be declared within an energy-costs node. A + system can contain multiple cores and each core serviced by + EAS must have a corresponding core-cost node. + + The core-cost node name must be "core-costN" as described in + 3 above. + + A core-cost node must be a leaf node with no children. + + Properties for core-cost nodes are described in paragraph + 5 below. + + Any other configuration is considered invalid. 
+ +- thread-cost node + + Description: must be declared within an energy-costs node. A + system can contain cores with multiple hardware threads and + each thread serviced by EAS must have a corresponding + thread-cost node. + + The core-cost node name must be "core-costN" as described in + 3 above. + + A core-cost node must be a leaf node with no children. + + Properties for thread-cost nodes are described in paragraph + 5 below. + + Any other configuration is considered invalid. + +=========================================================== +5 - Cost node properties +========================================================== + +All cost node types must have only the following properties: + +- busy-cost-data + + Usage: required + Value type: An array of 2-item tuples. Each item is of type + u32. + Definition: The first item in the tuple is the capacity + value as described in [3]. The second item in the tuple is + the energy cost value as described in [3]. + +- idle-cost-data + + Usage: required + Value type: An array of 1-item tuples. The item is of type + u32. + Definition: The item in the tuple is the energy cost value + as described in [3]. + +=========================================================== +4 - Extensions to the cpu node +=========================================================== + +The cpu node is extended with a property that establishes the +connection between the processing element represented by the cpu +node and the cost-nodes associated with this processing element. + +The connection is expressed in line with the topological hierarchy +that this processing element belongs to starting with the level in +the hierarchy that this processing element itself belongs to through +to the highest level that EAS is required to service. The +connection cannot be sparse and must be contiguous from the +processing element's level through to the highest desired level. The +highest desired level must be the same for all processing elements. + +Example: Given that a cpu node may represent a thread that is a part +of a core, this property may contain multiple elements which +associate the thread with cost nodes describing the costs for the +thread itself, the core the thread belongs to, the cluster the core +belongs to and so on. The elements must be ordered from the lowest +level nodes to the highest desired level that EAS must service. The +highest desired level must be the same for all cpu nodes. The +elements must not be sparse: there must be elements for the current +thread, the next level of hierarchy (core) and so on without any +'holes'. + +Example: Given that a cpu node may represent a core that is a part +of a cluster of related cpus this property may contain multiple +elements which associate the core with cost nodes describing the +costs for the core itself, the cluster the core belongs to and so +on. The elements must be ordered from the lowest level nodes to the +highest desired level that EAS must service. The highest desired +level must be the same for all cpu nodes. The elements must not be +sparse: there must be elements for the current thread, the next +level of hierarchy (core) and so on without any 'holes'. + +If the system comprises of hierarchical clusters of clusters, this +property will contain multiple associations with the relevant number +of cluster elements in hierarchical order. 
+ +Property added to the cpu node: + +- sched-energy-costs + + Usage: required + Value type: List of phandles + Definition: a list of phandles to specific cost nodes in the + energy-costs parent node that correspond to the processing + element represented by this cpu node in hierarchical order + of topology. + + The order of phandles in the list is significant. The first + phandle is to the current processing element's own cost + node. Subsequent phandles are to higher hierarchical level + cost nodes up until the maximum level that EAS is to + service. + + All cpu nodes must have the same highest level cost node. + + The phandle list must not be sparsely populated with handles + to non-contiguous hierarchical levels. See commentary above + for clarity. + + Any other configuration is invalid. + +=========================================================== +5 - Example dts +=========================================================== + +Example 1 (ARM 64-bit, 6-cpu system, two clusters of cpus, one +cluster of 2 Cortex-A57 cpus, one cluster of 4 Cortex-A53 cpus): + +cpus { + #address-cells = <2>; + #size-cells = <0>; + . + . + . + A57_0: cpu@0 { + compatible = "arm,cortex-a57","arm,armv8"; + reg = <0x0 0x0>; + device_type = "cpu"; + enable-method = "psci"; + next-level-cache = <&A57_L2>; + clocks = <&scpi_dvfs 0>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>; + }; + + A57_1: cpu@1 { + compatible = "arm,cortex-a57","arm,armv8"; + reg = <0x0 0x1>; + device_type = "cpu"; + enable-method = "psci"; + next-level-cache = <&A57_L2>; + clocks = <&scpi_dvfs 0>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>; + }; + + A53_0: cpu@100 { + compatible = "arm,cortex-a53","arm,armv8"; + reg = <0x0 0x100>; + device_type = "cpu"; + enable-method = "psci"; + next-level-cache = <&A53_L2>; + clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>; + }; + + A53_1: cpu@101 { + compatible = "arm,cortex-a53","arm,armv8"; + reg = <0x0 0x101>; + device_type = "cpu"; + enable-method = "psci"; + next-level-cache = <&A53_L2>; + clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>; + }; + + A53_2: cpu@102 { + compatible = "arm,cortex-a53","arm,armv8"; + reg = <0x0 0x102>; + device_type = "cpu"; + enable-method = "psci"; + next-level-cache = <&A53_L2>; + clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>; + }; + + A53_3: cpu@103 { + compatible = "arm,cortex-a53","arm,armv8"; + reg = <0x0 0x103>; + device_type = "cpu"; + enable-method = "psci"; + next-level-cache = <&A53_L2>; + clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>; + }; + + energy-costs { + CPU_COST_0: core-cost0 { + busy-cost-data = < + 417 168 + 579 251 + 744 359 + 883 479 + 1024 616 + >; + idle-cost-data = < + 15 + 0 + >; + }; + CPU_COST_1: core-cost1 { + busy-cost-data = < + 235 33 + 302 46 + 368 61 + 406 76 + 447 93 + >; + idle-cost-data = < + 6 + 0 + >; + }; + CLUSTER_COST_0: cluster-cost0 { + busy-cost-data = < + 417 24 + 579 32 + 744 43 + 883 49 + 1024 64 + >; + idle-cost-data = < + 65 + 24 + >; + }; + CLUSTER_COST_1: cluster-cost1 { + busy-cost-data = < + 235 26 + 303 30 + 368 39 + 406 47 + 447 57 + >; + idle-cost-data = < + 56 + 
17 + >; + }; + }; +}; + +=============================================================================== +[1] https://lkml.org/lkml/2015/5/12/728 +[2] Documentation/devicetree/bindings/topology.txt +[3] Documentation/scheduler/sched-energy.txt From d2b3db032e34938c970b148e71cc401fc4c541e8 Mon Sep 17 00:00:00 2001 From: Robin Randhawa Date: Mon, 29 Jun 2015 18:01:58 +0100 Subject: [PATCH 123/607] sched: Support for extracting EAS energy costs from DT This patch implements support for extracting energy cost data from DT. The data should conform to the DT bindings for energy cost data needed by EAS (energy aware scheduling). Signed-off-by: Robin Randhawa --- include/linux/sched_energy.h | 36 ++++++++++ kernel/sched/Makefile | 2 +- kernel/sched/energy.c | 124 +++++++++++++++++++++++++++++++++++ 3 files changed, 161 insertions(+), 1 deletion(-) create mode 100644 include/linux/sched_energy.h create mode 100644 kernel/sched/energy.c diff --git a/include/linux/sched_energy.h b/include/linux/sched_energy.h new file mode 100644 index 000000000000..a3f1627ac609 --- /dev/null +++ b/include/linux/sched_energy.h @@ -0,0 +1,36 @@ +#ifndef _LINUX_SCHED_ENERGY_H +#define _LINUX_SCHED_ENERGY_H + +#include +#include + +/* + * There doesn't seem to be an NR_CPUS style max number of sched domain + * levels so here's an arbitrary constant one for the moment. + * + * The levels alluded to here correspond to entries in struct + * sched_domain_topology_level that are meant to be populated by arch + * specific code (topology.c). + */ +#define NR_SD_LEVELS 8 + +#define SD_LEVEL0 0 +#define SD_LEVEL1 1 +#define SD_LEVEL2 2 +#define SD_LEVEL3 3 +#define SD_LEVEL4 4 +#define SD_LEVEL5 5 +#define SD_LEVEL6 6 +#define SD_LEVEL7 7 + +/* + * Convenience macro for iterating through said sd levels. + */ +#define for_each_possible_sd_level(level) \ + for (level = 0; level < NR_SD_LEVELS; level++) + +extern struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS]; + +void init_sched_energy_costs(void); + +#endif diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 67687973ce80..a541b5ce1dcc 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -12,7 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o +obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o energy.o obj-y += wait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c new file mode 100644 index 000000000000..b0656b7a93e3 --- /dev/null +++ b/kernel/sched/energy.c @@ -0,0 +1,124 @@ +/* + * Obtain energy cost data from DT and populate relevant scheduler data + * structures. + * + * Copyright (C) 2015 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +#define pr_fmt(fmt) "sched-energy: " fmt + +#define DEBUG + +#include +#include +#include +#include +#include +#include + +struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS]; + +static void free_resources(void) +{ + int cpu, sd_level; + struct sched_group_energy *sge; + + for_each_possible_cpu(cpu) { + for_each_possible_sd_level(sd_level) { + sge = sge_array[cpu][sd_level]; + if (sge) { + kfree(sge->cap_states); + kfree(sge->idle_states); + kfree(sge); + } + } + } +} + +void init_sched_energy_costs(void) +{ + struct device_node *cn, *cp; + struct capacity_state *cap_states; + struct idle_state *idle_states; + struct sched_group_energy *sge; + const struct property *prop; + int sd_level, i, nstates, cpu; + const __be32 *val; + + for_each_possible_cpu(cpu) { + cn = of_get_cpu_node(cpu, NULL); + if (!cn) { + pr_warn("CPU device node missing for CPU %d\n", cpu); + return; + } + + if (!of_find_property(cn, "sched-energy-costs", NULL)) { + pr_warn("CPU device node has no sched-energy-costs\n"); + return; + } + + for_each_possible_sd_level(sd_level) { + cp = of_parse_phandle(cn, "sched-energy-costs", sd_level); + if (!cp) + break; + + prop = of_find_property(cp, "busy-cost-data", NULL); + if (!prop || !prop->value) { + pr_warn("No busy-cost data, skipping sched_energy init\n"); + goto out; + } + + sge = kcalloc(1, sizeof(struct sched_group_energy), + GFP_NOWAIT); + + nstates = (prop->length / sizeof(u32)) / 2; + cap_states = kcalloc(nstates, + sizeof(struct capacity_state), + GFP_NOWAIT); + + for (i = 0, val = prop->value; i < nstates; i++) { + cap_states[i].cap = be32_to_cpup(val++); + cap_states[i].power = be32_to_cpup(val++); + } + + sge->nr_cap_states = nstates; + sge->cap_states = cap_states; + + prop = of_find_property(cp, "idle-cost-data", NULL); + if (!prop || !prop->value) { + pr_warn("No idle-cost data, skipping sched_energy init\n"); + goto out; + } + + nstates = (prop->length / sizeof(u32)); + idle_states = kcalloc(nstates, + sizeof(struct idle_state), + GFP_NOWAIT); + + for (i = 0, val = prop->value; i < nstates; i++) + idle_states[i].power = be32_to_cpup(val++); + + sge->nr_idle_states = nstates; + sge->idle_states = idle_states; + + sge_array[cpu][sd_level] = sge; + } + } + + pr_info("Sched-energy-costs installed from DT\n"); + return; + +out: + free_resources(); +} From 8296e78e5eaa9bfebf251cbd4bddd97322fa13a9 Mon Sep 17 00:00:00 2001 From: Robin Randhawa Date: Tue, 9 Jun 2015 15:10:00 +0100 Subject: [PATCH 124/607] arm64, topology: Updates to use DT bindings for EAS costing data With the bindings and the associated accessors to extract data from the bindings in place, remove the static hard-coded data from topology.c and use the accesors instead. 
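The arm64_topology table handed to set_sched_topology() in the hunk below is defined elsewhere in this series; roughly, and with the level names and flag callbacks assumed here rather than taken from this patch, it plugs the new accessors into the scheduler topology like this:

    /* Sketch of the topology table consuming the energy accessors above */
    static struct sched_domain_topology_level arm64_topology[] = {
    #ifdef CONFIG_SCHED_MC
            /* core level: per-cpu energy data (SD_LEVEL0) */
            { cpu_coregroup_mask, cpu_core_flags, cpu_core_energy, SD_INIT_NAME(MC) },
    #endif
            /* cluster level: per-cluster energy data (SD_LEVEL1) */
            { cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) },
            { NULL, },
    };

With this, topology.c carries no hard-coded cost tables; the per-level sched_group_energy pointers come from the DT-populated sge_array.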
Signed-off-by: Robin Randhawa --- arch/arm64/kernel/topology.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index fb99a6735fd4..b5b43af6a7dc 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include @@ -218,6 +220,33 @@ out: struct cpu_topology cpu_topology[NR_CPUS]; EXPORT_SYMBOL_GPL(cpu_topology); +/* sd energy functions */ +static inline +const struct sched_group_energy * const cpu_cluster_energy(int cpu) +{ + struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL1]; + + if (!sge) { + pr_warn("Invalid sched_group_energy for Cluster%d\n", cpu); + return NULL; + } + + return sge; +} + +static inline +const struct sched_group_energy * const cpu_core_energy(int cpu) +{ + struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL0]; + + if (!sge) { + pr_warn("Invalid sched_group_energy for CPU%d\n", cpu); + return NULL; + } + + return sge; +} + const struct cpumask *cpu_coregroup_mask(int cpu) { return &cpu_topology[cpu].core_sibling; @@ -344,4 +373,8 @@ void __init init_cpu_topology(void) */ if (of_have_populated_dt() && parse_dt_topology()) reset_cpu_topology(); + else + set_sched_topology(arm64_topology); + + init_sched_energy_costs(); } From b03f1ba3d5bb9db0ca495c8c33ef88588daaae50 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 22 Sep 2015 16:47:48 +0100 Subject: [PATCH 125/607] cpufreq: Max freq invariant scheduler load-tracking and cpu capacity support Implements cpufreq_scale_max_freq_capacity() to provide the scheduler with a maximum frequency scaling correction factor for more accurate load-tracking and cpu capacity handling by being able to deal with frequency capping. This scaling factor describes the influence of running a cpu with a current maximum frequency lower than the absolute possible maximum frequency on load tracking and cpu capacity. The factor is: current_max_freq(cpu) << SCHED_CAPACITY_SHIFT / max_freq(cpu) In fact, max_freq_scale should be a struct cpufreq_policy data member. But this would require that the scheduler hot path (__update_load_avg()) would have to grab the cpufreq lock. This can be avoided by using per-cpu data initialized to SCHED_CAPACITY_SCALE for max_freq_scale. Signed-off-by: Dietmar Eggemann --- drivers/cpufreq/cpufreq.c | 19 +++++++++++++++++++ include/linux/cpufreq.h | 1 + 2 files changed, 20 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 46a7d62c8174..96f1a8860819 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -352,12 +352,14 @@ static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) *********************************************************************/ static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; +static DEFINE_PER_CPU(unsigned long, max_freq_scale) = SCHED_CAPACITY_SCALE; static void scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) { unsigned long cur = freqs ? 
freqs->new : policy->cur; unsigned long scale = (cur << SCHED_CAPACITY_SHIFT) / policy->max; + struct cpufreq_cpuinfo *cpuinfo = &policy->cpuinfo; int cpu; pr_debug("cpus %*pbl cur/cur max freq %lu/%u kHz freq scale %lu\n", @@ -365,6 +367,18 @@ scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) for_each_cpu(cpu, policy->cpus) per_cpu(freq_scale, cpu) = scale; + + if (freqs) + return; + + scale = (policy->max << SCHED_CAPACITY_SHIFT) / cpuinfo->max_freq; + + pr_debug("cpus %*pbl cur max/max freq %u/%u kHz max freq scale %lu\n", + cpumask_pr_args(policy->cpus), policy->max, cpuinfo->max_freq, + scale); + + for_each_cpu(cpu, policy->cpus) + per_cpu(max_freq_scale, cpu) = scale; } unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu) @@ -372,6 +386,11 @@ unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu) return per_cpu(freq_scale, cpu); } +unsigned long cpufreq_scale_max_freq_capacity(int cpu) +{ + return per_cpu(max_freq_scale, cpu); +} + static void __cpufreq_notify_transition(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, unsigned int state) { diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 11c65f536b6c..9b1d61e1f1d7 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -619,4 +619,5 @@ int cpufreq_generic_init(struct cpufreq_policy *policy, struct sched_domain; unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); +unsigned long cpufreq_scale_max_freq_capacity(int cpu); #endif /* _LINUX_CPUFREQ_H */ From 05a773bfc98b786cf504af1bce8bfe4a84a6a586 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sat, 26 Sep 2015 18:19:54 +0100 Subject: [PATCH 126/607] sched: Update max cpu capacity in case of max frequency constraints Wakeup balancing uses cpu capacity awareness and needs to know the system-wide maximum cpu capacity. Patch "sched: Store system-wide maximum cpu capacity in root domain" finds the system-wide maximum cpu capacity during scheduler domain hierarchy setup. This is sufficient as long as maximum frequency invariance is not enabled. If it is enabled, the system-wide maximum cpu capacity can change between scheduler domain hierarchy setups due to frequency capping. The cpu capacity is changed in update_cpu_capacity() which is called in load balance on the lowest scheduler domain hierarchy level. To be able to know if a change in cpu capacity for a certain cpu also has an effect on the system-wide maximum cpu capacity it is normally necessary to iterate over all cpus. This would be way too costly. That's why this patch follows a different approach. The unsigned long max_cpu_capacity value in struct root_domain is replaced with a struct max_cpu_capacity, containing value (the max_cpu_capacity) and cpu (the cpu index of the cpu providing the maximum cpu_capacity). Changes to the system-wide maximum cpu capacity and the cpu index are made if: 1 System-wide maximum cpu capacity < cpu capacity 2 System-wide maximum cpu capacity > cpu capacity and cpu index == cpu There are no changes to the system-wide maximum cpu capacity in all other cases. Atomic read and write access to the pair (max_cpu_capacity.val, max_cpu_capacity.cpu) is enforced by max_cpu_capacity.lock. The access to max_cpu_capacity.val in task_fits_max() is still performed without taking the max_cpu_capacity.lock. The code to set max cpu capacity in build_sched_domains() has been removed because the whole functionality is now provided by update_cpu_capacity() instead. 
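Condensed into a sketch (the logic as described above, not the exact hunk below), the update rule is:

    raw_spin_lock_irqsave(&mcc->lock, flags);
    /*
     * Rule 1: this cpu now provides a higher capacity than the recorded max.
     * Rule 2: the cpu that provided the max has had its capacity lowered.
     */
    if (mcc->val < capacity ||
        (mcc->val > capacity && mcc->cpu == cpu)) {
            mcc->val = capacity;
            mcc->cpu = cpu;
    }
    raw_spin_unlock_irqrestore(&mcc->lock, flags);

In every other case the system-wide maximum is left untouched, which is what avoids iterating over all cpus.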
This approach can introduce errors temporarily, e.g. in case the cpu currently providing the max cpu capacity has its cpu capacity lowered due to frequency capping and calls update_cpu_capacity() before any cpu which might provide the max cpu now. There is also an outstanding question: Should the cpu capacity of a cpu going idle be set to a very small value? Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 8 ++------ kernel/sched/fair.c | 32 +++++++++++++++++++++++++++++++- kernel/sched/sched.h | 10 +++++++++- 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bc5bb26ccf3b..3680e76b6beb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5888,6 +5888,8 @@ static int init_rootdomain(struct root_domain *rd) if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; + + init_max_cpu_capacity(&rd->max_cpu_capacity); return 0; free_rto_mask: @@ -7105,15 +7107,9 @@ static int build_sched_domains(const struct cpumask *cpu_map, rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); - - if (rq->cpu_capacity_orig > rq->rd->max_cpu_capacity) - rq->rd->max_cpu_capacity = rq->cpu_capacity_orig; } rcu_read_unlock(); - if (rq) - pr_info("max cpu_capacity %lu\n", rq->rd->max_cpu_capacity); - ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7327221c991e..91dfc8aa8aff 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5078,7 +5078,7 @@ static inline bool __task_fits(struct task_struct *p, int cpu, int util) static inline bool task_fits_max(struct task_struct *p, int cpu) { unsigned long capacity = capacity_of(cpu); - unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity; + unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val; if (capacity == max_capacity) return true; @@ -6564,13 +6564,43 @@ static unsigned long scale_rt_capacity(int cpu) return 1; } +void init_max_cpu_capacity(struct max_cpu_capacity *mcc) +{ + raw_spin_lock_init(&mcc->lock); + mcc->val = 0; + mcc->cpu = -1; +} + static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); struct sched_group *sdg = sd->groups; + struct max_cpu_capacity *mcc; + unsigned long max_capacity; + int max_cap_cpu; + unsigned long flags; cpu_rq(cpu)->cpu_capacity_orig = capacity; + mcc = &cpu_rq(cpu)->rd->max_cpu_capacity; + + raw_spin_lock_irqsave(&mcc->lock, flags); + max_capacity = mcc->val; + max_cap_cpu = mcc->cpu; + + if ((max_capacity > capacity && max_cap_cpu == cpu) || + (max_capacity < capacity)) { + mcc->val = capacity; + mcc->cpu = cpu; +#ifdef CONFIG_SCHED_DEBUG + raw_spin_unlock_irqrestore(&mcc->lock, flags); + pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity); + goto skip_unlock; +#endif + } + raw_spin_unlock_irqrestore(&mcc->lock, flags); + +skip_unlock: __attribute__ ((unused)); capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2301b0476b90..aac581932eff 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -510,6 +510,12 @@ struct dl_rq { #ifdef CONFIG_SMP +struct max_cpu_capacity { + raw_spinlock_t lock; + unsigned long val; + int cpu; +}; + /* * We add the notion of a root-domain which will be used to define per-domain * variables. 
Each exclusive cpuset essentially defines an island domain by @@ -548,7 +554,7 @@ struct root_domain { struct cpupri cpupri; /* Maximum cpu capacity in the system. */ - unsigned long max_cpu_capacity; + struct max_cpu_capacity max_cpu_capacity; }; extern struct root_domain def_root_domain; @@ -1340,6 +1346,8 @@ unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); +extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc); + static inline void add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; From 5414a72fc30f2069e3943ceb4e39b0ba9cd9c7e3 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 23 Sep 2015 17:59:55 +0100 Subject: [PATCH 127/607] arm: Enable max freq invariant scheduler load-tracking and capacity support Maximum Frequency Invariance has to be part of Cpu Invariance because Frequency Invariance deals only with differences in load-tracking introduces by Dynamic Frequency Scaling and not with limiting the possible range of cpu frequency. By placing Maximum Frequency Invariance into Cpu Invariance, load-tracking is scaled via arch_scale_cpu_capacity() in __update_load_avg() and cpu capacity is scaled via arch_scale_cpu_capacity() in update_cpu_capacity(). To be able to save the extra multiplication in the scheduler hotpath (__update_load_avg()) we could: 1 Inform cpufreq about base cpu capacity at boot and let it handle scale_cpu_capacity() as well. 2 Use the cpufreq policy callback which would update a per-cpu current cpu_scale and this value would be return in scale_cpu_capacity(). 3 Use per-cpu current max_freq_scale and current cpu_scale with the current patch. Signed-off-by: Dietmar Eggemann --- arch/arm/kernel/topology.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index da1c611a3b5e..0308342def8c 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -44,7 +44,13 @@ static DEFINE_PER_CPU(unsigned long, cpu_scale); unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) { +#if CONFIG_CPU_FREQ + unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu); + + return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT; +#else return per_cpu(cpu_scale, cpu); +#endif } static void set_capacity_scale(unsigned int cpu, unsigned long capacity) From b732dd602c61bab535b95237552e530ca8de8b93 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 25 Sep 2015 17:34:15 +0100 Subject: [PATCH 128/607] arm64: Enable max freq invariant scheduler load-tracking and capacity support Maximum Frequency Invariance has to be part of Cpu Invariance because Frequency Invariance deals only with differences in load-tracking introduces by Dynamic Frequency Scaling and not with limiting the possible range of cpu frequency. By placing Maximum Frequency Invariance into Cpu Invariance, load-tracking is scaled via arch_scale_cpu_capacity() in __update_load_avg() and cpu capacity is scaled via arch_scale_cpu_capacity() in update_cpu_capacity(). To be able to save the extra multiplication in the scheduler hotpath (__update_load_avg()) we could: 1 Inform cpufreq about base cpu capacity at boot and let it handle scale_cpu_capacity() as well. 2 Use the cpufreq policy callback which would update a per-cpu current cpu_scale and this value would be return in scale_cpu_capacity(). 3 Use per-cpu current max_freq_scale and current cpu_scale with the current patch. 
Including the relevant header in topology.h, as is done for the arm arch, doesn't work because of CONFIG_COMPAT=y (Kernel support for 32-bit EL0). That's why cpufreq_scale_max_freq_capacity() has to be declared extern in topology.h. Signed-off-by: Dietmar Eggemann --- arch/arm64/include/asm/topology.h | 1 + arch/arm64/kernel/topology.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 9370a336c934..bbd362cd1ed1 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -26,6 +26,7 @@ struct sched_domain; #ifdef CONFIG_CPU_FREQ #define arch_scale_freq_capacity cpufreq_scale_freq_capacity extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); +extern unsigned long cpufreq_scale_max_freq_capacity(int cpu); #endif #define arch_scale_cpu_capacity scale_cpu_capacity extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu); diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index b5b43af6a7dc..5b2c67a510d8 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -29,7 +29,13 @@ static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) { +#ifdef CONFIG_CPU_FREQ + unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu); + + return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT; +#else return per_cpu(cpu_scale, cpu); +#endif } static void set_capacity_scale(unsigned int cpu, unsigned long capacity) From e14f1518239461ec91e3cb99fba6809c7d5fa4ad Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 13 Jan 2016 15:49:44 +0000 Subject: [PATCH 129/607] sched: Do eas idle balance regardless of the rq avg idle value EAS relies on idle balance to migrate a misfit task towards a cpu with higher capacity. When such a cpu becomes idle, idle balance should happen even if the rq avg idle is smaller than the sched migration cost (default 500us). The rq avg idle is updated during the wakeup of a task in case the rq has a non-null idle_stamp. This value stays unchanged and valid until the next task wakes up on this cpu after an idle period. So rq avg idle could be smaller than sched migration cost, preventing the idle balance from happening. In this case we would be at the mercy of wakeup, periodic or nohz-idle load balancing to put another task on this cpu. To break this dependency on rq avg idle, make EAS idle balance independent of the requirement that rq avg idle be larger than the sched migration cost.
Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 91dfc8aa8aff..7d58b1fea71d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7772,8 +7772,9 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost || - !this_rq->rd->overload) { + if (!energy_aware() && + (this_rq->avg_idle < sysctl_sched_migration_cost || + !this_rq->rd->overload)) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) From 563ddb604e01393625aa56601188b7eeb20888da Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 25 Feb 2016 12:43:49 +0000 Subject: [PATCH 130/607] sched: Add per-cpu max capacity to sched_group_capacity struct sched_group_capacity currently represents the compute capacity sum of all cpus in the sched_group. Unless it is divided by the group_weight to get the average capacity per cpu it hides differences in cpu capacity for mixed capacity systems (e.g. high RT/IRQ utilization or ARM big.LITTLE). But even the average may not be sufficient if the group covers cpus of different capacities. Instead, by extending struct sched_group_capacity to indicate max per-cpu capacity in the group a suitable group for a given task utilization can easily be found such that cpus with reduced capacity can be avoided for tasks with high utilization (not implemented by this patch). Signed-off-by: Morten Rasmussen --- kernel/sched/core.c | 3 ++- kernel/sched/fair.c | 17 ++++++++++++----- kernel/sched/sched.h | 3 ++- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3680e76b6beb..23e82805ec47 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5714,7 +5714,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_CONT " %*pbl", cpumask_pr_args(sched_group_cpus(group))); if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity = %d)", + printk(KERN_CONT " (cpu_capacity = %lu)", group->sgc->capacity); } @@ -6193,6 +6193,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * die on a /0 trap. 
*/ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; /* * Make sure the first group of this domain contains the diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7d58b1fea71d..9762288ee47b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6609,13 +6609,14 @@ skip_unlock: __attribute__ ((unused)); cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; + sdg->sgc->max_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity; + unsigned long capacity, max_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -6628,6 +6629,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } capacity = 0; + max_capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -6652,11 +6654,12 @@ void update_group_capacity(struct sched_domain *sd, int cpu) */ if (unlikely(!rq->sd)) { capacity += capacity_of(cpu); - continue; + } else { + sgc = rq->sd->groups->sgc; + capacity += sgc->capacity; } - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; + max_capacity = max(capacity, max_capacity); } } else { /* @@ -6666,12 +6669,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity += group->sgc->capacity; + struct sched_group_capacity *sgc = group->sgc; + + capacity += sgc->capacity; + max_capacity = max(sgc->max_capacity, max_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; + sdg->sgc->max_capacity = max_capacity; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index aac581932eff..967d70d49af9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -858,7 +858,8 @@ struct sched_group_capacity { * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity * for a single CPU. */ - unsigned int capacity; + unsigned long capacity; + unsigned long max_capacity; /* Max per-cpu capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ /* From f2a8923298fe9a39cf4c9ad48bbcb0781483d1f9 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 25 Feb 2016 12:47:54 +0000 Subject: [PATCH 131/607] sched: Add group_misfit_task load-balance type To maximize throughput in systems with reduced capacity cpus (e.g. high RT/IRQ load and/or ARM big.LITTLE) load-balancing has to consider task and cpu utilization as well as per-cpu compute capacity when load-balancing in addition to the current average load based load-balancing policy. Tasks that are scheduled on a reduced capacity cpu need to be identified and migrated to a higher capacity cpu if possible. To implement this additional policy an additional group_type (load-balance scenario) is added: group_misfit_task. This represents scenarios where a sched_group has tasks that are not suitable for its per-cpu capacity. group_misfit_task is only considered if the system is not overloaded in any other way (group_imbalanced or group_overloaded). Identifying misfit tasks requires the rq lock to be held. To avoid taking remote rq locks to examine source sched_groups for misfit tasks, each cpu is responsible for tracking misfit tasks themselves and update the rq->misfit_task flag. This means checking task utilization when tasks are scheduled and on sched_tick. 
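The fit test behind rq->misfit_task reduces to comparing utilization against capacity with some headroom. A rough sketch (task_util(), cpu_util() and capacity_of() as used elsewhere in this series; the function name and the 20% margin are assumptions, not this patch's code):

    /* Sketch only: would p (nearly) fill this cpu up? */
    static inline bool task_fits_cpu(struct task_struct *p, int cpu)
    {
            unsigned long util = cpu_util(cpu) + task_util(p);

            /* require ~20% headroom; 1280/1024 is an assumed margin */
            return capacity_of(cpu) * 1024 > util * 1280;
    }

A cpu whose current task fails such a check has rq->misfit_task set at pick/tick time (see the hunks below), making it a candidate for the misfit-aware balancing introduced in the next patch.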
Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 29 ++++++++++++++++++++++------- kernel/sched/sched.h | 1 + 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9762288ee47b..b16e1b7878f6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5739,6 +5739,8 @@ again: if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + rq->misfit_task = !task_fits_max(p, rq->cpu); + return p; simple: cfs_rq = &rq->cfs; @@ -5760,9 +5762,12 @@ simple: if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + rq->misfit_task = !task_fits_max(p, rq->cpu); + return p; idle: + rq->misfit_task = 0; /* * This is OK, because current is on_cpu, which avoids it being picked * for load-balance and preemption/IRQs are still disabled avoiding @@ -5975,6 +5980,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +enum group_type { + group_other = 0, + group_misfit_task, + group_imbalanced, + group_overloaded, +}; + #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 @@ -6446,12 +6458,6 @@ static unsigned long task_h_load(struct task_struct *p) /********** Helpers for find_busiest_group ************************/ -enum group_type { - group_other = 0, - group_imbalanced, - group_overloaded, -}; - /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -6467,6 +6473,7 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; int group_no_capacity; + int group_misfit_task; /* A cpu has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -6783,6 +6790,9 @@ group_type group_classify(struct sched_group *group, if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_misfit_task) + return group_misfit_task; + return group_other; } @@ -6830,8 +6840,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (idle_cpu(i)) sgs->idle_cpus++; - if (cpu_overutilized(i)) + if (cpu_overutilized(i)) { *overutilized = true; + if (!sgs->group_misfit_task && rq->misfit_task) + sgs->group_misfit_task = capacity_of(i); + } } /* Adjust by relative CPU capacity of the group */ @@ -8412,6 +8425,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) rq->rd->overutilized = true; + + rq->misfit_task = !task_fits_max(curr, rq->cpu); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 967d70d49af9..fcbb3e07accc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -584,6 +584,7 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned long last_load_update_tick; + unsigned int misfit_task; #ifdef CONFIG_NO_HZ_COMMON u64 nohz_stamp; unsigned long nohz_flags; From 0d2b1cdfa11fd62d3358eef332e78b95b6a3e9ec Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 25 Feb 2016 12:51:35 +0000 Subject: [PATCH 132/607] sched: Consider misfit tasks when load-balancing With the new group_misfit_task load-balancing scenario additional policy conditions are needed when load-balancing. Misfit task balancing only makes sense between source group with lower capacity than the target group. If capacities are the same, fallback to normal group_other balancing. The aim is to balance tasks such that no task has its throughput hindered by compute capacity if a cpu with more capacity is available. 
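As a worked illustration of the group_smaller_cpu_capacity() test added below (capacities are illustrative, with SCHED_LOAD_SCALE = 1024 and capacity_margin = 1280): a LITTLE group with max_capacity = 430 compared against a big group with max_capacity = 1024 gives 430 + 1280 - 1024 = 686, which is below 1024, so the LITTLE group counts as having smaller per-cpu capacity and misfit pulls toward the big group are permitted. Comparing two identical big groups gives 1024 + 1280 - 1024 = 1280, which is not below 1024, so the fallback to normal group_other balancing applies.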
Load-balancing is generally based on average load in the sched_groups, but for misfitting tasks it is necessary to introduce exceptions to migrate tasks against usual metrics and optimize throughput. This patch ensures the following load-balance for mixed capacity systems (e.g. ARM big.LITTLE) for always-running tasks: 1. Place a task on each cpu starting in order from cpus with highest capacity to lowest until all cpus are in use (i.e. one task on each cpu). 2. Once all cpus are in use balance according to compute capacity such that load per capacity is approximately the same regardless of the compute capacity (i.e. big cpus get more tasks than little cpus). Necessary changes are introduced in find_busiest_group(), calculate_imbalance(), and find_busiest_queue(). This includes passing the group_type on to find_busiest_queue() through struct lb_env, which is currently only considers imbalance and not the imbalance situation (group_type). To avoid taking remote rq locks to examine source sched_groups for misfit tasks, each cpu is responsible for tracking misfit tasks themselves and update the rq->misfit_task flag. This means checking task utilization when tasks are scheduled and on sched_tick. Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 71 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b16e1b7878f6..5b8a9c68f3fb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6016,6 +6016,7 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + enum group_type busiest_group_type; struct list_head tasks; }; @@ -6780,6 +6781,18 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) return false; } + +/* + * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * per-cpu capacity than sched_group ref. + */ +static inline bool +group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE < + ref->sgc->max_capacity; +} + static inline enum group_type group_classify(struct sched_group *group, struct sg_lb_stats *sgs) @@ -6886,9 +6899,25 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->group_type < busiest->group_type) return false; + /* + * Candidate sg doesn't face any serious load-balance problems + * so don't pick it if the local sg is already filled up. + */ + if (sgs->group_type == group_other && + !group_has_capacity(env, &sds->local_stat)) + return false; + if (sgs->avg_load <= busiest->avg_load) return false; + /* + * Candiate sg has no more than one task per cpu and has higher + * per-cpu capacity. No reason to pull tasks to less capable cpus. + */ + if (sgs->sum_nr_running <= sgs->group_weight && + group_smaller_cpu_capacity(sds->local, sg)) + return false; + /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; @@ -6994,6 +7023,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd sgs->group_type = group_classify(sg, sgs); } + /* + * Ignore task groups with misfit tasks if local group has no + * capacity or if per-cpu capacity isn't higher. 
+ */ + if (sgs->group_type == group_misfit_task && + (!group_has_capacity(env, &sds->local_stat) || + !group_smaller_cpu_capacity(sg, sds->local))) + sgs->group_type = group_other; + if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; @@ -7170,6 +7208,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { + /* Misfitting tasks should be migrated in any case */ + if (busiest->group_type == group_misfit_task) { + env->imbalance = busiest->group_misfit_task; + return; + } + + /* + * Busiest group is overloaded, local is not, use the spare + * cycles to maximize throughput + */ + if (busiest->group_type == group_overloaded && + local->group_type <= group_misfit_task) { + env->imbalance = busiest->load_per_task; + return; + } + env->imbalance = 0; return fix_small_imbalance(env, sds); } @@ -7203,6 +7257,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; + /* Boost imbalance to allow misfit task to be balanced. */ + if (busiest->group_type == group_misfit_task) + env->imbalance = max_t(long, env->imbalance, + busiest->group_misfit_task); + /* * if *imbalance is less than the average load per runnable task * there is no guarantee that any tasks will be moved so we'll have @@ -7276,6 +7335,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest->group_no_capacity) goto force_balance; + /* Misfitting tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) { + goto force_balance; + } + /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -7299,7 +7363,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * might end up to just move the imbalance on another group */ if ((busiest->group_type != group_overloaded) && - (local->idle_cpus <= (busiest->idle_cpus + 1))) + (local->idle_cpus <= (busiest->idle_cpus + 1)) && + !group_smaller_cpu_capacity(sds.busiest, sds.local)) goto out_balanced; } else { /* @@ -7312,6 +7377,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) } force_balance: + env->busiest_group_type = busiest->group_type; /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds); return sds.busiest; @@ -7370,7 +7436,8 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ if (rq->nr_running == 1 && wl > env->imbalance && - !check_cpu_capacity(rq, env->sd)) + !check_cpu_capacity(rq, env->sd) && + env->busiest_group_type != group_misfit_task) continue; /* From c3b2e765af4af80ada13e5e34a26a28e3081dd0e Mon Sep 17 00:00:00 2001 From: Michael Turquette Date: Tue, 30 Jun 2015 12:45:27 +0100 Subject: [PATCH 133/607] cpufreq: introduce cpufreq_driver_is_slow Some architectures and platforms perform CPU frequency transitions through a non-blocking method, while some might block or sleep. Even when frequency transitions do not block or sleep they may be very slow. This distinction is important when trying to change frequency from a non-interruptible context in a scheduler hot path. Describe this distinction with a cpufreq driver flag, CPUFREQ_DRIVER_FAST. The default is to not have this flag set, thus erring on the side of caution. cpufreq_driver_is_slow() is also introduced in this patch. 
Setting the above flag will allow this function to return false. [smuckle@linaro.org: change flag/API to include drivers that are too slow for scheduler hot paths, in addition to those that block/sleep] Cc: Rafael J. Wysocki Cc: Viresh Kumar Signed-off-by: Michael Turquette Signed-off-by: Steve Muckle --- drivers/cpufreq/cpufreq.c | 6 ++++++ include/linux/cpufreq.h | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 96f1a8860819..5ca8e9517aae 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -154,6 +154,12 @@ bool have_governor_per_policy(void) } EXPORT_SYMBOL_GPL(have_governor_per_policy); +bool cpufreq_driver_is_slow(void) +{ + return !(cpufreq_driver->flags & CPUFREQ_DRIVER_FAST); +} +EXPORT_SYMBOL_GPL(cpufreq_driver_is_slow); + struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy) { if (have_governor_per_policy()) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 9b1d61e1f1d7..b5ef79aae048 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -160,6 +160,7 @@ u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy); int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); int cpufreq_update_policy(unsigned int cpu); bool have_governor_per_policy(void); +bool cpufreq_driver_is_slow(void); struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); #else static inline unsigned int cpufreq_get(unsigned int cpu) @@ -317,6 +318,14 @@ struct cpufreq_driver { */ #define CPUFREQ_NEED_INITIAL_FREQ_CHECK (1 << 5) +/* + * Indicates that it is safe to call cpufreq_driver_target from + * non-interruptable context in scheduler hot paths. Drivers must + * opt-in to this flag, as the safe default is that they might sleep + * or be too slow for hot path use. + */ +#define CPUFREQ_DRIVER_FAST (1 << 6) + int cpufreq_register_driver(struct cpufreq_driver *driver_data); int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); From a967a45c71ae8978b2c0d06ee0de23f47737cfd5 Mon Sep 17 00:00:00 2001 From: Michael Turquette Date: Tue, 30 Jun 2015 12:45:48 +0100 Subject: [PATCH 134/607] sched: scheduler-driven cpu frequency selection Scheduler-driven CPU frequency selection hopes to exploit both per-task and global information in the scheduler to improve frequency selection policy, achieving lower power consumption, improved responsiveness/performance, and less reliance on heuristics and tunables. For further discussion on the motivation of this integration see [0]. This patch implements a shim layer between the Linux scheduler and the cpufreq subsystem. The interface accepts capacity requests from the CFS, RT and deadline sched classes. The requests from each sched class are summed on each CPU with a margin applied to the CFS and RT capacity requests to provide some headroom. Deadline requests are expected to be precise enough given their nature to not require headroom. The maximum total capacity request for a CPU in a frequency domain drives the requested frequency for that domain. Policy is determined by both the sched classes and this shim layer. Note that this algorithm is event-driven. There is no polling loop to check cpu idle time nor any other method which is unsynchronized with the scheduler, aside from a throttling mechanism to ensure frequency changes are not attempted faster than the hardware can accommodate them. 
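As a worked example of the request-to-frequency mapping implemented below (all numbers illustrative): with per-cpu requests of cfs = 300, rt = 100 and dl = 0 on the 1024 capacity scale, the shim applies the ~20% capacity_margin to the CFS+RT part, (300 + 100) * 1280 / 1024 = 500, adds the DL request, and takes the maximum such total across the cpus of the frequency domain. On a policy whose maximum frequency is 2000000 kHz this maps to a raw target of 500 * 2000000 >> 10 = 976562 kHz, which the frequency-table lookup with CPUFREQ_RELATION_L then rounds up to the nearest available OPP at or above that value.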
Thanks to Juri Lelli for contributing design ideas, code and test results, and to Ricky Liang for initialization and static key inc/dec fixes. [0] http://article.gmane.org/gmane.linux.kernel/1499836 [smuckle@linaro.org: various additions and fixes, revised commit text] CC: Ricky Liang Signed-off-by: Michael Turquette Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- drivers/cpufreq/Kconfig | 21 ++ include/linux/cpufreq.h | 3 + include/linux/sched.h | 8 + kernel/sched/Makefile | 1 + kernel/sched/cpufreq_sched.c | 358 +++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 51 +++++ 7 files changed, 443 insertions(+), 1 deletion(-) create mode 100644 kernel/sched/cpufreq_sched.c diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 659879a56dba..af523c539af0 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -102,6 +102,15 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE Be aware that not all cpufreq drivers support the conservative governor. If unsure have a look at the help section of the driver. Fallback governor will be the performance governor. + +config CPU_FREQ_DEFAULT_GOV_SCHED + bool "sched" + select CPU_FREQ_GOV_SCHED + help + Use the CPUfreq governor 'sched' as default. This scales + cpu frequency using CPU utilization estimates from the + scheduler. + endchoice config CPU_FREQ_GOV_PERFORMANCE @@ -183,6 +192,18 @@ config CPU_FREQ_GOV_CONSERVATIVE If in doubt, say N. +config CPU_FREQ_GOV_SCHED + bool "'sched' cpufreq governor" + depends on CPU_FREQ + select CPU_FREQ_GOV_COMMON + help + 'sched' - this governor scales cpu frequency from the + scheduler as a function of cpu capacity utilization. It does + not evaluate utilization on a periodic basis (as ondemand + does) but instead is event-driven by the scheduler. + + If in doubt, say N. 
+ comment "CPU frequency scaling drivers" config CPUFREQ_DT diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index b5ef79aae048..b5433349938a 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -496,6 +496,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand; #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE) extern struct cpufreq_governor cpufreq_gov_conservative; #define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative) +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED) +extern struct cpufreq_governor cpufreq_gov_sched; +#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_sched) #endif /********************************************************************* diff --git a/include/linux/sched.h b/include/linux/sched.h index 09e2d43406eb..1460c6a4f65f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -927,6 +927,14 @@ enum cpu_idle_type { #define SCHED_CAPACITY_SHIFT 10 #define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) +struct sched_capacity_reqs { + unsigned long cfs; + unsigned long rt; + unsigned long dl; + + unsigned long total; +}; + /* * Wake-queues are lists of tasks with a pending wakeup, whose * callers have already marked the task as woken internally, diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index a541b5ce1dcc..0eabc9db4c3d 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c new file mode 100644 index 000000000000..58bca8d2ca65 --- /dev/null +++ b/kernel/sched/cpufreq_sched.c @@ -0,0 +1,358 @@ +/* + * Copyright (C) 2015 Michael Turquette + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sched.h" + +#define THROTTLE_NSEC 50000000 /* 50ms default */ + +struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE; +static bool __read_mostly cpufreq_driver_slow; + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED +static struct cpufreq_governor cpufreq_gov_sched; +#endif + +static DEFINE_PER_CPU(unsigned long, enabled); +DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); + +/** + * gov_data - per-policy data internal to the governor + * @throttle: next throttling period expiry. Derived from throttle_nsec + * @throttle_nsec: throttle period length in nanoseconds + * @task: worker thread for dvfs transition that may block/sleep + * @irq_work: callback used to wake up worker thread + * @requested_freq: last frequency requested by the sched governor + * + * struct gov_data is the per-policy cpufreq_sched-specific data structure. A + * per-policy instance of it is created when the cpufreq_sched governor receives + * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data + * member of struct cpufreq_policy. + * + * Readers of this data must call down_read(policy->rwsem). Writers must + * call down_write(policy->rwsem). 
+ */ +struct gov_data { + ktime_t throttle; + unsigned int throttle_nsec; + struct task_struct *task; + struct irq_work irq_work; + unsigned int requested_freq; +}; + +static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, + unsigned int freq) +{ + struct gov_data *gd = policy->governor_data; + + /* avoid race with cpufreq_sched_stop */ + if (!down_write_trylock(&policy->rwsem)) + return; + + __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L); + + gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec); + up_write(&policy->rwsem); +} + +static bool finish_last_request(struct gov_data *gd) +{ + ktime_t now = ktime_get(); + + if (ktime_after(now, gd->throttle)) + return false; + + while (1) { + int usec_left = ktime_to_ns(ktime_sub(gd->throttle, now)); + + usec_left /= NSEC_PER_USEC; + usleep_range(usec_left, usec_left + 100); + now = ktime_get(); + if (ktime_after(now, gd->throttle)) + return true; + } +} + +/* + * we pass in struct cpufreq_policy. This is safe because changing out the + * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP), + * which tears down all of the data structures and __cpufreq_governor(policy, + * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the + * new policy pointer + */ +static int cpufreq_sched_thread(void *data) +{ + struct sched_param param; + struct cpufreq_policy *policy; + struct gov_data *gd; + unsigned int new_request = 0; + unsigned int last_request = 0; + int ret; + + policy = (struct cpufreq_policy *) data; + gd = policy->governor_data; + + param.sched_priority = 50; + ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, ¶m); + if (ret) { + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + do_exit(-EINVAL); + } else { + pr_debug("%s: kthread (%d) set to SCHED_FIFO\n", + __func__, gd->task->pid); + } + + do { + set_current_state(TASK_INTERRUPTIBLE); + new_request = gd->requested_freq; + if (new_request == last_request) { + schedule(); + } else { + /* + * if the frequency thread sleeps while waiting to be + * unthrottled, start over to check for a newer request + */ + if (finish_last_request(gd)) + continue; + last_request = new_request; + cpufreq_sched_try_driver_target(policy, new_request); + } + } while (!kthread_should_stop()); + + return 0; +} + +static void cpufreq_sched_irq_work(struct irq_work *irq_work) +{ + struct gov_data *gd; + + gd = container_of(irq_work, struct gov_data, irq_work); + if (!gd) + return; + + wake_up_process(gd->task); +} + +static void update_fdomain_capacity_request(int cpu) +{ + unsigned int freq_new, index_new, cpu_tmp; + struct cpufreq_policy *policy; + struct gov_data *gd; + unsigned long capacity = 0; + + /* + * Avoid grabbing the policy if possible. A test is still + * required after locking the CPU's policy to avoid racing + * with the governor changing. 
+ */ + if (!per_cpu(enabled, cpu)) + return; + + policy = cpufreq_cpu_get(cpu); + if (IS_ERR_OR_NULL(policy)) + return; + + if (policy->governor != &cpufreq_gov_sched || + !policy->governor_data) + goto out; + + gd = policy->governor_data; + + /* find max capacity requested by cpus in this policy */ + for_each_cpu(cpu_tmp, policy->cpus) { + struct sched_capacity_reqs *scr; + + scr = &per_cpu(cpu_sched_capacity_reqs, cpu_tmp); + capacity = max(capacity, scr->total); + } + + /* Convert the new maximum capacity request into a cpu frequency */ + freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT; + if (cpufreq_frequency_table_target(policy, policy->freq_table, + freq_new, CPUFREQ_RELATION_L, + &index_new)) + goto out; + freq_new = policy->freq_table[index_new].frequency; + + if (freq_new == gd->requested_freq) + goto out; + + gd->requested_freq = freq_new; + + /* + * Throttling is not yet supported on platforms with fast cpufreq + * drivers. + */ + if (cpufreq_driver_slow) + irq_work_queue_on(&gd->irq_work, cpu); + else + cpufreq_sched_try_driver_target(policy, freq_new); + +out: + cpufreq_cpu_put(policy); +} + +void update_cpu_capacity_request(int cpu, bool request) +{ + unsigned long new_capacity; + struct sched_capacity_reqs *scr; + + /* The rq lock serializes access to the CPU's sched_capacity_reqs. */ + lockdep_assert_held(&cpu_rq(cpu)->lock); + + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + + new_capacity = scr->cfs + scr->rt; + new_capacity = new_capacity * capacity_margin + / SCHED_CAPACITY_SCALE; + new_capacity += scr->dl; + + if (new_capacity == scr->total) + return; + + scr->total = new_capacity; + if (request) + update_fdomain_capacity_request(cpu); +} + +static inline void set_sched_freq(void) +{ + static_key_slow_inc(&__sched_freq); +} + +static inline void clear_sched_freq(void) +{ + static_key_slow_dec(&__sched_freq); +} + +static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) +{ + struct gov_data *gd; + int cpu; + + for_each_cpu(cpu, policy->cpus) + memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0, + sizeof(struct sched_capacity_reqs)); + + gd = kzalloc(sizeof(*gd), GFP_KERNEL); + if (!gd) + return -ENOMEM; + + gd->throttle_nsec = policy->cpuinfo.transition_latency ? 
+ policy->cpuinfo.transition_latency : + THROTTLE_NSEC; + pr_debug("%s: throttle threshold = %u [ns]\n", + __func__, gd->throttle_nsec); + + if (cpufreq_driver_is_slow()) { + cpufreq_driver_slow = true; + gd->task = kthread_create(cpufreq_sched_thread, policy, + "kschedfreq:%d", + cpumask_first(policy->related_cpus)); + if (IS_ERR_OR_NULL(gd->task)) { + pr_err("%s: failed to create kschedfreq thread\n", + __func__); + goto err; + } + get_task_struct(gd->task); + kthread_bind_mask(gd->task, policy->related_cpus); + wake_up_process(gd->task); + init_irq_work(&gd->irq_work, cpufreq_sched_irq_work); + } + + policy->governor_data = gd; + set_sched_freq(); + + return 0; + +err: + kfree(gd); + return -ENOMEM; +} + +static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) +{ + struct gov_data *gd = policy->governor_data; + + clear_sched_freq(); + if (cpufreq_driver_slow) { + kthread_stop(gd->task); + put_task_struct(gd->task); + } + + policy->governor_data = NULL; + + kfree(gd); + return 0; +} + +static int cpufreq_sched_start(struct cpufreq_policy *policy) +{ + int cpu; + + for_each_cpu(cpu, policy->cpus) + per_cpu(enabled, cpu) = 1; + + return 0; +} + +static int cpufreq_sched_stop(struct cpufreq_policy *policy) +{ + int cpu; + + for_each_cpu(cpu, policy->cpus) + per_cpu(enabled, cpu) = 0; + + return 0; +} + +static int cpufreq_sched_setup(struct cpufreq_policy *policy, + unsigned int event) +{ + switch (event) { + case CPUFREQ_GOV_POLICY_INIT: + return cpufreq_sched_policy_init(policy); + case CPUFREQ_GOV_POLICY_EXIT: + return cpufreq_sched_policy_exit(policy); + case CPUFREQ_GOV_START: + return cpufreq_sched_start(policy); + case CPUFREQ_GOV_STOP: + return cpufreq_sched_stop(policy); + case CPUFREQ_GOV_LIMITS: + break; + } + return 0; +} + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED +static +#endif +struct cpufreq_governor cpufreq_gov_sched = { + .name = "sched", + .governor = cpufreq_sched_setup, + .owner = THIS_MODULE, +}; + +static int __init cpufreq_sched_init(void) +{ + int cpu; + + for_each_cpu(cpu, cpu_possible_mask) + per_cpu(enabled, cpu) = 0; + return cpufreq_register_governor(&cpufreq_gov_sched); +} + +/* Try to make this the default governor */ +fs_initcall(cpufreq_sched_init); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5b8a9c68f3fb..3eb5beb590a8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5064,7 +5064,7 @@ static inline unsigned long task_util(struct task_struct *p) return p->se.avg.util_avg; } -static unsigned int capacity_margin = 1280; /* ~20% margin */ +unsigned int capacity_margin = 1280; /* ~20% margin */ static inline bool __task_fits(struct task_struct *p, int cpu, int util) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fcbb3e07accc..ac28024b2d7a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1455,6 +1455,57 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) } #endif +#ifdef CONFIG_CPU_FREQ_GOV_SCHED +extern unsigned int capacity_margin; +extern struct static_key __sched_freq; + +static inline bool sched_freq(void) +{ + return static_key_false(&__sched_freq); +} + +DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); +void update_cpu_capacity_request(int cpu, bool request); + +static inline void set_cfs_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + if (per_cpu(cpu_sched_capacity_reqs, cpu).cfs != capacity) { + per_cpu(cpu_sched_capacity_reqs, cpu).cfs = capacity; + update_cpu_capacity_request(cpu, request); + } +} + +static 
inline void set_rt_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + if (per_cpu(cpu_sched_capacity_reqs, cpu).rt != capacity) { + per_cpu(cpu_sched_capacity_reqs, cpu).rt = capacity; + update_cpu_capacity_request(cpu, request); + } +} + +static inline void set_dl_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + if (per_cpu(cpu_sched_capacity_reqs, cpu).dl != capacity) { + per_cpu(cpu_sched_capacity_reqs, cpu).dl = capacity; + update_cpu_capacity_request(cpu, request); + } +} +#else +static inline bool sched_freq(void) { return false; } +static inline void set_cfs_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +static inline void set_rt_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +static inline void set_dl_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +#endif + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); From ea429ccef14f4ef33e0a1c94a0369a435ae83807 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 19 Aug 2015 19:47:12 +0100 Subject: [PATCH 135/607] sched/fair: add triggers for OPP change requests Each time a task is {en,de}queued we might need to adapt the current frequency to the new usage. Add triggers on {en,de}queue_task_fair() for this purpose. Only trigger a freq request if we are effectively waking up or going to sleep. Filter out load balancing related calls to reduce the number of triggers. [smuckle@linaro.org: resolve merge conflicts, define task_new, use renamed static key sched_freq] cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- kernel/sched/fair.c | 46 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3eb5beb590a8..844788da00be 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4144,6 +4144,21 @@ static inline void hrtick_update(struct rq *rq) } #endif +static unsigned long capacity_orig_of(int cpu); +static int cpu_util(int cpu); + +static void update_capacity_of(int cpu) +{ + unsigned long req_cap; + + if (!sched_freq()) + return; + + /* Convert scale-invariant capacity to cpu. */ + req_cap = cpu_util(cpu) * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); + set_cfs_cpu_capacity(cpu, true, req_cap); +} + static bool cpu_overutilized(int cpu); /* @@ -4193,6 +4208,20 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; + + /* + * We want to potentially trigger a freq switch + * request only for tasks that are waking up; this is + * because we get here also during load balancing, but + * in these cases it seems wise to trigger as single + * request after load balancing is done. + * + * XXX: how about fork()? Do we need a special + * flag/something to tell if we are here after a + * fork() (wakeup_task_new)? 
+ */ + if (!task_new) + update_capacity_of(cpu_of(rq)); } hrtick_update(rq); } @@ -4251,9 +4280,24 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) + if (!se) { sub_nr_running(rq, 1); + /* + * We want to potentially trigger a freq switch + * request only for tasks that are going to sleep; + * this is because we get here also during load + * balancing, but in these cases it seems wise to + * trigger as single request after load balancing is + * done. + */ + if (task_sleep) { + if (rq->cfs.nr_running) + update_capacity_of(cpu_of(rq)); + else if (sched_freq()) + set_cfs_cpu_capacity(cpu_of(rq), false, 0); + } + } hrtick_update(rq); } From 7ff814dd71331fc41d513f422da81ed65e56be47 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 26 Jun 2015 12:14:23 +0100 Subject: [PATCH 136/607] sched/{core,fair}: trigger OPP change request on fork() Patch "sched/fair: add triggers for OPP change requests" introduced OPP change triggers for enqueue_task_fair(), but the trigger was operating only for wakeups. Fact is that it makes sense to consider wakeup_new also (i.e., fork()), as we don't know anything about a newly created task and thus we most certainly want to jump to max OPP to not harm performance too much. However, it is not currently possible (or at least it wasn't evident to me how to do so :/) to tell new wakeups from other (non wakeup) operations. This patch introduces an additional flag in sched.h that is only set at fork() time and it is then consumed in enqueue_task_fair() for our purpose. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 9 +++------ kernel/sched/sched.h | 1 + 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 23e82805ec47..5c8d0dbaae5d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2387,7 +2387,7 @@ void wake_up_new_task(struct task_struct *p) #endif rq = __task_rq_lock(p); - activate_task(rq, p, 0); + activate_task(rq, p, ENQUEUE_WAKEUP_NEW); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 844788da00be..3937f79240df 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4171,7 +4171,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; - int task_new = !(flags & ENQUEUE_WAKEUP); + int task_new = flags & ENQUEUE_WAKEUP_NEW; + int task_wakeup = flags & ENQUEUE_WAKEUP; for_each_sched_entity(se) { if (se->on_rq) @@ -4215,12 +4216,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * because we get here also during load balancing, but * in these cases it seems wise to trigger as single * request after load balancing is done. - * - * XXX: how about fork()? Do we need a special - * flag/something to tell if we are here after a - * fork() (wakeup_task_new)? 
*/ - if (!task_new) + if (task_new || task_wakeup) update_capacity_of(cpu_of(rq)); } hrtick_update(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ac28024b2d7a..0052ada93ae8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1181,6 +1181,7 @@ static const u32 prio_to_wmult[40] = { #endif #define ENQUEUE_REPLENISH 0x08 #define ENQUEUE_RESTORE 0x10 +#define ENQUEUE_WAKEUP_NEW 0x20 #define DEQUEUE_SLEEP 0x01 #define DEQUEUE_SAVE 0x02 From f99e3fe6eb9cba8f645f8d00819d8007ee787e2c Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 25 Jun 2015 14:37:27 +0100 Subject: [PATCH 137/607] sched/fair: cpufreq_sched triggers for load balancing As we don't trigger freq changes from {en,de}queue_task_fair() during load balancing, we need to do explicitly so on load balancing paths. [smuckle@linaro.org: move update_capacity_of calls so rq lock is held] cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- kernel/sched/fair.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3937f79240df..950046b1b8eb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6384,6 +6384,10 @@ static void attach_one_task(struct rq *rq, struct task_struct *p) { raw_spin_lock(&rq->lock); attach_task(rq, p); + /* + * We want to potentially raise target_cpu's OPP. + */ + update_capacity_of(cpu_of(rq)); raw_spin_unlock(&rq->lock); } @@ -6405,6 +6409,11 @@ static void attach_tasks(struct lb_env *env) attach_task(env->dst_rq, p); } + /* + * We want to potentially raise env.dst_cpu's OPP. + */ + update_capacity_of(env->dst_cpu); + raw_spin_unlock(&env->dst_rq->lock); } @@ -7667,6 +7676,11 @@ more_balance: * ld_moved - cumulative load moved across iterations */ cur_ld_moved = detach_tasks(&env); + /* + * We want to potentially lower env.src_cpu's OPP. + */ + if (cur_ld_moved) + update_capacity_of(env.src_cpu); /* * We've detached some tasks from busiest_rq. Every @@ -8037,8 +8051,13 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); p = detach_one_task(&env); - if (p) + if (p) { schedstat_inc(sd, alb_pushed); + /* + * We want to potentially lower env.src_cpu's OPP. + */ + update_capacity_of(env.src_cpu); + } else schedstat_inc(sd, alb_failed); } From 6b6c1924539526c89384067fe70b9fe856c7b626 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Thu, 25 Jun 2015 14:12:33 +0100 Subject: [PATCH 138/607] sched/fair: jump to max OPP when crossing UP threshold Since the true utilization of a long running task is not detectable while it is running and might be bigger than the current cpu capacity, create the maximum cpu capacity head room by requesting the maximum cpu capacity once the cpu usage plus the capacity margin exceeds the current capacity. This is also done to try to harm the performance of a task the least. Original fair-class only version authored by Juri Lelli . 
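As an illustration of the tick-time check added here (numbers are hypothetical): on a cpu with capacity_orig = 1024 that currently runs at an OPP giving capacity_curr = 768, a CFS utilization of 640 with no RT or DL requests yields a margin-inflated demand of 640 * 1280 / 1024 = 800. Since 800 exceeds the current capacity of 768, sched_freq_tick() requests capacity_max, i.e. a jump to the highest OPP, even though the raw utilization of 640 would still fit under the current OPP.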
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- kernel/sched/core.c | 41 ++++++++++++++++++++++++++ kernel/sched/fair.c | 66 ------------------------------------------ kernel/sched/sched.h | 68 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 66 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5c8d0dbaae5d..ac3d9af477f4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2854,6 +2854,45 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } +#ifdef CONFIG_CPU_FREQ_GOV_SCHED +static unsigned long sum_capacity_reqs(unsigned long cfs_cap, + struct sched_capacity_reqs *scr) +{ + unsigned long total = cfs_cap + scr->rt; + + total = total * capacity_margin; + total /= SCHED_CAPACITY_SCALE; + total += scr->dl; + return total; +} + +static void sched_freq_tick(int cpu) +{ + struct sched_capacity_reqs *scr; + unsigned long capacity_orig, capacity_curr; + + if (!sched_freq()) + return; + + capacity_orig = capacity_orig_of(cpu); + capacity_curr = capacity_curr_of(cpu); + if (capacity_curr == capacity_orig) + return; + + /* + * To make free room for a task that is building up its "real" + * utilization and to harm its performance the least, request + * a jump to max OPP as soon as the margin of free capacity is + * impacted (specified by capacity_margin). + */ + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + if (capacity_curr < sum_capacity_reqs(cpu_util(cpu), scr)) + set_cfs_cpu_capacity(cpu, true, capacity_max); +} +#else +static inline void sched_freq_tick(int cpu) { } +#endif + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -2880,6 +2919,8 @@ void scheduler_tick(void) trigger_load_balance(rq); #endif rq_last_tick_reset(rq); + + sched_freq_tick(cpu); } #ifdef CONFIG_NO_HZ_FULL diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 950046b1b8eb..bb1d0e3d7835 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4144,9 +4144,6 @@ static inline void hrtick_update(struct rq *rq) } #endif -static unsigned long capacity_orig_of(int cpu); -static int cpu_util(int cpu); - static void update_capacity_of(int cpu) { unsigned long req_cap; @@ -4521,15 +4518,6 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static unsigned long capacity_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity; -} - -static unsigned long capacity_orig_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig; -} static unsigned long cpu_avg_load_per_task(int cpu) { @@ -4698,60 +4686,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif -/* - * Returns the current capacity of cpu after applying both - * cpu and freq scaling. - */ -static unsigned long capacity_curr_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig * - arch_scale_freq_capacity(NULL, cpu) - >> SCHED_CAPACITY_SHIFT; -} - -/* - * cpu_util returns the amount of capacity of a CPU that is used by CFS - * tasks. The unit of the return value must be the one of capacity so we can - * compare the utilization with the capacity of the CPU that is available for - * CFS task (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. 
It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). - */ -static unsigned long __cpu_util(int cpu, int delta) -{ - unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; - unsigned long capacity = capacity_orig_of(cpu); - - delta += util; - if (delta < 0) - return 0; - - return (delta >= capacity) ? capacity : delta; -} - -static unsigned long cpu_util(int cpu) -{ - return __cpu_util(cpu, 0); -} - static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0052ada93ae8..6aaff682adb8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1456,7 +1456,75 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) } #endif +#ifdef CONFIG_SMP +static inline unsigned long capacity_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity; +} + +static inline unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} + +/* + * cpu_util returns the amount of capacity of a CPU that is used by CFS + * tasks. The unit of the return value must be the one of capacity so we can + * compare the utilization with the capacity of the CPU that is available for + * CFS task (ie cpu_capacity). + * + * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on a CPU. It represents + * the amount of utilization of a CPU in the range [0..capacity_orig] where + * capacity_orig is the cpu_capacity available at the highest frequency + * (arch_scale_freq_capacity()). + * The utilization of a CPU converges towards a sum equal to or less than the + * current capacity (capacity_curr <= capacity_orig) of the CPU because it is + * the running time on this CPU scaled by capacity_curr. + * + * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even + * higher than capacity_orig because of unfortunate rounding in + * cfs.avg.util_avg or just after migrating tasks and new task wakeups until + * the average stabilizes with the new running time. We need to check that the + * utilization stays within the range of [0..capacity_orig] and cap it if + * necessary. Without utilization capping, a group could be seen as overloaded + * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. 
We allow utilization to overshoot capacity_curr (but not + * capacity_orig) as it useful for predicting the capacity required after task + * migrations (scheduler-driven DVFS). + */ +static inline unsigned long __cpu_util(int cpu, int delta) +{ + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long capacity = capacity_orig_of(cpu); + + delta += util; + if (delta < 0) + return 0; + + return (delta >= capacity) ? capacity : delta; +} + +static inline unsigned long cpu_util(int cpu) +{ + return __cpu_util(cpu, 0); +} + +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +static inline unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + +#endif + #ifdef CONFIG_CPU_FREQ_GOV_SCHED +#define capacity_max SCHED_CAPACITY_SCALE extern unsigned int capacity_margin; extern struct static_key __sched_freq; From 9d44dc7fe29c4b925ddbbd6a2551929da28dfbfb Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Wed, 25 Nov 2015 15:59:25 -0800 Subject: [PATCH 139/607] sched/cpufreq_sched: add trace events Trace events will aid in debugging, profiling and tuning. Signed-off-by: Steve Muckle --- include/trace/events/cpufreq_sched.h | 87 ++++++++++++++++++++++++++++ kernel/sched/cpufreq_sched.c | 9 +++ 2 files changed, 96 insertions(+) create mode 100644 include/trace/events/cpufreq_sched.h diff --git a/include/trace/events/cpufreq_sched.h b/include/trace/events/cpufreq_sched.h new file mode 100644 index 000000000000..a46cd088e969 --- /dev/null +++ b/include/trace/events/cpufreq_sched.h @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2015 Steve Muckle + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cpufreq_sched + +#if !defined(_TRACE_CPUFREQ_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_CPUFREQ_SCHED_H + +#include +#include + +TRACE_EVENT(cpufreq_sched_throttled, + TP_PROTO(unsigned int rem), + TP_ARGS(rem), + TP_STRUCT__entry( + __field( unsigned int, rem) + ), + TP_fast_assign( + __entry->rem = rem; + ), + TP_printk("throttled - %d usec remaining", __entry->rem) +); + +TRACE_EVENT(cpufreq_sched_request_opp, + TP_PROTO(int cpu, + unsigned long capacity, + unsigned int freq_new, + unsigned int requested_freq), + TP_ARGS(cpu, capacity, freq_new, requested_freq), + TP_STRUCT__entry( + __field( int, cpu) + __field( unsigned long, capacity) + __field( unsigned int, freq_new) + __field( unsigned int, requested_freq) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->capacity = capacity; + __entry->freq_new = freq_new; + __entry->requested_freq = requested_freq; + ), + TP_printk("cpu %d cap change, cluster cap request %ld => OPP %d " + "(cur %d)", + __entry->cpu, __entry->capacity, __entry->freq_new, + __entry->requested_freq) +); + +TRACE_EVENT(cpufreq_sched_update_capacity, + TP_PROTO(int cpu, + bool request, + struct sched_capacity_reqs *scr, + unsigned long new_capacity), + TP_ARGS(cpu, request, scr, new_capacity), + TP_STRUCT__entry( + __field( int, cpu) + __field( bool, request) + __field( unsigned long, cfs) + __field( unsigned long, rt) + __field( unsigned long, dl) + __field( unsigned long, total) + __field( unsigned long, new_total) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->request = request; + __entry->cfs = scr->cfs; + __entry->rt = scr->rt; + __entry->dl = scr->dl; + __entry->total = scr->total; + __entry->new_total = new_capacity; + ), + TP_printk("cpu=%d set_cap=%d cfs=%ld rt=%ld dl=%ld old_tot=%ld " + "new_tot=%ld", + __entry->cpu, __entry->request, __entry->cfs, __entry->rt, + __entry->dl, __entry->total, __entry->new_total) +); + +#endif /* _TRACE_CPUFREQ_SCHED_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index 58bca8d2ca65..5afe56a82491 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -14,6 +14,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + #include "sched.h" #define THROTTLE_NSEC 50000000 /* 50ms default */ @@ -78,6 +81,7 @@ static bool finish_last_request(struct gov_data *gd) int usec_left = ktime_to_ns(ktime_sub(gd->throttle, now)); usec_left /= NSEC_PER_USEC; + trace_cpufreq_sched_throttled(usec_left); usleep_range(usec_left, usec_left + 100); now = ktime_get(); if (ktime_after(now, gd->throttle)) @@ -186,6 +190,9 @@ static void update_fdomain_capacity_request(int cpu) goto out; freq_new = policy->freq_table[index_new].frequency; + trace_cpufreq_sched_request_opp(cpu, capacity, freq_new, + gd->requested_freq); + if (freq_new == gd->requested_freq) goto out; @@ -222,6 +229,8 @@ void update_cpu_capacity_request(int cpu, bool request) if (new_capacity == scr->total) return; + trace_cpufreq_sched_update_capacity(cpu, request, scr, new_capacity); + scr->total = new_capacity; if (request) update_fdomain_capacity_request(cpu); From cd248fa758ba2358d4dbf612090e6bbc075c46b1 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 20 Oct 2015 10:46:26 +0200 Subject: [PATCH 140/607] sched: remove call of sched_avg_update from sched_rt_avg_update rt_avg is only used to scale the available CPU's capacity for CFS tasks. 
As the update of this scaling is done during periodic load balance, we only have to ensure that sched_avg_update has been called before any periodic load balancing. This requirement is already fulfilled by __update_cpu_load so the call in sched_rt_avg_update, which is part of the hotpath, is useless. Signed-off-by: Vincent Guittot Signed-off-by: Steve Muckle --- kernel/sched/sched.h | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6aaff682adb8..770e119ca692 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1578,7 +1578,6 @@ static inline void set_dl_cpu_capacity(int cpu, bool request, static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); - sched_avg_update(rq); } #else static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } From fab5cc59bf31aac4bcb222c7d1013a033cb8ef70 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 3 Nov 2015 10:39:01 +0100 Subject: [PATCH 141/607] sched: deadline: use deadline bandwidth in scale_rt_capacity Instead of monitoring the exec time of deadline tasks to evaluate the CPU capacity consumed by deadline scheduler class, we can directly calculate it thanks to the sum of utilization of deadline tasks on the CPU. We can remove deadline tasks from rt_avg metric and directly use the average bandwidth of deadline scheduler in scale_rt_capacity. Based in part on a similar patch from Luca Abeni . Signed-off-by: Vincent Guittot Signed-off-by: Steve Muckle --- kernel/sched/deadline.c | 33 +++++++++++++++++++++++++++++++-- kernel/sched/fair.c | 8 ++++++++ kernel/sched/sched.h | 2 ++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 8b0a15e285f9..9d9eb50d4059 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,6 +43,24 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +static void add_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + u64 se_bw = dl_se->dl_bw; + + dl_rq->avg_bw += se_bw; +} + +static void clear_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + u64 se_bw = dl_se->dl_bw; + + dl_rq->avg_bw -= se_bw; + if (dl_rq->avg_bw < 0) { + WARN_ON(1); + dl_rq->avg_bw = 0; + } +} + static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; @@ -494,6 +512,9 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); + if (dl_se->dl_new) + add_average_bw(dl_se, dl_rq); + /* * The arrival of a new instance needs special treatment, i.e., * the actual scheduling parameters have to be "renewed". @@ -741,8 +762,6 @@ static void update_curr_dl(struct rq *rq) curr->se.exec_start = rq_clock_task(rq); cpuacct_charge(curr, delta_exec); - sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; if (dl_runtime_exceeded(dl_se)) { dl_se->dl_throttled = 1; @@ -1241,6 +1260,8 @@ static void task_fork_dl(struct task_struct *p) static void task_dead_dl(struct task_struct *p) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + struct dl_rq *dl_rq = dl_rq_of_se(&p->dl); + struct rq *rq = rq_of_dl_rq(dl_rq); /* * Since we are TASK_DEAD we won't slip out of the domain! 
@@ -1249,6 +1270,8 @@ static void task_dead_dl(struct task_struct *p) /* XXX we should retain the bw until 0-lag */ dl_b->total_bw -= p->dl.dl_bw; raw_spin_unlock_irq(&dl_b->lock); + + clear_average_bw(&p->dl, &rq->dl); } static void set_curr_task_dl(struct rq *rq) @@ -1556,7 +1579,9 @@ retry: } deactivate_task(rq, next_task, 0); + clear_average_bw(&next_task->dl, &rq->dl); set_task_cpu(next_task, later_rq->cpu); + add_average_bw(&next_task->dl, &later_rq->dl); activate_task(later_rq, next_task, 0); ret = 1; @@ -1644,7 +1669,9 @@ static void pull_dl_task(struct rq *this_rq) resched = true; deactivate_task(src_rq, p, 0); + clear_average_bw(&p->dl, &src_rq->dl); set_task_cpu(p, this_cpu); + add_average_bw(&p->dl, &this_rq->dl); activate_task(this_rq, p, 0); dmin = p->dl.deadline; @@ -1750,6 +1777,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (!start_dl_timer(p)) __dl_clear_params(p); + clear_average_bw(&p->dl, &rq->dl); + /* * Since this might be the only -deadline task on the rq, * this is the right place to try to pull some other one diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bb1d0e3d7835..651e160cdd67 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6550,6 +6550,14 @@ static unsigned long scale_rt_capacity(int cpu) used = div_u64(avg, total); + /* + * deadline bandwidth is defined at system level so we must + * weight this bandwidth with the max capacity of the system. + * As a reminder, avg_bw is 20bits width and + * scale_cpu_capacity is 10 bits width + */ + used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu)); + if (likely(used < SCHED_CAPACITY_SCALE)) return SCHED_CAPACITY_SCALE - used; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 770e119ca692..983c132af11b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -506,6 +506,8 @@ struct dl_rq { #else struct dl_bw dl_bw; #endif + /* This is the "average utilization" for this runqueue */ + s64 avg_bw; }; #ifdef CONFIG_SMP From 6e1e1ed8720652305b77b6d01f80276ef076aff0 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 26 Oct 2015 18:14:50 +0100 Subject: [PATCH 142/607] sched: rt scheduler sets capacity requirement RT tasks don't provide any running constraints like deadline ones except their running priority. The only current usable input to estimate the capacity needed by RT tasks is the rt_avg metric. We use it to estimate the CPU capacity needed for the RT scheduler class. In order to monitor the evolution for RT task load, we must peridiocally check it during the tick. Then, we use the estimated capacity of the last activity to estimate the next one which can not be that accurate but is a good starting point without any impact on the wake up path of RT tasks. Signed-off-by: Vincent Guittot Signed-off-by: Steve Muckle --- kernel/sched/rt.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8ec86abe0ea1..9694204660b7 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1426,6 +1426,41 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag #endif } +#ifdef CONFIG_SMP +static void sched_rt_update_capacity_req(struct rq *rq) +{ + u64 total, used, age_stamp, avg; + s64 delta; + + if (!sched_freq()) + return; + + sched_avg_update(rq); + /* + * Since we're reading these variables without serialization make sure + * we read them once before doing sanity checks on them. 
+ */ + age_stamp = READ_ONCE(rq->age_stamp); + avg = READ_ONCE(rq->rt_avg); + delta = rq_clock(rq) - age_stamp; + + if (unlikely(delta < 0)) + delta = 0; + + total = sched_avg_period() + delta; + + used = div_u64(avg, total); + if (unlikely(used > SCHED_CAPACITY_SCALE)) + used = SCHED_CAPACITY_SCALE; + + set_rt_cpu_capacity(rq->cpu, 1, (unsigned long)(used)); +} +#else +static inline void sched_rt_update_capacity_req(struct rq *rq) +{ } + +#endif + static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, struct rt_rq *rt_rq) { @@ -1494,8 +1529,17 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) if (prev->sched_class == &rt_sched_class) update_curr_rt(rq); - if (!rt_rq->rt_queued) + if (!rt_rq->rt_queued) { + /* + * The next task to be picked on this rq will have a lower + * priority than rt tasks so we can spend some time to update + * the capacity used by rt tasks based on the last activity. + * This value will be the used as an estimation of the next + * activity. + */ + sched_rt_update_capacity_req(rq); return NULL; + } put_prev_task(rq, prev); @@ -2212,6 +2256,9 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) update_curr_rt(rq); + if (rq->rt.rt_nr_running) + sched_rt_update_capacity_req(rq); + watchdog(rq, p); /* From 3eb29105cfe14719e0b61c782efd4b3b509d786d Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 11 Dec 2015 11:55:51 +0000 Subject: [PATCH 143/607] fixup! sched: scheduler-driven cpu frequency selection Signed-off-by: Juri Lelli --- kernel/sched/cpufreq_sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index 5afe56a82491..e1d208e101ed 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -119,9 +119,9 @@ static int cpufreq_sched_thread(void *data) } do { - set_current_state(TASK_INTERRUPTIBLE); new_request = gd->requested_freq; if (new_request == last_request) { + set_current_state(TASK_INTERRUPTIBLE); schedule(); } else { /* From 08d1cfd7c9193272e6caa100cbc5ef875cb5543c Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 11 Dec 2015 11:58:05 +0000 Subject: [PATCH 144/607] fixup! sched/fair: jump to max OPP when crossing UP threshold Signed-off-by: Juri Lelli --- kernel/sched/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ac3d9af477f4..151ed9c62ad2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2910,6 +2910,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); calc_global_load_tick(rq); + sched_freq_tick(cpu); raw_spin_unlock(&rq->lock); perf_event_task_tick(); @@ -2919,8 +2920,6 @@ void scheduler_tick(void) trigger_load_balance(rq); #endif rq_last_tick_reset(rq); - - sched_freq_tick(cpu); } #ifdef CONFIG_NO_HZ_FULL From 8aaf022dd7e5390b3e95fe92a8f49b1de949a687 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 30 Jun 2015 12:03:26 +0100 Subject: [PATCH 145/607] sched/tune: add detailed documentation The topic of a single simple power-performance tunable, that is wholly scheduler centric, and has well defined and predictable properties has come up on several occasions in the past. With techniques such as a scheduler driven DVFS, we now have a good framework for implementing such a tunable. This patch provides a detailed description of the motivations and design decisions behind the implementation of the SchedTune. 
cc: Jonathan Corbet cc: linux-doc@vger.kernel.org Signed-off-by: Patrick Bellasi --- Documentation/scheduler/sched-tune.txt | 366 +++++++++++++++++++++++++ 1 file changed, 366 insertions(+) create mode 100644 Documentation/scheduler/sched-tune.txt diff --git a/Documentation/scheduler/sched-tune.txt b/Documentation/scheduler/sched-tune.txt new file mode 100644 index 000000000000..9bd2231c01b1 --- /dev/null +++ b/Documentation/scheduler/sched-tune.txt @@ -0,0 +1,366 @@ + Central, scheduler-driven, power-performance control + (EXPERIMENTAL) + +Abstract +======== + +The topic of a single simple power-performance tunable, that is wholly +scheduler centric, and has well defined and predictable properties has come up +on several occasions in the past [1,2]. With techniques such as a scheduler +driven DVFS [3], we now have a good framework for implementing such a tunable. +This document describes the overall ideas behind its design and implementation. + + +Table of Contents +================= + +1. Motivation +2. Introduction +3. Signal Boosting Strategy +4. OPP selection using boosted CPU utilization +5. Per task group boosting +6. Question and Answers + - What about "auto" mode? + - What about boosting on a congested system? + - How CPUs are boosted when we have tasks with multiple boost values? +7. References + + +1. Motivation +============= + +Sched-DVFS [3] is a new event-driven cpufreq governor which allows the +scheduler to select the optimal DVFS operating point (OPP) for running a task +allocated to a CPU. The introduction of sched-DVFS enables running workloads at +the most energy efficient OPPs. + +However, sometimes it may be desired to intentionally boost the performance of +a workload even if that could imply a reasonable increase in energy +consumption. For example, in order to reduce the response time of a task, we +may want to run the task at a higher OPP than the one that is actually required +by it's CPU bandwidth demand. + +This last requirement is especially important if we consider that one of the +main goals of the sched-DVFS component is to replace all currently available +CPUFreq policies. Since sched-DVFS is event based, as opposed to the sampling +driven governors we currently have, it is already more responsive at selecting +the optimal OPP to run tasks allocated to a CPU. However, just tracking the +actual task load demand may not be enough from a performance standpoint. For +example, it is not possible to get behaviors similar to those provided by the +"performance" and "interactive" CPUFreq governors. + +This document describes an implementation of a tunable, stacked on top of the +sched-DVFS which extends its functionality to support task performance +boosting. + +By "performance boosting" we mean the reduction of the time required to +complete a task activation, i.e. the time elapsed from a task wakeup to its +next deactivation (e.g. because it goes back to sleep or it terminates). For +example, if we consider a simple periodic task which executes the same workload +for 5[s] every 20[s] while running at a certain OPP, a boosted execution of +that task must complete each of its activations in less than 5[s]. + +A previous attempt [5] to introduce such a boosting feature has not been +successful mainly because of the complexity of the proposed solution. The +approach described in this document exposes a single simple interface to +user-space. 
This single tunable knob allows the tuning of system wide +scheduler behaviours ranging from energy efficiency at one end through to +incremental performance boosting at the other end. This first tunable affects +all tasks. However, a more advanced extension of the concept is also provided +which uses CGroups to boost the performance of only selected tasks while using +the energy efficient default for all others. + +The rest of this document introduces in more detail the proposed solution +which has been named SchedTune. + + +2. Introduction +=============== + +SchedTune exposes a simple user-space interface with a single power-performance +tunable: + + /proc/sys/kernel/sched_cfs_boost + +This permits expressing a boost value as an integer in the range [0..100]. + +A value of 0 (default) configures the CFS scheduler for maximum energy +efficiency. This means that sched-DVFS runs the tasks at the minimum OPP +required to satisfy their workload demand. +A value of 100 configures the scheduler for maximum performance, which translates +to the selection of the maximum OPP on that CPU. + +Values in between 0 and 100 can be used to suit other scenarios, for +example to favour interactive response or to react to other system events +(battery level, etc.). + +A CGroup based extension is also provided, which permits further user-space +defined task classification to tune the scheduler for different goals depending +on the specific nature of the task, e.g. background vs interactive vs +low-priority. + +The overall design of the SchedTune module is built on top of "Per-Entity Load +Tracking" (PELT) signals and sched-DVFS by introducing a bias on the Operating +Performance Point (OPP) selection. +Each time a task is allocated on a CPU, sched-DVFS has the opportunity to tune +the operating frequency of that CPU to better match the workload demand. The +selection of the actual OPP being activated is influenced by the global boost +value, or the boost value for the task CGroup when in use. + +This simple biasing approach leverages existing frameworks, which means minimal +modifications to the scheduler, and yet it makes it possible to achieve a range of +different behaviours, all from a single simple tunable knob. +The only new concept introduced is that of signal boosting. + + +3. Signal Boosting Strategy +=========================== + +The whole PELT machinery works based on the value of a few load tracking signals +which basically track the CPU bandwidth requirements for tasks and the capacity +of CPUs. The basic idea behind the SchedTune knob is to artificially inflate +some of these load tracking signals to make a task or RQ appear more demanding +than it actually is. + +Which signals have to be inflated depends on the specific "consumer". However, +independently of the specific (signal, consumer) pair, it is important to +define a simple and possibly consistent strategy for the concept of boosting a +signal. + +A boosting strategy defines how the "abstract" user-space defined +sched_cfs_boost value is translated into an internal "margin" value to be added +to a signal to get its inflated value: + + margin := boosting_strategy(sched_cfs_boost, signal) + boosted_signal := signal + margin + +Different boosting strategies were identified and analyzed before selecting the +one found to be most effective. 
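+As a minimal sketch of the scheme above (illustration only, not part of the
+document's code; the type and function names are invented for the example),
+any concrete strategy reduces to a function of the boost value and the
+original signal:
+
+    typedef unsigned long (*boost_strategy_fn)(unsigned long boost_pct,
+                                               unsigned long signal);
+
+    /* boosted_signal := signal + boosting_strategy(sched_cfs_boost, signal) */
+    static unsigned long boosted(boost_strategy_fn strategy,
+                                 unsigned long boost_pct, unsigned long signal)
+    {
+            return signal + strategy(boost_pct, signal);
+    }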
+ +Signal Proportional Compensation (SPC) +-------------------------------------- + +In this boosting strategy the sched_cfs_boost value is used to compute a +margin which is proportional to the complement of the original signal. +When a signal has a maximum possible value, its complement is defined as +the delta from the actual value and its possible maximum. + +Since the tunable implementation uses signals which have SCHED_LOAD_SCALE as +the maximum possible value, the margin becomes: + + margin := sched_cfs_boost * (SCHED_LOAD_SCALE - signal) + +Using this boosting strategy: +- a 100% sched_cfs_boost means that the signal is scaled to the maximum value +- each value in the range of sched_cfs_boost effectively inflates the signal in + question by a quantity which is proportional to the maximum value. + +For example, by applying the SPC boosting strategy to the selection of the OPP +to run a task it is possible to achieve these behaviors: + +- 0% boosting: run the task at the minimum OPP required by its workload +- 100% boosting: run the task at the maximum OPP available for the CPU +- 50% boosting: run at the half-way OPP between minimum and maximum + +Which means that, at 50% boosting, a task will be scheduled to run at half of +the maximum theoretically achievable performance on the specific target +platform. + +A graphical representation of an SPC boosted signal is represented in the +following figure where: + a) "-" represents the original signal + b) "b" represents a 50% boosted signal + c) "p" represents a 100% boosted signal + + + ^ + | SCHED_LOAD_SCALE + +-----------------------------------------------------------------+ + |pppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp + | + | boosted_signal + | bbbbbbbbbbbbbbbbbbbbbbbb + | + | original signal + | bbbbbbbbbbbbbbbbbbbbbbbb+----------------------+ + | | + |bbbbbbbbbbbbbbbbbb | + | | + | | + | | + | +-----------------------+ + | | + | | + | | + |------------------+ + | + | + +-----------------------------------------------------------------------> + +The plot above shows a ramped load signal (titled 'original_signal') and it's +boosted equivalent. For each step of the original signal the boosted signal +corresponding to a 50% boost is midway from the original signal and the upper +bound. Boosting by 100% generates a boosted signal which is always saturated to +the upper bound. + + +4. OPP selection using boosted CPU utilization +============================================== + +It is worth calling out that the implementation does not introduce any new load +signals. Instead, it provides an API to tune existing signals. This tuning is +done on demand and only in scheduler code paths where it is sensible to do so. +The new API calls are defined to return either the default signal or a boosted +one, depending on the value of sched_cfs_boost. This is a clean an non invasive +modification of the existing existing code paths. + +The signal representing a CPU's utilization is boosted according to the +previously described SPC boosting strategy. To sched-DVFS, this allows a CPU +(ie CFS run-queue) to appear more used then it actually is. + +Thus, with the sched_cfs_boost enabled we have the following main functions to +get the current utilization of a CPU: + + cpu_util() + boosted_cpu_util() + +The new boosted_cpu_util() is similar to the first but returns a boosted +utilization signal which is a function of the sched_cfs_boost value. 
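+For illustration, a minimal sketch of the relation described above, assuming
+SCHED_LOAD_SCALE == 1024 and a boost expressed as a percentage (the in-kernel
+code uses fixed-point arithmetic and per-CPU boost aggregation; this only
+shows the SPC shape):
+
+    #define SCHED_LOAD_SCALE 1024
+
+    static unsigned long spc_margin(unsigned long util, unsigned long boost_pct)
+    {
+            return boost_pct * (SCHED_LOAD_SCALE - util) / 100;
+    }
+
+    static unsigned long boosted_cpu_util_sketch(unsigned long util,
+                                                 unsigned long boost_pct)
+    {
+            return util + spc_margin(util, boost_pct);
+    }
+
+    /* e.g. util = 256, boost = 50%:  256 + (1024 - 256) / 2 = 640 */
+
+The actual boosted_cpu_util() has the same shape but operates on the kernel's
+utilization signals.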
+ +This function is used in the CFS scheduler code paths where sched-DVFS needs to +decide the OPP to run a CPU at. +For example, this allows selecting the highest OPP for a CPU which has +the boost value set to 100%. + + +5. Per task group boosting +========================== + +The availability of a single knob which is used to boost all tasks in the +system is certainly a simple solution but it quite likely doesn't fit many +utilization scenarios, especially in the mobile device space. + +For example, on battery powered devices there usually are many background +services which are long running and need energy efficient scheduling. On the +other hand, some applications are more performance sensitive and require an +interactive response and/or maximum performance, regardless of the energy cost. +To better service such scenarios, the SchedTune implementation has an extension +that provides a more fine grained boosting interface. + +A new CGroup controller, namely "schedtune", could be enabled which allows to +defined and configure task groups with different boosting values. +Tasks that require special performance can be put into separate CGroups. +The value of the boost associated with the tasks in this group can be specified +using a single knob exposed by the CGroup controller: + + schedtune.boost + +This knob allows the definition of a boost value that is to be used for +SPC boosting of all tasks attached to this group. + +The current schedtune controller implementation is really simple and has these +main characteristics: + + 1) It is only possible to create 1 level depth hierarchies + + The root control groups define the system-wide boost value to be applied + by default to all tasks. Its direct subgroups are named "boost groups" and + they define the boost value for specific set of tasks. + Further nested subgroups are not allowed since they do not have a sensible + meaning from a user-space standpoint. + + 2) It is possible to define only a limited number of "boost groups" + + This number is defined at compile time and by default configured to 16. + This is a design decision motivated by two main reasons: + a) In a real system we do not expect utilization scenarios with more then few + boost groups. For example, a reasonable collection of groups could be + just "background", "interactive" and "performance". + b) It simplifies the implementation considerably, especially for the code + which has to compute the per CPU boosting once there are multiple + RUNNABLE tasks with different boost values. + +Such a simple design should allow servicing the main utilization scenarios identified +so far. It provides a simple interface which can be used to manage the +power-performance of all tasks or only selected tasks. +Moreover, this interface can be easily integrated by user-space run-times (e.g. +Android, ChromeOS) to implement a QoS solution for task boosting based on tasks +classification, which has been a long standing requirement. + +Setup and usage +--------------- + +0. Use a kernel with CGROUP_SCHEDTUNE support enabled + +1. Check that the "schedtune" CGroup controller is available: + + root@linaro-nano:~# cat /proc/cgroups + #subsys_name hierarchy num_cgroups enabled + cpuset 0 1 1 + cpu 0 1 1 + schedtune 0 1 1 + +2. Mount a tmpfs to create the CGroups mount point (Optional) + + root@linaro-nano:~# sudo mount -t tmpfs cgroups /sys/fs/cgroup + +3. 
Mount the "schedtune" controller + + root@linaro-nano:~# mkdir /sys/fs/cgroup/stune + root@linaro-nano:~# sudo mount -t cgroup -o schedtune stune /sys/fs/cgroup/stune + +4. Setup the system-wide boost value (Optional) + + If not configured, the root control group has a 0% boost value, which + basically disables boosting for all tasks in the system, thus running in + an energy-efficient mode. + + root@linaro-nano:~# echo $SYSBOOST > /sys/fs/cgroup/stune/schedtune.boost + +5. Create task groups and configure their specific boost value (Optional) + + For example, here we create a "performance" boost group configured to boost + all its tasks to 100%: + + root@linaro-nano:~# mkdir /sys/fs/cgroup/stune/performance + root@linaro-nano:~# echo 100 > /sys/fs/cgroup/stune/performance/schedtune.boost + +6. Move tasks into the boost group + + For example, the following moves the task with PID $TASKPID (and all its + threads) into the "performance" boost group: + + root@linaro-nano:~# echo $TASKPID > /sys/fs/cgroup/stune/performance/cgroup.procs + +This simple configuration allows only the threads of the $TASKPID task to run, +when needed, at the highest OPP on the most capable CPU of the system. + + +6. Question and Answers +======================= + +What about "auto" mode? +----------------------- + +The 'auto' mode as described in [5] can be implemented by interfacing SchedTune +with some suitable user-space element. This element could use the exposed +system-wide or cgroup based interface. + +How are multiple groups of tasks with different boost values managed? +--------------------------------------------------------------------- + +The current SchedTune implementation keeps track of the boosted RUNNABLE tasks +on a CPU. Once sched-DVFS selects the OPP to run a CPU at, the CPU utilization +is boosted with a value which is the maximum of the boost values of the +currently RUNNABLE tasks in its RQ. + +This allows sched-DVFS to boost a CPU only while there are boosted tasks ready +to run and switch back to the energy efficient mode as soon as the last boosted +task is dequeued. + + +7. References +============= +[1] http://lwn.net/Articles/552889 +[2] http://lkml.org/lkml/2012/5/18/91 +[3] http://lkml.org/lkml/2015/6/26/620 From 344f4ec4293b75d83282eb13ede965995e5f312f Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 22 Jun 2015 18:11:44 +0100 Subject: [PATCH 146/607] sched/tune: add sysctl interface to define a boost value The current (CFS) scheduler implementation does not allow "boosting" task performance by running tasks at a higher OPP than the minimum required to meet their workload demands. To support task performance boosting, the scheduler should provide a "knob" which allows tuning how much the system is optimised for energy efficiency vs performance. This patch is the first of a series which provides a simple interface to define a tuning knob. One system-wide "boost" tunable is exposed via: /proc/sys/kernel/sched_cfs_boost which can be configured in the range [0..100], to define a percentage where: - 0% boost means operating in "standard" mode, scheduling tasks at the minimum capacity required by their workload demand - 100% boost means pushing task performance to the maximum, "regardless" of the incurred energy consumption A boost value in between these two boundaries is used to bias the power/performance trade-off: the higher the boost value, the more the scheduler is biased toward performance boosting instead of energy efficiency. 
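For illustration, a user-space consumer could drive this knob as follows (a sketch only, assuming standard C and root privileges; not part of this patch):

    #include <stdio.h>

    /* Set the system-wide CFS boost percentage (accepted range: 0..100). */
    static int set_cfs_boost(int boost_pct)
    {
            FILE *f = fopen("/proc/sys/kernel/sched_cfs_boost", "w");

            if (!f)
                    return -1;
            fprintf(f, "%d\n", boost_pct);
            return fclose(f);
    }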
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- include/linux/sched/sysctl.h | 16 ++++++++++++++++ init/Kconfig | 26 ++++++++++++++++++++++++++ kernel/sched/Makefile | 1 + kernel/sched/tune.c | 16 ++++++++++++++++ kernel/sysctl.c | 11 +++++++++++ 5 files changed, 70 insertions(+) create mode 100644 kernel/sched/tune.c diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index c9e4731cf10b..4479e48c7712 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -77,6 +77,22 @@ extern int sysctl_sched_rt_runtime; extern unsigned int sysctl_sched_cfs_bandwidth_slice; #endif +#ifdef CONFIG_SCHED_TUNE +extern unsigned int sysctl_sched_cfs_boost; +int sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, + loff_t *ppos); +static inline unsigned int get_sysctl_sched_cfs_boost(void) +{ + return sysctl_sched_cfs_boost; +} +#else +static inline unsigned int get_sysctl_sched_cfs_boost(void) +{ + return 0; +} +#endif + #ifdef CONFIG_SCHED_AUTOGROUP extern unsigned int sysctl_sched_autogroup_enabled; #endif diff --git a/init/Kconfig b/init/Kconfig index 235c7a2c0d20..83d8c02e7a57 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1237,6 +1237,32 @@ config SCHED_AUTOGROUP desktop applications. Task group autogeneration is currently based upon task session. +config SCHED_TUNE + bool "Boosting for CFS tasks (EXPERIMENTAL)" + help + This option enables the system-wide support for task boosting. + When this support is enabled a new sysctl interface is exposed to + userspace via: + /proc/sys/kernel/sched_cfs_boost + which allows to set a system-wide boost value in range [0..100]. + + The currently boosting strategy is implemented in such a way that: + - a 0% boost value requires to operate in "standard" mode by + scheduling all tasks at the minimum capacities required by their + workload demand + - a 100% boost value requires to push at maximum the task + performances, "regardless" of the incurred energy consumption + + A boost value in between these two boundaries is used to bias the + power/performance trade-off, the higher the boost value the more the + scheduler is biased toward performance boosting instead of energy + efficiency. + + Since this support exposes a single system-wide knob, the specified + boost value is applied to all (CFS) tasks in the system. + + If unsure, say N. 
+ config SYSFS_DEPRECATED bool "Enable deprecated sysfs features to support old userspace tools" depends on SYSFS diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 0eabc9db4c3d..c6a85f813dfd 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -18,5 +18,6 @@ obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o +obj-$(CONFIG_SCHED_TUNE) += tune.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c new file mode 100644 index 000000000000..a93af9c2f267 --- /dev/null +++ b/kernel/sched/tune.c @@ -0,0 +1,16 @@ +#include "sched.h" + +unsigned int sysctl_sched_cfs_boost __read_mostly; + +int +sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + return 0; +} diff --git a/kernel/sysctl.c b/kernel/sysctl.c index dc6858d6639e..827ba256fdad 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -434,6 +434,17 @@ static struct ctl_table kern_table[] = { .extra1 = &one, }, #endif +#ifdef CONFIG_SCHED_TUNE + { + .procname = "sched_cfs_boost", + .data = &sysctl_sched_cfs_boost, + .maxlen = sizeof(sysctl_sched_cfs_boost), + .mode = 0644, + .proc_handler = &sysctl_sched_cfs_boost_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", From 6ed27145013d97e49d7e71a56601033cccee743c Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 22 Jun 2015 18:32:36 +0100 Subject: [PATCH 147/607] sched/fair: add function to convert boost value into "margin" The basic idea of the boost knob is to "artificially inflate" a signal to make a task or logical CPU appears more demanding than it actually is. Independently from the specific signal, a consistent and possibly simple semantic for the concept of "signal boosting" must define: 1. how we translate the boost percentage into a "margin" value to be added to the original signal to inflate 2. what is the meaning of a boost value from a user-space perspective This patch provides the implementation of a possible boost semantic, named "Signal Proportional Compensation" (SPC), where the boost percentage (BP) is used to compute a margin (M) which is proportional to the complement of the original signal (OS): M = BP * (SCHED_LOAD_SCALE - OS) The computed margin then added to the OS to obtain the Boosted Signal (BS) BS = OS + M The proposed boost semantic has these main features: - each signal gets a boost which is proportional to its delta with respect to the maximum available capacity in the system (i.e. SCHED_LOAD_SCALE) - a 100% boosting has a clear understanding from a user-space perspective, since it means simply to run (possibly) "all" tasks at the max OPP - each boosting value means to improve the task performance by a quantity which is proportional to the maximum achievable performance on that system Thus this semantics is somehow forcing a behaviour which is: 50% boosting means to run at half-way between the current and the maximum performance which a task could achieve on that system This patch provides the code to implement a fast integer division to convert a boost percentage (BP) value into a margin (M). 
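For reference, the multiply-and-shift pair used below replaces a division by 100: 1311 / 2^17 = 0.0100021..., i.e. an error of roughly 0.02%, within the stated 0.1% precision target. A compact sketch of the computation, assuming SCHED_LOAD_SCALE == 1024:

    static unsigned long long spc_margin_fast(unsigned long long signal,
                                              unsigned long long boost_pct)
    {
            unsigned long long margin = (1024 - signal) * boost_pct;

            margin *= 1311;         /* multiply ...            */
            margin >>= 17;          /* ... and shift: ~= / 100 */

            return margin;
    }

    /* e.g. signal = 0, boost = 100%: (1024 * 100 * 1311) >> 17 = 1024 */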
NOTE: this code is suitable for all signals operating in range [0..SCHED_LOAD_SCALE] cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 651e160cdd67..1b6a57837433 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5074,6 +5074,44 @@ static bool cpu_overutilized(int cpu) return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); } +#ifdef CONFIG_SCHED_TUNE + +static unsigned long +schedtune_margin(unsigned long signal, unsigned long boost) +{ + unsigned long long margin = 0; + + /* + * Signal proportional compensation (SPC) + * + * The Boost (B) value is used to compute a Margin (M) which is + * proportional to the complement of the original Signal (S): + * M = B * (SCHED_LOAD_SCALE - S) + * The obtained M could be used by the caller to "boost" S. + */ + margin = SCHED_LOAD_SCALE - signal; + margin *= boost; + + /* + * Fast integer division by constant: + * Constant : (C) = 100 + * Precision : 0.1% (P) = 0.1 + * Reference : C * 100 / P (R) = 100000 + * + * Thus: + * Shift bits : ceil(log(R,2)) (S) = 17 + * Mult const : round(2^S/C) (M) = 1311 + * + * + */ + margin *= 1311; + margin >>= 17; + + return margin; +} + +#endif /* CONFIG_SCHED_TUNE */ + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. From 07e22949de7116dfac9098a46109c30776ead625 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 26 Jun 2015 09:55:06 +0100 Subject: [PATCH 148/607] sched/fair: add boosted CPU usage The CPU usage signal is used by the scheduler as an estimation of the overall bandwidth currently allocated on a CPU. When SchedDVFS is in use, this signal affects the selection of the operating points (OPP) required to accommodate all the workload allocated in a CPU. A convenient way to boost the performance of tasks running on a CPU, which is also little intrusive, is to boost the CPU usage signal each time it is used to select an OPP. This patch introduces a new function: get_boosted_cpu_usage(cpu) to return a boosted value for the usage of a specified CPU. The margin added to the original usage is: 1. computed based on the "boosting strategy" in use 2. proportional to the system-wide boost value defined by provided user-space interface The boosted signal is used by SchedDVFS (transparently) each time it requires to get an estimation of the capacity required for a CPU. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1b6a57837433..eff548fae43e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4144,6 +4144,8 @@ static inline void hrtick_update(struct rq *rq) } #endif +static inline unsigned long boosted_cpu_util(int cpu); + static void update_capacity_of(int cpu) { unsigned long req_cap; @@ -4152,7 +4154,8 @@ static void update_capacity_of(int cpu) return; /* Convert scale-invariant capacity to cpu. 
*/ - req_cap = cpu_util(cpu) * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); + req_cap = boosted_cpu_util(cpu); + req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, req_cap); } @@ -5110,8 +5113,36 @@ schedtune_margin(unsigned long signal, unsigned long boost) return margin; } +static inline unsigned int +schedtune_cpu_margin(unsigned long util) +{ + unsigned int boost = get_sysctl_sched_cfs_boost(); + + if (boost == 0) + return 0; + + return schedtune_margin(util, boost); +} + +#else /* CONFIG_SCHED_TUNE */ + +static inline unsigned int +schedtune_cpu_margin(unsigned long util) +{ + return 0; +} + #endif /* CONFIG_SCHED_TUNE */ +static inline unsigned long +boosted_cpu_util(int cpu) +{ + unsigned long util = cpu_util(cpu); + unsigned long margin = schedtune_cpu_margin(util); + + return util + margin; +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. From 13001f47c9a610705219700af4636386b647e231 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 23 Jun 2015 09:17:54 +0100 Subject: [PATCH 149/607] sched/tune: add initial support for CGroups based boosting To support task performance boosting, the usage of a single knob has the advantage to be a simple solution, both from the implementation and the usability standpoint. However, on a real system it can be difficult to identify a single value for the knob which fits the needs of multiple different tasks. For example, some kernel threads and/or user-space background services should be better managed the "standard" way while we still want to be able to boost the performance of specific workloads. In order to improve the flexibility of the task boosting mechanism this patch is the first of a small series which extends the previous implementation to introduce a "per task group" support. This first patch introduces just the basic CGroups support, a new "schedtune" CGroups controller is added which allows to configure different boost value for different groups of tasks. To keep the implementation simple but still effective for a boosting strategy, the new controller: 1. allows only a two layer hierarchy 2. supports only a limited number of boost groups A two layer hierarchy allows to place each task either: a) in the root control group thus being subject to a system-wide boosting value b) in a child of the root group thus being subject to the specific boost value defined by that "boost group" The limited number of "boost groups" supported is mainly motivated by the observation that in a real system it could be useful to have only few classes of tasks which deserve different treatment. For example, background vs foreground or interactive vs low-priority. As an additional benefit, a limited number of boost groups allows also to have a simpler implementation especially for the code required to compute the boost value for CPUs which have runnable tasks belonging to different boost groups. 
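To make the intended grouping concrete, a hypothetical classification could look like the following (the group names and boost values are illustrative assumptions, not part of this patch):

    struct boost_group_example {
            const char *name;
            int boost;              /* 0..100 */
    };

    static const struct boost_group_example example_groups[] = {
            { "root (default)",   0 },      /* system-wide default       */
            { "background",       0 },      /* energy efficient          */
            { "interactive",     30 },      /* latency sensitive         */
            { "performance",    100 },      /* always at the highest OPP */
    };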
cc: Tejun Heo cc: Li Zefan cc: Johannes Weiner cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- include/linux/cgroup_subsys.h | 4 + init/Kconfig | 17 +++ kernel/sched/tune.c | 223 ++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 4 + 4 files changed, 248 insertions(+) diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 1a96fdaa33d5..e133705d794a 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -26,6 +26,10 @@ SUBSYS(cpu) SUBSYS(cpuacct) #endif +#if IS_ENABLED(CONFIG_CGROUP_SCHEDTUNE) +SUBSYS(schedtune) +#endif + #if IS_ENABLED(CONFIG_BLK_CGROUP) SUBSYS(io) #endif diff --git a/init/Kconfig b/init/Kconfig index 83d8c02e7a57..5d9097e2b805 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -999,6 +999,23 @@ config CGROUP_CPUACCT Provides a simple Resource Controller for monitoring the total CPU consumed by the tasks in a cgroup. +config CGROUP_SCHEDTUNE + bool "CFS tasks boosting cgroup subsystem (EXPERIMENTAL)" + depends on SCHED_TUNE + help + This option provides the "schedtune" controller which improves the + flexibility of the task boosting mechanism by introducing the support + to define "per task" boost values. + + This new controller: + 1. allows only a two layers hierarchy, where the root defines the + system-wide boost value and its direct childrens define each one a + different "class of tasks" to be boosted with a different value + 2. supports up to 16 different task classes, each one which could be + configured with a different boost value + + Say N if unsure. + config PAGE_COUNTER bool diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index a93af9c2f267..95bc8b87c6d4 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -1,7 +1,230 @@ +#include +#include +#include +#include +#include + #include "sched.h" unsigned int sysctl_sched_cfs_boost __read_mostly; +#ifdef CONFIG_CGROUP_SCHEDTUNE + +/* + * EAS scheduler tunables for task groups. + */ + +/* SchdTune tunables for a group of tasks */ +struct schedtune { + /* SchedTune CGroup subsystem */ + struct cgroup_subsys_state css; + + /* Boost group allocated ID */ + int idx; + + /* Boost value for tasks on that SchedTune CGroup */ + int boost; + +}; + +static inline struct schedtune *css_st(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct schedtune, css) : NULL; +} + +static inline struct schedtune *task_schedtune(struct task_struct *tsk) +{ + return css_st(task_css(tsk, schedtune_cgrp_id)); +} + +static inline struct schedtune *parent_st(struct schedtune *st) +{ + return css_st(st->css.parent); +} + +/* + * SchedTune root control group + * The root control group is used to defined a system-wide boosting tuning, + * which is applied to all tasks in the system. + * Task specific boost tuning could be specified by creating and + * configuring a child control group under the root one. + * By default, system-wide boosting is disabled, i.e. no boosting is applied + * to tasks which are not into a child control group. + */ +static struct schedtune +root_schedtune = { + .boost = 0, +}; + +/* + * Maximum number of boost groups to support + * When per-task boosting is used we still allow only limited number of + * boost groups for two main reasons: + * 1. on a real system we usually have only few classes of workloads which + * make sense to boost with different values (e.g. background vs foreground + * tasks, interactive vs low-priority tasks) + * 2. 
a limited number allows for a simpler and more memory/time efficient + * implementation especially for the computation of the per-CPU boost + * value + */ +#define BOOSTGROUPS_COUNT 4 + +/* Array of configured boostgroups */ +static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = { + &root_schedtune, + NULL, +}; + +/* SchedTune boost groups + * Keep track of all the boost groups which impact on CPU, for example when a + * CPU has two RUNNABLE tasks belonging to two different boost groups and thus + * likely with different boost values. + * Since on each system we expect only a limited number of boost groups, here + * we use a simple array to keep track of the metrics required to compute the + * maximum per-CPU boosting value. + */ +struct boost_groups { + /* Maximum boost value for all RUNNABLE tasks on a CPU */ + unsigned boost_max; + struct { + /* The boost for tasks on that boost group */ + unsigned boost; + /* Count of RUNNABLE tasks on that boost group */ + unsigned tasks; + } group[BOOSTGROUPS_COUNT]; +}; + +/* Boost groups affecting each CPU in the system */ +DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); + +static u64 +boost_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->boost; +} + +static int +boost_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 boost) +{ + struct schedtune *st = css_st(css); + + if (boost < 0 || boost > 100) + return -EINVAL; + + st->boost = boost; + if (css == &root_schedtune.css) + sysctl_sched_cfs_boost = boost; + + return 0; +} + +static struct cftype files[] = { + { + .name = "boost", + .read_u64 = boost_read, + .write_u64 = boost_write, + }, + { } /* terminate */ +}; + +static int +schedtune_boostgroup_init(struct schedtune *st) +{ + /* Keep track of allocated boost groups */ + allocated_group[st->idx] = st; + + return 0; +} + +static int +schedtune_init(void) +{ + struct boost_groups *bg; + int cpu; + + /* Initialize the per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + memset(bg, 0, sizeof(struct boost_groups)); + } + + pr_info(" schedtune configured to support %d boost groups\n", + BOOSTGROUPS_COUNT); + return 0; +} + +static struct cgroup_subsys_state * +schedtune_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct schedtune *st; + int idx; + + if (!parent_css) { + schedtune_init(); + return &root_schedtune.css; + } + + /* Allow only single level hierachies */ + if (parent_css != &root_schedtune.css) { + pr_err("Nested SchedTune boosting groups not allowed\n"); + return ERR_PTR(-ENOMEM); + } + + /* Allow only a limited number of boosting groups */ + for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) + if (!allocated_group[idx]) + break; + if (idx == BOOSTGROUPS_COUNT) { + pr_err("Trying to create more than %d SchedTune boosting groups\n", + BOOSTGROUPS_COUNT); + return ERR_PTR(-ENOSPC); + } + + st = kzalloc(sizeof(*st), GFP_KERNEL); + if (!st) + goto out; + + /* Initialize per CPUs boost group support */ + st->idx = idx; + if (schedtune_boostgroup_init(st)) + goto release; + + return &st->css; + +release: + kfree(st); +out: + return ERR_PTR(-ENOMEM); +} + +static void +schedtune_boostgroup_release(struct schedtune *st) +{ + /* Keep track of allocated boost groups */ + allocated_group[st->idx] = NULL; +} + +static void +schedtune_css_free(struct cgroup_subsys_state *css) +{ + struct schedtune *st = css_st(css); + + schedtune_boostgroup_release(st); + kfree(st); +} + +struct cgroup_subsys 
schedtune_cgrp_subsys = { + .css_alloc = schedtune_css_alloc, + .css_free = schedtune_css_free, + .legacy_cftypes = files, + .early_init = 1, +}; + +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + int sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 827ba256fdad..98c0f6ef01f1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -439,7 +439,11 @@ static struct ctl_table kern_table[] = { .procname = "sched_cfs_boost", .data = &sysctl_sched_cfs_boost, .maxlen = sizeof(sysctl_sched_cfs_boost), +#ifdef CONFIG_CGROUP_SCHEDTUNE + .mode = 0444, +#else .mode = 0644, +#endif .proc_handler = &sysctl_sched_cfs_boost_handler, .extra1 = &zero, .extra2 = &one_hundred, From 9cd53fbeb2ad365aac06eebed2baee4984a283b0 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 12:31:35 +0000 Subject: [PATCH 150/607] sched/tune: compute and keep track of per CPU boost value When per task boosting is enabled, we could have multiple RUNNABLE tasks which are concurrently scheduled on the same CPU but each one with a different boost value. For example, we could have a scenarios like this: Task SchedTune CGroup Boost Value T1 root 0 T2 low-priority 10 T3 interactive 90 In these conditions we expect a CPU to be configured according to a proper "aggregation" of the required boost values for all the tasks currently scheduled on this CPU. A suitable aggregation function is the one which tracks the MAX boost value for all the tasks RUNNABLE on a CPU. This approach allows to always satisfy the most boost demanding task while at the same time: a) boosting all the concurrently scheduled tasks thus reducing potential co-scheduling side-effects on demanding tasks b) reduce the number of frequency switch requested towards SchedDVFS, thus being more friendly to architectures with slow frequency switching times Every time a task enters/exits the RQ of a CPU the max boost value should be updated considering all the boost groups currently "affecting" that CPU, i.e. which have at least one RUNNABLE task currently allocated on that CPU. This patch introduces the required support to keep track of the boost groups currently affecting CPUs. Thanks to the limited number of boost groups, a small and memory efficient per-cpu array of boost groups values (cpu_boost_groups) is used which is updated for each CPU entry by schedtune_boostgroup_update() but only when a schedtune CGroup boost value is updated. However, this is expected to be a rare operation, perhaps done just one time at system boot time. 
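The aggregation rule introduced here can be summarized by the following sketch (simplified from the schedtune_cpu_update() added below; the struct and macro names are illustrative):

    #define BOOSTGROUPS_EXAMPLE 4

    struct boost_group_state {
            unsigned int boost;     /* boost value of this group            */
            unsigned int tasks;     /* RUNNABLE tasks of this group on CPU  */
    };

    /* A CPU is boosted by the max boost among groups with RUNNABLE tasks. */
    static unsigned int cpu_boost_max(const struct boost_group_state *g)
    {
            unsigned int max = g[0].boost;  /* the root group always counts */
            int i;

            for (i = 1; i < BOOSTGROUPS_EXAMPLE; i++)
                    if (g[i].tasks > 0 && g[i].boost > max)
                            max = g[i].boost;

            return max;
    }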
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 77 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 95bc8b87c6d4..f62386893725 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -97,6 +97,67 @@ struct boost_groups { /* Boost groups affecting each CPU in the system */ DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); +static void +schedtune_cpu_update(int cpu) +{ + struct boost_groups *bg; + unsigned boost_max; + int idx; + + bg = &per_cpu(cpu_boost_groups, cpu); + + /* The root boost group is always active */ + boost_max = bg->group[0].boost; + for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) { + /* + * A boost group affects a CPU only if it has + * RUNNABLE tasks on that CPU + */ + if (bg->group[idx].tasks == 0) + continue; + boost_max = max(boost_max, bg->group[idx].boost); + } + + bg->boost_max = boost_max; +} + +static int +schedtune_boostgroup_update(int idx, int boost) +{ + struct boost_groups *bg; + int cur_boost_max; + int old_boost; + int cpu; + + /* Update per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + + /* + * Keep track of current boost values to compute the per CPU + * maximum only when it has been affected by the new value of + * the updated boost group + */ + cur_boost_max = bg->boost_max; + old_boost = bg->group[idx].boost; + + /* Update the boost value of this boost group */ + bg->group[idx].boost = boost; + + /* Check if this update increase current max */ + if (boost > cur_boost_max && bg->group[idx].tasks) { + bg->boost_max = boost; + continue; + } + + /* Check if this update has decreased current max */ + if (cur_boost_max == old_boost && old_boost > boost) + schedtune_cpu_update(cpu); + } + + return 0; +} + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -118,6 +179,9 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, if (css == &root_schedtune.css) sysctl_sched_cfs_boost = boost; + /* Update CPU boost */ + schedtune_boostgroup_update(st->idx, st->boost); + return 0; } @@ -133,9 +197,19 @@ static struct cftype files[] = { static int schedtune_boostgroup_init(struct schedtune *st) { + struct boost_groups *bg; + int cpu; + /* Keep track of allocated boost groups */ allocated_group[st->idx] = st; + /* Initialize the per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + bg->group[st->idx].boost = 0; + bg->group[st->idx].tasks = 0; + } + return 0; } @@ -203,6 +277,9 @@ out: static void schedtune_boostgroup_release(struct schedtune *st) { + /* Reset this boost group */ + schedtune_boostgroup_update(st->idx, 0); + /* Keep track of allocated boost groups */ allocated_group[st->idx] = NULL; } From a515b887ece159dfc45010667c43bf4b834ee325 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 7 Jul 2015 15:33:20 +0100 Subject: [PATCH 151/607] sched/{fair,tune}: track RUNNABLE tasks impact on per CPU boost value When per-task boosting is enabled, every time a task enters/exits a CPU its boost value could impact the currently selected OPP for that CPU. Thus, the "aggregated" boost value for that CPU potentially needs to be updated to match the current maximum boost value among all the tasks currently RUNNABLE on that CPU. This patch introduces the required support to keep track of which boost groups are impacting a CPU. 
Each time a task is enqueued/dequeued to/from a CPU its boost group is used to increment a per-cpu counter of RUNNABLE tasks on that CPU. Only when the number of runnable tasks for a specific boost group becomes 1 or 0 the corresponding boost group changes its effects on that CPU, specifically: a) boost_group::tasks == 1: this boost group starts to impact the CPU b) boost_group::tasks == 0: this boost group stops to impact the CPU In each of these two conditions the aggregation function: sched_cpu_update(cpu) could be required to run in order to identify the new maximum boost value required for the CPU. The proposed patch minimizes the number of times the aggregation function is executed while still providing the required support to always boost a CPU to the maximum boost value required by all its currently RUNNABLE tasks. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 17 +++++++--- kernel/sched/tune.c | 82 +++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/tune.h | 23 +++++++++++++ 3 files changed, 118 insertions(+), 4 deletions(-) create mode 100644 kernel/sched/tune.h diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eff548fae43e..2dbe1ff0a90b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -34,6 +34,7 @@ #include #include "sched.h" +#include "tune.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -4210,6 +4211,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; + schedtune_enqueue_task(p, cpu_of(rq)); + /* * We want to potentially trigger a freq switch * request only for tasks that are waking up; this is @@ -4279,6 +4282,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) { sub_nr_running(rq, 1); + schedtune_dequeue_task(p, cpu_of(rq)); /* * We want to potentially trigger a freq switch @@ -5114,10 +5118,15 @@ schedtune_margin(unsigned long signal, unsigned long boost) } static inline unsigned int -schedtune_cpu_margin(unsigned long util) +schedtune_cpu_margin(unsigned long util, int cpu) { - unsigned int boost = get_sysctl_sched_cfs_boost(); + unsigned int boost; +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_cpu_boost(cpu); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif if (boost == 0) return 0; @@ -5127,7 +5136,7 @@ schedtune_cpu_margin(unsigned long util) #else /* CONFIG_SCHED_TUNE */ static inline unsigned int -schedtune_cpu_margin(unsigned long util) +schedtune_cpu_margin(unsigned long util, int cpu) { return 0; } @@ -5138,7 +5147,7 @@ static inline unsigned long boosted_cpu_util(int cpu) { unsigned long util = cpu_util(cpu); - unsigned long margin = schedtune_cpu_margin(util); + unsigned long margin = schedtune_cpu_margin(util, cpu); return util + margin; } diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index f62386893725..540b945a01ce 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include "sched.h" @@ -158,6 +159,87 @@ schedtune_boostgroup_update(int idx, int boost) return 0; } +static inline void +schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) +{ + struct boost_groups *bg; + int tasks; + + bg = &per_cpu(cpu_boost_groups, cpu); + + /* Update boosted tasks count while avoiding to make it negative */ + if (task_count < 0 && bg->group[idx].tasks <= -task_count) + bg->group[idx].tasks = 0; + else + bg->group[idx].tasks += task_count; + + /* Boost group 
activation or deactivation on that RQ */ + tasks = bg->group[idx].tasks; + if (tasks == 1 || tasks == 0) + schedtune_cpu_update(cpu); +} + +/* + * NOTE: This function must be called while holding the lock on the CPU RQ + */ +void schedtune_enqueue_task(struct task_struct *p, int cpu) +{ + struct schedtune *st; + int idx; + + /* + * When a task is marked PF_EXITING by do_exit() it's going to be + * dequeued and enqueued multiple times in the exit path. + * Thus we avoid any further update, since we do not want to change + * CPU boosting while the task is exiting. + */ + if (p->flags & PF_EXITING) + return; + + /* Get task boost group */ + rcu_read_lock(); + st = task_schedtune(p); + idx = st->idx; + rcu_read_unlock(); + + schedtune_tasks_update(p, cpu, idx, 1); +} + +/* + * NOTE: This function must be called while holding the lock on the CPU RQ + */ +void schedtune_dequeue_task(struct task_struct *p, int cpu) +{ + struct schedtune *st; + int idx; + + /* + * When a task is marked PF_EXITING by do_exit() it's going to be + * dequeued and enqueued multiple times in the exit path. + * Thus we avoid any further update, since we do not want to change + * CPU boosting while the task is exiting. + * The last dequeue will be done by cgroup exit() callback. + */ + if (p->flags & PF_EXITING) + return; + + /* Get task boost group */ + rcu_read_lock(); + st = task_schedtune(p); + idx = st->idx; + rcu_read_unlock(); + + schedtune_tasks_update(p, cpu, idx, -1); +} + +int schedtune_cpu_boost(int cpu) +{ + struct boost_groups *bg; + + bg = &per_cpu(cpu_boost_groups, cpu); + return bg->boost_max; +} + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h new file mode 100644 index 000000000000..561b5171a19b --- /dev/null +++ b/kernel/sched/tune.h @@ -0,0 +1,23 @@ + +#ifdef CONFIG_SCHED_TUNE + +#ifdef CONFIG_CGROUP_SCHEDTUNE + +int schedtune_cpu_boost(int cpu); + +void schedtune_enqueue_task(struct task_struct *p, int cpu); +void schedtune_dequeue_task(struct task_struct *p, int cpu); + +#else /* CONFIG_CGROUP_SCHEDTUNE */ + +#define schedtune_enqueue_task(task, cpu) do { } while (0) +#define schedtune_dequeue_task(task, cpu) do { } while (0) + +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + +#else /* CONFIG_SCHED_TUNE */ + +#define schedtune_enqueue_task(task, cpu) do { } while (0) +#define schedtune_dequeue_task(task, cpu) do { } while (0) + +#endif /* CONFIG_SCHED_TUNE */ From a8f65584fc948d646732c0f9533570f97091c78d Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:31:53 +0000 Subject: [PATCH 152/607] sched/fair: add boosted task utilization The task utilization signal, which is derived from PELT signals and properly scaled to be architecture and frequency invariant, is used by EAS as an estimation of the task requirements in terms of CPU bandwidth. When the energy aware scheduler is in use, this signal affects the CPU selection. Thus, a convenient way to bias that decision, which is also little intrusive, is to boost the task utilization signal each time it is required to support them. This patch introduces the new function: boosted_task_util(task) which returns a boosted value for the utilization of the specified task. The margin added to the original utilization is: 1. computed based on the "boosting strategy" in use 2. proportional to boost value defined either by the sysctl interface, when global boosting is in use, or the "taskgroup" value, when per-task boosting is enabled. 
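A compact sketch of how the boosted task utilization is built and then consumed by the fits-check mentioned just below (assuming SCHED_LOAD_SCALE == 1024 and the ~20% capacity_margin of 1280 used in this file; helper names are illustrative):

    #define SCHED_LOAD_SCALE 1024

    static unsigned long boosted_task_util_sketch(unsigned long task_util,
                                                  unsigned long boost_pct)
    {
            unsigned long margin = boost_pct * (SCHED_LOAD_SCALE - task_util) / 100;

            return task_util + margin;  /* e.g. 200, 50%: 200 + 824/2 = 612 */
    }

    /* A task fits a CPU if the CPU's utilization plus the task's boosted
     * utilization still leaves roughly 20% headroom (capacity_margin = 1280). */
    static int task_fits_sketch(unsigned long cpu_capacity,
                                unsigned long cpu_util, unsigned long boosted)
    {
            return (cpu_capacity * 1024) > ((cpu_util + boosted) * 1280);
    }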
The boosted signal is used by EAS a. transparently, via its integration into the task_fits() function b. explicitly, in the energy-aware wakeup path Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++-- kernel/sched/tune.c | 14 ++++++++++++++ kernel/sched/tune.h | 1 + 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2dbe1ff0a90b..f582d58daea5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5048,11 +5048,13 @@ static inline unsigned long task_util(struct task_struct *p) unsigned int capacity_margin = 1280; /* ~20% margin */ +static inline unsigned long boosted_task_util(struct task_struct *task); + static inline bool __task_fits(struct task_struct *p, int cpu, int util) { unsigned long capacity = capacity_of(cpu); - util += task_util(p); + util += boosted_task_util(p); return (capacity * 1024) > (util * capacity_margin); } @@ -5133,6 +5135,27 @@ schedtune_cpu_margin(unsigned long util, int cpu) return schedtune_margin(util, boost); } +static inline unsigned long +schedtune_task_margin(struct task_struct *task) +{ + unsigned int boost; + unsigned long util; + unsigned long margin; + +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_task_boost(task); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif + if (boost == 0) + return 0; + + util = task_util(task); + margin = schedtune_margin(util, boost); + + return margin; +} + #else /* CONFIG_SCHED_TUNE */ static inline unsigned int @@ -5141,6 +5164,12 @@ schedtune_cpu_margin(unsigned long util, int cpu) return 0; } +static inline unsigned int +schedtune_task_margin(struct task_struct *task) +{ + return 0; +} + #endif /* CONFIG_SCHED_TUNE */ static inline unsigned long @@ -5152,6 +5181,15 @@ boosted_cpu_util(int cpu) return util + margin; } +static inline unsigned long +boosted_task_util(struct task_struct *task) +{ + unsigned long util = task_util(task); + unsigned long margin = schedtune_task_margin(task); + + return util + margin; +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -5386,7 +5424,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) * so prev_cpu will receive a negative bias due to the double * accounting. However, the blocked utilization may be zero. 
*/ - int new_util = cpu_util(i) + task_util(p); + int new_util = cpu_util(i) + boosted_task_util(p); if (new_util > capacity_orig_of(i)) continue; diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 540b945a01ce..87213861bde5 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -240,6 +240,20 @@ int schedtune_cpu_boost(int cpu) return bg->boost_max; } +int schedtune_task_boost(struct task_struct *p) +{ + struct schedtune *st; + int task_boost; + + /* Get task boost value */ + rcu_read_lock(); + st = task_schedtune(p); + task_boost = st->boost; + rcu_read_unlock(); + + return task_boost; +} + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index 561b5171a19b..d756ce7b06e0 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -4,6 +4,7 @@ #ifdef CONFIG_CGROUP_SCHEDTUNE int schedtune_cpu_boost(int cpu); +int schedtune_task_boost(struct task_struct *tsk); void schedtune_enqueue_task(struct task_struct *p, int cpu); void schedtune_dequeue_task(struct task_struct *p, int cpu); From 36967b2412fa7f356308c5104d95574e84ca03ef Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:35:13 +0000 Subject: [PATCH 153/607] sched/fair: keep track of energy/capacity variations The current EAS implementation does not allow "to boost" tasks performances, for example by running them at an higher OPP (or a more capable CPU), even if that could require a "reasonable" increase in energy consumption. To defined how much reasonable is an energy increase with respect to a required boost value, it is required to define and compute a trade-off between the expected energy and performance variations. However, the current EAS implementation considers only energy variations while completely disregard the impact on performance for the selection of a certain schedule candidate. This patch extends the eenv energy environment to keep track of both energy and performance deltas which are implied by the activation of a schedule candidate. The performance variation is estimated considering the different capacities of the CPUs in which the task could be scheduled. The idea is that while running on a CPU with higher capacity (e.g. higher operating point) the task could (potentially) complete faster and thus get better performance. 
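The performance-delta bookkeeping added here boils down to the following sketch (field names follow the eenv members added below; simplified, illustration only):

    struct cap_delta_sketch {
            int before;     /* capacity at the source CPU's selected OPP      */
            int after;      /* capacity at the destination CPU's selected OPP */
            int delta;      /* after - before                                 */
    };

    static void cap_delta_eval(struct cap_delta_sketch *cap)
    {
            cap->delta = cap->after - cap->before;
    }

A positive delta means the task would run at a higher capacity after the move and is expected to complete sooner; a negative delta means the opposite.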
Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f582d58daea5..3b80ad01aa8b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4706,6 +4706,16 @@ struct energy_env { int src_cpu; int dst_cpu; int energy; + struct { + int before; + int after; + int diff; + } nrg; + struct { + int before; + int after; + int delta; + } cap; }; /* @@ -4872,6 +4882,22 @@ static int sched_group_energy(struct energy_env *eenv) eenv->sg_cap = sg; cap_idx = find_new_capacity(eenv, sg->sge); + + if (sg->group_weight == 1) { + /* Remove capacity of src CPU (before task move) */ + if (eenv->util_delta == 0 && + cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) { + eenv->cap.before = sg->sge->cap_states[cap_idx].cap; + eenv->cap.delta -= eenv->cap.before; + } + /* Add capacity of dst CPU (after task move) */ + if (eenv->util_delta != 0 && + cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) { + eenv->cap.after = sg->sge->cap_states[cap_idx].cap; + eenv->cap.delta += eenv->cap.after; + } + } + idle_idx = group_idle_state(sg); group_util = group_norm_util(eenv, sg); sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) @@ -4920,6 +4946,8 @@ static int energy_diff(struct energy_env *eenv) .util_delta = 0, .src_cpu = eenv->src_cpu, .dst_cpu = eenv->dst_cpu, + .nrg = { 0, 0, 0 }, + .cap = { 0, 0, 0 }, }; if (eenv->src_cpu == eenv->dst_cpu) @@ -4941,13 +4969,21 @@ static int energy_diff(struct energy_env *eenv) return 0; /* Invalid result abort */ energy_before += eenv_before.energy; + /* Keep track of SRC cpu (before) capacity */ + eenv->cap.before = eenv_before.cap.before; + eenv->cap.delta = eenv_before.cap.delta; + if (sched_group_energy(eenv)) return 0; /* Invalid result abort */ energy_after += eenv->energy; } } while (sg = sg->next, sg != sd->groups); - return energy_after-energy_before; + eenv->nrg.before = energy_before; + eenv->nrg.after = energy_after; + eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; + + return eenv->nrg.diff; } /* From 637ee3715a1a22c07995be181c0e2fca10d2df2a Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 12 Jan 2016 18:12:13 +0000 Subject: [PATCH 154/607] sched/tune: add support to compute normalized energy The current EAS implementation considers only energy variations, while it disregards completely the impact on performance for the selection of a certain schedule candidate. Moreover, it also makes its decision based on the "absolute" value of expected energy variations. In order to properly define a trade-off strategy between increased energy consumption and performances benefits it is required to compare energy variations with performance variations. Thus, both performance and energy metrics must be expressed in comparable units. While the performance variations are expressed in terms of capacity deltas, which are defined in the range [0..SCHED_LOAD_SCALE], the same scale is not used for energy variations. This patch introduces the function: schedtune_normalize_energy(energy_diff) which returns a normalized value in the same range of capacity variations, i.e. [0..SCHED_LOAD_SCALE]. A proper set of energy normalization constants are required to provide a fast division by a constant during the normalziation of the energy_diff. The value of these constants depends on the specific energy model and topology of a target device. 
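One plausible reading of the normalization described above, using the min/max power figures kept in the target_nrg structure introduced below (a sketch under that assumption, not the exact kernel computation; the kernel avoids the runtime division with a precomputed reciprocal):

    #define SCHED_LOAD_SCALE 1024

    /* Rescale an absolute energy variation into the [0..SCHED_LOAD_SCALE]
     * range used for capacity deltas. */
    static long normalize_energy_sketch(long energy_diff,
                                        unsigned long min_power,
                                        unsigned long max_power)
    {
            long range = (long)(max_power - min_power);

            return energy_diff * SCHED_LOAD_SCALE / range;
    }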
Thus, this patch provides also the required support for the computation at boot time of this set of variables. Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 321 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/tune.h | 7 + 2 files changed, 328 insertions(+) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 87213861bde5..1a8ba5a6d99b 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -1,7 +1,9 @@ #include #include +#include #include #include +#include #include #include @@ -9,6 +11,84 @@ unsigned int sysctl_sched_cfs_boost __read_mostly; +/* + * System energy normalization constants + */ +static struct target_nrg { + unsigned long min_power; + unsigned long max_power; + struct reciprocal_value rdiv; +} schedtune_target_nrg; + +/* Performance Boost region (B) threshold params */ +static int perf_boost_idx; + +/* Performance Constraint region (C) threshold params */ +static int perf_constrain_idx; + +/** + * Performance-Energy (P-E) Space thresholds constants + */ +struct threshold_params { + int nrg_gain; + int cap_gain; +}; + +/* + * System specific P-E space thresholds constants + */ +static struct threshold_params +threshold_gains[] = { + { 0, 4 }, /* >= 0% */ + { 0, 4 }, /* >= 10% */ + { 1, 4 }, /* >= 20% */ + { 2, 4 }, /* >= 30% */ + { 3, 4 }, /* >= 40% */ + { 4, 3 }, /* >= 50% */ + { 4, 2 }, /* >= 60% */ + { 4, 1 }, /* >= 70% */ + { 4, 0 }, /* >= 80% */ + { 4, 0 } /* >= 90% */ +}; + +static int +__schedtune_accept_deltas(int nrg_delta, int cap_delta, + int perf_boost_idx, int perf_constrain_idx) +{ + int payoff = -INT_MAX; + + /* Performance Boost (B) region */ + if (nrg_delta > 0 && cap_delta > 0) { + /* + * Evaluate "Performance Boost" vs "Energy Increase" + * payoff criteria: + * cap_delta / nrg_delta < cap_gain / nrg_gain + * which is: + * nrg_delta * cap_gain > cap_delta * nrg_gain + */ + payoff = nrg_delta * threshold_gains[perf_boost_idx].cap_gain; + payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain; + return payoff; + } + + /* Performance Constraint (C) region */ + if (nrg_delta < 0 && cap_delta < 0) { + /* + * Evaluate "Performance Boost" vs "Energy Increase" + * payoff criteria: + * cap_delta / nrg_delta > cap_gain / nrg_gain + * which is: + * cap_delta * nrg_gain > nrg_delta * cap_gain + */ + payoff = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain; + payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain; + return payoff; + } + + /* Default: reject schedule candidate */ + return payoff; +} + #ifdef CONFIG_CGROUP_SCHEDTUNE /* @@ -26,6 +106,11 @@ struct schedtune { /* Boost value for tasks on that SchedTune CGroup */ int boost; + /* Performance Boost (B) region threshold params */ + int perf_boost_idx; + + /* Performance Constraint (C) region threshold params */ + int perf_constrain_idx; }; static inline struct schedtune *css_st(struct cgroup_subsys_state *css) @@ -55,8 +140,37 @@ static inline struct schedtune *parent_st(struct schedtune *st) static struct schedtune root_schedtune = { .boost = 0, + .perf_boost_idx = 0, + .perf_constrain_idx = 0, }; +int +schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task) +{ + struct schedtune *ct; + int perf_boost_idx; + int perf_constrain_idx; + + /* Optimal (O) region */ + if (nrg_delta < 0 && cap_delta > 0) + return INT_MAX; + + /* Suboptimal (S) region */ + if (nrg_delta > 0 && cap_delta < 0) + return -INT_MAX; + + /* Get task specific perf Boost/Constraints indexes */ + rcu_read_lock(); + ct = task_schedtune(task); + perf_boost_idx 
= ct->perf_boost_idx; + perf_constrain_idx = ct->perf_constrain_idx; + rcu_read_unlock(); + + return __schedtune_accept_deltas(nrg_delta, cap_delta, + perf_boost_idx, perf_constrain_idx); +} + /* * Maximum number of boost groups to support * When per-task boosting is used we still allow only limited number of @@ -396,6 +510,24 @@ struct cgroup_subsys schedtune_cgrp_subsys = { .early_init = 1, }; +#else /* CONFIG_CGROUP_SCHEDTUNE */ + +int +schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task) +{ + /* Optimal (O) region */ + if (nrg_delta < 0 && cap_delta > 0) + return INT_MAX; + + /* Suboptimal (S) region */ + if (nrg_delta > 0 && cap_delta < 0) + return -INT_MAX; + + return __schedtune_accept_deltas(nrg_delta, cap_delta, + perf_boost_idx, perf_constrain_idx); +} + #endif /* CONFIG_CGROUP_SCHEDTUNE */ int @@ -408,5 +540,194 @@ sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, if (ret || !write) return ret; + /* Performance Boost (B) region threshold params */ + perf_boost_idx = sysctl_sched_cfs_boost; + perf_boost_idx /= 10; + + /* Performance Constraint (C) region threshold params */ + perf_constrain_idx = 100 - sysctl_sched_cfs_boost; + perf_constrain_idx /= 10; + return 0; } + +/* + * System energy normalization + * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], + * corresponding to the specified energy variation. + */ +int +schedtune_normalize_energy(int energy_diff) +{ + u32 normalized_nrg; + int max_delta; + +#ifdef CONFIG_SCHED_DEBUG + /* Check for boundaries */ + max_delta = schedtune_target_nrg.max_power; + max_delta -= schedtune_target_nrg.min_power; + WARN_ON(abs(energy_diff) >= max_delta); +#endif + + /* Do scaling using positive numbers to increase the range */ + normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; + + /* Scale by energy magnitude */ + normalized_nrg <<= SCHED_LOAD_SHIFT; + + /* Normalize on max energy for target platform */ + normalized_nrg = reciprocal_divide( + normalized_nrg, schedtune_target_nrg.rdiv); + + return (energy_diff < 0) ? 
-normalized_nrg : normalized_nrg; +} + +#ifdef CONFIG_SCHED_DEBUG +static void +schedtune_test_nrg(unsigned long delta_pwr) +{ + unsigned long test_delta_pwr; + unsigned long test_norm_pwr; + int idx; + + /* + * Check normalization constants using some constant system + * energy values + */ + pr_info("schedtune: verify normalization constants...\n"); + for (idx = 0; idx < 6; ++idx) { + test_delta_pwr = delta_pwr >> idx; + + /* Normalize on max energy for target platform */ + test_norm_pwr = reciprocal_divide( + test_delta_pwr << SCHED_LOAD_SHIFT, + schedtune_target_nrg.rdiv); + + pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n", + idx, test_delta_pwr, test_norm_pwr); + } +} +#else +#define schedtune_test_nrg(delta_pwr) +#endif + +/* + * Compute the min/max power consumption of a cluster and all its CPUs + */ +static void +schedtune_add_cluster_nrg( + struct sched_domain *sd, + struct sched_group *sg, + struct target_nrg *ste) +{ + struct sched_domain *sd2; + struct sched_group *sg2; + + struct cpumask *cluster_cpus; + char str[32]; + + unsigned long min_pwr; + unsigned long max_pwr; + int cpu; + + /* Get Cluster energy using EM data for the first CPU */ + cluster_cpus = sched_group_cpus(sg); + snprintf(str, 32, "CLUSTER[%*pbl]", + cpumask_pr_args(cluster_cpus)); + + min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power; + max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power; + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + str, min_pwr, max_pwr); + + /* + * Keep track of this cluster's energy in the computation of the + * overall system energy + */ + ste->min_power += min_pwr; + ste->max_power += max_pwr; + + /* Get CPU energy using EM data for each CPU in the group */ + for_each_cpu(cpu, cluster_cpus) { + /* Get a SD view for the specific CPU */ + for_each_domain(cpu, sd2) { + /* Get the CPU group */ + sg2 = sd2->groups; + min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power; + max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power; + + ste->min_power += min_pwr; + ste->max_power += max_pwr; + + snprintf(str, 32, "CPU[%d]", cpu); + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + str, min_pwr, max_pwr); + + /* + * Assume we have EM data only at the CPU and + * the upper CLUSTER level + */ + BUG_ON(!cpumask_equal( + sched_group_cpus(sg), + sched_group_cpus(sd2->parent->groups) + )); + break; + } + } +} + +/* + * Initialize the constants required to compute normalized energy. + * The values of these constants depends on the EM data for the specific + * target system and topology. + * Thus, this function is expected to be called by the code + * that bind the EM to the topology information. + */ +static int +schedtune_init_late(void) +{ + struct target_nrg *ste = &schedtune_target_nrg; + unsigned long delta_pwr = 0; + struct sched_domain *sd; + struct sched_group *sg; + + pr_info("schedtune: init normalization constants...\n"); + ste->max_power = 0; + ste->min_power = 0; + + rcu_read_lock(); + + /* + * When EAS is in use, we always have a pointer to the highest SD + * which provides EM data. 
+ */ + sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask))); + if (!sd) { + pr_info("schedtune: no energy model data\n"); + goto nodata; + } + + sg = sd->groups; + do { + schedtune_add_cluster_nrg(sd, sg, ste); + } while (sg = sg->next, sg != sd->groups); + + rcu_read_unlock(); + + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + "SYSTEM", ste->min_power, ste->max_power); + + /* Compute normalization constants */ + delta_pwr = ste->max_power - ste->min_power; + ste->rdiv = reciprocal_value(delta_pwr); + pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n", + ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2); + + schedtune_test_nrg(delta_pwr); + return 0; + +nodata: + rcu_read_unlock(); + return -EINVAL; +} +late_initcall(schedtune_init_late); diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index d756ce7b06e0..f7273a5d994a 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -16,9 +16,16 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu); #endif /* CONFIG_CGROUP_SCHEDTUNE */ +int schedtune_normalize_energy(int energy); +int schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task); + #else /* CONFIG_SCHED_TUNE */ #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) +#define schedtune_normalize_energy(energy) energy +#define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta + #endif /* CONFIG_SCHED_TUNE */ From a2a6dc750833243b3b49d016291ae1133196759a Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 15 Jan 2016 15:48:03 +0000 Subject: [PATCH 155/607] sched/fair: filter energy_diff() based on energy_payoff value Once SchedTune support is enabled and the CPU bandwidth demand of a task is boosted, we can expect increased energy consumption that is balanced by a corresponding increase in task performance. However, the current implementation of the energy_diff() function accepts all and _only_ the schedule candidates which result in a reduced expected system energy, which works against the boosting strategy. This patch links the energy_diff() function with the "energy payoff" engine provided by SchedTune. The energy variation computed by the energy_diff() function is now filtered using the SchedTune support to evaluate the energy payoff for a boosted task. With this patch, the energy_diff() function reports as an "acceptable schedule candidate" only those schedule candidates which correspond to a positive energy payoff.
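For clarity, a minimal usage sketch of the resulting calling convention (not taken from the patch; use_target_cpu() is a hypothetical placeholder, while the eenv fields are those introduced by this series): a negative energy_diff() return value still means "acceptable candidate", whether it comes from a plain energy saving or from a positive payoff computed for a boosted task.

    struct energy_env eenv = {
            .util_delta = task_util(p),
            .src_cpu    = task_cpu(p),
            .dst_cpu    = target_cpu,
            .task       = p,
    };

    /*
     * energy_diff() returns eenv.nrg.diff when the task is not boosted and
     * -eenv.payoff otherwise; in both cases a negative value means the
     * candidate is worth taking.
     */
    if (energy_diff(&eenv) < 0)
            use_target_cpu();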
Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 47 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3b80ad01aa8b..1191c4b1b119 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4706,9 +4706,12 @@ struct energy_env { int src_cpu; int dst_cpu; int energy; + int payoff; + struct task_struct *task; struct { int before; int after; + int delta; int diff; } nrg; struct { @@ -4929,6 +4932,44 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu) return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); } +#ifdef CONFIG_SCHED_TUNE +static int energy_diff_evaluate(struct energy_env *eenv) +{ + unsigned int boost; + int nrg_delta; + + /* Return energy diff when boost margin is 0 */ +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_task_boost(eenv->task); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif + if (boost == 0) + return eenv->nrg.diff; + + /* Compute normalized energy diff */ + nrg_delta = schedtune_normalize_energy(eenv->nrg.diff); + eenv->nrg.delta = nrg_delta; + + eenv->payoff = schedtune_accept_deltas( + eenv->nrg.delta, + eenv->cap.delta, + eenv->task); + + /* + * When SchedTune is enabled, the energy_diff() function will return + * the computed energy payoff value. Since the energy_diff() return + * value is expected to be negative by its callers, this evaluation + * function return a negative value each time the evaluation return a + * positive payoff, which is the condition for the acceptance of + * a scheduling decision + */ + return -eenv->payoff; +} +#else /* CONFIG_SCHED_TUNE */ +#define energy_diff_evaluate(eenv) eenv->nrg.diff +#endif + /* * energy_diff(): Estimate the energy impact of changing the utilization * distribution. 
eenv specifies the change: utilisation amount, source, and @@ -4946,7 +4987,7 @@ static int energy_diff(struct energy_env *eenv) .util_delta = 0, .src_cpu = eenv->src_cpu, .dst_cpu = eenv->dst_cpu, - .nrg = { 0, 0, 0 }, + .nrg = { 0, 0, 0, 0}, .cap = { 0, 0, 0 }, }; @@ -4982,8 +5023,9 @@ static int energy_diff(struct energy_env *eenv) eenv->nrg.before = energy_before; eenv->nrg.after = energy_after; eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; + eenv->payoff = 0; - return eenv->nrg.diff; + return energy_diff_evaluate(eenv); } /* @@ -5481,6 +5523,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) .util_delta = task_util(p), .src_cpu = task_cpu(p), .dst_cpu = target_cpu, + .task = p, }; /* Not enough spare capacity on previous cpu */ From 99ed4e57cb4231b2561ec4a6433722dcd4a19a9e Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 9 Nov 2015 12:06:24 +0000 Subject: [PATCH 156/607] DEBUG: sched: add tracepoint for cpu/freq scale invariance Signed-off-by: Juri Lelli --- include/trace/events/sched.h | 24 ++++++++++++++++++++++++ kernel/sched/fair.c | 1 + 2 files changed, 25 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 9b90c57517a9..20a7216f8abb 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -562,6 +562,30 @@ TRACE_EVENT(sched_wake_idle_without_ipi, TP_printk("cpu=%d", __entry->cpu) ); + +TRACE_EVENT(sched_contrib_scale_f, + + TP_PROTO(int cpu, unsigned long freq_scale_factor, + unsigned long cpu_scale_factor), + + TP_ARGS(cpu, freq_scale_factor, cpu_scale_factor), + + TP_STRUCT__entry( + __field(int, cpu) + __field(unsigned long, freq_scale_factor) + __field(unsigned long, cpu_scale_factor) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->freq_scale_factor = freq_scale_factor; + __entry->cpu_scale_factor = cpu_scale_factor; + ), + + TP_printk("cpu=%d freq_scale_factor=%lu cpu_scale_factor=%lu", + __entry->cpu, __entry->freq_scale_factor, + __entry->cpu_scale_factor) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1191c4b1b119..ec2e8aecc4f3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2587,6 +2587,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, scale_freq = arch_scale_freq_capacity(NULL, cpu); scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu); /* delta_w is the amount already accumulated against our next period */ delta_w = sa->period_contrib; From 8017fd7418dc34d55d0e8bfc83e6c0ec10909c58 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 9 Nov 2015 12:07:27 +0000 Subject: [PATCH 157/607] DEBUG: sched: add tracepoint for task load/util signals Signed-off-by: Juri Lelli --- include/trace/events/sched.h | 43 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 3 +++ 2 files changed, 46 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 20a7216f8abb..99f3e648c197 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -586,6 +586,49 @@ TRACE_EVENT(sched_contrib_scale_f, __entry->cpu, __entry->freq_scale_factor, __entry->cpu_scale_factor) ); + +/* + * Tracepoint for accounting sched averages for tasks. 
+ */ +TRACE_EVENT(sched_load_avg_task, + + TP_PROTO(struct task_struct *tsk, struct sched_avg *avg), + + TP_ARGS(tsk, avg), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, cpu ) + __field( unsigned long, load_avg ) + __field( unsigned long, util_avg ) + __field( u64, load_sum ) + __field( u32, util_sum ) + __field( u32, period_contrib ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->cpu = task_cpu(tsk); + __entry->load_avg = avg->load_avg; + __entry->util_avg = avg->util_avg; + __entry->load_sum = avg->load_sum; + __entry->util_sum = avg->util_sum; + __entry->period_contrib = avg->period_contrib; + ), + + TP_printk("comm=%s pid=%d cpu=%d load_avg=%lu util_avg=%lu load_sum=%llu" + " util_sum=%u period_contrib=%u", + __entry->comm, + __entry->pid, + __entry->cpu, + __entry->load_avg, + __entry->util_avg, + (u64)__entry->load_sum, + (u32)__entry->util_sum, + (u32)__entry->period_contrib) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ec2e8aecc4f3..8cdb4e5592f9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2731,6 +2731,9 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) update_tg_load_avg(cfs_rq, 0); + + if (entity_is_task(se)) + trace_sched_load_avg_task(task_of(se), &se->avg); } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) From e5a25996d3a3f46f3abee77f868b922903f5f60e Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 9 Nov 2015 12:07:48 +0000 Subject: [PATCH 158/607] DEBUG: sched: add tracepoint for CPU load/util signals Signed-off-by: Juri Lelli --- include/trace/events/sched.h | 25 +++++++++++++++++++++++++ kernel/sched/fair.c | 1 + 2 files changed, 26 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 99f3e648c197..f58760c2f712 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -629,6 +629,31 @@ TRACE_EVENT(sched_load_avg_task, (u32)__entry->util_sum, (u32)__entry->period_contrib) ); + +/* + * Tracepoint for accounting sched averages for cpus. 
+ */ +TRACE_EVENT(sched_load_avg_cpu, + + TP_PROTO(int cpu, struct cfs_rq *cfs_rq), + + TP_ARGS(cpu, cfs_rq), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( unsigned long, load_avg ) + __field( unsigned long, util_avg ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->load_avg = cfs_rq->avg.load_avg; + __entry->util_avg = cfs_rq->avg.util_avg; + ), + + TP_printk("cpu=%d load_avg=%lu util_avg=%lu", + __entry->cpu, __entry->load_avg, __entry->util_avg) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8cdb4e5592f9..b5ea9ec335d8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2734,6 +2734,7 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (entity_is_task(se)) trace_sched_load_avg_task(task_of(se), &se->avg); + trace_sched_load_avg_cpu(cpu, cfs_rq); } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) From dd2460f3872f1b89839aedb21b999e802542c32b Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 30 Apr 2015 17:35:23 +0100 Subject: [PATCH 159/607] DEBUG: sched,cpufreq: add cpu_capacity change tracepoint This is useful when we want to compare cpu utilization and cpu curr capacity side by side. Signed-off-by: Juri Lelli --- drivers/cpufreq/cpufreq.c | 4 ++++ include/linux/sched.h | 2 ++ include/trace/events/power.h | 7 +++++++ kernel/sched/fair.c | 11 +++++++++++ kernel/sched/sched.h | 11 ----------- 5 files changed, 24 insertions(+), 11 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 5ca8e9517aae..0f30d11fb528 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -29,6 +29,7 @@ #include #include #include +#include #include static LIST_HEAD(cpufreq_policy_list); @@ -473,6 +474,7 @@ static void cpufreq_notify_post_transition(struct cpufreq_policy *policy, void cpufreq_freq_transition_begin(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) { + int cpu; /* * Catch double invocations of _begin() which lead to self-deadlock. @@ -501,6 +503,8 @@ wait: spin_unlock(&policy->transition_lock); scale_freq_capacity(policy, freqs); + for_each_cpu(cpu, policy->cpus) + trace_cpu_capacity(capacity_curr_of(cpu), cpu); cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE); } diff --git a/include/linux/sched.h b/include/linux/sched.h index 1460c6a4f65f..b9b11b2dbabd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1046,6 +1046,8 @@ struct sched_group_energy { struct capacity_state *cap_states; /* ptr to capacity state array */ }; +unsigned long capacity_curr_of(int cpu); + struct sched_group; struct sched_domain { diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 284244ebfe8d..f4be04e44252 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -120,6 +120,13 @@ DEFINE_EVENT(cpu, cpu_frequency, TP_ARGS(frequency, cpu_id) ); +DEFINE_EVENT(cpu, cpu_capacity, + + TP_PROTO(unsigned int capacity, unsigned int cpu_id), + + TP_ARGS(capacity, cpu_id) +); + TRACE_EVENT(device_pm_callback_start, TP_PROTO(struct device *dev, const char *pm_ops, int event), diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b5ea9ec335d8..9b74b86aba3a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4698,6 +4698,17 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. 
+ */ +unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 983c132af11b..db3f3db9849c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1512,17 +1512,6 @@ static inline unsigned long cpu_util(int cpu) return __cpu_util(cpu, 0); } -/* - * Returns the current capacity of cpu after applying both - * cpu and freq scaling. - */ -static inline unsigned long capacity_curr_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig * - arch_scale_freq_capacity(NULL, cpu) - >> SCHED_CAPACITY_SHIFT; -} - #endif #ifdef CONFIG_CPU_FREQ_GOV_SCHED From 0a0f4aa12f6ec50e8b5d917b773e29702714acad Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 16:25:50 +0000 Subject: [PATCH 160/607] DEBUG: sched: add energy procfs interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch makes the energy data available via procfs. The related files are placed as sub-directory named 'energy' inside the /proc/sys/kernel/sched_domain/cpuX/domainY/groupZ directory for those cpu/domain/group tuples which have energy information. The following example depicts the contents of /proc/sys/kernel/sched_domain/cpu0/domain0/group[01] for a system which has energy information attached to domain level 0. ├── cpu0 │ ├── domain0 │ │ ├── busy_factor │ │ ├── busy_idx │ │ ├── cache_nice_tries │ │ ├── flags │ │ ├── forkexec_idx │ │ ├── group0 │ │ │ └── energy │ │ │ ├── cap_states │ │ │ ├── idle_states │ │ │ ├── nr_cap_states │ │ │ └── nr_idle_states │ │ ├── group1 │ │ │ └── energy │ │ │ ├── cap_states │ │ │ ├── idle_states │ │ │ ├── nr_cap_states │ │ │ └── nr_idle_states │ │ ├── idle_idx │ │ ├── imbalance_pct │ │ ├── max_interval │ │ ├── max_newidle_lb_cost │ │ ├── min_interval │ │ ├── name │ │ ├── newidle_idx │ │ └── wake_idx │ └── domain1 │ ├── busy_factor │ ├── busy_idx │ ├── cache_nice_tries │ ├── flags │ ├── forkexec_idx │ ├── idle_idx │ ├── imbalance_pct │ ├── max_interval │ ├── max_newidle_lb_cost │ ├── min_interval │ ├── name │ ├── newidle_idx │ └── wake_idx The files 'nr_idle_states' and 'nr_cap_states' contain a scalar value whereas 'idle_states' and 'cap_states' contain a vector of power consumption at this idle state respectively (compute capacity, power consumption) at this capacity state. 
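As a usage note (paths taken from the example layout above), reading /proc/sys/kernel/sched_domain/cpu0/domain0/group0/energy/nr_cap_states returns the number of capacity states of that group, while reading cap_states dumps the corresponding (capacity, power) value pairs and idle_states dumps the per-idle-state power values.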
Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 67 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 151ed9c62ad2..9e25e68747d7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5412,10 +5412,61 @@ set_table_entry(struct ctl_table *entry, } } +static struct ctl_table * +sd_alloc_ctl_energy_table(struct sched_group_energy *sge) +{ + struct ctl_table *table = sd_alloc_ctl_entry(5); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "nr_idle_states", &sge->nr_idle_states, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[1], "idle_states", &sge->idle_states[0].power, + sge->nr_idle_states*sizeof(struct idle_state), 0644, + proc_doulongvec_minmax, false); + set_table_entry(&table[2], "nr_cap_states", &sge->nr_cap_states, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[3], "cap_states", &sge->cap_states[0].cap, + sge->nr_cap_states*sizeof(struct capacity_state), 0644, + proc_doulongvec_minmax, false); + + return table; +} + +static struct ctl_table * +sd_alloc_ctl_group_table(struct sched_group *sg) +{ + struct ctl_table *table = sd_alloc_ctl_entry(2); + + if (table == NULL) + return NULL; + + table->procname = kstrdup("energy", GFP_KERNEL); + table->mode = 0555; + table->child = sd_alloc_ctl_energy_table((struct sched_group_energy *)sg->sge); + + return table; +} + static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(14); + struct ctl_table *table; + unsigned int nr_entries = 14; + + int i = 0; + struct sched_group *sg = sd->groups; + + if (sg->sge) { + int nr_sgs = 0; + + do {} while (nr_sgs++, sg = sg->next, sg != sd->groups); + + nr_entries += nr_sgs; + } + + table = sd_alloc_ctl_entry(nr_entries); if (table == NULL) return NULL; @@ -5448,7 +5499,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(long), 0644, proc_doulongvec_minmax, false); set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[13] is terminator */ + sg = sd->groups; + if (sg->sge) { + char buf[32]; + struct ctl_table *entry = &table[13]; + + do { + snprintf(buf, 32, "group%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_group_table(sg); + } while (entry++, i++, sg = sg->next, sg != sd->groups); + } + /* &table[nr_entries-1] is terminator */ return table; } From c8a65d2e9a3a6ba9d4d7b1634701dd34017f4107 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 22 Jun 2015 13:49:07 +0100 Subject: [PATCH 161/607] DEBUG: schedtune: add tracepoint for SchedTune configuration update Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 21 +++++++++++++++++++++ kernel/sched/tune.c | 4 ++++ 2 files changed, 25 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index f58760c2f712..8ae43a2ebfda 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -654,6 +654,27 @@ TRACE_EVENT(sched_load_avg_cpu, TP_printk("cpu=%d load_avg=%lu util_avg=%lu", __entry->cpu, __entry->load_avg, __entry->util_avg) ); + +/* + * Tracepoint for sched_tune_config settings + */ +TRACE_EVENT(sched_tune_config, + + TP_PROTO(int boost), + + TP_ARGS(boost), + + TP_STRUCT__entry( + __field( int, boost ) + ), + + TP_fast_assign( + __entry->boost = boost; + ), + + TP_printk("boost=%d ", __entry->boost) +); 
+ #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 1a8ba5a6d99b..f5f4c57efb9e 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -7,6 +7,8 @@ #include #include +#include + #include "sched.h" unsigned int sysctl_sched_cfs_boost __read_mostly; @@ -392,6 +394,8 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, /* Update CPU boost */ schedtune_boostgroup_update(st->idx, st->boost); + trace_sched_tune_config(st->boost); + return 0; } From a09a25c5dfc81d08df7b3c2e76ea94a651c17ac2 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 22 Jun 2015 13:51:07 +0100 Subject: [PATCH 162/607] DEBUG: schedtune: add tracepoint for CPU boost signal Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 27 +++++++++++++++++++++++++++ kernel/sched/fair.c | 2 ++ 2 files changed, 29 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 8ae43a2ebfda..eb812071458b 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -675,6 +675,33 @@ TRACE_EVENT(sched_tune_config, TP_printk("boost=%d ", __entry->boost) ); +/* + * Tracepoint for accounting CPU boosted utilization + */ +TRACE_EVENT(sched_boost_cpu, + + TP_PROTO(int cpu, unsigned long util, unsigned long margin), + + TP_ARGS(cpu, util, margin), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( unsigned long, util ) + __field( unsigned long, margin ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->util = util; + __entry->margin = margin; + ), + + TP_printk("cpu=%d util=%lu margin=%lu", + __entry->cpu, + __entry->util, + __entry->margin) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9b74b86aba3a..3bc56e0ea1ab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5272,6 +5272,8 @@ boosted_cpu_util(int cpu) unsigned long util = cpu_util(cpu); unsigned long margin = schedtune_cpu_margin(util, cpu); + trace_sched_boost_cpu(cpu, util, margin); + return util + margin; } From fed95d9ef1b2674c4a2cda8b2445ca7ef3f6ca59 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 24 Jun 2015 15:36:08 +0100 Subject: [PATCH 163/607] DEBUG: schedtune: add tracepoint for schedtune_tasks_update() values Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 62 ++++++++++++++++++++++++++++++++++++ kernel/sched/tune.c | 12 ++++++- 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index eb812071458b..7ec9dcfc701a 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -702,6 +702,68 @@ TRACE_EVENT(sched_boost_cpu, __entry->margin) ); +/* + * Tracepoint for schedtune_tasks_update + */ +TRACE_EVENT(sched_tune_tasks_update, + + TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx, + unsigned int boost, unsigned int max_boost), + + TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, cpu ) + __field( int, tasks ) + __field( int, idx ) + __field( unsigned int, boost ) + __field( unsigned int, max_boost ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->cpu = cpu; + __entry->tasks = tasks; + __entry->idx = idx; + __entry->boost = boost; + __entry->max_boost = max_boost; + ), + + TP_printk("pid=%d comm=%s " + "cpu=%d 
tasks=%d idx=%d boost=%u max_boost=%u", + __entry->pid, __entry->comm, + __entry->cpu, __entry->tasks, __entry->idx, + __entry->boost, __entry->max_boost) +); + +/* + * Tracepoint for schedtune_boostgroup_update + */ +TRACE_EVENT(sched_tune_boostgroup_update, + + TP_PROTO(int cpu, int variation, int max_boost), + + TP_ARGS(cpu, variation, max_boost), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( int, variation ) + __field( int, max_boost ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->variation = variation; + __entry->max_boost = max_boost; + ), + + TP_printk("cpu=%d variation=%d max_boost=%d", + __entry->cpu, __entry->variation, __entry->max_boost) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index f5f4c57efb9e..7a434f2394e7 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -264,12 +264,18 @@ schedtune_boostgroup_update(int idx, int boost) /* Check if this update increase current max */ if (boost > cur_boost_max && bg->group[idx].tasks) { bg->boost_max = boost; + trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max); continue; } /* Check if this update has decreased current max */ - if (cur_boost_max == old_boost && old_boost > boost) + if (cur_boost_max == old_boost && old_boost > boost) { schedtune_cpu_update(cpu); + trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max); + continue; + } + + trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max); } return 0; @@ -293,6 +299,10 @@ schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) tasks = bg->group[idx].tasks; if (tasks == 1 || tasks == 0) schedtune_cpu_update(cpu); + + trace_sched_tune_tasks_update(p, cpu, tasks, idx, + bg->group[idx].boost, bg->boost_max); + } /* From e3b53389cf2313172034123a84d760984c4efd71 Mon Sep 17 00:00:00 2001 From: Lianwei Wang Date: Fri, 6 May 2016 00:17:57 -0700 Subject: [PATCH 164/607] Revert "drivers: power: Add watchdog timer to catch drivers which lockup during suspend." This reverts commit ad86cc8ad63229eeeba0628e99f2f59df55a25fd. Commit 70fea60d888d ("PM / Sleep: Detect device suspend/resume lockup...") added a suspend/resume watchdog timer to catch the lockup. Let's revert the duplicate one. Change-Id: Ic72a87432e27844155467817600adc6cf0c2209c Signed-off-by: Lianwei Wang Signed-off-by: Amit Pundir --- drivers/base/power/main.c | 43 --------------------------------------- 1 file changed, 43 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 6ed8b9326629..2f7a6e1b568c 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -62,12 +62,6 @@ struct suspend_stats suspend_stats; static DEFINE_MUTEX(dpm_list_mtx); static pm_message_t pm_transition; -static void dpm_drv_timeout(unsigned long data); -struct dpm_drv_wd_data { - struct device *dev; - struct task_struct *tsk; -}; - static int async_error; static char *pm_verb(int event) @@ -838,30 +832,6 @@ static void async_resume(void *data, async_cookie_t cookie) put_device(dev); } -/** - * dpm_drv_timeout - Driver suspend / resume watchdog handler - * @data: struct device which timed out - * - * Called when a driver has timed out suspending or resuming. 
- * There's not much we can do here to recover so - * BUG() out for a crash-dump - * - */ -static void dpm_drv_timeout(unsigned long data) -{ - struct dpm_drv_wd_data *wd_data = (void *)data; - struct device *dev = wd_data->dev; - struct task_struct *tsk = wd_data->tsk; - - printk(KERN_EMERG "**** DPM device timeout: %s (%s)\n", dev_name(dev), - (dev->driver ? dev->driver->name : "no driver")); - - printk(KERN_EMERG "dpm suspend stack:\n"); - show_stack(tsk, NULL); - - BUG(); -} - /** * dpm_resume - Execute "resume" callbacks for non-sysdev devices. * @state: PM transition of the system being carried out. @@ -1380,8 +1350,6 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) pm_callback_t callback = NULL; char *info = NULL; int error = 0; - struct timer_list timer; - struct dpm_drv_wd_data data; char suspend_abort[MAX_SUSPEND_ABORT_LEN]; DECLARE_DPM_WATCHDOG_ON_STACK(wd); @@ -1412,14 +1380,6 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) if (dev->power.syscore) goto Complete; - - data.dev = dev; - data.tsk = current; - init_timer_on_stack(&timer); - timer.expires = jiffies + HZ * 12; - timer.function = dpm_drv_timeout; - timer.data = (unsigned long)&data; - add_timer(&timer); if (dev->power.direct_complete) { if (pm_runtime_status_suspended(dev)) { @@ -1500,9 +1460,6 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) device_unlock(dev); dpm_watchdog_clear(&wd); - del_timer_sync(&timer); - destroy_timer_on_stack(&timer); - Complete: complete_all(&dev->power.completion); if (error) From e9253e876087accdab8edb763cfa4c3a48a8ce66 Mon Sep 17 00:00:00 2001 From: Jimmy Perchet Date: Mon, 9 May 2016 10:32:04 -0700 Subject: [PATCH 165/607] FROMLIST: wlcore: Disable filtering in AP role When you configure (set it up) a STA interface, the driver install a multicast filter. This is normal behavior, when one application subscribe to multicast address the filter is updated. When Access Point interface is configured, there is no filter installation and the "filter update" path is disabled in the driver. The problem happens when you switch an interface from STA type to AP type. The filter is installed but there are no means to update it. Change-Id: Ied22323af831575303abd548574918baa9852dd0 Signed-off-by: Dmitry Shmidt --- drivers/net/wireless/ti/wlcore/init.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/wireless/ti/wlcore/init.c b/drivers/net/wireless/ti/wlcore/init.c index e92f2639af2c..9fd3c6af0a61 100644 --- a/drivers/net/wireless/ti/wlcore/init.c +++ b/drivers/net/wireless/ti/wlcore/init.c @@ -549,6 +549,11 @@ static int wl12xx_init_ap_role(struct wl1271 *wl, struct wl12xx_vif *wlvif) { int ret; + /* Disable filtering */ + ret = wl1271_acx_group_address_tbl(wl, wlvif, false, NULL, 0); + if (ret < 0) + return ret; + ret = wl1271_acx_ap_max_tx_retry(wl, wlvif); if (ret < 0) return ret; From 92c4fc6f0946e846dee50b79b5dd061fe795dfaf Mon Sep 17 00:00:00 2001 From: Janis Danisevskis Date: Thu, 14 Apr 2016 13:57:03 +0100 Subject: [PATCH 166/607] UPSTREAM: procfs: fixes pthread cross-thread naming if !PR_DUMPABLE The PR_DUMPABLE flag causes the pid related paths of the proc file system to be owned by ROOT. The implementation of pthread_set/getname_np however needs access to /proc//task//comm. If PR_DUMPABLE is false this implementation is locked out. 
This patch installs a special permission function for the file "comm" that grants read and write access to all threads of the same group regardless of the ownership of the inode. For all other threads the function falls back to the generic inode permission check. Signed-off-by: Janis Danisevskis --- fs/proc/base.c | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 3b6962c52965..c21bb4beb0d7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3076,6 +3076,44 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) return 0; } +/* + * proc_tid_comm_permission is a special permission function exclusively + * used for the node /proc//task//comm. + * It bypasses generic permission checks in the case where a task of the same + * task group attempts to access the node. + * The rational behind this is that glibc and bionic access this node for + * cross thread naming (pthread_set/getname_np(!self)). However, if + * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0, + * which locks out the cross thread naming implementation. + * This function makes sure that the node is always accessible for members of + * same thread group. + */ +static int proc_tid_comm_permission(struct inode *inode, int mask) +{ + bool is_same_tgroup; + struct task_struct *task; + + task = get_proc_task(inode); + if (!task) + return -ESRCH; + is_same_tgroup = same_thread_group(current, task); + put_task_struct(task); + + if (likely(is_same_tgroup && !(mask & MAY_EXEC))) { + /* This file (/proc//task//comm) can always be + * read or written by the members of the corresponding + * thread group. + */ + return 0; + } + + return generic_permission(inode, mask); +} + +static const struct inode_operations proc_tid_comm_inode_operations = { + .permission = proc_tid_comm_permission, +}; + /* * Tasks */ @@ -3094,7 +3132,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif - REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), + NOD("comm", S_IFREG|S_IRUGO|S_IWUSR, + &proc_tid_comm_inode_operations, + &proc_pid_set_comm_operations, {}), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif From 239dd543887daf9337567e68e4de7f42efa53a48 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Wed, 11 May 2016 11:01:02 -0700 Subject: [PATCH 167/607] fiq_debugger: Add fiq_debugger.disable option This change allows to use same kernel image with different console options for uart and fiq_debugger. If fiq_debugger.disable will be set to 1/y/Y, fiq_debugger will not be initialized. 
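For example (illustrative invocation), booting the same image with fiq_debugger.disable=1 on the kernel command line makes fiq_debugger_init() bail out with -ENODEV, leaving the regular UART console driver free to claim the port.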
Change-Id: I71fda54f5f863d13b1437b1f909e52dd375d002d Signed-off-by: Dmitry Shmidt --- drivers/staging/android/fiq_debugger/fiq_debugger.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger.c b/drivers/staging/android/fiq_debugger/fiq_debugger.c index 0c558331a3d2..b132cff14f01 100644 --- a/drivers/staging/android/fiq_debugger/fiq_debugger.c +++ b/drivers/staging/android/fiq_debugger/fiq_debugger.c @@ -123,11 +123,13 @@ static bool initial_console_enable; #endif static bool fiq_kgdb_enable; +static bool fiq_debugger_disable; module_param_named(no_sleep, initial_no_sleep, bool, 0644); module_param_named(debug_enable, initial_debug_enable, bool, 0644); module_param_named(console_enable, initial_console_enable, bool, 0644); module_param_named(kgdb_enable, fiq_kgdb_enable, bool, 0644); +module_param_named(disable, fiq_debugger_disable, bool, 0644); #ifdef CONFIG_FIQ_DEBUGGER_WAKEUP_IRQ_ALWAYS_ON static inline @@ -1230,6 +1232,10 @@ int fiq_debugger_uart_overlay(void) static int __init fiq_debugger_init(void) { + if (fiq_debugger_disable) { + pr_err("serial_debugger: disabled\n"); + return -ENODEV; + } #if defined(CONFIG_FIQ_DEBUGGER_CONSOLE) fiq_debugger_tty_init(); #endif From cc0063b8eb44f08bb899fb47d562c67d73b7245b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 12 May 2016 11:17:52 -0700 Subject: [PATCH 168/607] xt_qtaguid: Fix panic caused by processing non-full socket. In an issue very similar to 4e461c777e3 (xt_qtaguid: Fix panic caused by synack processing), we were seeing panics on occasion in testing. In this case, it was the same issue, but caused by a different call path, as the sk being returned from qtaguid_find_sk() was not a full socket. Resulting in the sk->sk_socket deref to fail. This patch adds an extra check to ensure the sk being retuned is a full socket, and if not it returns NULL. Reported-by: Milosz Wasilewski Signed-off-by: John Stultz --- net/netfilter/xt_qtaguid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index 822dc3c3bce1..e2e7d54f9bb1 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1606,7 +1606,7 @@ static struct sock *qtaguid_find_sk(const struct sk_buff *skb, * When in TCP_TIME_WAIT the sk is not a "struct sock" but * "struct inet_timewait_sock" which is missing fields. */ - if (sk->sk_state == TCP_TIME_WAIT) { + if (!sk_fullsock(sk) || sk->sk_state == TCP_TIME_WAIT) { sock_gen_put(sk); sk = NULL; } From d495067ae176cfd7a027d11c956fbacad009a4c7 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 16 May 2016 13:17:28 +0530 Subject: [PATCH 169/607] Revert "cpufreq: interactive: build fixes for 4.4" This reverts commit bc68f6c4efbd4ddbb15817203f18b7941d9ffd52. This build fix broke the Interactive Gov at runtime with duplicate sysfs entry warnings at boot time. We no longer need to this create/destroy cpufreq sysfs entry at run time on need basis thanks to upstream commit 8eec1020f0c0 (cpufreq: create cpu/cpufreq at boot time) which creates it at boot time. Hence drop this build fix. 
Signed-off-by: Amit Pundir --- drivers/cpufreq/cpufreq_interactive.c | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index bd83be39f17b..7f846b06e024 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -116,28 +116,6 @@ struct cpufreq_interactive_tunables { bool io_is_busy; }; -/* - * HACK: FIXME: Bring back cpufreq_{get,put}_global_kobject() - * definition removed by upstream commit 8eec1020f0c0 "cpufreq: - * create cpu/cpufreq at boot time" to fix build failures. - */ -static int cpufreq_global_kobject_usage; - -int cpufreq_get_global_kobject(void) -{ - if (!cpufreq_global_kobject_usage++) - return kobject_add(cpufreq_global_kobject, - &cpu_subsys.dev_root->kobj, "%s", "cpufreq"); - - return 0; -} - -void cpufreq_put_global_kobject(void) -{ - if (!--cpufreq_global_kobject_usage) - kobject_del(cpufreq_global_kobject); -} - /* For cases where we have single governor instance for system */ static struct cpufreq_interactive_tunables *common_tunables; From 287d9d0efbd378dc0d7919bc9820737c869f5c0b Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 16 May 2016 13:25:35 +0530 Subject: [PATCH 170/607] cpufreq: interactive: drop cpufreq_{get,put}_global_kobject func calls Upstream commit 8eec1020f0c0 (cpufreq: create cpu/cpufreq at boot time) make sure that cpufreq sysfs entry get created at boot time, and there is no need to create/destroy it on need basis anymore. So drop deprecated cpufreq_{get,put}_global_kobject function calls which otherwise result in following compilation errors: drivers/cpufreq/cpufreq_interactive.c: In function 'cpufreq_governor_interactive': drivers/cpufreq/cpufreq_interactive.c:1187:4: error: implicit declaration of function 'cpufreq_get_global_kobject' [-Werror=implicit-function-declaration] WARN_ON(cpufreq_get_global_kobject()); ^ drivers/cpufreq/cpufreq_interactive.c:1197:5: error: implicit declaration of function 'cpufreq_put_global_kobject'[-Werror=implicit-function-declaration] cpufreq_put_global_kobject(); ^ Signed-off-by: Amit Pundir --- drivers/cpufreq/cpufreq_interactive.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index 7f846b06e024..f2929e628820 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -1184,7 +1184,6 @@ static int cpufreq_governor_interactive(struct cpufreq_policy *policy, policy->governor_data = tunables; if (!have_governor_per_policy()) { common_tunables = tunables; - WARN_ON(cpufreq_get_global_kobject()); } rc = sysfs_create_group(get_governor_parent_kobj(policy), @@ -1194,7 +1193,6 @@ static int cpufreq_governor_interactive(struct cpufreq_policy *policy, policy->governor_data = NULL; if (!have_governor_per_policy()) { common_tunables = NULL; - cpufreq_put_global_kobject(); } return rc; } @@ -1218,9 +1216,6 @@ static int cpufreq_governor_interactive(struct cpufreq_policy *policy, sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr()); - if (!have_governor_per_policy()) - cpufreq_put_global_kobject(); - kfree(tunables); common_tunables = NULL; } From 9a326f6c084b089765ed8ee903b76a5e8251b0b3 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 17 May 2016 16:06:17 +0530 Subject: [PATCH 171/607] Revert "drivers: power: use 'current' instead of 'get_current()'" This reverts commit e1b5d103894d097fb630aebc3c1fdaf257f7c9bb. 
This patch fixed the aosp commit ad86cc8ad632 (drivers: power: Add watchdog timer to catch drivers which lockup during suspend.), which we dropped in Change Id Ic72a87432e27844155467817600adc6cf0c2209c, so we no longer need this fix. A part of this patch is already reverted in above mentioned Change Id. Signed-off-by: Amit Pundir --- drivers/base/power/main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 2f7a6e1b568c..5ea529165362 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -35,8 +35,6 @@ #include #include -#include - #include "../base.h" #include "power.h" From 1f95b0e9be77e4d72a6352b6fc486d0678def27f Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sun, 10 Jan 2016 22:40:55 -0800 Subject: [PATCH 172/607] UPSTREAM: tty: Fix unsafe ldisc reference via ioctl(TIOCGETD) (cherry pick from commit 5c17c861a357e9458001f021a7afa7aab9937439) ioctl(TIOCGETD) retrieves the line discipline id directly from the ldisc because the line discipline id (c_line) in termios is untrustworthy; userspace may have set termios via ioctl(TCSETS*) without actually changing the line discipline via ioctl(TIOCSETD). However, directly accessing the current ldisc via tty->ldisc is unsafe; the ldisc ptr dereferenced may be stale if the line discipline is changing via ioctl(TIOCSETD) or hangup. Wait for the line discipline reference (just like read() or write()) to retrieve the "current" line discipline id. Cc: Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman Bug: 28409131 Change-Id: I6774bd883a2e48bbe020486c72c42fb410e3f98a --- drivers/tty/tty_io.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index bcc8e1e8bb72..ab15b937fedb 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2652,6 +2652,28 @@ static int tiocsetd(struct tty_struct *tty, int __user *p) return ret; } +/** + * tiocgetd - get line discipline + * @tty: tty device + * @p: pointer to user data + * + * Retrieves the line discipline id directly from the ldisc. + * + * Locking: waits for ldisc reference (in case the line discipline + * is changing or the tty is being hungup) + */ + +static int tiocgetd(struct tty_struct *tty, int __user *p) +{ + struct tty_ldisc *ld; + int ret; + + ld = tty_ldisc_ref_wait(tty); + ret = put_user(ld->ops->num, p); + tty_ldisc_deref(ld); + return ret; +} + /** * send_break - performed time break * @tty: device to break on @@ -2878,7 +2900,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case TIOCGSID: return tiocgsid(tty, real_tty, p); case TIOCGETD: - return put_user(tty->ldisc->ops->num, (int __user *)p); + return tiocgetd(tty, p); case TIOCSETD: return tiocsetd(tty, p); case TIOCVHANGUP: From 8f2aea82ca669025419a4c9ab23b94f97c1ee170 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 4 Apr 2016 14:15:23 -0400 Subject: [PATCH 173/607] =?UTF-8?q?UPSTREAM:=20mac80211:=20fix=20"warning:?= =?UTF-8?q?=20=E2=80=98target=5Fmetric=E2=80=99=20may=20be=20used=20uninit?= =?UTF-8?q?ialized"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (This cherry-picks b4201cc4fc6e1c57d6d306b1f787865043d60129 upstream) This fixes: net/mac80211/mesh_hwmp.c:603:26: warning: ‘target_metric’ may be used uninitialized in this function target_metric is only consumed when reply = true so no bug exists here, but not all versions of gcc realize it. 
Initialize to 0 to remove the warning. Change-Id: I13923fda9d314f48196c29e4354133dfe01f5abd Signed-off-by: Jeff Mahoney Signed-off-by: Johannes Berg [jstultz: Cherry-picked to android-4.4] Signed-off-by: John Stultz --- net/mac80211/mesh_hwmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index c6be0b4f4058..b6dc2d7cd650 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -530,7 +530,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, const u8 *target_addr, *orig_addr; const u8 *da; u8 target_flags, ttl, flags; - u32 orig_sn, target_sn, lifetime, target_metric; + u32 orig_sn, target_sn, lifetime, target_metric = 0; bool reply = false; bool forward = true; bool root_is_gate; From cba8d1f729d830a200055f0a4bab51883304ac9c Mon Sep 17 00:00:00 2001 From: Winter Wang Date: Fri, 20 May 2016 11:05:00 +0800 Subject: [PATCH 174/607] ANDROID: usb: gadget: f_midi: set fi->f to NULL when free f_midi function fi->f is set in f_midi's alloc_func, need to clean this to NULL in free_func, otherwise on ConfigFS's function switch, midi->usb_function it self is freed, fi->f will be a wild pointer and run into below kernel panic: --------------- [ 58.950628] Unable to handle kernel paging request at virtual address 63697664 [ 58.957869] pgd = c0004000 [ 58.960583] [63697664] *pgd=00000000 [ 58.964185] Internal error: Oops: 80000005 [#1] PREEMPT SMP ARM [ 58.970111] Modules linked in: [ 58.973191] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.1.15-03504-g34c857c-dirty #89 [ 58.981024] Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) [ 58.987557] task: c110bd70 ti: c1100000 task.ti: c1100000 [ 58.992962] PC is at 0x63697664 [ 58.996120] LR is at android_setup+0x78/0x138 <..snip..> [ 60.044980] 1fc0: ffffffff ffffffff c1000684 00000000 00000000 c108ecd0 c11f7294 c11039c0 [ 60.053181] 1fe0: c108eccc c110d148 1000406a 412fc09a 00000000 1000807c 00000000 00000000 [ 60.061420] [] (android_setup) from [] (udc_irq+0x758/0x1034) [ 60.068951] [] (udc_irq) from [] (handle_irq_event_percpu+0x50/0x254) [ 60.077165] [] (handle_irq_event_percpu) from [] (handle_irq_event+0x3c/0x5c) [ 60.086072] [] (handle_irq_event) from [] (handle_fasteoi_irq+0xe0/0x198) [ 60.094630] [] (handle_fasteoi_irq) from [] (generic_handle_irq+0x2c/0x3c) [ 60.103271] [] (generic_handle_irq) from [] (__handle_domain_irq+0x7c/0xec) [ 60.112000] [] (__handle_domain_irq) from [] (gic_handle_irq+0x24/0x5c) -------------- Signed-off-by: Winter Wang --- drivers/usb/gadget/function/f_midi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/gadget/function/f_midi.c b/drivers/usb/gadget/function/f_midi.c index 8a0e7f988d25..847f70363477 100644 --- a/drivers/usb/gadget/function/f_midi.c +++ b/drivers/usb/gadget/function/f_midi.c @@ -1140,6 +1140,7 @@ static void f_midi_free(struct usb_function *f) for (i = opts->in_ports - 1; i >= 0; --i) kfree(midi->in_port[i]); kfree(midi); + opts->func_inst.f = NULL; --opts->refcnt; mutex_unlock(&opts->lock); } From c11dd134ea73ff9ee6bdb2807ac064cef2f284bd Mon Sep 17 00:00:00 2001 From: Haojian Zhuang Date: Fri, 22 Apr 2016 17:23:29 +0800 Subject: [PATCH 175/607] arm64: add option to build Image-dtb Some bootloaders couldn't decompress Image.gz-dtb. 
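As a usage example (the exact invocation flags are illustrative), such bootloaders can instead be handed a single concatenated image built with "make ARCH=arm64 Image-dtb", which simply concatenates the selected DTBs onto the uncompressed Image, mirroring the existing Image.gz-dtb target.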
Change-Id: I698cd0c4ee6894e8d0655d88f3ecf4826c28a645 Signed-off-by: Haojian Zhuang Signed-off-by: John Stultz Signed-off-by: Dmitry Shmidt --- arch/arm64/Makefile | 2 +- arch/arm64/boot/Makefile | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index a5c84594f6ad..4e77fd165f38 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -106,7 +106,7 @@ dtbs: prepare scripts dtbs_install: $(Q)$(MAKE) $(dtbinst)=$(boot)/dts -Image.gz-dtb: vmlinux scripts dtbs +Image-dtb Image.gz-dtb: vmlinux scripts dtbs $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ PHONY += vdso_install diff --git a/arch/arm64/boot/Makefile b/arch/arm64/boot/Makefile index ddd405f88344..7ab8e74cd83a 100644 --- a/arch/arm64/boot/Makefile +++ b/arch/arm64/boot/Makefile @@ -32,6 +32,9 @@ $(obj)/Image: vmlinux FORCE $(obj)/Image.bz2: $(obj)/Image FORCE $(call if_changed,bzip2) +$(obj)/Image-dtb: $(obj)/Image $(DTB_OBJS) FORCE + $(call if_changed,cat) + $(obj)/Image.gz: $(obj)/Image FORCE $(call if_changed,gzip) From 9f6f7a27247cbfa50c1c9ebdd8417e2006a18100 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Tue, 24 May 2016 14:41:57 -0700 Subject: [PATCH 176/607] ARM64: Ignore Image-dtb from git point of view Change-Id: I5bbf1db90f28ea956383b4a5d91ad508eea656dc Signed-off-by: Dmitry Shmidt --- arch/arm64/boot/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/.gitignore b/arch/arm64/boot/.gitignore index eb3551131b1e..34e35209fc2e 100644 --- a/arch/arm64/boot/.gitignore +++ b/arch/arm64/boot/.gitignore @@ -1,3 +1,4 @@ Image +Image-dtb Image.gz Image.gz-dtb From dbe3b633a70b928e8f7ca92c9eb6751ea4bd71bc Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 26 May 2016 12:57:56 +0530 Subject: [PATCH 177/607] Revert "arm: dcc_tty: fix armv6 dcc tty build failure" This reverts commit dfc1d4be88597141f5ad9d39908c13944d209009. Drop AOSP's "armv6 dcc tty driver" in favor of upstream DCC driver for ARMv6/v7 16c63f8ea49c (drivers: char: hvc: add arm JTAG DCC console support) and for ARMv8 4cad4c57e0b3 (ARM64: TTY: hvc_dcc: Add support for ARM64 dcc). Change-Id: I8110a4fd649b8ac1ec9bfac00255c1214135e4b2 Signed-off-by: Amit Pundir --- drivers/char/dcc_tty.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/dcc_tty.c b/drivers/char/dcc_tty.c index 0a62d410286f..a787accdcb14 100644 --- a/drivers/char/dcc_tty.c +++ b/drivers/char/dcc_tty.c @@ -26,7 +26,7 @@ MODULE_DESCRIPTION("DCC TTY Driver"); MODULE_LICENSE("GPL"); MODULE_VERSION("1.0"); -DEFINE_SPINLOCK(g_dcc_tty_lock); +static spinlock_t g_dcc_tty_lock = SPIN_LOCK_UNLOCKED; static struct hrtimer g_dcc_timer; static char g_dcc_buffer[16]; static int g_dcc_buffer_head; @@ -80,8 +80,8 @@ static void dcc_poll_locked(void) ); if (rch >= 0) { ch = rch; - tty_insert_flip_string(g_dcc_tty->port, &ch, 1); - tty_flip_buffer_push(g_dcc_tty->port); + tty_insert_flip_string(g_dcc_tty, &ch, 1); + tty_flip_buffer_push(g_dcc_tty); } } From fd2e341b4741a29e04f2010ced14edf5d2aa3ee0 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 26 May 2016 12:58:21 +0530 Subject: [PATCH 178/607] Revert "armv6 dcc tty driver" This reverts commit 97312429c2bef1bf8055d01b35cf12028f60ef62. Drop AOSP's "armv6 dcc tty driver" in favor of upstream DCC driver for ARMv6/v7 16c63f8ea49c (drivers: char: hvc: add arm JTAG DCC console support) and for ARMv8 4cad4c57e0b3 (ARM64: TTY: hvc_dcc: Add support for ARM64 dcc). 
Change-Id: I0ca651ef2d854fff03cee070524fe1e3971b6d8f Signed-off-by: Amit Pundir --- drivers/char/Kconfig | 4 - drivers/char/Makefile | 1 - drivers/char/dcc_tty.c | 326 ----------------------------------------- 3 files changed, 331 deletions(-) delete mode 100644 drivers/char/dcc_tty.c diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 29dbe0fbdf8b..a043107da2af 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -588,10 +588,6 @@ config DEVPORT depends on ISA || PCI default y -config DCC_TTY - tristate "DCC tty driver" - depends on ARM - source "drivers/s390/char/Kconfig" config TILE_SROM diff --git a/drivers/char/Makefile b/drivers/char/Makefile index 7671551e3196..d8a7579300d2 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -53,7 +53,6 @@ obj-$(CONFIG_PCMCIA) += pcmcia/ obj-$(CONFIG_HANGCHECK_TIMER) += hangcheck-timer.o obj-$(CONFIG_TCG_TPM) += tpm/ -obj-$(CONFIG_DCC_TTY) += dcc_tty.o obj-$(CONFIG_PS3_FLASH) += ps3flash.o obj-$(CONFIG_JS_RTC) += js-rtc.o diff --git a/drivers/char/dcc_tty.c b/drivers/char/dcc_tty.c deleted file mode 100644 index a787accdcb14..000000000000 --- a/drivers/char/dcc_tty.c +++ /dev/null @@ -1,326 +0,0 @@ -/* drivers/char/dcc_tty.c - * - * Copyright (C) 2007 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -MODULE_DESCRIPTION("DCC TTY Driver"); -MODULE_LICENSE("GPL"); -MODULE_VERSION("1.0"); - -static spinlock_t g_dcc_tty_lock = SPIN_LOCK_UNLOCKED; -static struct hrtimer g_dcc_timer; -static char g_dcc_buffer[16]; -static int g_dcc_buffer_head; -static int g_dcc_buffer_count; -static unsigned g_dcc_write_delay_usecs = 1; -static struct tty_driver *g_dcc_tty_driver; -static struct tty_struct *g_dcc_tty; -static int g_dcc_tty_open_count; - -static void dcc_poll_locked(void) -{ - char ch; - int rch; - int written; - - while (g_dcc_buffer_count) { - ch = g_dcc_buffer[g_dcc_buffer_head]; - asm( - "mrc 14, 0, r15, c0, c1, 0\n" - "mcrcc 14, 0, %1, c0, c5, 0\n" - "movcc %0, #1\n" - "movcs %0, #0\n" - : "=r" (written) - : "r" (ch) - ); - if (written) { - if (ch == '\n') - g_dcc_buffer[g_dcc_buffer_head] = '\r'; - else { - g_dcc_buffer_head = (g_dcc_buffer_head + 1) % ARRAY_SIZE(g_dcc_buffer); - g_dcc_buffer_count--; - if (g_dcc_tty) - tty_wakeup(g_dcc_tty); - } - g_dcc_write_delay_usecs = 1; - } else { - if (g_dcc_write_delay_usecs > 0x100) - break; - g_dcc_write_delay_usecs <<= 1; - udelay(g_dcc_write_delay_usecs); - } - } - - if (g_dcc_tty && !test_bit(TTY_THROTTLED, &g_dcc_tty->flags)) { - asm( - "mrc 14, 0, %0, c0, c1, 0\n" - "tst %0, #(1 << 30)\n" - "moveq %0, #-1\n" - "mrcne 14, 0, %0, c0, c5, 0\n" - : "=r" (rch) - ); - if (rch >= 0) { - ch = rch; - tty_insert_flip_string(g_dcc_tty, &ch, 1); - tty_flip_buffer_push(g_dcc_tty); - } - } - - - if (g_dcc_buffer_count) - hrtimer_start(&g_dcc_timer, ktime_set(0, g_dcc_write_delay_usecs * NSEC_PER_USEC), HRTIMER_MODE_REL); - else - hrtimer_start(&g_dcc_timer, ktime_set(0, 20 * NSEC_PER_MSEC), HRTIMER_MODE_REL); -} - -static int dcc_tty_open(struct tty_struct * tty, 
struct file * filp) -{ - int ret; - unsigned long irq_flags; - - spin_lock_irqsave(&g_dcc_tty_lock, irq_flags); - if (g_dcc_tty == NULL || g_dcc_tty == tty) { - g_dcc_tty = tty; - g_dcc_tty_open_count++; - ret = 0; - } else - ret = -EBUSY; - spin_unlock_irqrestore(&g_dcc_tty_lock, irq_flags); - - printk("dcc_tty_open, tty %p, f_flags %x, returned %d\n", tty, filp->f_flags, ret); - - return ret; -} - -static void dcc_tty_close(struct tty_struct * tty, struct file * filp) -{ - printk("dcc_tty_close, tty %p, f_flags %x\n", tty, filp->f_flags); - if (g_dcc_tty == tty) { - if (--g_dcc_tty_open_count == 0) - g_dcc_tty = NULL; - } -} - -static int dcc_write(const unsigned char *buf_start, int count) -{ - const unsigned char *buf = buf_start; - unsigned long irq_flags; - int copy_len; - int space_left; - int tail; - - if (count < 1) - return 0; - - spin_lock_irqsave(&g_dcc_tty_lock, irq_flags); - do { - tail = (g_dcc_buffer_head + g_dcc_buffer_count) % ARRAY_SIZE(g_dcc_buffer); - copy_len = ARRAY_SIZE(g_dcc_buffer) - tail; - space_left = ARRAY_SIZE(g_dcc_buffer) - g_dcc_buffer_count; - if (copy_len > space_left) - copy_len = space_left; - if (copy_len > count) - copy_len = count; - memcpy(&g_dcc_buffer[tail], buf, copy_len); - g_dcc_buffer_count += copy_len; - buf += copy_len; - count -= copy_len; - if (copy_len < count && copy_len < space_left) { - space_left -= copy_len; - copy_len = count; - if (copy_len > space_left) { - copy_len = space_left; - } - memcpy(g_dcc_buffer, buf, copy_len); - buf += copy_len; - count -= copy_len; - g_dcc_buffer_count += copy_len; - } - dcc_poll_locked(); - space_left = ARRAY_SIZE(g_dcc_buffer) - g_dcc_buffer_count; - } while(count && space_left); - spin_unlock_irqrestore(&g_dcc_tty_lock, irq_flags); - return buf - buf_start; -} - -static int dcc_tty_write(struct tty_struct * tty, const unsigned char *buf, int count) -{ - int ret; - /* printk("dcc_tty_write %p, %d\n", buf, count); */ - ret = dcc_write(buf, count); - if (ret != count) - printk("dcc_tty_write %p, %d, returned %d\n", buf, count, ret); - return ret; -} - -static int dcc_tty_write_room(struct tty_struct *tty) -{ - int space_left; - unsigned long irq_flags; - - spin_lock_irqsave(&g_dcc_tty_lock, irq_flags); - space_left = ARRAY_SIZE(g_dcc_buffer) - g_dcc_buffer_count; - spin_unlock_irqrestore(&g_dcc_tty_lock, irq_flags); - return space_left; -} - -static int dcc_tty_chars_in_buffer(struct tty_struct *tty) -{ - int ret; - asm( - "mrc 14, 0, %0, c0, c1, 0\n" - "mov %0, %0, LSR #30\n" - "and %0, %0, #1\n" - : "=r" (ret) - ); - return ret; -} - -static void dcc_tty_unthrottle(struct tty_struct * tty) -{ - unsigned long irq_flags; - - spin_lock_irqsave(&g_dcc_tty_lock, irq_flags); - dcc_poll_locked(); - spin_unlock_irqrestore(&g_dcc_tty_lock, irq_flags); -} - -static enum hrtimer_restart dcc_tty_timer_func(struct hrtimer *timer) -{ - unsigned long irq_flags; - - spin_lock_irqsave(&g_dcc_tty_lock, irq_flags); - dcc_poll_locked(); - spin_unlock_irqrestore(&g_dcc_tty_lock, irq_flags); - return HRTIMER_NORESTART; -} - -void dcc_console_write(struct console *co, const char *b, unsigned count) -{ -#if 1 - dcc_write(b, count); -#else - /* blocking printk */ - while (count > 0) { - int written; - written = dcc_write(b, count); - if (written) { - b += written; - count -= written; - } - } -#endif -} - -static struct tty_driver *dcc_console_device(struct console *c, int *index) -{ - *index = 0; - return g_dcc_tty_driver; -} - -static int __init dcc_console_setup(struct console *co, char *options) -{ - if (co->index != 0) 
- return -ENODEV; - return 0; -} - - -static struct console dcc_console = -{ - .name = "ttyDCC", - .write = dcc_console_write, - .device = dcc_console_device, - .setup = dcc_console_setup, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - -static struct tty_operations dcc_tty_ops = { - .open = dcc_tty_open, - .close = dcc_tty_close, - .write = dcc_tty_write, - .write_room = dcc_tty_write_room, - .chars_in_buffer = dcc_tty_chars_in_buffer, - .unthrottle = dcc_tty_unthrottle, -}; - -static int __init dcc_tty_init(void) -{ - int ret; - - hrtimer_init(&g_dcc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - g_dcc_timer.function = dcc_tty_timer_func; - - g_dcc_tty_driver = alloc_tty_driver(1); - if (!g_dcc_tty_driver) { - printk(KERN_ERR "dcc_tty_probe: alloc_tty_driver failed\n"); - ret = -ENOMEM; - goto err_alloc_tty_driver_failed; - } - g_dcc_tty_driver->owner = THIS_MODULE; - g_dcc_tty_driver->driver_name = "dcc"; - g_dcc_tty_driver->name = "ttyDCC"; - g_dcc_tty_driver->major = 0; // auto assign - g_dcc_tty_driver->minor_start = 0; - g_dcc_tty_driver->type = TTY_DRIVER_TYPE_SERIAL; - g_dcc_tty_driver->subtype = SERIAL_TYPE_NORMAL; - g_dcc_tty_driver->init_termios = tty_std_termios; - g_dcc_tty_driver->flags = TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV; - tty_set_operations(g_dcc_tty_driver, &dcc_tty_ops); - ret = tty_register_driver(g_dcc_tty_driver); - if (ret) { - printk(KERN_ERR "dcc_tty_probe: tty_register_driver failed, %d\n", ret); - goto err_tty_register_driver_failed; - } - tty_register_device(g_dcc_tty_driver, 0, NULL); - - register_console(&dcc_console); - hrtimer_start(&g_dcc_timer, ktime_set(0, 0), HRTIMER_MODE_REL); - - return 0; - -err_tty_register_driver_failed: - put_tty_driver(g_dcc_tty_driver); - g_dcc_tty_driver = NULL; -err_alloc_tty_driver_failed: - return ret; -} - -static void __exit dcc_tty_exit(void) -{ - int ret; - - tty_unregister_device(g_dcc_tty_driver, 0); - ret = tty_unregister_driver(g_dcc_tty_driver); - if (ret < 0) { - printk(KERN_ERR "dcc_tty_remove: tty_unregister_driver failed, %d\n", ret); - } else { - put_tty_driver(g_dcc_tty_driver); - } - g_dcc_tty_driver = NULL; -} - -module_init(dcc_tty_init); -module_exit(dcc_tty_exit); - - From ef55b4532be92094b0a1f03f2e1a50d4b49a30ae Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2016 10:18:51 +0100 Subject: [PATCH 179/607] UPSTREAM: arm64: module: fix relocation of movz instruction with negative immediate The test whether a movz instruction with a signed immediate should be turned into a movn instruction (i.e., when the immediate is negative) is flawed, since the value of imm is always positive. Also, the subsequent bounds check is incorrect since the limit update never executes, due to the fact that the imm_type comparison will always be false for negative signed immediates. Let's fix this by performing the sign test on sval directly, and replacing the bounds check with a simple comparison against U16_MAX. 
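As a compilable illustration of that description (a userspace sketch with invented helper names, not the kernel's actual encoding path), the corrected selection and bounds check look like this:

#include <stdint.h>
#include <errno.h>

#define U16_MAX 0xffffU

enum movw_kind { MOVW_MOVZ, MOVW_MOVN };

/* Pick MOVZ or MOVN for one 16-bit chunk of a signed relocation value.
 * The sign test is done on the full value (sval), and the overflow check
 * is a plain comparison of the resulting immediate against U16_MAX. */
int pick_movw(int64_t sval, int lsb, enum movw_kind *kind, uint16_t *imm16)
{
	/* Arithmetic shift so the sign bits are preserved for negative values. */
	uint64_t imm = (uint64_t)(sval >> lsb);

	if (sval >= 0) {
		*kind = MOVW_MOVZ;
	} else {
		*kind = MOVW_MOVN;
		imm = ~imm;		/* the MOVN instruction re-inverts it */
	}

	if (imm > U16_MAX)		/* does not fit in the 16-bit field */
		return -ERANGE;

	*imm16 = (uint16_t)imm;
	return 0;
}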
Change-Id: I9ad3d8bfd91e5fdc6434b1be6c3062dfec193176 Signed-off-by: Ard Biesheuvel [will: tidied up use of sval, renamed MOVK enum value to MOVKZ] Signed-off-by: Will Deacon --- arch/arm64/kernel/module.c | 51 ++++++++++++++------------------------ 1 file changed, 18 insertions(+), 33 deletions(-) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index f4bc779e62e8..03464ab0fff2 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -30,9 +30,6 @@ #include #include -#define AARCH64_INSN_IMM_MOVNZ AARCH64_INSN_IMM_MAX -#define AARCH64_INSN_IMM_MOVK AARCH64_INSN_IMM_16 - void *module_alloc(unsigned long size) { void *p; @@ -110,16 +107,20 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) return 0; } +enum aarch64_insn_movw_imm_type { + AARCH64_INSN_IMM_MOVNZ, + AARCH64_INSN_IMM_MOVKZ, +}; + static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val, - int lsb, enum aarch64_insn_imm_type imm_type) + int lsb, enum aarch64_insn_movw_imm_type imm_type) { - u64 imm, limit = 0; + u64 imm; s64 sval; u32 insn = le32_to_cpu(*(u32 *)place); sval = do_reloc(op, place, val); - sval >>= lsb; - imm = sval & 0xffff; + imm = sval >> lsb; if (imm_type == AARCH64_INSN_IMM_MOVNZ) { /* @@ -128,7 +129,7 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val, * immediate is less than zero. */ insn &= ~(3 << 29); - if ((s64)imm >= 0) { + if (sval >= 0) { /* >=0: Set the instruction to MOVZ (opcode 10b). */ insn |= 2 << 29; } else { @@ -140,29 +141,13 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val, */ imm = ~imm; } - imm_type = AARCH64_INSN_IMM_MOVK; } /* Update the instruction with the new encoding. */ - insn = aarch64_insn_encode_immediate(imm_type, insn, imm); + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm); *(u32 *)place = cpu_to_le32(insn); - /* Shift out the immediate field. */ - sval >>= 16; - - /* - * For unsigned immediates, the overflow check is straightforward. - * For signed immediates, the sign bit is actually the bit past the - * most significant bit of the field. - * The AARCH64_INSN_IMM_16 immediate type is unsigned. - */ - if (imm_type != AARCH64_INSN_IMM_16) { - sval++; - limit++; - } - - /* Check the upper bits depending on the sign of the immediate. */ - if ((u64)sval > limit) + if (imm > U16_MAX) return -ERANGE; return 0; @@ -267,25 +252,25 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, overflow_check = false; case R_AARCH64_MOVW_UABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, - AARCH64_INSN_IMM_16); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_UABS_G1_NC: overflow_check = false; case R_AARCH64_MOVW_UABS_G1: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, - AARCH64_INSN_IMM_16); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_UABS_G2_NC: overflow_check = false; case R_AARCH64_MOVW_UABS_G2: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, - AARCH64_INSN_IMM_16); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_UABS_G3: /* We're using the top bits so we can't overflow. 
*/ overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 48, - AARCH64_INSN_IMM_16); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_SABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, @@ -302,7 +287,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_MOVW_PREL_G0_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, - AARCH64_INSN_IMM_MOVK); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_PREL_G0: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, @@ -311,7 +296,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_MOVW_PREL_G1_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, - AARCH64_INSN_IMM_MOVK); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_PREL_G1: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, @@ -320,7 +305,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_MOVW_PREL_G2_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, - AARCH64_INSN_IMM_MOVK); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_PREL_G2: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, From d8853fd2e900115923bb36a9137d6b3f1095ee70 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2016 10:18:52 +0100 Subject: [PATCH 180/607] UPSTREAM: arm64: module: avoid undefined shift behavior in reloc_data() Compilers may engage the improbability drive when encountering shifts by a distance that is a multiple of the size of the operand type. Since the required bounds check is very simple here, we can get rid of all the fuzzy masking, shifting and comparing, and use the documented bounds directly. Change-Id: Ibc1b73f4a630bc182deb6edfa7458b5e29ba9577 Reported-by: David Binderman Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon --- arch/arm64/kernel/module.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 03464ab0fff2..93e970231ca9 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -72,15 +72,18 @@ static u64 do_reloc(enum aarch64_reloc_op reloc_op, void *place, u64 val) static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) { - u64 imm_mask = (1 << len) - 1; s64 sval = do_reloc(op, place, val); switch (len) { case 16: *(s16 *)place = sval; + if (sval < S16_MIN || sval > U16_MAX) + return -ERANGE; break; case 32: *(s32 *)place = sval; + if (sval < S32_MIN || sval > U32_MAX) + return -ERANGE; break; case 64: *(s64 *)place = sval; @@ -89,21 +92,6 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) pr_err("Invalid length (%d) for data relocation\n", len); return 0; } - - /* - * Extract the upper value bits (including the sign bit) and - * shift them to bit 0. - */ - sval = (s64)(sval & ~(imm_mask >> 1)) >> (len - 1); - - /* - * Overflow has occurred if the value is not representable in - * len bits (i.e the bottom len bits are not sign-extended and - * the top bits are not all zero). - */ - if ((u64)(sval + 1) > 2) - return -ERANGE; - return 0; } From ebac2a3dcd458126b8c918b3315b1367b94f274d Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 19 Jan 2016 21:35:15 +0000 Subject: [PATCH 181/607] BACKPORT: perf tools: Document the perf sysctls perf_event_paranoid was only documented in source code and a perf error message. Copy the documentation from the error message to Documentation/sysctl/kernel.txt. 
perf_cpu_time_max_percent was already documented but missing from the list at the top, so add it there. Signed-off-by: Ben Hutchings Cc: Peter Zijlstra Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/20160119213515.GG2637@decadent.org.uk [ Remove reference to external Documentation file, provide info inline, as before ] Signed-off-by: Arnaldo Carvalho de Melo Bug: 29054680 Change-Id: I13e73cfb2ad761c94762d0c8196df7725abdf5c5 --- Documentation/sysctl/kernel.txt | 13 +++++++++++++ tools/perf/util/evsel.c | 15 +++++++++------ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index af70d1541d3a..88a2c8eb4502 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -58,6 +58,8 @@ show up in /proc/sys/kernel: - panic_on_stackoverflow - panic_on_unrecovered_nmi - panic_on_warn +- perf_cpu_time_max_percent +- perf_event_paranoid - pid_max - powersave-nap [ PPC only ] - printk @@ -624,6 +626,17 @@ allowed to execute. ============================================================== +perf_event_paranoid: + +Controls use of the performance events system by unprivileged +users (without CAP_SYS_ADMIN). The default value is 1. + + -1: Allow use of (almost) all events by all users +>=0: Disallow raw tracepoint access by users without CAP_IOC_LOCK +>=1: Disallow CPU event access by users without CAP_SYS_ADMIN +>=2: Disallow kernel profiling by users without CAP_SYS_ADMIN + +============================================================== pid_max: diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 397fb4ed3c97..d4913a46ee1c 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2313,12 +2313,15 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target, case EPERM: case EACCES: return scnprintf(msg, size, - "You may not have permission to collect %sstats.\n" - "Consider tweaking /proc/sys/kernel/perf_event_paranoid:\n" - " -1 - Not paranoid at all\n" - " 0 - Disallow raw tracepoint access for unpriv\n" - " 1 - Disallow cpu events for unpriv\n" - " 2 - Disallow kernel profiling for unpriv", + "You may not have permission to collect %sstats.\n\n" + "Consider tweaking /proc/sys/kernel/perf_event_paranoid,\n" + "which controls use of the performance events system by\n" + "unprivileged users (without CAP_SYS_ADMIN).\n\n" + "The default value is 1:\n\n" + " -1: Allow use of (almost) all events by all users\n" + ">= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK\n" + ">= 1: Disallow CPU event access by users without CAP_SYS_ADMIN\n" + ">= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN", target->system_wide ? "system-wide " : ""); case ENOENT: return scnprintf(msg, size, "The %s event is not supported.", From 9d5f5d93461e2b6f697054c06ed26bc5fa4d629e Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Sun, 29 May 2016 14:22:32 -0700 Subject: [PATCH 182/607] FROMLIST: security,perf: Allow further restriction of perf_event_open When kernel.perf_event_open is set to 3 (or greater), disallow all access to performance events by users without CAP_SYS_ADMIN. Add a Kconfig symbol CONFIG_SECURITY_PERF_EVENTS_RESTRICT that makes this value the default. This is based on a similar feature in grsecurity (CONFIG_GRKERNSEC_PERF_HARDEN). This version doesn't include making the variable read-only. It also allows enabling further restriction at run-time regardless of whether the default is changed. 
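For reference, the effect on userspace can be demonstrated with a small test program (the software event chosen here is arbitrary); with the sysctl at 3 or higher and without CAP_SYS_ADMIN, the open is expected to fail with EACCES:

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	long fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CPU_CLOCK;

	/* Count CPU clock for the current task on any CPU. */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		printf("perf_event_open: %s\n", strerror(errno));
	else
		printf("perf_event_open succeeded, fd=%ld\n", fd);
	return 0;
}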
https://lkml.org/lkml/2016/1/11/587 Signed-off-by: Ben Hutchings Bug: 29054680 Change-Id: Iff5bff4fc1042e85866df9faa01bce8d04335ab8 --- Documentation/sysctl/kernel.txt | 4 +++- include/linux/perf_event.h | 5 +++++ kernel/events/core.c | 8 ++++++++ security/Kconfig | 9 +++++++++ 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 88a2c8eb4502..5728779df1ab 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -629,12 +629,14 @@ allowed to execute. perf_event_paranoid: Controls use of the performance events system by unprivileged -users (without CAP_SYS_ADMIN). The default value is 1. +users (without CAP_SYS_ADMIN). The default value is 3 if +CONFIG_SECURITY_PERF_EVENTS_RESTRICT is set, or 1 otherwise. -1: Allow use of (almost) all events by all users >=0: Disallow raw tracepoint access by users without CAP_IOC_LOCK >=1: Disallow CPU event access by users without CAP_SYS_ADMIN >=2: Disallow kernel profiling by users without CAP_SYS_ADMIN +>=3: Disallow all event access by users without CAP_SYS_ADMIN ============================================================== diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f9828a48f16a..aa7294092e51 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -989,6 +989,11 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, loff_t *ppos); +static inline bool perf_paranoid_any(void) +{ + return sysctl_perf_event_paranoid > 2; +} + static inline bool perf_paranoid_tracepoint_raw(void) { return sysctl_perf_event_paranoid > -1; diff --git a/kernel/events/core.c b/kernel/events/core.c index cfc227ccfceb..85bc810bece6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -175,8 +175,13 @@ static struct srcu_struct pmus_srcu; * 0 - disallow raw tracepoint access for unpriv * 1 - disallow cpu events for unpriv * 2 - disallow kernel profiling for unpriv + * 3 - disallow all unpriv perf event use */ +#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT +int sysctl_perf_event_paranoid __read_mostly = 3; +#else int sysctl_perf_event_paranoid __read_mostly = 1; +#endif /* Minimum for 512 kiB + 1 user control page */ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ @@ -8265,6 +8270,9 @@ SYSCALL_DEFINE5(perf_event_open, if (flags & ~PERF_FLAG_ALL) return -EINVAL; + if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + err = perf_copy_attr(attr_uptr, &attr); if (err) return err; diff --git a/security/Kconfig b/security/Kconfig index e45237897b43..30a2603e8c85 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -18,6 +18,15 @@ config SECURITY_DMESG_RESTRICT If you are unsure how to answer this question, answer N. +config SECURITY_PERF_EVENTS_RESTRICT + bool "Restrict unprivileged use of performance events" + depends on PERF_EVENTS + help + If you say Y here, the kernel.perf_event_paranoid sysctl + will be set to 3 by default, and no unprivileged use of the + perf_event_open syscall will be permitted unless it is + changed. + config SECURITY bool "Enable different security models" depends on SYSFS From 0ac94b48c0c947accdacf310c068f6bad4796d02 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Wed, 1 Jun 2016 13:44:47 -0700 Subject: [PATCH 183/607] ANDROID: restrict access to perf events Add: CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y to android-base.cfg The kernel.perf_event_paranoid sysctl is set to 3 by default. 
No unprivileged use of the perf_event_open syscall will be permitted unless it is changed. Bug: 29054680 Change-Id: Ie7512259150e146d8e382dc64d40e8faaa438917 --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 304f1d4fd7c4..6db5542a51f4 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -145,6 +145,7 @@ CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y CONFIG_SECURITY_SELINUX=y CONFIG_SETEND_EMULATION=y CONFIG_STAGING=y From 249d2baf9b494381f2ff4f21feeaaa47eaa78e7c Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 3 Jun 2016 14:06:14 -0700 Subject: [PATCH 184/607] ANDROID: dm verity fec: limit error correction recursion If verity tree itself is sufficiently corrupted in addition to data blocks, it's possible for error correction to end up in a deep recursive error correction loop that eventually causes a kernel panic as follows: [ 14.728962] [] verity_fec_decode+0xa8/0x138 [ 14.734691] [] verity_verify_level+0x11c/0x180 [ 14.740681] [] verity_hash_for_block+0x88/0xe0 [ 14.746671] [] fec_decode_rsb+0x318/0x75c [ 14.752226] [] verity_fec_decode+0xa8/0x138 [ 14.757956] [] verity_verify_level+0x11c/0x180 [ 14.763944] [] verity_hash_for_block+0x88/0xe0 This change limits the recursion to a reasonable level during a single I/O operation. Bug: 28943429 Signed-off-by: Sami Tolvanen Change-Id: I0a7ebff331d259c59a5e03c81918cc1613c3a766 (cherry picked from commit f4b9e40597e73942d2286a73463c55f26f61bfa7) --- drivers/md/dm-verity-fec.c | 11 ++++++++++- drivers/md/dm-verity-fec.h | 4 ++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index ad10d6d8ed28..b26809a47ca3 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -442,6 +442,13 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, if (!verity_fec_is_enabled(v)) return -EOPNOTSUPP; + if (fio->level >= DM_VERITY_FEC_MAX_RECURSION) { + DMWARN_LIMIT("%s: FEC: recursion too deep", v->data_dev->name); + return -EIO; + } + + fio->level++; + if (type == DM_VERITY_BLOCK_TYPE_METADATA) block += v->data_blocks; @@ -475,7 +482,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, if (r < 0) { r = fec_decode_rsb(v, io, fio, rsb, offset, true); if (r < 0) - return r; + goto done; } if (dest) @@ -485,6 +492,8 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, r = verity_for_bv_block(v, io, iter, fec_bv_copy); } +done: + fio->level--; return r; } diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 8c4bee052a73..b8e21cef3ad1 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -28,6 +28,9 @@ #define DM_VERITY_FEC_BUF_MAX \ (1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS)) +/* maximum recursion level for verity_fec_decode */ +#define DM_VERITY_FEC_MAX_RECURSION 4 + #define DM_VERITY_OPT_FEC_DEV "use_fec_from_device" #define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks" #define DM_VERITY_OPT_FEC_START "fec_start" @@ -61,6 +64,7 @@ struct dm_verity_fec_io { unsigned nbufs; /* number of buffers allocated */ u8 *output; /* buffer for corrected output */ size_t output_pos; + unsigned level; /* recursion level */ }; #ifdef CONFIG_DM_VERITY_FEC From c4d8e3e8d2aa09f5826f517339baa7632f7873ea Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 3 Jun 2016 14:22:46 
-0700 Subject: [PATCH 185/607] ANDROID: dm verity fec: add missing release from fec_ktype Add a release function to allow destroying the dm-verity device. Bug: 27928374 Signed-off-by: Sami Tolvanen Change-Id: Ic0f7c17e4889c5580d70b52d9a709a37165a5747 (cherry picked from commit 0039ccf47c8f99888f7b71b2a36a68a027fbe357) --- drivers/md/dm-verity-fec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index b26809a47ca3..454535d23a7f 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -689,7 +689,8 @@ static struct attribute *fec_attrs[] = { static struct kobj_type fec_ktype = { .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = fec_attrs + .default_attrs = fec_attrs, + .release = dm_kobject_release }; /* From 8f9576b38193b8eb32d94d17f02baab436c3580a Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 17 Jun 2016 11:22:03 -0700 Subject: [PATCH 186/607] ANDROID: dm verity fec: fix RS block calculation A call to do_div was changed in Linux 4.5 to div64_u64 in verity_fec_decode, which broke RS block calculation due to incompatible semantics. This change fixes the computation. Bug: 21893453 Change-Id: Idb88b901e0209c2cccc9c0796689f780592d58f9 Signed-off-by: Sami Tolvanen (cherry picked from commit 879aac93eebcc2862d71afa9eca3a0c0f51b3b01) --- drivers/md/dm-verity-fec.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 454535d23a7f..a1e8571ce314 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -463,9 +463,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, */ offset = block << v->data_dev_block_bits; - - res = offset; - div64_u64(res, v->fec->rounds << v->data_dev_block_bits); + res = div64_u64(offset, v->fec->rounds << v->data_dev_block_bits); /* * The base RS block we can feed to the interleaver to find out all From d053106b93363b77ef8ca60352069cbdc8fa0f87 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 17 Jun 2016 11:31:17 -0700 Subject: [PATCH 187/607] ANDROID: dm verity fec: initialize recursion level Explicitly initialize recursion level to zero at the beginning of each I/O operation. Bug: 28943429 Change-Id: I00c612be2b8c22dd5afb65a739551df91cb324fc Signed-off-by: Sami Tolvanen (cherry picked from commit 32ffb3a22d7fd269b2961323478ece92c06a8334) --- drivers/md/dm-verity-fec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index a1e8571ce314..1dd667b97530 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -532,6 +532,7 @@ void verity_fec_init_io(struct dm_verity_io *io) memset(fio->bufs, 0, sizeof(fio->bufs)); fio->nbufs = 0; fio->output = NULL; + fio->level = 0; } /* From ab4f14dd052ba453558a8913d8d5547abbfc7b88 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Mon, 7 Mar 2016 11:31:10 +0100 Subject: [PATCH 188/607] UPSTREAM: usbnet: cleanup after bind() in probe() (cherry pick from commit 1666984c8625b3db19a9abc298931d35ab7bc64b) In case bind() works, but a later error forces bailing in probe() in error cases work and a timer may be scheduled. They must be killed. This fixes an error case related to the double free reported in http://www.spinics.net/lists/netdev/msg367669.html and needs to go on top of Linus' fix to cdc-ncm. Signed-off-by: Oliver Neukum Signed-off-by: David S. 
Miller Bug: 28744625 --- drivers/net/usb/usbnet.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 0744bf2ef2d6..c2ea4e5666fb 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1766,6 +1766,13 @@ out3: if (info->unbind) info->unbind (dev, udev); out1: + /* subdrivers must undo all they did in bind() if they + * fail it, but we may fail later and a deferred kevent + * may trigger an error resubmitting itself and, worse, + * schedule a timer. So we kill it all just in case. + */ + cancel_work_sync(&dev->kevent); + del_timer_sync(&dev->delay); free_netdev(net); out: return status; From 872e24621a63b75b3b644f6369b04bfb092264b1 Mon Sep 17 00:00:00 2001 From: Thierry Strudel Date: Tue, 14 Jun 2016 17:46:44 -0700 Subject: [PATCH 189/607] ANDROID: cpu: send KOBJ_ONLINE event when enabling cpus In case some sysfs nodes needs to be labeled with a different label than sysfs then user needs to be notified when a core is brought back online. Signed-off-by: Thierry Strudel Bug: 29359497 Change-Id: I0395c86e01cd49c348fda8f93087d26f88557c91 --- kernel/cpu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/cpu.c b/kernel/cpu.c index 37731292f8a1..9ced7c751648 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -627,6 +627,7 @@ void __weak arch_enable_nonboot_cpus_end(void) void enable_nonboot_cpus(void) { int cpu, error; + struct device *cpu_device; /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); @@ -644,6 +645,12 @@ void enable_nonboot_cpus(void) trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { pr_info("CPU%d is up\n", cpu); + cpu_device = get_cpu_device(cpu); + if (!cpu_device) + pr_err("%s: failed to get cpu%d device\n", + __func__, cpu); + else + kobject_uevent(&cpu_device->kobj, KOBJ_ONLINE); continue; } pr_warn("Error taking CPU%d up: %d\n", cpu, error); From 6da57e2f0beee58a6570719a865f4e1b1f7822c2 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 23 Jun 2016 11:51:39 +0530 Subject: [PATCH 190/607] ANDROID: configs: remove unused configs Remove following configs which no longer exist: CONFIG_IP6_NF_TARGET_REJECT_SKERR CONFIG_IP_NF_TARGET_REJECT_SKERR CONFIG_RESOURCE_COUNTERS CONFIG_TABLET_USB_WACOM Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 3 --- android/configs/android-recommended.cfg | 1 - 2 files changed, 4 deletions(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 6db5542a51f4..bdd99f2becfc 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -37,7 +37,6 @@ CONFIG_IP6_NF_IPTABLES=y CONFIG_IP6_NF_MANGLE=y CONFIG_IP6_NF_RAW=y CONFIG_IP6_NF_TARGET_REJECT=y -CONFIG_IP6_NF_TARGET_REJECT_SKERR=y CONFIG_IPV6=y CONFIG_IPV6_MIP6=y CONFIG_IPV6_MULTIPLE_TABLES=y @@ -64,7 +63,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=y CONFIG_IP_NF_TARGET_NETMAP=y CONFIG_IP_NF_TARGET_REDIRECT=y CONFIG_IP_NF_TARGET_REJECT=y -CONFIG_IP_NF_TARGET_REJECT_SKERR=y CONFIG_NET=y CONFIG_NETDEVICES=y CONFIG_NETFILTER=y @@ -140,7 +138,6 @@ CONFIG_PPP_DEFLATE=y CONFIG_PPP_MPPE=y CONFIG_PREEMPT=y CONFIG_QUOTA=y -CONFIG_RESOURCE_COUNTERS=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y CONFIG_SECURITY=y diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index 35936afdcae4..c3222a77ba24 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -110,7 +110,6 @@ CONFIG_TABLET_USB_AIPTEK=y CONFIG_TABLET_USB_GTCO=y 
CONFIG_TABLET_USB_HANWANG=y CONFIG_TABLET_USB_KBTAB=y -CONFIG_TABLET_USB_WACOM=y CONFIG_TASKSTATS=y CONFIG_TASK_DELAY_ACCT=y CONFIG_TASK_IO_ACCOUNTING=y From b40d9bcad6420669ccf41f69278506125fa68428 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:46:24 -0400 Subject: [PATCH 191/607] UPSTREAM: net: fix infoleak in rtnetlink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit 5f8e44741f9f216e33736ea4ec65ca9ac03036e6) The stack object “map” has a total size of 32 bytes. Its last 4 bytes are padding generated by compiler. These padding bytes are not initialized and sent out via “nla_put”. Signed-off-by: Kangjie Lu Signed-off-by: David S. Miller Bug: 28620102 Change-Id: Ica015c6a90d47e9188b1cd87a280ac6819dd9d09 --- net/core/rtnetlink.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 34ba7a08876d..9c6d15756e7a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1174,14 +1174,16 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) { - struct rtnl_link_ifmap map = { - .mem_start = dev->mem_start, - .mem_end = dev->mem_end, - .base_addr = dev->base_addr, - .irq = dev->irq, - .dma = dev->dma, - .port = dev->if_port, - }; + struct rtnl_link_ifmap map; + + memset(&map, 0, sizeof(map)); + map.mem_start = dev->mem_start; + map.mem_end = dev->mem_end; + map.base_addr = dev->base_addr; + map.irq = dev->irq; + map.dma = dev->dma; + map.port = dev->if_port; + if (nla_put(skb, IFLA_MAP, sizeof(map), &map)) return -EMSGSIZE; From 14dfd8269605b9dc3dcdd10945398d3822f07953 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:44:07 -0400 Subject: [PATCH 192/607] UPSTREAM: ALSA: timer: Fix leak in SNDRV_TIMER_IOCTL_PARAMS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit cec8f96e49d9be372fdb0c3836dcf31ec71e457e) The stack object “tread” has a total size of 32 bytes. Its field “event” and “val” both contain 4 bytes padding. These 8 bytes padding bytes are sent to user without being initialized. Signed-off-by: Kangjie Lu Signed-off-by: Takashi Iwai Bug: 28980557 Change-Id: Ibda2d126f6d72fedf797a98796c3cde7bb03db76 --- sound/core/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index 8b06413e967e..8b6c3605d2e9 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1680,6 +1680,7 @@ static int snd_timer_user_params(struct file *file, if (tu->timeri->flags & SNDRV_TIMER_IFLG_EARLY_EVENT) { if (tu->tread) { struct snd_timer_tread tread; + memset(&tread, 0, sizeof(tread)); tread.event = SNDRV_TIMER_EVENT_EARLY; tread.tstamp.tv_sec = 0; tread.tstamp.tv_nsec = 0; From 29d02a76f5ff4af4eeb5f1ef3b09cd5b48fc2662 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:44:32 -0400 Subject: [PATCH 193/607] UPSTREAM: ALSA: timer: Fix leak in events via snd_timer_user_tinterrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit e4ec8cc8039a7063e24204299b462bd1383184a5) The stack object “r1” has a total size of 32 bytes. Its field “event” and “val” both contain 4 bytes padding. These 8 bytes padding bytes are sent to user without being initialized. 
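The pattern behind this and the neighbouring infoleak fixes is the same: a structure with compiler-inserted padding is filled in field by field on the stack and then copied to userspace as a whole, so the padding bytes carry whatever happened to be on the stack. A hedged userspace sketch of the bug and of the memset() fix applied in these patches (the struct layout is only an example):

#include <string.h>

/* On typical ABIs the compiler inserts padding after 'type' and 'flag'
 * to align the wider members that follow them. */
struct event_rec {
	char type;
	int  value;
	char flag;
	long stamp;
};

/* Leaky pattern: only the named fields are written, so the padding keeps
 * stale stack contents, and copying the whole struct out (copy_to_user()
 * in the kernel, memcpy() here) discloses those bytes. */
void fill_leaky(struct event_rec *r, void *out)
{
	r->type = 1;
	r->value = 42;
	r->flag = 0;
	r->stamp = 0;
	memcpy(out, r, sizeof(*r));	/* padding bytes are indeterminate */
}

/* Fixed pattern, as in the patches above: zero the object first. */
void fill_safe(struct event_rec *r, void *out)
{
	memset(r, 0, sizeof(*r));
	r->type = 1;
	r->value = 42;
	r->flag = 0;
	r->stamp = 0;
	memcpy(out, r, sizeof(*r));	/* every byte now has a defined value */
}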
Signed-off-by: Kangjie Lu Signed-off-by: Takashi Iwai Bug: 28980217 Change-Id: If2bba3c9ffb4e57190583b0bb2524d3b2514b2a3 --- sound/core/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index 8b6c3605d2e9..ded95e6f201f 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1215,6 +1215,7 @@ static void snd_timer_user_tinterrupt(struct snd_timer_instance *timeri, } if ((tu->filter & (1 << SNDRV_TIMER_EVENT_RESOLUTION)) && tu->last_resolution != resolution) { + memset(&r1, 0, sizeof(r1)); r1.event = SNDRV_TIMER_EVENT_RESOLUTION; r1.tstamp = tstamp; r1.val = resolution; From 3d1e8cf7ef81f13d4da5218174d12a40e8bf1116 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:44:20 -0400 Subject: [PATCH 194/607] UPSTREAM: ALSA: timer: Fix leak in events via snd_timer_user_ccallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit 9a47e9cff994f37f7f0dbd9ae23740d0f64f9fe6) The stack object “r1” has a total size of 32 bytes. Its field “event” and “val” both contain 4 bytes padding. These 8 bytes padding bytes are sent to user without being initialized. Signed-off-by: Kangjie Lu Signed-off-by: Takashi Iwai Bug: 28980217 Change-Id: I2e4c27352894b9f1f4c808b8db3ae5f9284faec1 --- sound/core/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index ded95e6f201f..1701703b8d93 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1181,6 +1181,7 @@ static void snd_timer_user_ccallback(struct snd_timer_instance *timeri, tu->tstamp = *tstamp; if ((tu->filter & (1 << event)) == 0 || !tu->tread) return; + memset(&r1, 0, sizeof(r1)); r1.event = event; r1.tstamp = *tstamp; r1.val = resolution; From 659740adf180779d0177a086656880bc7a43cfd1 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:32:16 -0400 Subject: [PATCH 195/607] UPSTREAM: USB: usbfs: fix potential infoleak in devio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit 681fef8380eb818c0b845fca5d2ab1dcbab114ee) The stack object “ci” has a total size of 8 bytes. Its last 3 bytes are padding bytes which are not initialized and leaked to userland via “copy_to_user”. Signed-off-by: Kangjie Lu Signed-off-by: Greg Kroah-Hartman Bug: 28619695 Change-Id: I170754d659d0891c075f85211b5e3970b114f097 --- drivers/usb/core/devio.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 38ae877c46e3..3ffb01ff6549 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -1203,10 +1203,11 @@ static int proc_getdriver(struct usb_dev_state *ps, void __user *arg) static int proc_connectinfo(struct usb_dev_state *ps, void __user *arg) { - struct usbdevfs_connectinfo ci = { - .devnum = ps->dev->devnum, - .slow = ps->dev->speed == USB_SPEED_LOW - }; + struct usbdevfs_connectinfo ci; + + memset(&ci, 0, sizeof(ci)); + ci.devnum = ps->dev->devnum; + ci.slow = ps->dev->speed == USB_SPEED_LOW; if (copy_to_user(arg, &ci, sizeof(ci))) return -EFAULT; From 7dbe59704ab0dd0a07ae2c2b7b590bb2b8ccd6d9 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 23 Jun 2016 15:35:07 +0530 Subject: [PATCH 196/607] ANDROID: base-cfg: enable UID_CPUTIME Enabled UID_CPUTIME and dependent PROFILING config option. UID_CPUTIME (/proc/uid_cputime) interfaces provide amount of time a UID's processes spent executing in user-space and kernel-space. 
It is used by batterystats service. Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index bdd99f2becfc..6496bb3961a2 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -137,6 +137,7 @@ CONFIG_PPP_BSDCOMP=y CONFIG_PPP_DEFLATE=y CONFIG_PPP_MPPE=y CONFIG_PREEMPT=y +CONFIG_PROFILING=y CONFIG_QUOTA=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y @@ -149,6 +150,7 @@ CONFIG_STAGING=y CONFIG_SWP_EMULATION=y CONFIG_SYNC=y CONFIG_TUN=y +CONFIG_UID_CPUTIME=y CONFIG_UNIX=y CONFIG_USB_GADGET=y CONFIG_USB_CONFIGFS=y From ef51a2254d6579560bf7ff8ebbeb9b3848fe10bf Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Thu, 7 Jan 2016 16:46:14 +0100 Subject: [PATCH 197/607] BACKPORT: PM / sleep: Go direct_complete if driver has no callbacks Backport notes: This resolves clk warnings in the designware i2c driver on HiKey seen during suspend/resume. Cherrypicked from: aa8e54b559479d0cb7eb632ba443b8cacd20cd4b If a suitable prepare callback cannot be found for a given device and its driver has no PM callbacks at all, assume that it can go direct to complete when the system goes to sleep. The reason for this is that there's lots of devices in a system that do no PM at all and there's no reason for them to prevent their ancestors to do direct_complete if they can support it. Change-Id: Ia773afb4b266f012336b99fc8cf87453839e078b Signed-off-by: Tomeu Vizoso Reviewed-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki [jstultz: Backported to 4.4] Signed-off-by: John Stultz --- drivers/base/dd.c | 3 +++ drivers/base/power/main.c | 35 +++++++++++++++++++++++++++++++++++ drivers/base/power/power.h | 3 +++ include/linux/pm.h | 1 + 4 files changed, 42 insertions(+) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index a641cf3ccad6..9e425fbf83cb 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -205,6 +205,8 @@ static void driver_bound(struct device *dev) klist_add_tail(&dev->p->knode_driver, &dev->driver->p->klist_devices); + device_pm_check_callbacks(dev); + /* * Make sure the device is no longer in one of the deferred lists and * kick off retrying all pending devices @@ -697,6 +699,7 @@ static void __device_release_driver(struct device *dev) dev->pm_domain->dismiss(dev); klist_remove(&dev->p->knode_driver); + device_pm_check_callbacks(dev); if (dev->bus) blocking_notifier_call_chain(&dev->bus->p->bus_notifier, BUS_NOTIFY_UNBOUND_DRIVER, diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 5ea529165362..77379f3d136a 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -126,6 +126,7 @@ void device_pm_add(struct device *dev) { pr_debug("PM: Adding info for %s:%s\n", dev->bus ? 
dev->bus->name : "No Bus", dev_name(dev)); + device_pm_check_callbacks(dev); mutex_lock(&dpm_list_mtx); if (dev->parent && dev->parent->power.is_prepared) dev_warn(dev, "parent %s should not be sleeping\n", @@ -148,6 +149,7 @@ void device_pm_remove(struct device *dev) mutex_unlock(&dpm_list_mtx); device_wakeup_disable(dev); pm_runtime_remove(dev); + device_pm_check_callbacks(dev); } /** @@ -1574,6 +1576,11 @@ static int device_prepare(struct device *dev, pm_message_t state) dev->power.wakeup_path = device_may_wakeup(dev); + if (dev->power.no_pm_callbacks) { + ret = 1; /* Let device go direct_complete */ + goto unlock; + } + if (dev->pm_domain) { info = "preparing power domain "; callback = dev->pm_domain->ops.prepare; @@ -1596,6 +1603,7 @@ static int device_prepare(struct device *dev, pm_message_t state) if (callback) ret = callback(dev); +unlock: device_unlock(dev); if (ret < 0) { @@ -1724,3 +1732,30 @@ void dpm_for_each_dev(void *data, void (*fn)(struct device *, void *)) device_pm_unlock(); } EXPORT_SYMBOL_GPL(dpm_for_each_dev); + +static bool pm_ops_is_empty(const struct dev_pm_ops *ops) +{ + if (!ops) + return true; + + return !ops->prepare && + !ops->suspend && + !ops->suspend_late && + !ops->suspend_noirq && + !ops->resume_noirq && + !ops->resume_early && + !ops->resume && + !ops->complete; +} + +void device_pm_check_callbacks(struct device *dev) +{ + spin_lock_irq(&dev->power.lock); + dev->power.no_pm_callbacks = + (!dev->bus || pm_ops_is_empty(dev->bus->pm)) && + (!dev->class || pm_ops_is_empty(dev->class->pm)) && + (!dev->type || pm_ops_is_empty(dev->type->pm)) && + (!dev->pm_domain || pm_ops_is_empty(&dev->pm_domain->ops)) && + (!dev->driver || pm_ops_is_empty(dev->driver->pm)); + spin_unlock_irq(&dev->power.lock); +} diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h index 998fa6b23084..297beae64314 100644 --- a/drivers/base/power/power.h +++ b/drivers/base/power/power.h @@ -123,6 +123,7 @@ extern void device_pm_remove(struct device *); extern void device_pm_move_before(struct device *, struct device *); extern void device_pm_move_after(struct device *, struct device *); extern void device_pm_move_last(struct device *); +extern void device_pm_check_callbacks(struct device *dev); #else /* !CONFIG_PM_SLEEP */ @@ -141,6 +142,8 @@ static inline void device_pm_move_after(struct device *deva, struct device *devb) {} static inline void device_pm_move_last(struct device *dev) {} +static inline void device_pm_check_callbacks(struct device *dev) {} + #endif /* !CONFIG_PM_SLEEP */ static inline void device_pm_init(struct device *dev) diff --git a/include/linux/pm.h b/include/linux/pm.h index 528be6787796..6a5d654f4447 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -573,6 +573,7 @@ struct dev_pm_info { struct wakeup_source *wakeup; bool wakeup_path:1; bool syscore:1; + bool no_pm_callbacks:1; /* Owned by the PM core */ #else unsigned int should_wakeup:1; #endif From e639a4e74d6cd2c01d4f2afbbe0b6e6e233b42d2 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 27 Jun 2016 13:33:59 -0700 Subject: [PATCH 198/607] Revert "usb: gadget: prevent change of Host MAC address of 'usb0' interface" This reverts commit 265801537d110eb68d44a2f66015479908f635c0. 
Signed-off-by: Badhri Jagan Sridharan --- drivers/usb/gadget/function/u_ether.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index dd73dfe5dcab..74e9f5b5a45d 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -863,8 +863,6 @@ static int eth_stop(struct net_device *net) /*-------------------------------------------------------------------------*/ -static u8 host_ethaddr[ETH_ALEN]; - static int get_ether_addr(const char *str, u8 *dev_addr) { if (str) { @@ -895,17 +893,6 @@ static int get_ether_addr_str(u8 dev_addr[ETH_ALEN], char *str, int len) return 18; } -static int get_host_ether_addr(u8 *str, u8 *dev_addr) -{ - memcpy(dev_addr, str, ETH_ALEN); - if (is_valid_ether_addr(dev_addr)) - return 0; - - random_ether_addr(dev_addr); - memcpy(str, dev_addr, ETH_ALEN); - return 1; -} - static const struct net_device_ops eth_netdev_ops = { .ndo_open = eth_open, .ndo_stop = eth_stop, @@ -963,11 +950,9 @@ struct eth_dev *gether_setup_name(struct usb_gadget *g, if (get_ether_addr(dev_addr, net->dev_addr)) dev_warn(&g->dev, "using random %s ethernet address\n", "self"); - - if (get_host_ether_addr(host_ethaddr, dev->host_mac)) - dev_warn(&g->dev, "using random %s ethernet address\n", "host"); - else - dev_warn(&g->dev, "using previous %s ethernet address\n", "host"); + if (get_ether_addr(host_addr, dev->host_mac)) + dev_warn(&g->dev, + "using random %s ethernet address\n", "host"); if (ethaddr) memcpy(ethaddr, dev->host_mac, ETH_ALEN); From 074c908972a215f63bb5409f6a317020fb137d5b Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Wed, 22 Jun 2016 16:49:48 +0800 Subject: [PATCH 199/607] netfilter: xt_quota2: make quota2_log work well In upstream commit 7200135bc1e61f1437dc326ae2ef2f310c50b4eb (netfilter: kill ulog targets) http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7200135bc1e6 ipt_ULOG target was removed, meanwhile, the IP_NF_TARGET_ULOG Kconfig and ipt_ULOG.h header file were removed too. This causes we cannot enable QUOTA2_LOG, and netd complains this error: "Unable to open quota socket". So when we reach the quota2 limit, userspace will not be notified with this event. Since IP_NF_TARGET_ULOG was removed, we need not depend on "IP_NF_TARGET_ULOG=n", and for compatibility, add ulog_packet_msg_t related definitions copied from "ipt_ULOG.h". 
Change-Id: I38132efaabf52bea75dfd736ce734a1b9690e87e Reported-by: Samboo Shen Signed-off-by: Liping Zhang --- net/netfilter/Kconfig | 1 - net/netfilter/xt_quota2.c | 21 ++++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 273f26b67653..1959548b1161 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -1354,7 +1354,6 @@ config NETFILTER_XT_MATCH_QUOTA2 config NETFILTER_XT_MATCH_QUOTA2_LOG bool '"quota2" Netfilter LOG support' depends on NETFILTER_XT_MATCH_QUOTA2 - depends on IP_NF_TARGET_ULOG=n # not yes, not module, just no default n help This option allows `quota2' to log ONCE when a quota limit diff --git a/net/netfilter/xt_quota2.c b/net/netfilter/xt_quota2.c index 99592ae56d9b..834594aa0085 100644 --- a/net/netfilter/xt_quota2.c +++ b/net/netfilter/xt_quota2.c @@ -21,8 +21,27 @@ #include #include + #ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG -#include +/* For compatibility, these definitions are copied from the + * deprecated header file */ +#define ULOG_MAC_LEN 80 +#define ULOG_PREFIX_LEN 32 + +/* Format of the ULOG packets passed through netlink */ +typedef struct ulog_packet_msg { + unsigned long mark; + long timestamp_sec; + long timestamp_usec; + unsigned int hook; + char indev_name[IFNAMSIZ]; + char outdev_name[IFNAMSIZ]; + size_t data_len; + char prefix[ULOG_PREFIX_LEN]; + unsigned char mac_len; + unsigned char mac[ULOG_MAC_LEN]; + unsigned char payload[0]; +} ulog_packet_msg_t; #endif /** From 286d25ba46346d5b7125b2155ce92a63a73f4d7b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 17 Mar 2016 14:20:51 -0700 Subject: [PATCH 200/607] BACKPORT: timer: convert timer_slack_ns from unsigned long to u64 This backports da8b44d5a9f8bf26da637b7336508ca534d6b319 from upstream. This patchset introduces a /proc//timerslack_ns interface which would allow controlling processes to be able to set the timerslack value on other processes in order to save power by avoiding wakeups (Something Android currently does via out-of-tree patches). The first patch tries to fix the internal timer_slack_ns usage which was defined as a long, which limits the slack range to ~4 seconds on 32bit systems. It converts it to a u64, which provides the same basically unlimited slack (500 years) on both 32bit and 64bit machines. The second patch introduces the /proc//timerslack_ns interface which allows the full 64bit slack range for a task to be read or set on both 32bit and 64bit machines. With these two patches, on a 32bit machine, after setting the slack on bash to 10 seconds: $ time sleep 1 real 0m10.747s user 0m0.001s sys 0m0.005s The first patch is a little ugly, since I had to chase the slack delta arguments through a number of functions converting them to u64s. Let me know if it makes sense to break that up more or not. Other than that things are fairly straightforward. This patch (of 2): The timer_slack_ns value in the task struct is currently a unsigned long. This means that on 32bit applications, the maximum slack is just over 4 seconds. However, on 64bit machines, its much much larger (~500 years). This disparity could make application development a little (as well as the default_slack) to a u64. This means both 32bit and 64bit systems have the same effective internal slack range. 
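For reference, the prctl interface discussed in the next paragraph can be exercised from userspace like this (an illustrative sketch; the 50 ms figure is arbitrary):

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	unsigned long slack;

	/* Request 50 ms of timer slack for the current task. */
	if (prctl(PR_SET_TIMERSLACK, 50UL * 1000 * 1000, 0, 0, 0))
		perror("PR_SET_TIMERSLACK");

	/* On 32-bit, a slack wider than unsigned long reads back as ULONG_MAX. */
	slack = (unsigned long)prctl(PR_GET_TIMERSLACK, 0, 0, 0, 0);
	printf("timer slack: %lu ns\n", slack);
	return 0;
}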
Now the existing ABI via PR_GET_TIMERSLACK and PR_SET_TIMERSLACK specify the interface as a unsigned long, so we preserve that limitation on 32bit systems, where SET_TIMERSLACK can only set the slack to a unsigned long value, and GET_TIMERSLACK will return ULONG_MAX if the slack is actually larger then what can be stored by an unsigned long. This patch also modifies hrtimer functions which specified the slack delta as a unsigned long. Signed-off-by: John Stultz Cc: Arjan van de Ven Cc: Thomas Gleixner Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Kees Cook Cc: Android Kernel Team Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 2 +- fs/select.c | 8 ++++---- include/linux/freezer.h | 2 +- include/linux/hrtimer.h | 12 +++++++----- include/linux/poll.h | 2 +- include/linux/sched.h | 4 ++-- kernel/sys.c | 5 ++++- kernel/time/hrtimer.c | 8 ++++---- kernel/time/timer.c | 4 ++-- 9 files changed, 26 insertions(+), 21 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4c999ce7e73a..3ab9c68b8bce 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1588,7 +1588,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, { int res = 0, eavail, timed_out = 0; unsigned long flags; - long slack = 0; + u64 slack = 0; wait_queue_t wait; ktime_t expires, *to = NULL; diff --git a/fs/select.c b/fs/select.c index 015547330e88..09e71a00a9b8 100644 --- a/fs/select.c +++ b/fs/select.c @@ -70,9 +70,9 @@ static long __estimate_accuracy(struct timespec *tv) return slack; } -long select_estimate_accuracy(struct timespec *tv) +u64 select_estimate_accuracy(struct timespec *tv) { - unsigned long ret; + u64 ret; struct timespec now; /* @@ -402,7 +402,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) struct poll_wqueues table; poll_table *wait; int retval, i, timed_out = 0; - unsigned long slack = 0; + u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_end = 0; @@ -784,7 +784,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list, poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; int timed_out = 0, count = 0; - unsigned long slack = 0; + u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_end = 0; diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 6b7fd9cf5ea2..dd03e837ebb7 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -231,7 +231,7 @@ static inline long freezable_schedule_timeout_killable_unsafe(long timeout) * call this with locks held. 
*/ static inline int freezable_schedule_hrtimeout_range(ktime_t *expires, - unsigned long delta, const enum hrtimer_mode mode) + u64 delta, const enum hrtimer_mode mode) { int __retval; freezer_do_not_count(); diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 76dd4f0da5ca..90d8be19505b 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -218,7 +218,7 @@ static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time timer->node.expires = ktime_add_safe(time, delta); } -static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta) +static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, u64 delta) { timer->_softexpires = time; timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta)); @@ -355,7 +355,7 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } /* Basic timer operations: */ extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long range_ns, const enum hrtimer_mode mode); + u64 range_ns, const enum hrtimer_mode mode); /** * hrtimer_start - (re)start an hrtimer on the current CPU @@ -376,7 +376,7 @@ extern int hrtimer_try_to_cancel(struct hrtimer *timer); static inline void hrtimer_start_expires(struct hrtimer *timer, enum hrtimer_mode mode) { - unsigned long delta; + u64 delta; ktime_t soft, hard; soft = hrtimer_get_softexpires(timer); hard = hrtimer_get_expires(timer); @@ -449,10 +449,12 @@ extern long hrtimer_nanosleep_restart(struct restart_block *restart_block); extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *tsk); -extern int schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, +extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode); extern int schedule_hrtimeout_range_clock(ktime_t *expires, - unsigned long delta, const enum hrtimer_mode mode, int clock); + u64 delta, + const enum hrtimer_mode mode, + int clock); extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); /* Soft interrupt function to run the hrtimer queues: */ diff --git a/include/linux/poll.h b/include/linux/poll.h index c08386fb3e08..9fb4f40d9a26 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -96,7 +96,7 @@ extern void poll_initwait(struct poll_wqueues *pwq); extern void poll_freewait(struct poll_wqueues *pwq); extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack); -extern long select_estimate_accuracy(struct timespec *tv); +extern u64 select_estimate_accuracy(struct timespec *tv); static inline int poll_schedule(struct poll_wqueues *pwq, int state) diff --git a/include/linux/sched.h b/include/linux/sched.h index fa39434e3fdd..a8012d691abf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1767,8 +1767,8 @@ struct task_struct { * time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. 
*/ - unsigned long timer_slack_ns; - unsigned long default_timer_slack_ns; + u64 timer_slack_ns; + u64 default_timer_slack_ns; #ifdef CONFIG_KASAN unsigned int kasan_depth; diff --git a/kernel/sys.c b/kernel/sys.c index 11333311cf1c..0c00e563b153 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2319,7 +2319,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: - error = current->timer_slack_ns; + if (current->timer_slack_ns > ULONG_MAX) + error = ULONG_MAX; + else + error = current->timer_slack_ns; break; case PR_SET_TIMERSLACK: if (arg2 <= 0) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 435b8850dd80..c98439fdce81 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -963,7 +963,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest * relative (HRTIMER_MODE_REL) */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) + u64 delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; @@ -1529,7 +1529,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; - unsigned long slack; + u64 slack; slack = current->timer_slack_ns; if (dl_task(current) || rt_task(current)) @@ -1705,7 +1705,7 @@ void __init hrtimers_init(void) * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME */ int __sched -schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, +schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, int clock) { struct hrtimer_sleeper t; @@ -1773,7 +1773,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, * * Returns 0 when the timer has expired otherwise -EINTR */ -int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, +int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) { return schedule_hrtimeout_range_clock(expires, delta, mode, diff --git a/kernel/time/timer.c b/kernel/time/timer.c index bbc5d1114583..d1798fa0c743 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1698,10 +1698,10 @@ EXPORT_SYMBOL(msleep_interruptible); static void __sched do_usleep_range(unsigned long min, unsigned long max) { ktime_t kmin; - unsigned long delta; + u64 delta; kmin = ktime_set(0, min * NSEC_PER_USEC); - delta = (max - min) * NSEC_PER_USEC; + delta = (u64)(max - min) * NSEC_PER_USEC; schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); } From eb976ed445e57de1e6469b9f3a1aaf33f862c1dd Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 17 Mar 2016 14:20:54 -0700 Subject: [PATCH 201/607] BACKPORT: proc: add /proc//timerslack_ns interface This backports 5de23d435e88996b1efe0e2cebe242074ce67c9e This patch provides a proc/PID/timerslack_ns interface which exposes a task's timerslack value in nanoseconds and allows it to be changed. This allows power/performance management software to set timer slack for other threads according to its policy for the thread (such as when the thread is designated foreground vs. background activity) If the value written is non-zero, slack is set to that value. Otherwise sets it to the default for the thread. 
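As an aside (not part of this patch), a management daemon that already holds the required permission could drive the new file roughly as follows; the helper name, includes and PID handling are illustrative only:

    #include <stdio.h>
    #include <sys/types.h>

    /* Illustrative sketch: write a slack value (in nanoseconds) to
     * another task's timerslack_ns file; writing 0 restores that
     * task's default slack. */
    static int set_timerslack(pid_t pid, unsigned long long slack_ns)
    {
            char path[64];
            FILE *f;

            snprintf(path, sizeof(path), "/proc/%d/timerslack_ns", (int)pid);
            f = fopen(path, "w");
            if (!f)
                    return -1;
            fprintf(f, "%llu\n", slack_ns);
            return fclose(f);
    }
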
This interface checks that the calling task has permissions to to use PTRACE_MODE_ATTACH_FSCREDS on the target task, so that we can ensure arbitrary apps do not change the timer slack for other apps. Signed-off-by: John Stultz Acked-by: Kees Cook Cc: Arjan van de Ven Cc: Thomas Gleixner Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Android Kernel Team Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 18 ++++++++ fs/proc/base.c | 67 ++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 0bcba2823d9d..1c425191574c 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -43,6 +43,7 @@ Table of Contents 3.7 /proc//task//children - Information about task children 3.8 /proc//fdinfo/ - Information about opened file 3.9 /proc//map_files - Information about memory mapped files + 3.10 /proc//timerslack_ns - Task timerslack value 4 Configuring procfs 4.1 Mount options @@ -1856,6 +1857,23 @@ time one can open(2) mappings from the listings of two processes and comparing their inode numbers to figure out which anonymous memory areas are actually shared. +3.10 /proc//timerslack_ns - Task timerslack value +--------------------------------------------------------- +This file provides the value of the task's timerslack value in nanoseconds. +This value specifies a amount of time that normal timers may be deferred +in order to coalesce timers and avoid unnecessary wakeups. + +This allows a task's interactivity vs power consumption trade off to be +adjusted. + +Writing 0 to the file will set the tasks timerslack to the default value. + +Valid values are from 0 - ULLONG_MAX + +An application setting the value must have PTRACE_MODE_ATTACH_FSCREDS level +permissions on the task specified to change its timerslack_ns value. 
+ + ------------------------------------------------------------------------------ Configuring procfs ------------------------------------------------------------------------------ diff --git a/fs/proc/base.c b/fs/proc/base.c index c21bb4beb0d7..bb51e4ceb15b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2243,6 +2243,72 @@ static const struct file_operations proc_timers_operations = { .release = seq_release_private, }; +static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + u64 slack_ns; + int err; + + err = kstrtoull_from_user(buf, count, 10, &slack_ns); + if (err < 0) + return err; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { + task_lock(p); + if (slack_ns == 0) + p->timer_slack_ns = p->default_timer_slack_ns; + else + p->timer_slack_ns = slack_ns; + task_unlock(p); + } else + count = -EPERM; + + put_task_struct(p); + + return count; +} + +static int timerslack_ns_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + int err = 0; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { + task_lock(p); + seq_printf(m, "%llu\n", p->timer_slack_ns); + task_unlock(p); + } else + err = -EPERM; + + put_task_struct(p); + + return err; +} + +static int timerslack_ns_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, timerslack_ns_show, inode); +} + +static const struct file_operations proc_pid_set_timerslack_ns_operations = { + .open = timerslack_ns_open, + .read = seq_read, + .write = timerslack_ns_write, + .llseek = seq_lseek, + .release = single_release, +}; + static int proc_pident_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { @@ -2820,6 +2886,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_CHECKPOINT_RESTORE REG("timers", S_IRUGO, proc_timers_operations), #endif + REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations), }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) From e6ec611adde326cd28f0692a263b31821f86b071 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 20 Jan 2016 15:00:04 -0800 Subject: [PATCH 202/607] BACKPORT: ptrace: use fsuid, fsgid, effective creds for fs access checks This patch backports 969624b (which backports caaee6234d0 upstream), from the v4.4-stable branch to the common/android-4.4 branch. This patch is needed to provide the PTRACE_MODE_ATTACH_FSCREDS definition which was used by the backported version of proc//timerslack_ns in change-id: Ie5799b9a3402a31f88cd46437dcda4a0e46415a7 commit caaee6234d05a58c5b4d05e7bf766131b810a657 upstream. By checking the effective credentials instead of the real UID / permitted capabilities, ensure that the calling process actually intended to use its credentials. To ensure that all ptrace checks use the correct caller credentials (e.g. in case out-of-tree code or newly added code omits the PTRACE_MODE_*CREDS flag), use two new flags and require one of them to be set. The problem was that when a privileged task had temporarily dropped its privileges, e.g. by calling setreuid(0, user_uid), with the intent to perform following syscalls with the credentials of a user, it still passed ptrace access checks that the user would not be able to pass. 
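A minimal user-space sketch of that pattern (purely illustrative, not code touched by this patch; the function and path are invented) looks like this:

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/types.h>

    /* A root daemon switches its effective uid to act on behalf of a
     * user before touching a user-supplied path. */
    static void dump_for_user(uid_t user_uid, const char *user_path)
    {
            /* The effective uid (and hence the fsuid) becomes user_uid,
             * but the real uid and the permitted capability set stay
             * root's. */
            if (setreuid(0, user_uid) != 0)
                    return;

            /* Before this change the procfs ptrace check compared the
             * *real* credentials, so a path such as "/proc/<pid>/maps"
             * of a root-owned task could still be read here. */
            FILE *f = fopen(user_path, "r");
            if (f)
                    fclose(f);
    }
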
While an attacker should not be able to convince the privileged task to perform a ptrace() syscall, this is a problem because the ptrace access check is reused for things in procfs. In particular, the following somewhat interesting procfs entries only rely on ptrace access checks: /proc/$pid/stat - uses the check for determining whether pointers should be visible, useful for bypassing ASLR /proc/$pid/maps - also useful for bypassing ASLR /proc/$pid/cwd - useful for gaining access to restricted directories that contain files with lax permissions, e.g. in this scenario: lrwxrwxrwx root root /proc/13020/cwd -> /root/foobar drwx------ root root /root drwxr-xr-x root root /root/foobar -rw-r--r-- root root /root/foobar/secret Therefore, on a system where a root-owned mode 6755 binary changes its effective credentials as described and then dumps a user-specified file, this could be used by an attacker to reveal the memory layout of root's processes or reveal the contents of files he is not allowed to access (through /proc/$pid/cwd). [akpm@linux-foundation.org: fix warning] Signed-off-by: Jann Horn Acked-by: Kees Cook Cc: Casey Schaufler Cc: Oleg Nesterov Cc: Ingo Molnar Cc: James Morris Cc: "Serge E. Hallyn" Cc: Andy Shevchenko Cc: Andy Lutomirski Cc: Al Viro Cc: "Eric W. Biederman" Cc: Willy Tarreau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman [jstultz: Cherry-picked for common/android-4.4] Signed-off-by: John Stultz --- fs/proc/array.c | 2 +- fs/proc/base.c | 21 +++++++++++---------- fs/proc/namespaces.c | 4 ++-- include/linux/ptrace.h | 24 +++++++++++++++++++++++- kernel/events/core.c | 2 +- kernel/futex.c | 2 +- kernel/futex_compat.c | 2 +- kernel/kcmp.c | 4 ++-- kernel/ptrace.c | 39 +++++++++++++++++++++++++++++++-------- mm/process_vm_access.c | 2 +- security/commoncap.c | 7 ++++++- 11 files changed, 80 insertions(+), 29 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index d73291f5f0fc..b6c00ce0e29e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -395,7 +395,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, state = *get_task_state(task); vsize = eip = esp = 0; - permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT); + permitted = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS | PTRACE_MODE_NOAUDIT); mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); diff --git a/fs/proc/base.c b/fs/proc/base.c index bb51e4ceb15b..db221e53a1ac 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -403,7 +403,7 @@ static const struct file_operations proc_pid_cmdline_ops = { static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ); + struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (mm && !IS_ERR(mm)) { unsigned int nwords = 0; do { @@ -430,7 +430,8 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, wchan = get_wchan(task); - if (wchan && ptrace_may_access(task, PTRACE_MODE_READ) && !lookup_symbol_name(wchan, symname)) + if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS) + && !lookup_symbol_name(wchan, symname)) seq_printf(m, "%s", symname); else seq_putc(m, '0'); @@ -444,7 +445,7 @@ static int lock_trace(struct task_struct *task) int err = mutex_lock_killable(&task->signal->cred_guard_mutex); if (err) return err; - if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) { + if (!ptrace_may_access(task, 
PTRACE_MODE_ATTACH_FSCREDS)) { mutex_unlock(&task->signal->cred_guard_mutex); return -EPERM; } @@ -697,7 +698,7 @@ static int proc_fd_access_allowed(struct inode *inode) */ task = get_proc_task(inode); if (task) { - allowed = ptrace_may_access(task, PTRACE_MODE_READ); + allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); put_task_struct(task); } return allowed; @@ -732,7 +733,7 @@ static bool has_pid_permissions(struct pid_namespace *pid, return true; if (in_group_p(pid->pid_gid)) return true; - return ptrace_may_access(task, PTRACE_MODE_READ); + return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); } @@ -809,7 +810,7 @@ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) struct mm_struct *mm = ERR_PTR(-ESRCH); if (task) { - mm = mm_access(task, mode); + mm = mm_access(task, mode | PTRACE_MODE_FSCREDS); put_task_struct(task); if (!IS_ERR_OR_NULL(mm)) { @@ -1856,7 +1857,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) if (!task) goto out_notask; - mm = mm_access(task, PTRACE_MODE_READ); + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR_OR_NULL(mm)) goto out; @@ -2007,7 +2008,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, goto out; result = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; result = -ENOENT; @@ -2060,7 +2061,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) goto out; ret = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; ret = 0; @@ -2596,7 +2597,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh if (result) return result; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) { + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { result = -EACCES; goto out_unlock; } diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index f6e8354b8cea..1b0ea4a5d89e 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -42,7 +42,7 @@ static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie) if (!task) return error; - if (ptrace_may_access(task, PTRACE_MODE_READ)) { + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { error = ns_get_path(&ns_path, task, ns_ops); if (!error) nd_jump_link(&ns_path); @@ -63,7 +63,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (!task) return res; - if (ptrace_may_access(task, PTRACE_MODE_READ)) { + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { res = ns_get_name(name, sizeof(name), task, ns_ops); if (res >= 0) res = readlink_copy(buffer, buflen, name); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 061265f92876..504c98a278d4 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -57,7 +57,29 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 #define PTRACE_MODE_ATTACH 0x02 #define PTRACE_MODE_NOAUDIT 0x04 -/* Returns true on success, false on denial. 
*/ +#define PTRACE_MODE_FSCREDS 0x08 +#define PTRACE_MODE_REALCREDS 0x10 + +/* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ +#define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) +#define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) +#define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) +#define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) + +/** + * ptrace_may_access - check whether the caller is permitted to access + * a target task. + * @task: target task + * @mode: selects type of access and caller credentials + * + * Returns true on success, false on denial. + * + * One of the flags PTRACE_MODE_FSCREDS and PTRACE_MODE_REALCREDS must + * be set in @mode to specify whether the access was requested through + * a filesystem syscall (should use effective capabilities and fsuid + * of the caller) or through an explicit syscall such as + * process_vm_writev or ptrace (and should use the real credentials). + */ extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); static inline int ptrace_reparented(struct task_struct *child) diff --git a/kernel/events/core.c b/kernel/events/core.c index 85bc810bece6..2f07e9c34f43 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3439,7 +3439,7 @@ find_lively_task_by_vpid(pid_t vpid) /* Reuse ptrace permission checks for now. */ err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) goto errout; return task; diff --git a/kernel/futex.c b/kernel/futex.c index 684d7549825a..495a1d06915b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2881,7 +2881,7 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, } ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ)) + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->robust_list; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 55c8c9349cfe..4ae3232e7a28 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -155,7 +155,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, } ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ)) + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->compat_robust_list; diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 0aa69ea1d8fd..3a47fa998fe0 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -122,8 +122,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, &task2->signal->cred_guard_mutex); if (ret) goto err; - if (!ptrace_may_access(task1, PTRACE_MODE_READ) || - !ptrace_may_access(task2, PTRACE_MODE_READ)) { + if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) || + !ptrace_may_access(task2, PTRACE_MODE_READ_REALCREDS)) { ret = -EPERM; goto err_unlock; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index b760bae64cf1..3189e51db7e8 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -219,6 +219,14 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; + int dumpable = 0; + kuid_t caller_uid; + kgid_t caller_gid; + + if (!(mode & PTRACE_MODE_FSCREDS) == !(mode & PTRACE_MODE_REALCREDS)) { + WARN(1, "denying ptrace access check without PTRACE_MODE_*CREDS\n"); + return -EPERM; + } /* May we inspect the given task? 
* This check is used both for attaching with ptrace @@ -228,18 +236,33 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) * because setting up the necessary parent/child relationship * or halting the specified task is impossible. */ - int dumpable = 0; + /* Don't let security modules deny introspection */ if (same_thread_group(task, current)) return 0; rcu_read_lock(); + if (mode & PTRACE_MODE_FSCREDS) { + caller_uid = cred->fsuid; + caller_gid = cred->fsgid; + } else { + /* + * Using the euid would make more sense here, but something + * in userland might rely on the old behavior, and this + * shouldn't be a security problem since + * PTRACE_MODE_REALCREDS implies that the caller explicitly + * used a syscall that requests access to another process + * (and not a filesystem syscall to procfs). + */ + caller_uid = cred->uid; + caller_gid = cred->gid; + } tcred = __task_cred(task); - if (uid_eq(cred->uid, tcred->euid) && - uid_eq(cred->uid, tcred->suid) && - uid_eq(cred->uid, tcred->uid) && - gid_eq(cred->gid, tcred->egid) && - gid_eq(cred->gid, tcred->sgid) && - gid_eq(cred->gid, tcred->gid)) + if (uid_eq(caller_uid, tcred->euid) && + uid_eq(caller_uid, tcred->suid) && + uid_eq(caller_uid, tcred->uid) && + gid_eq(caller_gid, tcred->egid) && + gid_eq(caller_gid, tcred->sgid) && + gid_eq(caller_gid, tcred->gid)) goto ok; if (ptrace_has_cap(tcred->user_ns, mode)) goto ok; @@ -306,7 +329,7 @@ static int ptrace_attach(struct task_struct *task, long request, goto out; task_lock(task); - retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); task_unlock(task); if (retval) goto unlock_creds; diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index e88d071648c2..5d453e58ddbf 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -194,7 +194,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, goto free_proc_pages; } - mm = mm_access(task, PTRACE_MODE_ATTACH); + mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS); if (!mm || IS_ERR(mm)) { rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; /* diff --git a/security/commoncap.c b/security/commoncap.c index f035b84b3601..7fa251aea32f 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -148,12 +148,17 @@ int cap_ptrace_access_check(struct task_struct *child, unsigned int mode) { int ret = 0; const struct cred *cred, *child_cred; + const kernel_cap_t *caller_caps; rcu_read_lock(); cred = current_cred(); child_cred = __task_cred(child); + if (mode & PTRACE_MODE_FSCREDS) + caller_caps = &cred->cap_effective; + else + caller_caps = &cred->cap_permitted; if (cred->user_ns == child_cred->user_ns && - cap_issubset(child_cred->cap_permitted, cred->cap_permitted)) + cap_issubset(child_cred->cap_permitted, *caller_caps)) goto out; if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE)) goto out; From 50437c019f146eb138d819d558a3f562419573fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Fri, 8 Jul 2016 13:24:09 -0700 Subject: [PATCH 203/607] UPSTREAM: cdc_ncm: do not call usbnet_link_change from cdc_ncm_bind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit 4d06dd537f95683aba3651098ae288b7cbff8274) usbnet_link_change will call schedule_work and should be avoided if bind is failing. Otherwise we will end up with scheduled work referring to a netdev which has gone away. 
Instead of making the call conditional, we can just defer it to usbnet_probe, using the driver_info flag made for this purpose. Fixes: 8a34b0ae8778 ("usbnet: cdc_ncm: apply usbnet_link_change") Reported-by: Andrey Konovalov Suggested-by: Linus Torvalds Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller Change-Id: Id9a6d02bdd98bf495d26595cf2cc90e480746186 Bug: 28744625 --- drivers/net/usb/cdc_ncm.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index e8a1144c5a8b..93c88a2ce55c 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -941,8 +941,6 @@ EXPORT_SYMBOL_GPL(cdc_ncm_select_altsetting); static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf) { - int ret; - /* MBIM backwards compatible function? */ if (cdc_ncm_select_altsetting(intf) != CDC_NCM_COMM_ALTSETTING_NCM) return -ENODEV; @@ -951,16 +949,7 @@ static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf) * Additionally, generic NCM devices are assumed to accept arbitrarily * placed NDP. */ - ret = cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM, 0); - - /* - * We should get an event when network connection is "connected" or - * "disconnected". Set network connection in "disconnected" state - * (carrier is OFF) during attach, so the IP network stack does not - * start IPv6 negotiation and more. - */ - usbnet_link_change(dev, 0, 0); - return ret; + return cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM, 0); } static void cdc_ncm_align_tail(struct sk_buff *skb, size_t modulus, size_t remainder, size_t max) @@ -1543,7 +1532,8 @@ static void cdc_ncm_status(struct usbnet *dev, struct urb *urb) static const struct driver_info cdc_ncm_info = { .description = "CDC NCM", - .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET, + .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET + | FLAG_LINK_INTR, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, @@ -1556,7 +1546,7 @@ static const struct driver_info cdc_ncm_info = { static const struct driver_info wwan_info = { .description = "Mobile Broadband Network Device", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET - | FLAG_WWAN, + | FLAG_LINK_INTR | FLAG_WWAN, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, @@ -1569,7 +1559,7 @@ static const struct driver_info wwan_info = { static const struct driver_info wwan_noarp_info = { .description = "Mobile Broadband Network Device (NO ARP)", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET - | FLAG_WWAN | FLAG_NOARP, + | FLAG_LINK_INTR | FLAG_WWAN | FLAG_NOARP, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, From 5b2129ec25a33cda9662232afdfa6af7c9bc168a Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 8 Jul 2016 14:15:14 -0700 Subject: [PATCH 204/607] sdcardfs: Truncate packages_gid.list on overflow packages_gid.list was improperly returning the wrong count. Use scnprintf instead, and inform the user that the list was truncated if it is. 
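The difference matters because snprintf() returns the length the output *would* have had, while scnprintf() returns the number of bytes actually stored. A minimal kernel-context sketch (made-up package name and appid, not the sdcardfs code itself):

    #include <linux/kernel.h>

    static void count_example(void)
    {
            char buf[16];
            int n;

            n = snprintf(buf, sizeof(buf), "%s %d\n", "com.example.app", 10023);
            /* n == 22: what the entry would have needed, not what fit,
             * so a running count built this way can exceed PAGE_SIZE. */

            n = scnprintf(buf, sizeof(buf), "%s %d\n", "com.example.app", 10023);
            /* n == 15: the bytes actually placed in buf (NUL excluded),
             * which is what a show() handler must return. */
            (void)n;
    }
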
Bug: 30013843 Change-Id: Ida2b2ef7cd86dd87300bfb4c2cdb6bfe2ee1650d Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/packagelist.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 10f0d6be718b..9c3340528eee 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -335,12 +335,19 @@ static ssize_t packages_attr_show(struct config_item *item, struct hashtable_entry *hash_cur; struct hlist_node *h_t; int i; - int count = 0; - mutex_lock(&pkgl_data_all->hashtable_lock); - hash_for_each_safe(pkgl_data_all->package_to_appid, i, h_t, hash_cur, hlist) - count += snprintf(page + count, PAGE_SIZE - count, "%s %d\n", (char *)hash_cur->key, hash_cur->value); - mutex_unlock(&pkgl_data_all->hashtable_lock); + int count = 0, written = 0; + char errormsg[] = "\n"; + mutex_lock(&pkgl_data_all->hashtable_lock); + hash_for_each_safe(pkgl_data_all->package_to_appid, i, h_t, hash_cur, hlist) { + written = scnprintf(page + count, PAGE_SIZE - sizeof(errormsg) - count, "%s %d\n", (char *)hash_cur->key, hash_cur->value); + if (count + written == PAGE_SIZE - sizeof(errormsg)) { + count += scnprintf(page + count, PAGE_SIZE - count, errormsg); + break; + } + count += written; + } + mutex_unlock(&pkgl_data_all->hashtable_lock); return count; } From 6b72c6ef4442d1e4cbae32839ee69a1b42704d86 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Sun, 12 Jun 2016 17:37:52 -0700 Subject: [PATCH 205/607] android-recommended.cfg: enable fstack-protector-strong If compiler has stack protector support, set CONFIG_CC_STACKPROTECTOR_STRONG. Bug: 28967314 Change-Id: I588c2d544250e9e4b5082b43c237b8f85b7313ca Signed-off-by: Jeff Vander Stoep --- android/configs/android-recommended.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index c3222a77ba24..2f1ef077aa9e 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -11,6 +11,7 @@ CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_CC_STACKPROTECTOR_STRONG=y CONFIG_COMPACTION=y CONFIG_DEBUG_RODATA=y CONFIG_DM_UEVENT=y From 96f61ff98aab275e0ac7dae4a42d3357e3d6e4a2 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 1 Jun 2016 10:28:49 -0700 Subject: [PATCH 206/607] ANDROID: sdcardfs: fix itnull.cocci warnings List_for_each_entry has the property that the first argument is always bound to a real list element, never NULL, so testing dentry is not needed. 
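That follows from the shape of the iterator itself (paraphrased from include/linux/list.h, not an exact copy):

    #define list_for_each_entry(pos, head, member)                        \
            for (pos = list_entry((head)->next, typeof(*pos), member);    \
                 &pos->member != (head);                                  \
                 pos = list_entry(pos->member.next, typeof(*pos), member))

The cursor is always the container_of() of a real list_head; an empty list just means the loop body never runs, so a NULL test on the cursor is dead code.
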
Generated by: scripts/coccinelle/iterators/itnull.cocci Cc: Daniel Rosenberg Signed-off-by: Julia Lawall Signed-off-by: Fengguang Wu Signed-off-by: Guenter Roeck --- fs/sdcardfs/derived_perm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 128b3e56851f..41e0e11b3c35 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -112,7 +112,7 @@ void get_derived_permission(struct dentry *parent, struct dentry *dentry) void get_derive_permissions_recursive(struct dentry *parent) { struct dentry *dentry; list_for_each_entry(dentry, &parent->d_subdirs, d_child) { - if (dentry && dentry->d_inode) { + if (dentry->d_inode) { mutex_lock(&dentry->d_inode->i_mutex); get_derived_permission(parent, dentry); fix_derived_permission(dentry->d_inode); From 9cb6c482ae8032970efe10f6f15d3bbee7795fb4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Mar 2016 18:02:52 +0100 Subject: [PATCH 207/607] UPSTREAM: netfilter: x_tables: fix unconditional helper (cherry pick from commit 54d83fc74aa9ec72794373cb47432c5f7fb1a309) Ben Hawkes says: In the mark_source_chains function (net/ipv4/netfilter/ip_tables.c) it is possible for a user-supplied ipt_entry structure to have a large next_offset field. This field is not bounds checked prior to writing a counter value at the supplied offset. Problem is that mark_source_chains should not have been called -- the rule doesn't have a next entry, so its supposed to return an absolute verdict of either ACCEPT or DROP. However, the function conditional() doesn't work as the name implies. It only checks that the rule is using wildcard address matching. However, an unconditional rule must also not be using any matches (no -m args). The underflow validator only checked the addresses, therefore passing the 'unconditional absolute verdict' test, while mark_source_chains also tested for presence of matches, and thus proceeeded to the next (not-existent) rule. Unify this so that all the callers have same idea of 'unconditional rule'. Reported-by: Ben Hawkes Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Change-Id: I47ec0713ac563ac244200c7b2c54f09a91aceabc Bug: 28940694 --- net/ipv4/netfilter/arp_tables.c | 18 +++++++++--------- net/ipv4/netfilter/ip_tables.c | 23 +++++++++++------------ net/ipv6/netfilter/ip6_tables.c | 23 +++++++++++------------ 3 files changed, 31 insertions(+), 33 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 11dccba474b7..e5697e649ce1 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -359,11 +359,12 @@ unsigned int arpt_do_table(struct sk_buff *skb, } /* All zeroes == unconditional rule. */ -static inline bool unconditional(const struct arpt_arp *arp) +static inline bool unconditional(const struct arpt_entry *e) { static const struct arpt_arp uncond; - return memcmp(arp, &uncond, sizeof(uncond)) == 0; + return e->target_offset == sizeof(struct arpt_entry) && + memcmp(&e->arp, &uncond, sizeof(uncond)) == 0; } /* Figures out from what hook each rule can be called: returns 0 if @@ -402,11 +403,10 @@ static int mark_source_chains(const struct xt_table_info *newinfo, |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS)); /* Unconditional return/END. 
*/ - if ((e->target_offset == sizeof(struct arpt_entry) && + if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < 0 && unconditional(&e->arp)) || - visited) { + t->verdict < 0) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -557,7 +557,7 @@ static bool check_underflow(const struct arpt_entry *e) const struct xt_entry_target *t; unsigned int verdict; - if (!unconditional(&e->arp)) + if (!unconditional(e)) return false; t = arpt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) @@ -598,9 +598,9 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) { - pr_err("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + pr_debug("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); return -EINVAL; } newinfo->underflow[h] = underflows[h]; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index b99affad6ba1..2bf6b3f1a00c 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -168,11 +168,12 @@ get_entry(const void *base, unsigned int offset) /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ -static inline bool unconditional(const struct ipt_ip *ip) +static inline bool unconditional(const struct ipt_entry *e) { static const struct ipt_ip uncond; - return memcmp(ip, &uncond, sizeof(uncond)) == 0; + return e->target_offset == sizeof(struct ipt_entry) && + memcmp(&e->ip, &uncond, sizeof(uncond)) == 0; #undef FWINV } @@ -229,11 +230,10 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, } else if (s == e) { (*rulenum)++; - if (s->target_offset == sizeof(struct ipt_entry) && + if (unconditional(s) && strcmp(t->target.u.kernel.target->name, XT_STANDARD_TARGET) == 0 && - t->verdict < 0 && - unconditional(&s->ip)) { + t->verdict < 0) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? comments[NF_IP_TRACE_COMMENT_POLICY] @@ -476,11 +476,10 @@ mark_source_chains(const struct xt_table_info *newinfo, e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. 
*/ - if ((e->target_offset == sizeof(struct ipt_entry) && + if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < 0 && unconditional(&e->ip)) || - visited) { + t->verdict < 0) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -721,7 +720,7 @@ static bool check_underflow(const struct ipt_entry *e) const struct xt_entry_target *t; unsigned int verdict; - if (!unconditional(&e->ip)) + if (!unconditional(e)) return false; t = ipt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) @@ -763,9 +762,9 @@ check_entry_size_and_hooks(struct ipt_entry *e, newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) { - pr_err("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + pr_debug("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); return -EINVAL; } newinfo->underflow[h] = underflows[h]; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 99425cf2819b..37e945cfa5da 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -198,11 +198,12 @@ get_entry(const void *base, unsigned int offset) /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ -static inline bool unconditional(const struct ip6t_ip6 *ipv6) +static inline bool unconditional(const struct ip6t_entry *e) { static const struct ip6t_ip6 uncond; - return memcmp(ipv6, &uncond, sizeof(uncond)) == 0; + return e->target_offset == sizeof(struct ip6t_entry) && + memcmp(&e->ipv6, &uncond, sizeof(uncond)) == 0; } static inline const struct xt_entry_target * @@ -258,11 +259,10 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e, } else if (s == e) { (*rulenum)++; - if (s->target_offset == sizeof(struct ip6t_entry) && + if (unconditional(s) && strcmp(t->target.u.kernel.target->name, XT_STANDARD_TARGET) == 0 && - t->verdict < 0 && - unconditional(&s->ipv6)) { + t->verdict < 0) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? comments[NF_IP6_TRACE_COMMENT_POLICY] @@ -488,11 +488,10 @@ mark_source_chains(const struct xt_table_info *newinfo, e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. 
*/ - if ((e->target_offset == sizeof(struct ip6t_entry) && + if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < 0 && - unconditional(&e->ipv6)) || visited) { + t->verdict < 0) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -733,7 +732,7 @@ static bool check_underflow(const struct ip6t_entry *e) const struct xt_entry_target *t; unsigned int verdict; - if (!unconditional(&e->ipv6)) + if (!unconditional(e)) return false; t = ip6t_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) @@ -775,9 +774,9 @@ check_entry_size_and_hooks(struct ip6t_entry *e, newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) { - pr_err("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + pr_debug("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); return -EINVAL; } newinfo->underflow[h] = underflows[h]; From f8a27f34070e29bdbbe88f5330f77d8ac5d1c2fb Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 23 Mar 2016 16:38:55 +0100 Subject: [PATCH 208/607] UPSTREAM: ppp: take reference on channels netns (cherry pick from commit 1f461dcdd296eecedaffffc6bae2bfa90bd7eb89) Let channels hold a reference on their network namespace. Some channel types, like ppp_async and ppp_synctty, can have their userspace controller running in a different namespace. Therefore they can't rely on them to preclude their netns from being removed from under them. ================================================================== BUG: KASAN: use-after-free in ppp_unregister_channel+0x372/0x3a0 at addr ffff880064e217e0 Read of size 8 by task syz-executor/11581 ============================================================================= BUG net_namespace (Not tainted): kasan: bad access detected ----------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Allocated in copy_net_ns+0x6b/0x1a0 age=92569 cpu=3 pid=6906 [< none >] ___slab_alloc+0x4c7/0x500 kernel/mm/slub.c:2440 [< none >] __slab_alloc+0x4c/0x90 kernel/mm/slub.c:2469 [< inline >] slab_alloc_node kernel/mm/slub.c:2532 [< inline >] slab_alloc kernel/mm/slub.c:2574 [< none >] kmem_cache_alloc+0x23a/0x2b0 kernel/mm/slub.c:2579 [< inline >] kmem_cache_zalloc kernel/include/linux/slab.h:597 [< inline >] net_alloc kernel/net/core/net_namespace.c:325 [< none >] copy_net_ns+0x6b/0x1a0 kernel/net/core/net_namespace.c:360 [< none >] create_new_namespaces+0x2f6/0x610 kernel/kernel/nsproxy.c:95 [< none >] copy_namespaces+0x297/0x320 kernel/kernel/nsproxy.c:150 [< none >] copy_process.part.35+0x1bf4/0x5760 kernel/kernel/fork.c:1451 [< inline >] copy_process kernel/kernel/fork.c:1274 [< none >] _do_fork+0x1bc/0xcb0 kernel/kernel/fork.c:1723 [< inline >] SYSC_clone kernel/kernel/fork.c:1832 [< none >] SyS_clone+0x37/0x50 kernel/kernel/fork.c:1826 [< none >] entry_SYSCALL_64_fastpath+0x16/0x7a kernel/arch/x86/entry/entry_64.S:185 INFO: Freed in net_drop_ns+0x67/0x80 age=575 cpu=2 pid=2631 [< none >] __slab_free+0x1fc/0x320 kernel/mm/slub.c:2650 [< inline >] slab_free kernel/mm/slub.c:2805 [< none >] kmem_cache_free+0x2a0/0x330 kernel/mm/slub.c:2814 [< inline >] net_free kernel/net/core/net_namespace.c:341 [< none >] net_drop_ns+0x67/0x80 kernel/net/core/net_namespace.c:348 [< none >] cleanup_net+0x4e5/0x600 kernel/net/core/net_namespace.c:448 [< none >] process_one_work+0x794/0x1440 
kernel/kernel/workqueue.c:2036 [< none >] worker_thread+0xdb/0xfc0 kernel/kernel/workqueue.c:2170 [< none >] kthread+0x23f/0x2d0 kernel/drivers/block/aoe/aoecmd.c:1303 [< none >] ret_from_fork+0x3f/0x70 kernel/arch/x86/entry/entry_64.S:468 INFO: Slab 0xffffea0001938800 objects=3 used=0 fp=0xffff880064e20000 flags=0x5fffc0000004080 INFO: Object 0xffff880064e20000 @offset=0 fp=0xffff880064e24200 CPU: 1 PID: 11581 Comm: syz-executor Tainted: G B 4.4.0+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014 00000000ffffffff ffff8800662c7790 ffffffff8292049d ffff88003e36a300 ffff880064e20000 ffff880064e20000 ffff8800662c77c0 ffffffff816f2054 ffff88003e36a300 ffffea0001938800 ffff880064e20000 0000000000000000 Call Trace: [< inline >] __dump_stack kernel/lib/dump_stack.c:15 [] dump_stack+0x6f/0xa2 kernel/lib/dump_stack.c:50 [] print_trailer+0xf4/0x150 kernel/mm/slub.c:654 [] object_err+0x2f/0x40 kernel/mm/slub.c:661 [< inline >] print_address_description kernel/mm/kasan/report.c:138 [] kasan_report_error+0x215/0x530 kernel/mm/kasan/report.c:236 [< inline >] kasan_report kernel/mm/kasan/report.c:259 [] __asan_report_load8_noabort+0x3e/0x40 kernel/mm/kasan/report.c:280 [< inline >] ? ppp_pernet kernel/include/linux/compiler.h:218 [] ? ppp_unregister_channel+0x372/0x3a0 kernel/drivers/net/ppp/ppp_generic.c:2392 [< inline >] ppp_pernet kernel/include/linux/compiler.h:218 [] ppp_unregister_channel+0x372/0x3a0 kernel/drivers/net/ppp/ppp_generic.c:2392 [< inline >] ? ppp_pernet kernel/drivers/net/ppp/ppp_generic.c:293 [] ? ppp_unregister_channel+0xe6/0x3a0 kernel/drivers/net/ppp/ppp_generic.c:2392 [] ppp_asynctty_close+0xa3/0x130 kernel/drivers/net/ppp/ppp_async.c:241 [] ? async_lcp_peek+0x5b0/0x5b0 kernel/drivers/net/ppp/ppp_async.c:1000 [] tty_ldisc_close.isra.1+0x99/0xe0 kernel/drivers/tty/tty_ldisc.c:478 [] tty_ldisc_kill+0x40/0x170 kernel/drivers/tty/tty_ldisc.c:744 [] tty_ldisc_release+0x1b3/0x260 kernel/drivers/tty/tty_ldisc.c:772 [] tty_release+0xac1/0x13e0 kernel/drivers/tty/tty_io.c:1901 [] ? release_tty+0x320/0x320 kernel/drivers/tty/tty_io.c:1688 [] __fput+0x236/0x780 kernel/fs/file_table.c:208 [] ____fput+0x15/0x20 kernel/fs/file_table.c:244 [] task_work_run+0x16b/0x200 kernel/kernel/task_work.c:115 [< inline >] exit_task_work kernel/include/linux/task_work.h:21 [] do_exit+0x8b5/0x2c60 kernel/kernel/exit.c:750 [] ? debug_check_no_locks_freed+0x290/0x290 kernel/kernel/locking/lockdep.c:4123 [] ? mm_update_next_owner+0x6f0/0x6f0 kernel/kernel/exit.c:357 [] ? __dequeue_signal+0x136/0x470 kernel/kernel/signal.c:550 [] ? recalc_sigpending_tsk+0x13b/0x180 kernel/kernel/signal.c:145 [] do_group_exit+0x108/0x330 kernel/kernel/exit.c:880 [] get_signal+0x5e4/0x14f0 kernel/kernel/signal.c:2307 [< inline >] ? kretprobe_table_lock kernel/kernel/kprobes.c:1113 [] ? kprobe_flush_task+0xb5/0x450 kernel/kernel/kprobes.c:1158 [] do_signal+0x83/0x1c90 kernel/arch/x86/kernel/signal.c:712 [] ? recycle_rp_inst+0x310/0x310 kernel/include/linux/list.h:655 [] ? setup_sigcontext+0x780/0x780 kernel/arch/x86/kernel/signal.c:165 [] ? finish_task_switch+0x424/0x5f0 kernel/kernel/sched/core.c:2692 [< inline >] ? finish_lock_switch kernel/kernel/sched/sched.h:1099 [] ? finish_task_switch+0x120/0x5f0 kernel/kernel/sched/core.c:2678 [< inline >] ? context_switch kernel/kernel/sched/core.c:2807 [] ? 
__schedule+0x919/0x1bd0 kernel/kernel/sched/core.c:3283 [] exit_to_usermode_loop+0xf1/0x1a0 kernel/arch/x86/entry/common.c:247 [< inline >] prepare_exit_to_usermode kernel/arch/x86/entry/common.c:282 [] syscall_return_slowpath+0x19f/0x210 kernel/arch/x86/entry/common.c:344 [] int_ret_from_sys_call+0x25/0x9f kernel/arch/x86/entry/entry_64.S:281 Memory state around the buggy address: ffff880064e21680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff880064e21700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff880064e21780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff880064e21800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff880064e21880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Fixes: 273ec51dd7ce ("net: ppp_generic - introduce net-namespace functionality v2") Reported-by: Baozeng Ding Signed-off-by: Guillaume Nault Reviewed-by: Cyrill Gorcunov Signed-off-by: David S. Miller Change-Id: Iee0015eca5bd181954bb4896a3720f7549c5ed0b Bug: 28979703 --- drivers/net/ppp/ppp_generic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 9a863c6a6a33..afe6fddd2baa 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -2290,7 +2290,7 @@ int ppp_register_net_channel(struct net *net, struct ppp_channel *chan) pch->ppp = NULL; pch->chan = chan; - pch->chan_net = net; + pch->chan_net = get_net(net); chan->ppp = pch; init_ppp_file(&pch->file, CHANNEL); pch->file.hdrlen = chan->hdrlen; @@ -2387,6 +2387,8 @@ ppp_unregister_channel(struct ppp_channel *chan) spin_lock_bh(&pn->all_channels_lock); list_del(&pch->list); spin_unlock_bh(&pn->all_channels_lock); + put_net(pch->chan_net); + pch->chan_net = NULL; pch->file.dead = 1; wake_up_interruptible(&pch->file.rwait); From 1d2d5ceaf5ae9d41656a084d394a1e38f1a80d3c Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 23 Feb 2016 11:03:12 +0000 Subject: [PATCH 209/607] UPSTREAM: KEYS: Fix ASN.1 indefinite length object parsing (cherry pick from commit 23c8a812dc3c621009e4f0e5342aa4e2ede1ceaa) This fixes CVE-2016-0758. In the ASN.1 decoder, when the length field of an ASN.1 value is extracted, it isn't validated against the remaining amount of data before being added to the cursor. With a sufficiently large size indicated, the check: datalen - dp < 2 may then fail due to integer overflow. Fix this by checking the length indicated against the amount of remaining data in both places a definite length is determined. Whilst we're at it, make the following changes: (1) Check the maximum size of extended length does not exceed the capacity of the variable it's being stored in (len) rather than the type that variable is assumed to be (size_t). (2) Compare the EOC tag to the symbolic constant ASN1_EOC rather than the integer 0. (3) To reduce confusion, move the initialisation of len outside of: for (len = 0; n > 0; n--) { since it doesn't have anything to do with the loop counter n. 
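To make the wrap-around concrete, a toy example with invented numbers (the variable names mirror the decoder, but nothing below is decoder code):

    #include <stddef.h>

    static int guard_fires(void)
    {
            size_t datalen = 64, dp = 10;
            size_t len = 0x2000;            /* attacker-chosen length */

            dp += len;                      /* dp is now far past datalen */

            /* 64 - 8202 wraps to a huge size_t, so the quoted
             * "datalen - dp < 2" test is false and decoding carries on
             * past the end of the buffer. */
            return datalen - dp < 2;        /* evaluates to 0 */
    }
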
Signed-off-by: David Howells Reviewed-by: Mimi Zohar Acked-by: David Woodhouse Acked-by: Peter Jones Change-Id: If760bc3b8ab0e59fefc24fa687514324348fb8e8 Bug: 29814470 --- lib/asn1_decoder.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c index 2b3f46c049d4..554522934c44 100644 --- a/lib/asn1_decoder.c +++ b/lib/asn1_decoder.c @@ -74,7 +74,7 @@ next_tag: /* Extract a tag from the data */ tag = data[dp++]; - if (tag == 0) { + if (tag == ASN1_EOC) { /* It appears to be an EOC. */ if (data[dp++] != 0) goto invalid_eoc; @@ -96,10 +96,8 @@ next_tag: /* Extract the length */ len = data[dp++]; - if (len <= 0x7f) { - dp += len; - goto next_tag; - } + if (len <= 0x7f) + goto check_length; if (unlikely(len == ASN1_INDEFINITE_LENGTH)) { /* Indefinite length */ @@ -110,14 +108,18 @@ next_tag: } n = len - 0x80; - if (unlikely(n > sizeof(size_t) - 1)) + if (unlikely(n > sizeof(len) - 1)) goto length_too_long; if (unlikely(n > datalen - dp)) goto data_overrun_error; - for (len = 0; n > 0; n--) { + len = 0; + for (; n > 0; n--) { len <<= 8; len |= data[dp++]; } +check_length: + if (len > datalen - dp) + goto data_overrun_error; dp += len; goto next_tag; From 2e2e345376a1e0320956512d9ccfa8d9dac60d27 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 16 Jun 2016 15:48:57 +0100 Subject: [PATCH 210/607] UPSTREAM: KEYS: potential uninitialized variable (cherry picked from commit 38327424b40bcebe2de92d07312c89360ac9229a) If __key_link_begin() failed then "edit" would be uninitialized. I've added a check to fix that. This allows a random user to crash the kernel, though it's quite difficult to achieve. There are three ways it can be done as the user would have to cause an error to occur in __key_link(): (1) Cause the kernel to run out of memory. In practice, this is difficult to achieve without ENOMEM cropping up elsewhere and aborting the attempt. (2) Revoke the destination keyring between the keyring ID being looked up and it being tested for revocation. In practice, this is difficult to time correctly because the KEYCTL_REJECT function can only be used from the request-key upcall process. Further, users can only make use of what's in /sbin/request-key.conf, though this does including a rejection debugging test - which means that the destination keyring has to be the caller's session keyring in practice. (3) Have just enough key quota available to create a key, a new session keyring for the upcall and a link in the session keyring, but not then sufficient quota to create a link in the nominated destination keyring so that it fails with EDQUOT. The bug can be triggered using option (3) above using something like the following: echo 80 >/proc/sys/kernel/keys/root_maxbytes keyctl request2 user debug:fred negate @t The above sets the quota to something much lower (80) to make the bug easier to trigger, but this is dependent on the system. Note also that the name of the keyring created contains a random number that may be between 1 and 10 characters in size, so may throw the test off by changing the amount of quota used. Assuming the failure occurs, something like the following will be seen: kfree_debugcheck: out of range ptr 6b6b6b6b6b6b6b68h ------------[ cut here ]------------ kernel BUG at ../mm/slab.c:2821! ... 
RIP: 0010:[] kfree_debugcheck+0x20/0x25 RSP: 0018:ffff8804014a7de8 EFLAGS: 00010092 RAX: 0000000000000034 RBX: 6b6b6b6b6b6b6b68 RCX: 0000000000000000 RDX: 0000000000040001 RSI: 00000000000000f6 RDI: 0000000000000300 RBP: ffff8804014a7df0 R08: 0000000000000001 R09: 0000000000000000 R10: ffff8804014a7e68 R11: 0000000000000054 R12: 0000000000000202 R13: ffffffff81318a66 R14: 0000000000000000 R15: 0000000000000001 ... Call Trace: kfree+0xde/0x1bc assoc_array_cancel_edit+0x1f/0x36 __key_link_end+0x55/0x63 key_reject_and_link+0x124/0x155 keyctl_reject_key+0xb6/0xe0 keyctl_negate_key+0x10/0x12 SyS_keyctl+0x9f/0xe7 do_syscall_64+0x63/0x13a entry_SYSCALL64_slow_path+0x25/0x25 Fixes: f70e2e06196a ('KEYS: Do preallocation for __key_link()') Signed-off-by: Dan Carpenter Signed-off-by: David Howells cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds Change-Id: Ia9616cce142a616beea0ef20bde49129939d2d2d Bug: 29823941 --- security/keys/key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/keys/key.c b/security/keys/key.c index ab7997ded725..534808915371 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -578,7 +578,7 @@ int key_reject_and_link(struct key *key, mutex_unlock(&key_construction_mutex); - if (keyring) + if (keyring && link_ret == 0) __key_link_end(keyring, &key->index_key, edit); /* wake up anyone waiting for a key to be constructed */ From 44bd40ae321b27fe6b5e107d207a79fe1cbaa507 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Mar 2016 18:02:49 +0100 Subject: [PATCH 211/607] UPSTREAM: netfilter: x_tables: validate e->target_offset early (cherry pick from commit bdf533de6968e9686df777dc178486f600c6e617) We should check that e->target_offset is sane before mark_source_chains gets called since it will fetch the target entry for loop detection. 
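The target is located purely by offset arithmetic, roughly as follows (paraphrasing the ipt_get_target() helper):

    static struct xt_entry_target *ipt_get_target(struct ipt_entry *e)
    {
            return (void *)e + e->target_offset;
    }

so an unvalidated target_offset lets mark_source_chains() read a verdict from memory outside the entry, which is why this patch runs check_entry(), and with it the target_offset bound check, before any traversal.
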
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Change-Id: Ic2dbc31c9525d698e94d4d8875886acf3524abbd Bug: 29637687 --- net/ipv4/netfilter/arp_tables.c | 17 ++++++++--------- net/ipv4/netfilter/ip_tables.c | 17 ++++++++--------- net/ipv6/netfilter/ip6_tables.c | 17 ++++++++--------- 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index e5697e649ce1..449e2b56b3b2 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -474,14 +474,12 @@ next: return 1; } -static inline int check_entry(const struct arpt_entry *e, const char *name) +static inline int check_entry(const struct arpt_entry *e) { const struct xt_entry_target *t; - if (!arp_checkentry(&e->arp)) { - duprintf("arp_tables: arp check failed %p %s.\n", e, name); + if (!arp_checkentry(&e->arp)) return -EINVAL; - } if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) return -EINVAL; @@ -522,10 +520,6 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) struct xt_target *target; int ret; - ret = check_entry(e, name); - if (ret) - return ret; - e->counters.pcnt = xt_percpu_counter_alloc(); if (IS_ERR_VALUE(e->counters.pcnt)) return -ENOMEM; @@ -576,6 +570,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, unsigned int valid_hooks) { unsigned int h; + int err; if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { @@ -590,6 +585,10 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, return -EINVAL; } + err = check_entry(e); + if (err) + return err; + /* Check hooks & underflows */ for (h = 0; h < NF_ARP_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) @@ -1246,7 +1245,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, } /* For purposes of check_entry casting the compat entry is fine */ - ret = check_entry((struct arpt_entry *)e, name); + ret = check_entry((struct arpt_entry *)e); if (ret) return ret; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 2bf6b3f1a00c..c3ab1ab12ee6 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -568,14 +568,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net) } static int -check_entry(const struct ipt_entry *e, const char *name) +check_entry(const struct ipt_entry *e) { const struct xt_entry_target *t; - if (!ip_checkentry(&e->ip)) { - duprintf("ip check failed %p %s.\n", e, name); + if (!ip_checkentry(&e->ip)) return -EINVAL; - } if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) @@ -665,10 +663,6 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; - ret = check_entry(e, name); - if (ret) - return ret; - e->counters.pcnt = xt_percpu_counter_alloc(); if (IS_ERR_VALUE(e->counters.pcnt)) return -ENOMEM; @@ -740,6 +734,7 @@ check_entry_size_and_hooks(struct ipt_entry *e, unsigned int valid_hooks) { unsigned int h; + int err; if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { @@ -754,6 +749,10 @@ check_entry_size_and_hooks(struct ipt_entry *e, return -EINVAL; } + err = check_entry(e); + if (err) + return err; + /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) @@ -1505,7 +1504,7 @@ 
check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, } /* For purposes of check_entry casting the compat entry is fine */ - ret = check_entry((struct ipt_entry *)e, name); + ret = check_entry((struct ipt_entry *)e); if (ret) return ret; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 37e945cfa5da..f532cb6d9c3d 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -580,14 +580,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net) } static int -check_entry(const struct ip6t_entry *e, const char *name) +check_entry(const struct ip6t_entry *e) { const struct xt_entry_target *t; - if (!ip6_checkentry(&e->ipv6)) { - duprintf("ip_tables: ip check failed %p %s.\n", e, name); + if (!ip6_checkentry(&e->ipv6)) return -EINVAL; - } if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) @@ -678,10 +676,6 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; - ret = check_entry(e, name); - if (ret) - return ret; - e->counters.pcnt = xt_percpu_counter_alloc(); if (IS_ERR_VALUE(e->counters.pcnt)) return -ENOMEM; @@ -752,6 +746,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e, unsigned int valid_hooks) { unsigned int h; + int err; if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct ip6t_entry) >= limit) { @@ -766,6 +761,10 @@ check_entry_size_and_hooks(struct ip6t_entry *e, return -EINVAL; } + err = check_entry(e); + if (err) + return err; + /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) @@ -1517,7 +1516,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, } /* For purposes of check_entry casting the compat entry is fine */ - ret = check_entry((struct ip6t_entry *)e, name); + ret = check_entry((struct ip6t_entry *)e); if (ret) return ret; From 160202a2e8a37baf7a1dcd3b3748a19ece5afe3e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Mar 2016 18:02:50 +0100 Subject: [PATCH 212/607] UPSTREAM: netfilter: x_tables: make sure e->next_offset covers remaining blob size (cherry pick from commit 6e94e0cfb0887e4013b3b930fa6ab1fe6bb6ba91) Otherwise this function may read data beyond the ruleset blob. 
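That is because the next rule is found only by jumping next_offset bytes forward from the current one; a rough sketch of the walk (illustrative only, not the kernel code):

    #include <linux/netfilter_ipv4/ip_tables.h>

    static void walk_blob(struct ipt_entry *e, const unsigned char *limit)
    {
            while ((unsigned char *)e < limit) {
                    /* inspect e, e->target_offset, matches, ... */
                    e = (void *)e + e->next_offset;   /* user-controlled */
            }
    }

With the new "(unsigned char *)e + e->next_offset > limit" test, an entry whose next_offset points past the blob is rejected before any later pass dereferences it.
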
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Change-Id: I9d19ecf3e00a2d52817b35b9042623927895c005 Bug: 29637687 --- net/ipv4/netfilter/arp_tables.c | 6 ++++-- net/ipv4/netfilter/ip_tables.c | 6 ++++-- net/ipv6/netfilter/ip6_tables.c | 6 ++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 449e2b56b3b2..36a30fab8625 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -573,7 +573,8 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, int err; if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || - (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { + (unsigned char *)e + sizeof(struct arpt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -1232,7 +1233,8 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 || - (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) { + (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index c3ab1ab12ee6..99d46b0a4ead 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -737,7 +737,8 @@ check_entry_size_and_hooks(struct ipt_entry *e, int err; if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || - (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { + (unsigned char *)e + sizeof(struct ipt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -1491,7 +1492,8 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 || - (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) { + (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index f532cb6d9c3d..6198807e06f4 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -749,7 +749,8 @@ check_entry_size_and_hooks(struct ip6t_entry *e, int err; if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || - (unsigned char *)e + sizeof(struct ip6t_entry) >= limit) { + (unsigned char *)e + sizeof(struct ip6t_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -1503,7 +1504,8 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 || - (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit) { + (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } From 71b1886054473597c46a8a25c95477b5262971b5 Mon Sep 17 00:00:00 2001 From: Andrew Bresticker Date: Fri, 23 Oct 2015 15:13:42 -0700 Subject: [PATCH 213/607] CHROMIUM: android: 
binder: Fix potential scheduling-while-atomic (cherry picked from commit 166b45af97359159f9585a836c9849e725e31fd6) Commit f1e7f0a724f6 ("android: binder: Disable preemption while holding the global binder lock.") re-enabled preemption around most of the sites where calls to potentially sleeping functions were made, but missed __alloc_fd(), which can sleep if the fdtable needs to be resized. Re-enable preemption around __alloc_fd() as well as __fd_install() which can now sleep in upstream kernels as of commit 8a81252b774b ("fs/file.c: don't acquire files->file_lock in fd_install()"). BUG=chrome-os-partner:44012 TEST=Build and boot on Smaug. Change-Id: I9819c4b95876f697e75b1b84810b6c520d9c33ec Signed-off-by: Andrew Bresticker Reviewed-on: https://chromium-review.googlesource.com/308582 Reviewed-by: Stephen Barber Reviewed-by: Riley Andrews Bug: 30141999 --- drivers/android/binder.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index f0ce9959d14d..b66fe6545f32 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -379,6 +379,7 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) struct files_struct *files = proc->files; unsigned long rlim_cur; unsigned long irqs; + int ret; if (files == NULL) return -ESRCH; @@ -389,7 +390,11 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE); unlock_task_sighand(proc->tsk, &irqs); - return __alloc_fd(files, 0, rlim_cur, flags); + preempt_enable_no_resched(); + ret = __alloc_fd(files, 0, rlim_cur, flags); + preempt_disable(); + + return ret; } /* @@ -398,8 +403,11 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) static void task_fd_install( struct binder_proc *proc, unsigned int fd, struct file *file) { - if (proc->files) + if (proc->files) { + preempt_enable_no_resched(); __fd_install(proc->files, fd, file); + preempt_disable(); + } } /* From 13c17d0179f9a055062f37e91a6f6cf00a249ebd Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Mon, 18 Jul 2016 22:21:12 +0000 Subject: [PATCH 214/607] Revert "CHROMIUM: android: binder: Fix potential scheduling-while-atomic" This reverts commit 71b1886054473597c46a8a25c95477b5262971b5. 
Change-Id: I9ded0ff43535c1367c2cf79dfeec20d4b5f0357a --- drivers/android/binder.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index b66fe6545f32..f0ce9959d14d 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -379,7 +379,6 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) struct files_struct *files = proc->files; unsigned long rlim_cur; unsigned long irqs; - int ret; if (files == NULL) return -ESRCH; @@ -390,11 +389,7 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE); unlock_task_sighand(proc->tsk, &irqs); - preempt_enable_no_resched(); - ret = __alloc_fd(files, 0, rlim_cur, flags); - preempt_disable(); - - return ret; + return __alloc_fd(files, 0, rlim_cur, flags); } /* @@ -403,11 +398,8 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) static void task_fd_install( struct binder_proc *proc, unsigned int fd, struct file *file) { - if (proc->files) { - preempt_enable_no_resched(); + if (proc->files) __fd_install(proc->files, fd, file); - preempt_disable(); - } } /* From dd802a9c976c9e098d236c551a0c3cb2c6f5271d Mon Sep 17 00:00:00 2001 From: Riley Andrews Date: Fri, 5 Jun 2015 18:59:29 -0700 Subject: [PATCH 215/607] cpuset: Add allow_attach hook for cpusets on android. This patch provides a allow_attach hook for cpusets, which resolves lots of the following logcat noise. W SchedPolicy: add_tid_to_cgroup failed to write '2816' (Permission denied); fd=29 W ActivityManager: Failed setting process group of 2816 to 0 W System.err: java.lang.IllegalArgumentException W System.err: at android.os.Process.setProcessGroup(Native Method) W System.err: at com.android.server.am.ActivityManagerService.applyOomAdjLocked(ActivityManagerService.java:18763) W System.err: at com.android.server.am.ActivityManagerService.updateOomAdjLocked(ActivityManagerService.java:19028) W System.err: at com.android.server.am.ActivityManagerService.updateOomAdjLocked(ActivityManagerService.java:19106) W System.err: at com.android.server.am.ActiveServices.serviceDoneExecutingLocked(ActiveServices.java:2015) W System.err: at com.android.server.am.ActiveServices.publishServiceLocked(ActiveServices.java:905) W System.err: at com.android.server.am.ActivityManagerService.publishService(ActivityManagerService.java:16065) W System.err: at android.app.ActivityManagerNative.onTransact(ActivityManagerNative.java:1007) W System.err: at com.android.server.am.ActivityManagerService.onTransact(ActivityManagerService.java:2493) W System.err: at android.os.Binder.execTransact(Binder.java:453) Change-Id: Ic1b61b2bbb7ce74c9e9422b5e22ee9078251de21 [Ported to 4.4, added commit message] Signed-off-by: John Stultz --- kernel/cpuset.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 02a8ea5c9963..0ff01c2a7222 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2051,12 +2051,30 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&cpuset_mutex); } +static int cpuset_allow_attach(struct cgroup_taskset *tset) +{ + const struct cred *cred = current_cred(), *tcred; + struct task_struct *task; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(task, css, tset) { + tcred = __task_cred(task); + + if ((current != task) && !capable(CAP_SYS_ADMIN) && + cred->euid.val != tcred->uid.val && cred->euid.val != tcred->suid.val) + 
return -EACCES; + } + + return 0; +} + struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, + .allow_attach = cpuset_allow_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, .bind = cpuset_bind, From f394c99b6ac0236d7e06ee406107294d5f389259 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 5 Jul 2016 22:12:36 -0700 Subject: [PATCH 216/607] UPSTREAM: ppp: defer netns reference release for ppp channel (cherry pick from commit 205e1e255c479f3fd77446415706463b282f94e4) Matt reported that we have a NULL pointer dereference in ppp_pernet() from ppp_connect_channel(), i.e. pch->chan_net is NULL. This is due to that a parallel ppp_unregister_channel() could happen while we are in ppp_connect_channel(), during which pch->chan_net set to NULL. Since we need a reference to net per channel, it makes sense to sync the refcnt with the life time of the channel, therefore we should release this reference when we destroy it. Fixes: 1f461dcdd296 ("ppp: take reference on channels netns") Reported-by: Matt Bennett Cc: Paul Mackerras Cc: linux-ppp@vger.kernel.org Cc: Guillaume Nault Cc: Cyrill Gorcunov Signed-off-by: Cong Wang Reviewed-by: Cyrill Gorcunov Signed-off-by: David S. Miller Fixes: Change-Id: Iee0015eca5bd181954bb4896a3720f7549c5ed0b ("UPSTREAM: ppp: take reference on channels netns") Signed-off-by: Amit Pundir Change-Id: I24d0bb6f349ab3829f63cfe935ed97b6913a3508 --- drivers/net/ppp/ppp_generic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index afe6fddd2baa..50cda254b57a 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -2387,8 +2387,6 @@ ppp_unregister_channel(struct ppp_channel *chan) spin_lock_bh(&pn->all_channels_lock); list_del(&pch->list); spin_unlock_bh(&pn->all_channels_lock); - put_net(pch->chan_net); - pch->chan_net = NULL; pch->file.dead = 1; wake_up_interruptible(&pch->file.rwait); @@ -2980,6 +2978,9 @@ ppp_disconnect_channel(struct channel *pch) */ static void ppp_destroy_channel(struct channel *pch) { + put_net(pch->chan_net); + pch->chan_net = NULL; + atomic_dec(&channel_count); if (!pch->file.dead) { From 53c39433d5fd7d71bec7e904a6fa6691a9e90fdc Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 1 Jun 2016 11:55:05 +0200 Subject: [PATCH 217/607] UPSTREAM: proc: prevent stacking filesystems on top (cherry picked from commit e54ad7f1ee263ffa5a2de9c609d58dfa27b21cd9) This prevents stacking filesystems (ecryptfs and overlayfs) from using procfs as lower filesystem. There is too much magic going on inside procfs, and there is no good reason to stack stuff on top of procfs. (For example, procfs does access checks in VFS open handlers, and ecryptfs by design calls open handlers from a kernel thread that doesn't drop privileges or so.) 
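The mechanism is indirect: stacking filesystems account for their nesting depth in sb->s_stack_depth and refuse to mount once FILESYSTEM_MAX_STACK_DEPTH is reached, so pinning procfs at the maximum makes any attempt to use it as a lower filesystem fail at mount time. A minimal sketch of that check, modeled on ecryptfs's mount path (simplified, error reporting omitted):

#include <linux/fs.h>

/* Sketch: a stacking filesystem records one more level of depth than
 * its lower superblock and bails out past the limit.  With procfs's
 * s_stack_depth preset to FILESYSTEM_MAX_STACK_DEPTH this always
 * returns -EINVAL. */
static int check_stacking_depth(struct super_block *sb,
				struct super_block *lower_sb)
{
	sb->s_stack_depth = lower_sb->s_stack_depth + 1;
	if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH)
		return -EINVAL;
	return 0;
}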
Signed-off-by: Jann Horn Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds Change-Id: Ib050ef9dc10e623589d22e3a9e6aee9ee4f0cd5d Bug: 29444228 --- fs/proc/root.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/proc/root.c b/fs/proc/root.c index 361ab4ee42fc..ec649c92d270 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -121,6 +121,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, if (IS_ERR(sb)) return ERR_CAST(sb); + /* + * procfs isn't actually a stacking filesystem; however, there is + * too much magic going on inside it to permit stacking things on + * top of it + */ + sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + if (!proc_parse_options(options, ns)) { deactivate_locked_super(sb); return ERR_PTR(-EINVAL); From ed23e1145053fb43639a178e89e99bb14f11272b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 1 Jun 2016 11:55:06 +0200 Subject: [PATCH 218/607] UPSTREAM: ecryptfs: forbid opening files without mmap handler (cherry picked from commit 2f36db71009304b3f0b95afacd8eba1f9f046b87) This prevents users from triggering a stack overflow through a recursive invocation of pagefault handling that involves mapping procfs files into virtual memory. Signed-off-by: Jann Horn Acked-by: Tyler Hicks Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds Change-Id: I0be77c7f8bd3046bc34cd87ef577529792d479bc Bug: 29444228 --- fs/ecryptfs/kthread.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index 866bb18efefe..e818f5ac7a26 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "ecryptfs_kernel.h" struct ecryptfs_open_req { @@ -147,7 +148,7 @@ int ecryptfs_privileged_open(struct file **lower_file, flags |= IS_RDONLY(d_inode(lower_dentry)) ? O_RDONLY : O_RDWR; (*lower_file) = dentry_open(&req.path, flags, cred); if (!IS_ERR(*lower_file)) - goto out; + goto have_file; if ((flags & O_ACCMODE) == O_RDONLY) { rc = PTR_ERR((*lower_file)); goto out; @@ -165,8 +166,16 @@ int ecryptfs_privileged_open(struct file **lower_file, mutex_unlock(&ecryptfs_kthread_ctl.mux); wake_up(&ecryptfs_kthread_ctl.wait); wait_for_completion(&req.done); - if (IS_ERR(*lower_file)) + if (IS_ERR(*lower_file)) { rc = PTR_ERR(*lower_file); + goto out; + } +have_file: + if ((*lower_file)->f_op->mmap == NULL) { + fput(*lower_file); + *lower_file = NULL; + rc = -EMEDIUMTYPE; + } out: return rc; } From 85a09f2732f9d0bbb9fdec98f292f1db6c75a96a Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 1 Jun 2016 11:55:07 +0200 Subject: [PATCH 219/607] UPSTREAM: sched: panic on corrupted stack end (cherry picked from commit 29d6455178a09e1dc340380c582b13356227e8df) Until now, hitting this BUG_ON caused a recursive oops (because oops handling involves do_exit(), which calls into the scheduler, which in turn raises an oops), which caused stuff below the stack to be overwritten until a panic happened (e.g. via an oops in interrupt context, caused by the overwritten CPU index in the thread_info). Just panic directly. 
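For reference, the condition itself is unchanged; in kernels of this vintage it is roughly the canary test below, i.e. the last word of the task's stack no longer holds STACK_END_MAGIC:

/* include/linux/sched.h (approximate): the far end of the task stack is
 * seeded with STACK_END_MAGIC, so any other value means the stack has
 * been overrun. */
#define task_stack_end_corrupted(task) \
	(*(end_of_stack(task)) != STACK_END_MAGIC)

Only the reaction changes: a direct panic() instead of a BUG_ON() whose oops path would itself re-enter the scheduler.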
Signed-off-by: Jann Horn Signed-off-by: Linus Torvalds Change-Id: Ia3acb3f747f7a58ec2d071644433b0591925969f Bug: 29444228 --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 61b0914cc7aa..b290af9c37a1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3008,7 +3008,8 @@ static noinline void __schedule_bug(struct task_struct *prev) static inline void schedule_debug(struct task_struct *prev) { #ifdef CONFIG_SCHED_STACK_END_CHECK - BUG_ON(task_stack_end_corrupted(prev)); + if (task_stack_end_corrupted(prev)) + panic("corrupted stack end detected inside scheduler\n"); #endif if (unlikely(in_atomic_preempt_off())) { From 419d43edc519b477d365b6311c48f875cec6bfdf Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 14 Jul 2016 11:20:55 -0700 Subject: [PATCH 220/607] FROMLIST: proc: Relax /proc//timerslack_ns capability requirements When an interface to allow a task to change another tasks timerslack was first proposed, it was suggested that something greater then CAP_SYS_NICE would be needed, as a task could be delayed further then what normally could be done with nice adjustments. So CAP_SYS_PTRACE was adopted instead for what became the /proc//timerslack_ns interface. However, for Android (where this feature originates), giving the system_server CAP_SYS_PTRACE would allow it to observe and modify all tasks memory. This is considered too high a privilege level for only needing to change the timerslack. After some discussion, it was realized that a CAP_SYS_NICE process can set a task as SCHED_FIFO, so they could fork some spinning processes and set them all SCHED_FIFO 99, in effect delaying all other tasks for an infinite amount of time. So as a CAP_SYS_NICE task can already cause trouble for other tasks, using it as a required capability for accessing and modifying /proc//timerslack_ns seems sufficient. Thus, this patch loosens the capability requirements to CAP_SYS_NICE and removes CAP_SYS_PTRACE, simplifying some of the code flow as well. This is technically an ABI change, but as the feature just landed in 4.6, I suspect no one is yet using it. Cc: Kees Cook Cc: "Serge E. 
Hallyn" Cc: Andrew Morton Cc: Thomas Gleixner CC: Arjan van de Ven Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Todd Kjos Cc: Colin Cross Cc: Nick Kralevich Cc: Dmitry Shmidt Cc: Elliott Hughes Cc: Android Kernel Team Reviewed-by: Nick Kralevich Acked-by: Serge Hallyn Acked-by: Kees Cook Signed-off-by: John Stultz FROMLIST URL: https://lkml.org/lkml/2016/7/21/522 Change-Id: Ia75481402e3948165a1b7c1551c539530cb25509 (Cherry picked against common/android-4.4) Signed-off-by: John Stultz --- fs/proc/base.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index db221e53a1ac..ee1b021e153e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2260,16 +2260,19 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, if (!p) return -ESRCH; - if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { - task_lock(p); - if (slack_ns == 0) - p->timer_slack_ns = p->default_timer_slack_ns; - else - p->timer_slack_ns = slack_ns; - task_unlock(p); - } else + if (!capable(CAP_SYS_NICE)) { count = -EPERM; + goto out; + } + task_lock(p); + if (slack_ns == 0) + p->timer_slack_ns = p->default_timer_slack_ns; + else + p->timer_slack_ns = slack_ns; + task_unlock(p); + +out: put_task_struct(p); return count; @@ -2279,19 +2282,22 @@ static int timerslack_ns_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; - int err = 0; + int err = 0; p = get_proc_task(inode); if (!p) return -ESRCH; - if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { - task_lock(p); - seq_printf(m, "%llu\n", p->timer_slack_ns); - task_unlock(p); - } else + if (!capable(CAP_SYS_NICE)) { err = -EPERM; + goto out; + } + task_lock(p); + seq_printf(m, "%llu\n", p->timer_slack_ns); + task_unlock(p); + +out: put_task_struct(p); return err; From 5d1b9c0c012ff96d543aa90db888ca511f53dfaa Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 14 Jul 2016 17:22:19 -0700 Subject: [PATCH 221/607] FROMLIST: proc: Add LSM hook checks to /proc//timerslack_ns As requested, this patch checks the existing LSM hooks task_getscheduler/task_setscheduler when reading or modifying the task's timerslack value. Previous versions added new get/settimerslack LSM hooks, but since they checked the same PROCESS__SET/GETSCHED values as existing hooks, it was suggested we just use the existing ones. Cc: Kees Cook Cc: "Serge E. 
Hallyn" Cc: Andrew Morton Cc: Thomas Gleixner CC: Arjan van de Ven Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Todd Kjos Cc: Colin Cross Cc: Nick Kralevich Cc: Dmitry Shmidt Cc: Elliott Hughes Cc: James Morris Cc: Android Kernel Team Cc: linux-security-module@vger.kernel.org Cc: selinux@tycho.nsa.gov Signed-off-by: John Stultz FROMLIST URL: https://lkml.org/lkml/2016/7/21/523 Change-Id: Id157d10e2fe0b85f1be45035a6117358a42af028 (Cherry picked back to common/android-4.4) Signed-off-by: John Stultz --- fs/proc/base.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index ee1b021e153e..ac5d2aea63b5 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2265,6 +2265,12 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, goto out; } + err = security_task_setscheduler(p); + if (err) { + count = err; + goto out; + } + task_lock(p); if (slack_ns == 0) p->timer_slack_ns = p->default_timer_slack_ns; @@ -2293,6 +2299,10 @@ static int timerslack_ns_show(struct seq_file *m, void *v) goto out; } + err = security_task_getscheduler(p); + if (err) + goto out; + task_lock(p); seq_printf(m, "%llu\n", p->timer_slack_ns); task_unlock(p); From 850117629453026394e13622203e2276891c3b8a Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Fri, 15 Jul 2016 12:39:13 +0200 Subject: [PATCH 222/607] BACKPORT: brcmfmac: defer DPC processing during probe The sdio dpc starts processing when in SDIOD_STATE_DATA. This state was entered right after firmware download. This patch moves that transition just before enabling sdio interrupt handling thus avoiding watchdog expiry which would put the bus to sleep while probing. Change-Id: I09c60752374b8145da78000935062be08c5c8a52 Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo Signed-off-by: Dmitry Shmidt --- drivers/net/wireless/brcm80211/brcmfmac/sdio.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/brcm80211/brcmfmac/sdio.c index 7e74ac3ad815..bcf29bf6f727 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/sdio.c @@ -3401,10 +3401,6 @@ static int brcmf_sdio_download_firmware(struct brcmf_sdio *bus, goto err; } - /* Allow full data communication using DPC from now on. */ - brcmf_sdiod_change_state(bus->sdiodev, BRCMF_SDIOD_DATA); - bcmerror = 0; - err: brcmf_sdio_clkctl(bus, CLK_SDONLY, false); sdio_release_host(bus->sdiodev->func[1]); @@ -4112,6 +4108,9 @@ static void brcmf_sdio_firmware_callback(struct device *dev, } if (err == 0) { + /* Allow full data communication using DPC from now on. */ + brcmf_sdiod_change_state(bus->sdiodev, BRCMF_SDIOD_DATA); + err = brcmf_sdiod_intr_register(sdiodev); if (err != 0) brcmf_err("intr register failed:%d\n", err); From 1b06e2539281187910eb971202c7b2786fab5b57 Mon Sep 17 00:00:00 2001 From: Anson Jacob Date: Sun, 31 Jul 2016 22:30:14 -0400 Subject: [PATCH 223/607] usb: gadget: f_accessory: remove duplicate endpoint alloc usb_ep_autoconfig is called twice for allocating bulk out endpoint. Removed the unwanted call. 
Fixes Issue: 67180 Change-Id: I03e87a86fbbbc85831ff7f0496adf038d1de2956 Signed-off-by: Anson Jacob --- drivers/usb/gadget/function/f_accessory.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c index c62123560143..2ca16a577542 100644 --- a/drivers/usb/gadget/function/f_accessory.c +++ b/drivers/usb/gadget/function/f_accessory.c @@ -531,15 +531,6 @@ static int create_bulk_endpoints(struct acc_dev *dev, ep->driver_data = dev; /* claim the endpoint */ dev->ep_out = ep; - ep = usb_ep_autoconfig(cdev->gadget, out_desc); - if (!ep) { - DBG(cdev, "usb_ep_autoconfig for ep_out failed\n"); - return -ENODEV; - } - DBG(cdev, "usb_ep_autoconfig for ep_out got %s\n", ep->name); - ep->driver_data = dev; /* claim the endpoint */ - dev->ep_out = ep; - /* now allocate requests for our endpoints */ for (i = 0; i < TX_REQ_MAX; i++) { req = acc_request_new(dev->ep_in, BULK_BUFFER_SIZE); From 818aa36ea868ba8f2985f9ca0906fd9cba3e437d Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Sun, 31 Jul 2016 17:07:46 +0530 Subject: [PATCH 224/607] Revert "panic: Add board ID to panic output" This reverts commit 4e09c510185cb4db2277ce81cce81b7aa06bea45. I checked for the usage of this debug helper in AOSP common kernels as well as vendor kernels (e.g exynos, msm, mediatek, omap, tegra, x86, x86_64) hosted at https://android.googlesource.com/kernel/ and I found out that other than few fairly obsolete Omap trees (for tuna & Glass) and Exynos tree (for Manta), there is no active user of this debug helper. So we can safely remove this helper code. Signed-off-by: Amit Pundir --- include/linux/kernel.h | 4 ---- kernel/panic.c | 8 -------- 2 files changed, 12 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 1eaf3d81f5c1..350dfb08aee3 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -830,8 +830,4 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } /* OTHER_WRITABLE? Generally considered a bad idea. */ \ BUILD_BUG_ON_ZERO((perms) & 2) + \ (perms)) - -/* To identify board information in panic logs, set this */ -extern char *mach_panic_string; - #endif diff --git a/kernel/panic.c b/kernel/panic.c index f07bfc9fe613..4b150bc0c6c1 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -28,9 +28,6 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 -/* Machine specific panic information string */ -char *mach_panic_string; - int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask; static int pause_on_oops; @@ -416,11 +413,6 @@ late_initcall(init_oops_id); void print_oops_end_marker(void) { init_oops_id(); - - if (mach_panic_string) - printk(KERN_WARNING "Board Information: %s\n", - mach_panic_string); - pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); } From f3d9c312b89966b6d1ff9e4c503a7ae6d38e0176 Mon Sep 17 00:00:00 2001 From: James Carr Date: Fri, 29 Jul 2016 19:02:16 -0700 Subject: [PATCH 225/607] Implement memory_state_time, used by qcom,cpubw New driver memory_state_time tracks time spent in different DDR frequency and bandwidth states. Memory drivers such as qcom,cpubw can post updated state to the driver after registering a callback. Processed by a workqueue Bandwidth buckets are read in from device tree in the relevant qualcomm section, can be defined in any quantity and spacing. The data is exposed at /sys/kernel/memory_state_time, able to be read by the Android framework. 
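A provider such as a DDR-frequency or bus-bandwidth driver is expected to register a source once and then post every state change through the returned block. A hypothetical client-side sketch against the interface in the new header; only memory_state_register_bandwidth_source() and UPDATE_MEMORY_STATE() come from this patch, the surrounding driver is illustrative:

#include <linux/memory-state-time.h>

static struct memory_state_update_block *bw_block;

/* Hypothetical probe-time registration of one bandwidth source. */
static int example_bw_source_init(void)
{
	bw_block = memory_state_register_bandwidth_source();
	return bw_block ? 0 : -ENOMEM;
}

/* Called whenever this source's bandwidth vote changes (value in Mbps,
 * matching the bw-buckets device-tree property). */
static void example_bw_changed(int new_bw_mbps)
{
	if (bw_block)
		UPDATE_MEMORY_STATE(bw_block, new_bw_mbps);
}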
Functionality is behind a config option CONFIG_MEMORY_STATE_TIME Change-Id: I4fee165571cb975fb9eacbc9aada5e6d7dd748f0 Signed-off-by: James Carr --- .../bindings/misc/memory-state-time.txt | 8 + android/configs/android-recommended.cfg | 1 + drivers/misc/Kconfig | 6 + drivers/misc/Makefile | 1 + drivers/misc/memory_state_time.c | 454 ++++++++++++++++++ include/linux/memory-state-time.h | 42 ++ 6 files changed, 512 insertions(+) create mode 100644 Documentation/devicetree/bindings/misc/memory-state-time.txt create mode 100644 drivers/misc/memory_state_time.c create mode 100644 include/linux/memory-state-time.h diff --git a/Documentation/devicetree/bindings/misc/memory-state-time.txt b/Documentation/devicetree/bindings/misc/memory-state-time.txt new file mode 100644 index 000000000000..c99a506c030d --- /dev/null +++ b/Documentation/devicetree/bindings/misc/memory-state-time.txt @@ -0,0 +1,8 @@ +Memory bandwidth and frequency state tracking + +Required properties: +- compatible : should be: + "memory-state-time" +- freq-tbl: Should contain entries with each frequency in Hz. +- bw-buckets: Should contain upper-bound limits for each bandwidth bucket in Mbps. + Must match the framework power_profile.xml for the device. diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index 2f1ef077aa9e..3465a848d74d 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -119,6 +119,7 @@ CONFIG_TIMER_STATS=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_UHID=y +CONFIG_MEMORY_STATE_TIME=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_HIDDEV=y diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 2763d6b3b063..598f96c26678 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -531,6 +531,12 @@ config UID_CPUTIME help Per UID based cpu time statistics exported to /proc/uid_cputime +config MEMORY_STATE_TIME + tristate "Memory freq/bandwidth time statistics" + depends on PROFILING + help + Memory time statistics exported to /sys/kernel/memory_state_time + source "drivers/misc/c2port/Kconfig" source "drivers/misc/eeprom/Kconfig" source "drivers/misc/cb710/Kconfig" diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index e5142b836aee..b76b4c9fe104 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -57,3 +57,4 @@ obj-$(CONFIG_ECHO) += echo/ obj-$(CONFIG_VEXPRESS_SYSCFG) += vexpress-syscfg.o obj-$(CONFIG_CXL_BASE) += cxl/ obj-$(CONFIG_UID_CPUTIME) += uid_cputime.o +obj-$(CONFIG_MEMORY_STATE_TIME) += memory_state_time.o diff --git a/drivers/misc/memory_state_time.c b/drivers/misc/memory_state_time.c new file mode 100644 index 000000000000..34c797a06a31 --- /dev/null +++ b/drivers/misc/memory_state_time.c @@ -0,0 +1,454 @@ +/* drivers/misc/memory_state_time.c + * + * Copyright (C) 2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define KERNEL_ATTR_RO(_name) \ +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +#define KERNEL_ATTR_RW(_name) \ +static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +#define FREQ_HASH_BITS 4 +DECLARE_HASHTABLE(freq_hash_table, FREQ_HASH_BITS); + +static DEFINE_MUTEX(mem_lock); + +#define TAG "memory_state_time" +#define BW_NODE "/soc/memory-state-time" +#define FREQ_TBL "freq-tbl" +#define BW_TBL "bw-buckets" +#define NUM_SOURCES "num-sources" + +#define LOWEST_FREQ 2 + +static int curr_bw; +static int curr_freq; +static u32 *bw_buckets; +static u32 *freq_buckets; +static int num_freqs; +static int num_buckets; +static int registered_bw_sources; +static u64 last_update; +static bool init_success; +static struct workqueue_struct *memory_wq; +static u32 num_sources = 10; +static int *bandwidths; + +struct freq_entry { + int freq; + u64 *buckets; /* Bandwidth buckets. */ + struct hlist_node hash; +}; + +struct queue_container { + struct work_struct update_state; + int value; + u64 time_now; + int id; + struct mutex *lock; +}; + +static int find_bucket(int bw) +{ + int i; + + if (bw_buckets != NULL) { + for (i = 0; i < num_buckets; i++) { + if (bw_buckets[i] > bw) { + pr_debug("Found bucket %d for bandwidth %d\n", + i, bw); + return i; + } + } + return num_buckets - 1; + } + return 0; +} + +static u64 get_time_diff(u64 time_now) +{ + u64 ms; + + ms = time_now - last_update; + last_update = time_now; + return ms; +} + +static ssize_t show_stat_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int i, j; + int len = 0; + struct freq_entry *freq_entry; + + for (i = 0; i < num_freqs; i++) { + hash_for_each_possible(freq_hash_table, freq_entry, hash, + freq_buckets[i]) { + if (freq_entry->freq == freq_buckets[i]) { + len += scnprintf(buf + len, PAGE_SIZE - len, + "%d ", freq_buckets[i]); + if (len >= PAGE_SIZE) + break; + for (j = 0; j < num_buckets; j++) { + len += scnprintf(buf + len, + PAGE_SIZE - len, + "%llu ", + freq_entry->buckets[j]); + } + len += scnprintf(buf + len, PAGE_SIZE - len, + "\n"); + } + } + } + pr_debug("Current Time: %llu\n", ktime_get_boot_ns()); + return len; +} +KERNEL_ATTR_RO(show_stat); + +static void update_table(u64 time_now) +{ + struct freq_entry *freq_entry; + + pr_debug("Last known bw %d freq %d\n", curr_bw, curr_freq); + hash_for_each_possible(freq_hash_table, freq_entry, hash, curr_freq) { + if (curr_freq == freq_entry->freq) { + freq_entry->buckets[find_bucket(curr_bw)] + += get_time_diff(time_now); + break; + } + } +} + +static bool freq_exists(int freq) +{ + int i; + + for (i = 0; i < num_freqs; i++) { + if (freq == freq_buckets[i]) + return true; + } + return false; +} + +static int calculate_total_bw(int bw, int index) +{ + int i; + int total_bw = 0; + + pr_debug("memory_state_time New bw %d for id %d\n", bw, index); + bandwidths[index] = bw; + for (i = 0; i < registered_bw_sources; i++) + total_bw += bandwidths[i]; + return total_bw; +} + +static void freq_update_do_work(struct work_struct *work) +{ + struct queue_container *freq_state_update + = container_of(work, struct queue_container, + update_state); + if (freq_state_update) { + mutex_lock(&mem_lock); + update_table(freq_state_update->time_now); + curr_freq = freq_state_update->value; + mutex_unlock(&mem_lock); + kfree(freq_state_update); + } +} + 
+static void bw_update_do_work(struct work_struct *work) +{ + struct queue_container *bw_state_update + = container_of(work, struct queue_container, + update_state); + if (bw_state_update) { + mutex_lock(&mem_lock); + update_table(bw_state_update->time_now); + curr_bw = calculate_total_bw(bw_state_update->value, + bw_state_update->id); + mutex_unlock(&mem_lock); + kfree(bw_state_update); + } +} + +static void memory_state_freq_update(struct memory_state_update_block *ub, + int value) +{ + if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) { + if (freq_exists(value) && init_success) { + struct queue_container *freq_container + = kmalloc(sizeof(struct queue_container), + GFP_KERNEL); + if (!freq_container) + return; + INIT_WORK(&freq_container->update_state, + freq_update_do_work); + freq_container->time_now = ktime_get_boot_ns(); + freq_container->value = value; + pr_debug("Scheduling freq update in work queue\n"); + queue_work(memory_wq, &freq_container->update_state); + } else { + pr_debug("Freq does not exist.\n"); + } + } +} + +static void memory_state_bw_update(struct memory_state_update_block *ub, + int value) +{ + if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) { + if (init_success) { + struct queue_container *bw_container + = kmalloc(sizeof(struct queue_container), + GFP_KERNEL); + if (!bw_container) + return; + INIT_WORK(&bw_container->update_state, + bw_update_do_work); + bw_container->time_now = ktime_get_boot_ns(); + bw_container->value = value; + bw_container->id = ub->id; + pr_debug("Scheduling bandwidth update in work queue\n"); + queue_work(memory_wq, &bw_container->update_state); + } + } +} + +struct memory_state_update_block *memory_state_register_frequency_source(void) +{ + struct memory_state_update_block *block; + + if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) { + pr_debug("Allocating frequency source\n"); + block = kmalloc(sizeof(struct memory_state_update_block), + GFP_KERNEL); + if (!block) + return NULL; + block->update_call = memory_state_freq_update; + return block; + } + pr_err("Config option disabled.\n"); + return NULL; +} +EXPORT_SYMBOL_GPL(memory_state_register_frequency_source); + +struct memory_state_update_block *memory_state_register_bandwidth_source(void) +{ + struct memory_state_update_block *block; + + if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) { + pr_debug("Allocating bandwidth source %d\n", + registered_bw_sources); + block = kmalloc(sizeof(struct memory_state_update_block), + GFP_KERNEL); + if (!block) + return NULL; + block->update_call = memory_state_bw_update; + if (registered_bw_sources < num_sources) { + block->id = registered_bw_sources++; + } else { + pr_err("Unable to allocate source; max number reached\n"); + kfree(block); + return NULL; + } + return block; + } + pr_err("Config option disabled.\n"); + return NULL; +} +EXPORT_SYMBOL_GPL(memory_state_register_bandwidth_source); + +/* Buckets are designated by their maximum. + * Returns the buckets decided by the capability of the device. 
+ */ +static int get_bw_buckets(struct device *dev) +{ + int ret, lenb; + struct device_node *node = dev->of_node; + + of_property_read_u32(node, NUM_SOURCES, &num_sources); + if (of_find_property(node, BW_TBL, &lenb)) { + bandwidths = devm_kzalloc(dev, + sizeof(*bandwidths) * num_sources, GFP_KERNEL); + if (!bandwidths) + return -ENOMEM; + lenb /= sizeof(*bw_buckets); + bw_buckets = devm_kzalloc(dev, lenb * sizeof(*bw_buckets), + GFP_KERNEL); + if (!bw_buckets) { + devm_kfree(dev, bandwidths); + return -ENOMEM; + } + ret = of_property_read_u32_array(node, BW_TBL, bw_buckets, + lenb); + if (ret < 0) { + devm_kfree(dev, bandwidths); + devm_kfree(dev, bw_buckets); + pr_err("Unable to read bandwidth table from device tree.\n"); + return ret; + } + } + curr_bw = 0; + num_buckets = lenb; + return 0; +} + +/* Adds struct freq_entry nodes to the hashtable for each compatible frequency. + * Returns the supported number of frequencies. + */ +static int freq_buckets_init(struct device *dev) +{ + struct freq_entry *freq_entry; + int i; + int ret, lenf; + struct device_node *node = dev->of_node; + + if (of_find_property(node, FREQ_TBL, &lenf)) { + lenf /= sizeof(*freq_buckets); + freq_buckets = devm_kzalloc(dev, lenf * sizeof(*freq_buckets), + GFP_KERNEL); + if (!freq_buckets) + return -ENOMEM; + pr_debug("freqs found len %d\n", lenf); + ret = of_property_read_u32_array(node, FREQ_TBL, freq_buckets, + lenf); + if (ret < 0) { + devm_kfree(dev, freq_buckets); + pr_err("Unable to read frequency table from device tree.\n"); + return ret; + } + pr_debug("ret freq %d\n", ret); + } + num_freqs = lenf; + curr_freq = freq_buckets[LOWEST_FREQ]; + + for (i = 0; i < num_freqs; i++) { + freq_entry = devm_kzalloc(dev, sizeof(struct freq_entry), + GFP_KERNEL); + if (!freq_entry) + return -ENOMEM; + freq_entry->buckets = devm_kzalloc(dev, sizeof(u64)*num_buckets, + GFP_KERNEL); + if (!freq_entry->buckets) { + devm_kfree(dev, freq_entry); + return -ENOMEM; + } + pr_debug("memory_state_time Adding freq to ht %d\n", + freq_buckets[i]); + freq_entry->freq = freq_buckets[i]; + hash_add(freq_hash_table, &freq_entry->hash, freq_buckets[i]); + } + return 0; +} + +struct kobject *memory_kobj; +EXPORT_SYMBOL_GPL(memory_kobj); + +static struct attribute *memory_attrs[] = { + &show_stat_attr.attr, + NULL +}; + +static struct attribute_group memory_attr_group = { + .attrs = memory_attrs, +}; + +static int memory_state_time_probe(struct platform_device *pdev) +{ + int error; + + error = get_bw_buckets(&pdev->dev); + if (error) + return error; + error = freq_buckets_init(&pdev->dev); + if (error) + return error; + last_update = ktime_get_boot_ns(); + init_success = true; + + pr_debug("memory_state_time initialized with num_freqs %d\n", + num_freqs); + return 0; +} + +static const struct of_device_id match_table[] = { + { .compatible = "memory-state-time" }, + {} +}; + +static struct platform_driver memory_state_time_driver = { + .probe = memory_state_time_probe, + .driver = { + .name = "memory-state-time", + .of_match_table = match_table, + .owner = THIS_MODULE, + }, +}; + +static int __init memory_state_time_init(void) +{ + int error; + + hash_init(freq_hash_table); + memory_wq = create_singlethread_workqueue("memory_wq"); + if (!memory_wq) { + pr_err("Unable to create workqueue.\n"); + return -EINVAL; + } + /* + * Create sys/kernel directory for memory_state_time. 
+ */ + memory_kobj = kobject_create_and_add(TAG, kernel_kobj); + if (!memory_kobj) { + pr_err("Unable to allocate memory_kobj for sysfs directory.\n"); + error = -ENOMEM; + goto wq; + } + error = sysfs_create_group(memory_kobj, &memory_attr_group); + if (error) { + pr_err("Unable to create sysfs folder.\n"); + goto kobj; + } + + error = platform_driver_register(&memory_state_time_driver); + if (error) { + pr_err("Unable to register memory_state_time platform driver.\n"); + goto group; + } + return 0; + +group: sysfs_remove_group(memory_kobj, &memory_attr_group); +kobj: kobject_put(memory_kobj); +wq: destroy_workqueue(memory_wq); + return error; +} +module_init(memory_state_time_init); diff --git a/include/linux/memory-state-time.h b/include/linux/memory-state-time.h new file mode 100644 index 000000000000..d2212b027866 --- /dev/null +++ b/include/linux/memory-state-time.h @@ -0,0 +1,42 @@ +/* include/linux/memory-state-time.h + * + * Copyright (C) 2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include + +#define UPDATE_MEMORY_STATE(BLOCK, VALUE) BLOCK->update_call(BLOCK, VALUE) + +struct memory_state_update_block; + +typedef void (*memory_state_update_fn_t)(struct memory_state_update_block *ub, + int value); + +/* This struct is populated when you pass it to a memory_state_register* + * function. The update_call function is used for an update and defined in the + * typedef memory_state_update_fn_t + */ +struct memory_state_update_block { + memory_state_update_fn_t update_call; + int id; +}; + +/* Register a frequency struct memory_state_update_block to provide updates to + * memory_state_time about frequency changes using its update_call function. + */ +struct memory_state_update_block *memory_state_register_frequency_source(void); + +/* Register a bandwidth struct memory_state_update_block to provide updates to + * memory_state_time about bandwidth changes using its update_call function. + */ +struct memory_state_update_block *memory_state_register_bandwidth_source(void); From ee247d4205fdee14675c4068c7fbb2c8118ba4ce Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 14 Mar 2016 09:56:35 -0300 Subject: [PATCH 226/607] UPSTREAM: net: Fix use after free in the recvmmsg exit path (cherry picked from commit 34b88a68f26a75e4fded796f1a49c40f82234b7d) The syzkaller fuzzer hit the following use-after-free: Call Trace: [] __asan_report_load8_noabort+0x3e/0x40 mm/kasan/report.c:295 [] __sys_recvmmsg+0x6fa/0x7f0 net/socket.c:2261 [< inline >] SYSC_recvmmsg net/socket.c:2281 [] SyS_recvmmsg+0x16f/0x180 net/socket.c:2270 [] entry_SYSCALL_64_fastpath+0x16/0x7a arch/x86/entry/entry_64.S:185 And, as Dmitry rightly assessed, that is because we can drop the reference and then touch it when the underlying recvmsg calls return some packets and then hit an error, which will make recvmmsg to set sock->sk->sk_err, oops, fix it. 
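The defect is purely one of ordering: on the path where some datagrams had already been received and a later recvmsg failed, the old code dropped the file reference that keeps the socket alive and only then stored the error through sock->sk. A minimal sketch of the two orderings (illustrative, not the syscall source):

/* Broken ordering: the reference pinning 'sock' is released first. */
fput_light(sock->file, fput_needed);	/* may free sock and sock->sk */
if (datagrams != 0 && err != -EAGAIN)
	sock->sk->sk_err = -err;	/* potential use after free */

/* Fixed ordering, as in the hunk below: record the error while the
 * reference is still held, then release it on the single exit path. */
if (datagrams != 0 && err != -EAGAIN)
	sock->sk->sk_err = -err;
fput_light(sock->file, fput_needed);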
Reported-and-Tested-by: Dmitry Vyukov Cc: Alexander Potapenko Cc: Eric Dumazet Cc: Kostya Serebryany Cc: Sasha Levin Fixes: a2e2725541fa ("net: Introduce recvmmsg socket syscall") http://lkml.kernel.org/r/20160122211644.GC2470@redhat.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Change-Id: I2adb0faf595b7b634d9b739dfdd1a47109e20ecb Bug: 30515201 --- net/socket.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/net/socket.c b/net/socket.c index d730ef9dfbf0..263b334ec5e4 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2238,31 +2238,31 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, break; } + if (err == 0) + goto out_put; + + if (datagrams == 0) { + datagrams = err; + goto out_put; + } + + /* + * We may return less entries than requested (vlen) if the + * sock is non block and there aren't enough datagrams... + */ + if (err != -EAGAIN) { + /* + * ... or if recvmsg returns an error after we + * received some datagrams, where we record the + * error to return on the next call or if the + * app asks about it using getsockopt(SO_ERROR). + */ + sock->sk->sk_err = -err; + } out_put: fput_light(sock->file, fput_needed); - if (err == 0) - return datagrams; - - if (datagrams != 0) { - /* - * We may return less entries than requested (vlen) if the - * sock is non block and there aren't enough datagrams... - */ - if (err != -EAGAIN) { - /* - * ... or if recvmsg returns an error after we - * received some datagrams, where we record the - * error to return on the next call or if the - * app asks about it using getsockopt(SO_ERROR). - */ - sock->sk->sk_err = -err; - } - - return datagrams; - } - - return err; + return datagrams; } SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, From 96b0434c25be8c73822a679be35b29ca5263baea Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Wed, 9 Jun 2010 17:47:38 -0500 Subject: [PATCH 227/607] CHROMIUM: dm: boot time specification of dm= This is a wrap-up of three patches pending upstream approval. I'm bundling them because they are interdependent, and it'll be easier to drop it on rebase later. 1. dm: allow a dm-fs-style device to be shared via dm-ioctl Integrates feedback from Alisdair, Mike, and Kiyoshi. Two main changes occur here: - One function is added which allows for a programmatically created mapped device to be inserted into the dm-ioctl hash table. This binds the device to a name and, optional, uuid which is needed by udev and allows for userspace management of the mapped device. - dm_table_complete() was extended to handle all of the final functional changes required for the table to be operational once called. 2. init: boot to device-mapper targets without an initr* Add a dm= kernel parameter modeled after the md= parameter from do_mounts_md. It allows for device-mapper targets to be configured at boot time for use early in the boot process (as the root device or otherwise). It also replaces /dev/XXX calls with major:minor opportunistically. The format is dm="name uuid ro,table line 1,table line 2,...". The parser expects the comma to be safe to use as a newline substitute but, otherwise, uses the normal separator of space. Some attempt has been made to make it forgiving of additional spaces (using skip_spaces()). A mapped device created during boot will be assigned a minor of 0 and may be access via /dev/dm-0. 
An example dm-linear root with no uuid may look like: root=/dev/dm-0 dm="lroot none ro, 0 4096 linear /dev/ubdb 0, 4096 4096 linear /dv/ubdc 0" Once udev is started, /dev/dm-0 will become /dev/mapper/lroot. Older upstream threads: http://marc.info/?l=dm-devel&m=127429492521964&w=2 http://marc.info/?l=dm-devel&m=127429499422096&w=2 http://marc.info/?l=dm-devel&m=127429493922000&w=2 Latest upstream threads: https://patchwork.kernel.org/patch/104859/ https://patchwork.kernel.org/patch/104860/ https://patchwork.kernel.org/patch/104861/ Bug: 27175947 Signed-off-by: Will Drewry Review URL: http://codereview.chromium.org/2020011 Change-Id: I92bd53432a11241228d2e5ac89a3b20d19b05a31 --- Documentation/device-mapper/boot.txt | 42 +++ Documentation/kernel-parameters.txt | 6 + drivers/md/dm-ioctl.c | 39 +++ drivers/md/dm-table.c | 1 + include/linux/device-mapper.h | 6 + init/Makefile | 1 + init/do_mounts.c | 1 + init/do_mounts.h | 10 + init/do_mounts_dm.c | 410 +++++++++++++++++++++++++++ 9 files changed, 516 insertions(+) create mode 100644 Documentation/device-mapper/boot.txt create mode 100644 init/do_mounts_dm.c diff --git a/Documentation/device-mapper/boot.txt b/Documentation/device-mapper/boot.txt new file mode 100644 index 000000000000..adcaad5e5e32 --- /dev/null +++ b/Documentation/device-mapper/boot.txt @@ -0,0 +1,42 @@ +Boot time creation of mapped devices +=================================== + +It is possible to configure a device mapper device to act as the root +device for your system in two ways. + +The first is to build an initial ramdisk which boots to a minimal +userspace which configures the device, then pivot_root(8) in to it. + +For simple device mapper configurations, it is possible to boot directly +using the following kernel command line: + +dm=" ,table line 1,...,table line n" + +name = the name to associate with the device + after boot, udev, if used, will use that name to label + the device node. +uuid = may be 'none' or the UUID desired for the device. +ro = may be "ro" or "rw". If "ro", the device and device table will be + marked read-only. + +Each table line may be as normal when using the dmsetup tool except for +two variations: +1. Any use of commas will be interpreted as a newline +2. Quotation marks cannot be escaped and cannot be used without + terminating the dm= argument. + +Unless renamed by udev, the device node created will be dm-0 as the +first minor number for the device-mapper is used during early creation. + +Example +======= + +- Booting to a linear array made up of user-mode linux block devices: + + dm="lroot none 0, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" \ + root=/dev/dm-0 + +Will boot to a rw dm-linear target of 8192 sectors split across two +block devices identified by their major:minor numbers. After boot, udev +will rename this target to /dev/mapper/lroot (depending on the rules). +No uuid was assigned. diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 742f69d18fc8..b02f027cba72 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -56,6 +56,7 @@ parameter is applicable: BLACKFIN Blackfin architecture is enabled. CLK Common clock infrastructure is enabled. CMA Contiguous Memory Area support is enabled. + DM Device mapper support is enabled. DRM Direct Rendering Management support is enabled. DYNAMIC_DEBUG Build in debug messages and enable them at runtime EDD BIOS Enhanced Disk Drive Services (EDD) is enabled @@ -915,6 +916,11 @@ bytes respectively. 
Such letter suffixes can also be entirely omitted. dis_ucode_ldr [X86] Disable the microcode loader. + dm= [DM] Allows early creation of a device-mapper device. + See Documentation/device-mapper/boot.txt. + + dmasound= [HW,OSS] Sound subsystem buff + dma_debug=off If the kernel is compiled with DMA_API_DEBUG support, this option disables the debugging code at boot. diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 80a439543259..bc5e9a5b1f30 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1923,6 +1923,45 @@ void dm_interface_exit(void) dm_hash_exit(); } + +/** + * dm_ioctl_export - Permanently export a mapped device via the ioctl interface + * @md: Pointer to mapped_device + * @name: Buffer (size DM_NAME_LEN) for name + * @uuid: Buffer (size DM_UUID_LEN) for uuid or NULL if not desired + */ +int dm_ioctl_export(struct mapped_device *md, const char *name, + const char *uuid) +{ + int r = 0; + struct hash_cell *hc; + + if (!md) { + r = -ENXIO; + goto out; + } + + /* The name and uuid can only be set once. */ + mutex_lock(&dm_hash_cells_mutex); + hc = dm_get_mdptr(md); + mutex_unlock(&dm_hash_cells_mutex); + if (hc) { + DMERR("%s: already exported", dm_device_name(md)); + r = -ENXIO; + goto out; + } + + r = dm_hash_insert(name, uuid, md); + if (r) { + DMERR("%s: could not bind to '%s'", dm_device_name(md), name); + goto out; + } + + /* Let udev know we've changed. */ + dm_kobject_uevent(md, KOBJ_CHANGE, dm_get_event_nr(md)); +out: + return r; +} /** * dm_copy_name_and_uuid - Copy mapped device name & uuid into supplied buffers * @md: Pointer to mapped_device diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 061152a43730..80ede7a70761 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index ec1c61c87d89..87afa0552398 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -380,6 +380,12 @@ void dm_put(struct mapped_device *md); void dm_set_mdptr(struct mapped_device *md, void *ptr); void *dm_get_mdptr(struct mapped_device *md); +/* + * Export the device via the ioctl interface (uses mdptr). + */ +int dm_ioctl_export(struct mapped_device *md, const char *name, + const char *uuid); + /* * A device can still be used while suspended, but I/O is deferred. 
*/ diff --git a/init/Makefile b/init/Makefile index 692b91f1c1d4..243f61de2cba 100644 --- a/init/Makefile +++ b/init/Makefile @@ -15,6 +15,7 @@ mounts-y := do_mounts.o mounts-$(CONFIG_BLK_DEV_RAM) += do_mounts_rd.o mounts-$(CONFIG_BLK_DEV_INITRD) += do_mounts_initrd.o mounts-$(CONFIG_BLK_DEV_MD) += do_mounts_md.o +mounts-$(CONFIG_BLK_DEV_DM) += do_mounts_dm.o # dependencies on generated files need to be listed explicitly $(obj)/version.o: include/generated/compile.h diff --git a/init/do_mounts.c b/init/do_mounts.c index dea5de95c2dd..1902a1c80831 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -566,6 +566,7 @@ void __init prepare_namespace(void) wait_for_device_probe(); md_run_setup(); + dm_run_setup(); if (saved_root_name[0]) { root_device_name = saved_root_name; diff --git a/init/do_mounts.h b/init/do_mounts.h index f5b978a9bb92..09d22862e8c3 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -74,3 +74,13 @@ void md_run_setup(void); static inline void md_run_setup(void) {} #endif + +#ifdef CONFIG_BLK_DEV_DM + +void dm_run_setup(void); + +#else + +static inline void dm_run_setup(void) {} + +#endif diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c new file mode 100644 index 000000000000..0fd3411533f3 --- /dev/null +++ b/init/do_mounts_dm.c @@ -0,0 +1,410 @@ +/* do_mounts_dm.c + * Copyright (C) 2010 The Chromium OS Authors + * All Rights Reserved. + * Based on do_mounts_md.c + * + * This file is released under the GPL. + */ +#include +#include +#include + +#include "do_mounts.h" + +#define DM_MAX_NAME 32 +#define DM_MAX_UUID 129 +#define DM_NO_UUID "none" + +#define DM_MSG_PREFIX "init" + +/* Separators used for parsing the dm= argument. */ +#define DM_FIELD_SEP ' ' +#define DM_LINE_SEP ',' + +/* + * When the device-mapper and any targets are compiled into the kernel + * (not a module), one target may be created and used as the root device at + * boot time with the parameters given with the boot line dm=... + * The code for that is here. + */ + +struct dm_setup_target { + sector_t begin; + sector_t length; + char *type; + char *params; + /* simple singly linked list */ + struct dm_setup_target *next; +}; + +static struct { + int minor; + int ro; + char name[DM_MAX_NAME]; + char uuid[DM_MAX_UUID]; + char *targets; + struct dm_setup_target *target; + int target_count; +} dm_setup_args __initdata; + +static __initdata int dm_early_setup; + +static size_t __init get_dm_option(char *str, char **next, char sep) +{ + size_t len = 0; + char *endp = NULL; + + if (!str) + return 0; + + endp = strchr(str, sep); + if (!endp) { /* act like strchrnul */ + len = strlen(str); + endp = str + len; + } else { + len = endp - str; + } + + if (endp == str) + return 0; + + if (!next) + return len; + + if (*endp == 0) { + /* Don't advance past the nul. 
*/ + *next = endp; + } else { + *next = endp + 1; + } + return len; +} + +static int __init dm_setup_args_init(void) +{ + dm_setup_args.minor = 0; + dm_setup_args.ro = 0; + dm_setup_args.target = NULL; + dm_setup_args.target_count = 0; + return 0; +} + +static int __init dm_setup_cleanup(void) +{ + struct dm_setup_target *target = dm_setup_args.target; + struct dm_setup_target *old_target = NULL; + while (target) { + kfree(target->type); + kfree(target->params); + old_target = target; + target = target->next; + kfree(old_target); + dm_setup_args.target_count--; + } + BUG_ON(dm_setup_args.target_count); + return 0; +} + +static char * __init dm_setup_parse_device_args(char *str) +{ + char *next = NULL; + size_t len = 0; + + /* Grab the logical name of the device to be exported to udev */ + len = get_dm_option(str, &next, DM_FIELD_SEP); + if (!len) { + DMERR("failed to parse device name"); + goto parse_fail; + } + len = min(len + 1, sizeof(dm_setup_args.name)); + strlcpy(dm_setup_args.name, str, len); /* includes nul */ + str = skip_spaces(next); + + /* Grab the UUID value or "none" */ + len = get_dm_option(str, &next, DM_FIELD_SEP); + if (!len) { + DMERR("failed to parse device uuid"); + goto parse_fail; + } + len = min(len + 1, sizeof(dm_setup_args.uuid)); + strlcpy(dm_setup_args.uuid, str, len); + str = skip_spaces(next); + + /* Determine if the table/device will be read only or read-write */ + if (!strncmp("ro,", str, 3)) { + dm_setup_args.ro = 1; + } else if (!strncmp("rw,", str, 3)) { + dm_setup_args.ro = 0; + } else { + DMERR("failed to parse table mode"); + goto parse_fail; + } + str = skip_spaces(str + 3); + + return str; + +parse_fail: + return NULL; +} + +static void __init dm_substitute_devices(char *str, size_t str_len) +{ + char *candidate = str; + char *candidate_end = str; + char old_char; + size_t len = 0; + dev_t dev; + + if (str_len < 3) + return; + + while (str && *str) { + candidate = strchr(str, '/'); + if (!candidate) + break; + + /* Avoid embedded slashes */ + if (candidate != str && *(candidate - 1) != DM_FIELD_SEP) { + str = strchr(candidate, DM_FIELD_SEP); + continue; + } + + len = get_dm_option(candidate, &candidate_end, DM_FIELD_SEP); + str = skip_spaces(candidate_end); + if (len < 3 || len > 37) /* name_to_dev_t max; maj:mix min */ + continue; + + /* Temporarily terminate with a nul */ + candidate_end--; + old_char = *candidate_end; + *candidate_end = '\0'; + + DMDEBUG("converting candidate device '%s' to dev_t", candidate); + /* Use the boot-time specific device naming */ + dev = name_to_dev_t(candidate); + *candidate_end = old_char; + + DMDEBUG(" -> %u", dev); + /* No suitable replacement found */ + if (!dev) + continue; + + /* Rewrite the /dev/path as a major:minor */ + len = snprintf(candidate, len, "%u:%u", MAJOR(dev), MINOR(dev)); + if (!len) { + DMERR("error substituting device major/minor."); + break; + } + candidate += len; + /* Pad out with spaces (fixing our nul) */ + while (candidate < candidate_end) + *(candidate++) = DM_FIELD_SEP; + } +} + +static int __init dm_setup_parse_targets(char *str) +{ + char *next = NULL; + size_t len = 0; + struct dm_setup_target **target = NULL; + + /* Targets are defined as per the table format but with a + * comma as a newline separator. 
*/ + target = &dm_setup_args.target; + while (str && *str) { + *target = kzalloc(sizeof(struct dm_setup_target), GFP_KERNEL); + if (!*target) { + DMERR("failed to allocate memory for target %d", + dm_setup_args.target_count); + goto parse_fail; + } + dm_setup_args.target_count++; + + (*target)->begin = simple_strtoull(str, &next, 10); + if (!next || *next != DM_FIELD_SEP) { + DMERR("failed to parse starting sector for target %d", + dm_setup_args.target_count - 1); + goto parse_fail; + } + str = skip_spaces(next + 1); + + (*target)->length = simple_strtoull(str, &next, 10); + if (!next || *next != DM_FIELD_SEP) { + DMERR("failed to parse length for target %d", + dm_setup_args.target_count - 1); + goto parse_fail; + } + str = skip_spaces(next + 1); + + len = get_dm_option(str, &next, DM_FIELD_SEP); + if (!len || + !((*target)->type = kstrndup(str, len, GFP_KERNEL))) { + DMERR("failed to parse type for target %d", + dm_setup_args.target_count - 1); + goto parse_fail; + } + str = skip_spaces(next); + + len = get_dm_option(str, &next, DM_LINE_SEP); + if (!len || + !((*target)->params = kstrndup(str, len, GFP_KERNEL))) { + DMERR("failed to parse params for target %d", + dm_setup_args.target_count - 1); + goto parse_fail; + } + str = skip_spaces(next); + + /* Before moving on, walk through the copied target and + * attempt to replace all /dev/xxx with the major:minor number. + * It may not be possible to resolve them traditionally at + * boot-time. */ + dm_substitute_devices((*target)->params, len); + + target = &((*target)->next); + } + DMDEBUG("parsed %d targets", dm_setup_args.target_count); + + return 0; + +parse_fail: + return 1; +} + +/* + * Parse the command-line parameters given our kernel, but do not + * actually try to invoke the DM device now; that is handled by + * dm_setup_drive after the low-level disk drivers have initialised. + * dm format is as follows: + * dm="name uuid fmode,[table line 1],[table line 2],..." + * May be used with root=/dev/dm-0 as it always uses the first dm minor. + */ + +static int __init dm_setup(char *str) +{ + dm_setup_args_init(); + + str = dm_setup_parse_device_args(str); + if (!str) { + DMDEBUG("str is NULL"); + goto parse_fail; + } + + /* Target parsing is delayed until we have dynamic memory */ + dm_setup_args.targets = str; + + printk(KERN_INFO "dm: will configure '%s' on dm-%d\n", + dm_setup_args.name, dm_setup_args.minor); + + dm_early_setup = 1; + return 1; + +parse_fail: + printk(KERN_WARNING "dm: Invalid arguments supplied to dm=.\n"); + return 0; +} + + +static void __init dm_setup_drive(void) +{ + struct mapped_device *md = NULL; + struct dm_table *table = NULL; + struct dm_setup_target *target; + char *uuid = dm_setup_args.uuid; + fmode_t fmode = FMODE_READ; + + /* Finish parsing the targets. */ + if (dm_setup_parse_targets(dm_setup_args.targets)) + goto parse_fail; + + if (dm_create(dm_setup_args.minor, &md)) { + DMDEBUG("failed to create the device"); + goto dm_create_fail; + } + DMDEBUG("created device '%s'", dm_device_name(md)); + + /* In addition to flagging the table below, the disk must be + * set explicitly ro/rw. 
*/ + set_disk_ro(dm_disk(md), dm_setup_args.ro); + + if (!dm_setup_args.ro) + fmode |= FMODE_WRITE; + if (dm_table_create(&table, fmode, dm_setup_args.target_count, md)) { + DMDEBUG("failed to create the table"); + goto dm_table_create_fail; + } + + target = dm_setup_args.target; + while (target) { + DMINFO("adding target '%llu %llu %s %s'", + (unsigned long long) target->begin, + (unsigned long long) target->length, target->type, + target->params); + if (dm_table_add_target(table, target->type, target->begin, + target->length, target->params)) { + DMDEBUG("failed to add the target to the table"); + goto add_target_fail; + } + target = target->next; + } + + if (dm_table_complete(table)) { + DMDEBUG("failed to complete the table"); + goto table_complete_fail; + } + + /* Suspend the device so that we can bind it to the table. */ + if (dm_suspend(md, 0)) { + DMDEBUG("failed to suspend the device pre-bind"); + goto suspend_fail; + } + + /* Bind the table to the device. This is the only way to associate + * md->map with the table and set the disk capacity directly. */ + if (dm_swap_table(md, table)) { /* should return NULL. */ + DMDEBUG("failed to bind the device to the table"); + goto table_bind_fail; + } + + /* Finally, resume and the device should be ready. */ + if (dm_resume(md)) { + DMDEBUG("failed to resume the device"); + goto resume_fail; + } + + /* Export the dm device via the ioctl interface */ + if (!strcmp(DM_NO_UUID, dm_setup_args.uuid)) + uuid = NULL; + if (dm_ioctl_export(md, dm_setup_args.name, uuid)) { + DMDEBUG("failed to export device with given name and uuid"); + goto export_fail; + } + printk(KERN_INFO "dm: dm-%d is ready\n", dm_setup_args.minor); + + dm_setup_cleanup(); + return; + +export_fail: +resume_fail: +table_bind_fail: +suspend_fail: +table_complete_fail: +add_target_fail: + dm_table_put(table); +dm_table_create_fail: + dm_put(md); +dm_create_fail: + dm_setup_cleanup(); +parse_fail: + printk(KERN_WARNING "dm: starting dm-%d (%s) failed\n", + dm_setup_args.minor, dm_setup_args.name); +} + +__setup("dm=", dm_setup); + +void __init dm_run_setup(void) +{ + if (!dm_early_setup) + return; + printk(KERN_INFO "dm: attempting early device configuration.\n"); + dm_setup_drive(); +} From 5a77db78398a7247ab6bb6ff9720bae19d1f9ff4 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 8 Feb 2016 16:47:41 -0800 Subject: [PATCH 228/607] ANDROID: dm: Rebase on top of 4.1 1. "dm: optimize use SRCU and RCU" removes the use of dm_table_put. 2. "dm: remove request-based logic from make_request_fn wrapper" necessitates calling dm_setup_md_queue or else the request_queue's make_request_fn pointer ends being unset. [ 7.711600] Internal error: Oops - bad mode: 0 [#1] PREEMPT SMP [ 7.717519] CPU: 1 PID: 1 Comm: swapper/0 Tainted: G W 4.1.15-02273-gb057d16-dirty #33 [ 7.726559] Hardware name: HiKey Development Board (DT) [ 7.731779] task: ffffffc005f8acc0 ti: ffffffc005f8c000 task.ti: ffffffc005f8c000 [ 7.739257] PC is at 0x0 [ 7.741787] LR is at generic_make_request+0x8c/0x108 .... 
[ 9.082931] Call trace: [ 9.085372] [< (null)>] (null) [ 9.090074] [] submit_bio+0x98/0x1e0 [ 9.095212] [] _submit_bh+0x120/0x1f0 [ 9.096165] cfg80211: Calling CRDA to update world regulatory domain [ 9.106781] [] __bread_gfp+0x94/0x114 [ 9.112004] [] ext4_fill_super+0x18c/0x2d64 [ 9.117750] [] mount_bdev+0x194/0x1c0 [ 9.122973] [] ext4_mount+0x14/0x1c [ 9.128021] [] mount_fs+0x3c/0x194 [ 9.132985] [] vfs_kern_mount+0x4c/0x134 [ 9.138467] [] do_mount+0x204/0xbbc [ 9.143514] [] SyS_mount+0x94/0xe8 [ 9.148479] [] mount_block_root+0x120/0x24c [ 9.154222] [] mount_root+0x110/0x12c [ 9.159443] [] prepare_namespace+0x170/0x1b8 [ 9.165273] [] kernel_init_freeable+0x23c/0x260 [ 9.171365] [] kernel_init+0x10/0x118 [ 9.176589] Code: bad PC value [ 9.179807] ---[ end trace 75e1bc52ba364d13 ]--- Bug: 27175947 Signed-off-by: Badhri Jagan Sridharan Change-Id: I952d86fd1475f0825f9be1386e3497b36127abd0 --- init/do_mounts_dm.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c index 0fd3411533f3..f521bc5ae248 100644 --- a/init/do_mounts_dm.c +++ b/init/do_mounts_dm.c @@ -10,6 +10,7 @@ #include #include "do_mounts.h" +#include "../drivers/md/dm.h" #define DM_MAX_NAME 32 #define DM_MAX_UUID 129 @@ -333,6 +334,7 @@ static void __init dm_setup_drive(void) goto dm_table_create_fail; } + dm_lock_md_type(md); target = dm_setup_args.target; while (target) { DMINFO("adding target '%llu %llu %s %s'", @@ -352,6 +354,17 @@ static void __init dm_setup_drive(void) goto table_complete_fail; } + if (dm_get_md_type(md) == DM_TYPE_NONE) { + dm_set_md_type(md, dm_table_get_type(table)); + if (dm_setup_md_queue(md)) { + DMWARN("unable to set up device queue for new table."); + goto setup_md_queue_fail; + } + } else if (dm_get_md_type(md) != dm_table_get_type(table)) { + DMWARN("can't change device type after initial table load."); + goto setup_md_queue_fail; + } + /* Suspend the device so that we can bind it to the table. */ if (dm_suspend(md, 0)) { DMDEBUG("failed to suspend the device pre-bind"); @@ -380,6 +393,7 @@ static void __init dm_setup_drive(void) } printk(KERN_INFO "dm: dm-%d is ready\n", dm_setup_args.minor); + dm_unlock_md_type(md); dm_setup_cleanup(); return; @@ -387,9 +401,10 @@ export_fail: resume_fail: table_bind_fail: suspend_fail: +setup_md_queue_fail: table_complete_fail: add_target_fail: - dm_table_put(table); + dm_unlock_md_type(md); dm_table_create_fail: dm_put(md); dm_create_fail: From 26cab56d9af49e78dd1b395454f484003cc85575 Mon Sep 17 00:00:00 2001 From: Jeremy Compostella Date: Mon, 2 May 2016 17:29:28 +0200 Subject: [PATCH 229/607] ANDROID: dm: fix dm_substitute_devices() When candidate is the last parameter, candidate_end points to the '\0' character and not the DM_FIELD_SEP character. In such a situation, we should not move the candidate_end pointer one character backward. 
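As an illustration (not part of the diff below; the device path is a made-up example), take a table whose last field is a device path such as ".... /dev/mmcblk0p43". Because get_dm_option() stops on the terminating nul when no separator follows, the old unconditional decrement placed the temporary nul on top of the path's final character, so name_to_dev_t() was handed a truncated name ("/dev/mmcblk0p4"). A sketch of the guarded decrement applied below:

	/* candidate_end is either one past DM_FIELD_SEP or on the final '\0' */
	if (*candidate_end)	/* stopped on a separator: step back onto it */
		candidate_end--;
	old_char = *candidate_end;
	*candidate_end = '\0';	/* harmless no-op when already on the nul */
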
Signed-off-by: Jeremy Compostella --- init/do_mounts_dm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c index f521bc5ae248..ecda58df9a19 100644 --- a/init/do_mounts_dm.c +++ b/init/do_mounts_dm.c @@ -176,7 +176,8 @@ static void __init dm_substitute_devices(char *str, size_t str_len) continue; /* Temporarily terminate with a nul */ - candidate_end--; + if (*candidate_end) + candidate_end--; old_char = *candidate_end; *candidate_end = '\0'; From 36d01a590d035804dc100e105facf4049cf6fe43 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 14 Dec 2015 20:09:39 -0800 Subject: [PATCH 230/607] ANDROID: dm: Add android verity target This device-mapper target is virtually a VERITY target. This target is setup by reading the metadata contents piggybacked to the actual data blocks in the block device. The signature of the metadata contents are verified against the key included in the system keyring. Upon success, the underlying verity target is setup. BUG: 27175947 Change-Id: I7e99644a0960ac8279f02c0158ed20999510ea97 Signed-off-by: Badhri Jagan Sridharan --- drivers/md/Kconfig | 16 + drivers/md/Makefile | 4 + drivers/md/dm-android-verity.c | 771 +++++++++++++++++++++++++++++++++ drivers/md/dm-android-verity.h | 92 ++++ drivers/md/dm-verity-target.c | 12 +- drivers/md/dm-verity.h | 12 + 6 files changed, 901 insertions(+), 6 deletions(-) create mode 100644 drivers/md/dm-android-verity.c create mode 100644 drivers/md/dm-android-verity.h diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index d8b0ab6f3753..96b419b544ed 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -500,4 +500,20 @@ config DM_LOG_WRITES If unsure, say N. +config DM_ANDROID_VERITY + bool "Android verity target support" + depends on DM_VERITY + depends on X509_CERTIFICATE_PARSER + depends on SYSTEM_TRUSTED_KEYRING + depends on PUBLIC_KEY_ALGO_RSA + depends on KEYS + depends on ASYMMETRIC_KEY_TYPE + depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE + ---help--- + This device-mapper target is virtually a VERITY target. This + target is setup by reading the metadata contents piggybacked + to the actual data blocks in the block device. The signature + of the metadata contents are verified against the key included + in the system keyring. Upon success, the underlying verity + target is setup. endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 62a65764e8e0..c8fb00d8cc36 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -68,3 +68,7 @@ endif ifeq ($(CONFIG_DM_VERITY_FEC),y) dm-verity-objs += dm-verity-fec.o endif + +ifeq ($(CONFIG_DM_ANDROID_VERITY),y) +dm-verity-objs += dm-android-verity.o +endif diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c new file mode 100644 index 000000000000..c77c9fa7a962 --- /dev/null +++ b/drivers/md/dm-android-verity.c @@ -0,0 +1,771 @@ +/* + * Copyright (C) 2015 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "dm-verity.h" +#include "dm-android-verity.h" + +static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH]; +static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH]; + +static int __init verified_boot_state_param(char *line) +{ + strlcpy(verifiedbootstate, line, sizeof(verifiedbootstate)); + return 1; +} + +__setup("androidboot.verifiedbootstate=", verified_boot_state_param); + +static int __init verity_mode_param(char *line) +{ + strlcpy(veritymode, line, sizeof(veritymode)); + return 1; +} + +__setup("androidboot.veritymode=", verity_mode_param); + +static int table_extract_mpi_array(struct public_key_signature *pks, + const void *data, size_t len) +{ + MPI mpi = mpi_read_raw_data(data, len); + + if (!mpi) { + DMERR("Error while allocating mpi array"); + return -ENOMEM; + } + + pks->mpi[0] = mpi; + pks->nr_mpi = 1; + return 0; +} + +static struct public_key_signature *table_make_digest( + enum pkey_hash_algo hash, + const void *table, + unsigned long table_len) +{ + struct public_key_signature *pks = NULL; + struct crypto_shash *tfm; + struct shash_desc *desc; + size_t digest_size, desc_size; + int ret; + + /* Allocate the hashing algorithm we're going to need and find out how + * big the hash operational data will be. + */ + tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); + if (IS_ERR(tfm)) + return ERR_CAST(tfm); + + desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); + digest_size = crypto_shash_digestsize(tfm); + + /* We allocate the hash operational data storage on the end of out + * context data and the digest output buffer on the end of that. 
+ */ + ret = -ENOMEM; + pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL); + if (!pks) + goto error; + + pks->pkey_hash_algo = hash; + pks->digest = (u8 *)pks + sizeof(*pks) + desc_size; + pks->digest_size = digest_size; + + desc = (struct shash_desc *)(pks + 1); + desc->tfm = tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + ret = crypto_shash_init(desc); + if (ret < 0) + goto error; + + ret = crypto_shash_finup(desc, table, table_len, pks->digest); + if (ret < 0) + goto error; + + crypto_free_shash(tfm); + return pks; + +error: + kfree(pks); + crypto_free_shash(tfm); + return ERR_PTR(ret); +} + +static int read_block_dev(struct bio_read *payload, struct block_device *bdev, + sector_t offset, int length) +{ + struct bio *bio; + int err = 0, i; + + payload->number_of_pages = DIV_ROUND_UP(length, PAGE_SIZE); + + bio = bio_alloc(GFP_KERNEL, payload->number_of_pages); + if (!bio) { + DMERR("Error while allocating bio"); + return -ENOMEM; + } + + bio->bi_bdev = bdev; + bio->bi_sector = offset; + + payload->page_io = kzalloc(sizeof(struct page *) * + payload->number_of_pages, GFP_KERNEL); + if (!payload->page_io) { + DMERR("page_io array alloc failed"); + err = -ENOMEM; + goto free_bio; + } + + for (i = 0; i < payload->number_of_pages; i++) { + payload->page_io[i] = alloc_page(GFP_KERNEL); + if (!payload->page_io[i]) { + DMERR("alloc_page failed"); + err = -ENOMEM; + goto free_pages; + } + if (!bio_add_page(bio, payload->page_io[i], PAGE_SIZE, 0)) { + DMERR("bio_add_page error"); + err = -EIO; + goto free_pages; + } + } + + if (!submit_bio_wait(READ, bio)) + /* success */ + goto free_bio; + DMERR("bio read failed"); + err = -EIO; + +free_pages: + for (i = 0; i < payload->number_of_pages; i++) + if (payload->page_io[i]) + __free_page(payload->page_io[i]); + kfree(payload->page_io); +free_bio: + bio_put(bio); + return err; +} + +static inline u64 fec_div_round_up(u64 x, u64 y) +{ + u64 remainder; + + return div64_u64_rem(x, y, &remainder) + + (remainder > 0 ? 1 : 0); +} + +static inline void populate_fec_metadata(struct fec_header *header, + struct fec_ecc_metadata *ecc) +{ + ecc->blocks = fec_div_round_up(le64_to_cpu(header->inp_size), + FEC_BLOCK_SIZE); + ecc->roots = le32_to_cpu(header->roots); + ecc->start = le64_to_cpu(header->inp_size); +} + +static inline int validate_fec_header(struct fec_header *header, u64 offset) +{ + /* move offset to make the sanity check work for backup header + * as well. */ + offset -= offset % FEC_BLOCK_SIZE; + if (le32_to_cpu(header->magic) != FEC_MAGIC || + le32_to_cpu(header->version) != FEC_VERSION || + le32_to_cpu(header->size) != sizeof(struct fec_header) || + le32_to_cpu(header->roots) == 0 || + le32_to_cpu(header->roots) >= FEC_RSM || + offset < le32_to_cpu(header->fec_size) || + offset - le32_to_cpu(header->fec_size) != + le64_to_cpu(header->inp_size)) + return -EINVAL; + + return 0; +} + +static int extract_fec_header(dev_t dev, struct fec_header *fec, + struct fec_ecc_metadata *ecc) +{ + u64 device_size; + struct bio_read payload; + int i, err = 0; + struct block_device *bdev; + + bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + + if (IS_ERR(bdev)) { + DMERR("bdev get error"); + return PTR_ERR(bdev); + } + + device_size = i_size_read(bdev->bd_inode); + + /* fec metadata size is a power of 2 and PAGE_SIZE + * is a power of 2 as well. 
+ */ + BUG_ON(FEC_BLOCK_SIZE > PAGE_SIZE); + /* 512 byte sector alignment */ + BUG_ON(((device_size - FEC_BLOCK_SIZE) % (1 << SECTOR_SHIFT)) != 0); + + err = read_block_dev(&payload, bdev, (device_size - + FEC_BLOCK_SIZE) / (1 << SECTOR_SHIFT), FEC_BLOCK_SIZE); + if (err) { + DMERR("Error while reading verity metadata"); + goto error; + } + + BUG_ON(sizeof(struct fec_header) > PAGE_SIZE); + memcpy(fec, page_address(payload.page_io[0]), + sizeof(*fec)); + + ecc->valid = true; + if (validate_fec_header(fec, device_size - FEC_BLOCK_SIZE)) { + /* Try the backup header */ + memcpy(fec, page_address(payload.page_io[0]) + FEC_BLOCK_SIZE + - sizeof(*fec) , + sizeof(*fec)); + if (validate_fec_header(fec, device_size - + sizeof(struct fec_header))) + ecc->valid = false; + } + + if (ecc->valid) + populate_fec_metadata(fec, ecc); + + for (i = 0; i < payload.number_of_pages; i++) + __free_page(payload.page_io[i]); + kfree(payload.page_io); + +error: + blkdev_put(bdev, FMODE_READ); + return err; +} +static void find_metadata_offset(struct fec_header *fec, + struct block_device *bdev, u64 *metadata_offset) +{ + u64 device_size; + + device_size = i_size_read(bdev->bd_inode); + + if (le32_to_cpu(fec->magic) == FEC_MAGIC) + *metadata_offset = le64_to_cpu(fec->inp_size) - + VERITY_METADATA_SIZE; + else + *metadata_offset = device_size - VERITY_METADATA_SIZE; +} + +static struct android_metadata *extract_metadata(dev_t dev, + struct fec_header *fec) +{ + struct block_device *bdev; + struct android_metadata_header *header; + struct android_metadata *uninitialized_var(metadata); + int i; + u32 table_length, copy_length, offset; + u64 metadata_offset; + struct bio_read payload; + int err = 0; + + bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + + if (IS_ERR(bdev)) { + DMERR("blkdev_get_by_dev failed"); + return ERR_CAST(bdev); + } + + find_metadata_offset(fec, bdev, &metadata_offset); + + /* Verity metadata size is a power of 2 and PAGE_SIZE + * is a power of 2 as well. + * PAGE_SIZE is also a multiple of 512 bytes. 
+ */ + if (VERITY_METADATA_SIZE > PAGE_SIZE) + BUG_ON(VERITY_METADATA_SIZE % PAGE_SIZE != 0); + /* 512 byte sector alignment */ + BUG_ON(metadata_offset % (1 << SECTOR_SHIFT) != 0); + + err = read_block_dev(&payload, bdev, metadata_offset / + (1 << SECTOR_SHIFT), VERITY_METADATA_SIZE); + if (err) { + DMERR("Error while reading verity metadata"); + metadata = ERR_PTR(err); + goto blkdev_release; + } + + header = kzalloc(sizeof(*header), GFP_KERNEL); + if (!header) { + DMERR("kzalloc failed for header"); + err = -ENOMEM; + goto free_payload; + } + + memcpy(header, page_address(payload.page_io[0]), + sizeof(*header)); + + DMINFO("bio magic_number:%u protocol_version:%d table_length:%u", + le32_to_cpu(header->magic_number), + le32_to_cpu(header->protocol_version), + le32_to_cpu(header->table_length)); + + metadata = kzalloc(sizeof(*metadata), GFP_KERNEL); + if (!metadata) { + DMERR("kzalloc for metadata failed"); + err = -ENOMEM; + goto free_header; + } + + metadata->header = header; + table_length = le32_to_cpu(header->table_length); + + if (table_length == 0 || + table_length > (VERITY_METADATA_SIZE - + sizeof(struct android_metadata_header))) + goto free_metadata; + + metadata->verity_table = kzalloc(table_length + 1, GFP_KERNEL); + + if (!metadata->verity_table) { + DMERR("kzalloc verity_table failed"); + err = -ENOMEM; + goto free_metadata; + } + + if (sizeof(struct android_metadata_header) + + table_length <= PAGE_SIZE) { + memcpy(metadata->verity_table, page_address(payload.page_io[0]) + + sizeof(struct android_metadata_header), + table_length); + } else { + copy_length = PAGE_SIZE - + sizeof(struct android_metadata_header); + memcpy(metadata->verity_table, page_address(payload.page_io[0]) + + sizeof(struct android_metadata_header), + copy_length); + table_length -= copy_length; + offset = copy_length; + i = 1; + while (table_length != 0) { + if (table_length > PAGE_SIZE) { + memcpy(metadata->verity_table + offset, + page_address(payload.page_io[i]), + PAGE_SIZE); + offset += PAGE_SIZE; + table_length -= PAGE_SIZE; + } else { + memcpy(metadata->verity_table + offset, + page_address(payload.page_io[i]), + table_length); + table_length = 0; + } + i++; + } + } + metadata->verity_table[table_length] = '\0'; + + goto free_payload; + +free_metadata: + kfree(metadata); +free_header: + kfree(header); + metadata = ERR_PTR(err); +free_payload: + for (i = 0; i < payload.number_of_pages; i++) + if (payload.page_io[i]) + __free_page(payload.page_io[i]); + kfree(payload.page_io); + + DMINFO("verity_table: %s", metadata->verity_table); +blkdev_release: + blkdev_put(bdev, FMODE_READ); + return metadata; +} + +/* helper functions to extract properties from dts */ +const char *find_dt_value(const char *name) +{ + struct device_node *firmware; + const char *value; + + firmware = of_find_node_by_path("/firmware/android"); + if (!firmware) + return NULL; + value = of_get_property(firmware, name, NULL); + of_node_put(firmware); + + return value; +} + +static bool is_unlocked(void) +{ + static const char unlocked[] = "orange"; + static const char verified_boot_prop[] = "verifiedbootstate"; + const char *value; + + value = find_dt_value(verified_boot_prop); + if (!value) + value = verifiedbootstate; + + return !strncmp(value, unlocked, sizeof(unlocked) - 1); +} + +static int verity_mode(void) +{ + static const char enforcing[] = "enforcing"; + static const char verified_mode_prop[] = "veritymode"; + const char *value; + + value = find_dt_value(verified_mode_prop); + if (!value) + value = veritymode; + if 
(!strncmp(value, enforcing, sizeof(enforcing) - 1)) + return DM_VERITY_MODE_RESTART; + + return DM_VERITY_MODE_EIO; +} + +static int verify_header(struct android_metadata_header *header) +{ + int retval = -EINVAL; + + if (is_unlocked() && le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_DISABLE) { + retval = VERITY_STATE_DISABLE; + return retval; + } + + if (!(le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_NUMBER) || + (le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_DISABLE)) { + DMERR("Incorrect magic number"); + return retval; + } + + if (le32_to_cpu(header->protocol_version) != + VERITY_METADATA_VERSION) { + DMERR("Unsupported version %u", + le32_to_cpu(header->protocol_version)); + return retval; + } + + return 0; +} + +static int verify_verity_signature(char *key_id, + struct android_metadata *metadata) +{ + key_ref_t key_ref; + struct key *key; + struct public_key_signature *pks = NULL; + int retval = -EINVAL; + + key_ref = keyring_search(make_key_ref(system_trusted_keyring, 1), + &key_type_asymmetric, key_id); + + if (IS_ERR(key_ref)) { + DMERR("keyring: key not found"); + return -ENOKEY; + } + + key = key_ref_to_ptr(key_ref); + + pks = table_make_digest(PKEY_HASH_SHA256, + (const void *)metadata->verity_table, + le32_to_cpu(metadata->header->table_length)); + + if (IS_ERR(pks)) { + DMERR("hashing failed"); + goto error; + } + + retval = table_extract_mpi_array(pks, &metadata->header->signature[0], + RSANUMBYTES); + if (retval < 0) { + DMERR("Error extracting mpi %d", retval); + goto error; + } + + retval = verify_signature(key, pks); + mpi_free(pks->rsa.s); +error: + kfree(pks); + key_put(key); + + return retval; +} + +static void handle_error(void) +{ + int mode = verity_mode(); + if (mode == DM_VERITY_MODE_RESTART) { + DMERR("triggering restart"); + kernel_restart("dm-verity device corrupted"); + } else { + DMERR("Mounting verity root failed"); + } +} + +static inline bool test_mult_overflow(sector_t a, u32 b) +{ + sector_t r = (sector_t)~0ULL; + + sector_div(r, b); + return a > r; +} + +/* + * Target parameters: + * Key id of the public key in the system keyring. + * Verity metadata's signature would be verified against + * this. If the key id contains spaces, replace them + * with '#'. + * The block device for which dm-verity is being setup. 
+ */ +static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + dev_t uninitialized_var(dev); + struct android_metadata *uninitialized_var(metadata); + int err = 0, i, mode; + char *key_id, *table_ptr, dummy, + *verity_table_args[VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS]; + /* One for specifying number of opt args and one for mode */ + sector_t data_sectors; + u32 data_block_size; + unsigned int major, minor, + no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS; + struct fec_header fec; + struct fec_ecc_metadata uninitialized_var(ecc); + char buf[FEC_ARG_LENGTH], *buf_ptr; + unsigned long long tmpll; + + if (argc != 2) { + DMERR("Incorrect number of arguments"); + handle_error(); + return -EINVAL; + } + + /* should come as one of the arguments for the verity target */ + key_id = argv[0]; + strreplace(argv[0], '#', ' '); + + if (sscanf(argv[1], "%u:%u%c", &major, &minor, &dummy) == 2) { + dev = MKDEV(major, minor); + if (MAJOR(dev) != major || MINOR(dev) != minor) { + DMERR("Incorrect bdev major minor number"); + handle_error(); + return -EOVERFLOW; + } + } + + DMINFO("key:%s dev:%s", argv[0], argv[1]); + + if (extract_fec_header(dev, &fec, &ecc)) { + DMERR("Error while extracting fec header"); + handle_error(); + return -EINVAL; + } + + metadata = extract_metadata(dev, &fec); + + if (IS_ERR(metadata)) { + DMERR("Error while extracting metadata"); + handle_error(); + return -EINVAL; + } + + err = verify_header(metadata->header); + + if (err == VERITY_STATE_DISABLE) { + DMERR("Mounting root with verity disabled"); + return -EINVAL; + } else if (err) { + DMERR("Verity header handle error"); + handle_error(); + goto free_metadata; + } + + err = verify_verity_signature(key_id, metadata); + + if (err) { + DMERR("Signature verification failed"); + handle_error(); + goto free_metadata; + } else + DMINFO("Signature verification success"); + + table_ptr = metadata->verity_table; + + for (i = 0; i < VERITY_TABLE_ARGS; i++) { + verity_table_args[i] = strsep(&table_ptr, " "); + if (verity_table_args[i] == NULL) + break; + } + + if (i != VERITY_TABLE_ARGS) { + DMERR("Verity table not in the expected format"); + err = -EINVAL; + handle_error(); + goto free_metadata; + } + + if (sscanf(verity_table_args[5], "%llu%c", &tmpll, &dummy) + != 1) { + DMERR("Verity table not in the expected format"); + handle_error(); + err = -EINVAL; + goto free_metadata; + } + + if (tmpll > ULONG_MAX) { + DMERR(" too large. Forgot to turn on CONFIG_LBDAF?"); + handle_error(); + err = -EINVAL; + goto free_metadata; + } + + data_sectors = tmpll; + + if (sscanf(verity_table_args[3], "%u%c", &data_block_size, &dummy) + != 1) { + DMERR("Verity table not in the expected format"); + handle_error(); + err = -EINVAL; + goto free_metadata; + } + + if (test_mult_overflow(data_sectors, data_block_size >> + SECTOR_SHIFT)) { + DMERR("data_sectors too large"); + handle_error(); + err = -EOVERFLOW; + goto free_metadata; + } + + data_sectors *= data_block_size >> SECTOR_SHIFT; + DMINFO("Data sectors %llu", (unsigned long long)data_sectors); + + /* update target length */ + ti->len = data_sectors; + + /*substitute data_dev and hash_dev*/ + verity_table_args[1] = argv[1]; + verity_table_args[2] = argv[1]; + + mode = verity_mode(); + + if (ecc.valid && IS_BUILTIN(CONFIG_DM_VERITY_FEC)) { + if (mode) { + err = snprintf(buf, FEC_ARG_LENGTH, + "%u %s " VERITY_TABLE_OPT_FEC_FORMAT, + 1 + VERITY_TABLE_OPT_FEC_ARGS, + mode == DM_VERITY_MODE_RESTART ? 
+ VERITY_TABLE_OPT_RESTART : VERITY_TABLE_OPT_LOGGING, + argv[1], ecc.start / FEC_BLOCK_SIZE, ecc.blocks, + ecc.roots); + } else { + err = snprintf(buf, FEC_ARG_LENGTH, + "%u " VERITY_TABLE_OPT_FEC_FORMAT, + VERITY_TABLE_OPT_FEC_ARGS, argv[1], + ecc.start / FEC_BLOCK_SIZE, ecc.blocks, ecc.roots); + } + } else if (mode) { + err = snprintf(buf, FEC_ARG_LENGTH, + "2 " VERITY_TABLE_OPT_IGNZERO " %s", + mode == DM_VERITY_MODE_RESTART ? + VERITY_TABLE_OPT_RESTART : VERITY_TABLE_OPT_LOGGING); + } else { + err = snprintf(buf, FEC_ARG_LENGTH, "1 %s", + "ignore_zero_blocks"); + } + + if (err < 0 || err >= FEC_ARG_LENGTH) + goto free_metadata; + + buf_ptr = buf; + + for (i = VERITY_TABLE_ARGS; i < (VERITY_TABLE_ARGS + + VERITY_TABLE_OPT_FEC_ARGS + 2); i++) { + verity_table_args[i] = strsep(&buf_ptr, " "); + if (verity_table_args[i] == NULL) { + no_of_args = i; + break; + } + } + + err = verity_ctr(ti, no_of_args, verity_table_args); + +free_metadata: + kfree(metadata->header); + kfree(metadata->verity_table); + kfree(metadata); + return err; +} + +static struct target_type android_verity_target = { + .name = "android-verity", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = android_verity_ctr, + .dtr = verity_dtr, + .map = verity_map, + .status = verity_status, + .ioctl = verity_ioctl, + .merge = verity_merge, + .iterate_devices = verity_iterate_devices, + .io_hints = verity_io_hints, +}; + +static int __init dm_android_verity_init(void) +{ + int r; + + r = dm_register_target(&android_verity_target); + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void __exit dm_android_verity_exit(void) +{ + dm_unregister_target(&android_verity_target); +} + +module_init(dm_android_verity_init); +module_exit(dm_android_verity_exit); diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h new file mode 100644 index 000000000000..11477ffd2243 --- /dev/null +++ b/drivers/md/dm-android-verity.h @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2015 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef DM_ANDROID_VERITY_H +#define DM_ANDROID_VERITY_H + +#include + +#define RSANUMBYTES 256 +#define VERITY_METADATA_MAGIC_NUMBER 0xb001b001 +#define VERITY_METADATA_MAGIC_DISABLE 0x46464f56 +#define VERITY_METADATA_VERSION 0 +#define VERITY_STATE_DISABLE 1 +#define DATA_BLOCK_SIZE (4 * 1024) +#define VERITY_METADATA_SIZE (8 * DATA_BLOCK_SIZE) +#define VERITY_TABLE_ARGS 10 +#define VERITY_COMMANDLINE_PARAM_LENGTH 20 + +#define FEC_MAGIC 0xFECFECFE +#define FEC_BLOCK_SIZE (4 * 1024) +#define FEC_VERSION 0 +#define FEC_RSM 255 +#define FEC_ARG_LENGTH 300 + +#define VERITY_TABLE_OPT_RESTART "restart_on_corruption" +#define VERITY_TABLE_OPT_LOGGING "ignore_corruption" +#define VERITY_TABLE_OPT_IGNZERO "ignore_zero_blocks" + +#define VERITY_TABLE_OPT_FEC_FORMAT \ + "use_fec_from_device %s fec_start %llu fec_blocks %llu fec_roots %u ignore_zero_blocks" +#define VERITY_TABLE_OPT_FEC_ARGS 9 + +#define VERITY_DEBUG 0 + +#define DM_MSG_PREFIX "android-verity" +/* + * There can be two formats. 
+ * if fec is present + * + * if fec is not present + * + */ +/* TODO: rearrange structure to reduce memory holes + * depends on userspace change. + */ +struct fec_header { + __le32 magic; + __le32 version; + __le32 size; + __le32 roots; + __le32 fec_size; + __le64 inp_size; + u8 hash[SHA256_DIGEST_SIZE]; +}; + +struct android_metadata_header { + __le32 magic_number; + __le32 protocol_version; + char signature[RSANUMBYTES]; + __le32 table_length; +}; + +struct android_metadata { + struct android_metadata_header *header; + char *verity_table; +}; + +struct fec_ecc_metadata { + bool valid; + u32 roots; + u64 blocks; + u64 rounds; + u64 start; +}; + +struct bio_read { + struct page **page_io; + int number_of_pages; +}; + +#endif /* DM_ANDROID_VERITY_H */ diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 5c5d30cb6ec5..65835f15a116 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -551,7 +551,7 @@ static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io) * Bio map function. It allocates dm_verity_io structure and bio vector and * fills them. Then it issues prefetches and the I/O. */ -static int verity_map(struct dm_target *ti, struct bio *bio) +int verity_map(struct dm_target *ti, struct bio *bio) { struct dm_verity *v = ti->private; struct dm_verity_io *io; @@ -596,7 +596,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) /* * Status: V (valid) or C (corruption found) */ -static void verity_status(struct dm_target *ti, status_type_t type, +void verity_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { struct dm_verity *v = ti->private; @@ -669,7 +669,7 @@ static int verity_prepare_ioctl(struct dm_target *ti, return 0; } -static int verity_iterate_devices(struct dm_target *ti, +int verity_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { struct dm_verity *v = ti->private; @@ -677,7 +677,7 @@ static int verity_iterate_devices(struct dm_target *ti, return fn(ti, v->data_dev, v->data_start, ti->len, data); } -static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) +void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct dm_verity *v = ti->private; @@ -690,7 +690,7 @@ static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, limits->logical_block_size); } -static void verity_dtr(struct dm_target *ti) +void verity_dtr(struct dm_target *ti) { struct dm_verity *v = ti->private; @@ -817,7 +817,7 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) * * Hex string or "-" if no salt. 
*/ -static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) +int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) { struct dm_verity *v; struct dm_arg_set as; diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index fb419f422d73..d9cf5e4939eb 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -126,4 +126,16 @@ extern int verity_hash(struct dm_verity *v, struct shash_desc *desc, extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, sector_t block, u8 *digest, bool *is_zero); +extern void verity_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen); +extern int verity_ioctl(struct dm_target *ti, unsigned cmd, + unsigned long arg); +extern int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size); +extern int verity_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data); +extern void verity_io_hints(struct dm_target *ti, struct queue_limits *limits); +extern void verity_dtr(struct dm_target *ti); +extern int verity_ctr(struct dm_target *ti, unsigned argc, char **argv); +extern int verity_map(struct dm_target *ti, struct bio *bio); #endif /* DM_VERITY_H */ From f42f971b7b59257a8610608b7f45a046c7c57b5d Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 8 Feb 2016 16:28:43 -0800 Subject: [PATCH 231/607] ANDROID: dm-android-verity: Rebase on top of 4.1 Following CLs in upstream causes minor changes to dm-android-verity target. 1. keys: change asymmetric keys to use common hash definitions 2. block: Abstract out bvec iterator Rebase dm-android-verity on top of these changes. Bug: 27175947 Signed-off-by: Badhri Jagan Sridharan Change-Id: Icfdc3e7b3ead5de335a059cade1aca70414db415 --- drivers/md/dm-android-verity.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index c77c9fa7a962..aeb5045830d9 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -75,7 +75,7 @@ static int table_extract_mpi_array(struct public_key_signature *pks, } static struct public_key_signature *table_make_digest( - enum pkey_hash_algo hash, + enum hash_algo hash, const void *table, unsigned long table_len) { @@ -88,7 +88,7 @@ static struct public_key_signature *table_make_digest( /* Allocate the hashing algorithm we're going to need and find out how * big the hash operational data will be. 
*/ - tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); + tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0); if (IS_ERR(tfm)) return ERR_CAST(tfm); @@ -143,7 +143,7 @@ static int read_block_dev(struct bio_read *payload, struct block_device *bdev, } bio->bi_bdev = bdev; - bio->bi_sector = offset; + bio->bi_iter.bi_sector = offset; payload->page_io = kzalloc(sizeof(struct page *) * payload->number_of_pages, GFP_KERNEL); @@ -505,7 +505,7 @@ static int verify_verity_signature(char *key_id, key = key_ref_to_ptr(key_ref); - pks = table_make_digest(PKEY_HASH_SHA256, + pks = table_make_digest(HASH_ALGO_SHA256, (const void *)metadata->verity_table, le32_to_cpu(metadata->header->table_length)); @@ -569,7 +569,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) u32 data_block_size; unsigned int major, minor, no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS; - struct fec_header fec; + struct fec_header uninitialized_var(fec); struct fec_ecc_metadata uninitialized_var(ecc); char buf[FEC_ARG_LENGTH], *buf_ptr; unsigned long long tmpll; From 67ce481897e0656404fe0af42d563101a5629bbe Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 21 Mar 2016 10:55:23 -0700 Subject: [PATCH 232/607] ANDROID: dm: Mounting root as linear device when verity disabled This CL makes android-verity target to be added as linear dm device if when bootloader is unlocked and verity is disabled. Bug: 27175947 Change-Id: Ic41ca4b8908fb2777263799cf3a3e25934d70f18 Signed-off-by: Badhri Jagan Sridharan --- drivers/md/dm-android-verity.c | 131 +++++++++++++++++++++++++++------ drivers/md/dm-android-verity.h | 3 + drivers/md/dm-linear.c | 2 +- 3 files changed, 113 insertions(+), 23 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index aeb5045830d9..db4ddf789a39 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -13,6 +13,7 @@ */ #include +#include #include #include #include @@ -43,6 +44,25 @@ static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH]; static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH]; +static bool target_added; +static bool verity_enabled = true; +struct dentry *debug_dir; +static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv); + +static struct target_type android_verity_target = { + .name = "android-verity", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = android_verity_ctr, + .dtr = verity_dtr, + .map = verity_map, + .status = verity_status, + .ioctl = verity_ioctl, + .merge = verity_merge, + .iterate_devices = verity_iterate_devices, + .io_hints = verity_io_hints, +}; + static int __init verified_boot_state_param(char *line) { strlcpy(verifiedbootstate, line, sizeof(verifiedbootstate)); @@ -549,6 +569,35 @@ static inline bool test_mult_overflow(sector_t a, u32 b) return a > r; } +static int add_as_linear_device(struct dm_target *ti, char *dev) +{ + /*Move to linear mapping defines*/ + char *linear_table_args[DM_LINEAR_ARGS]; + char offset[] = "0"; + int err = 0; + + linear_table_args[0] = dev; + linear_table_args[1] = offset; + + android_verity_target.dtr = linear_target.dtr, + android_verity_target.map = linear_target.map, + android_verity_target.status = linear_target.status, + android_verity_target.ioctl = linear_target.ioctl, + android_verity_target.merge = linear_target.merge, + android_verity_target.iterate_devices = linear_target.iterate_devices, + android_verity_target.io_hints = NULL; + + err = linear_target.ctr(ti, 
DM_LINEAR_ARGS, linear_table_args); + + if (!err) { + DMINFO("Added android-verity as a linear target"); + target_added = true; + } else + DMERR("Failed to add android-verity as linear target"); + + return err; +} + /* * Target parameters: * Key id of the public key in the system keyring. @@ -613,21 +662,27 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) if (err == VERITY_STATE_DISABLE) { DMERR("Mounting root with verity disabled"); - return -EINVAL; + verity_enabled = false; + /* we would still have to parse the args to figure out + * the data blocks size. Or may be could map the entire + * partition similar to mounting the device. + */ } else if (err) { DMERR("Verity header handle error"); handle_error(); goto free_metadata; } - err = verify_verity_signature(key_id, metadata); + if (!verity_enabled) { + err = verify_verity_signature(key_id, metadata); - if (err) { - DMERR("Signature verification failed"); - handle_error(); - goto free_metadata; - } else - DMINFO("Signature verification success"); + if (err) { + DMERR("Signature verification failed"); + handle_error(); + goto free_metadata; + } else + DMINFO("Signature verification success"); + } table_ptr = metadata->verity_table; @@ -683,6 +738,12 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) /* update target length */ ti->len = data_sectors; + /* Setup linear target and free */ + if (!verity_enabled) { + err = add_as_linear_device(ti, argv[1]); + goto free_metadata; + } + /*substitute data_dev and hash_dev*/ verity_table_args[1] = argv[1]; verity_table_args[2] = argv[1]; @@ -730,6 +791,13 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) err = verity_ctr(ti, no_of_args, verity_table_args); + if (err) + DMERR("android-verity failed to mount as verity target"); + else { + target_added = true; + DMINFO("android-verity mounted as verity target"); + } + free_metadata: kfree(metadata->header); kfree(metadata->verity_table); @@ -737,33 +805,52 @@ free_metadata: return err; } -static struct target_type android_verity_target = { - .name = "android-verity", - .version = {1, 0, 0}, - .module = THIS_MODULE, - .ctr = android_verity_ctr, - .dtr = verity_dtr, - .map = verity_map, - .status = verity_status, - .ioctl = verity_ioctl, - .merge = verity_merge, - .iterate_devices = verity_iterate_devices, - .io_hints = verity_io_hints, -}; - static int __init dm_android_verity_init(void) { int r; + struct dentry *file; r = dm_register_target(&android_verity_target); if (r < 0) DMERR("register failed %d", r); + /* Tracks the status of the last added target */ + debug_dir = debugfs_create_dir("android_verity", NULL); + + if (IS_ERR_OR_NULL(debug_dir)) { + DMERR("Cannot create android_verity debugfs directory: %ld", + PTR_ERR(debug_dir)); + goto end; + } + + file = debugfs_create_bool("target_added", S_IRUGO, debug_dir, + (u32 *)&target_added); + + if (IS_ERR_OR_NULL(file)) { + DMERR("Cannot create android_verity debugfs directory: %ld", + PTR_ERR(debug_dir)); + debugfs_remove_recursive(debug_dir); + goto end; + } + + file = debugfs_create_bool("verity_enabled", S_IRUGO, debug_dir, + (u32 *)&verity_enabled); + + if (IS_ERR_OR_NULL(file)) { + DMERR("Cannot create android_verity debugfs directory: %ld", + PTR_ERR(debug_dir)); + debugfs_remove_recursive(debug_dir); + } + +end: return r; } static void __exit dm_android_verity_exit(void) { + if (!IS_ERR_OR_NULL(debug_dir)) + debugfs_remove_recursive(debug_dir); + dm_unregister_target(&android_verity_target); } 
diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index 11477ffd2243..2cf7de1b7910 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -44,6 +44,8 @@ #define VERITY_DEBUG 0 #define DM_MSG_PREFIX "android-verity" + +#define DM_LINEAR_ARGS 2 /* * There can be two formats. * if fec is present @@ -89,4 +91,5 @@ struct bio_read { int number_of_pages; }; +extern struct target_type linear_target; #endif /* DM_ANDROID_VERITY_H */ diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 05c35aacb3aa..d8bf876e7bed 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -141,7 +141,7 @@ static int linear_iterate_devices(struct dm_target *ti, return fn(ti, lc->dev, lc->start, ti->len, data); } -static struct target_type linear_target = { +struct target_type linear_target = { .name = "linear", .version = {1, 2, 1}, .module = THIS_MODULE, From 438e162621ac7e80ec9c81bea3f0eaec0b801ff9 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 28 Mar 2016 14:41:21 -0700 Subject: [PATCH 233/607] ANDROID: dm: Minor cleanup Compacts the linear device arguments removing the unnecessary variables. Bug: 27175947 Change-Id: I157170eebe3c0f89a68ae05870a1060f188d0da0 Signed-off-by: Badhri Jagan Sridharan --- drivers/md/dm-android-verity.c | 7 ++----- drivers/md/dm-android-verity.h | 2 ++ 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index db4ddf789a39..f6ddbee5e2d3 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -572,13 +572,10 @@ static inline bool test_mult_overflow(sector_t a, u32 b) static int add_as_linear_device(struct dm_target *ti, char *dev) { /*Move to linear mapping defines*/ - char *linear_table_args[DM_LINEAR_ARGS]; - char offset[] = "0"; + char *linear_table_args[DM_LINEAR_ARGS] = {dev, + DM_LINEAR_TARGET_OFFSET}; int err = 0; - linear_table_args[0] = dev; - linear_table_args[1] = offset; - android_verity_target.dtr = linear_target.dtr, android_verity_target.map = linear_target.map, android_verity_target.status = linear_target.status, diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index 2cf7de1b7910..fe53863c664b 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -46,6 +46,8 @@ #define DM_MSG_PREFIX "android-verity" #define DM_LINEAR_ARGS 2 +#define DM_LINEAR_TARGET_OFFSET "0" + /* * There can be two formats. 
* if fec is present From 86fd82659f745b5fea48090fe320cb817e199805 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 5 Apr 2016 11:18:16 -0700 Subject: [PATCH 234/607] ANDROID: dm: rename dm-linear methods for dm-android-verity This keeps linear_target as static variable and just exposes the linear target methods for android-verity Cherry-picked: https://android-review.googlesource.com/#/c/212858 Change-Id: I4a377e417b00afd9ecccdb3e605fea31a7df112e Signed-off-by: Badhri Jagan Sridharan (cherry picked from commit a6d1b091f40b25d97849487e29ec097bc5f568dd) --- drivers/md/dm-android-verity.c | 14 +++++++------- drivers/md/dm-android-verity.h | 12 ++++++++++++ drivers/md/dm-linear.c | 26 +++++++++++++------------- 3 files changed, 32 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index f6ddbee5e2d3..b7e059595f75 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -576,15 +576,15 @@ static int add_as_linear_device(struct dm_target *ti, char *dev) DM_LINEAR_TARGET_OFFSET}; int err = 0; - android_verity_target.dtr = linear_target.dtr, - android_verity_target.map = linear_target.map, - android_verity_target.status = linear_target.status, - android_verity_target.ioctl = linear_target.ioctl, - android_verity_target.merge = linear_target.merge, - android_verity_target.iterate_devices = linear_target.iterate_devices, + android_verity_target.dtr = dm_linear_dtr, + android_verity_target.map = dm_linear_map, + android_verity_target.status = dm_linear_status, + android_verity_target.ioctl = dm_linear_ioctl, + android_verity_target.merge = dm_linear_merge, + android_verity_target.iterate_devices = dm_linear_iterate_devices, android_verity_target.io_hints = NULL; - err = linear_target.ctr(ti, DM_LINEAR_ARGS, linear_table_args); + err = dm_linear_ctr(ti, DM_LINEAR_ARGS, linear_table_args); if (!err) { DMINFO("Added android-verity as a linear target"); diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index fe53863c664b..efb796524896 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -94,4 +94,16 @@ struct bio_read { }; extern struct target_type linear_target; + +extern void dm_linear_dtr(struct dm_target *ti); +extern int dm_linear_map(struct dm_target *ti, struct bio *bio); +extern void dm_linear_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen); +extern int dm_linear_ioctl(struct dm_target *ti, unsigned int cmd, + unsigned long arg); +extern int dm_linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size); +extern int dm_linear_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data); +extern int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv); #endif /* DM_ANDROID_VERITY_H */ diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index d8bf876e7bed..74caca2888a6 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -25,7 +25,7 @@ struct linear_c { /* * Construct a linear mapping: */ -static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) +int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct linear_c *lc; unsigned long long tmp; @@ -67,7 +67,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) return ret; } -static void linear_dtr(struct dm_target *ti) +void dm_linear_dtr(struct dm_target *ti) { struct 
linear_c *lc = (struct linear_c *) ti->private; @@ -92,14 +92,14 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) linear_map_sector(ti, bio->bi_iter.bi_sector); } -static int linear_map(struct dm_target *ti, struct bio *bio) +int dm_linear_map(struct dm_target *ti, struct bio *bio) { linear_map_bio(ti, bio); return DM_MAPIO_REMAPPED; } -static void linear_status(struct dm_target *ti, status_type_t type, +void dm_linear_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { struct linear_c *lc = (struct linear_c *) ti->private; @@ -116,7 +116,7 @@ static void linear_status(struct dm_target *ti, status_type_t type, } } -static int linear_prepare_ioctl(struct dm_target *ti, +static int dm_linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) { struct linear_c *lc = (struct linear_c *) ti->private; @@ -133,7 +133,7 @@ static int linear_prepare_ioctl(struct dm_target *ti, return 0; } -static int linear_iterate_devices(struct dm_target *ti, +int dm_linear_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { struct linear_c *lc = ti->private; @@ -141,16 +141,16 @@ static int linear_iterate_devices(struct dm_target *ti, return fn(ti, lc->dev, lc->start, ti->len, data); } -struct target_type linear_target = { +static struct target_type linear_target = { .name = "linear", .version = {1, 2, 1}, .module = THIS_MODULE, - .ctr = linear_ctr, - .dtr = linear_dtr, - .map = linear_map, - .status = linear_status, - .prepare_ioctl = linear_prepare_ioctl, - .iterate_devices = linear_iterate_devices, + .ctr = dm_linear_ctr, + .dtr = dm_linear_dtr, + .map = dm_linear_map, + .status = dm_linear_status, + .prepare_ioctl = dm_linear_prepare_ioctl, + .iterate_devices = dm_linear_iterate_devices, }; int __init dm_linear_init(void) From a517817c1708dd9d762d37d7dd475d4728a4b34e Mon Sep 17 00:00:00 2001 From: Jeremy Compostella Date: Fri, 15 Apr 2016 13:32:54 +0200 Subject: [PATCH 235/607] ANDROID: dm: use name_to_dev_t This patch makes android_verity_ctr() parse its block device string parameter with name_to_dev_t(). It allows the use of less hardware related block device reference like PARTUUID for instance. 
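As an illustrative example (the key id and UUID below are placeholders, not real values), the block device argument can now be anything name_to_dev_t() resolves at boot rather than only a major:minor pair, for instance:

dm="system none ro,0 1 android-verity Android:#7e4333f9bba00adfe0ede979e28ed1920492b40f PARTUUID=00112233-4455-6677-8899-aabbccddeeff"
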
Change-Id: Idb84453e70cc11abd5ef3a0adfbb16f8b5feaf07 Signed-off-by: Jeremy Compostella --- drivers/md/dm-android-verity.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index b7e059595f75..9c26cbb5f179 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -613,8 +613,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) /* One for specifying number of opt args and one for mode */ sector_t data_sectors; u32 data_block_size; - unsigned int major, minor, - no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS; + unsigned int no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS; struct fec_header uninitialized_var(fec); struct fec_ecc_metadata uninitialized_var(ecc); char buf[FEC_ARG_LENGTH], *buf_ptr; @@ -630,13 +629,11 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) key_id = argv[0]; strreplace(argv[0], '#', ' '); - if (sscanf(argv[1], "%u:%u%c", &major, &minor, &dummy) == 2) { - dev = MKDEV(major, minor); - if (MAJOR(dev) != major || MINOR(dev) != minor) { - DMERR("Incorrect bdev major minor number"); - handle_error(); - return -EOVERFLOW; - } + dev = name_to_dev_t(argv[1]); + if (!dev) { + DMERR("no dev found for %s", argv[1]); + handle_error(); + return -EINVAL; } DMINFO("key:%s dev:%s", argv[0], argv[1]); From 9c43aca47bbdf652fc0d05d8e94ee43aaeea2458 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 20 May 2016 16:44:19 -0700 Subject: [PATCH 236/607] ANDROID: dm: fix signature verification flag The bug was that the signature verification was only happening when verity was disabled. It should always happen when verity is enabled. Signed-off-by: Badhri Jagan Sridharan Change-Id: I2d9354e240d36ea06fc68c2d18d8e87b823a4c2f (cherry picked from commit 5364b5ca0b1a12a58283b51408e43fc36d4e4fe7) --- drivers/md/dm-android-verity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 9c26cbb5f179..00275a986d03 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -667,7 +667,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto free_metadata; } - if (!verity_enabled) { + if (verity_enabled) { err = verify_verity_signature(key_id, metadata); if (err) { From 051d4706c621a73d3622f0b0d91a444ab37a8f1a Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 20 May 2016 16:45:45 -0700 Subject: [PATCH 237/607] ANDROID: dm: use default verity public key If the dm-android-verity target does not provide a default key try using the default public key from the system keyring. The defualt verity keyid is passed as a kernel command line argument veritykeyid=. The order of the dm-android-verity params have been reversed to facilitate the change. Old format example: dm="system none ro,0 1 android-verity Android:#7e4333f9bba00adfe0ede979e28ed1920492b40f /dev/mmcblk0p43" New formats supported: dm="system none ro,0 1 android-verity /dev/mmcblk0p43 Android:#7e4333f9bba00adfe0ede979e28ed1920492b40f" (or) dm="system none ro,0 1 android-verity /dev/mmcblk0p43" when veritykeyid= is set in the kernel command line. 
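Put together on the kernel command line (key id and partition are illustrative), the single-argument form relies on the separately supplied key:

veritykeyid=Android:#7e4333f9bba00adfe0ede979e28ed1920492b40f dm="system none ro,0 1 android-verity /dev/mmcblk0p43"

android_verity_ctr() falls back to the veritykeyid= value whenever only the block device argument is passed to the target.
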
BUG: 28384658 Signed-off-by: Badhri Jagan Sridharan Change-Id: I506c89b053d835ab579e703eef2bc1f8487250de (cherry picked from commit c5c74d0327729f35b576564976885596c6d0e7fb) --- drivers/md/dm-android-verity.c | 67 ++++++++++++++++++++++++---------- drivers/md/dm-android-verity.h | 16 ++++++++ 2 files changed, 63 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 00275a986d03..097fb2b1de89 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -43,6 +43,7 @@ static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH]; static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH]; +static char veritykeyid[VERITY_DEFAULT_KEY_ID_LENGTH]; static bool target_added; static bool verity_enabled = true; @@ -79,6 +80,19 @@ static int __init verity_mode_param(char *line) __setup("androidboot.veritymode=", verity_mode_param); +static int __init verity_keyid_param(char *line) +{ + strlcpy(veritykeyid, line, sizeof(veritykeyid)); + return 1; +} + +__setup("veritykeyid=", verity_keyid_param); + +static inline bool default_verity_key_id(void) +{ + return veritykeyid[0] != '\0'; +} + static int table_extract_mpi_array(struct public_key_signature *pks, const void *data, size_t len) { @@ -608,7 +622,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) dev_t uninitialized_var(dev); struct android_metadata *uninitialized_var(metadata); int err = 0, i, mode; - char *key_id, *table_ptr, dummy, + char *key_id, *table_ptr, dummy, *target_device, *verity_table_args[VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS]; /* One for specifying number of opt args and one for mode */ sector_t data_sectors; @@ -619,24 +633,34 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) char buf[FEC_ARG_LENGTH], *buf_ptr; unsigned long long tmpll; - if (argc != 2) { + if (argc == 1) { + /* Use the default keyid */ + if (default_verity_key_id()) + key_id = veritykeyid; + else { + DMERR("veritykeyid= is not set"); + handle_error(); + return -EINVAL; + } + } else if (argc == 2) + key_id = argv[1]; + else { DMERR("Incorrect number of arguments"); handle_error(); return -EINVAL; } - /* should come as one of the arguments for the verity target */ - key_id = argv[0]; - strreplace(argv[0], '#', ' '); + strreplace(key_id, '#', ' '); + target_device = argv[0]; - dev = name_to_dev_t(argv[1]); + dev = name_to_dev_t(target_device); if (!dev) { - DMERR("no dev found for %s", argv[1]); + DMERR("no dev found for %s", target_device); handle_error(); return -EINVAL; } - DMINFO("key:%s dev:%s", argv[0], argv[1]); + DMINFO("key:%s dev:%s", key_id, target_device); if (extract_fec_header(dev, &fec, &ecc)) { DMERR("Error while extracting fec header"); @@ -734,30 +758,33 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) /* Setup linear target and free */ if (!verity_enabled) { - err = add_as_linear_device(ti, argv[1]); + err = add_as_linear_device(ti, target_device); goto free_metadata; } /*substitute data_dev and hash_dev*/ - verity_table_args[1] = argv[1]; - verity_table_args[2] = argv[1]; + verity_table_args[1] = target_device; + verity_table_args[2] = target_device; mode = verity_mode(); if (ecc.valid && IS_BUILTIN(CONFIG_DM_VERITY_FEC)) { if (mode) { err = snprintf(buf, FEC_ARG_LENGTH, - "%u %s " VERITY_TABLE_OPT_FEC_FORMAT, - 1 + VERITY_TABLE_OPT_FEC_ARGS, - mode == DM_VERITY_MODE_RESTART ? 
- VERITY_TABLE_OPT_RESTART : VERITY_TABLE_OPT_LOGGING, - argv[1], ecc.start / FEC_BLOCK_SIZE, ecc.blocks, - ecc.roots); + "%u %s " VERITY_TABLE_OPT_FEC_FORMAT, + 1 + VERITY_TABLE_OPT_FEC_ARGS, + mode == DM_VERITY_MODE_RESTART ? + VERITY_TABLE_OPT_RESTART : + VERITY_TABLE_OPT_LOGGING, + target_device, + ecc.start / FEC_BLOCK_SIZE, ecc.blocks, + ecc.roots); } else { err = snprintf(buf, FEC_ARG_LENGTH, - "%u " VERITY_TABLE_OPT_FEC_FORMAT, - VERITY_TABLE_OPT_FEC_ARGS, argv[1], - ecc.start / FEC_BLOCK_SIZE, ecc.blocks, ecc.roots); + "%u " VERITY_TABLE_OPT_FEC_FORMAT, + VERITY_TABLE_OPT_FEC_ARGS, target_device, + ecc.start / FEC_BLOCK_SIZE, ecc.blocks, + ecc.roots); } } else if (mode) { err = snprintf(buf, FEC_ARG_LENGTH, diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index efb796524896..43655ee0f813 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -27,6 +27,22 @@ #define VERITY_TABLE_ARGS 10 #define VERITY_COMMANDLINE_PARAM_LENGTH 20 +/* + * : is the format for the identifier. + * subject can either be the Common Name(CN) + Organization Name(O) or + * just the CN if the it is prefixed with O + * From https://tools.ietf.org/html/rfc5280#appendix-A + * ub-organization-name-length INTEGER ::= 64 + * ub-common-name-length INTEGER ::= 64 + * + * http://lxr.free-electrons.com/source/crypto/asymmetric_keys/x509_cert_parser.c?v=3.9#L278 + * ctx->o_size + 2 + ctx->cn_size + 1 + * + 41 characters for ":" and sha1 id + * 64 + 2 + 64 + 1 + 1 + 40 (172) + * setting VERITY_DEFAULT_KEY_ID_LENGTH to 200 characters. + */ +#define VERITY_DEFAULT_KEY_ID_LENGTH 200 + #define FEC_MAGIC 0xFECFECFE #define FEC_BLOCK_SIZE (4 * 1024) #define FEC_VERSION 0 From 58bae772a7018ebfa735c91af22e26474a1dfcc7 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 17 Jun 2016 18:54:35 -0700 Subject: [PATCH 238/607] ANDROID: dm: mount as linear target if eng build eng builds don't have verity enabled, i.e. they do not even have verity metadata appended to the partition. Therefore add rootdev as a linear device and map the entire partition if the build variant is "eng".
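As a rough sketch of the end result (the device path and size are placeholders, and the actual table is built in-kernel by add_as_linear_device() rather than handed in from userspace), the eng mapping is equivalent to a plain dm-linear table that spans the whole partition:

  0 <partition_size_in_sectors> linear /dev/mmcblk0p43 0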
(Cherry-picked based on https://partner-android-review.git.corp.google.com/#/c/618690/) BUG: 29276559 Signed-off-by: Badhri Jagan Sridharan Change-Id: I8f5c2289b842b820ca04f5773525e5449bb3f355 --- drivers/md/dm-android-verity.c | 62 +++++++++++++++++++++++++++++++--- drivers/md/dm-android-verity.h | 1 + 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 097fb2b1de89..e1a8e284e7e4 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -44,6 +44,7 @@ static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH]; static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH]; static char veritykeyid[VERITY_DEFAULT_KEY_ID_LENGTH]; +static char buildvariant[BUILD_VARIANT]; static bool target_added; static bool verity_enabled = true; @@ -88,11 +89,26 @@ static int __init verity_keyid_param(char *line) __setup("veritykeyid=", verity_keyid_param); +static int __init verity_buildvariant(char *line) +{ + strlcpy(buildvariant, line, sizeof(buildvariant)); + return 1; +} + +__setup("buildvariant=", verity_buildvariant); + static inline bool default_verity_key_id(void) { return veritykeyid[0] != '\0'; } +static inline bool is_eng(void) +{ + static const char typeeng[] = "eng"; + + return !strncmp(buildvariant, typeeng, sizeof(typeeng)); +} + static int table_extract_mpi_array(struct public_key_signature *pks, const void *data, size_t len) { @@ -262,7 +278,7 @@ static int extract_fec_header(dev_t dev, struct fec_header *fec, bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); - if (IS_ERR(bdev)) { + if (IS_ERR_OR_NULL(bdev)) { DMERR("bdev get error"); return PTR_ERR(bdev); } @@ -323,6 +339,24 @@ static void find_metadata_offset(struct fec_header *fec, *metadata_offset = device_size - VERITY_METADATA_SIZE; } +static int find_size(dev_t dev, u64 *device_size) +{ + struct block_device *bdev; + + bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + if (IS_ERR_OR_NULL(bdev)) { + DMERR("blkdev_get_by_dev failed"); + return PTR_ERR(bdev); + } + + *device_size = i_size_read(bdev->bd_inode); + *device_size >>= SECTOR_SHIFT; + + DMINFO("blkdev size in sectors: %llu", *device_size); + blkdev_put(bdev, FMODE_READ); + return 0; +} + static struct android_metadata *extract_metadata(dev_t dev, struct fec_header *fec) { @@ -337,7 +371,7 @@ static struct android_metadata *extract_metadata(dev_t dev, bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); - if (IS_ERR(bdev)) { + if (IS_ERR_OR_NULL(bdev)) { DMERR("blkdev_get_by_dev failed"); return ERR_CAST(bdev); } @@ -632,12 +666,13 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) struct fec_ecc_metadata uninitialized_var(ecc); char buf[FEC_ARG_LENGTH], *buf_ptr; unsigned long long tmpll; + u64 device_size; if (argc == 1) { /* Use the default keyid */ if (default_verity_key_id()) key_id = veritykeyid; - else { + else if (!is_eng()) { DMERR("veritykeyid= is not set"); handle_error(); return -EINVAL; @@ -650,7 +685,6 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) return -EINVAL; } - strreplace(key_id, '#', ' '); target_device = argv[0]; dev = name_to_dev_t(target_device); @@ -660,6 +694,26 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) return -EINVAL; } + if (is_eng()) { + err = find_size(dev, &device_size); + if (err) { + DMERR("error finding bdev size"); + handle_error(); + return err; + } + + ti->len = device_size; + err = add_as_linear_device(ti, target_device); + if (err) 
{ + handle_error(); + return err; + } + verity_enabled = false; + return 0; + } + + strreplace(key_id, '#', ' '); + DMINFO("key:%s dev:%s", key_id, target_device); if (extract_fec_header(dev, &fec, &ecc)) { diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index 43655ee0f813..782e1c815c67 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -26,6 +26,7 @@ #define VERITY_METADATA_SIZE (8 * DATA_BLOCK_SIZE) #define VERITY_TABLE_ARGS 10 #define VERITY_COMMANDLINE_PARAM_LENGTH 20 +#define BUILD_VARIANT 20 /* * : is the format for the identifier. From f74284f6c25dca79c052c3a1e1bd3fc58de4d4a0 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 27 Jun 2016 16:25:55 -0700 Subject: [PATCH 239/607] ANDROID: dm: allow adb disable-verity only in userdebug adb disable-verity was allowed when the phone is in the unlocked state. Since the driver is now aware of the build variant, honor "adb disable-verity" only in userdebug builds. (Cherry-picked from https://partner-android-review.git.corp.google.com/#/c/622117) BUG: 29276559 Signed-off-by: Badhri Jagan Sridharan Change-Id: I7ce9f38d8c7a62361392c5a8ccebb288f8a3a2ea --- drivers/md/dm-android-verity.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index e1a8e284e7e4..999e75bf2ba0 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -109,6 +109,14 @@ static inline bool is_eng(void) return !strncmp(buildvariant, typeeng, sizeof(typeeng)); } +static inline bool is_userdebug(void) +{ + static const char typeuserdebug[] = "userdebug"; + + return !strncmp(buildvariant, typeuserdebug, sizeof(typeuserdebug)); +} + + static int table_extract_mpi_array(struct public_key_signature *pks, const void *data, size_t len) { @@ -499,19 +507,6 @@ const char *find_dt_value(const char *name) return value; } -static bool is_unlocked(void) -{ - static const char unlocked[] = "orange"; - static const char verified_boot_prop[] = "verifiedbootstate"; - const char *value; - - value = find_dt_value(verified_boot_prop); - if (!value) - value = verifiedbootstate; - - return !strncmp(value, unlocked, sizeof(unlocked) - 1); -} - static int verity_mode(void) { static const char enforcing[] = "enforcing"; @@ -531,7 +526,7 @@ static int verify_header(struct android_metadata_header *header) { int retval = -EINVAL; - if (is_unlocked() && le32_to_cpu(header->magic_number) == + if (is_userdebug() && le32_to_cpu(header->magic_number) == VERITY_METADATA_MAGIC_DISABLE) { retval = VERITY_STATE_DISABLE; return retval; From ad2f6cf0be78107a2bf435b8a21f801658935719 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Wed, 6 Jul 2016 17:16:19 -0700 Subject: [PATCH 240/607] ANDROID: dm: android-verity: Verify header before fetching table Move header validation logic before reading the verity_table as an invalid header implies the table is invalid as well. 
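The new ordering can be summarised with a small sketch (a hypothetical helper, not part of the patch itself; verify_header() and VERITY_STATE_DISABLE are the driver's own symbols used in the diff below):

  static int check_header_first(struct android_metadata_header *header,
                                bool *verity_enabled)
  {
          int err = verify_header(header);

          if (err == VERITY_STATE_DISABLE) {
                  /* userdebug build with verity switched off: report success
                   * but remember not to set up verity; the table is still
                   * read afterwards only to learn the data device size */
                  *verity_enabled = false;
                  return 0;
          }

          /* any other error means the header is invalid, so the table that
           * follows it is never read or parsed */
          return err;
  }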
(Cherry-picked from: https://partner-android-review.git.corp.google.com/#/c/625203) BUG: 29940612 Signed-off-by: Badhri Jagan Sridharan Change-Id: Ib34d25c0854202f3e70df0a6d0ef1d96f0250c8e --- drivers/md/dm-android-verity.c | 140 +++++++++++++++++---------------- 1 file changed, 71 insertions(+), 69 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 999e75bf2ba0..1f4eb099209d 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -365,12 +365,38 @@ static int find_size(dev_t dev, u64 *device_size) return 0; } -static struct android_metadata *extract_metadata(dev_t dev, - struct fec_header *fec) +static int verify_header(struct android_metadata_header *header) +{ + int retval = -EINVAL; + + if (is_userdebug() && le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_DISABLE) + return VERITY_STATE_DISABLE; + + if (!(le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_NUMBER) || + (le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_DISABLE)) { + DMERR("Incorrect magic number"); + return retval; + } + + if (le32_to_cpu(header->protocol_version) != + VERITY_METADATA_VERSION) { + DMERR("Unsupported version %u", + le32_to_cpu(header->protocol_version)); + return retval; + } + + return 0; +} + +static int extract_metadata(dev_t dev, struct fec_header *fec, + struct android_metadata **metadata, + bool *verity_enabled) { struct block_device *bdev; struct android_metadata_header *header; - struct android_metadata *uninitialized_var(metadata); int i; u32 table_length, copy_length, offset; u64 metadata_offset; @@ -381,7 +407,7 @@ static struct android_metadata *extract_metadata(dev_t dev, if (IS_ERR_OR_NULL(bdev)) { DMERR("blkdev_get_by_dev failed"); - return ERR_CAST(bdev); + return -ENODEV; } find_metadata_offset(fec, bdev, &metadata_offset); @@ -399,7 +425,6 @@ static struct android_metadata *extract_metadata(dev_t dev, (1 << SECTOR_SHIFT), VERITY_METADATA_SIZE); if (err) { DMERR("Error while reading verity metadata"); - metadata = ERR_PTR(err); goto blkdev_release; } @@ -418,24 +443,42 @@ static struct android_metadata *extract_metadata(dev_t dev, le32_to_cpu(header->protocol_version), le32_to_cpu(header->table_length)); - metadata = kzalloc(sizeof(*metadata), GFP_KERNEL); - if (!metadata) { + err = verify_header(header); + + if (err == VERITY_STATE_DISABLE) { + DMERR("Mounting root with verity disabled"); + *verity_enabled = false; + /* we would still have to read the metadata to figure out + * the data blocks size. Or may be could map the entire + * partition similar to mounting the device. + * + * Reset error as well as the verity_enabled flag is changed. 
+ */ + err = 0; + } else if (err) + goto free_header; + + *metadata = kzalloc(sizeof(**metadata), GFP_KERNEL); + if (!*metadata) { DMERR("kzalloc for metadata failed"); err = -ENOMEM; goto free_header; } - metadata->header = header; + (*metadata)->header = header; table_length = le32_to_cpu(header->table_length); if (table_length == 0 || table_length > (VERITY_METADATA_SIZE - - sizeof(struct android_metadata_header))) + sizeof(struct android_metadata_header))) { + DMERR("table_length too long"); + err = -EINVAL; goto free_metadata; + } - metadata->verity_table = kzalloc(table_length + 1, GFP_KERNEL); + (*metadata)->verity_table = kzalloc(table_length + 1, GFP_KERNEL); - if (!metadata->verity_table) { + if (!(*metadata)->verity_table) { DMERR("kzalloc verity_table failed"); err = -ENOMEM; goto free_metadata; @@ -443,13 +486,15 @@ static struct android_metadata *extract_metadata(dev_t dev, if (sizeof(struct android_metadata_header) + table_length <= PAGE_SIZE) { - memcpy(metadata->verity_table, page_address(payload.page_io[0]) + memcpy((*metadata)->verity_table, + page_address(payload.page_io[0]) + sizeof(struct android_metadata_header), table_length); } else { copy_length = PAGE_SIZE - sizeof(struct android_metadata_header); - memcpy(metadata->verity_table, page_address(payload.page_io[0]) + memcpy((*metadata)->verity_table, + page_address(payload.page_io[0]) + sizeof(struct android_metadata_header), copy_length); table_length -= copy_length; @@ -457,13 +502,13 @@ static struct android_metadata *extract_metadata(dev_t dev, i = 1; while (table_length != 0) { if (table_length > PAGE_SIZE) { - memcpy(metadata->verity_table + offset, + memcpy((*metadata)->verity_table + offset, page_address(payload.page_io[i]), PAGE_SIZE); offset += PAGE_SIZE; table_length -= PAGE_SIZE; } else { - memcpy(metadata->verity_table + offset, + memcpy((*metadata)->verity_table + offset, page_address(payload.page_io[i]), table_length); table_length = 0; @@ -471,25 +516,23 @@ static struct android_metadata *extract_metadata(dev_t dev, i++; } } - metadata->verity_table[table_length] = '\0'; + (*metadata)->verity_table[table_length] = '\0'; + DMINFO("verity_table: %s", (*metadata)->verity_table); goto free_payload; free_metadata: - kfree(metadata); + kfree(*metadata); free_header: kfree(header); - metadata = ERR_PTR(err); free_payload: for (i = 0; i < payload.number_of_pages; i++) if (payload.page_io[i]) __free_page(payload.page_io[i]); kfree(payload.page_io); - - DMINFO("verity_table: %s", metadata->verity_table); blkdev_release: blkdev_put(bdev, FMODE_READ); - return metadata; + return err; } /* helper functions to extract properties from dts */ @@ -522,34 +565,6 @@ static int verity_mode(void) return DM_VERITY_MODE_EIO; } -static int verify_header(struct android_metadata_header *header) -{ - int retval = -EINVAL; - - if (is_userdebug() && le32_to_cpu(header->magic_number) == - VERITY_METADATA_MAGIC_DISABLE) { - retval = VERITY_STATE_DISABLE; - return retval; - } - - if (!(le32_to_cpu(header->magic_number) == - VERITY_METADATA_MAGIC_NUMBER) || - (le32_to_cpu(header->magic_number) == - VERITY_METADATA_MAGIC_DISABLE)) { - DMERR("Incorrect magic number"); - return retval; - } - - if (le32_to_cpu(header->protocol_version) != - VERITY_METADATA_VERSION) { - DMERR("Unsupported version %u", - le32_to_cpu(header->protocol_version)); - return retval; - } - - return 0; -} - static int verify_verity_signature(char *key_id, struct android_metadata *metadata) { @@ -649,7 +664,7 @@ static int add_as_linear_device(struct dm_target 
*ti, char *dev) static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) { dev_t uninitialized_var(dev); - struct android_metadata *uninitialized_var(metadata); + struct android_metadata *metadata = NULL; int err = 0, i, mode; char *key_id, *table_ptr, dummy, *target_device, *verity_table_args[VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS]; @@ -717,26 +732,11 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) return -EINVAL; } - metadata = extract_metadata(dev, &fec); + err = extract_metadata(dev, &fec, &metadata, &verity_enabled); - if (IS_ERR(metadata)) { + if (err) { DMERR("Error while extracting metadata"); handle_error(); - return -EINVAL; - } - - err = verify_header(metadata->header); - - if (err == VERITY_STATE_DISABLE) { - DMERR("Mounting root with verity disabled"); - verity_enabled = false; - /* we would still have to parse the args to figure out - * the data blocks size. Or may be could map the entire - * partition similar to mounting the device. - */ - } else if (err) { - DMERR("Verity header handle error"); - handle_error(); goto free_metadata; } @@ -869,8 +869,10 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) } free_metadata: - kfree(metadata->header); - kfree(metadata->verity_table); + if (metadata) { + kfree(metadata->header); + kfree(metadata->verity_table); + } kfree(metadata); return err; } From edb97738f20194cb7eea8eeb81bac21f4a8cfb83 Mon Sep 17 00:00:00 2001 From: Jeremy Compostella Date: Tue, 10 May 2016 13:10:20 +0200 Subject: [PATCH 241/607] ANDROID: dm verity fec: pack the fec_header structure The fec_header structure is generated build time and stored on disk. The fec_header might be build on a 64 bits machine while it is read per a 32 bits device or the other way around. In such situations, the fec_header fields are not aligned as expected by the device and it fails to read the fec_header structure. This patch makes the fec_header packed. Change-Id: Idb84453e70cc11abd5ef3a0adfbb16f8b5feaf06 Signed-off-by: Jeremy Compostella --- drivers/md/dm-android-verity.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index 782e1c815c67..f43b02fbb475 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -72,9 +72,6 @@ * if fec is not present * */ -/* TODO: rearrange structure to reduce memory holes - * depends on userspace change. - */ struct fec_header { __le32 magic; __le32 version; @@ -83,7 +80,7 @@ struct fec_header { __le32 fec_size; __le64 inp_size; u8 hash[SHA256_DIGEST_SIZE]; -}; +} __attribute__((packed)); struct android_metadata_header { __le32 magic_number; From 3a400abdc57e047786d0b35d39617a794e2bb97e Mon Sep 17 00:00:00 2001 From: Joseph Lo Date: Mon, 22 Apr 2013 14:39:18 +0800 Subject: [PATCH 242/607] CHROMIUM: sched: update the average of nr_running Doing a Exponential moving average per nr_running++/-- does not guarantee a fixed sample rate which induces errors if there are lots of threads being enqueued/dequeued from the rq (Linpack mt). Instead of keeping track of the avg, the scheduler now keeps track of the integral of nr_running and allows the readers to perform filtering on top. 
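As an example of the filtering left to readers, a consumer such as a cpuquiet governor could derive a time-weighted average from two snapshots of the integral. The helper below is only a sketch, assuming the caller samples nr_running_integral() at both ends of its own window (div64_u64() comes from <linux/math64.h>; FSHIFT is the fixed-point shift used by NR_AVE_SCALE() in this patch):

  static u64 avg_nr_running(u64 integral_start, u64 integral_end, u64 window_ns)
  {
          /* the integral accumulates (nr_running << FSHIFT) per nanosecond,
           * so dividing the delta by the window length in the same
           * fixed-point scale yields the average number of runnable tasks */
          return div64_u64(integral_end - integral_start, window_ns << FSHIFT);
  }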
Original-author: Sai Charan Gurrappadi Change-Id: Id946654f32fa8be0eaf9d8fa7c9a8039b5ef9fab Signed-off-by: Joseph Lo Signed-off-by: Andrew Bresticker Reviewed-on: https://chromium-review.googlesource.com/174694 Reviewed-on: https://chromium-review.googlesource.com/272853 [jstultz: fwdported to 4.4] Signed-off-by: John Stultz --- include/linux/sched.h | 3 +++ kernel/sched/core.c | 30 ++++++++++++++++++++++++++ kernel/sched/sched.h | 49 +++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 80 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b9b11b2dbabd..ee59abcab30d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -173,6 +173,9 @@ extern bool single_task_running(void); extern unsigned long nr_iowait(void); extern unsigned long nr_iowait_cpu(int cpu); extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); +#ifdef CONFIG_CPU_QUIET +extern u64 nr_running_integral(unsigned int cpu); +#endif extern void calc_global_load(unsigned long ticks); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9e25e68747d7..42c45ccc1f62 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2768,6 +2768,36 @@ unsigned long nr_iowait_cpu(int cpu) return atomic_read(&this->nr_iowait); } +#ifdef CONFIG_CPU_QUIET +u64 nr_running_integral(unsigned int cpu) +{ + unsigned int seqcnt; + u64 integral; + struct rq *q; + + if (cpu >= nr_cpu_ids) + return 0; + + q = cpu_rq(cpu); + + /* + * Update average to avoid reading stalled value if there were + * no run-queue changes for a long time. On the other hand if + * the changes are happening right now, just read current value + * directly. + */ + + seqcnt = read_seqcount_begin(&q->ave_seqcnt); + integral = do_nr_running_integral(q); + if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) { + read_seqcount_begin(&q->ave_seqcnt); + integral = q->nr_running_integral; + } + + return integral; +} +#endif + void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) { struct rq *rq = this_rq(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index db3f3db9849c..5713513adb9d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -594,6 +594,14 @@ struct rq { #ifdef CONFIG_NO_HZ_FULL unsigned long last_sched_tick; #endif + +#ifdef CONFIG_CPU_QUIET + /* time-based average load */ + u64 nr_last_stamp; + u64 nr_running_integral; + seqcount_t ave_seqcnt; +#endif + /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; @@ -1353,7 +1361,7 @@ extern void init_entity_runnable_average(struct sched_entity *se); extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc); -static inline void add_nr_running(struct rq *rq, unsigned count) +static inline void __add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; @@ -1381,11 +1389,48 @@ static inline void add_nr_running(struct rq *rq, unsigned count) } } -static inline void sub_nr_running(struct rq *rq, unsigned count) +static inline void __sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; } +#ifdef CONFIG_CPU_QUIET +#define NR_AVE_SCALE(x) ((x) << FSHIFT) +static inline u64 do_nr_running_integral(struct rq *rq) +{ + s64 nr, deltax; + u64 nr_running_integral = rq->nr_running_integral; + + deltax = rq->clock_task - rq->nr_last_stamp; + nr = NR_AVE_SCALE(rq->nr_running); + + nr_running_integral += nr * deltax; + + return nr_running_integral; +} + +static inline void add_nr_running(struct rq *rq, unsigned count) +{ + 
write_seqcount_begin(&rq->ave_seqcnt); + rq->nr_running_integral = do_nr_running_integral(rq); + rq->nr_last_stamp = rq->clock_task; + __add_nr_running(rq, count); + write_seqcount_end(&rq->ave_seqcnt); +} + +static inline void sub_nr_running(struct rq *rq, unsigned count) +{ + write_seqcount_begin(&rq->ave_seqcnt); + rq->nr_running_integral = do_nr_running_integral(rq); + rq->nr_last_stamp = rq->clock_task; + __sub_nr_running(rq, count); + write_seqcount_end(&rq->ave_seqcnt); +} +#else +#define add_nr_running __add_nr_running +#define sub_nr_running __sub_nr_running +#endif + static inline void rq_last_tick_reset(struct rq *rq) { #ifdef CONFIG_NO_HZ_FULL From 687fa2dcd285d3344fe89cd6a3c992fcaba13b6f Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 17:16:41 +0000 Subject: [PATCH 243/607] arm: topology: Define TC2 energy and provide it to the scheduler This patch is only here to be able to test provisioning of energy related data from an arch topology shim layer to the scheduler. Since there is no code today which deals with extracting energy related data from the dtb or acpi, and process it in the topology shim layer, the content of the sched_group_energy structures as well as the idle_state and capacity_state arrays are hard-coded here. This patch defines the sched_group_energy structure as well as the idle_state and capacity_state array for the cluster (relates to sched groups (sgs) in DIE sched domain level) and for the core (relates to sgs in MC sd level) for a Cortex A7 as well as for a Cortex A15. It further provides related implementations of the sched_domain_energy_f functions (cpu_cluster_energy() and cpu_core_energy()). To be able to propagate this information from the topology shim layer to the scheduler, the elements of the arm_topology[] table have been provisioned with the appropriate sched_domain_energy_f functions. Change-Id: I8c014bbd04f6a1d57892be9bfa16affe07948dcf cc: Russell King Signed-off-by: Dietmar Eggemann --- arch/arm/kernel/topology.c | 126 ++++++++++++++++++++++++++++++++++++- 1 file changed, 123 insertions(+), 3 deletions(-) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 0308342def8c..f5941004efba 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -287,6 +287,127 @@ void store_cpu_topology(unsigned int cpuid) cpu_topology[cpuid].socket_id, mpidr); } +/* + * ARM TC2 specific energy cost model data. There are no unit requirements for + * the data. Data can be normalized to any reference point, but the + * normalization must be consistent. That is, one bogo-joule/watt must be the + * same quantity for all data, but we don't care what it is. 
+ */ +static struct idle_state idle_states_cluster_a7[] = { + { .power = 25 }, /* arch_cpu_idle() (active idle) = WFI */ + { .power = 25 }, /* WFI */ + { .power = 10 }, /* cluster-sleep-l */ + }; + +static struct idle_state idle_states_cluster_a15[] = { + { .power = 70 }, /* arch_cpu_idle() (active idle) = WFI */ + { .power = 70 }, /* WFI */ + { .power = 25 }, /* cluster-sleep-b */ + }; + +static struct capacity_state cap_states_cluster_a7[] = { + /* Cluster only power */ + { .cap = 150, .power = 2967, }, /* 350 MHz */ + { .cap = 172, .power = 2792, }, /* 400 MHz */ + { .cap = 215, .power = 2810, }, /* 500 MHz */ + { .cap = 258, .power = 2815, }, /* 600 MHz */ + { .cap = 301, .power = 2919, }, /* 700 MHz */ + { .cap = 344, .power = 2847, }, /* 800 MHz */ + { .cap = 387, .power = 3917, }, /* 900 MHz */ + { .cap = 430, .power = 4905, }, /* 1000 MHz */ + }; + +static struct capacity_state cap_states_cluster_a15[] = { + /* Cluster only power */ + { .cap = 426, .power = 7920, }, /* 500 MHz */ + { .cap = 512, .power = 8165, }, /* 600 MHz */ + { .cap = 597, .power = 8172, }, /* 700 MHz */ + { .cap = 682, .power = 8195, }, /* 800 MHz */ + { .cap = 768, .power = 8265, }, /* 900 MHz */ + { .cap = 853, .power = 8446, }, /* 1000 MHz */ + { .cap = 938, .power = 11426, }, /* 1100 MHz */ + { .cap = 1024, .power = 15200, }, /* 1200 MHz */ + }; + +static struct sched_group_energy energy_cluster_a7 = { + .nr_idle_states = ARRAY_SIZE(idle_states_cluster_a7), + .idle_states = idle_states_cluster_a7, + .nr_cap_states = ARRAY_SIZE(cap_states_cluster_a7), + .cap_states = cap_states_cluster_a7, +}; + +static struct sched_group_energy energy_cluster_a15 = { + .nr_idle_states = ARRAY_SIZE(idle_states_cluster_a15), + .idle_states = idle_states_cluster_a15, + .nr_cap_states = ARRAY_SIZE(cap_states_cluster_a15), + .cap_states = cap_states_cluster_a15, +}; + +static struct idle_state idle_states_core_a7[] = { + { .power = 0 }, /* arch_cpu_idle (active idle) = WFI */ + { .power = 0 }, /* WFI */ + { .power = 0 }, /* cluster-sleep-l */ + }; + +static struct idle_state idle_states_core_a15[] = { + { .power = 0 }, /* arch_cpu_idle (active idle) = WFI */ + { .power = 0 }, /* WFI */ + { .power = 0 }, /* cluster-sleep-b */ + }; + +static struct capacity_state cap_states_core_a7[] = { + /* Power per cpu */ + { .cap = 150, .power = 187, }, /* 350 MHz */ + { .cap = 172, .power = 275, }, /* 400 MHz */ + { .cap = 215, .power = 334, }, /* 500 MHz */ + { .cap = 258, .power = 407, }, /* 600 MHz */ + { .cap = 301, .power = 447, }, /* 700 MHz */ + { .cap = 344, .power = 549, }, /* 800 MHz */ + { .cap = 387, .power = 761, }, /* 900 MHz */ + { .cap = 430, .power = 1024, }, /* 1000 MHz */ + }; + +static struct capacity_state cap_states_core_a15[] = { + /* Power per cpu */ + { .cap = 426, .power = 2021, }, /* 500 MHz */ + { .cap = 512, .power = 2312, }, /* 600 MHz */ + { .cap = 597, .power = 2756, }, /* 700 MHz */ + { .cap = 682, .power = 3125, }, /* 800 MHz */ + { .cap = 768, .power = 3524, }, /* 900 MHz */ + { .cap = 853, .power = 3846, }, /* 1000 MHz */ + { .cap = 938, .power = 5177, }, /* 1100 MHz */ + { .cap = 1024, .power = 6997, }, /* 1200 MHz */ + }; + +static struct sched_group_energy energy_core_a7 = { + .nr_idle_states = ARRAY_SIZE(idle_states_core_a7), + .idle_states = idle_states_core_a7, + .nr_cap_states = ARRAY_SIZE(cap_states_core_a7), + .cap_states = cap_states_core_a7, +}; + +static struct sched_group_energy energy_core_a15 = { + .nr_idle_states = ARRAY_SIZE(idle_states_core_a15), + .idle_states = idle_states_core_a15, + 
.nr_cap_states = ARRAY_SIZE(cap_states_core_a15), + .cap_states = cap_states_core_a15, +}; + +/* sd energy functions */ +static inline +const struct sched_group_energy * const cpu_cluster_energy(int cpu) +{ + return cpu_topology[cpu].socket_id ? &energy_cluster_a7 : + &energy_cluster_a15; +} + +static inline +const struct sched_group_energy * const cpu_core_energy(int cpu) +{ + return cpu_topology[cpu].socket_id ? &energy_core_a7 : + &energy_core_a15; +} + static inline int cpu_corepower_flags(void) { return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \ @@ -295,10 +416,9 @@ static inline int cpu_corepower_flags(void) static struct sched_domain_topology_level arm_topology[] = { #ifdef CONFIG_SCHED_MC - { cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) }, - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) }, #endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) }, { NULL, }, }; From 962b7c10ab3a09cc30114d5ddf01abd9e6979278 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:43:37 +0000 Subject: [PATCH 244/607] DEBUG: sched/tune: add tracepoint for task boost signal Change-Id: I545d3bf5569fc41c0fa70f51dff9a19c11d532ee Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 30 ++++++++++++++++++++++++++++++ kernel/sched/fair.c | 2 ++ 2 files changed, 32 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 7ec9dcfc701a..606509500446 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -764,6 +764,36 @@ TRACE_EVENT(sched_tune_boostgroup_update, __entry->cpu, __entry->variation, __entry->max_boost) ); +/* + * Tracepoint for accounting task boosted utilization + */ +TRACE_EVENT(sched_boost_task, + + TP_PROTO(struct task_struct *tsk, unsigned long util, unsigned long margin), + + TP_ARGS(tsk, util, margin), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( unsigned long, util ) + __field( unsigned long, margin ) + + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->util = util; + __entry->margin = margin; + ), + + TP_printk("comm=%s pid=%d util=%lu margin=%lu", + __entry->comm, __entry->pid, + __entry->util, + __entry->margin) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3bc56e0ea1ab..3a1fd4d3a860 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5283,6 +5283,8 @@ boosted_task_util(struct task_struct *task) unsigned long util = task_util(task); unsigned long margin = schedtune_task_margin(task); + trace_sched_boost_task(task, util, margin); + return util + margin; } From 4525aa343e8b252dd365e603ad141639989d547c Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:47:21 +0000 Subject: [PATCH 245/607] DEBUG: sched/tune: add tracepoint for energy_diff() values Change-Id: Id8fafbd85f6d81248f322e073ee790a7ceec0bf7 Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 57 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 11 ++++++- 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 606509500446..13fb31cebf62 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -794,6 +794,63 @@ 
TRACE_EVENT(sched_boost_task, __entry->margin) ); +/* + * Tracepoint for accounting sched group energy + */ +TRACE_EVENT(sched_energy_diff, + + TP_PROTO(struct task_struct *tsk, int scpu, int dcpu, int udelta, + int nrgb, int nrga, int nrgd, int capb, int capa, int capd, + int nrgn, int nrgp), + + TP_ARGS(tsk, scpu, dcpu, udelta, + nrgb, nrga, nrgd, capb, capa, capd, + nrgn, nrgp), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, scpu ) + __field( int, dcpu ) + __field( int, udelta ) + __field( int, nrgb ) + __field( int, nrga ) + __field( int, nrgd ) + __field( int, capb ) + __field( int, capa ) + __field( int, capd ) + __field( int, nrgn ) + __field( int, nrgp ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->scpu = scpu; + __entry->dcpu = dcpu; + __entry->udelta = udelta; + __entry->nrgb = nrgb; + __entry->nrga = nrga; + __entry->nrgd = nrgd; + __entry->capb = capb; + __entry->capa = capa; + __entry->capd = capd; + __entry->nrgn = nrgn; + __entry->nrgp = nrgp; + ), + + TP_printk("pid=%d comm=%s " + "src_cpu=%d dst_cpu=%d usage_delta=%d " + "nrg_before=%d nrg_after=%d nrg_diff=%d " + "cap_before=%d cap_after=%d cap_delta=%d " + "nrg_delta=%d nrg_payoff=%d", + __entry->pid, __entry->comm, + __entry->scpu, __entry->dcpu, __entry->udelta, + __entry->nrgb, __entry->nrga, __entry->nrgd, + __entry->capb, __entry->capa, __entry->capd, + __entry->nrgn, __entry->nrgp) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3a1fd4d3a860..69cb6fb6793e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4998,6 +4998,7 @@ static int energy_diff(struct energy_env *eenv) struct sched_domain *sd; struct sched_group *sg; int sd_cpu = -1, energy_before = 0, energy_after = 0; + int result; struct energy_env eenv_before = { .util_delta = 0, @@ -5041,7 +5042,15 @@ static int energy_diff(struct energy_env *eenv) eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; eenv->payoff = 0; - return energy_diff_evaluate(eenv); + result = energy_diff_evaluate(eenv); + + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + eenv->nrg.delta, eenv->payoff); + + return result; } /* From ae54c7741f30a963d84134e92fd452b8252946a8 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 20 Jan 2016 14:06:05 +0000 Subject: [PATCH 246/607] DEBUG: sched/tune: add tracepoint on P-E space filtering Change-Id: I31dfed67c0486713b88efb75df767329f2802e06 Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 35 +++++++++++++++++++++++++++++++++++ kernel/sched/tune.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 13fb31cebf62..cb082a68cdd6 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -851,6 +851,41 @@ TRACE_EVENT(sched_energy_diff, __entry->nrgn, __entry->nrgp) ); +/* + * Tracepoint for schedtune_tasks_update + */ +TRACE_EVENT(sched_tune_filter, + + TP_PROTO(int nrg_delta, int cap_delta, + int nrg_gain, int cap_gain, + int payoff, int region), + + TP_ARGS(nrg_delta, cap_delta, nrg_gain, cap_gain, payoff, region), + + TP_STRUCT__entry( + __field( int, nrg_delta ) + __field( int, cap_delta ) + __field( int, nrg_gain ) + __field( 
int, cap_gain ) + __field( int, payoff ) + __field( int, region ) + ), + + TP_fast_assign( + __entry->nrg_delta = nrg_delta; + __entry->cap_delta = cap_delta; + __entry->nrg_gain = nrg_gain; + __entry->cap_gain = cap_gain; + __entry->payoff = payoff; + __entry->region = region; + ), + + TP_printk("nrg_delta=%d cap_delta=%d nrg_gain=%d cap_gain=%d payoff=%d region=%d", + __entry->nrg_delta, __entry->cap_delta, + __entry->nrg_gain, __entry->cap_gain, + __entry->payoff, __entry->region) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 7a434f2394e7..b40d40dc3c49 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -70,6 +70,13 @@ __schedtune_accept_deltas(int nrg_delta, int cap_delta, */ payoff = nrg_delta * threshold_gains[perf_boost_idx].cap_gain; payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain; + + trace_sched_tune_filter( + nrg_delta, cap_delta, + threshold_gains[perf_boost_idx].nrg_gain, + threshold_gains[perf_boost_idx].cap_gain, + payoff, 8); + return payoff; } @@ -84,6 +91,13 @@ __schedtune_accept_deltas(int nrg_delta, int cap_delta, */ payoff = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain; payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain; + + trace_sched_tune_filter( + nrg_delta, cap_delta, + threshold_gains[perf_constrain_idx].nrg_gain, + threshold_gains[perf_constrain_idx].cap_gain, + payoff, 6); + return payoff; } @@ -155,12 +169,16 @@ schedtune_accept_deltas(int nrg_delta, int cap_delta, int perf_constrain_idx; /* Optimal (O) region */ - if (nrg_delta < 0 && cap_delta > 0) + if (nrg_delta < 0 && cap_delta > 0) { + trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0); return INT_MAX; + } /* Suboptimal (S) region */ - if (nrg_delta > 0 && cap_delta < 0) + if (nrg_delta > 0 && cap_delta < 0) { + trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5); return -INT_MAX; + } /* Get task specific perf Boost/Constraints indexes */ rcu_read_lock(); @@ -531,12 +549,16 @@ schedtune_accept_deltas(int nrg_delta, int cap_delta, struct task_struct *task) { /* Optimal (O) region */ - if (nrg_delta < 0 && cap_delta > 0) + if (nrg_delta < 0 && cap_delta > 0) { + trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0); return INT_MAX; + } /* Suboptimal (S) region */ - if (nrg_delta > 0 && cap_delta < 0) + if (nrg_delta > 0 && cap_delta < 0) { + trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5); return -INT_MAX; + } return __schedtune_accept_deltas(nrg_delta, cap_delta, perf_boost_idx, perf_constrain_idx); From 765c2ab363a02a71f9ef4c55a7f598e3dd401e0b Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 22 Jul 2016 11:35:59 +0100 Subject: [PATCH 247/607] FIXUP: sched: fix build for non-SMP target Currently the build for a single-core (e.g. user-mode) Linux is broken and this configuration is required (at least) to run some network tests. The main issues for the current code support on single-core systems are: 1. {se,rq}::sched_avg is not available nor maintained for !SMP systems This means that load and utilisation signals are NOT available in single core systems. All the EAS code depends on these signals. 2. sched_group_energy is also SMP dependant. Again this means that all the EAS setup and preparation code (energyn model initialization) has to be properly guarded/disabled for !SMP systems. 3. SchedFreq depends on utilization signal, which is not available on !SMP systems. 4. 
SchedTune is useless on unicore systems if SchedFreq is not available. 5. WALT machinery is not required on single-core systems. This patch addresses all these issues by enforcing some constraints for single-core systems: a) WALT, SchedTune and SchedTune are now dependant on SMP b) The default governor for !SMP systems is INTERACTIVE c) The energy model initialisation/build functions are d) Other minor code re-arrangements and CONFIG_SMP guarding to enable single core builds. Signed-off-by: Patrick Bellasi --- drivers/cpufreq/Kconfig | 1 + include/linux/sched_energy.h | 8 ++++++++ include/trace/events/sched.h | 4 ++++ init/Kconfig | 1 + kernel/sched/Makefile | 4 ++-- kernel/sched/fair.c | 33 +++++++++++++++++++++++++++++---- kernel/sched/sched.h | 3 +-- 7 files changed, 46 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index af523c539af0..a2d63d86bb70 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -195,6 +195,7 @@ config CPU_FREQ_GOV_CONSERVATIVE config CPU_FREQ_GOV_SCHED bool "'sched' cpufreq governor" depends on CPU_FREQ + depends on SMP select CPU_FREQ_GOV_COMMON help 'sched' - this governor scales cpu frequency from the diff --git a/include/linux/sched_energy.h b/include/linux/sched_energy.h index a3f1627ac609..1daf3e1f98a7 100644 --- a/include/linux/sched_energy.h +++ b/include/linux/sched_energy.h @@ -29,8 +29,16 @@ #define for_each_possible_sd_level(level) \ for (level = 0; level < NR_SD_LEVELS; level++) +#ifdef CONFIG_SMP + extern struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS]; void init_sched_energy_costs(void); +#else + +#define init_sched_energy_costs() do { } while (0) + +#endif /* CONFIG_SMP */ + #endif diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index cb082a68cdd6..07ed295d0f23 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -587,6 +587,8 @@ TRACE_EVENT(sched_contrib_scale_f, __entry->cpu_scale_factor) ); +#ifdef CONFIG_SMP + /* * Tracepoint for accounting sched averages for tasks. */ @@ -886,6 +888,8 @@ TRACE_EVENT(sched_tune_filter, __entry->payoff, __entry->region) ); +#endif /* CONFIG_SMP */ + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/init/Kconfig b/init/Kconfig index 5d9097e2b805..facb2b587d5b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1256,6 +1256,7 @@ config SCHED_AUTOGROUP config SCHED_TUNE bool "Boosting for CFS tasks (EXPERIMENTAL)" + depends on SMP help This option enables the system-wide support for task boosting. 
When this support is enabled a new sysctl interface is exposed to diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index c6a85f813dfd..174762d8695b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -12,9 +12,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o energy.o +obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 69cb6fb6793e..1b12b14756ec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4150,8 +4150,14 @@ static inline void hrtick_update(struct rq *rq) } #endif +#ifdef CONFIG_SMP +static bool cpu_overutilized(int cpu); static inline unsigned long boosted_cpu_util(int cpu); +#else +#define boosted_cpu_util(cpu) cpu_util(cpu) +#endif +#ifdef CONFIG_SMP static void update_capacity_of(int cpu) { unsigned long req_cap; @@ -4164,8 +4170,7 @@ static void update_capacity_of(int cpu) req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, req_cap); } - -static bool cpu_overutilized(int cpu); +#endif /* * The enqueue_task method is called before nr_running is @@ -4177,8 +4182,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; +#ifdef CONFIG_SMP int task_new = flags & ENQUEUE_WAKEUP_NEW; int task_wakeup = flags & ENQUEUE_WAKEUP; +#endif for_each_sched_entity(se) { if (se->on_rq) @@ -4210,8 +4217,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) { + if (!se) add_nr_running(rq, 1); + +#ifdef CONFIG_SMP + + if (!se) { if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; @@ -4228,6 +4239,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (task_new || task_wakeup) update_capacity_of(cpu_of(rq)); } +#endif /* CONFIG_SMP */ + hrtick_update(rq); } @@ -4285,8 +4298,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) { + if (!se) sub_nr_running(rq, 1); + +#ifdef CONFIG_SMP + + if (!se) { schedtune_dequeue_task(p, cpu_of(rq)); /* @@ -4304,6 +4321,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) set_cfs_cpu_capacity(cpu_of(rq), false, 0); } } + +#endif /* CONFIG_SMP */ + hrtick_update(rq); } @@ -5692,6 +5712,8 @@ static void task_dead_fair(struct task_struct *p) { remove_entity_load_avg(&p->se); } +#else +#define task_fits_max(p, cpu) true #endif /* CONFIG_SMP */ static unsigned long @@ -8716,10 +8738,13 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); +#ifdef CONFIG_SMP if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) rq->rd->overutilized = true; rq->misfit_task = !task_fits_max(curr, rq->cpu); +#endif + } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5713513adb9d..a06c19c48057 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1277,6 +1277,7 @@ extern const struct sched_class idle_sched_class; #ifdef CONFIG_SMP +extern void 
init_max_cpu_capacity(struct max_cpu_capacity *mcc); extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); @@ -1359,8 +1360,6 @@ unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); -extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc); - static inline void __add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; From d753e92e19660a6adcc3b8e0076a6c6078e5f59d Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Mon, 1 Aug 2016 11:34:05 +0100 Subject: [PATCH 248/607] sched/cpufreq_sched: Consolidated update Contains: sched/cpufreq_sched: use shorter throttle for raising OPP Avoid cases where a brief drop in load causes a change to a low OPP for the full throttle period. Use a shorter throttle period for raising OPP than for lowering OPP. sched-freq: Fix handling of max/min frequency This reverts commit 9726142608f5b3bf5df4280243c9d324e692a510. Change-Id: Ia78095354f7ad9492f00deb509a2b45112361eda sched/cpufreq: Increasing throttle_down_nsec to 50ms Change-Id: I2d8969cf2a64fa719b9dd86f43f9dd14b1ff84fe sched-freq: make throttle times tunable Change-Id: I127879645367425b273441d7f0306bb15d5633cb Signed-off-by: Srinath Sridharan Signed-off-by: Todd Kjos Signed-off-by: Juri Lelli [jstultz: Fwdported to 4.4] Signed-off-by: John Stultz --- drivers/cpufreq/Kconfig | 2 +- kernel/sched/cpufreq_sched.c | 175 +++++++++++++++++++++++++++++++---- 2 files changed, 160 insertions(+), 17 deletions(-) diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index a2d63d86bb70..d423f51d3276 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -105,7 +105,7 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE config CPU_FREQ_DEFAULT_GOV_SCHED bool "sched" - select CPU_FREQ_GOV_SCHED + select CPU_FREQ_GOV_INTERACTIVE help Use the CPUfreq governor 'sched' as default. This scales cpu frequency using CPU utilization estimates from the diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index e1d208e101ed..3f8c67a3ea0f 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -19,7 +19,8 @@ #include "sched.h" -#define THROTTLE_NSEC 50000000 /* 50ms default */ +#define THROTTLE_DOWN_NSEC 50000000 /* 50ms default */ +#define THROTTLE_UP_NSEC 500000 /* 500us default */ struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE; static bool __read_mostly cpufreq_driver_slow; @@ -33,8 +34,10 @@ DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); /** * gov_data - per-policy data internal to the governor - * @throttle: next throttling period expiry. Derived from throttle_nsec - * @throttle_nsec: throttle period length in nanoseconds + * @up_throttle: next throttling period expiry if increasing OPP + * @down_throttle: next throttling period expiry if decreasing OPP + * @up_throttle_nsec: throttle period length in nanoseconds if increasing OPP + * @down_throttle_nsec: throttle period length in nanoseconds if decreasing OPP * @task: worker thread for dvfs transition that may block/sleep * @irq_work: callback used to wake up worker thread * @requested_freq: last frequency requested by the sched governor @@ -48,11 +51,14 @@ DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); * call down_write(policy->rwsem). 
*/ struct gov_data { - ktime_t throttle; - unsigned int throttle_nsec; + ktime_t up_throttle; + ktime_t down_throttle; + unsigned int up_throttle_nsec; + unsigned int down_throttle_nsec; struct task_struct *task; struct irq_work irq_work; unsigned int requested_freq; + int max; }; static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, @@ -66,25 +72,29 @@ static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L); - gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec); + gd->up_throttle = ktime_add_ns(ktime_get(), gd->up_throttle_nsec); + gd->down_throttle = ktime_add_ns(ktime_get(), gd->down_throttle_nsec); up_write(&policy->rwsem); } -static bool finish_last_request(struct gov_data *gd) +static bool finish_last_request(struct gov_data *gd, unsigned int cur_freq) { ktime_t now = ktime_get(); - if (ktime_after(now, gd->throttle)) + ktime_t throttle = gd->requested_freq < cur_freq ? + gd->down_throttle : gd->up_throttle; + + if (ktime_after(now, throttle)) return false; while (1) { - int usec_left = ktime_to_ns(ktime_sub(gd->throttle, now)); + int usec_left = ktime_to_ns(ktime_sub(throttle, now)); usec_left /= NSEC_PER_USEC; trace_cpufreq_sched_throttled(usec_left); usleep_range(usec_left, usec_left + 100); now = ktime_get(); - if (ktime_after(now, gd->throttle)) + if (ktime_after(now, throttle)) return true; } } @@ -128,7 +138,7 @@ static int cpufreq_sched_thread(void *data) * if the frequency thread sleeps while waiting to be * unthrottled, start over to check for a newer request */ - if (finish_last_request(gd)) + if (finish_last_request(gd, policy->cur)) continue; last_request = new_request; cpufreq_sched_try_driver_target(policy, new_request); @@ -183,16 +193,21 @@ static void update_fdomain_capacity_request(int cpu) } /* Convert the new maximum capacity request into a cpu frequency */ - freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT; + freq_new = capacity * gd->max >> SCHED_CAPACITY_SHIFT; if (cpufreq_frequency_table_target(policy, policy->freq_table, freq_new, CPUFREQ_RELATION_L, &index_new)) goto out; freq_new = policy->freq_table[index_new].frequency; + if (freq_new > policy->max) + freq_new = policy->max; + + if (freq_new < policy->min) + freq_new = policy->min; + trace_cpufreq_sched_request_opp(cpu, capacity, freq_new, gd->requested_freq); - if (freq_new == gd->requested_freq) goto out; @@ -246,10 +261,17 @@ static inline void clear_sched_freq(void) static_key_slow_dec(&__sched_freq); } +static struct attribute_group sched_attr_group_gov_pol; +static struct attribute_group *get_sysfs_attr(void) +{ + return &sched_attr_group_gov_pol; +} + static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) { struct gov_data *gd; int cpu; + int rc; for_each_cpu(cpu, policy->cpus) memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0, @@ -259,11 +281,20 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) if (!gd) return -ENOMEM; - gd->throttle_nsec = policy->cpuinfo.transition_latency ? + gd->up_throttle_nsec = policy->cpuinfo.transition_latency ? 
policy->cpuinfo.transition_latency : - THROTTLE_NSEC; + THROTTLE_UP_NSEC; + gd->down_throttle_nsec = THROTTLE_DOWN_NSEC; pr_debug("%s: throttle threshold = %u [ns]\n", - __func__, gd->throttle_nsec); + __func__, gd->up_throttle_nsec); + + gd->max = policy->max; + + rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + if (rc) { + pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc); + goto err; + } if (cpufreq_driver_is_slow()) { cpufreq_driver_slow = true; @@ -301,6 +332,8 @@ static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) put_task_struct(gd->task); } + sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + policy->governor_data = NULL; kfree(gd); @@ -317,6 +350,32 @@ static int cpufreq_sched_start(struct cpufreq_policy *policy) return 0; } +static void cpufreq_sched_limits(struct cpufreq_policy *policy) +{ + struct gov_data *gd; + + pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n", + policy->cpu, policy->min, policy->max, + policy->cur); + + if (!down_write_trylock(&policy->rwsem)) + return; + /* + * Need to keep track of highest max frequency for + * capacity calculations + */ + gd = policy->governor_data; + if (gd->max < policy->max) + gd->max = policy->max; + + if (policy->max < policy->cur) + __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); + else if (policy->min > policy->cur) + __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); + + up_write(&policy->rwsem); +} + static int cpufreq_sched_stop(struct cpufreq_policy *policy) { int cpu; @@ -340,11 +399,95 @@ static int cpufreq_sched_setup(struct cpufreq_policy *policy, case CPUFREQ_GOV_STOP: return cpufreq_sched_stop(policy); case CPUFREQ_GOV_LIMITS: + cpufreq_sched_limits(policy); break; } return 0; } +/* Tunables */ +static ssize_t show_up_throttle_nsec(struct gov_data *gd, char *buf) +{ + return sprintf(buf, "%u\n", gd->up_throttle_nsec); +} + +static ssize_t store_up_throttle_nsec(struct gov_data *gd, + const char *buf, size_t count) +{ + int ret; + long unsigned int val; + + ret = kstrtoul(buf, 0, &val); + if (ret < 0) + return ret; + gd->up_throttle_nsec = val; + return count; +} + +static ssize_t show_down_throttle_nsec(struct gov_data *gd, char *buf) +{ + return sprintf(buf, "%u\n", gd->down_throttle_nsec); +} + +static ssize_t store_down_throttle_nsec(struct gov_data *gd, + const char *buf, size_t count) +{ + int ret; + long unsigned int val; + + ret = kstrtoul(buf, 0, &val); + if (ret < 0) + return ret; + gd->down_throttle_nsec = val; + return count; +} + +/* + * Create show/store routines + * - sys: One governor instance for complete SYSTEM + * - pol: One governor instance per struct cpufreq_policy + */ +#define show_gov_pol_sys(file_name) \ +static ssize_t show_##file_name##_gov_pol \ +(struct cpufreq_policy *policy, char *buf) \ +{ \ + return show_##file_name(policy->governor_data, buf); \ +} + +#define store_gov_pol_sys(file_name) \ +static ssize_t store_##file_name##_gov_pol \ +(struct cpufreq_policy *policy, const char *buf, size_t count) \ +{ \ + return store_##file_name(policy->governor_data, buf, count); \ +} + +#define gov_pol_attr_rw(_name) \ + static struct freq_attr _name##_gov_pol = \ + __ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol) + +#define show_store_gov_pol_sys(file_name) \ + show_gov_pol_sys(file_name); \ + store_gov_pol_sys(file_name) +#define tunable_handlers(file_name) \ + show_gov_pol_sys(file_name); \ + store_gov_pol_sys(file_name); \ + 
gov_pol_attr_rw(file_name) + +tunable_handlers(down_throttle_nsec); +tunable_handlers(up_throttle_nsec); + +/* Per policy governor instance */ +static struct attribute *sched_attributes_gov_pol[] = { + &up_throttle_nsec_gov_pol.attr, + &down_throttle_nsec_gov_pol.attr, + NULL, +}; + +static struct attribute_group sched_attr_group_gov_pol = { + .attrs = sched_attributes_gov_pol, + .name = "sched", +}; + #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED static #endif From 2e9abbc942e6211f9ac4dadb14debc39bb9d1418 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Thu, 14 Jul 2016 09:57:29 +0100 Subject: [PATCH 249/607] sched: EAS: take cstate into account when selecting idle core Introduce a new sysctl for this option, 'sched_cstate_aware'. When this is enabled, select_idle_sibling in CFS is modified to choose the idle CPU in the sibling group which has the lowest idle state index - idle state indexes are assumed to increase as sleep depth and hence wakeup latency increase. In this way, we attempt to minimise wakeup latency when an idle CPU is required. Signed-off-by: Srinath Sridharan Includes: sched: EAS: fix select_idle_sibling when sysctl_sched_cstate_aware is enabled, best_idle cpu will not be chosen in the original flow because it will goto done directly Bug: 30107557 Change-Id: Ie09c2e3960cafbb976f8d472747faefab3b4d6ac Signed-off-by: martin_liu --- include/linux/sched/sysctl.h | 1 + kernel/sched/fair.c | 55 +++++++++++++++++++++++++++--------- kernel/sysctl.c | 7 +++++ 3 files changed, 50 insertions(+), 13 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 4479e48c7712..7d021393b0da 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -39,6 +39,7 @@ extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; +extern unsigned int sysctl_sched_cstate_aware; enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1b12b14756ec..14c8527d7bcb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -51,6 +51,7 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; +unsigned int sysctl_sched_cstate_aware = 1; /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -5468,15 +5469,20 @@ static int select_idle_sibling(struct task_struct *p, int target) struct sched_domain *sd; struct sched_group *sg; int i = task_cpu(p); + int best_idle = -1; + int best_idle_cstate = -1; + int best_idle_capacity = INT_MAX; - if (idle_cpu(target)) - return target; + if (!sysctl_sched_cstate_aware) { + if (idle_cpu(target)) + return target; - /* - * If the prevous cpu is cache affine and idle, don't be stupid. - */ - if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) - return i; + /* + * If the prevous cpu is cache affine and idle, don't be stupid. + */ + if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) + return i; + } /* * Otherwise, iterate the domains and find an elegible idle cpu. 
@@ -5489,18 +5495,41 @@ static int select_idle_sibling(struct task_struct *p, int target) tsk_cpus_allowed(p))) goto next; - for_each_cpu(i, sched_group_cpus(sg)) { - if (i == target || !idle_cpu(i)) - goto next; - } + if (sysctl_sched_cstate_aware) { + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { + struct rq *rq = cpu_rq(i); + int idle_idx = idle_get_state_idx(rq); + unsigned long new_usage = boosted_task_util(p); + unsigned long capacity_orig = capacity_orig_of(i); + if (new_usage > capacity_orig || !idle_cpu(i)) + goto next; - target = cpumask_first_and(sched_group_cpus(sg), + if (i == target && new_usage <= capacity_curr_of(target)) + return target; + + if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) { + best_idle = i; + best_idle_cstate = idle_idx; + best_idle_capacity = capacity_orig; + } + } + } else { + for_each_cpu(i, sched_group_cpus(sg)) { + if (i == target || !idle_cpu(i)) + goto next; + } + + target = cpumask_first_and(sched_group_cpus(sg), tsk_cpus_allowed(p)); - goto done; + goto done; + } next: sg = sg->next; } while (sg != sd->groups); } + if (best_idle > 0) + target = best_idle; + done: return target; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 98c0f6ef01f1..cc0bf3ecdbda 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -303,6 +303,13 @@ static struct ctl_table kern_table[] = { .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, + { + .procname = "sched_cstate_aware", + .data = &sysctl_sched_cstate_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_wakeup_granularity_ns", .data = &sysctl_sched_wakeup_granularity, From 4a5e890ec60d2e341fa560d8149997f5f0c48d67 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 29 Jul 2016 14:04:11 +0100 Subject: [PATCH 250/607] sched/fair: add tunable to force selection at cpu granularity EAS assumes that clusters with smaller capacity cores are more energy-efficient. This may not be true on non-big-little devices, so EAS can make incorrect cluster selections when finding a CPU to wake. The "sched_is_big_little" hint can be used to cause a cpu-based selection instead of cluster-based selection. This change incorporates the addition of the sync hint enable patch EAS did not honour synchronous wakeup hints, a new sysctl is created to ask EAS to use this information when selecting a CPU. The control is called "sched_sync_hint_enable". 
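The sync-hint fast path added further below can be pictured with this standalone sketch; it is not the kernel code itself, the mask type and helpers are simplified stand-ins for cpumask_and() and cpumask_test_cpu(), and the function name is illustrative:

#include <stdbool.h>

typedef unsigned long cpumask_t;                /* simplified cpumask  */
#define cpu_bit(cpu)    (1UL << (cpu))

unsigned int sysctl_sched_sync_hint_enable = 1;

/*
 * Return the waking CPU when the sync hint applies, or -1 to fall
 * through to the normal energy-aware selection.
 */
int sync_hint_cpu(int this_cpu, cpumask_t allowed, cpumask_t online, bool sync)
{
        cpumask_t search = allowed & online;    /* cpumask_and()       */

        if (sysctl_sched_sync_hint_enable && sync &&
            (search & cpu_bit(this_cpu)))       /* cpumask_test_cpu()  */
                return this_cpu;

        return -1;
}

A synchronous wakeup means the waker is about to sleep, so keeping the wakee on the waker's CPU avoids a migration whenever that CPU is permitted for the task.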
Also contains: EAS: sched/fair: for SMP bias toward idle core with capacity For SMP devices, on wakeup bias towards idle cores that have capacity vs busy devices that need a higher OPP eas: favor idle cpus for boosted tasks BUG: 29533997 BUG: 29512132 Change-Id: I0cc9a1b1b88fb52916f18bf2d25715bdc3634f9c Signed-off-by: Juri Lelli Signed-off-by: Srinath Sridharan eas/sched/fair: Favoring busy cpus with low OPPs BUG: 29533997 BUG: 29512132 Change-Id: I9305b3239698d64278db715a2e277ea0bb4ece79 Signed-off-by: Juri Lelli --- include/linux/sched/sysctl.h | 2 + kernel/sched/fair.c | 191 +++++++++++++++++++++++++++-------- kernel/sysctl.c | 14 +++ 3 files changed, 167 insertions(+), 40 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 7d021393b0da..4883dcf3e1a9 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -39,6 +39,8 @@ extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; +extern unsigned int sysctl_sched_is_big_little; +extern unsigned int sysctl_sched_sync_hint_enable; extern unsigned int sysctl_sched_cstate_aware; enum sched_tunable_scaling { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 14c8527d7bcb..a22df661aaa6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -51,7 +51,10 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; +unsigned int sysctl_sched_is_big_little = 0; +unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_cstate_aware = 1; + /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -5534,7 +5537,97 @@ done: return target; } -static int energy_aware_wake_cpu(struct task_struct *p, int target) +static inline int find_best_target(struct task_struct *p) +{ + int i, boosted; + int target_cpu = -1; + int target_capacity = 0; + int backup_capacity = 0; + int idle_cpu = -1; + int best_idle_cstate = INT_MAX; + int backup_cpu = -1; + unsigned long task_util_boosted, new_util; + + /* + * Favor 1) busy cpu with most capacity at current OPP + * 2) idle_cpu with capacity at current OPP + * 3) busy cpu with capacity at higher OPP + */ +#ifdef CONFIG_CGROUP_SCHEDTUNE + boosted = schedtune_task_boost(p); +#else + boosted = 0; +#endif + task_util_boosted = boosted_task_util(p); + for_each_cpu(i, tsk_cpus_allowed(p)) { + int cur_capacity = capacity_curr_of(i); + struct rq *rq = cpu_rq(i); + int idle_idx = idle_get_state_idx(rq); + + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + new_util = cpu_util(i) + task_util_boosted; + + /* + * Ensure minimum capacity to grant the required boost. + * The target CPU can be already at a capacity level higher + * than the one required to boost the task. + */ + + if (new_util > capacity_orig_of(i)) + continue; + + /* + * For boosted tasks we favor idle cpus unconditionally to + * improve latency. 
+ */ + if (idle_idx >= 0 && boosted) { + if (idle_cpu < 0 || + (sysctl_sched_cstate_aware && + best_idle_cstate > idle_idx)) { + best_idle_cstate = idle_idx; + idle_cpu = i; + } + continue; + } + + if (new_util < cur_capacity) { + if (cpu_rq(i)->nr_running) { + if (target_capacity == 0 || + target_capacity > cur_capacity) { + /* busy CPU with most capacity at current OPP */ + target_cpu = i; + target_capacity = cur_capacity; + } + } else if (!boosted) { + if (idle_cpu < 0 || + (sysctl_sched_cstate_aware && + best_idle_cstate > idle_idx)) { + best_idle_cstate = idle_idx; + idle_cpu = i; + } + } + } else if (backup_capacity == 0 || + backup_capacity > cur_capacity) { + /* first busy CPU with capacity at higher OPP */ + backup_capacity = cur_capacity; + backup_cpu = i; + } + } + + if (!boosted && target_cpu < 0) { + target_cpu = idle_cpu >= 0 ? idle_cpu : backup_cpu; + } + + if (boosted && idle_cpu >= 0) + target_cpu = idle_cpu; + return target_cpu; +} + +static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) { struct sched_domain *sd; struct sched_group *sg, *sg_target; @@ -5542,6 +5635,14 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) int target_cpu = task_cpu(p); int i; + if (sysctl_sched_sync_hint_enable && sync) { + int cpu = smp_processor_id(); + cpumask_t search_cpus; + cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask); + if (cpumask_test_cpu(cpu, &search_cpus)) + return cpu; + } + sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p))); if (!sd) @@ -5550,50 +5651,60 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) sg = sd->groups; sg_target = sg; - /* - * Find group with sufficient capacity. We only get here if no cpu is - * overutilized. We may end up overutilizing a cpu by adding the task, - * but that should not be any worse than select_idle_sibling(). - * load_balance() should sort it out later as we get above the tipping - * point. - */ - do { - /* Assuming all cpus are the same in group */ - int max_cap_cpu = group_first_cpu(sg); + if (sysctl_sched_is_big_little) { /* - * Assume smaller max capacity means more energy-efficient. - * Ideally we should query the energy model for the right - * answer but it easily ends up in an exhaustive search. + * Find group with sufficient capacity. We only get here if no cpu is + * overutilized. We may end up overutilizing a cpu by adding the task, + * but that should not be any worse than select_idle_sibling(). + * load_balance() should sort it out later as we get above the tipping + * point. */ - if (capacity_of(max_cap_cpu) < target_max_cap && - task_fits_max(p, max_cap_cpu)) { - sg_target = sg; - target_max_cap = capacity_of(max_cap_cpu); - } - } while (sg = sg->next, sg != sd->groups); + do { + /* Assuming all cpus are the same in group */ + int max_cap_cpu = group_first_cpu(sg); - /* Find cpu with sufficient capacity */ - for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { + /* + * Assume smaller max capacity means more energy-efficient. + * Ideally we should query the energy model for the right + * answer but it easily ends up in an exhaustive search. 
+ */ + if (capacity_of(max_cap_cpu) < target_max_cap && + task_fits_max(p, max_cap_cpu)) { + sg_target = sg; + target_max_cap = capacity_of(max_cap_cpu); + } + } while (sg = sg->next, sg != sd->groups); + + /* Find cpu with sufficient capacity */ + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + int new_util = cpu_util(i) + boosted_task_util(p); + + if (new_util > capacity_orig_of(i)) + continue; + + if (new_util < capacity_curr_of(i)) { + target_cpu = i; + if (cpu_rq(i)->nr_running) + break; + } + + /* cpu has capacity at higher OPP, keep it as fallback */ + if (target_cpu == task_cpu(p)) + target_cpu = i; + } + } else { /* - * p's blocked utilization is still accounted for on prev_cpu - * so prev_cpu will receive a negative bias due to the double - * accounting. However, the blocked utilization may be zero. + * Find a cpu with sufficient capacity */ - int new_util = cpu_util(i) + boosted_task_util(p); - - if (new_util > capacity_orig_of(i)) - continue; - - if (new_util < capacity_curr_of(i)) { - target_cpu = i; - if (cpu_rq(i)->nr_running) - break; - } - - /* cpu has capacity at higher OPP, keep it as fallback */ - if (target_cpu == task_cpu(p)) - target_cpu = i; + int tmp_target = find_best_target(p); + if (tmp_target >= 0) + target_cpu = tmp_target; } if (target_cpu != task_cpu(p)) { @@ -5670,7 +5781,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (!sd) { if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) - new_cpu = energy_aware_wake_cpu(p, prev_cpu); + new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync); else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, new_cpu); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cc0bf3ecdbda..4671761ae3c5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -303,6 +303,20 @@ static struct ctl_table kern_table[] = { .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, + { + .procname = "sched_is_big_little", + .data = &sysctl_sched_is_big_little, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_sync_hint_enable", + .data = &sysctl_sched_sync_hint_enable, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_cstate_aware", .data = &sysctl_sched_cstate_aware, From c50cc2299cb1a9ae769c955f5dd09a9648b1600e Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 11 Mar 2016 16:44:16 -0800 Subject: [PATCH 251/607] sched/fair: add tunable to set initial task load The choice of initial task load upon fork has a large influence on CPU and OPP selection when scheduler-driven DVFS is in use. Make this tuneable by adding a new sysctl "sched_initial_task_util". If the sched governor is not used, the default remains at SCHED_LOAD_SCALE Otherwise, the value from the sysctl is used. This defaults to 0. 
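The resulting seed for a forked task's utilisation can be modelled with the standalone sketch below; SCHED_LOAD_SCALE is assumed to be 1024 here and sched_governor_in_use stands in for the kernel's sched_freq() test, so the names are illustrative rather than taken from the patch:

#define SCHED_LOAD_SCALE        1024UL          /* assumed value */

unsigned long sysctl_sched_initial_task_util;   /* new sysctl, defaults to 0 */

/* util_avg seeded into a newly forked task's load tracking. */
unsigned long initial_task_util(int sched_governor_in_use)
{
        /*
         * With scheduler-driven DVFS the tunable decides the starting
         * point; otherwise the historical full-scale default is kept.
         */
        return sched_governor_in_use ? sysctl_sched_initial_task_util
                                     : SCHED_LOAD_SCALE;
}

Starting new tasks at 0 lets them earn utilisation before they can pull the OPP up, at the price of a slower ramp for tasks that turn out to be heavy.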
Signed-off-by: "Todd Kjos " --- include/linux/sched/sysctl.h | 1 + kernel/sched/fair.c | 5 ++++- kernel/sysctl.c | 7 +++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 4883dcf3e1a9..2834841c507e 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -41,6 +41,7 @@ extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_is_big_little; extern unsigned int sysctl_sched_sync_hint_enable; +extern unsigned int sysctl_sched_initial_task_util; extern unsigned int sysctl_sched_cstate_aware; enum sched_tunable_scaling { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a22df661aaa6..88aca75a6659 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -53,6 +53,7 @@ unsigned int normalized_sysctl_sched_latency = 6000000ULL; unsigned int sysctl_sched_is_big_little = 0; unsigned int sysctl_sched_sync_hint_enable = 1; +unsigned int sysctl_sched_initial_task_util = 0; unsigned int sysctl_sched_cstate_aware = 1; /* @@ -687,7 +688,9 @@ void init_entity_runnable_average(struct sched_entity *se) sa->period_contrib = 1023; sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); + sa->util_avg = sched_freq() ? + sysctl_sched_initial_task_util : + scale_load_down(SCHED_LOAD_SCALE); sa->util_sum = sa->util_avg * LOAD_AVG_MAX; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4671761ae3c5..4457e107ec20 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -317,6 +317,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "sched_initial_task_util", + .data = &sysctl_sched_initial_task_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_cstate_aware", .data = &sysctl_sched_cstate_aware, From 28e8cb961c2f71a89e391335a9304c3dd8b38d8f Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 28 Jul 2016 16:39:27 +0100 Subject: [PATCH 252/607] FIX: sched/tune: update usage of boosted task utilisation on CPU selection A boosted task needs to be scheduled on a CPU which can grant a minimum capacity which is higher than its utilization. However, a task can be allocated on a CPU which already provides an utilization which is higher than the task boosted utilization itself. Moreover, with the previous approach a task 100% boosted is not fitting any CPU. This patch makes use of the boosted task utilization just as a threashold which defines the minimum capacity should be available on a CPU to host that task. 
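Read as a standalone sketch, with plain parameters in place of cpu_util(), boosted_task_util() and capacity_orig_of(), the check this converges on is:

/*
 * The boosted utilisation acts purely as a capacity threshold: a CPU is
 * rejected only when its original capacity cannot cover its current
 * utilisation plus the task's boosted utilisation.
 */
int cpu_can_host_boosted_task(unsigned long cpu_util,
                              unsigned long task_util_boosted,
                              unsigned long capacity_orig)
{
        unsigned long new_util = cpu_util + task_util_boosted;

        return new_util <= capacity_orig;
}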
Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 88aca75a6659..9e1935b6e340 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5636,6 +5636,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) struct sched_group *sg, *sg_target; int target_max_cap = INT_MAX; int target_cpu = task_cpu(p); + unsigned long task_util_boosted, new_util; int i; if (sysctl_sched_sync_hint_enable && sync) { @@ -5679,6 +5680,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) } } while (sg = sg->next, sg != sd->groups); + task_util_boosted = boosted_task_util(p); /* Find cpu with sufficient capacity */ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { /* @@ -5686,8 +5688,13 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) * so prev_cpu will receive a negative bias due to the double * accounting. However, the blocked utilization may be zero. */ - int new_util = cpu_util(i) + boosted_task_util(p); + new_util = cpu_util(i) + task_util_boosted; + /* + * Ensure minimum capacity to grant the required boost. + * The target CPU can be already at a capacity level higher + * than the one required to boost the task. + */ if (new_util > capacity_orig_of(i)) continue; From 6ba071d89dd72b080b9f0e4abf587cad99d5320b Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 29 Jul 2016 15:45:57 +0100 Subject: [PATCH 253/607] FIX: sched/tune: move schedtune_nornalize_energy into fair.c The energy normalization function is required to get the proper values for the P-E space filtering function to work. That normalization is part of the hot wakeup path and currently implemented with a function call. Moving the normalization function into fair.c allows the compiler to further optimize that code by reducing overheads in the wakeup hot path. Signed-off-by: Patrick Bellasi [jstultz: fwdported to 4.4] Signed-off-by: John Stultz --- kernel/sched/fair.c | 121 ++++++++++++++++++++++++++++---------------- kernel/sched/tune.c | 42 +-------------- kernel/sched/tune.h | 12 ++++- 3 files changed, 91 insertions(+), 84 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9e1935b6e340..2b23dfefe6f1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4975,44 +4975,6 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu) return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); } -#ifdef CONFIG_SCHED_TUNE -static int energy_diff_evaluate(struct energy_env *eenv) -{ - unsigned int boost; - int nrg_delta; - - /* Return energy diff when boost margin is 0 */ -#ifdef CONFIG_CGROUP_SCHEDTUNE - boost = schedtune_task_boost(eenv->task); -#else - boost = get_sysctl_sched_cfs_boost(); -#endif - if (boost == 0) - return eenv->nrg.diff; - - /* Compute normalized energy diff */ - nrg_delta = schedtune_normalize_energy(eenv->nrg.diff); - eenv->nrg.delta = nrg_delta; - - eenv->payoff = schedtune_accept_deltas( - eenv->nrg.delta, - eenv->cap.delta, - eenv->task); - - /* - * When SchedTune is enabled, the energy_diff() function will return - * the computed energy payoff value. 
Since the energy_diff() return - * value is expected to be negative by its callers, this evaluation - * function return a negative value each time the evaluation return a - * positive payoff, which is the condition for the acceptance of - * a scheduling decision - */ - return -eenv->payoff; -} -#else /* CONFIG_SCHED_TUNE */ -#define energy_diff_evaluate(eenv) eenv->nrg.diff -#endif - /* * energy_diff(): Estimate the energy impact of changing the utilization * distribution. eenv specifies the change: utilisation amount, source, and @@ -5020,12 +4982,11 @@ static int energy_diff_evaluate(struct energy_env *eenv) * utilization is removed from or added to the system (e.g. task wake-up). If * both are specified, the utilization is migrated. */ -static int energy_diff(struct energy_env *eenv) +static inline int __energy_diff(struct energy_env *eenv) { struct sched_domain *sd; struct sched_group *sg; int sd_cpu = -1, energy_before = 0, energy_after = 0; - int result; struct energy_env eenv_before = { .util_delta = 0, @@ -5069,17 +5030,91 @@ static int energy_diff(struct energy_env *eenv) eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; eenv->payoff = 0; - result = energy_diff_evaluate(eenv); - trace_sched_energy_diff(eenv->task, eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, eenv->cap.before, eenv->cap.after, eenv->cap.delta, eenv->nrg.delta, eenv->payoff); - return result; + return eenv->nrg.diff; } +#ifdef CONFIG_SCHED_TUNE + +struct target_nrg schedtune_target_nrg; + +/* + * System energy normalization + * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], + * corresponding to the specified energy variation. + */ +static inline int +normalize_energy(int energy_diff) +{ + u32 normalized_nrg; +#ifdef CONFIG_SCHED_DEBUG + int max_delta; + + /* Check for boundaries */ + max_delta = schedtune_target_nrg.max_power; + max_delta -= schedtune_target_nrg.min_power; + WARN_ON(abs(energy_diff) >= max_delta); +#endif + + /* Do scaling using positive numbers to increase the range */ + normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; + + /* Scale by energy magnitude */ + normalized_nrg <<= SCHED_LOAD_SHIFT; + + /* Normalize on max energy for target platform */ + normalized_nrg = reciprocal_divide( + normalized_nrg, schedtune_target_nrg.rdiv); + + return (energy_diff < 0) ? -normalized_nrg : normalized_nrg; +} + +static inline int +energy_diff(struct energy_env *eenv) +{ + unsigned int boost; + int nrg_delta; + + /* Conpute "absolute" energy diff */ + __energy_diff(eenv); + + /* Return energy diff when boost margin is 0 */ +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_task_boost(eenv->task); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif + if (boost == 0) + return eenv->nrg.diff; + + /* Compute normalized energy diff */ + nrg_delta = normalize_energy(eenv->nrg.diff); + eenv->nrg.delta = nrg_delta; + + eenv->payoff = schedtune_accept_deltas( + eenv->nrg.delta, + eenv->cap.delta, + eenv->task); + + /* + * When SchedTune is enabled, the energy_diff() function will return + * the computed energy payoff value. 
Since the energy_diff() return + * value is expected to be negative by its callers, this evaluation + * function return a negative value each time the evaluation return a + * positive payoff, which is the condition for the acceptance of + * a scheduling decision + */ + return -eenv->payoff; +} +#else /* CONFIG_SCHED_TUNE */ +#define energy_diff(eenv) __energy_diff(eenv) +#endif + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * A waker of many should wake a different task than the one last awakened diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index b40d40dc3c49..8ca8db2de818 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -3,24 +3,17 @@ #include #include #include -#include #include #include #include #include "sched.h" +#include "tune.h" unsigned int sysctl_sched_cfs_boost __read_mostly; -/* - * System energy normalization constants - */ -static struct target_nrg { - unsigned long min_power; - unsigned long max_power; - struct reciprocal_value rdiv; -} schedtune_target_nrg; +extern struct target_nrg schedtune_target_nrg; /* Performance Boost region (B) threshold params */ static int perf_boost_idx; @@ -587,37 +580,6 @@ sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, return 0; } -/* - * System energy normalization - * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], - * corresponding to the specified energy variation. - */ -int -schedtune_normalize_energy(int energy_diff) -{ - u32 normalized_nrg; - int max_delta; - -#ifdef CONFIG_SCHED_DEBUG - /* Check for boundaries */ - max_delta = schedtune_target_nrg.max_power; - max_delta -= schedtune_target_nrg.min_power; - WARN_ON(abs(energy_diff) >= max_delta); -#endif - - /* Do scaling using positive numbers to increase the range */ - normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; - - /* Scale by energy magnitude */ - normalized_nrg <<= SCHED_LOAD_SHIFT; - - /* Normalize on max energy for target platform */ - normalized_nrg = reciprocal_divide( - normalized_nrg, schedtune_target_nrg.rdiv); - - return (energy_diff < 0) ? -normalized_nrg : normalized_nrg; -} - #ifdef CONFIG_SCHED_DEBUG static void schedtune_test_nrg(unsigned long delta_pwr) diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index f7273a5d994a..7d2aa7951554 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -1,6 +1,17 @@ #ifdef CONFIG_SCHED_TUNE +#include + +/* + * System energy normalization constants + */ +struct target_nrg { + unsigned long min_power; + unsigned long max_power; + struct reciprocal_value rdiv; +}; + #ifdef CONFIG_CGROUP_SCHEDTUNE int schedtune_cpu_boost(int cpu); @@ -25,7 +36,6 @@ int schedtune_accept_deltas(int nrg_delta, int cap_delta, #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) -#define schedtune_normalize_energy(energy) energy #define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta #endif /* CONFIG_SCHED_TUNE */ From 00aae8d5d5cd6f28d7603e0c1c4ac5cf91cb4aa3 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Thu, 28 Jul 2016 17:28:55 +0100 Subject: [PATCH 254/607] sched/tune: Add support for negative boost values Change-Id: I164ee04ba98c3a776605f18cb65ee61b3e917939 Contains also: eas/stune: schedtune cpu boost_max must be non-negative. This is to avoid under-accounting cpu capacity which may cause task stacking and frequency spikes. 
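The extended margin computation can be condensed into the standalone sketch below; plain division by 100 replaces the reciprocal-multiply fast path used in the hunks (margin *= 1311; margin >>= 17) and SCHED_LOAD_SCALE is assumed to be 1024:

#define SCHED_LOAD_SCALE        1024LL          /* assumed value */

/*
 * Signal Proportional Compensation with negative boost support:
 *   M = B * (SCHED_LOAD_SCALE - S) / 100   if B >= 0
 *   M = B * S / 100                        if B <  0
 * A negative boost therefore yields a negative margin that shrinks the
 * boosted signal instead of inflating it.
 */
long schedtune_margin_sketch(unsigned long signal, long boost)
{
        long long margin;

        if (boost >= 0)
                margin = (SCHED_LOAD_SCALE - (long long)signal) * boost;
        else
                margin = -(long long)signal * boost;    /* positive intermediate */

        margin /= 100;

        return boost < 0 ? -margin : margin;
}

Clamping each CPU's boost_max at zero, as the second half of the change does, keeps a CPU whose runnable tasks are all negatively boosted from under-reporting its capacity.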
Change-Id: Ie1c1cbd52a6edb77b4c15a830030aa748dff6f29 --- include/trace/events/sched.h | 20 +++++++++---------- kernel/sched/fair.c | 37 ++++++++++++++++++++---------------- kernel/sched/tune.c | 25 +++++++++++++++--------- 3 files changed, 47 insertions(+), 35 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 07ed295d0f23..7dc39dd384de 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -682,14 +682,14 @@ TRACE_EVENT(sched_tune_config, */ TRACE_EVENT(sched_boost_cpu, - TP_PROTO(int cpu, unsigned long util, unsigned long margin), + TP_PROTO(int cpu, unsigned long util, long margin), TP_ARGS(cpu, util, margin), TP_STRUCT__entry( __field( int, cpu ) __field( unsigned long, util ) - __field( unsigned long, margin ) + __field(long, margin ) ), TP_fast_assign( @@ -698,7 +698,7 @@ TRACE_EVENT(sched_boost_cpu, __entry->margin = margin; ), - TP_printk("cpu=%d util=%lu margin=%lu", + TP_printk("cpu=%d util=%lu margin=%ld", __entry->cpu, __entry->util, __entry->margin) @@ -710,7 +710,7 @@ TRACE_EVENT(sched_boost_cpu, TRACE_EVENT(sched_tune_tasks_update, TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx, - unsigned int boost, unsigned int max_boost), + int boost, int max_boost), TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost), @@ -720,8 +720,8 @@ TRACE_EVENT(sched_tune_tasks_update, __field( int, cpu ) __field( int, tasks ) __field( int, idx ) - __field( unsigned int, boost ) - __field( unsigned int, max_boost ) + __field( int, boost ) + __field( int, max_boost ) ), TP_fast_assign( @@ -735,7 +735,7 @@ TRACE_EVENT(sched_tune_tasks_update, ), TP_printk("pid=%d comm=%s " - "cpu=%d tasks=%d idx=%d boost=%u max_boost=%u", + "cpu=%d tasks=%d idx=%d boost=%d max_boost=%d", __entry->pid, __entry->comm, __entry->cpu, __entry->tasks, __entry->idx, __entry->boost, __entry->max_boost) @@ -771,7 +771,7 @@ TRACE_EVENT(sched_tune_boostgroup_update, */ TRACE_EVENT(sched_boost_task, - TP_PROTO(struct task_struct *tsk, unsigned long util, unsigned long margin), + TP_PROTO(struct task_struct *tsk, unsigned long util, long margin), TP_ARGS(tsk, util, margin), @@ -779,7 +779,7 @@ TRACE_EVENT(sched_boost_task, __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( unsigned long, util ) - __field( unsigned long, margin ) + __field( long, margin ) ), @@ -790,7 +790,7 @@ TRACE_EVENT(sched_boost_task, __entry->margin = margin; ), - TP_printk("comm=%s pid=%d util=%lu margin=%lu", + TP_printk("comm=%s pid=%d util=%lu margin=%ld", __entry->comm, __entry->pid, __entry->util, __entry->margin) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2b23dfefe6f1..aef41d9acd83 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5250,22 +5250,25 @@ static bool cpu_overutilized(int cpu) #ifdef CONFIG_SCHED_TUNE -static unsigned long -schedtune_margin(unsigned long signal, unsigned long boost) +static long +schedtune_margin(unsigned long signal, long boost) { - unsigned long long margin = 0; + long long margin = 0; /* * Signal proportional compensation (SPC) * * The Boost (B) value is used to compute a Margin (M) which is * proportional to the complement of the original Signal (S): - * M = B * (SCHED_LOAD_SCALE - S) + * M = B * (SCHED_LOAD_SCALE - S), if B is positive + * M = B * S, if B is negative * The obtained M could be used by the caller to "boost" S. 
*/ - margin = SCHED_LOAD_SCALE - signal; - margin *= boost; - + if (boost >= 0) { + margin = SCHED_LOAD_SCALE - signal; + margin *= boost; + } else + margin = -signal * boost; /* * Fast integer division by constant: * Constant : (C) = 100 @@ -5281,13 +5284,15 @@ schedtune_margin(unsigned long signal, unsigned long boost) margin *= 1311; margin >>= 17; + if (boost < 0) + margin *= -1; return margin; } -static inline unsigned int +static inline int schedtune_cpu_margin(unsigned long util, int cpu) { - unsigned int boost; + int boost; #ifdef CONFIG_CGROUP_SCHEDTUNE boost = schedtune_cpu_boost(cpu); @@ -5300,12 +5305,12 @@ schedtune_cpu_margin(unsigned long util, int cpu) return schedtune_margin(util, boost); } -static inline unsigned long +static inline long schedtune_task_margin(struct task_struct *task) { - unsigned int boost; + int boost; unsigned long util; - unsigned long margin; + long margin; #ifdef CONFIG_CGROUP_SCHEDTUNE boost = schedtune_task_boost(task); @@ -5323,13 +5328,13 @@ schedtune_task_margin(struct task_struct *task) #else /* CONFIG_SCHED_TUNE */ -static inline unsigned int +static inline int schedtune_cpu_margin(unsigned long util, int cpu) { return 0; } -static inline unsigned int +static inline int schedtune_task_margin(struct task_struct *task) { return 0; @@ -5341,7 +5346,7 @@ static inline unsigned long boosted_cpu_util(int cpu) { unsigned long util = cpu_util(cpu); - unsigned long margin = schedtune_cpu_margin(util, cpu); + long margin = schedtune_cpu_margin(util, cpu); trace_sched_boost_cpu(cpu, util, margin); @@ -5352,7 +5357,7 @@ static inline unsigned long boosted_task_util(struct task_struct *task) { unsigned long util = task_util(task); - unsigned long margin = schedtune_task_margin(task); + long margin = schedtune_task_margin(task); trace_sched_boost_task(task, util, margin); diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 8ca8db2de818..afc4a7747161 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -213,10 +213,11 @@ static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = { */ struct boost_groups { /* Maximum boost value for all RUNNABLE tasks on a CPU */ - unsigned boost_max; + bool idle; + int boost_max; struct { /* The boost for tasks on that boost group */ - unsigned boost; + int boost; /* Count of RUNNABLE tasks on that boost group */ unsigned tasks; } group[BOOSTGROUPS_COUNT]; @@ -229,7 +230,7 @@ static void schedtune_cpu_update(int cpu) { struct boost_groups *bg; - unsigned boost_max; + int boost_max; int idx; bg = &per_cpu(cpu_boost_groups, cpu); @@ -243,9 +244,13 @@ schedtune_cpu_update(int cpu) */ if (bg->group[idx].tasks == 0) continue; + boost_max = max(boost_max, bg->group[idx].boost); } - + /* Ensures boost_max is non-negative when all cgroup boost values + * are neagtive. 
Avoids under-accounting of cpu capacity which may cause + * task stacking and frequency spikes.*/ + boost_max = max(boost_max, 0); bg->boost_max = boost_max; } @@ -391,7 +396,7 @@ int schedtune_task_boost(struct task_struct *p) return task_boost; } -static u64 +static s64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { struct schedtune *st = css_st(css); @@ -401,11 +406,13 @@ boost_read(struct cgroup_subsys_state *css, struct cftype *cft) static int boost_write(struct cgroup_subsys_state *css, struct cftype *cft, - u64 boost) + s64 boost) { struct schedtune *st = css_st(css); + unsigned threshold_idx; + int boost_pct; - if (boost < 0 || boost > 100) + if (boost < -100 || boost > 100) return -EINVAL; st->boost = boost; @@ -423,8 +430,8 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, static struct cftype files[] = { { .name = "boost", - .read_u64 = boost_read, - .write_u64 = boost_write, + .read_s64 = boost_read, + .write_s64 = boost_write, }, { } /* terminate */ }; From 254f5095a8143f0b852c5db7672957cf3b4d65d3 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 28 Jul 2016 17:38:25 +0100 Subject: [PATCH 255/607] FIXUP: sched/tune: fix payoff calculation for boost region The definition of the acceptance regions as well as the translation of these regions into a payoff value was both wrong which turned out in: a) a wrong definition of payoff for the performance boost region b) a correct "by chance" definition of the payoff for the performance constraint region (i.e. two sign errors together fixing the formula) This patch provides a better description of the cut regions as well as a fixed version of the payoff computations, which are now reduced to a single formula usable for both cases. Reported-by: Leo Yan Reviewed-by: Leo Yan Signed-off-by: Leo Yan Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 79 +++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index afc4a7747161..6d5fbde9c70e 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -51,50 +51,51 @@ __schedtune_accept_deltas(int nrg_delta, int cap_delta, int perf_boost_idx, int perf_constrain_idx) { int payoff = -INT_MAX; + int gain_idx = -1; /* Performance Boost (B) region */ - if (nrg_delta > 0 && cap_delta > 0) { - /* - * Evaluate "Performance Boost" vs "Energy Increase" - * payoff criteria: - * cap_delta / nrg_delta < cap_gain / nrg_gain - * which is: - * nrg_delta * cap_gain > cap_delta * nrg_gain - */ - payoff = nrg_delta * threshold_gains[perf_boost_idx].cap_gain; - payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain; - - trace_sched_tune_filter( - nrg_delta, cap_delta, - threshold_gains[perf_boost_idx].nrg_gain, - threshold_gains[perf_boost_idx].cap_gain, - payoff, 8); - - return payoff; - } - + if (nrg_delta >= 0 && cap_delta > 0) + gain_idx = perf_boost_idx; /* Performance Constraint (C) region */ - if (nrg_delta < 0 && cap_delta < 0) { - /* - * Evaluate "Performance Boost" vs "Energy Increase" - * payoff criteria: - * cap_delta / nrg_delta > cap_gain / nrg_gain - * which is: - * cap_delta * nrg_gain > nrg_delta * cap_gain - */ - payoff = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain; - payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain; - - trace_sched_tune_filter( - nrg_delta, cap_delta, - threshold_gains[perf_constrain_idx].nrg_gain, - threshold_gains[perf_constrain_idx].cap_gain, - payoff, 6); - - return payoff; - } + else if 
(nrg_delta < 0 && cap_delta <= 0) + gain_idx = perf_constrain_idx; /* Default: reject schedule candidate */ + if (gain_idx == -1) + return payoff; + + /* + * Evaluate "Performance Boost" vs "Energy Increase" + * + * - Performance Boost (B) region + * + * Condition: nrg_delta > 0 && cap_delta > 0 + * Payoff criteria: + * cap_gain / nrg_gain < cap_delta / nrg_delta = + * cap_gain * nrg_delta < cap_delta * nrg_gain + * Note that since both nrg_gain and nrg_delta are positive, the + * inequality does not change. Thus: + * + * payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta) + * + * - Performance Constraint (C) region + * + * Condition: nrg_delta < 0 && cap_delta < 0 + * payoff criteria: + * cap_gain / nrg_gain > cap_delta / nrg_delta = + * cap_gain * nrg_delta < cap_delta * nrg_gain + * Note that since nrg_gain > 0 while nrg_delta < 0, the + * inequality change. Thus: + * + * payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta) + * + * This means that, in case of same positive defined {cap,nrg}_gain + * for both the B and C regions, we can use the same payoff formula + * where a positive value represents the accept condition. + */ + payoff = cap_delta * threshold_gains[gain_idx].nrg_gain; + payoff -= nrg_delta * threshold_gains[gain_idx].cap_gain; + return payoff; } From 274bbcfbe44127c5575ed847d48d201a66cc29ce Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 28 Jul 2016 17:42:36 +0100 Subject: [PATCH 256/607] sched/{fair,tune}: simplify fair.c code The usage of conditional compiled code is discouraged in fair.c. This patch clean up a bit fair.c by moving schedtune_{cpu.task}_boost definitions into tune.h. Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 21 +++------------------ kernel/sched/tune.h | 6 ++++++ 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aef41d9acd83..6390b4cb916c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5077,18 +5077,13 @@ normalize_energy(int energy_diff) static inline int energy_diff(struct energy_env *eenv) { - unsigned int boost; + int boost = schedtune_task_boost(eenv->task); int nrg_delta; /* Conpute "absolute" energy diff */ __energy_diff(eenv); /* Return energy diff when boost margin is 0 */ -#ifdef CONFIG_CGROUP_SCHEDTUNE - boost = schedtune_task_boost(eenv->task); -#else - boost = get_sysctl_sched_cfs_boost(); -#endif if (boost == 0) return eenv->nrg.diff; @@ -5292,13 +5287,8 @@ schedtune_margin(unsigned long signal, long boost) static inline int schedtune_cpu_margin(unsigned long util, int cpu) { - int boost; + int boost = schedtune_cpu_boost(cpu); -#ifdef CONFIG_CGROUP_SCHEDTUNE - boost = schedtune_cpu_boost(cpu); -#else - boost = get_sysctl_sched_cfs_boost(); -#endif if (boost == 0) return 0; @@ -5308,15 +5298,10 @@ schedtune_cpu_margin(unsigned long util, int cpu) static inline long schedtune_task_margin(struct task_struct *task) { - int boost; + int boost = schedtune_task_boost(task); unsigned long util; long margin; -#ifdef CONFIG_CGROUP_SCHEDTUNE - boost = schedtune_task_boost(task); -#else - boost = get_sysctl_sched_cfs_boost(); -#endif if (boost == 0) return 0; diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index 7d2aa7951554..99637758a8af 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -22,6 +22,9 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu); #else /* CONFIG_CGROUP_SCHEDTUNE */ +#define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost() +#define schedtune_task_boost(tsk) get_sysctl_sched_cfs_boost() 
+ #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) @@ -33,6 +36,9 @@ int schedtune_accept_deltas(int nrg_delta, int cap_delta, #else /* CONFIG_SCHED_TUNE */ +#define schedtune_cpu_boost(cpu) 0 +#define schedtune_task_boost(tsk) 0 + #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) From 7f8f24a0eafe312899818cc07af8e9f1b33d9c39 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 29 Jul 2016 15:19:41 +0100 Subject: [PATCH 257/607] sched/tune: use a single initialisation function With the introduction of initialization function required to compute the energy normalization constants from DTB at boot time, we have now a late_initcall which is already used by SchedTune. This patch consolidate within that function the other initialization bits which was previously deferred to the first CGroup creation. Signed-off-by: Patrick Bellasi [jstultz: fwdported to 4.4] Signed-off-by: John Stultz --- kernel/sched/tune.c | 50 +++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 6d5fbde9c70e..a691b8db2888 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -410,8 +410,6 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, s64 boost) { struct schedtune *st = css_st(css); - unsigned threshold_idx; - int boost_pct; if (boost < -100 || boost > 100) return -EINVAL; @@ -456,33 +454,14 @@ schedtune_boostgroup_init(struct schedtune *st) return 0; } -static int -schedtune_init(void) -{ - struct boost_groups *bg; - int cpu; - - /* Initialize the per CPU boost groups */ - for_each_possible_cpu(cpu) { - bg = &per_cpu(cpu_boost_groups, cpu); - memset(bg, 0, sizeof(struct boost_groups)); - } - - pr_info(" schedtune configured to support %d boost groups\n", - BOOSTGROUPS_COUNT); - return 0; -} - static struct cgroup_subsys_state * schedtune_css_alloc(struct cgroup_subsys_state *parent_css) { struct schedtune *st; int idx; - if (!parent_css) { - schedtune_init(); + if (!parent_css) return &root_schedtune.css; - } /* Allow only single level hierachies */ if (parent_css != &root_schedtune.css) { @@ -543,6 +522,22 @@ struct cgroup_subsys schedtune_cgrp_subsys = { .early_init = 1, }; +static inline void +schedtune_init_cgroups(void) +{ + struct boost_groups *bg; + int cpu; + + /* Initialize the per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + memset(bg, 0, sizeof(struct boost_groups)); + } + + pr_info("schedtune: configured to support %d boost groups\n", + BOOSTGROUPS_COUNT); +} + #else /* CONFIG_CGROUP_SCHEDTUNE */ int @@ -690,7 +685,7 @@ schedtune_add_cluster_nrg( * that bind the EM to the topology information. 
*/ static int -schedtune_init_late(void) +schedtune_init(void) { struct target_nrg *ste = &schedtune_target_nrg; unsigned long delta_pwr = 0; @@ -730,10 +725,17 @@ schedtune_init_late(void) ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2); schedtune_test_nrg(delta_pwr); + +#ifdef CONFIG_CGROUP_SCHEDTUNE + schedtune_init_cgroups(); +#else + pr_info("schedtune: configured to support global boosting only\n"); +#endif + return 0; nodata: rcu_read_unlock(); return -EINVAL; } -late_initcall(schedtune_init_late); +late_initcall(schedtune_init); From af14760e19ebce67e8913fc0efd32cf839282a95 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 28 Jul 2016 18:44:40 +0100 Subject: [PATCH 258/607] FIXUP: sched/tune: fix accounting for runnable tasks Contains: sched/tune: fix accounting for runnable tasks (1/5) The accounting for tasks into boost groups of different CPUs is currently broken mainly because: a) we do not properly track the change of boost group of a RUNNABLE task b) there are race conditions between migration code and accounting code This patch provides a fixes to ensure enqueue/dequeue accounting also for throttled tasks. Without this patch is can happen that a task is enqueued into a throttled RQ thus not being accounted for the boosting of the corresponding RQ. We could argue that a throttled task should not boost a CPU, however: a) properly implementing CPU boosting considering throttled tasks will increase a lot the complexity of the solution b) it's not easy to quantify the benefits introduced by such a more complex solution Since task throttling requires the usage of the CFS bandwidth controller, which is not widely used on mobile systems (at least not by Android kernels so far), for the time being we go for the simple solution and boost also for throttled RQs. sched/tune: fix accounting for runnable tasks (2/5) This patch provides the code required to enforce proper locking. A per boost group spinlock has been added to grant atomic accounting of tasks as well as to serialise enqueue/dequeue operations, triggered by tasks migrations, with cgroups's attach/detach operations. sched/tune: fix accounting for runnable tasks (3/5) This patch adds cgroups {allow,can,cancel}_attach callbacks. Since a task can be migrated between boost groups while it's running, the CGroups's attach callbacks have been added to properly migrate boost contributions of RUNNABLE tasks. The RQ's lock is used to serialise enqueue/dequeue operations, triggered by tasks migrations, with cgroups's attach/detach operations. While the SchedTune's CPU lock is used to grant atrocity of the accounting within the CPU. NOTE: the current implementation does not allows a concurrent CPU migration and CGroups change. sched/tune: fix accounting for runnable tasks (4/5) This fixes accounting for exiting tasks by adding a dedicated call early in the do_exit() syscall, which disables SchedTune accounting as soon as a task is flagged PF_EXITING. This flag is set before the multiple dequeue/enqueue dance triggered by cgroup_exit() which is useful only to inject useless tasks movements thus increasing possibilities for race conditions with the migration code. The schedtune_exit_task() call does the last dequeue of a task from its current boost group. This is a solution more aligned with what happens in mainline kernels (>v4.4) where the exit_cgroup does not move anymore a dying task to the root control group. 
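The per-CPU accounting these fixes converge on can be reduced to the standalone sketch below; the per-CPU spinlock, the RCU section and the cgroup lookup from the real hunks are omitted, and the struct and function names are illustrative only:

#define ENQUEUE_TASK     1
#define DEQUEUE_TASK    -1

struct boost_group_sketch {
        int boost;      /* boost value configured for this group      */
        int tasks;      /* RUNNABLE tasks of this group on this CPU   */
};

/* Returns 1 when the CPU's boost_max needs to be recomputed. */
int tasks_update_sketch(struct boost_group_sketch *grp, int task_count)
{
        int tasks = grp->tasks + task_count;

        /* Never let the counter go negative, e.g. on a spurious dequeue. */
        grp->tasks = tasks > 0 ? tasks : 0;

        /* Group just became active (1) or idle (0): boost_max may change. */
        return tasks == 1 || tasks == 0;
}

Enqueue passes ENQUEUE_TASK, dequeue and the new schedtune_exit_task() path pass DEQUEUE_TASK, so a dying task drops its contribution exactly once.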
sched/tune: fix accounting for runnable tasks (5/5) To avoid accounting issues at startup, this patch disable the SchedTune accounting until the required data structures have been properly initialized. Signed-off-by: Patrick Bellasi [jstultz: fwdported to 4.4] Signed-off-by: John Stultz --- kernel/exit.c | 5 ++ kernel/sched/core.c | 12 +++ kernel/sched/fair.c | 11 ++- kernel/sched/sched.h | 3 + kernel/sched/tune.c | 182 ++++++++++++++++++++++++++++++++++++++----- kernel/sched/tune.h | 6 ++ 6 files changed, 195 insertions(+), 24 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 07110c6020a0..92ff63200287 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -54,6 +54,8 @@ #include #include +#include "sched/tune.h" + #include #include #include @@ -699,6 +701,9 @@ void do_exit(long code) } exit_signals(tsk); /* sets PF_EXITING */ + + schedtune_exit_task(tsk); + /* * tsk->flags are checked in the futex code to protect against * an exiting task cleaning up the robust pi futexes. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 42c45ccc1f62..f56a528c8ae7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -287,6 +287,18 @@ int sysctl_sched_rt_runtime = 950000; /* cpus with isolated domains */ cpumask_var_t cpu_isolated_map; +struct rq * +lock_rq_of(struct task_struct *p, unsigned long *flags) +{ + return task_rq_lock(p, flags); +} + +void +unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags) +{ + task_rq_unlock(rq, p, flags); +} + /* * this_rq_lock - lock this runqueue and disable interrupts. */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6390b4cb916c..a26f40cb7fd0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4234,8 +4234,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; - schedtune_enqueue_task(p, cpu_of(rq)); - /* * We want to potentially trigger a freq switch * request only for tasks that are waking up; this is @@ -4246,6 +4244,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (task_new || task_wakeup) update_capacity_of(cpu_of(rq)); } + + /* Update SchedTune accouting */ + schedtune_enqueue_task(p, cpu_of(rq)); + #endif /* CONFIG_SMP */ hrtick_update(rq); @@ -4311,7 +4313,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP if (!se) { - schedtune_dequeue_task(p, cpu_of(rq)); /* * We want to potentially trigger a freq switch @@ -4329,6 +4330,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } } + /* Update SchedTune accouting */ + schedtune_dequeue_task(p, cpu_of(rq)); + #endif /* CONFIG_SMP */ hrtick_update(rq); @@ -5604,7 +5608,6 @@ static inline int find_best_target(struct task_struct *p) * The target CPU can be already at a capacity level higher * than the one required to boost the task. 
*/ - if (new_util > capacity_orig_of(i)) continue; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a06c19c48057..144d4782a455 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1701,6 +1701,9 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } +extern struct rq *lock_rq_of(struct task_struct *p, unsigned long *flags); +extern void unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags); + #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index a691b8db2888..4c77cc23e65b 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -11,6 +11,10 @@ #include "sched.h" #include "tune.h" +#ifdef CONFIG_CGROUP_SCHEDTUNE +static bool schedtune_initialized = false; +#endif + unsigned int sysctl_sched_cfs_boost __read_mostly; extern struct target_nrg schedtune_target_nrg; @@ -222,6 +226,8 @@ struct boost_groups { /* Count of RUNNABLE tasks on that boost group */ unsigned tasks; } group[BOOSTGROUPS_COUNT]; + /* CPU's boost group locking */ + raw_spinlock_t lock; }; /* Boost groups affecting each CPU in the system */ @@ -298,28 +304,24 @@ schedtune_boostgroup_update(int idx, int boost) return 0; } +#define ENQUEUE_TASK 1 +#define DEQUEUE_TASK -1 + static inline void schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) { - struct boost_groups *bg; - int tasks; - - bg = &per_cpu(cpu_boost_groups, cpu); + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + int tasks = bg->group[idx].tasks + task_count; /* Update boosted tasks count while avoiding to make it negative */ - if (task_count < 0 && bg->group[idx].tasks <= -task_count) - bg->group[idx].tasks = 0; - else - bg->group[idx].tasks += task_count; - - /* Boost group activation or deactivation on that RQ */ - tasks = bg->group[idx].tasks; - if (tasks == 1 || tasks == 0) - schedtune_cpu_update(cpu); + bg->group[idx].tasks = max(0, tasks); trace_sched_tune_tasks_update(p, cpu, tasks, idx, bg->group[idx].boost, bg->boost_max); + /* Boost group activation or deactivation on that RQ */ + if (tasks == 1 || tasks == 0) + schedtune_cpu_update(cpu); } /* @@ -327,9 +329,14 @@ schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) */ void schedtune_enqueue_task(struct task_struct *p, int cpu) { + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + unsigned long irq_flags; struct schedtune *st; int idx; + if (!unlikely(schedtune_initialized)) + return; + /* * When a task is marked PF_EXITING by do_exit() it's going to be * dequeued and enqueued multiple times in the exit path. @@ -339,13 +346,109 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu) if (p->flags & PF_EXITING) return; - /* Get task boost group */ + /* + * Boost group accouting is protected by a per-cpu lock and requires + * interrupt to be disabled to avoid race conditions for example on + * do_exit()::cgroup_exit() and task migration. 
+ */ + raw_spin_lock_irqsave(&bg->lock, irq_flags); rcu_read_lock(); + st = task_schedtune(p); idx = st->idx; - rcu_read_unlock(); - schedtune_tasks_update(p, cpu, idx, 1); + schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK); + + rcu_read_unlock(); + raw_spin_unlock_irqrestore(&bg->lock, irq_flags); +} + +int schedtune_allow_attach(struct cgroup_taskset *tset) +{ + /* We always allows tasks to be moved between existing CGroups */ + return 0; +} + +int schedtune_can_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *css; + struct boost_groups *bg; + unsigned long irq_flags; + unsigned int cpu; + struct rq *rq; + int src_bg; /* Source boost group index */ + int dst_bg; /* Destination boost group index */ + int tasks; + + if (!unlikely(schedtune_initialized)) + return 0; + + + cgroup_taskset_for_each(task, css, tset) { + + /* + * Lock the CPU's RQ the task is enqueued to avoid race + * conditions with migration code while the task is being + * accounted + */ + rq = lock_rq_of(task, &irq_flags); + + if (!task->on_rq) { + unlock_rq_of(rq, task, &irq_flags); + continue; + } + + /* + * Boost group accouting is protected by a per-cpu lock and requires + * interrupt to be disabled to avoid race conditions on... + */ + cpu = cpu_of(rq); + bg = &per_cpu(cpu_boost_groups, cpu); + raw_spin_lock(&bg->lock); + + dst_bg = css_st(css)->idx; + src_bg = task_schedtune(task)->idx; + + /* + * Current task is not changing boostgroup, which can + * happen when the new hierarchy is in use. + */ + if (unlikely(dst_bg == src_bg)) { + raw_spin_unlock(&bg->lock); + unlock_rq_of(rq, task, &irq_flags); + continue; + } + + /* + * This is the case of a RUNNABLE task which is switching its + * current boost group. + */ + + /* Move task from src to dst boost group */ + tasks = bg->group[src_bg].tasks - 1; + bg->group[src_bg].tasks = max(0, tasks); + bg->group[dst_bg].tasks += 1; + + raw_spin_unlock(&bg->lock); + unlock_rq_of(rq, task, &irq_flags); + + /* Update CPU boost group */ + if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1) + schedtune_cpu_update(task_cpu(task)); + + } + + return 0; +} + +void schedtune_cancel_attach(struct cgroup_taskset *tset) +{ + /* This can happen only if SchedTune controller is mounted with + * other hierarchies ane one of them fails. Since usually SchedTune is + * mouted on its own hierarcy, for the time being we do not implement + * a proper rollback mechanism */ + WARN(1, "SchedTune cancel attach not implemented"); } /* @@ -353,26 +456,62 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu) */ void schedtune_dequeue_task(struct task_struct *p, int cpu) { + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + unsigned long irq_flags; struct schedtune *st; int idx; + if (!unlikely(schedtune_initialized)) + return; + /* * When a task is marked PF_EXITING by do_exit() it's going to be * dequeued and enqueued multiple times in the exit path. * Thus we avoid any further update, since we do not want to change * CPU boosting while the task is exiting. - * The last dequeue will be done by cgroup exit() callback. + * The last dequeue is already enforce by the do_exit() code path + * via schedtune_exit_task(). */ if (p->flags & PF_EXITING) return; - /* Get task boost group */ + /* + * Boost group accouting is protected by a per-cpu lock and requires + * interrupt to be disabled to avoid race conditions on... 
+ */ + raw_spin_lock_irqsave(&bg->lock, irq_flags); rcu_read_lock(); + st = task_schedtune(p); idx = st->idx; - rcu_read_unlock(); - schedtune_tasks_update(p, cpu, idx, -1); + schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK); + + rcu_read_unlock(); + raw_spin_unlock_irqrestore(&bg->lock, irq_flags); +} + +void schedtune_exit_task(struct task_struct *tsk) +{ + struct schedtune *st; + unsigned long irq_flags; + unsigned int cpu; + struct rq *rq; + int idx; + + if (!unlikely(schedtune_initialized)) + return; + + rq = lock_rq_of(tsk, &irq_flags); + rcu_read_lock(); + + cpu = cpu_of(rq); + st = task_schedtune(tsk); + idx = st->idx; + schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK); + + rcu_read_unlock(); + unlock_rq_of(rq, tsk, &irq_flags); } int schedtune_cpu_boost(int cpu) @@ -518,6 +657,9 @@ schedtune_css_free(struct cgroup_subsys_state *css) struct cgroup_subsys schedtune_cgrp_subsys = { .css_alloc = schedtune_css_alloc, .css_free = schedtune_css_free, +// .allow_attach = schedtune_allow_attach, + .can_attach = schedtune_can_attach, + .cancel_attach = schedtune_cancel_attach, .legacy_cftypes = files, .early_init = 1, }; diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index 99637758a8af..be1785eb1c5b 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -17,6 +17,8 @@ struct target_nrg { int schedtune_cpu_boost(int cpu); int schedtune_task_boost(struct task_struct *tsk); +void schedtune_exit_task(struct task_struct *tsk); + void schedtune_enqueue_task(struct task_struct *p, int cpu); void schedtune_dequeue_task(struct task_struct *p, int cpu); @@ -25,6 +27,8 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu); #define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost() #define schedtune_task_boost(tsk) get_sysctl_sched_cfs_boost() +#define schedtune_exit_task(task) do { } while (0) + #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) @@ -39,6 +43,8 @@ int schedtune_accept_deltas(int nrg_delta, int cap_delta, #define schedtune_cpu_boost(cpu) 0 #define schedtune_task_boost(tsk) 0 +#define schedtune_exit_task(task) do { } while (0) + #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) From 369bcbbae4407a5b50f472a0fa3d569b9a832081 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 29 Jul 2016 14:41:25 +0100 Subject: [PATCH 259/607] sched/fair: optimize idle cpu selection for boosted tasks find_best_target CPU selection is biased towards lower CPU IDs. Bias towards higher CPUs for boosted tasks. For boosted tasks unconditionally use the idle CPU returned by find_best_target. 
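Both halves of the change can be pictured with the standalone sketch below; NR_CPUS is given an arbitrary illustrative value and the helper names are not taken from the patch:

#define NR_CPUS 8       /* illustrative value only */

/* Boosted tasks scan CPU IDs from the top down, everything else bottom up. */
static inline int scan_cpu(int iter, int boosted)
{
        return boosted ? NR_CPUS - iter - 1 : iter;
}

/* Final pick: a boosted task takes the idle CPU unconditionally. */
static inline int pick_target(int boosted, int best_idle_cpu,
                              int target_cpu, int backup_cpu)
{
        if (boosted && best_idle_cpu >= 0)
                return best_idle_cpu;
        if (target_cpu >= 0)
                return target_cpu;
        return best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
}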
BUG: 29512132 Change-Id: I3d650051752163fcf3dc7909751d1fde3f9d17c0 Conflicts: kernel/sched/fair.c --- kernel/sched/fair.c | 70 +++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a26f40cb7fd0..6efdcad51603 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5569,32 +5569,30 @@ done: return target; } -static inline int find_best_target(struct task_struct *p) +static inline int find_best_target(struct task_struct *p, bool boosted) { - int i, boosted; + int iter_cpu; int target_cpu = -1; int target_capacity = 0; int backup_capacity = 0; - int idle_cpu = -1; + int best_idle_cpu = -1; int best_idle_cstate = INT_MAX; int backup_cpu = -1; unsigned long task_util_boosted, new_util; - /* - * Favor 1) busy cpu with most capacity at current OPP - * 2) idle_cpu with capacity at current OPP - * 3) busy cpu with capacity at higher OPP - */ -#ifdef CONFIG_CGROUP_SCHEDTUNE - boosted = schedtune_task_boost(p); -#else - boosted = 0; -#endif task_util_boosted = boosted_task_util(p); - for_each_cpu(i, tsk_cpus_allowed(p)) { - int cur_capacity = capacity_curr_of(i); - struct rq *rq = cpu_rq(i); - int idle_idx = idle_get_state_idx(rq); + for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) { + int cur_capacity; + struct rq *rq; + int idle_idx; + + /* + * favor higher cpus for boosted tasks + */ + int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu; + + if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p))) + continue; /* * p's blocked utilization is still accounted for on prev_cpu @@ -5615,46 +5613,43 @@ static inline int find_best_target(struct task_struct *p) * For boosted tasks we favor idle cpus unconditionally to * improve latency. */ - if (idle_idx >= 0 && boosted) { - if (idle_cpu < 0 || - (sysctl_sched_cstate_aware && - best_idle_cstate > idle_idx)) { - best_idle_cstate = idle_idx; - idle_cpu = i; - } + if (idle_cpu(i) && boosted) { + if (best_idle_cpu < 0) + best_idle_cpu = i; continue; } + cur_capacity = capacity_curr_of(i); + rq = cpu_rq(i); + idle_idx = idle_get_state_idx(rq); + if (new_util < cur_capacity) { if (cpu_rq(i)->nr_running) { if (target_capacity == 0 || target_capacity > cur_capacity) { - /* busy CPU with most capacity at current OPP */ target_cpu = i; target_capacity = cur_capacity; } } else if (!boosted) { - if (idle_cpu < 0 || + if (best_idle_cpu < 0 || (sysctl_sched_cstate_aware && best_idle_cstate > idle_idx)) { best_idle_cstate = idle_idx; - idle_cpu = i; + best_idle_cpu = i; } } } else if (backup_capacity == 0 || backup_capacity > cur_capacity) { - /* first busy CPU with capacity at higher OPP */ backup_capacity = cur_capacity; backup_cpu = i; } } - if (!boosted && target_cpu < 0) { - target_cpu = idle_cpu >= 0 ? idle_cpu : backup_cpu; - } + if (boosted && best_idle_cpu >= 0) + target_cpu = best_idle_cpu; + else if (target_cpu < 0) + target_cpu = best_idle_cpu >= 0 ? 
best_idle_cpu : backup_cpu; - if (boosted && idle_cpu >= 0) - target_cpu = idle_cpu; return target_cpu; } @@ -5740,9 +5735,16 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) /* * Find a cpu with sufficient capacity */ - int tmp_target = find_best_target(p); +#ifdef CONFIG_CGROUP_SCHEDTUNE + bool boosted = schedtune_task_boost(p) > 0; +#else + bool boosted = 0; +#endif + int tmp_target = find_best_target(p, boosted); if (tmp_target >= 0) target_cpu = tmp_target; + if (boosted && idle_cpu(target_cpu)) + return target_cpu; } if (target_cpu != task_cpu(p)) { From 345be81dba1ba6c7ac1df2b9ba355fc290d5a73b Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 29 Jul 2016 15:32:26 +0100 Subject: [PATCH 260/607] sched/tune: fix PB and PC cuts indexes definition The current definition of the Performance Boost (PB) and Performance Constraint (PC) regions has two main issues: 1) in the computation of the boost index we overflow the thresholds_gains table for boost=100 2) the two cuts had _NOT_ the same ratio The last point means that when boost=0 we do _not_ have a "standard" EAS behaviour, i.e. accepting all candidates which decrease energy regardless of their impact on performance. Instead, we accept only schedule candidates which are in the Optimal region, i.e. decrease energy while increasing performance. This behaviour can also have a negative impact on CPU selection policies which try to spread tasks to reduce latencies. Indeed, for example, we could end up rejecting a schedule candidate which wants to move a task from a congested CPU to an idle one, specifically in the case where the target CPU will be running on a lower OPP. This patch fixes these two issues by properly clamping the boost value in the appropriate range to compute the threshold indexes as well as by using the same threshold index for both cuts. Signed-off-by: Patrick Bellasi Signed-off-by: Srinath Sridharan sched/tune: fix update of threshold index for boost groups When SchedTune is configured to work with CGroup mode, each time we update the boost value of a group we do not update the threshold indexes for the definition of the Performance Boost (PB) and Performance Constraint (PC) region. This means that while the OPP boosting and CPU biasing selection is working as expected, the __schedtune_accept_deltas function is always using the initial values for these cuts. This patch ensures that each time a new boost value is configured for a boost group, the cuts for the PB and PC region are properly updated too. Signed-off-by: Patrick Bellasi Signed-off-by: Srinath Sridharan sched/tune: update PC and PB cuts definition The current definition of Performance Boost (PB) and Performance Constraint (PC) cuts defines two "dead regions": - up to 20% boost: we are in energy-reduction only mode, i.e. accept all candidates which reduce energy - over 70% boost: we are in performance-increase only mode, i.e. accept only sched candidates which do not reduce performance This patch uses a more fine-grained configuration where these two "dead regions" are reduced to: up to 10% and over 90%. This should allow some boosting benefits starting from 10% boost values as well as not being too permissive starting from boost values of 80%.
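For illustration, the index computation that both cuts now share can be sketched as a small standalone helper; the name below is hypothetical, the patch itself uses clamp(boost_pct, 0, 99) / 10 inline:

/* Sketch: map a boost percentage (-100..100) to a threshold_gains index. */
static unsigned int boost_to_threshold_idx(int boost_pct)
{
	if (boost_pct < 0)
		boost_pct = 0;		/* negative boost uses the lowest cut */
	if (boost_pct > 99)
		boost_pct = 99;		/* keeps boost=100 inside the 10-entry table */
	return boost_pct / 10;		/* one entry per 10% step */
}

Using the same index for both the PB and PC cut is what restores the boost=0 behaviour of accepting any candidate that reduces energy.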
Suggested-by: Leo Yan Signed-off-by: Patrick Bellasi Signed-off-by: Srinath Sridharan bug: 28312446 Change-Id: Ia326c66521e38c98e7a7eddbbb7c437875efa1ba Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 58 ++++++++++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 4c77cc23e65b..d24f365b0c90 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -38,16 +38,16 @@ struct threshold_params { */ static struct threshold_params threshold_gains[] = { - { 0, 4 }, /* >= 0% */ - { 0, 4 }, /* >= 10% */ - { 1, 4 }, /* >= 20% */ - { 2, 4 }, /* >= 30% */ - { 3, 4 }, /* >= 40% */ - { 4, 3 }, /* >= 50% */ - { 4, 2 }, /* >= 60% */ - { 4, 1 }, /* >= 70% */ - { 4, 0 }, /* >= 80% */ - { 4, 0 } /* >= 90% */ + { 0, 5 }, /* < 10% */ + { 1, 5 }, /* < 20% */ + { 2, 5 }, /* < 30% */ + { 3, 5 }, /* < 40% */ + { 4, 5 }, /* < 50% */ + { 5, 4 }, /* < 60% */ + { 5, 3 }, /* < 70% */ + { 5, 2 }, /* < 80% */ + { 5, 1 }, /* < 90% */ + { 5, 0 } /* <= 100% */ }; static int @@ -549,13 +549,29 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, s64 boost) { struct schedtune *st = css_st(css); + unsigned threshold_idx; + int boost_pct; if (boost < -100 || boost > 100) return -EINVAL; + boost_pct = boost; + + /* + * Update threshold params for Performance Boost (B) + * and Performance Constraint (C) regions. + * The current implementatio uses the same cuts for both + * B and C regions. + */ + threshold_idx = clamp(boost_pct, 0, 99) / 10; + st->perf_boost_idx = threshold_idx; + st->perf_constrain_idx = threshold_idx; st->boost = boost; - if (css == &root_schedtune.css) + if (css == &root_schedtune.css) { sysctl_sched_cfs_boost = boost; + perf_boost_idx = threshold_idx; + perf_constrain_idx = threshold_idx; + } /* Update CPU boost */ schedtune_boostgroup_update(st->idx, st->boost); @@ -710,17 +726,25 @@ sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + unsigned threshold_idx; + int boost_pct; if (ret || !write) return ret; - /* Performance Boost (B) region threshold params */ - perf_boost_idx = sysctl_sched_cfs_boost; - perf_boost_idx /= 10; + if (sysctl_sched_cfs_boost < -100 || sysctl_sched_cfs_boost > 100) + return -EINVAL; + boost_pct = sysctl_sched_cfs_boost; - /* Performance Constraint (C) region threshold params */ - perf_constrain_idx = 100 - sysctl_sched_cfs_boost; - perf_constrain_idx /= 10; + /* + * Update threshold params for Performance Boost (B) + * and Performance Constraint (C) regions. + * The current implementatio uses the same cuts for both + * B and C regions. + */ + threshold_idx = clamp(boost_pct, 0, 99) / 10; + perf_boost_idx = threshold_idx; + perf_constrain_idx = threshold_idx; return 0; } From efb86bd08a2e9217d0b3c33753cf63d27e7c86da Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Tue, 31 May 2016 09:08:38 -0700 Subject: [PATCH 261/607] sched: Introduce Window Assisted Load Tracking (WALT) use a window based view of time in order to track task demand and CPU utilization in the scheduler. 
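In broad strokes: time is divided into fixed windows (20ms by default below), each task accumulates frequency-scaled runtime within the current window, and at window rollover that sample is pushed into a short history from which the task's demand is derived. A toy sketch with made-up names, approximating the default WINDOW_STATS_MAX_RECENT_AVG policy implemented by update_history() in the diff:

#define HIST_SIZE 5	/* mirrors RAVG_HIST_SIZE_MAX in the patch */

/* Toy sketch: record one completed window and return the new demand. */
static unsigned int push_window_sample(unsigned int hist[HIST_SIZE],
				       unsigned int runtime)
{
	unsigned long long sum = 0;
	unsigned int avg;
	int i;

	for (i = HIST_SIZE - 1; i > 0; i--)	/* age older samples */
		hist[i] = hist[i - 1];
	hist[0] = runtime;			/* newest window first */

	for (i = 0; i < HIST_SIZE; i++)
		sum += hist[i];
	avg = sum / HIST_SIZE;

	return avg > runtime ? avg : runtime;	/* max(average, most recent) */
}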
Window Assisted Load Tracking (WALT) implementation credits: Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park, Pavan Kumar Kondeti, Olav Haugan 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla and Todd Kjos Change-Id: I21408236836625d4e7d7de1843d20ed5ff36c708 Includes fixes for issues: eas/walt: Use walt_ktime_clock() instead of ktime_get_ns() to avoid a race resulting in watchdog resets BUG: 29353986 Change-Id: Ic1820e22a136f7c7ebd6f42e15f14d470f6bbbdb Handle walt accounting anomaly during resume During resume, there is a corner case where on wakeup, a task's prev_runnable_sum can go negative. This is a workaround that fixes the condition and warns (instead of crashing). BUG: 29464099 Change-Id: I173e7874324b31a3584435530281708145773508 Signed-off-by: Todd Kjos Signed-off-by: Srinath Sridharan Signed-off-by: Juri Lelli [jstultz: fwdported to 4.4] Signed-off-by: John Stultz --- include/linux/sched.h | 53 ++ include/linux/sched/sysctl.h | 5 + include/trace/events/sched.h | 149 +++++ init/Kconfig | 9 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 43 +- kernel/sched/fair.c | 20 + kernel/sched/rt.c | 4 + kernel/sched/sched.h | 34 ++ kernel/sched/stop_task.c | 3 + kernel/sched/walt.c | 1098 ++++++++++++++++++++++++++++++++++ kernel/sched/walt.h | 57 ++ kernel/sysctl.c | 23 + 13 files changed, 1498 insertions(+), 1 deletion(-) create mode 100644 kernel/sched/walt.c create mode 100644 kernel/sched/walt.h diff --git a/include/linux/sched.h b/include/linux/sched.h index ee59abcab30d..ce8db12f9e72 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -317,6 +317,15 @@ extern char ___assert_task_state[1 - 2*!!( /* Task command name length */ #define TASK_COMM_LEN 16 +enum task_event { + PUT_PREV_TASK = 0, + PICK_NEXT_TASK = 1, + TASK_WAKE = 2, + TASK_MIGRATE = 3, + TASK_UPDATE = 4, + IRQ_UPDATE = 5, +}; + #include /* @@ -1274,6 +1283,41 @@ struct sched_statistics { }; #endif +#ifdef CONFIG_SCHED_WALT +#define RAVG_HIST_SIZE_MAX 5 + +/* ravg represents frequency scaled cpu-demand of tasks */ +struct ravg { + /* + * 'mark_start' marks the beginning of an event (task waking up, task + * starting to execute, task being preempted) within a window + * + * 'sum' represents how runnable a task has been within current + * window. It incorporates both running time and wait time and is + * frequency scaled. + * + * 'sum_history' keeps track of history of 'sum' seen over previous + * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are + * ignored. + * + * 'demand' represents maximum sum seen over previous + * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency + * demand for tasks.
+ * + * 'curr_window' represents task's contribution to cpu busy time + * statistics (rq->curr_runnable_sum) in current window + * + * 'prev_window' represents task's contribution to cpu busy time + * statistics (rq->prev_runnable_sum) in previous window + */ + u64 mark_start; + u32 sum, demand; + u32 sum_history[RAVG_HIST_SIZE_MAX]; + u32 curr_window, prev_window; + u16 active_windows; +}; +#endif + struct sched_entity { struct load_weight load; /* for load-balancing */ struct rb_node run_node; @@ -1431,6 +1475,15 @@ struct task_struct { const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; +#ifdef CONFIG_SCHED_WALT + struct ravg ravg; + /* + * 'init_load_pct' represents the initial task load assigned to children + * of this task + */ + u32 init_load_pct; +#endif + #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 2834841c507e..710f58a28d63 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -43,6 +43,11 @@ extern unsigned int sysctl_sched_is_big_little; extern unsigned int sysctl_sched_sync_hint_enable; extern unsigned int sysctl_sched_initial_task_util; extern unsigned int sysctl_sched_cstate_aware; +#ifdef CONFIG_SCHED_WALT +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int sysctl_sched_use_walt_task_util; +extern unsigned int sysctl_sched_walt_init_task_load_pct; +#endif enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 7dc39dd384de..33b1c719c9d7 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -888,6 +888,155 @@ TRACE_EVENT(sched_tune_filter, __entry->payoff, __entry->region) ); +#ifdef CONFIG_SCHED_WALT +struct rq; + +TRACE_EVENT(walt_update_task_ravg, + + TP_PROTO(struct task_struct *p, struct rq *rq, int evt, + u64 wallclock, u64 irqtime), + + TP_ARGS(p, rq, evt, wallclock, irqtime), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( pid_t, cur_pid ) + __field(unsigned int, cur_freq ) + __field( u64, wallclock ) + __field( u64, mark_start ) + __field( u64, delta_m ) + __field( u64, win_start ) + __field( u64, delta ) + __field( u64, irqtime ) + __field( int, evt ) + __field(unsigned int, demand ) + __field(unsigned int, sum ) + __field( int, cpu ) + __field( u64, cs ) + __field( u64, ps ) + __field( u32, curr_window ) + __field( u32, prev_window ) + __field( u64, nt_cs ) + __field( u64, nt_ps ) + __field( u32, active_windows ) + ), + + TP_fast_assign( + __entry->wallclock = wallclock; + __entry->win_start = rq->window_start; + __entry->delta = (wallclock - rq->window_start); + __entry->evt = evt; + __entry->cpu = rq->cpu; + __entry->cur_pid = rq->curr->pid; + __entry->cur_freq = rq->cur_freq; + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->mark_start = p->ravg.mark_start; + __entry->delta_m = (wallclock - p->ravg.mark_start); + __entry->demand = p->ravg.demand; + __entry->sum = p->ravg.sum; + __entry->irqtime = irqtime; + __entry->cs = rq->curr_runnable_sum; + __entry->ps = rq->prev_runnable_sum; + __entry->curr_window = p->ravg.curr_window; + __entry->prev_window = p->ravg.prev_window; + __entry->nt_cs = rq->nt_curr_runnable_sum; + __entry->nt_ps = rq->nt_prev_runnable_sum; + __entry->active_windows = p->ravg.active_windows; + ), + + TP_printk("wc %llu ws %llu delta %llu event %d cpu %d cur_freq %u 
cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu" + " cs %llu ps %llu cur_window %u prev_window %u nt_cs %llu nt_ps %llu active_wins %u" + , __entry->wallclock, __entry->win_start, __entry->delta, + __entry->evt, __entry->cpu, + __entry->cur_freq, __entry->cur_pid, + __entry->pid, __entry->comm, __entry->mark_start, + __entry->delta_m, __entry->demand, + __entry->sum, __entry->irqtime, + __entry->cs, __entry->ps, + __entry->curr_window, __entry->prev_window, + __entry->nt_cs, __entry->nt_ps, + __entry->active_windows + ) +); + +TRACE_EVENT(walt_update_history, + + TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples, + int evt), + + TP_ARGS(rq, p, runtime, samples, evt), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field(unsigned int, runtime ) + __field( int, samples ) + __field( int, evt ) + __field( u64, demand ) + __field(unsigned int, walt_avg ) + __field(unsigned int, pelt_avg ) + __array( u32, hist, RAVG_HIST_SIZE_MAX) + __field( int, cpu ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->runtime = runtime; + __entry->samples = samples; + __entry->evt = evt; + __entry->demand = p->ravg.demand; + __entry->walt_avg = (__entry->demand << 10) / walt_ravg_window, + __entry->pelt_avg = p->se.avg.util_avg; + memcpy(__entry->hist, p->ravg.sum_history, + RAVG_HIST_SIZE_MAX * sizeof(u32)); + __entry->cpu = rq->cpu; + ), + + TP_printk("%d (%s): runtime %u samples %d event %d demand %llu" + " walt %u pelt %u (hist: %u %u %u %u %u) cpu %d", + __entry->pid, __entry->comm, + __entry->runtime, __entry->samples, __entry->evt, + __entry->demand, + __entry->walt_avg, + __entry->pelt_avg, + __entry->hist[0], __entry->hist[1], + __entry->hist[2], __entry->hist[3], + __entry->hist[4], __entry->cpu) +); + +TRACE_EVENT(walt_migration_update_sum, + + TP_PROTO(struct rq *rq, struct task_struct *p), + + TP_ARGS(rq, p), + + TP_STRUCT__entry( + __field(int, cpu ) + __field(int, pid ) + __field( u64, cs ) + __field( u64, ps ) + __field( s64, nt_cs ) + __field( s64, nt_ps ) + ), + + TP_fast_assign( + __entry->cpu = cpu_of(rq); + __entry->cs = rq->curr_runnable_sum; + __entry->ps = rq->prev_runnable_sum; + __entry->nt_cs = (s64)rq->nt_curr_runnable_sum; + __entry->nt_ps = (s64)rq->nt_prev_runnable_sum; + __entry->pid = p->pid; + ), + + TP_printk("cpu %d: cs %llu ps %llu nt_cs %lld nt_ps %lld pid %d", + __entry->cpu, __entry->cs, __entry->ps, + __entry->nt_cs, __entry->nt_ps, __entry->pid) +); +#endif /* CONFIG_SCHED_WALT */ + #endif /* CONFIG_SMP */ #endif /* _TRACE_SCHED_H */ diff --git a/init/Kconfig b/init/Kconfig index facb2b587d5b..33f4e3965e3c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -392,6 +392,15 @@ config IRQ_TIME_ACCOUNTING endchoice +config SCHED_WALT + bool "Support window based load tracking" + depends on SMP + help + This feature will allow the scheduler to maintain a tunable window + based set of metrics for tasks and runqueues. These metrics can be + used to guide task placement as well as task frequency requirements + for cpufreq governors. 
+ config BSD_PROCESS_ACCT bool "BSD Process Accounting" depends on MULTIUSER diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 174762d8695b..623ce4bde0d5 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -15,6 +15,7 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o +obj-$(CONFIG_SCHED_WALT) += walt.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f56a528c8ae7..b6f41850ed00 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -89,6 +89,7 @@ #define CREATE_TRACE_POINTS #include +#include "walt.h" DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -1085,7 +1086,9 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new dequeue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + double_lock_balance(rq, cpu_rq(new_cpu)); set_task_cpu(p, new_cpu); + double_unlock_balance(rq, cpu_rq(new_cpu)); raw_spin_unlock(&rq->lock); rq = cpu_rq(new_cpu); @@ -1309,6 +1312,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->sched_class->migrate_task_rq(p); p->se.nr_migrations++; perf_event_task_migrate(p); + + walt_fixup_busy_time(p, new_cpu); } __set_task_cpu(p, new_cpu); @@ -1937,6 +1942,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { unsigned long flags; int cpu, success = 0; +#ifdef CONFIG_SMP + struct rq *rq; + u64 wallclock; +#endif /* * If we are going to wake up a thread waiting for CONDITION we @@ -1994,6 +2003,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_rmb(); + rq = cpu_rq(task_cpu(p)); + + raw_spin_lock(&rq->lock); + wallclock = walt_ktime_clock(); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + raw_spin_unlock(&rq->lock); + p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; @@ -2001,10 +2018,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_class->task_waking(p); cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); + if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); } + #endif /* CONFIG_SMP */ ttwu_queue(p, cpu); @@ -2053,8 +2072,13 @@ static void try_to_wake_up_local(struct task_struct *p) trace_sched_waking(p); - if (!task_on_rq_queued(p)) + if (!task_on_rq_queued(p)) { + u64 wallclock = walt_ktime_clock(); + + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); ttwu_activate(rq, p, ENQUEUE_WAKEUP); + } ttwu_do_wakeup(rq, p, 0); ttwu_stat(p, smp_processor_id(), 0); @@ -2120,6 +2144,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.nr_migrations = 0; p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); + walt_init_new_task_load(p); #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); @@ -2387,6 +2412,9 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, flags); + + walt_init_new_task_load(p); + /* Initialize new task's runnable average */ init_entity_runnable_average(&p->se); #ifdef CONFIG_SMP @@ -2399,6 +2427,7 @@ void wake_up_new_task(struct task_struct *p) 
#endif rq = __task_rq_lock(p); + walt_mark_task_starting(p); activate_task(rq, p, ENQUEUE_WAKEUP_NEW); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); @@ -2948,9 +2977,12 @@ void scheduler_tick(void) sched_clock_tick(); raw_spin_lock(&rq->lock); + walt_set_window_start(rq); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, + walt_ktime_clock(), 0); calc_global_load_tick(rq); sched_freq_tick(cpu); raw_spin_unlock(&rq->lock); @@ -3188,6 +3220,7 @@ static void __sched notrace __schedule(bool preempt) unsigned long *switch_count; struct rq *rq; int cpu; + u64 wallclock; cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -3249,6 +3282,9 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); next = pick_next_task(rq, prev); + wallclock = walt_ktime_clock(); + walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); + walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->clock_skip_update = 0; @@ -5669,6 +5705,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: + raw_spin_lock_irqsave(&rq->lock, flags); + walt_set_window_start(rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); rq->calc_load_update = calc_load_update; break; @@ -5688,6 +5727,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); + walt_migrate_sync_cpu(cpu); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); @@ -7532,6 +7572,7 @@ void __init sched_init_smp(void) { cpumask_var_t non_isolated_cpus; + walt_init_cpu_efficiency(); alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6efdcad51603..79d54592b53a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -30,11 +30,13 @@ #include #include #include +#include #include #include "sched.h" #include "tune.h" +#include "walt.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -56,6 +58,10 @@ unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_initial_task_util = 0; unsigned int sysctl_sched_cstate_aware = 1; +#ifdef CONFIG_SCHED_WALT +unsigned int sysctl_sched_use_walt_cpu_util = 1; +unsigned int sysctl_sched_use_walt_task_util = 1; +#endif /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -4209,6 +4215,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); flags = ENQUEUE_WAKEUP; } @@ -4216,6 +4223,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); if (cfs_rq_throttled(cfs_rq)) break; @@ -4230,6 +4238,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP if (!se) { + walt_inc_cumulative_runnable_avg(rq, p); if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; @@ -4279,6 +4288,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; 
cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -4299,6 +4309,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); if (cfs_rq_throttled(cfs_rq)) break; @@ -4313,6 +4324,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP if (!se) { + walt_dec_cumulative_runnable_avg(rq, p); /* * We want to potentially trigger a freq switch @@ -5207,6 +5219,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) static inline unsigned long task_util(struct task_struct *p) { +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_task_util) { + unsigned long demand = p->ravg.demand; + return (demand << 10) / walt_ravg_window; + } +#endif return p->se.avg.util_avg; } @@ -6599,7 +6617,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env) deactivate_task(env->src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + double_lock_balance(env->src_rq, env->dst_rq); set_task_cpu(p, env->dst_cpu); + double_unlock_balance(env->src_rq, env->dst_rq); } /* diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 9694204660b7..be700bfa1ae4 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -8,6 +8,8 @@ #include #include +#include "walt.h" + int sched_rr_timeslice = RR_TIMESLICE; static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); @@ -1261,6 +1263,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) rt_se->timeout = 0; enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + walt_inc_cumulative_runnable_avg(rq, p); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -1272,6 +1275,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) update_curr_rt(rq); dequeue_rt_entity(rt_se); + walt_dec_cumulative_runnable_avg(rq, p); dequeue_pushable_task(rq, p); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 144d4782a455..bfd99f9bc6f9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -410,6 +410,10 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ +#ifdef CONFIG_SCHED_WALT + u64 cumulative_runnable_avg; +#endif + #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; u64 runtime_expires; @@ -663,6 +667,27 @@ struct rq { u64 max_idle_balance_cost; #endif +#ifdef CONFIG_SCHED_WALT + /* + * max_freq = user or thermal defined maximum + * max_possible_freq = maximum supported by hardware + */ + unsigned int cur_freq, max_freq, min_freq, max_possible_freq; + struct cpumask freq_domain_cpumask; + + u64 cumulative_runnable_avg; + int efficiency; /* Differentiate cpus with different IPC capability */ + int load_scale_factor; + int capacity; + int max_possible_capacity; + u64 window_start; + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; +#endif /* CONFIG_SCHED_WALT */ + + #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; #endif @@ -1513,6 +1538,10 @@ static inline unsigned long capacity_orig_of(int cpu) return cpu_rq(cpu)->cpu_capacity_orig; } +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int walt_ravg_window; +extern unsigned int walt_disabled; + /* * cpu_util returns the amount of capacity 
of a CPU that is used by CFS * tasks. The unit of the return value must be the one of capacity so we can @@ -1544,6 +1573,11 @@ static inline unsigned long __cpu_util(int cpu, int delta) unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + util = (cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT) / + walt_ravg_window; +#endif delta += util; if (delta < 0) return 0; diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index cbc67da10954..61f852d46858 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -1,4 +1,5 @@ #include "sched.h" +#include "walt.h" /* * stop-task scheduling class. @@ -42,12 +43,14 @@ static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { add_nr_running(rq, 1); + walt_inc_cumulative_runnable_avg(rq, p); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { sub_nr_running(rq, 1); + walt_dec_cumulative_runnable_avg(rq, p); } static void yield_task_stop(struct rq *rq) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c new file mode 100644 index 000000000000..1dff3d2e2358 --- /dev/null +++ b/kernel/sched/walt.c @@ -0,0 +1,1098 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * Window Assisted Load Tracking (WALT) implementation credits: + * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park, + * Pavan Kumar Kondeti, Olav Haugan + * + * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla + * and Todd Kjos + */ + +#include +#include +#include +#include "sched.h" +#include "walt.h" + +#define WINDOW_STATS_RECENT 0 +#define WINDOW_STATS_MAX 1 +#define WINDOW_STATS_MAX_RECENT_AVG 2 +#define WINDOW_STATS_AVG 3 +#define WINDOW_STATS_INVALID_POLICY 4 + +#define EXITING_TASK_MARKER 0xdeaddead + +static __read_mostly unsigned int walt_ravg_hist_size = 5; +static __read_mostly unsigned int walt_window_stats_policy = + WINDOW_STATS_MAX_RECENT_AVG; +static __read_mostly unsigned int walt_account_wait_time = 1; +static __read_mostly unsigned int walt_freq_account_wait_time = 0; +static __read_mostly unsigned int walt_io_is_busy = 0; + +unsigned int sysctl_sched_walt_init_task_load_pct = 15; + +/* 1 -> use PELT based load stats, 0 -> use window-based load stats */ +unsigned int __read_mostly walt_disabled = 0; + +static unsigned int max_possible_efficiency = 1024; +static unsigned int min_possible_efficiency = 1024; + +/* + * Maximum possible frequency across all cpus. Task demand and cpu + * capacity (cpu_power) metrics are scaled in reference to it. + */ +static unsigned int max_possible_freq = 1; + +/* + * Minimum possible max_freq across all cpus. This will be same as + * max_possible_freq on homogeneous systems and could be different from + * max_possible_freq on heterogenous systems. min_max_freq is used to derive + * capacity (cpu_power) of cpus. 
+ */ +static unsigned int min_max_freq = 1; + +static unsigned int max_capacity = 1024; +static unsigned int min_capacity = 1024; +static unsigned int max_load_scale_factor = 1024; +static unsigned int max_possible_capacity = 1024; + +/* Mask of all CPUs that have max_possible_capacity */ +static cpumask_t mpc_mask = CPU_MASK_ALL; + +/* Window size (in ns) */ +__read_mostly unsigned int walt_ravg_window = 20000000; + +/* Min window size (in ns) = 10ms */ +#define MIN_SCHED_RAVG_WINDOW 10000000 + +/* Max window size (in ns) = 1s */ +#define MAX_SCHED_RAVG_WINDOW 1000000000 + +static unsigned int sync_cpu; +static ktime_t ktime_last; +static bool walt_ktime_suspended; + +static unsigned int task_load(struct task_struct *p) +{ + return p->ravg.demand; +} + +void +walt_inc_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) +{ + rq->cumulative_runnable_avg += p->ravg.demand; +} + +void +walt_dec_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) +{ + rq->cumulative_runnable_avg -= p->ravg.demand; + BUG_ON((s64)rq->cumulative_runnable_avg < 0); +} + +static void +fixup_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p, s64 task_load_delta) +{ + rq->cumulative_runnable_avg += task_load_delta; + if ((s64)rq->cumulative_runnable_avg < 0) + panic("cra less than zero: tld: %lld, task_load(p) = %u\n", + task_load_delta, task_load(p)); +} + +u64 walt_ktime_clock(void) +{ + if (unlikely(walt_ktime_suspended)) + return ktime_to_ns(ktime_last); + return ktime_get_ns(); +} + +static void walt_resume(void) +{ + walt_ktime_suspended = false; +} + +static int walt_suspend(void) +{ + ktime_last = ktime_get(); + walt_ktime_suspended = true; + return 0; +} + +static struct syscore_ops walt_syscore_ops = { + .resume = walt_resume, + .suspend = walt_suspend +}; + +static int __init walt_init_ops(void) +{ + register_syscore_ops(&walt_syscore_ops); + return 0; +} +late_initcall(walt_init_ops); + +void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + cfs_rq->cumulative_runnable_avg += p->ravg.demand; +} + +void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + cfs_rq->cumulative_runnable_avg -= p->ravg.demand; +} + +static int exiting_task(struct task_struct *p) +{ + if (p->flags & PF_EXITING) { + if (p->ravg.sum_history[0] != EXITING_TASK_MARKER) { + p->ravg.sum_history[0] = EXITING_TASK_MARKER; + } + return 1; + } + return 0; +} + +static int __init set_walt_ravg_window(char *str) +{ + get_option(&str, &walt_ravg_window); + + walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW || + walt_ravg_window > MAX_SCHED_RAVG_WINDOW); + return 0; +} + +early_param("walt_ravg_window", set_walt_ravg_window); + +static void +update_window_start(struct rq *rq, u64 wallclock) +{ + s64 delta; + int nr_windows; + + delta = wallclock - rq->window_start; + BUG_ON(delta < 0); + if (delta < walt_ravg_window) + return; + + nr_windows = div64_u64(delta, walt_ravg_window); + rq->window_start += (u64)nr_windows * (u64)walt_ravg_window; +} + +static u64 scale_exec_time(u64 delta, struct rq *rq) +{ + unsigned int cur_freq = rq->cur_freq; + int sf; + + if (unlikely(cur_freq > max_possible_freq)) + cur_freq = rq->max_possible_freq; + + /* round up div64 */ + delta = div64_u64(delta * cur_freq + max_possible_freq - 1, + max_possible_freq); + + sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency); + + delta *= sf; + delta >>= 10; + + return delta; +} + +static int cpu_is_waiting_on_io(struct rq *rq) +{ + if 
(!walt_io_is_busy) + return 0; + + return atomic_read(&rq->nr_iowait); +} + +static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, + u64 irqtime, int event) +{ + if (is_idle_task(p)) { + /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */ + if (event == PICK_NEXT_TASK) + return 0; + + /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ + return irqtime || cpu_is_waiting_on_io(rq); + } + + if (event == TASK_WAKE) + return 0; + + if (event == PUT_PREV_TASK || event == IRQ_UPDATE || + event == TASK_UPDATE) + return 1; + + /* Only TASK_MIGRATE && PICK_NEXT_TASK left */ + return walt_freq_account_wait_time; +} + +/* + * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) + */ +static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + int new_window, nr_full_windows = 0; + int p_is_curr_task = (p == rq->curr); + u64 mark_start = p->ravg.mark_start; + u64 window_start = rq->window_start; + u32 window_size = walt_ravg_window; + u64 delta; + + new_window = mark_start < window_start; + if (new_window) { + nr_full_windows = div64_u64((window_start - mark_start), + window_size); + if (p->ravg.active_windows < USHRT_MAX) + p->ravg.active_windows++; + } + + /* Handle per-task window rollover. We don't care about the idle + * task or exiting tasks. */ + if (new_window && !is_idle_task(p) && !exiting_task(p)) { + u32 curr_window = 0; + + if (!nr_full_windows) + curr_window = p->ravg.curr_window; + + p->ravg.prev_window = curr_window; + p->ravg.curr_window = 0; + } + + if (!account_busy_for_cpu_time(rq, p, irqtime, event)) { + /* account_busy_for_cpu_time() = 0, so no update to the + * task's current window needs to be made. This could be + * for example + * + * - a wakeup event on a task within the current + * window (!new_window below, no action required), + * - switching to a new task from idle (PICK_NEXT_TASK) + * in a new window where irqtime is 0 and we aren't + * waiting on IO */ + + if (!new_window) + return; + + /* A new window has started. The RQ demand must be rolled + * over if p is the current task. */ + if (p_is_curr_task) { + u64 prev_sum = 0; + + /* p is either idle task or an exiting task */ + if (!nr_full_windows) { + prev_sum = rq->curr_runnable_sum; + } + + rq->prev_runnable_sum = prev_sum; + rq->curr_runnable_sum = 0; + } + + return; + } + + if (!new_window) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. No rollover + * since we didn't start a new window. An example of this is + * when a task starts execution and then sleeps within the + * same window. */ + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) + delta = wallclock - mark_start; + else + delta = irqtime; + delta = scale_exec_time(delta, rq); + rq->curr_runnable_sum += delta; + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.curr_window += delta; + + return; + } + + if (!p_is_curr_task) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has also started, but p is not the current task, so the + * window is not rolled over - just split up and account + * as necessary into curr and prev. The window is only + * rolled over when a new window is processed for the current + * task. + * + * Irqtime can't be accounted by a task that isn't the + * currently running task. 
*/ + + if (!nr_full_windows) { + /* A full window hasn't elapsed, account partial + * contribution to previous completed window. */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!exiting_task(p)) + p->ravg.prev_window += delta; + } else { + /* Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). */ + delta = scale_exec_time(window_size, rq); + if (!exiting_task(p)) + p->ravg.prev_window = delta; + } + rq->prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + rq->curr_runnable_sum += delta; + if (!exiting_task(p)) + p->ravg.curr_window = delta; + + return; + } + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. If any of these three above conditions are true + * then this busy time can't be accounted as irqtime. + * + * Busy time for the idle task or exiting tasks need not + * be accounted. + * + * An example of this would be a task that starts execution + * and then sleeps once a new window has begun. */ + + if (!nr_full_windows) { + /* A full window hasn't elapsed, account partial + * contribution to previous completed window. */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.prev_window += delta; + + delta += rq->curr_runnable_sum; + } else { + /* Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). */ + delta = scale_exec_time(window_size, rq); + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.prev_window = delta; + + } + /* + * Rollover for normal runnable sum is done here by overwriting + * the values in prev_runnable_sum and curr_runnable_sum. + * Rollover for new task runnable sum has completed by previous + * if-else statement. + */ + rq->prev_runnable_sum = delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + rq->curr_runnable_sum = delta; + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.curr_window = delta; + + return; + } + + if (irqtime) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. The current task must be the idle task because + * irqtime is not accounted for any other task. + * + * Irqtime will be accounted each time we process IRQ activity + * after a period of idleness, so we know the IRQ busy time + * started at wallclock - irqtime. */ + + BUG_ON(!is_idle_task(p)); + mark_start = wallclock - irqtime; + + /* Roll window over. If IRQ busy time was just in the current + * window then that is all that need be accounted. */ + rq->prev_runnable_sum = rq->curr_runnable_sum; + if (mark_start > window_start) { + rq->curr_runnable_sum = scale_exec_time(irqtime, rq); + return; + } + + /* The IRQ busy time spanned multiple windows. Process the + * busy time preceding the current window start first. */ + delta = window_start - mark_start; + if (delta > window_size) + delta = window_size; + delta = scale_exec_time(delta, rq); + rq->prev_runnable_sum += delta; + + /* Process the remaining IRQ busy time in the current window. 
*/ + delta = wallclock - window_start; + rq->curr_runnable_sum = scale_exec_time(delta, rq); + + return; + } + + BUG(); +} + +static int account_busy_for_task_demand(struct task_struct *p, int event) +{ + /* No need to bother updating task demand for exiting tasks + * or the idle task. */ + if (exiting_task(p) || is_idle_task(p)) + return 0; + + /* When a task is waking up it is completing a segment of non-busy + * time. Likewise, if wait time is not treated as busy time, then + * when a task begins to run or is migrated, it is not running and + * is completing a segment of non-busy time. */ + if (event == TASK_WAKE || (!walt_account_wait_time && + (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) + return 0; + + return 1; +} + +/* + * Called when new window is starting for a task, to record cpu usage over + * recently concluded window(s). Normally 'samples' should be 1. It can be > 1 + * when, say, a real-time task runs without preemption for several windows at a + * stretch. + */ +static void update_history(struct rq *rq, struct task_struct *p, + u32 runtime, int samples, int event) +{ + u32 *hist = &p->ravg.sum_history[0]; + int ridx, widx; + u32 max = 0, avg, demand; + u64 sum = 0; + + /* Ignore windows where task had no activity */ + if (!runtime || is_idle_task(p) || exiting_task(p) || !samples) + goto done; + + /* Push new 'runtime' value onto stack */ + widx = walt_ravg_hist_size - 1; + ridx = widx - samples; + for (; ridx >= 0; --widx, --ridx) { + hist[widx] = hist[ridx]; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) { + hist[widx] = runtime; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + p->ravg.sum = 0; + + if (walt_window_stats_policy == WINDOW_STATS_RECENT) { + demand = runtime; + } else if (walt_window_stats_policy == WINDOW_STATS_MAX) { + demand = max; + } else { + avg = div64_u64(sum, walt_ravg_hist_size); + if (walt_window_stats_policy == WINDOW_STATS_AVG) + demand = avg; + else + demand = max(avg, runtime); + } + + /* + * A throttled deadline sched class task gets dequeued without + * changing p->on_rq. Since the dequeue decrements hmp stats + * avoid decrementing it here again. + */ + if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || + !p->dl.dl_throttled)) + fixup_cumulative_runnable_avg(rq, p, demand); + + p->ravg.demand = demand; + +done: + trace_walt_update_history(rq, p, runtime, samples, event); + return; +} + +static void add_to_task_demand(struct rq *rq, struct task_struct *p, + u64 delta) +{ + delta = scale_exec_time(delta, rq); + p->ravg.sum += delta; + if (unlikely(p->ravg.sum > walt_ravg_window)) + p->ravg.sum = walt_ravg_window; +} + +/* + * Account cpu demand of task and/or update task's cpu demand history + * + * ms = p->ravg.mark_start; + * wc = wallclock + * ws = rq->window_start + * + * Three possibilities: + * + * a) Task event is contained within one window. + * window_start < mark_start < wallclock + * + * ws ms wc + * | | | + * V V V + * |---------------| + * + * In this case, p->ravg.sum is updated *iff* event is appropriate + * (ex: event == PUT_PREV_TASK) + * + * b) Task event spans two windows. + * mark_start < window_start < wallclock + * + * ms ws wc + * | | | + * V V V + * -----|------------------- + * + * In this case, p->ravg.sum is updated with (ws - ms) *iff* event + * is appropriate, then a new window sample is recorded followed + * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate. 
+ * + * c) Task event spans more than two windows. + * + * ms ws_tmp ws wc + * | | | | + * V V V V + * ---|-------|-------|-------|-------|------ + * | | + * |<------ nr_full_windows ------>| + * + * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff* + * event is appropriate, window sample of p->ravg.sum is recorded, + * 'nr_full_window' samples of window_size is also recorded *iff* + * event is appropriate and finally p->ravg.sum is set to (wc - ws) + * *iff* event is appropriate. + * + * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time() + * depends on it! + */ +static void update_task_demand(struct task_struct *p, struct rq *rq, + int event, u64 wallclock) +{ + u64 mark_start = p->ravg.mark_start; + u64 delta, window_start = rq->window_start; + int new_window, nr_full_windows; + u32 window_size = walt_ravg_window; + + new_window = mark_start < window_start; + if (!account_busy_for_task_demand(p, event)) { + if (new_window) + /* If the time accounted isn't being accounted as + * busy time, and a new window started, only the + * previous window need be closed out with the + * pre-existing demand. Multiple windows may have + * elapsed, but since empty windows are dropped, + * it is not necessary to account those. */ + update_history(rq, p, p->ravg.sum, 1, event); + return; + } + + if (!new_window) { + /* The simple case - busy time contained within the existing + * window. */ + add_to_task_demand(rq, p, wallclock - mark_start); + return; + } + + /* Busy time spans at least two windows. Temporarily rewind + * window_start to first window boundary after mark_start. */ + delta = window_start - mark_start; + nr_full_windows = div64_u64(delta, window_size); + window_start -= (u64)nr_full_windows * (u64)window_size; + + /* Process (window_start - mark_start) first */ + add_to_task_demand(rq, p, window_start - mark_start); + + /* Push new sample(s) into task's demand history */ + update_history(rq, p, p->ravg.sum, 1, event); + if (nr_full_windows) + update_history(rq, p, scale_exec_time(window_size, rq), + nr_full_windows, event); + + /* Roll window_start back to current to process any remainder + * in current window. 
*/ + window_start += (u64)nr_full_windows * (u64)window_size; + + /* Process (wallclock - window_start) next */ + mark_start = window_start; + add_to_task_demand(rq, p, wallclock - mark_start); +} + +/* Reflect task activity on its demand and cpu's busy time statistics */ +void walt_update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + if (walt_disabled || !rq->window_start) + return; + + lockdep_assert_held(&rq->lock); + + update_window_start(rq, wallclock); + + if (!p->ravg.mark_start) + goto done; + + update_task_demand(p, rq, event, wallclock); + update_cpu_busy_time(p, rq, event, wallclock, irqtime); + +done: + trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime); + + p->ravg.mark_start = wallclock; +} + +unsigned long __weak arch_get_cpu_efficiency(int cpu) +{ + return SCHED_LOAD_SCALE; +} + +void walt_init_cpu_efficiency(void) +{ + int i, efficiency; + unsigned int max = 0, min = UINT_MAX; + + for_each_possible_cpu(i) { + efficiency = arch_get_cpu_efficiency(i); + cpu_rq(i)->efficiency = efficiency; + + if (efficiency > max) + max = efficiency; + if (efficiency < min) + min = efficiency; + } + + if (max) + max_possible_efficiency = max; + + if (min) + min_possible_efficiency = min; +} + +static void reset_task_stats(struct task_struct *p) +{ + u32 sum = 0; + + if (exiting_task(p)) + sum = EXITING_TASK_MARKER; + + memset(&p->ravg, 0, sizeof(struct ravg)); + /* Retain EXITING_TASK marker */ + p->ravg.sum_history[0] = sum; +} + +void walt_mark_task_starting(struct task_struct *p) +{ + u64 wallclock; + struct rq *rq = task_rq(p); + + if (!rq->window_start) { + reset_task_stats(p); + return; + } + + wallclock = walt_ktime_clock(); + p->ravg.mark_start = wallclock; +} + +void walt_set_window_start(struct rq *rq) +{ + int cpu = cpu_of(rq); + struct rq *sync_rq = cpu_rq(sync_cpu); + + if (rq->window_start) + return; + + if (cpu == sync_cpu) { + rq->window_start = walt_ktime_clock(); + } else { + raw_spin_unlock(&rq->lock); + double_rq_lock(rq, sync_rq); + rq->window_start = cpu_rq(sync_cpu)->window_start; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + raw_spin_unlock(&sync_rq->lock); + } + + rq->curr->ravg.mark_start = rq->window_start; +} + +void walt_migrate_sync_cpu(int cpu) +{ + if (cpu == sync_cpu) + sync_cpu = smp_processor_id(); +} + +void walt_fixup_busy_time(struct task_struct *p, int new_cpu) +{ + struct rq *src_rq = task_rq(p); + struct rq *dest_rq = cpu_rq(new_cpu); + u64 wallclock; + + if (!p->on_rq && p->state != TASK_WAKING) + return; + + if (exiting_task(p)) { + return; + } + + if (p->state == TASK_WAKING) + double_rq_lock(src_rq, dest_rq); + + wallclock = walt_ktime_clock(); + + walt_update_task_ravg(task_rq(p)->curr, task_rq(p), + TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(dest_rq->curr, dest_rq, + TASK_UPDATE, wallclock, 0); + + walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0); + + if (p->ravg.curr_window) { + src_rq->curr_runnable_sum -= p->ravg.curr_window; + dest_rq->curr_runnable_sum += p->ravg.curr_window; + } + + if (p->ravg.prev_window) { + src_rq->prev_runnable_sum -= p->ravg.prev_window; + dest_rq->prev_runnable_sum += p->ravg.prev_window; + } + + if ((s64)src_rq->prev_runnable_sum < 0) { + src_rq->prev_runnable_sum = 0; + WARN_ON(1); + } + if ((s64)src_rq->curr_runnable_sum < 0) { + src_rq->curr_runnable_sum = 0; + WARN_ON(1); + } + + trace_walt_migration_update_sum(src_rq, p); + trace_walt_migration_update_sum(dest_rq, p); + + if (p->state == TASK_WAKING) + 
double_rq_unlock(src_rq, dest_rq); +} + +/* Keep track of max/min capacity possible across CPUs "currently" */ +static void __update_min_max_capacity(void) +{ + int i; + int max = 0, min = INT_MAX; + + for_each_online_cpu(i) { + if (cpu_rq(i)->capacity > max) + max = cpu_rq(i)->capacity; + if (cpu_rq(i)->capacity < min) + min = cpu_rq(i)->capacity; + } + + max_capacity = max; + min_capacity = min; +} + +static void update_min_max_capacity(void) +{ + unsigned long flags; + int i; + + local_irq_save(flags); + for_each_possible_cpu(i) + raw_spin_lock(&cpu_rq(i)->lock); + + __update_min_max_capacity(); + + for_each_possible_cpu(i) + raw_spin_unlock(&cpu_rq(i)->lock); + local_irq_restore(flags); +} + +/* + * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that + * least efficient cpu gets capacity of 1024 + */ +static unsigned long capacity_scale_cpu_efficiency(int cpu) +{ + return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency; +} + +/* + * Return 'capacity' of a cpu in reference to cpu with lowest max_freq + * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. + */ +static unsigned long capacity_scale_cpu_freq(int cpu) +{ + return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq; +} + +/* + * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so + * that "most" efficient cpu gets a load_scale_factor of 1 + */ +static unsigned long load_scale_cpu_efficiency(int cpu) +{ + return DIV_ROUND_UP(1024 * max_possible_efficiency, + cpu_rq(cpu)->efficiency); +} + +/* + * Return load_scale_factor of a cpu in reference to cpu with best max_freq + * (max_possible_freq), so that one with best max_freq gets a load_scale_factor + * of 1. + */ +static unsigned long load_scale_cpu_freq(int cpu) +{ + return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq); +} + +static int compute_capacity(int cpu) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cpu); + capacity >>= 10; + + capacity *= capacity_scale_cpu_freq(cpu); + capacity >>= 10; + + return capacity; +} + +static int compute_load_scale_factor(int cpu) +{ + int load_scale = 1024; + + /* + * load_scale_factor accounts for the fact that task load + * is in reference to "best" performing cpu. Task's load will need to be + * scaled (up) by a factor to determine suitability to be placed on a + * (little) cpu. + */ + load_scale *= load_scale_cpu_efficiency(cpu); + load_scale >>= 10; + + load_scale *= load_scale_cpu_freq(cpu); + load_scale >>= 10; + + return load_scale; +} + +static int cpufreq_notifier_policy(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_policy *policy = (struct cpufreq_policy *)data; + int i, update_max = 0; + u64 highest_mpc = 0, highest_mplsf = 0; + const struct cpumask *cpus = policy->related_cpus; + unsigned int orig_min_max_freq = min_max_freq; + unsigned int orig_max_possible_freq = max_possible_freq; + /* Initialized to policy->max in case policy->related_cpus is empty! 
*/ + unsigned int orig_max_freq = policy->max; + + if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY && + val != CPUFREQ_CREATE_POLICY) + return 0; + + if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) { + update_min_max_capacity(); + return 0; + } + + for_each_cpu(i, policy->related_cpus) { + cpumask_copy(&cpu_rq(i)->freq_domain_cpumask, + policy->related_cpus); + orig_max_freq = cpu_rq(i)->max_freq; + cpu_rq(i)->min_freq = policy->min; + cpu_rq(i)->max_freq = policy->max; + cpu_rq(i)->cur_freq = policy->cur; + cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq; + } + + max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); + if (min_max_freq == 1) + min_max_freq = UINT_MAX; + min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); + BUG_ON(!min_max_freq); + BUG_ON(!policy->max); + + /* Changes to policy other than max_freq don't require any updates */ + if (orig_max_freq == policy->max) + return 0; + + /* + * A changed min_max_freq or max_possible_freq (possible during bootup) + * needs to trigger re-computation of load_scale_factor and capacity for + * all possible cpus (even those offline). It also needs to trigger + * re-computation of nr_big_task count on all online cpus. + * + * A changed rq->max_freq otoh needs to trigger re-computation of + * load_scale_factor and capacity for just the cluster of cpus involved. + * Since small task definition depends on max_load_scale_factor, a + * changed load_scale_factor of one cluster could influence + * classification of tasks in another cluster. Hence a changed + * rq->max_freq will need to trigger re-computation of nr_big_task + * count on all online cpus. + * + * While it should be sufficient for nr_big_tasks to be + * re-computed for only online cpus, we have inadequate context + * information here (in policy notifier) with regard to hotplug-safety + * context in which notification is issued. As a result, we can't use + * get_online_cpus() here, as it can lead to deadlock. Until cpufreq is + * fixed up to issue notification always in hotplug-safe context, + * re-compute nr_big_task for all possible cpus. + */ + + if (orig_min_max_freq != min_max_freq || + orig_max_possible_freq != max_possible_freq) { + cpus = cpu_possible_mask; + update_max = 1; + } + + /* + * Changed load_scale_factor can trigger reclassification of tasks as + * big or small. 
Make this change "atomic" so that tasks are accounted + * properly due to changed load_scale_factor + */ + for_each_cpu(i, cpus) { + struct rq *rq = cpu_rq(i); + + rq->capacity = compute_capacity(i); + rq->load_scale_factor = compute_load_scale_factor(i); + + if (update_max) { + u64 mpc, mplsf; + + mpc = div_u64(((u64) rq->capacity) * + rq->max_possible_freq, rq->max_freq); + rq->max_possible_capacity = (int) mpc; + + mplsf = div_u64(((u64) rq->load_scale_factor) * + rq->max_possible_freq, rq->max_freq); + + if (mpc > highest_mpc) { + highest_mpc = mpc; + cpumask_clear(&mpc_mask); + cpumask_set_cpu(i, &mpc_mask); + } else if (mpc == highest_mpc) { + cpumask_set_cpu(i, &mpc_mask); + } + + if (mplsf > highest_mplsf) + highest_mplsf = mplsf; + } + } + + if (update_max) { + max_possible_capacity = highest_mpc; + max_load_scale_factor = highest_mplsf; + } + + __update_min_max_capacity(); + + return 0; +} + +static int cpufreq_notifier_trans(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; + unsigned int cpu = freq->cpu, new_freq = freq->new; + unsigned long flags; + int i; + + if (val != CPUFREQ_POSTCHANGE) + return 0; + + BUG_ON(!new_freq); + + if (cpu_rq(cpu)->cur_freq == new_freq) + return 0; + + for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) { + struct rq *rq = cpu_rq(i); + + raw_spin_lock_irqsave(&rq->lock, flags); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, + walt_ktime_clock(), 0); + rq->cur_freq = new_freq; + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + return 0; +} + +static struct notifier_block notifier_policy_block = { + .notifier_call = cpufreq_notifier_policy +}; + +static struct notifier_block notifier_trans_block = { + .notifier_call = cpufreq_notifier_trans +}; + +static int register_sched_callback(void) +{ + int ret; + + ret = cpufreq_register_notifier(¬ifier_policy_block, + CPUFREQ_POLICY_NOTIFIER); + + if (!ret) + ret = cpufreq_register_notifier(¬ifier_trans_block, + CPUFREQ_TRANSITION_NOTIFIER); + + return 0; +} + +/* + * cpufreq callbacks can be registered at core_initcall or later time. + * Any registration done prior to that is "forgotten" by cpufreq. See + * initialization of variable init_cpufreq_transition_notifier_list_called + * for further information. + */ +core_initcall(register_sched_callback); + +void walt_init_new_task_load(struct task_struct *p) +{ + int i; + u32 init_load_windows = + div64_u64((u64)sysctl_sched_walt_init_task_load_pct * + (u64)walt_ravg_window, 100); + u32 init_load_pct = current->init_load_pct; + + p->init_load_pct = 0; + memset(&p->ravg, 0, sizeof(struct ravg)); + + if (init_load_pct) { + init_load_windows = div64_u64((u64)init_load_pct * + (u64)walt_ravg_window, 100); + } + + p->ravg.demand = init_load_windows; + for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) + p->ravg.sum_history[i] = init_load_windows; +} diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h new file mode 100644 index 000000000000..cabc193a683d --- /dev/null +++ b/kernel/sched/walt.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + */ + +#ifndef __WALT_H +#define __WALT_H + +#ifdef CONFIG_SCHED_WALT + +void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime); +void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p); +void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p); +void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p); +void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p); +void walt_fixup_busy_time(struct task_struct *p, int new_cpu); +void walt_init_new_task_load(struct task_struct *p); +void walt_mark_task_starting(struct task_struct *p); +void walt_set_window_start(struct rq *rq); +void walt_migrate_sync_cpu(int cpu); +void walt_init_cpu_efficiency(void); +u64 walt_ktime_clock(void); + +#else /* CONFIG_SCHED_WALT */ + +static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) { } +static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { } +static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { } +static inline void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p) { } +static inline void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p) { } +static inline void walt_fixup_busy_time(struct task_struct *p, int new_cpu) { } +static inline void walt_init_new_task_load(struct task_struct *p) { } +static inline void walt_mark_task_starting(struct task_struct *p) { } +static inline void walt_set_window_start(struct rq *rq) { } +static inline void walt_migrate_sync_cpu(int cpu) { } +static inline void walt_init_cpu_efficiency(void) { } +static inline u64 walt_ktime_clock(void) { return 0; } + +#endif /* CONFIG_SCHED_WALT */ + +extern unsigned int walt_disabled; + +#endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4457e107ec20..43c59da62f1e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -310,6 +310,29 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHED_WALT + { + .procname = "sched_use_walt_cpu_util", + .data = &sysctl_sched_use_walt_cpu_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_use_walt_task_util", + .data = &sysctl_sched_use_walt_task_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_walt_init_task_load_pct", + .data = &sysctl_sched_walt_init_task_load_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif { .procname = "sched_sync_hint_enable", .data = &sysctl_sched_sync_hint_enable, From 519c62750eb6ebbb5783315272398ced72d7a036 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Fri, 22 Jul 2016 13:21:15 +0100 Subject: [PATCH 262/607] sched/walt: Accounting for number of irqs pending on each core Schedules on a core whose irq count is less than a threshold. Improves I/O performance of EAS. 
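[Aside, not part of the patch] The mechanism added below accumulates irq servicing time per jiffy window in rq->cur_irqload, decays the history into rq->avg_irqload, and treats a CPU as having "high irqload" once that average crosses a new sysctl threshold (10 ms by default), at which point find_best_target() skips it. The stand-alone sketch here uses invented struct and helper names; only the arithmetic and the 10 ms default mirror the diff that follows.

#include <stdbool.h>
#include <stdint.h>

#define HIGH_IRQLOAD_NS (10ULL * 1000 * 1000)	/* 10 ms, the sysctl default below */

struct cpu_irqstat {			/* stand-in for the new rq fields */
	uint64_t cur_irqload;		/* irq time seen in the current jiffy window */
	uint64_t avg_irqload;		/* decayed history of previous windows */
	uint64_t irqload_ts;		/* jiffies stamp of the last update */
};

static void account_irqtime(struct cpu_irqstat *s, uint64_t now_jiffies,
			    uint64_t delta_ns)
{
	uint64_t nr_windows = now_jiffies - s->irqload_ts;

	if (nr_windows) {
		if (nr_windows < 10)
			/* same arithmetic as the patch: one 3/4 decay step */
			s->avg_irqload = s->avg_irqload * 3 * nr_windows /
					 (4 * nr_windows);
		else
			s->avg_irqload = 0;	/* long quiet period: drop history */
		s->avg_irqload += s->cur_irqload;
		s->cur_irqload = 0;
	}
	s->cur_irqload += delta_ns;
	s->irqload_ts = now_jiffies;
}

static bool cpu_high_irqload(const struct cpu_irqstat *s)
{
	return s->avg_irqload >= HIGH_IRQLOAD_NS;
}

int main(void)
{
	struct cpu_irqstat s = { 0, 0, 100 };

	account_irqtime(&s, 100, 4000000);	/* 4 ms of irq time, same window */
	account_irqtime(&s, 101, 8000000);	/* next window: 8 ms more */
	return cpu_high_irqload(&s) ? 1 : 0;	/* history is 4 ms, below 10 ms */
}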
Change-Id: I08ff7dd0d22502a0106fc636b1af2e6fe9e758b5 --- include/linux/sched/sysctl.h | 1 + kernel/sched/core.c | 5 +++ kernel/sched/cputime.c | 16 +++++++++ kernel/sched/fair.c | 7 +++- kernel/sched/sched.h | 3 ++ kernel/sched/walt.c | 65 ++++++++++++++++++++++++++++++++++++ kernel/sched/walt.h | 5 +++ kernel/sysctl.c | 7 ++++ 8 files changed, 108 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 710f58a28d63..d68e88c9d4d7 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -47,6 +47,7 @@ extern unsigned int sysctl_sched_cstate_aware; extern unsigned int sysctl_sched_use_walt_cpu_util; extern unsigned int sysctl_sched_use_walt_task_util; extern unsigned int sysctl_sched_walt_init_task_load_pct; +extern unsigned int sysctl_sched_walt_cpu_high_irqload; #endif enum sched_tunable_scaling { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b6f41850ed00..deb45d1b4e49 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7750,6 +7750,11 @@ void __init sched_init(void) rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; rq->max_idle_balance_cost = sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_WALT + rq->cur_irqload = 0; + rq->avg_irqload = 0; + rq->irqload_ts = 0; +#endif INIT_LIST_HEAD(&rq->cfs_tasks); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 05de80b48586..442a9f7a2832 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -5,6 +5,7 @@ #include #include #include "sched.h" +#include "walt.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -49,6 +50,10 @@ void irqtime_account_irq(struct task_struct *curr) unsigned long flags; s64 delta; int cpu; +#ifdef CONFIG_SCHED_WALT + u64 wallclock; + bool account = true; +#endif if (!sched_clock_irqtime) return; @@ -56,6 +61,9 @@ void irqtime_account_irq(struct task_struct *curr) local_irq_save(flags); cpu = smp_processor_id(); +#ifdef CONFIG_SCHED_WALT + wallclock = sched_clock_cpu(cpu); +#endif delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); __this_cpu_add(irq_start_time, delta); @@ -70,8 +78,16 @@ void irqtime_account_irq(struct task_struct *curr) __this_cpu_add(cpu_hardirq_time, delta); else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) __this_cpu_add(cpu_softirq_time, delta); +#ifdef CONFIG_SCHED_WALT + else + account = false; +#endif irq_time_write_end(); +#ifdef CONFIG_SCHED_WALT + if (account) + walt_account_irqtime(cpu, curr, delta, wallclock); +#endif local_irq_restore(flags); } EXPORT_SYMBOL_GPL(irqtime_account_irq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 79d54592b53a..0ba5653682e8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -61,6 +61,8 @@ unsigned int sysctl_sched_cstate_aware = 1; #ifdef CONFIG_SCHED_WALT unsigned int sysctl_sched_use_walt_cpu_util = 1; unsigned int sysctl_sched_use_walt_task_util = 1; +__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = + (10 * NSEC_PER_MSEC); #endif /* * The initial- and re-scaling of tunables is configurable @@ -4258,7 +4260,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) schedtune_enqueue_task(p, cpu_of(rq)); #endif /* CONFIG_SMP */ - hrtick_update(rq); } @@ -5627,6 +5628,10 @@ static inline int find_best_target(struct task_struct *p, bool boosted) if (new_util > capacity_orig_of(i)) continue; +#ifdef CONFIG_SCHED_WALT + if (walt_cpu_high_irqload(i)) + continue; +#endif /* * For boosted tasks we favor idle cpus unconditionally to * improve latency. 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bfd99f9bc6f9..911d3a94690e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -685,6 +685,9 @@ struct rq { u64 prev_runnable_sum; u64 nt_curr_runnable_sum; u64 nt_prev_runnable_sum; + u64 cur_irqload; + u64 avg_irqload; + u64 irqload_ts; #endif /* CONFIG_SCHED_WALT */ diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 1dff3d2e2358..b9ae8d5c4393 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -221,6 +221,71 @@ static int cpu_is_waiting_on_io(struct rq *rq) return atomic_read(&rq->nr_iowait); } +void walt_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags, nr_windows; + u64 cur_jiffies_ts; + + raw_spin_lock_irqsave(&rq->lock, flags); + + /* + * cputime (wallclock) uses sched_clock so use the same here for + * consistency. + */ + delta += sched_clock() - wallclock; + cur_jiffies_ts = get_jiffies_64(); + + if (is_idle_task(curr)) + walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(), + delta); + + nr_windows = cur_jiffies_ts - rq->irqload_ts; + + if (nr_windows) { + if (nr_windows < 10) { + /* Decay CPU's irqload by 3/4 for each window. */ + rq->avg_irqload *= (3 * nr_windows); + rq->avg_irqload = div64_u64(rq->avg_irqload, + 4 * nr_windows); + } else { + rq->avg_irqload = 0; + } + rq->avg_irqload += rq->cur_irqload; + rq->cur_irqload = 0; + } + + rq->cur_irqload += delta; + rq->irqload_ts = cur_jiffies_ts; + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + + +#define WALT_HIGH_IRQ_TIMEOUT 3 + +u64 walt_irqload(int cpu) { + struct rq *rq = cpu_rq(cpu); + s64 delta; + delta = get_jiffies_64() - rq->irqload_ts; + + /* + * Current context can be preempted by irq and rq->irqload_ts can be + * updated by irq context so that delta can be negative. + * But this is okay and we can safely return as this means there + * was recent irq occurrence. 
+ */ + + if (delta < WALT_HIGH_IRQ_TIMEOUT) + return rq->avg_irqload; + else + return 0; +} + +int walt_cpu_high_irqload(int cpu) { + return walt_irqload(cpu) >= sysctl_sched_walt_cpu_high_irqload; +} + static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, u64 irqtime, int event) { diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h index cabc193a683d..e181c87a928d 100644 --- a/kernel/sched/walt.h +++ b/kernel/sched/walt.h @@ -31,6 +31,11 @@ void walt_set_window_start(struct rq *rq); void walt_migrate_sync_cpu(int cpu); void walt_init_cpu_efficiency(void); u64 walt_ktime_clock(void); +void walt_account_irqtime(int cpu, struct task_struct *curr, u64 delta, + u64 wallclock); + +u64 walt_irqload(int cpu); +int walt_cpu_high_irqload(int cpu); #else /* CONFIG_SCHED_WALT */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 43c59da62f1e..2b89e8ff0688 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -332,6 +332,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "sched_walt_cpu_high_irqload", + .data = &sysctl_sched_walt_cpu_high_irqload, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif { .procname = "sched_sync_hint_enable", From dfc1151b46bb5862154a8be2be867dbc1c8aaeb8 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 30 Jun 2016 15:00:41 +0100 Subject: [PATCH 263/607] FIXUP: sched: fix set_cfs_cpu_capacity when WALT is in use The CPU utilization reported when WALT is in use already tracks the contributions due to RT and DL workloads. However, SchedFreq exposes different capacity update functions, one for each class, and does classes utilization internally at update_cpu_capacity_request() call time. This patch ensures that when WALT is in use, the: cpu_sched_capacity_reqs::cfs value is tracking just the load generated by SCHED_OTHER tasks. Change-Id: Ibd9c9a10874a1d91f62477034548f7664e57cd6a Signed-off-by: Patrick Bellasi --- kernel/sched/sched.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 911d3a94690e..3111d74a5a85 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1611,8 +1611,27 @@ void update_cpu_capacity_request(int cpu, bool request); static inline void set_cfs_cpu_capacity(int cpu, bool request, unsigned long capacity) { - if (per_cpu(cpu_sched_capacity_reqs, cpu).cfs != capacity) { - per_cpu(cpu_sched_capacity_reqs, cpu).cfs = capacity; + struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { + int rtdl = scr->rt + scr->dl; + /* + * WALT tracks the utilization of a CPU considering the load + * generated by all the scheduling classes. + * Since the following call to: + * update_cpu_capacity + * is already adding the RT and DL utilizations let's remove + * these contributions from the WALT signal. + */ + if (capacity > rtdl) + capacity -= rtdl; + else + capacity = 0; + } +#endif + if (scr->cfs != capacity) { + scr->cfs = capacity; update_cpu_capacity_request(cpu, request); } } From 782c9d64a1d12aaf3d6decf7b54e212343bb3f7e Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 29 Jun 2016 11:30:07 -0700 Subject: [PATCH 264/607] sched: EAS: Avoid causing spikes to max-freq unnecessarily During scheduler tick handling, the frequency was being set to max-freq if the current frequency is less than the current utilization. 
Change to just request "right" frequency instead of max. BUG: 29871410 Change-Id: I6fe65b14413da44b1520ba116f72320083eb92f8 --- kernel/sched/core.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index deb45d1b4e49..f07615a1674f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2940,7 +2940,7 @@ static unsigned long sum_capacity_reqs(unsigned long cfs_cap, static void sched_freq_tick(int cpu) { struct sched_capacity_reqs *scr; - unsigned long capacity_orig, capacity_curr; + unsigned long capacity_orig, capacity_curr, capacity_sum; if (!sched_freq()) return; @@ -2953,12 +2953,15 @@ static void sched_freq_tick(int cpu) /* * To make free room for a task that is building up its "real" * utilization and to harm its performance the least, request - * a jump to max OPP as soon as the margin of free capacity is - * impacted (specified by capacity_margin). + * a jump to a higher OPP as soon as the margin of free capacity + * is impacted (specified by capacity_margin). */ + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); - if (capacity_curr < sum_capacity_reqs(cpu_util(cpu), scr)) - set_cfs_cpu_capacity(cpu, true, capacity_max); + capacity_sum = sum_capacity_reqs(cpu_util(cpu), scr); + if (capacity_curr < capacity_sum) { + set_cfs_cpu_capacity(cpu, true, capacity_sum); + } } #else static inline void sched_freq_tick(int cpu) { } From 5156b672040dfd29ba141e02689e1e92e34d525f Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 30 Jun 2016 15:09:24 +0100 Subject: [PATCH 265/607] FIXUP: sched: fix SchedFreq integration for both PELT and WALT The current kernel allows to use either PELT or WALT to track CPUs utilizations. One of the main differences between the two approaches is that PELT tracks only utilization of SCHED_OTHER classes while WALT tracks all tasks with a single signal. The current sched_freq_tick does not make this distinction and, when WALT is in use, we end up adding multiple time the contribution related to the RT and DL classes. This patch fixes this issue by: 1. providing two different code paths for PELT and WALT, thus granting that when we switch to PELT we get the original behaviour based on the assumption that class aggregations is done underneath by SchedFreq. 2. avoiding the double accounting of DL and RT workloads, when WALT is in use, by just adding a margin to the original WALT signal when we need to check if the CFS capacity has to be increased. 
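[Aside, not part of the patch] To make the margin arithmetic of this fixup concrete, here is a small stand-alone calculation, not kernel code. It assumes capacity_margin has its conventional value of 1280 (1280/1024 = 1.25); that constant is defined elsewhere in fair.c and is not introduced here. In the WALT path below, an OPP increase is requested only once the margin-inflated utilization exceeds the capacity of the current OPP.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL
#define CAPACITY_MARGIN      1280UL	/* assumed value: 25% inflation */

static unsigned long add_capacity_margin(unsigned long cap)
{
	return cap * CAPACITY_MARGIN / SCHED_CAPACITY_SCALE;
}

int main(void)
{
	unsigned long capacity_curr = 512;	/* capacity at the current OPP */
	unsigned long util;

	for (util = 380; util <= 440; util += 20) {
		unsigned long inflated = add_capacity_margin(util);

		printf("util=%lu inflated=%lu -> %s\n", util, inflated,
		       inflated > capacity_curr ? "request higher OPP"
						 : "stay at current OPP");
	}
	return 0;
}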
Change-Id: I7326fd50e868e97fb5e12351917e9d2969bfdae7 Signed-off-by: Patrick Bellasi --- kernel/sched/core.c | 91 +++++++++++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 23 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f07615a1674f..02743ffe7ff3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2926,21 +2926,77 @@ unsigned long long task_sched_runtime(struct task_struct *p) } #ifdef CONFIG_CPU_FREQ_GOV_SCHED -static unsigned long sum_capacity_reqs(unsigned long cfs_cap, - struct sched_capacity_reqs *scr) -{ - unsigned long total = cfs_cap + scr->rt; - total = total * capacity_margin; - total /= SCHED_CAPACITY_SCALE; - total += scr->dl; - return total; +static inline +unsigned long add_capacity_margin(unsigned long cpu_capacity) +{ + cpu_capacity = cpu_capacity * capacity_margin; + cpu_capacity /= SCHED_CAPACITY_SCALE; + return cpu_capacity; } +static inline +unsigned long sum_capacity_reqs(unsigned long cfs_cap, + struct sched_capacity_reqs *scr) +{ + unsigned long total = add_capacity_margin(cfs_cap + scr->rt); + return total += scr->dl; +} + +static void sched_freq_tick_pelt(int cpu) +{ + unsigned long cpu_utilization = capacity_max; + unsigned long capacity_curr = capacity_curr_of(cpu); + struct sched_capacity_reqs *scr; + + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + if (sum_capacity_reqs(cpu_utilization, scr) < capacity_curr) + return; + + /* + * To make free room for a task that is building up its "real" + * utilization and to harm its performance the least, request + * a jump to a higher OPP as soon as the margin of free capacity + * is impacted (specified by capacity_margin). + */ + set_cfs_cpu_capacity(cpu, true, cpu_utilization); +} + +#ifdef CONFIG_SCHED_WALT +static void sched_freq_tick_walt(int cpu) +{ + unsigned long cpu_utilization = cpu_util(cpu); + unsigned long capacity_curr = capacity_curr_of(cpu); + + if (walt_disabled || !sysctl_sched_use_walt_cpu_util) + return sched_freq_tick_pelt(cpu); + + /* + * Add a margin to the WALT utilization. + * NOTE: WALT tracks a single CPU signal for all the scheduling + * classes, thus this margin is going to be added to the DL class as + * well, which is something we do not do in sched_freq_tick_pelt case. + */ + cpu_utilization = add_capacity_margin(cpu_utilization); + if (cpu_utilization <= capacity_curr) + return; + + /* + * It is likely that the load is growing so we + * keep the added margin in our request as an + * extra boost. + */ + set_cfs_cpu_capacity(cpu, true, cpu_utilization); + +} +#define _sched_freq_tick(cpu) sched_freq_tick_walt(cpu) +#else +#define _sched_freq_tick(cpu) sched_freq_tick_pelt(cpu) +#endif /* CONFIG_SCHED_WALT */ + static void sched_freq_tick(int cpu) { - struct sched_capacity_reqs *scr; - unsigned long capacity_orig, capacity_curr, capacity_sum; + unsigned long capacity_orig, capacity_curr; if (!sched_freq()) return; @@ -2950,22 +3006,11 @@ static void sched_freq_tick(int cpu) if (capacity_curr == capacity_orig) return; - /* - * To make free room for a task that is building up its "real" - * utilization and to harm its performance the least, request - * a jump to a higher OPP as soon as the margin of free capacity - * is impacted (specified by capacity_margin). 
- */ - - scr = &per_cpu(cpu_sched_capacity_reqs, cpu); - capacity_sum = sum_capacity_reqs(cpu_util(cpu), scr); - if (capacity_curr < capacity_sum) { - set_cfs_cpu_capacity(cpu, true, capacity_sum); - } + _sched_freq_tick(cpu); } #else static inline void sched_freq_tick(int cpu) { } -#endif +#endif /* CONFIG_CPU_FREQ_GOV_SCHED */ /* * This function gets called by the timer code, with HZ frequency. From 740d312ce8aae83956ed1b34b8a4e72b60b04a8f Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 16 Jun 2016 16:33:54 -0700 Subject: [PATCH 266/607] FIXUP: sched/fair: Fix hang during suspend in sched_group_energy BUG: 29353986 Change-Id: I0d0d8d5c107a2e0bd219819e036091106bb40e11 --- kernel/sched/fair.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0ba5653682e8..0b065ff2f4f3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4980,6 +4980,7 @@ static int sched_group_energy(struct energy_env *eenv) } while (sg = sg->next, sg != sd->groups); } next_cpu: + cpumask_clear_cpu(cpu, &visit_cpus); continue; } From 8935b6b4d230ae7969ccdfaf7baf09d818f88c8b Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 4 Jul 2016 15:04:45 +0100 Subject: [PATCH 267/607] FIXUP: sched: Fix double-release of spinlock in move_queued_task BUG: 29519455 Change-Id: I4d1c27a1b4bcbba03d4b175d170cfe1701a90ffd --- kernel/sched/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3111d74a5a85..4e2e44e3cc7b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1832,7 +1832,8 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { - raw_spin_unlock(&busiest->lock); + if (this_rq != busiest) + raw_spin_unlock(&busiest->lock); lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); } From 23ed57dbcc14871af54db667d460aba64eaf6efe Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Mon, 25 Jul 2016 15:13:58 +0100 Subject: [PATCH 268/607] arch_timer: add error handling when the MPM global timer is cleared Bug: 29000863 Signed-off-by: albert.zl_huang Change-Id: I2b5a28b0a9edb31bdaa1ca2310397dd2f36f6c23 Updated to use arch_timer_read_counter() as arch_counter_get_cntvct doesn't exist in this kernel. Signed-off-by: Chris Redpath --- kernel/sched/walt.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index b9ae8d5c4393..d9d09914ce30 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -185,7 +185,14 @@ update_window_start(struct rq *rq, u64 wallclock) int nr_windows; delta = wallclock - rq->window_start; - BUG_ON(delta < 0); + /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */ + if (delta < 0) { + if (arch_timer_read_counter() == 0) + delta = 0; + else + BUG_ON(1); + } + if (delta < walt_ravg_window) return; From d4cda03828f5c8eae35efcb08f520f8f1a35950e Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 13 Jul 2016 16:13:47 -0700 Subject: [PATCH 269/607] sched: use util instead of capacity to select busy cpu If cpus are busy, the cpu selection algorithm was favoring cpus with lower capacity. This can result in uneven packing since there will be a bias toward the same cpu until there is a capacity change. 
Instead use the utilization so there is immediate feedback as tasks are assigned BUG: 30115868 Change-Id: I0ac7ae3ab5d8f2f5a5838c29bb6da2c3e8ef44e8 --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0b065ff2f4f3..7b6e95aa7360 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5593,7 +5593,7 @@ static inline int find_best_target(struct task_struct *p, bool boosted) { int iter_cpu; int target_cpu = -1; - int target_capacity = 0; + int target_util = 0; int backup_capacity = 0; int best_idle_cpu = -1; int best_idle_cstate = INT_MAX; @@ -5649,10 +5649,10 @@ static inline int find_best_target(struct task_struct *p, bool boosted) if (new_util < cur_capacity) { if (cpu_rq(i)->nr_running) { - if (target_capacity == 0 || - target_capacity > cur_capacity) { + if (target_util == 0 || + target_util > new_util) { target_cpu = i; - target_capacity = cur_capacity; + target_util = new_util; } } else if (!boosted) { if (best_idle_cpu < 0 || From c5a00c2dad8d161da3c2086cccd6375d8ad5b04f Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Thu, 14 Jul 2016 13:09:03 -0700 Subject: [PATCH 270/607] sched/tune: Introducing a new schedtune attribute prefer_idle Hint to enable biasing of tasks towards idle cpus, even when a given task is negatively boosted. The mechanism allows upto 20% reduction in camera power without hurting performance. bug: 28312446 Change-Id: I97ea5671aa1e6bcb165408b41e17bc82e41c2c9e --- kernel/sched/fair.c | 23 +++++++++++++---------- kernel/sched/tune.c | 42 ++++++++++++++++++++++++++++++++++++++++++ kernel/sched/tune.h | 2 ++ 3 files changed, 57 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7b6e95aa7360..e099ce747345 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5589,7 +5589,7 @@ done: return target; } -static inline int find_best_target(struct task_struct *p, bool boosted) +static inline int find_best_target(struct task_struct *p, bool prefer_idle) { int iter_cpu; int target_cpu = -1; @@ -5607,9 +5607,9 @@ static inline int find_best_target(struct task_struct *p, bool boosted) int idle_idx; /* - * favor higher cpus for boosted tasks + * favor higher cpus for tasks that prefer idle cores */ - int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu; + int i = prefer_idle ? NR_CPUS-iter_cpu-1 : iter_cpu; if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p))) continue; @@ -5634,10 +5634,10 @@ static inline int find_best_target(struct task_struct *p, bool boosted) continue; #endif /* - * For boosted tasks we favor idle cpus unconditionally to + * Unconditionally favoring tasks that prefer idle cpus to * improve latency. */ - if (idle_cpu(i) && boosted) { + if (idle_cpu(i) && prefer_idle) { if (best_idle_cpu < 0) best_idle_cpu = i; continue; @@ -5654,7 +5654,7 @@ static inline int find_best_target(struct task_struct *p, bool boosted) target_cpu = i; target_util = new_util; } - } else if (!boosted) { + } else if (!prefer_idle) { if (best_idle_cpu < 0 || (sysctl_sched_cstate_aware && best_idle_cstate > idle_idx)) { @@ -5669,7 +5669,7 @@ static inline int find_best_target(struct task_struct *p, bool boosted) } } - if (boosted && best_idle_cpu >= 0) + if (prefer_idle && best_idle_cpu >= 0) target_cpu = best_idle_cpu; else if (target_cpu < 0) target_cpu = best_idle_cpu >= 0 ? 
best_idle_cpu : backup_cpu; @@ -5761,14 +5761,17 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) */ #ifdef CONFIG_CGROUP_SCHEDTUNE bool boosted = schedtune_task_boost(p) > 0; + bool prefer_idle = schedtune_prefer_idle(p) > 0; #else bool boosted = 0; + bool prefer_idle = 0; #endif - int tmp_target = find_best_target(p, boosted); - if (tmp_target >= 0) + int tmp_target = find_best_target(p, boosted || prefer_idle); + if (tmp_target >= 0) { target_cpu = tmp_target; - if (boosted && idle_cpu(target_cpu)) + if ((boosted || prefer_idle) && idle_cpu(target_cpu)) return target_cpu; + } } if (target_cpu != task_cpu(p)) { diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index d24f365b0c90..644f8e9ee96f 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -125,6 +125,10 @@ struct schedtune { /* Performance Constraint (C) region threshold params */ int perf_constrain_idx; + + /* Hint to bias scheduling of tasks on that SchedTune CGroup + * towards idle CPUs */ + int prefer_idle; }; static inline struct schedtune *css_st(struct cgroup_subsys_state *css) @@ -156,6 +160,7 @@ root_schedtune = { .boost = 0, .perf_boost_idx = 0, .perf_constrain_idx = 0, + .prefer_idle = 0, }; int @@ -536,6 +541,38 @@ int schedtune_task_boost(struct task_struct *p) return task_boost; } +int schedtune_prefer_idle(struct task_struct *p) +{ + struct schedtune *st; + int prefer_idle; + + /* Get prefer_idle value */ + rcu_read_lock(); + st = task_schedtune(p); + prefer_idle = st->prefer_idle; + rcu_read_unlock(); + + return prefer_idle; +} + +static u64 +prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->prefer_idle; +} + +static int +prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 prefer_idle) +{ + struct schedtune *st = css_st(css); + st->prefer_idle = prefer_idle; + + return 0; +} + static s64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -587,6 +624,11 @@ static struct cftype files[] = { .read_s64 = boost_read, .write_s64 = boost_write, }, + { + .name = "prefer_idle", + .read_u64 = prefer_idle_read, + .write_u64 = prefer_idle_write, + }, { } /* terminate */ }; diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index be1785eb1c5b..4f6441771e4c 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -17,6 +17,8 @@ struct target_nrg { int schedtune_cpu_boost(int cpu); int schedtune_task_boost(struct task_struct *tsk); +int schedtune_prefer_idle(struct task_struct *tsk); + void schedtune_exit_task(struct task_struct *tsk); void schedtune_enqueue_task(struct task_struct *p, int cpu); From 93db70f21c7bf04dfe5bb3345400da8f97c3ad29 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 10 Feb 2016 09:24:36 +0000 Subject: [PATCH 271/607] DEBUG: sched: add tracepoint for RD overutilized Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 20 ++++++++++++++++++++ kernel/sched/fair.c | 17 +++++++++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 33b1c719c9d7..8006e3a46723 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -888,6 +888,26 @@ TRACE_EVENT(sched_tune_filter, __entry->payoff, __entry->region) ); +/* + * Tracepoint for system overutilized flag + */ +TRACE_EVENT(sched_overutilized, + + TP_PROTO(bool overutilized), + + TP_ARGS(overutilized), + + TP_STRUCT__entry( + __field( bool, overutilized ) + ), + + 
TP_fast_assign( + __entry->overutilized = overutilized; + ), + + TP_printk("overutilized=%d", + __entry->overutilized ? 1 : 0) +); #ifdef CONFIG_SCHED_WALT struct rq; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e099ce747345..b84277a8530d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4242,8 +4242,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) { walt_inc_cumulative_runnable_avg(rq, p); if (!task_new && !rq->rd->overutilized && - cpu_overutilized(rq->cpu)) + cpu_overutilized(rq->cpu)) { rq->rd->overutilized = true; + trace_sched_overutilized(true); + } /* * We want to potentially trigger a freq switch @@ -7503,12 +7505,17 @@ next_group: env->dst_rq->rd->overload = overload; /* Update over-utilization (tipping point, U >= 0) indicator */ - if (env->dst_rq->rd->overutilized != overutilized) + if (env->dst_rq->rd->overutilized != overutilized) { env->dst_rq->rd->overutilized = overutilized; + trace_sched_overutilized(overutilized); + } } else { - if (!env->dst_rq->rd->overutilized && overutilized) + if (!env->dst_rq->rd->overutilized && overutilized) { env->dst_rq->rd->overutilized = true; + trace_sched_overutilized(true); + } } + } /** @@ -8948,8 +8955,10 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr); #ifdef CONFIG_SMP - if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) + if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) { rq->rd->overutilized = true; + trace_sched_overutilized(true); + } rq->misfit_task = !task_fits_max(curr, rq->cpu); #endif From b9534b8f0128efdb00865af03106a45ce9d489cb Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 29 Jul 2016 16:09:03 +0100 Subject: [PATCH 272/607] FIXUP: sched/tune: do initialization as a postcore_initicall SchedTune needs to walk the scheduling domains to compute the energy normalization constants used for PE space filtering. To build such constants we need the energy model data for each CPU in the system. However, by walking the SDs as a late initcall stage, the userspace has been already initialized and it could happen that some CPUs are hotplugged out. For example, this could happen if a user-space thermal manager daemon detects that CPUs are to much hot during the boot process. To avoid such a race condition we can anticipate the SchedTune initialization code to be a postcore_initicall. This allows to keep the SchedTune initialization code as simple as an initcall while still safely relaying on SDs provided data. Such calls are executed before user-space is initialized and thus, apart from the case of unlucky early-init kernel space generated hotplugs, this solution should be safe enough to get all the data we need. 
Signed-off-by: Patrick Bellasi [jstultz: fwdported to 4.4] Signed-off-by: John Stultz --- kernel/sched/tune.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 644f8e9ee96f..bd7f319ce53e 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -946,4 +946,4 @@ nodata: rcu_read_unlock(); return -EINVAL; } -late_initcall(schedtune_init); +postcore_initcall(schedtune_init); From c80a9af21aece64d3262900c78bdf39f2fdd6c2e Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Tue, 2 Aug 2016 14:05:46 -0700 Subject: [PATCH 273/607] sched/fair: Picking cpus with low OPPs for tasks that prefer idle CPUs When idle cpus cannot be found for Top-app/FG tasks, the cpu selection algorithm picks a cpu with lowest OPP amongst the busy cpus as a second choice. Mitigates the "runnable" time for ui and render threads. bug: 30481949 bug: 30342017 bug: 30508678 Change-Id: I5a97e31d33284895c0fa6f6942102713ee576d77 --- kernel/sched/fair.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b84277a8530d..b7b1f2759278 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5651,10 +5651,22 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle) if (new_util < cur_capacity) { if (cpu_rq(i)->nr_running) { - if (target_util == 0 || - target_util > new_util) { - target_cpu = i; - target_util = new_util; + if(prefer_idle) { + // Find a target cpu with lowest + // utilization. + if (target_util == 0 || + target_util < new_util) { + target_cpu = i; + target_util = new_util; + } + } else { + // Find a target cpu with highest + // utilization. + if (target_util == 0 || + target_util > new_util) { + target_cpu = i; + target_util = new_util; + } } } else if (!prefer_idle) { if (best_idle_cpu < 0 || @@ -5666,6 +5678,7 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle) } } else if (backup_capacity == 0 || backup_capacity > cur_capacity) { + // Find a backup cpu with least capacity. backup_capacity = cur_capacity; backup_cpu = i; } From bf93a3672c5e4c1b803958dc6c038f0b163df5cf Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 4 Aug 2016 12:20:04 +0100 Subject: [PATCH 274/607] sched/cpufreq_sched: fix thermal capping events cpufreq_sched_limits (called when CPUFREQ_GOV_LIMITS event happens) bails out if policy->rwsem is already locked. However, that rwsem is always guaranteed to be locked when we get here after a thermal throttling event happens: th_throttling -> cpufreq_update_policy() ... down_write(&policy->rwsem); ... cpufreq_set_policy() -> ... __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); -> cpufreq_sched_limits() ... if (!down_write_trylock(&policy->rwsem)) return; <-- BAIL OUT! So, we don't currently react immediately to thermal capping event (even if reaction is still quick in practice, ~1ms, as lots of events are likely to trigger a frequency selection on a high loaded system). Fix this bug by removing the bail out condition. While we are at it we also slightly change handling of the new limits by clamping the last requested_freq between policy's max and min. Doing so gives us the oppurtunity to correctly restore the last requested frequency as soon as a thermal unthrottling event happens. 
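[Aside, not part of the patch] The clamping behaviour described above is easy to see in isolation. The frequencies below are invented and the clamp macro is a local stand-in for the kernel's clamp(): while a thermal cap is active the stored requested_freq is limited to policy->max, and as soon as the cap is lifted the same stored request is applied again.

#include <stdio.h>

#define clamp(val, lo, hi) ((val) < (lo) ? (lo) : ((val) > (hi) ? (hi) : (val)))

int main(void)
{
	unsigned int requested_freq = 1401600;	/* kHz, last governor request */
	unsigned int policy_min = 307200;	/* kHz */

	/* thermally throttled: policy->max capped below the request */
	printf("capped:   %u kHz\n", clamp(requested_freq, policy_min, 902400u));

	/* cap lifted: the remembered request is restored unchanged */
	printf("uncapped: %u kHz\n", clamp(requested_freq, policy_min, 2150400u));
	return 0;
}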
bug: 30481949 Change-Id: I3c13e818f238c1ffa66b34e419e8b87314b57427 Suggested-by: Javi Merino Signed-off-by: Juri Lelli Signed-off-by: Srinath Sridharan [jstultz: fwdported to 4.4] Signed-off-by: John Stultz --- kernel/sched/cpufreq_sched.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index 3f8c67a3ea0f..4fea269a6598 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -58,7 +58,6 @@ struct gov_data { struct task_struct *task; struct irq_work irq_work; unsigned int requested_freq; - int max; }; static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, @@ -193,7 +192,7 @@ static void update_fdomain_capacity_request(int cpu) } /* Convert the new maximum capacity request into a cpu frequency */ - freq_new = capacity * gd->max >> SCHED_CAPACITY_SHIFT; + freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT; if (cpufreq_frequency_table_target(policy, policy->freq_table, freq_new, CPUFREQ_RELATION_L, &index_new)) @@ -288,8 +287,6 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) pr_debug("%s: throttle threshold = %u [ns]\n", __func__, gd->up_throttle_nsec); - gd->max = policy->max; - rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr()); if (rc) { pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc); @@ -352,28 +349,17 @@ static int cpufreq_sched_start(struct cpufreq_policy *policy) static void cpufreq_sched_limits(struct cpufreq_policy *policy) { - struct gov_data *gd; + unsigned int clamp_freq; + struct gov_data *gd = policy->governor_data;; pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n", policy->cpu, policy->min, policy->max, policy->cur); - if (!down_write_trylock(&policy->rwsem)) - return; - /* - * Need to keep track of highest max frequency for - * capacity calculations - */ - gd = policy->governor_data; - if (gd->max < policy->max) - gd->max = policy->max; + clamp_freq = clamp(gd->requested_freq, policy->min, policy->max); - if (policy->max < policy->cur) - __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); - else if (policy->min > policy->cur) - __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); - - up_write(&policy->rwsem); + if (policy->cur != clamp_freq) + __cpufreq_driver_target(policy, clamp_freq, CPUFREQ_RELATION_L); } static int cpufreq_sched_stop(struct cpufreq_policy *policy) From 74b4fa8e5cfe7655adb40bc88d6d88f916cee250 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 13 May 2016 11:54:04 +0100 Subject: [PATCH 275/607] sched/fair: call OPP update when going idle after migration When a task leaves a rq because it is migrated away it carries its utilization with him. In this case and OPP update on the src rq might be needed. The corresponding update at dst rq will happen at enqueue time. Change-Id: I22754a43760fc8d22a488fe15044af93787ea7a8 sched/fair: Fix uninitialised variable in idle_balance compiler warned, looks legit. 
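[Aside, not part of the patch] A stand-alone sketch of the decision this change adds at the end of idle_balance(); update_capacity_of() is a stub standing in for the kernel helper of the same name, and the utilization value is invented.

#include <stdio.h>

static void update_capacity_of(int cpu)		/* stub for the kernel helper */
{
	printf("cpu%d: re-evaluate OPP\n", cpu);
}

static void idle_balance_tail(int cpu, int pulled_task, long removed_util)
{
	if (pulled_task)
		return;		/* the pulled task's enqueue drives the OPP update */
	if (removed_util)
		update_capacity_of(cpu);	/* a migration freed capacity here */
}

int main(void)
{
	idle_balance_tail(1, 0, 137);	/* util migrated away, nothing pulled */
	idle_balance_tail(2, 1, 137);	/* pulled a task: no explicit update */
	return 0;
}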
Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b7b1f2759278..d9af0384dbb0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8322,6 +8322,7 @@ static int idle_balance(struct rq *this_rq) struct sched_domain *sd; int pulled_task = 0; u64 curr_cost = 0; + long removed_util=0; idle_enter_fair(this_rq); @@ -8345,6 +8346,17 @@ static int idle_balance(struct rq *this_rq) raw_spin_unlock(&this_rq->lock); + /* + * If removed_util_avg is !0 we most probably migrated some task away + * from this_cpu. In this case we might be willing to trigger an OPP + * update, but we want to do so if we don't find anybody else to pull + * here (we will trigger an OPP update with the pulled task's enqueue + * anyway). + * + * Record removed_util before calling update_blocked_averages, and use + * it below (before returning) to see if an OPP update is required. + */ + removed_util = atomic_long_read(&(this_rq->cfs).removed_util_avg); update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { @@ -8409,6 +8421,12 @@ out: if (pulled_task) { idle_exit_fair(this_rq); this_rq->idle_stamp = 0; + } else if (removed_util) { + /* + * No task pulled and someone has been migrated away. + * Good case to trigger an OPP update. + */ + update_capacity_of(this_cpu); } return pulled_task; From 142b2acc79777f3cac93144d7ebf9caa53e97f9b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 14 Jan 2016 15:21:40 -0800 Subject: [PATCH 276/607] vmstat: make vmstat_updater deferrable again and shut down on idle Currently the vmstat updater is not deferrable as a result of commit ba4877b9ca51 ("vmstat: do not use deferrable delayed work for vmstat_update"). This in turn can cause multiple interruptions of the applications because the vmstat updater may run at Make vmstate_update deferrable again and provide a function that folds the differentials when the processor is going to idle mode thus addressing the issue of the above commit in a clean way. Note that the shepherd thread will continue scanning the differentials from another processor and will reenable the vmstat workers if it detects any changes. 
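[Aside, not part of the patch] As a toy model only (none of these names are the kernel API): the new quiet_vmstat() path cancels this CPU's deferred vmstat worker and then keeps folding the local counter differentials until a pass finds nothing left to fold, which is what lets the CPU enter NOHZ idle without a pending vmstat timer.

#include <stdbool.h>
#include <stdio.h>

static int local_diff = 3;		/* pretend per-cpu counter deltas */

static bool fold_local_diffs(void)	/* models refresh_cpu_vm_stats(false) */
{
	if (!local_diff)
		return false;		/* nothing changed in this pass */
	printf("folding %d pending deltas into the global counters\n", local_diff);
	local_diff = 0;			/* interrupts could add new deltas here */
	return true;
}

int main(void)
{
	printf("cancel this cpu's deferred vmstat work\n");
	while (fold_local_diffs())
		;			/* repeat until a pass stays at zero */
	return 0;
}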
Change-Id: Idf256cfacb40b4dc8dbb6795cf06b34e8fec7a06 Fixes: ba4877b9ca51 ("vmstat: do not use deferrable delayed work for vmstat_update") Signed-off-by: Christoph Lameter Cc: Michal Hocko Cc: Johannes Weiner Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Git-repo: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git Git-commit: 0eb77e9880321915322d42913c3b53241739c8aa [shashim@codeaurora.org: resolve minor merge conflicts] Signed-off-by: Shiraz Hashim [jstultz: fwdport to 4.4] Signed-off-by: John Stultz --- include/linux/vmstat.h | 2 ++ kernel/sched/idle.c | 1 + mm/vmstat.c | 69 +++++++++++++++++++++++++++--------------- 3 files changed, 47 insertions(+), 25 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 3e5d9075960f..73fae8c4a5fb 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -189,6 +189,7 @@ extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); +void quiet_vmstat(void); void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); @@ -249,6 +250,7 @@ static inline void __dec_zone_page_state(struct page *page, static inline void refresh_zone_stat_thresholds(void) { } static inline void cpu_vm_stats_fold(int cpu) { } +static inline void quiet_vmstat(void) { } static inline void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) { } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cbc130efbc5b..917c94abf5bb 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -220,6 +220,7 @@ static void cpu_idle_loop(void) */ __current_set_polling(); + quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { diff --git a/mm/vmstat.c b/mm/vmstat.c index c54fd2924f25..83a003bc3cae 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -460,7 +460,7 @@ static int fold_diff(int *diff) * * The function returns the number of global counters updated. */ -static int refresh_cpu_vm_stats(void) +static int refresh_cpu_vm_stats(bool do_pagesets) { struct zone *zone; int i; @@ -484,33 +484,35 @@ static int refresh_cpu_vm_stats(void) #endif } } - cond_resched(); #ifdef CONFIG_NUMA - /* - * Deal with draining the remote pageset of this - * processor - * - * Check if there are pages remaining in this pageset - * if not then there is nothing to expire. - */ - if (!__this_cpu_read(p->expire) || + if (do_pagesets) { + cond_resched(); + /* + * Deal with draining the remote pageset of this + * processor + * + * Check if there are pages remaining in this pageset + * if not then there is nothing to expire. + */ + if (!__this_cpu_read(p->expire) || !__this_cpu_read(p->pcp.count)) - continue; + continue; - /* - * We never drain zones local to this processor. - */ - if (zone_to_nid(zone) == numa_node_id()) { - __this_cpu_write(p->expire, 0); - continue; - } + /* + * We never drain zones local to this processor. 
+ */ + if (zone_to_nid(zone) == numa_node_id()) { + __this_cpu_write(p->expire, 0); + continue; + } - if (__this_cpu_dec_return(p->expire)) - continue; + if (__this_cpu_dec_return(p->expire)) + continue; - if (__this_cpu_read(p->pcp.count)) { - drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); - changes++; + if (__this_cpu_read(p->pcp.count)) { + drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); + changes++; + } } #endif } @@ -1386,7 +1388,7 @@ static cpumask_var_t cpu_stat_off; static void vmstat_update(struct work_struct *w) { - if (refresh_cpu_vm_stats()) { + if (refresh_cpu_vm_stats(true)) { /* * Counters were updated so we expect more updates * to occur in the future. Keep on running the @@ -1417,6 +1419,23 @@ static void vmstat_update(struct work_struct *w) } } +/* + * Switch off vmstat processing and then fold all the remaining differentials + * until the diffs stay at zero. The function is used by NOHZ and can only be + * invoked when tick processing is not active. + */ +void quiet_vmstat(void) +{ + if (system_state != SYSTEM_RUNNING) + return; + + do { + if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off)) + cancel_delayed_work(this_cpu_ptr(&vmstat_work)); + + } while (refresh_cpu_vm_stats(false)); +} + /* * Check if the diffs for a certain cpu indicate that * an update is needed. @@ -1449,7 +1468,7 @@ static bool need_update(int cpu) */ static void vmstat_shepherd(struct work_struct *w); -static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd); +static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd); static void vmstat_shepherd(struct work_struct *w) { From 07ec7db165f1990ca563e18888e82991eac1c5e5 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Fri, 29 Jul 2016 17:50:11 +0100 Subject: [PATCH 277/607] sched/fair: Favor higher cpus only for boosted tasks This CL separates the notion of boost and prefer_idle schedtune attributes in cpu selection. Today only top-app tasks are boosted. The CPU selection is slightly tweaked such that higher order cpus are preferred only for boosted tasks (top-app) and the rest would be skewed towards lower order cpus. This avoids starvation issues for fg tasks when interacting with high priority top-app tasks (a problem often seen in the case of system_server). bug: 30245369 bug: 30292998 Change-Id: I0377e00893b9f6586eec55632a265518fd2fa8a1 Conflicts: kernel/sched/fair.c --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d9af0384dbb0..da9323b9ecab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5591,7 +5591,7 @@ done: return target; } -static inline int find_best_target(struct task_struct *p, bool prefer_idle) +static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle) { int iter_cpu; int target_cpu = -1; @@ -5609,9 +5609,9 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle) int idle_idx; /* - * favor higher cpus for tasks that prefer idle cores + * Iterate from higher cpus for boosted tasks. */ - int i = prefer_idle ? NR_CPUS-iter_cpu-1 : iter_cpu; + int i = boosted ? 
NR_CPUS-iter_cpu-1 : iter_cpu; if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p))) continue; @@ -5781,7 +5781,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) bool boosted = 0; bool prefer_idle = 0; #endif - int tmp_target = find_best_target(p, boosted || prefer_idle); + int tmp_target = find_best_target(p, boosted, prefer_idle); if (tmp_target >= 0) { target_cpu = tmp_target; if ((boosted || prefer_idle) && idle_cpu(target_cpu)) From 3276c3d7bca25f229f396cc182d1af29366ab109 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2015 17:43:34 +0200 Subject: [PATCH 278/607] UPSTREAM: sched: Fix a race between __kthread_bind() and sched_setaffinity() Because sched_setscheduler() checks p->flags & PF_NO_SETAFFINITY without locks, a caller might observe an old value and race with the set_cpus_allowed_ptr() call from __kthread_bind() and effectively undo it: __kthread_bind() do_set_cpus_allowed() sched_setaffinity() if (p->flags & PF_NO_SETAFFINITIY) set_cpus_allowed_ptr() p->flags |= PF_NO_SETAFFINITY Fix the bug by putting everything under the regular scheduler locks. This also closes a hole in the serialization of task_struct::{nr_,}cpus_allowed. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dedekind1@gmail.com Cc: juri.lelli@arm.com Cc: mgorman@suse.de Cc: riel@redhat.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20150515154833.545640346@infradead.org Signed-off-by: Ingo Molnar (cherry picked from commit 25834c73f93af7f0712c98ca4593691592e6b360) Signed-off-by: Punit Agrawal BUG=chrome-os-partner:44828 TEST=Boot kernel on Oak. TEST=smaug-release and strago-release trybots. Change-Id: Id3c898c5ee1a22ed704e83f2ecf5f78199280d38 Reviewed-on: https://chromium-review.googlesource.com/321264 Commit-Ready: Ricky Liang Tested-by: Ricky Liang Reviewed-by: Ricky Liang Conflicts: kernel/sched/core.c --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 02743ffe7ff3..40c8e8d1e07f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5157,6 +5157,7 @@ void init_idle(struct task_struct *idle, int cpu) raw_spin_lock(&rq->lock); __sched_fork(0, idle); + idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); From 989f33f789a85cc53f4c3f75e31ddfce65c6d873 Mon Sep 17 00:00:00 2001 From: Matt Wagantall Date: Tue, 17 Jun 2014 21:43:35 -0700 Subject: [PATCH 279/607] sched/rt: print RT tasks when RT throttling is activated Existing debug prints do not provide any clues about which tasks may have triggered RT throttling. Print the names and PIDs of all tasks on the throttled rt_rq to help narrow down the source of the problem. 
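[Aside, not part of the patch] For reference, the shape of the message this patch emits, reproduced by a stand-alone program that reuses the same format strings; the rt_rq pointer, CPU number, task names and PIDs here are invented.

#include <stdio.h>

struct fake_task { const char *comm; int pid; };

int main(void)
{
	static const struct fake_task hogs[] = {
		{ "irq/150-touch", 311 },
		{ "rt-spinner",    4721 },
	};
	char buf[500];
	char *pos = buf, *end = buf + sizeof(buf);
	unsigned int i;

	pos += snprintf(pos, end - pos,
			"sched: RT throttling activated for rt_rq %p (cpu %d)\n",
			(void *)&hogs, 2);
	pos += snprintf(pos, end - pos, "potential CPU hogs:\n");
	for (i = 0; i < sizeof(hogs) / sizeof(hogs[0]); i++)
		pos += snprintf(pos, end - pos, "\t%s (%d)\n",
				hogs[i].comm, hogs[i].pid);
	fputs(buf, stdout);
	return 0;
}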
Change-Id: I180534c8a647254ed38e89d0c981a8f8bccd741c Signed-off-by: Matt Wagantall [rameezmustafa@codeaurora.org]: Port to msm-3.18] Signed-off-by: Syed Rameez Mustafa --- kernel/sched/rt.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index be700bfa1ae4..2b9121ea91bf 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -891,6 +891,42 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se) return rt_task_of(rt_se)->prio; } +static void dump_throttled_rt_tasks(struct rt_rq *rt_rq) +{ + struct rt_prio_array *array = &rt_rq->active; + struct sched_rt_entity *rt_se; + char buf[500]; + char *pos = buf; + char *end = buf + sizeof(buf); + int idx; + + pos += snprintf(pos, sizeof(buf), + "sched: RT throttling activated for rt_rq %p (cpu %d)\n", + rt_rq, cpu_of(rq_of_rt_rq(rt_rq))); + + if (bitmap_empty(array->bitmap, MAX_RT_PRIO)) + goto out; + + pos += snprintf(pos, end - pos, "potential CPU hogs:\n"); + idx = sched_find_first_bit(array->bitmap); + while (idx < MAX_RT_PRIO) { + list_for_each_entry(rt_se, array->queue + idx, run_list) { + struct task_struct *p; + + if (!rt_entity_is_task(rt_se)) + continue; + + p = rt_task_of(rt_se); + if (pos < end) + pos += snprintf(pos, end - pos, "\t%s (%d)\n", + p->comm, p->pid); + } + idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1); + } +out: + printk_deferred("%s", buf); +} + static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); @@ -914,8 +950,14 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) * but accrue some time due to boosting. */ if (likely(rt_b->rt_runtime)) { + static bool once = false; + rt_rq->rt_throttled = 1; - printk_deferred_once("sched: RT throttling activated\n"); + + if (!once) { + once = true; + dump_throttled_rt_tasks(rt_rq); + } } else { /* * In case we did anyway, make it go away, From 2a4445395f1be46db1200e1329fae06d034675e5 Mon Sep 17 00:00:00 2001 From: Matt Wagantall Date: Thu, 19 Jun 2014 14:23:33 -0700 Subject: [PATCH 280/607] sched/rt: Add Kconfig option to enable panicking for RT throttling This may be useful for detecting and debugging RT throttling issues. Change-Id: I5807a897d11997d76421c1fcaa2918aad988c6c9 Signed-off-by: Matt Wagantall [rameezmustafa@codeaurora.org]: Port to msm-3.18] Signed-off-by: Syed Rameez Mustafa [jstultz: forwardported to 4.4] Signed-off-by: John Stultz --- kernel/sched/rt.c | 9 +++++++++ lib/Kconfig.debug | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2b9121ea91bf..8a16cba968c4 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -924,7 +924,16 @@ static void dump_throttled_rt_tasks(struct rt_rq *rt_rq) idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1); } out: +#ifdef CONFIG_PANIC_ON_RT_THROTTLING + /* + * Use pr_err() in the BUG() case since printk_sched() will + * not get flushed and deadlock is not a concern. + */ + pr_err("%s", buf); + BUG(); +#else printk_deferred("%s", buf); +#endif } static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 8c15b29d5adc..07ccfa681577 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -855,6 +855,15 @@ config SCHED_INFO bool default n +config PANIC_ON_RT_THROTTLING + bool "Panic on RT throttling" + help + Say Y here to enable the kernel to panic when a realtime + runqueue is throttled. 
This may be useful for detecting + and debugging RT throttling issues. + + Say N if unsure. + config SCHEDSTATS bool "Collect scheduler statistics" depends on DEBUG_KERNEL && PROC_FS From fb8741a0ba7445503ebe648fd96cccc34d7d2088 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 1 Aug 2016 16:49:07 -0700 Subject: [PATCH 281/607] FROMLIST: proc: Fix timerslack_ns CAP_SYS_NICE check when adjusting self In changing from checking ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS) to capable(CAP_SYS_NICE), I missed that ptrace_my_access succeeds when p == current, but the CAP_SYS_NICE doesn't. Thus while the previous commit was intended to loosen the needed privledges to modify a processes timerslack, it needlessly restricted a task modifying its own timerslack via the proc//timerslack_ns (which is permitted also via the PR_SET_TIMERSLACK method). This patch corrects this by checking if p == current before checking the CAP_SYS_NICE value. Cc: Kees Cook Cc: "Serge E. Hallyn" Cc: Andrew Morton Cc: Thomas Gleixner CC: Arjan van de Ven Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Todd Kjos Cc: Colin Cross Cc: Nick Kralevich Cc: Dmitry Shmidt Cc: Elliott Hughes Cc: Android Kernel Team Mailing-list-url: http://www.spinics.net/lists/kernel/msg2317488.html Change-Id: Ia3e8aff07c2d41f55b6617502d33c39b7d781aac Signed-off-by: John Stultz --- fs/proc/base.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index ac5d2aea63b5..49348349e156 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2260,15 +2260,17 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, if (!p) return -ESRCH; - if (!capable(CAP_SYS_NICE)) { - count = -EPERM; - goto out; - } + if (p != current) { + if (!capable(CAP_SYS_NICE)) { + count = -EPERM; + goto out; + } - err = security_task_setscheduler(p); - if (err) { - count = err; - goto out; + err = security_task_setscheduler(p); + if (err) { + count = err; + goto out; + } } task_lock(p); @@ -2294,14 +2296,16 @@ static int timerslack_ns_show(struct seq_file *m, void *v) if (!p) return -ESRCH; - if (!capable(CAP_SYS_NICE)) { - err = -EPERM; - goto out; - } + if (p != current) { - err = security_task_getscheduler(p); - if (err) - goto out; + if (!capable(CAP_SYS_NICE)) { + err = -EPERM; + goto out; + } + err = security_task_getscheduler(p); + if (err) + goto out; + } task_lock(p); seq_printf(m, "%llu\n", p->timer_slack_ns); From ef58c0a2695707ab8b49368aa77ab7b4828aa1a7 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Fri, 12 Aug 2016 11:24:50 +0530 Subject: [PATCH 282/607] ANDROID: net: fib: remove duplicate assignment Remove duplicate FRA_GOTO assignment. 
Fixes: fd2cf795f3ab ("net: core: Support UID-based routing.") Change-Id: I462c24b16fdef42ae2332571a0b95de3ef9d2e25 Signed-off-by: Amit Pundir --- include/net/fib_rules.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 55b5419cb6a7..bdd985f41022 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -89,7 +89,6 @@ struct fib_rules_ops { [FRA_FWMARK] = { .type = NLA_U32 }, \ [FRA_FWMASK] = { .type = NLA_U32 }, \ [FRA_TABLE] = { .type = NLA_U32 }, \ - [FRA_GOTO] = { .type = NLA_U32 }, \ [FRA_UID_START] = { .type = NLA_U32 }, \ [FRA_UID_END] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \ From 0b848391c27294982778254abb2d16723977e839 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 11 Aug 2016 19:13:22 +0530 Subject: [PATCH 283/607] ANDROID: net: core: fix UID-based routing Fix RTA_UID enum to match it with the Android userspace code which assumes RTA_UID=18. With this patch all Android kernel networking unit tests mentioned here https://source.android.com/devices/tech/config/kernel_network_tests.html are success. Without this patch multinetwork_test.py unit test fails. Change-Id: I3ff36670f7d4e5bf5f01dce584ae9d53deabb3ed Signed-off-by: Amit Pundir --- include/uapi/linux/rtnetlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 355eea225dd9..3eb02a1d6d8c 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -306,12 +306,12 @@ enum rtattr_type_t { RTA_TABLE, RTA_MARK, RTA_MFC_STATS, + RTA_UID, RTA_VIA, RTA_NEWDST, RTA_PREF, RTA_ENCAP_TYPE, RTA_ENCAP, - RTA_UID, __RTA_MAX }; From 1da2a42de4b9371d2988ddf6969ce5efd1560270 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 5 Jul 2016 17:32:29 -0400 Subject: [PATCH 284/607] UPSTREAM: Revert "ecryptfs: forbid opening files without mmap handler" (cherry picked from commit 78c4e172412de5d0456dc00d2b34050aa0b683b5) This reverts upstream commit 2f36db71009304b3f0b95afacd8eba1f9f046b87. It fixed a local root exploit but also introduced a dependency on the lower file system implementing an mmap operation just to open a file, which is a bit of a heavy hammer. The right fix is to have mmap depend on the existence of the mmap handler instead. Signed-off-by: Jeff Mahoney Cc: stable@vger.kernel.org Signed-off-by: Tyler Hicks Fixes: Change-Id I0be77c7f8bd3046bc34cd87ef577529792d479bc ("UPSTREAM: ecryptfs: forbid opening files without mmap handler") Change-Id: Ib9bc87099f7f89e4e12dbc1a79e884dcadb1befb Signed-off-by: Amit Pundir --- fs/ecryptfs/kthread.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index e818f5ac7a26..866bb18efefe 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -25,7 +25,6 @@ #include #include #include -#include #include "ecryptfs_kernel.h" struct ecryptfs_open_req { @@ -148,7 +147,7 @@ int ecryptfs_privileged_open(struct file **lower_file, flags |= IS_RDONLY(d_inode(lower_dentry)) ? 
O_RDONLY : O_RDWR; (*lower_file) = dentry_open(&req.path, flags, cred); if (!IS_ERR(*lower_file)) - goto have_file; + goto out; if ((flags & O_ACCMODE) == O_RDONLY) { rc = PTR_ERR((*lower_file)); goto out; @@ -166,16 +165,8 @@ int ecryptfs_privileged_open(struct file **lower_file, mutex_unlock(&ecryptfs_kthread_ctl.mux); wake_up(&ecryptfs_kthread_ctl.wait); wait_for_completion(&req.done); - if (IS_ERR(*lower_file)) { + if (IS_ERR(*lower_file)) rc = PTR_ERR(*lower_file); - goto out; - } -have_file: - if ((*lower_file)->f_op->mmap == NULL) { - fput(*lower_file); - *lower_file = NULL; - rc = -EMEDIUMTYPE; - } out: return rc; } From 7997255b0d0ffed60ad402c5022eacf0161dcdc0 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 5 Jul 2016 17:32:30 -0400 Subject: [PATCH 285/607] UPSTREAM: ecryptfs: don't allow mmap when the lower fs doesn't support it (cherry picked from commit f0fe970df3838c202ef6c07a4c2b36838ef0a88b) There are legitimate reasons to disallow mmap on certain files, notably in sysfs or procfs. We shouldn't emulate mmap support on file systems that don't offer support natively. CVE-2016-1583 Signed-off-by: Jeff Mahoney Cc: stable@vger.kernel.org [tyhicks: clean up f_op check by using ecryptfs_file_to_lower()] Signed-off-by: Tyler Hicks Change-Id: I66e3670771630a25b0608f10019d1584e9ce73a6 Signed-off-by: Amit Pundir --- fs/ecryptfs/file.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index feef8a9c4de7..11309683d65f 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -170,6 +170,19 @@ out: return rc; } +static int ecryptfs_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct file *lower_file = ecryptfs_file_to_lower(file); + /* + * Don't allow mmap on top of file systems that don't support it + * natively. If FILESYSTEM_MAX_STACK_DEPTH > 2 or ecryptfs + * allows recursive mounting, this will need to be extended. + */ + if (!lower_file->f_op->mmap) + return -ENODEV; + return generic_file_mmap(file, vma); +} + /** * ecryptfs_open * @inode: inode speciying file to open @@ -364,7 +377,7 @@ const struct file_operations ecryptfs_main_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ecryptfs_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, From 4804692cb18d09c49ecfce3eef2969a69859e413 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 4 May 2016 14:04:13 -0400 Subject: [PATCH 286/607] UPSTREAM: ecryptfs: fix handling of directory opening (cherry picked from commit 6a480a7842545ec520a91730209ec0bae41694c1) First of all, trying to open them r/w is idiocy; it's guaranteed to fail. Moreover, assigning ->f_pos and assuming that everything will work is blatantly broken - try that with e.g. tmpfs as underlying layer and watch the fireworks. There may be a non-trivial amount of state associated with current IO position, well beyond the numeric offset. Using the single struct file associated with underlying inode is really not a good idea; we ought to open one for each ecryptfs directory struct file. Additionally, file_operations both for directories and non-directories are full of pointless methods; non-directories should *not* have ->iterate(), directories should not have ->flush(), ->fasync() and ->splice_read(). 
Signed-off-by: Al Viro Change-Id: I4813ce803f270fdd364758ce1dc108b76eab226e Signed-off-by: Amit Pundir --- fs/ecryptfs/file.c | 71 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 16 deletions(-) diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 11309683d65f..27794b137b24 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -112,7 +112,6 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx) .sb = inode->i_sb, }; lower_file = ecryptfs_file_to_lower(file); - lower_file->f_pos = ctx->pos; rc = iterate_dir(lower_file, &buf.ctx); ctx->pos = buf.ctx.pos; if (rc < 0) @@ -236,14 +235,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); - if (d_is_dir(ecryptfs_dentry)) { - ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); - mutex_lock(&crypt_stat->cs_mutex); - crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); - mutex_unlock(&crypt_stat->cs_mutex); - rc = 0; - goto out; - } rc = read_or_initialize_metadata(ecryptfs_dentry); if (rc) goto out_put; @@ -260,6 +251,45 @@ out: return rc; } +/** + * ecryptfs_dir_open + * @inode: inode speciying file to open + * @file: Structure to return filled in + * + * Opens the file specified by inode. + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_dir_open(struct inode *inode, struct file *file) +{ + struct dentry *ecryptfs_dentry = file->f_path.dentry; + /* Private value of ecryptfs_dentry allocated in + * ecryptfs_lookup() */ + struct ecryptfs_file_info *file_info; + struct file *lower_file; + + /* Released in ecryptfs_release or end of function if failure */ + file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); + ecryptfs_set_file_private(file, file_info); + if (unlikely(!file_info)) { + ecryptfs_printk(KERN_ERR, + "Error attempting to allocate memory\n"); + return -ENOMEM; + } + lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry), + file->f_flags, current_cred()); + if (IS_ERR(lower_file)) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%pd]; rc = [%ld]\n", __func__, + ecryptfs_dentry, PTR_ERR(lower_file)); + kmem_cache_free(ecryptfs_file_info_cache, file_info); + return PTR_ERR(lower_file); + } + ecryptfs_set_file_lower(file, lower_file); + return 0; +} + static int ecryptfs_flush(struct file *file, fl_owner_t td) { struct file *lower_file = ecryptfs_file_to_lower(file); @@ -280,6 +310,19 @@ static int ecryptfs_release(struct inode *inode, struct file *file) return 0; } +static int ecryptfs_dir_release(struct inode *inode, struct file *file) +{ + fput(ecryptfs_file_to_lower(file)); + kmem_cache_free(ecryptfs_file_info_cache, + ecryptfs_file_to_private(file)); + return 0; +} + +static loff_t ecryptfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return vfs_llseek(ecryptfs_file_to_lower(file), offset, whence); +} + static int ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { @@ -359,20 +402,16 @@ const struct file_operations ecryptfs_dir_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .open = ecryptfs_open, - .flush = ecryptfs_flush, - .release = ecryptfs_release, + .open = ecryptfs_dir_open, + .release = ecryptfs_dir_release, .fsync = ecryptfs_fsync, - .fasync = ecryptfs_fasync, - .splice_read = generic_file_splice_read, - .llseek = default_llseek, + .llseek = ecryptfs_dir_llseek, }; const struct 
file_operations ecryptfs_main_fops = { .llseek = generic_file_llseek, .read_iter = ecryptfs_read_update_atime, .write_iter = generic_file_write_iter, - .iterate = ecryptfs_readdir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, From aa3cda16a57e73e9fc269e1de1eb3c35f3f0ed20 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 9 Aug 2016 12:47:37 -0700 Subject: [PATCH 287/607] ANDROID: dm-verity: adopt changes made to dm callbacks v4.4 introduced changes to the callbacks used for dm-linear and dm-verity-target targets. Move to those headers in dm-android-verity. Verified on hikey while having BOARD_USES_RECOVERY_AS_BOOT := true BOARD_BUILD_SYSTEM_ROOT_IMAGE := true BUG: 27339727 Signed-off-by: Badhri Jagan Sridharan Change-Id: Ic64950c3b55f0a6eaa570bcedc2ace83bbf3005e --- drivers/md/dm-android-verity.c | 12 +++++------- drivers/md/dm-android-verity.h | 6 ++---- drivers/md/dm-linear.c | 2 +- drivers/md/dm-verity-target.c | 2 +- drivers/md/dm-verity.h | 6 ++---- 5 files changed, 11 insertions(+), 17 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 1f4eb099209d..15ce2a81c1f4 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -59,8 +59,7 @@ static struct target_type android_verity_target = { .dtr = verity_dtr, .map = verity_map, .status = verity_status, - .ioctl = verity_ioctl, - .merge = verity_merge, + .prepare_ioctl = verity_prepare_ioctl, .iterate_devices = verity_iterate_devices, .io_hints = verity_io_hints, }; @@ -637,8 +636,7 @@ static int add_as_linear_device(struct dm_target *ti, char *dev) android_verity_target.dtr = dm_linear_dtr, android_verity_target.map = dm_linear_map, android_verity_target.status = dm_linear_status, - android_verity_target.ioctl = dm_linear_ioctl, - android_verity_target.merge = dm_linear_merge, + android_verity_target.prepare_ioctl = dm_linear_prepare_ioctl, android_verity_target.iterate_devices = dm_linear_iterate_devices, android_verity_target.io_hints = NULL; @@ -676,7 +674,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) struct fec_ecc_metadata uninitialized_var(ecc); char buf[FEC_ARG_LENGTH], *buf_ptr; unsigned long long tmpll; - u64 device_size; + u64 uninitialized_var(device_size); if (argc == 1) { /* Use the default keyid */ @@ -896,7 +894,7 @@ static int __init dm_android_verity_init(void) } file = debugfs_create_bool("target_added", S_IRUGO, debug_dir, - (u32 *)&target_added); + &target_added); if (IS_ERR_OR_NULL(file)) { DMERR("Cannot create android_verity debugfs directory: %ld", @@ -906,7 +904,7 @@ static int __init dm_android_verity_init(void) } file = debugfs_create_bool("verity_enabled", S_IRUGO, debug_dir, - (u32 *)&verity_enabled); + &verity_enabled); if (IS_ERR_OR_NULL(file)) { DMERR("Cannot create android_verity debugfs directory: %ld", diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index f43b02fbb475..0c7ff6afec69 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -113,10 +113,8 @@ extern void dm_linear_dtr(struct dm_target *ti); extern int dm_linear_map(struct dm_target *ti, struct bio *bio); extern void dm_linear_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen); -extern int dm_linear_ioctl(struct dm_target *ti, unsigned int cmd, - unsigned long arg); -extern int dm_linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec 
*biovec, int max_size); +extern int dm_linear_prepare_ioctl(struct dm_target *ti, + struct block_device **bdev, fmode_t *mode); extern int dm_linear_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data); extern int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv); diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 74caca2888a6..8505a771de42 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -116,7 +116,7 @@ void dm_linear_status(struct dm_target *ti, status_type_t type, } } -static int dm_linear_prepare_ioctl(struct dm_target *ti, +int dm_linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) { struct linear_c *lc = (struct linear_c *) ti->private; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 65835f15a116..5214ed2c7507 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -656,7 +656,7 @@ void verity_status(struct dm_target *ti, status_type_t type, } } -static int verity_prepare_ioctl(struct dm_target *ti, +int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) { struct dm_verity *v = ti->private; diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index d9cf5e4939eb..75effca400a3 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -128,10 +128,8 @@ extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, extern void verity_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen); -extern int verity_ioctl(struct dm_target *ti, unsigned cmd, - unsigned long arg); -extern int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size); +extern int verity_prepare_ioctl(struct dm_target *ti, + struct block_device **bdev, fmode_t *mode); extern int verity_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data); extern void verity_io_hints(struct dm_target *ti, struct queue_limits *limits); From 1a0cf8ace145843e5d2af961078e0ee4eccf3588 Mon Sep 17 00:00:00 2001 From: Winter Wang Date: Wed, 27 Jul 2016 10:03:19 +0800 Subject: [PATCH 288/607] UPSTREAM: usb: gadget: configfs: add mutex lock before unregister gadget There may be a race condition if f_fs calls unregister_gadget_item in ffs_closed() when unregister_gadget is called by UDC store at the same time. this leads to a kernel NULL pointer dereference: [ 310.644928] Unable to handle kernel NULL pointer dereference at virtual address 00000004 [ 310.645053] init: Service 'adbd' is being killed... 
[ 310.658938] pgd = c9528000 [ 310.662515] [00000004] *pgd=19451831, *pte=00000000, *ppte=00000000 [ 310.669702] Internal error: Oops: 817 [#1] PREEMPT SMP ARM [ 310.675211] Modules linked in: [ 310.678294] CPU: 0 PID: 1537 Comm: ->transport Not tainted 4.1.15-03725-g793404c #2 [ 310.685958] Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) [ 310.692493] task: c8e24200 ti: c945e000 task.ti: c945e000 [ 310.697911] PC is at usb_gadget_unregister_driver+0xb4/0xd0 [ 310.703502] LR is at __mutex_lock_slowpath+0x10c/0x16c [ 310.708648] pc : [] lr : [] psr: 600f0113 [ 311.565585] [] (usb_gadget_unregister_driver) from [] (unregister_gadget_item+0x1c/0x34) [ 311.575426] [] (unregister_gadget_item) from [] (ffs_closed+0x8c/0x9c) [ 311.583702] [] (ffs_closed) from [] (ffs_data_reset+0xc/0xa0) [ 311.591194] [] (ffs_data_reset) from [] (ffs_data_closed+0x90/0xd0) [ 311.599208] [] (ffs_data_closed) from [] (ffs_ep0_release+0xc/0x14) [ 311.607224] [] (ffs_ep0_release) from [] (__fput+0x80/0x1d0) [ 311.614635] [] (__fput) from [] (task_work_run+0xb0/0xe8) [ 311.621788] [] (task_work_run) from [] (do_work_pending+0x7c/0xa4) [ 311.629718] [] (do_work_pending) from [] (work_pending+0xc/0x20) for functions using functionFS, i.e. android adbd will close /dev/usb-ffs/adb/ep0 when usb IO thread fails, but switch adb from on to off also triggers write "none" > UDC. These 2 operations both call unregister_gadget, which will lead to the panic above. add a mutex before calling unregister_gadget for api used in f_fs. Signed-off-by: Winter Wang Signed-off-by: Felipe Balbi --- drivers/usb/gadget/configfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index b27ce0747c18..8df96cb3bb58 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -1737,7 +1737,9 @@ void unregister_gadget_item(struct config_item *item) { struct gadget_info *gi = to_gadget_info(item); + mutex_lock(&gi->lock); unregister_gadget(gi); + mutex_unlock(&gi->lock); } EXPORT_SYMBOL_GPL(unregister_gadget_item); From 34828bd3e72d5f7c1d921a3d090ab6b57f6e1ca7 Mon Sep 17 00:00:00 2001 From: Ricky Liang Date: Tue, 2 Feb 2016 01:12:06 +0800 Subject: [PATCH 289/607] FIXUP: sched: scheduler-driven cpu frequency selection Two fixups that have been reported on LKML. The next version of scheduler-driver cpu frequency selection patch set should include these fixes and we can drop this patch then. 
Signed-off-by: Ricky Liang Change-Id: Ia2f8b5c0dd5dac06580256eeb4b259929688af68 --- kernel/sched/cpufreq_sched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index 4fea269a6598..f6f9b9b3a4a8 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -131,6 +131,8 @@ static int cpufreq_sched_thread(void *data) new_request = gd->requested_freq; if (new_request == last_request) { set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) + break; schedule(); } else { /* @@ -293,6 +295,7 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) goto err; } + policy->governor_data = gd; if (cpufreq_driver_is_slow()) { cpufreq_driver_slow = true; gd->task = kthread_create(cpufreq_sched_thread, policy, @@ -309,12 +312,12 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) init_irq_work(&gd->irq_work, cpufreq_sched_irq_work); } - policy->governor_data = gd; set_sched_freq(); return 0; err: + policy->governor_data = NULL; kfree(gd); return -ENOMEM; } From 76554612b462f53e50f4a40ba80efef90ac7920a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 25 Nov 2015 14:09:38 -0500 Subject: [PATCH 290/607] sched/fair: Avoid redundant idle_cpu() call in update_sg_lb_stats() Part of the responsibility of the update_sg_lb_stats() function is to update the idle_cpus statistical counter in struct sg_lb_stats. This check is done by calling idle_cpu(). The idle_cpu() function, in turn, checks a number of fields within the run queue structure such as rq->curr and rq->nr_running. With the current layout of the run queue structure, rq->curr and rq->nr_running are in separate cachelines. The rq->curr variable is checked first followed by nr_running. As nr_running is also accessed by update_sg_lb_stats() earlier, it makes no sense to load another cacheline when nr_running is not 0 as idle_cpu() will always return false in this case. This patch eliminates this redundant cacheline load by checking the cached nr_running before calling idle_cpu(). 
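A self-contained sketch (illustrative names only, not the scheduler's) of the pattern this patch applies: reuse the counter that was already loaded for the sum and skip the second-cacheline check whenever it cannot change the answer.

    #include <stdbool.h>
    #include <stdio.h>

    struct cpu   { unsigned int nr_running; bool in_idle_loop; };
    struct stats { unsigned int sum_nr_running; unsigned int idle_cpus; };

    /* Stand-in for idle_cpu(): imagine this touches another cacheline. */
    static bool cpu_is_idle(const struct cpu *c)
    {
            return c->nr_running == 0 && c->in_idle_loop;
    }

    static void account_cpu(struct stats *s, const struct cpu *c)
    {
            unsigned int nr_running = c->nr_running;  /* already loaded for the sum */

            s->sum_nr_running += nr_running;
            /* cpu_is_idle() cannot be true while something runs, so only
             * pay for the extra load when nr_running is 0. */
            if (!nr_running && cpu_is_idle(c))
                    s->idle_cpus++;
    }

    int main(void)
    {
            struct stats s = { 0, 0 };
            struct cpu busy = { 3, false }, idle = { 0, true };

            account_cpu(&s, &busy);
            account_cpu(&s, &idle);
            printf("running=%u idle=%u\n", s.sum_nr_running, s.idle_cpus);  /* running=3 idle=1 */
            return 0;
    }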
Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Douglas Hatch Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1448478580-26467-2-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar (cherry picked from commit a426f99c91d1036767a7819aaaba6bd3191b7f06) Signed-off-by: Javi Merino --- kernel/sched/fair.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index da9323b9ecab..4a7d9d78ce6d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7287,7 +7287,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, bool *overload, bool *overutilized) { unsigned long load; - int i; + int i, nr_running; memset(sgs, 0, sizeof(*sgs)); @@ -7304,7 +7304,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; - if (rq->nr_running > 1) + nr_running = rq->nr_running; + if (nr_running > 1) *overload = true; #ifdef CONFIG_NUMA_BALANCING @@ -7312,7 +7313,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->nr_preferred_running += rq->nr_preferred_running; #endif sgs->sum_weighted_load += weighted_cpuload(i); - if (idle_cpu(i)) + /* + * No need to call idle_cpu() if nr_running is not 0 + */ + if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; if (cpu_overutilized(i)) { From ca9b7b070b5833b9a7f4282df8a8484e5498630f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 10 Jul 2016 10:04:02 +0200 Subject: [PATCH 291/607] UPSTREAM: tcp: make challenge acks less predictable (cherry picked from commit 75ff39ccc1bd5d3c455b6822ab09e533c551f758) Yue Cao claims that current host rate limiting of challenge ACKS (RFC 5961) could leak enough information to allow a patient attacker to hijack TCP sessions. He will soon provide details in an academic paper. This patch increases the default limit from 100 to 1000, and adds some randomization so that the attacker can no longer hijack sessions without spending a considerable amount of probes. Based on initial analysis and patch from Linus. Note that we also have per socket rate limiting, so it is tempting to remove the host limit in the future. v2: randomize the count of challenge acks per second, not the period. Fixes: 282f23c6ee34 ("tcp: implement RFC 5961 3.2") Reported-by: Yue Cao Signed-off-by: Eric Dumazet Suggested-by: Linus Torvalds Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Signed-off-by: David S. Miller Change-Id: Ib46ba66f5e4a5a7c81bfccd7b0aa83c3d9e1b3bb Bug: 30809774 --- net/ipv4/tcp_input.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index da34f830f4bc..2014a701663a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -89,7 +89,7 @@ int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); /* rfc5961 challenge ack rate limiting */ -int sysctl_tcp_challenge_ack_limit = 100; +int sysctl_tcp_challenge_ack_limit = 1000; int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; @@ -3428,7 +3428,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) static u32 challenge_timestamp; static unsigned int challenge_count; struct tcp_sock *tp = tcp_sk(sk); - u32 now; + u32 count, now; /* First check our per-socket dupack rate limit. 
*/ if (tcp_oow_rate_limited(sock_net(sk), skb, @@ -3436,13 +3436,18 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) &tp->last_oow_ack_time)) return; - /* Then check the check host-wide RFC 5961 rate limit. */ + /* Then check host-wide RFC 5961 rate limit. */ now = jiffies / HZ; if (now != challenge_timestamp) { + u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1; + challenge_timestamp = now; - challenge_count = 0; + WRITE_ONCE(challenge_count, half + + prandom_u32_max(sysctl_tcp_challenge_ack_limit)); } - if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { + count = READ_ONCE(challenge_count); + if (count > 0) { + WRITE_ONCE(challenge_count, count - 1); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); tcp_send_ack(sk); } From a20900c4aa3596f6318568a662c71e0943b2bbd9 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 13 Jan 2016 17:48:01 +0100 Subject: [PATCH 292/607] UPSTREAM: ALSA: timer: Fix race among timer ioctls (cherry picked from commit af368027a49a751d6ff4ee9e3f9961f35bb4fede) ALSA timer ioctls have an open race and this may lead to a use-after-free of timer instance object. A simplistic fix is to make each ioctl exclusive. We have already tread_sem for controlling the tread, and extend this as a global mutex to be applied to each ioctl. The downside is, of course, the worse concurrency. But these ioctls aren't to be parallel accessible, in anyway, so it should be fine to serialize there. Reported-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Cc: Signed-off-by: Takashi Iwai Change-Id: I1ac52f1cba5e7408fd88c8fc1c30ca2e83967ebb Bug: 28694392 --- sound/core/timer.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/sound/core/timer.c b/sound/core/timer.c index 1701703b8d93..f19fc67413e6 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -73,7 +73,7 @@ struct snd_timer_user { struct timespec tstamp; /* trigger tstamp */ wait_queue_head_t qchange_sleep; struct fasync_struct *fasync; - struct mutex tread_sem; + struct mutex ioctl_lock; }; /* list of timers */ @@ -1265,7 +1265,7 @@ static int snd_timer_user_open(struct inode *inode, struct file *file) return -ENOMEM; spin_lock_init(&tu->qlock); init_waitqueue_head(&tu->qchange_sleep); - mutex_init(&tu->tread_sem); + mutex_init(&tu->ioctl_lock); tu->ticks = 1; tu->queue_size = 128; tu->queue = kmalloc(tu->queue_size * sizeof(struct snd_timer_read), @@ -1285,8 +1285,10 @@ static int snd_timer_user_release(struct inode *inode, struct file *file) if (file->private_data) { tu = file->private_data; file->private_data = NULL; + mutex_lock(&tu->ioctl_lock); if (tu->timeri) snd_timer_close(tu->timeri); + mutex_unlock(&tu->ioctl_lock); kfree(tu->queue); kfree(tu->tqueue); kfree(tu); @@ -1524,7 +1526,6 @@ static int snd_timer_user_tselect(struct file *file, int err = 0; tu = file->private_data; - mutex_lock(&tu->tread_sem); if (tu->timeri) { snd_timer_close(tu->timeri); tu->timeri = NULL; @@ -1568,7 +1569,6 @@ static int snd_timer_user_tselect(struct file *file, } __err: - mutex_unlock(&tu->tread_sem); return err; } @@ -1782,7 +1782,7 @@ enum { SNDRV_TIMER_IOCTL_PAUSE_OLD = _IO('T', 0x23), }; -static long snd_timer_user_ioctl(struct file *file, unsigned int cmd, +static long __snd_timer_user_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct snd_timer_user *tu; @@ -1799,17 +1799,11 @@ static long snd_timer_user_ioctl(struct file *file, unsigned int cmd, { int xarg; - mutex_lock(&tu->tread_sem); - if (tu->timeri) { /* 
too late */ - mutex_unlock(&tu->tread_sem); + if (tu->timeri) /* too late */ return -EBUSY; - } - if (get_user(xarg, p)) { - mutex_unlock(&tu->tread_sem); + if (get_user(xarg, p)) return -EFAULT; - } tu->tread = xarg ? 1 : 0; - mutex_unlock(&tu->tread_sem); return 0; } case SNDRV_TIMER_IOCTL_GINFO: @@ -1842,6 +1836,18 @@ static long snd_timer_user_ioctl(struct file *file, unsigned int cmd, return -ENOTTY; } +static long snd_timer_user_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct snd_timer_user *tu = file->private_data; + long ret; + + mutex_lock(&tu->ioctl_lock); + ret = __snd_timer_user_ioctl(file, cmd, arg); + mutex_unlock(&tu->ioctl_lock); + return ret; +} + static int snd_timer_user_fasync(int fd, struct file * file, int on) { struct snd_timer_user *tu; From 50acf9e6e34a14148d6a8c715c79b1cb013f4f3d Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Thu, 11 Feb 2016 19:37:27 +0000 Subject: [PATCH 293/607] UPSTREAM: af_unix: Guard against other == sk in unix_dgram_sendmsg (cherry picked from commit a5527dda344fff0514b7989ef7a755729769daa1) The unix_dgram_sendmsg routine use the following test if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { to determine if sk and other are in an n:1 association (either established via connect or by using sendto to send messages to an unrelated socket identified by address). This isn't correct as the specified address could have been bound to the sending socket itself or because this socket could have been connected to itself by the time of the unix_peer_get but disconnected before the unix_state_lock(other). In both cases, the if-block would be entered despite other == sk which might either block the sender unintentionally or lead to trying to unlock the same spin lock twice for a non-blocking send. Add a other != sk check to guard against this. Fixes: 7d267278a9ec ("unix: avoid use-after-free in ep_remove_wait_queue") Reported-By: Philipp Hahn Signed-off-by: Rainer Weikusat Tested-by: Philipp Hahn Signed-off-by: David S. Miller Change-Id: I4ebef6a390df3487903b166b837e34c653e01cb2 Signed-off-by: Amit Pundir --- net/unix/af_unix.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ef05cd9403d4..8badff627fef 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1765,7 +1765,12 @@ restart_locked: goto out_unlock; } - if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { + /* other == sk && unix_peer(other) != sk if + * - unix_peer(sk) == NULL, destination address bound to sk + * - unix_peer(sk) == sk by time of get but disconnected before lock + */ + if (other != sk && + unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { if (timeo) { timeo = unix_wait_for_peer(other, timeo); From 8d525c512280bb7d8218fd59c04de985b1886eca Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Tue, 16 Aug 2016 15:51:34 -0700 Subject: [PATCH 294/607] Android: MMC/UFS IO Latency Histograms. This patch adds a new sysfs node (latency_hist) and reports IO (svc time) latency histograms. Disabled by default, can be enabled by echoing 0 into latency_hist, stats can be cleared by writing 2 into latency_hist. 
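A minimal userspace sketch (not the kernel code; a shortened threshold table) of the bucketing scheme the patch uses: an ascending table of microsecond limits scanned linearly, with one extra slot catching overflows.

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative thresholds; the patch's latency_x_axis_us[] is longer. */
    static const uint64_t thresholds_us[] = { 100, 500, 1000, 5000 };
    #define NBUCKETS (sizeof(thresholds_us) / sizeof(thresholds_us[0]) + 1)

    static uint64_t buckets[NBUCKETS];

    static void update_hist(uint64_t delta_us)
    {
            size_t i;

            for (i = 0; i < NBUCKETS - 1; i++)
                    if (delta_us < thresholds_us[i])
                            break;
            buckets[i]++;           /* last slot is the overflow bucket */
    }

    int main(void)
    {
            update_hist(42);        /* < 100us  -> bucket 0 */
            update_hist(7000);      /* > 5000us -> overflow bucket */
            for (size_t i = 0; i < NBUCKETS; i++)
                    printf("bucket %zu: %llu\n", i, (unsigned long long)buckets[i]);
            return 0;
    }

The kernel version keeps separate read and write bucket arrays and updates them locklessly on the completion path, accepting small miscounts to keep the hot path cheap.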
Bug: 30677035 Change-Id: I625938135ea33e6e87cf6af1fc7edc136d8b4b32 Signed-off-by: Mohan Srinivasan --- block/blk-core.c | 80 ++++++++++++++++++++++++++++++++++++++ drivers/mmc/core/core.c | 66 ++++++++++++++++++++++++++++++- drivers/mmc/core/host.c | 6 ++- drivers/mmc/core/host.h | 5 +++ drivers/scsi/ufs/ufshcd.c | 81 +++++++++++++++++++++++++++++++++++++++ drivers/scsi/ufs/ufshcd.h | 3 ++ include/linux/blkdev.h | 76 ++++++++++++++++++++++++++++++++++++ include/linux/mmc/core.h | 2 + include/linux/mmc/host.h | 4 ++ 9 files changed, 320 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 33e2f62d5062..f95413fb7f2b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3539,3 +3539,83 @@ int __init blk_dev_init(void) return 0; } + +/* + * Blk IO latency support. We want this to be as cheap as possible, so doing + * this lockless (and avoiding atomics), a few off by a few errors in this + * code is not harmful, and we don't want to do anything that is + * perf-impactful. + * TODO : If necessary, we can make the histograms per-cpu and aggregate + * them when printing them out. + */ +void +blk_zero_latency_hist(struct io_latency_state *s) +{ + memset(s->latency_y_axis_read, 0, + sizeof(s->latency_y_axis_read)); + memset(s->latency_y_axis_write, 0, + sizeof(s->latency_y_axis_write)); + s->latency_reads_elems = 0; + s->latency_writes_elems = 0; +} + +ssize_t +blk_latency_hist_show(struct io_latency_state *s, char *buf) +{ + int i; + int bytes_written = 0; + int pct, num_elem; + u_int64_t elem; + + num_elem = s->latency_reads_elems; + if (num_elem > 0) { + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "IO svc_time Read Latency Histogram (n = %d):\n", + num_elem); + for (i = 0; + i < ARRAY_SIZE(latency_x_axis_us); + i++) { + elem = s->latency_y_axis_read[i]; + pct = (elem * 100) / num_elem; + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t< %5lluus%15llu%15d%%\n", + latency_x_axis_us[i], + elem, pct); + } + /* Last element in y-axis table is overflow */ + elem = s->latency_y_axis_read[i]; + pct = (elem * 100) / num_elem; + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t> %5dms%15llu%15d%%\n", 10, + elem, pct); + } + num_elem = s->latency_writes_elems; + if (num_elem > 0) { + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "IO svc_time Write Latency Histogram (n = %d):\n", + num_elem); + for (i = 0; + i < ARRAY_SIZE(latency_x_axis_us); + i++) { + elem = s->latency_y_axis_write[i]; + pct = (elem * 100) / num_elem; + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t< %5lluus%15llu%15d%%\n", + latency_x_axis_us[i], + elem, pct); + } + /* Last element in y-axis table is overflow */ + elem = s->latency_y_axis_write[i]; + pct = (elem * 100) / num_elem; + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t> %5dms%15llu%15d%%\n", 10, + elem, pct); + } + return bytes_written; +} diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 9fab52559a8c..b503249950df 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -183,6 +183,17 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) pr_debug("%s: %d bytes transferred: %d\n", mmc_hostname(host), mrq->data->bytes_xfered, mrq->data->error); + if (mrq->lat_hist_enabled) { + ktime_t completion; + u_int64_t delta_us; + + completion = ktime_get(); + delta_us = ktime_us_delta(completion, + 
mrq->io_start); + blk_update_latency_hist(&host->io_lat_s, + (mrq->data->flags & MMC_DATA_READ), + delta_us); + } trace_mmc_blk_rw_end(cmd->opcode, cmd->arg, mrq->data); } @@ -627,6 +638,11 @@ struct mmc_async_req *mmc_start_req(struct mmc_host *host, } if (!err && areq) { + if (host->latency_hist_enabled) { + areq->mrq->io_start = ktime_get(); + areq->mrq->lat_hist_enabled = 1; + } else + areq->mrq->lat_hist_enabled = 0; trace_mmc_blk_rw_start(areq->mrq->cmd->opcode, areq->mrq->cmd->arg, areq->mrq->data); @@ -1964,7 +1980,7 @@ void mmc_init_erase(struct mmc_card *card) } static unsigned int mmc_mmc_erase_timeout(struct mmc_card *card, - unsigned int arg, unsigned int qty) + unsigned int arg, unsigned int qty) { unsigned int erase_timeout; @@ -2907,6 +2923,54 @@ static void __exit mmc_exit(void) destroy_workqueue(workqueue); } +static ssize_t +latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct mmc_host *host = cls_dev_to_mmc_host(dev); + + return blk_latency_hist_show(&host->io_lat_s, buf); +} + +/* + * Values permitted 0, 1, 2. + * 0 -> Disable IO latency histograms (default) + * 1 -> Enable IO latency histograms + * 2 -> Zero out IO latency histograms + */ +static ssize_t +latency_hist_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_host *host = cls_dev_to_mmc_host(dev); + long value; + + if (kstrtol(buf, 0, &value)) + return -EINVAL; + if (value == BLK_IO_LAT_HIST_ZERO) + blk_zero_latency_hist(&host->io_lat_s); + else if (value == BLK_IO_LAT_HIST_ENABLE || + value == BLK_IO_LAT_HIST_DISABLE) + host->latency_hist_enabled = value; + return count; +} + +static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, + latency_hist_show, latency_hist_store); + +void +mmc_latency_hist_sysfs_init(struct mmc_host *host) +{ + if (device_create_file(&host->class_dev, &dev_attr_latency_hist)) + dev_err(&host->class_dev, + "Failed to create latency_hist sysfs entry\n"); +} + +void +mmc_latency_hist_sysfs_exit(struct mmc_host *host) +{ + device_remove_file(&host->class_dev, &dev_attr_latency_hist); +} + subsys_initcall(mmc_init); module_exit(mmc_exit); diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index fcf7829c759e..17068839c74b 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -32,8 +32,6 @@ #include "slot-gpio.h" #include "pwrseq.h" -#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) - static DEFINE_IDR(mmc_host_idr); static DEFINE_SPINLOCK(mmc_host_lock); @@ -394,6 +392,8 @@ int mmc_add_host(struct mmc_host *host) mmc_add_host_debugfs(host); #endif + mmc_latency_hist_sysfs_init(host); + mmc_start_host(host); if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY)) register_pm_notifier(&host->pm_notify); @@ -422,6 +422,8 @@ void mmc_remove_host(struct mmc_host *host) mmc_remove_host_debugfs(host); #endif + mmc_latency_hist_sysfs_exit(host); + device_del(&host->class_dev); led_trigger_unregister_simple(host->led); diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h index 992bf5397633..bf38533406fd 100644 --- a/drivers/mmc/core/host.h +++ b/drivers/mmc/core/host.h @@ -12,6 +12,8 @@ #define _MMC_CORE_HOST_H #include +#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) + int mmc_register_host_class(void); void mmc_unregister_host_class(void); @@ -21,5 +23,8 @@ void mmc_retune_hold(struct mmc_host *host); void mmc_retune_release(struct mmc_host *host); int mmc_retune(struct mmc_host *host); +void 
mmc_latency_hist_sysfs_init(struct mmc_host *host); +void mmc_latency_hist_sysfs_exit(struct mmc_host *host); + #endif diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 85cd2564c157..4167bdbf0ecf 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -39,6 +39,7 @@ #include #include +#include #include "ufshcd.h" #include "unipro.h" @@ -1332,6 +1333,17 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) clear_bit_unlock(tag, &hba->lrb_in_use); goto out; } + + /* IO svc time latency histogram */ + if (hba != NULL && cmd->request != NULL) { + if (hba->latency_hist_enabled && + (cmd->request->cmd_type == REQ_TYPE_FS)) { + cmd->request->lat_hist_io_start = ktime_get(); + cmd->request->lat_hist_enabled = 1; + } else + cmd->request->lat_hist_enabled = 0; + } + WARN_ON(hba->clk_gating.state != CLKS_ON); lrbp = &hba->lrb[tag]; @@ -3160,6 +3172,7 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) u32 tr_doorbell; int result; int index; + struct request *req; /* Resetting interrupt aggregation counters first and reading the * DOOR_BELL afterward allows us to handle all the completed requests. @@ -3184,6 +3197,22 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) /* Mark completed command as NULL in LRB */ lrbp->cmd = NULL; clear_bit_unlock(index, &hba->lrb_in_use); + req = cmd->request; + if (req) { + /* Update IO svc time latency histogram */ + if (req->lat_hist_enabled) { + ktime_t completion; + u_int64_t delta_us; + + completion = ktime_get(); + delta_us = ktime_us_delta(completion, + req->lat_hist_io_start); + /* rq_data_dir() => true if WRITE */ + blk_update_latency_hist(&hba->io_lat_s, + (rq_data_dir(req) == READ), + delta_us); + } + } /* Do not touch lrbp after scsi done */ cmd->scsi_done(cmd); __ufshcd_release(hba); @@ -5327,6 +5356,54 @@ out: } EXPORT_SYMBOL(ufshcd_shutdown); +/* + * Values permitted 0, 1, 2. 
+ * 0 -> Disable IO latency histograms (default) + * 1 -> Enable IO latency histograms + * 2 -> Zero out IO latency histograms + */ +static ssize_t +latency_hist_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + long value; + + if (kstrtol(buf, 0, &value)) + return -EINVAL; + if (value == BLK_IO_LAT_HIST_ZERO) + blk_zero_latency_hist(&hba->io_lat_s); + else if (value == BLK_IO_LAT_HIST_ENABLE || + value == BLK_IO_LAT_HIST_DISABLE) + hba->latency_hist_enabled = value; + return count; +} + +ssize_t +latency_hist_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + + return blk_latency_hist_show(&hba->io_lat_s, buf); +} + +static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, + latency_hist_show, latency_hist_store); + +static void +ufshcd_init_latency_hist(struct ufs_hba *hba) +{ + if (device_create_file(hba->dev, &dev_attr_latency_hist)) + dev_err(hba->dev, "Failed to create latency_hist sysfs entry\n"); +} + +static void +ufshcd_exit_latency_hist(struct ufs_hba *hba) +{ + device_create_file(hba->dev, &dev_attr_latency_hist); +} + /** * ufshcd_remove - de-allocate SCSI host and host memory space * data structure memory @@ -5342,6 +5419,7 @@ void ufshcd_remove(struct ufs_hba *hba) scsi_host_put(hba->host); ufshcd_exit_clk_gating(hba); + ufshcd_exit_latency_hist(hba); if (ufshcd_is_clkscaling_enabled(hba)) devfreq_remove_device(hba->devfreq); ufshcd_hba_exit(hba); @@ -5639,6 +5717,8 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) /* Hold auto suspend until async scan completes */ pm_runtime_get_sync(dev); + ufshcd_init_latency_hist(hba); + /* * The device-initialize-sequence hasn't been invoked yet. * Set the device to power-off state @@ -5653,6 +5733,7 @@ out_remove_scsi_host: scsi_remove_host(hba->host); exit_gating: ufshcd_exit_clk_gating(hba); + ufshcd_exit_latency_hist(hba); out_disable: hba->is_irq_enabled = false; scsi_host_put(host); diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index 2570d9477b37..f3780cf7d895 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -532,6 +532,9 @@ struct ufs_hba { struct devfreq *devfreq; struct ufs_clk_scaling clk_scaling; bool is_sys_suspended; + + int latency_hist_enabled; + struct io_latency_state io_lat_s; }; /* Returns true if clocks can be gated. Otherwise false */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c70e3588a48c..faf2c0093527 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -197,6 +197,9 @@ struct request { /* for bidi */ struct request *next_rq; + + ktime_t lat_hist_io_start; + int lat_hist_enabled; }; static inline unsigned short req_get_ioprio(struct request *req) @@ -1645,6 +1648,79 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); extern long bdev_direct_access(struct block_device *, sector_t, void __pmem **addr, unsigned long *pfn, long size); + +/* + * X-axis for IO latency histogram support. 
+ */ +static const u_int64_t latency_x_axis_us[] = { + 100, + 200, + 300, + 400, + 500, + 600, + 700, + 800, + 900, + 1000, + 1200, + 1400, + 1600, + 1800, + 2000, + 2500, + 3000, + 4000, + 5000, + 6000, + 7000, + 9000, + 10000 +}; + +#define BLK_IO_LAT_HIST_DISABLE 0 +#define BLK_IO_LAT_HIST_ENABLE 1 +#define BLK_IO_LAT_HIST_ZERO 2 + +struct io_latency_state { + u_int64_t latency_y_axis_read[ARRAY_SIZE(latency_x_axis_us) + 1]; + u_int64_t latency_reads_elems; + u_int64_t latency_y_axis_write[ARRAY_SIZE(latency_x_axis_us) + 1]; + u_int64_t latency_writes_elems; +}; + +static inline void +blk_update_latency_hist(struct io_latency_state *s, + int read, + u_int64_t delta_us) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(latency_x_axis_us); i++) { + if (delta_us < (u_int64_t)latency_x_axis_us[i]) { + if (read) + s->latency_y_axis_read[i]++; + else + s->latency_y_axis_write[i]++; + break; + } + } + if (i == ARRAY_SIZE(latency_x_axis_us)) { + /* Overflowed the histogram */ + if (read) + s->latency_y_axis_read[i]++; + else + s->latency_y_axis_write[i]++; + } + if (read) + s->latency_reads_elems++; + else + s->latency_writes_elems++; +} + +void blk_zero_latency_hist(struct io_latency_state *s); +ssize_t blk_latency_hist_show(struct io_latency_state *s, char *buf); + #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index 37967b6da03c..3349f0676acb 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -136,6 +136,8 @@ struct mmc_request { struct completion completion; void (*done)(struct mmc_request *);/* completion function */ struct mmc_host *host; + ktime_t io_start; + int lat_hist_enabled; }; struct mmc_card; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 40025b28c1fb..e4862f7cdede 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -379,6 +380,9 @@ struct mmc_host { } embedded_sdio_data; #endif + int latency_hist_enabled; + struct io_latency_state io_lat_s; + unsigned long private[0] ____cacheline_aligned; }; From 1c6df5fdcb19a03fb6b652090b4b968a6377fb14 Mon Sep 17 00:00:00 2001 From: Mohamad Ayyash Date: Thu, 25 Aug 2016 00:59:21 +0000 Subject: [PATCH 295/607] Revert "Android: MMC/UFS IO Latency Histograms." This reverts commit 8d525c512280bb7d8218fd59c04de985b1886eca. Change-Id: I69350b98d9de9b1c9f591e03a90f133e328ba72a --- block/blk-core.c | 80 -------------------------------------- drivers/mmc/core/core.c | 66 +------------------------------ drivers/mmc/core/host.c | 6 +-- drivers/mmc/core/host.h | 5 --- drivers/scsi/ufs/ufshcd.c | 81 --------------------------------------- drivers/scsi/ufs/ufshcd.h | 3 -- include/linux/blkdev.h | 76 ------------------------------------ include/linux/mmc/core.h | 2 - include/linux/mmc/host.h | 4 -- 9 files changed, 3 insertions(+), 320 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index f95413fb7f2b..33e2f62d5062 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3539,83 +3539,3 @@ int __init blk_dev_init(void) return 0; } - -/* - * Blk IO latency support. We want this to be as cheap as possible, so doing - * this lockless (and avoiding atomics), a few off by a few errors in this - * code is not harmful, and we don't want to do anything that is - * perf-impactful. - * TODO : If necessary, we can make the histograms per-cpu and aggregate - * them when printing them out. 
- */ -void -blk_zero_latency_hist(struct io_latency_state *s) -{ - memset(s->latency_y_axis_read, 0, - sizeof(s->latency_y_axis_read)); - memset(s->latency_y_axis_write, 0, - sizeof(s->latency_y_axis_write)); - s->latency_reads_elems = 0; - s->latency_writes_elems = 0; -} - -ssize_t -blk_latency_hist_show(struct io_latency_state *s, char *buf) -{ - int i; - int bytes_written = 0; - int pct, num_elem; - u_int64_t elem; - - num_elem = s->latency_reads_elems; - if (num_elem > 0) { - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "IO svc_time Read Latency Histogram (n = %d):\n", - num_elem); - for (i = 0; - i < ARRAY_SIZE(latency_x_axis_us); - i++) { - elem = s->latency_y_axis_read[i]; - pct = (elem * 100) / num_elem; - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "\t< %5lluus%15llu%15d%%\n", - latency_x_axis_us[i], - elem, pct); - } - /* Last element in y-axis table is overflow */ - elem = s->latency_y_axis_read[i]; - pct = (elem * 100) / num_elem; - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "\t> %5dms%15llu%15d%%\n", 10, - elem, pct); - } - num_elem = s->latency_writes_elems; - if (num_elem > 0) { - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "IO svc_time Write Latency Histogram (n = %d):\n", - num_elem); - for (i = 0; - i < ARRAY_SIZE(latency_x_axis_us); - i++) { - elem = s->latency_y_axis_write[i]; - pct = (elem * 100) / num_elem; - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "\t< %5lluus%15llu%15d%%\n", - latency_x_axis_us[i], - elem, pct); - } - /* Last element in y-axis table is overflow */ - elem = s->latency_y_axis_write[i]; - pct = (elem * 100) / num_elem; - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "\t> %5dms%15llu%15d%%\n", 10, - elem, pct); - } - return bytes_written; -} diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index b503249950df..9fab52559a8c 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -183,17 +183,6 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) pr_debug("%s: %d bytes transferred: %d\n", mmc_hostname(host), mrq->data->bytes_xfered, mrq->data->error); - if (mrq->lat_hist_enabled) { - ktime_t completion; - u_int64_t delta_us; - - completion = ktime_get(); - delta_us = ktime_us_delta(completion, - mrq->io_start); - blk_update_latency_hist(&host->io_lat_s, - (mrq->data->flags & MMC_DATA_READ), - delta_us); - } trace_mmc_blk_rw_end(cmd->opcode, cmd->arg, mrq->data); } @@ -638,11 +627,6 @@ struct mmc_async_req *mmc_start_req(struct mmc_host *host, } if (!err && areq) { - if (host->latency_hist_enabled) { - areq->mrq->io_start = ktime_get(); - areq->mrq->lat_hist_enabled = 1; - } else - areq->mrq->lat_hist_enabled = 0; trace_mmc_blk_rw_start(areq->mrq->cmd->opcode, areq->mrq->cmd->arg, areq->mrq->data); @@ -1980,7 +1964,7 @@ void mmc_init_erase(struct mmc_card *card) } static unsigned int mmc_mmc_erase_timeout(struct mmc_card *card, - unsigned int arg, unsigned int qty) + unsigned int arg, unsigned int qty) { unsigned int erase_timeout; @@ -2923,54 +2907,6 @@ static void __exit mmc_exit(void) destroy_workqueue(workqueue); } -static ssize_t -latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct mmc_host *host = cls_dev_to_mmc_host(dev); - - return blk_latency_hist_show(&host->io_lat_s, buf); -} - -/* - * Values permitted 0, 1, 2. 
- * 0 -> Disable IO latency histograms (default) - * 1 -> Enable IO latency histograms - * 2 -> Zero out IO latency histograms - */ -static ssize_t -latency_hist_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct mmc_host *host = cls_dev_to_mmc_host(dev); - long value; - - if (kstrtol(buf, 0, &value)) - return -EINVAL; - if (value == BLK_IO_LAT_HIST_ZERO) - blk_zero_latency_hist(&host->io_lat_s); - else if (value == BLK_IO_LAT_HIST_ENABLE || - value == BLK_IO_LAT_HIST_DISABLE) - host->latency_hist_enabled = value; - return count; -} - -static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, - latency_hist_show, latency_hist_store); - -void -mmc_latency_hist_sysfs_init(struct mmc_host *host) -{ - if (device_create_file(&host->class_dev, &dev_attr_latency_hist)) - dev_err(&host->class_dev, - "Failed to create latency_hist sysfs entry\n"); -} - -void -mmc_latency_hist_sysfs_exit(struct mmc_host *host) -{ - device_remove_file(&host->class_dev, &dev_attr_latency_hist); -} - subsys_initcall(mmc_init); module_exit(mmc_exit); diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 17068839c74b..fcf7829c759e 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -32,6 +32,8 @@ #include "slot-gpio.h" #include "pwrseq.h" +#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) + static DEFINE_IDR(mmc_host_idr); static DEFINE_SPINLOCK(mmc_host_lock); @@ -392,8 +394,6 @@ int mmc_add_host(struct mmc_host *host) mmc_add_host_debugfs(host); #endif - mmc_latency_hist_sysfs_init(host); - mmc_start_host(host); if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY)) register_pm_notifier(&host->pm_notify); @@ -422,8 +422,6 @@ void mmc_remove_host(struct mmc_host *host) mmc_remove_host_debugfs(host); #endif - mmc_latency_hist_sysfs_exit(host); - device_del(&host->class_dev); led_trigger_unregister_simple(host->led); diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h index bf38533406fd..992bf5397633 100644 --- a/drivers/mmc/core/host.h +++ b/drivers/mmc/core/host.h @@ -12,8 +12,6 @@ #define _MMC_CORE_HOST_H #include -#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) - int mmc_register_host_class(void); void mmc_unregister_host_class(void); @@ -23,8 +21,5 @@ void mmc_retune_hold(struct mmc_host *host); void mmc_retune_release(struct mmc_host *host); int mmc_retune(struct mmc_host *host); -void mmc_latency_hist_sysfs_init(struct mmc_host *host); -void mmc_latency_hist_sysfs_exit(struct mmc_host *host); - #endif diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 4167bdbf0ecf..85cd2564c157 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -39,7 +39,6 @@ #include #include -#include #include "ufshcd.h" #include "unipro.h" @@ -1333,17 +1332,6 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) clear_bit_unlock(tag, &hba->lrb_in_use); goto out; } - - /* IO svc time latency histogram */ - if (hba != NULL && cmd->request != NULL) { - if (hba->latency_hist_enabled && - (cmd->request->cmd_type == REQ_TYPE_FS)) { - cmd->request->lat_hist_io_start = ktime_get(); - cmd->request->lat_hist_enabled = 1; - } else - cmd->request->lat_hist_enabled = 0; - } - WARN_ON(hba->clk_gating.state != CLKS_ON); lrbp = &hba->lrb[tag]; @@ -3172,7 +3160,6 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) u32 tr_doorbell; int result; int index; - struct request *req; /* Resetting interrupt aggregation counters first and reading the * 
DOOR_BELL afterward allows us to handle all the completed requests. @@ -3197,22 +3184,6 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) /* Mark completed command as NULL in LRB */ lrbp->cmd = NULL; clear_bit_unlock(index, &hba->lrb_in_use); - req = cmd->request; - if (req) { - /* Update IO svc time latency histogram */ - if (req->lat_hist_enabled) { - ktime_t completion; - u_int64_t delta_us; - - completion = ktime_get(); - delta_us = ktime_us_delta(completion, - req->lat_hist_io_start); - /* rq_data_dir() => true if WRITE */ - blk_update_latency_hist(&hba->io_lat_s, - (rq_data_dir(req) == READ), - delta_us); - } - } /* Do not touch lrbp after scsi done */ cmd->scsi_done(cmd); __ufshcd_release(hba); @@ -5356,54 +5327,6 @@ out: } EXPORT_SYMBOL(ufshcd_shutdown); -/* - * Values permitted 0, 1, 2. - * 0 -> Disable IO latency histograms (default) - * 1 -> Enable IO latency histograms - * 2 -> Zero out IO latency histograms - */ -static ssize_t -latency_hist_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct ufs_hba *hba = dev_get_drvdata(dev); - long value; - - if (kstrtol(buf, 0, &value)) - return -EINVAL; - if (value == BLK_IO_LAT_HIST_ZERO) - blk_zero_latency_hist(&hba->io_lat_s); - else if (value == BLK_IO_LAT_HIST_ENABLE || - value == BLK_IO_LAT_HIST_DISABLE) - hba->latency_hist_enabled = value; - return count; -} - -ssize_t -latency_hist_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct ufs_hba *hba = dev_get_drvdata(dev); - - return blk_latency_hist_show(&hba->io_lat_s, buf); -} - -static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, - latency_hist_show, latency_hist_store); - -static void -ufshcd_init_latency_hist(struct ufs_hba *hba) -{ - if (device_create_file(hba->dev, &dev_attr_latency_hist)) - dev_err(hba->dev, "Failed to create latency_hist sysfs entry\n"); -} - -static void -ufshcd_exit_latency_hist(struct ufs_hba *hba) -{ - device_create_file(hba->dev, &dev_attr_latency_hist); -} - /** * ufshcd_remove - de-allocate SCSI host and host memory space * data structure memory @@ -5419,7 +5342,6 @@ void ufshcd_remove(struct ufs_hba *hba) scsi_host_put(hba->host); ufshcd_exit_clk_gating(hba); - ufshcd_exit_latency_hist(hba); if (ufshcd_is_clkscaling_enabled(hba)) devfreq_remove_device(hba->devfreq); ufshcd_hba_exit(hba); @@ -5717,8 +5639,6 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) /* Hold auto suspend until async scan completes */ pm_runtime_get_sync(dev); - ufshcd_init_latency_hist(hba); - /* * The device-initialize-sequence hasn't been invoked yet. * Set the device to power-off state @@ -5733,7 +5653,6 @@ out_remove_scsi_host: scsi_remove_host(hba->host); exit_gating: ufshcd_exit_clk_gating(hba); - ufshcd_exit_latency_hist(hba); out_disable: hba->is_irq_enabled = false; scsi_host_put(host); diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index f3780cf7d895..2570d9477b37 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -532,9 +532,6 @@ struct ufs_hba { struct devfreq *devfreq; struct ufs_clk_scaling clk_scaling; bool is_sys_suspended; - - int latency_hist_enabled; - struct io_latency_state io_lat_s; }; /* Returns true if clocks can be gated. 
Otherwise false */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index faf2c0093527..c70e3588a48c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -197,9 +197,6 @@ struct request { /* for bidi */ struct request *next_rq; - - ktime_t lat_hist_io_start; - int lat_hist_enabled; }; static inline unsigned short req_get_ioprio(struct request *req) @@ -1648,79 +1645,6 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); extern long bdev_direct_access(struct block_device *, sector_t, void __pmem **addr, unsigned long *pfn, long size); - -/* - * X-axis for IO latency histogram support. - */ -static const u_int64_t latency_x_axis_us[] = { - 100, - 200, - 300, - 400, - 500, - 600, - 700, - 800, - 900, - 1000, - 1200, - 1400, - 1600, - 1800, - 2000, - 2500, - 3000, - 4000, - 5000, - 6000, - 7000, - 9000, - 10000 -}; - -#define BLK_IO_LAT_HIST_DISABLE 0 -#define BLK_IO_LAT_HIST_ENABLE 1 -#define BLK_IO_LAT_HIST_ZERO 2 - -struct io_latency_state { - u_int64_t latency_y_axis_read[ARRAY_SIZE(latency_x_axis_us) + 1]; - u_int64_t latency_reads_elems; - u_int64_t latency_y_axis_write[ARRAY_SIZE(latency_x_axis_us) + 1]; - u_int64_t latency_writes_elems; -}; - -static inline void -blk_update_latency_hist(struct io_latency_state *s, - int read, - u_int64_t delta_us) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(latency_x_axis_us); i++) { - if (delta_us < (u_int64_t)latency_x_axis_us[i]) { - if (read) - s->latency_y_axis_read[i]++; - else - s->latency_y_axis_write[i]++; - break; - } - } - if (i == ARRAY_SIZE(latency_x_axis_us)) { - /* Overflowed the histogram */ - if (read) - s->latency_y_axis_read[i]++; - else - s->latency_y_axis_write[i]++; - } - if (read) - s->latency_reads_elems++; - else - s->latency_writes_elems++; -} - -void blk_zero_latency_hist(struct io_latency_state *s); -ssize_t blk_latency_hist_show(struct io_latency_state *s, char *buf); - #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index 3349f0676acb..37967b6da03c 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -136,8 +136,6 @@ struct mmc_request { struct completion completion; void (*done)(struct mmc_request *);/* completion function */ struct mmc_host *host; - ktime_t io_start; - int lat_hist_enabled; }; struct mmc_card; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index e4862f7cdede..40025b28c1fb 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -380,9 +379,6 @@ struct mmc_host { } embedded_sdio_data; #endif - int latency_hist_enabled; - struct io_latency_state io_lat_s; - unsigned long private[0] ____cacheline_aligned; }; From 8a58605e4661d18b4e09f1385fe22b1e83cd2cb6 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 23 Aug 2016 11:32:37 -0700 Subject: [PATCH 296/607] ANDROID: dm: android-verity: Allow android-verity to be compiled as an independent module Exports the device mapper callbacks of linear and dm-verity-target methods. 
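A rough sketch (assumed names, not the real dm-linear/dm-verity symbols) of the export/consume pairing a tristate option relies on: the built-in code exports a GPL symbol, and the now-loadable module simply declares and calls it, with resolution done by the module loader.

    /* In the built-in provider, e.g. drivers/md/dm-linear.c: */
    #include <linux/module.h>

    int dm_example_map(void)            /* hypothetical helper */
    {
            return 0;
    }
    EXPORT_SYMBOL_GPL(dm_example_map);

    /* In the tristate consumer, e.g. drivers/md/dm-android-verity.c: */
    extern int dm_example_map(void);

    static int __init example_verity_init(void)
    {
            return dm_example_map();    /* resolved at module load time */
    }
    module_init(example_verity_init);
    MODULE_LICENSE("GPL");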
Signed-off-by: Badhri Jagan Sridharan Change-Id: I0358be0615c431dce3cc78575aaac4ccfe3aacd7 --- drivers/md/Kconfig | 3 ++- drivers/md/Makefile | 5 +---- drivers/md/dm-linear.c | 6 ++++++ drivers/md/dm-verity-target.c | 7 +++++++ 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 96b419b544ed..6035794bc1f2 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -501,7 +501,7 @@ config DM_LOG_WRITES If unsure, say N. config DM_ANDROID_VERITY - bool "Android verity target support" + tristate "Android verity target support" depends on DM_VERITY depends on X509_CERTIFICATE_PARSER depends on SYSTEM_TRUSTED_KEYRING @@ -509,6 +509,7 @@ config DM_ANDROID_VERITY depends on KEYS depends on ASYMMETRIC_KEY_TYPE depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE + depends on MD_LINEAR ---help--- This device-mapper target is virtually a VERITY target. This target is setup by reading the metadata contents piggybacked diff --git a/drivers/md/Makefile b/drivers/md/Makefile index c8fb00d8cc36..32b5d0a90d60 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -60,6 +60,7 @@ obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o obj-$(CONFIG_DM_ERA) += dm-era.o obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o +obj-$(CONFIG_DM_ANDROID_VERITY) += dm-android-verity.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o @@ -68,7 +69,3 @@ endif ifeq ($(CONFIG_DM_VERITY_FEC),y) dm-verity-objs += dm-verity-fec.o endif - -ifeq ($(CONFIG_DM_ANDROID_VERITY),y) -dm-verity-objs += dm-android-verity.o -endif diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 8505a771de42..2ff5f32a4b99 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -66,6 +66,7 @@ int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) kfree(lc); return ret; } +EXPORT_SYMBOL_GPL(dm_linear_ctr); void dm_linear_dtr(struct dm_target *ti) { @@ -74,6 +75,7 @@ void dm_linear_dtr(struct dm_target *ti) dm_put_device(ti, lc->dev); kfree(lc); } +EXPORT_SYMBOL_GPL(dm_linear_dtr); static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector) { @@ -98,6 +100,7 @@ int dm_linear_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } +EXPORT_SYMBOL_GPL(dm_linear_map); void dm_linear_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) @@ -115,6 +118,7 @@ void dm_linear_status(struct dm_target *ti, status_type_t type, break; } } +EXPORT_SYMBOL_GPL(dm_linear_status); int dm_linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) @@ -132,6 +136,7 @@ int dm_linear_prepare_ioctl(struct dm_target *ti, return 1; return 0; } +EXPORT_SYMBOL_GPL(dm_linear_prepare_ioctl); int dm_linear_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) @@ -140,6 +145,7 @@ int dm_linear_iterate_devices(struct dm_target *ti, return fn(ti, lc->dev, lc->start, ti->len, data); } +EXPORT_SYMBOL_GPL(dm_linear_iterate_devices); static struct target_type linear_target = { .name = "linear", diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 5214ed2c7507..9d3d4b297201 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -592,6 +592,7 @@ int verity_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } +EXPORT_SYMBOL_GPL(verity_map); /* * Status: V (valid) or C (corruption found) @@ -655,6 +656,7 @@ void verity_status(struct dm_target *ti, 
status_type_t type, break; } } +EXPORT_SYMBOL_GPL(verity_status); int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) @@ -668,6 +670,7 @@ int verity_prepare_ioctl(struct dm_target *ti, return 1; return 0; } +EXPORT_SYMBOL_GPL(verity_prepare_ioctl); int verity_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) @@ -676,6 +679,7 @@ int verity_iterate_devices(struct dm_target *ti, return fn(ti, v->data_dev, v->data_start, ti->len, data); } +EXPORT_SYMBOL_GPL(verity_iterate_devices); void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) { @@ -689,6 +693,7 @@ void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, limits->logical_block_size); } +EXPORT_SYMBOL_GPL(verity_io_hints); void verity_dtr(struct dm_target *ti) { @@ -719,6 +724,7 @@ void verity_dtr(struct dm_target *ti) kfree(v); } +EXPORT_SYMBOL_GPL(verity_dtr); static int verity_alloc_zero_digest(struct dm_verity *v) { @@ -1053,6 +1059,7 @@ bad: return r; } +EXPORT_SYMBOL_GPL(verity_ctr); static struct target_type verity_target = { .name = "verity", From 8f5bf76871c346bfc4fed9fcf85eff79948aade2 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sat, 11 Jun 2016 20:32:06 +0200 Subject: [PATCH 297/607] ipv6: fix endianness error in icmpv6_err IPv6 ping socket error handler doesn't correctly convert the new 32 bit mtu to host endianness before using. [Cherry-pick of net dcb94b88c09ce82a80e188d49bcffdc83ba215a6] Bug: 29370996 Change-Id: Iea0ca79f16c2a1366d82b3b0a3097093d18da8b7 Cc: Lorenzo Colitti Fixes: 6d0bfe22611602f ("net: ipv6: Add IPv6 support to the ping socket.") Signed-off-by: Hannes Frederic Sowa Acked-by: Lorenzo Colitti Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index a3ec7a77a1ee..41e5c9520c7d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -98,7 +98,7 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!(type & ICMPV6_INFOMSG_MASK)) if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST) - ping_err(skb, offset, info); + ping_err(skb, offset, ntohl(info)); } static int icmpv6_rcv(struct sk_buff *skb); From 75c591ea5787a59e69c8d2b803a7f4f0c29ec34e Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Sat, 13 Aug 2016 01:13:38 +0900 Subject: [PATCH 298/607] net: ipv6: Fix ping to link-local addresses. ping_v6_sendmsg does not set flowi6_oif in response to sin6_scope_id or sk_bound_dev_if, so it is not possible to use these APIs to ping an IPv6 address on a different interface. Instead, it sets flowi6_iif, which is incorrect but harmless. Stop setting flowi6_iif, and support various ways of setting oif in the same priority order used by udpv6_sendmsg. [Backport of net 5e457896986e16c440c97bb94b9ccd95dd157292] Bug: 29370996 Change-Id: Ibe1b9434c00ed96f1e30acb110734c6570b087b8 Tested: https://android-review.googlesource.com/#/c/254470/ Signed-off-by: Lorenzo Colitti Signed-off-by: David S. 
Miller --- net/ipv6/ping.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 2cfaedf03252..9411c8d770a5 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -84,7 +84,7 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct icmp6hdr user_icmph; int addr_type; struct in6_addr *daddr; - int iif = 0; + int oif = 0; struct flowi6 fl6; int err; int hlimit; @@ -106,25 +106,30 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (u->sin6_family != AF_INET6) { return -EAFNOSUPPORT; } - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != u->sin6_scope_id) { - return -EINVAL; - } daddr = &(u->sin6_addr); - iif = u->sin6_scope_id; + if (__ipv6_addr_needs_scope_id(ipv6_addr_type(daddr))) + oif = u->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = &sk->sk_v6_daddr; } - if (!iif) - iif = sk->sk_bound_dev_if; + if (!oif) + oif = sk->sk_bound_dev_if; + + if (!oif) + oif = np->sticky_pktinfo.ipi6_ifindex; + + if (!oif && ipv6_addr_is_multicast(daddr)) + oif = np->mcast_oif; + else if (!oif) + oif = np->ucast_oif; addr_type = ipv6_addr_type(daddr); - if (__ipv6_addr_needs_scope_id(addr_type) && !iif) - return -EINVAL; - if (addr_type & IPV6_ADDR_MAPPED) + if ((__ipv6_addr_needs_scope_id(addr_type) && !oif) || + (addr_type & IPV6_ADDR_MAPPED) || + (oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if)) return -EINVAL; /* TODO: use ip6_datagram_send_ctl to get options from cmsg */ @@ -134,17 +139,13 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.saddr = np->saddr; fl6.daddr = *daddr; + fl6.flowi6_oif = oif; fl6.flowi6_mark = sk->sk_mark; fl6.flowi6_uid = sock_i_uid(sk); fl6.fl6_icmp_type = user_icmph.icmp6_type; fl6.fl6_icmp_code = user_icmph.icmp6_code; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) - fl6.flowi6_oif = np->mcast_oif; - else if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->ucast_oif; - dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr); if (IS_ERR(dst)) return PTR_ERR(dst); @@ -154,11 +155,6 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!np) return -EBADF; - if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) - fl6.flowi6_oif = np->mcast_oif; - else if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->ucast_oif; - pfh.icmph.type = user_icmph.icmp6_type; pfh.icmph.code = user_icmph.icmp6_code; pfh.icmph.checksum = 0; From 3228c5eb7af2b4cb981706b88ed3c3e81ab8e80a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Aug 2016 08:44:12 -0700 Subject: [PATCH 299/607] RFC: FROMLIST: locking/percpu-rwsem: Optimize readers and reduce global impact Currently the percpu-rwsem switches to (global) atomic ops while a writer is waiting; which could be quite a while and slows down releasing the readers. This patch cures this problem by ordering the reader-state vs reader-count (see the comments in __percpu_down_read() and percpu_down_write()). This changes a global atomic op into a full memory barrier, which doesn't have the global cacheline contention. This also enables using the percpu-rwsem with rcu_sync disabled in order to bias the implementation differently, reducing the writer latency by adding some cost to readers. 
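As a rough usage sketch of the API being reworked (illustrative only; example_sem and the example_* functions are invented and nothing below is taken from the patch), a subsystem keeps its frequent readers on the per-CPU fast path while the occasional writer pays for the grace period and the wait:

#include <linux/percpu-rwsem.h>

static struct percpu_rw_semaphore example_sem;  /* hypothetical */

static int example_setup(void)
{
        return percpu_init_rwsem(&example_sem);
}

static void example_read_path(void)
{
        percpu_down_read(&example_sem);   /* per-CPU increment while no writer is active */
        /* ... frequent, short read-side critical section ... */
        percpu_up_read(&example_sem);
}

static void example_write_path(void)
{
        percpu_down_write(&example_sem);  /* blocks new readers, waits out active ones */
        /* ... rare write-side critical section ... */
        percpu_up_write(&example_sem);
}

static void example_teardown(void)
{
        percpu_free_rwsem(&example_sem);
}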
Mailing-list-URL: https://lkml.org/lkml/2016/8/9/181 Cc: Paul McKenney Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) [jstultz: Backported to 4.4] Change-Id: I8ea04b4dca2ec36f1c2469eccafde1423490572f Signed-off-by: John Stultz --- include/linux/percpu-rwsem.h | 84 +++++++++-- kernel/locking/percpu-rwsem.c | 265 +++++++++++++++++++--------------- 2 files changed, 225 insertions(+), 124 deletions(-) diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index c2fa3ecb0dce..146efefde2a1 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -10,30 +10,96 @@ struct percpu_rw_semaphore { struct rcu_sync rss; - unsigned int __percpu *fast_read_ctr; + unsigned int __percpu *read_count; struct rw_semaphore rw_sem; - atomic_t slow_read_ctr; - wait_queue_head_t write_waitq; + wait_queue_head_t writer; + int readers_block; }; -extern void percpu_down_read(struct percpu_rw_semaphore *); -extern int percpu_down_read_trylock(struct percpu_rw_semaphore *); -extern void percpu_up_read(struct percpu_rw_semaphore *); +extern int __percpu_down_read(struct percpu_rw_semaphore *, int); +extern void __percpu_up_read(struct percpu_rw_semaphore *); + +static inline void percpu_down_read(struct percpu_rw_semaphore *sem) +{ + might_sleep(); + + rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_); + + preempt_disable(); + /* + * We are in an RCU-sched read-side critical section, so the writer + * cannot both change sem->state from readers_fast and start checking + * counters while we are here. So if we see !sem->state, we know that + * the writer won't be checking until we're past the preempt_enable() + * and that one the synchronize_sched() is done, the writer will see + * anything we did within this RCU-sched read-size critical section. + */ + __this_cpu_inc(*sem->read_count); + if (unlikely(!rcu_sync_is_idle(&sem->rss))) + __percpu_down_read(sem, false); /* Unconditional memory barrier */ + preempt_enable(); + /* + * The barrier() from preempt_enable() prevents the compiler from + * bleeding the critical section out. + */ +} + +static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem) +{ + int ret = 1; + + preempt_disable(); + /* + * Same as in percpu_down_read(). + */ + __this_cpu_inc(*sem->read_count); + if (unlikely(!rcu_sync_is_idle(&sem->rss))) + ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */ + preempt_enable(); + /* + * The barrier() from preempt_enable() prevents the compiler from + * bleeding the critical section out. + */ + + if (ret) + rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_); + + return ret; +} + +static inline void percpu_up_read(struct percpu_rw_semaphore *sem) +{ + /* + * The barrier() in preempt_disable() prevents the compiler from + * bleeding the critical section out. + */ + preempt_disable(); + /* + * Same as in percpu_down_read(). 
+ */ + if (likely(rcu_sync_is_idle(&sem->rss))) + __this_cpu_dec(*sem->read_count); + else + __percpu_up_read(sem); /* Unconditional memory barrier */ + preempt_enable(); + + rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_); +} extern void percpu_down_write(struct percpu_rw_semaphore *); extern void percpu_up_write(struct percpu_rw_semaphore *); extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, const char *, struct lock_class_key *); + extern void percpu_free_rwsem(struct percpu_rw_semaphore *); -#define percpu_init_rwsem(brw) \ +#define percpu_init_rwsem(sem) \ ({ \ static struct lock_class_key rwsem_key; \ - __percpu_init_rwsem(brw, #brw, &rwsem_key); \ + __percpu_init_rwsem(sem, #sem, &rwsem_key); \ }) - #define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem) static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f231e0bb311c..ce182599cf2e 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -8,151 +8,186 @@ #include #include -int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, +int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *rwsem_key) { - brw->fast_read_ctr = alloc_percpu(int); - if (unlikely(!brw->fast_read_ctr)) + sem->read_count = alloc_percpu(int); + if (unlikely(!sem->read_count)) return -ENOMEM; /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ - __init_rwsem(&brw->rw_sem, name, rwsem_key); - rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); - atomic_set(&brw->slow_read_ctr, 0); - init_waitqueue_head(&brw->write_waitq); + rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); + __init_rwsem(&sem->rw_sem, name, rwsem_key); + init_waitqueue_head(&sem->writer); + sem->readers_block = 0; return 0; } EXPORT_SYMBOL_GPL(__percpu_init_rwsem); -void percpu_free_rwsem(struct percpu_rw_semaphore *brw) +void percpu_free_rwsem(struct percpu_rw_semaphore *sem) { /* * XXX: temporary kludge. The error path in alloc_super() * assumes that percpu_free_rwsem() is safe after kzalloc(). */ - if (!brw->fast_read_ctr) + if (!sem->read_count) return; - rcu_sync_dtor(&brw->rss); - free_percpu(brw->fast_read_ctr); - brw->fast_read_ctr = NULL; /* catch use after free bugs */ + rcu_sync_dtor(&sem->rss); + free_percpu(sem->read_count); + sem->read_count = NULL; /* catch use after free bugs */ } +EXPORT_SYMBOL_GPL(percpu_free_rwsem); -/* - * This is the fast-path for down_read/up_read. If it succeeds we rely - * on the barriers provided by rcu_sync_enter/exit; see the comments in - * percpu_down_write() and percpu_up_write(). - * - * If this helper fails the callers rely on the normal rw_semaphore and - * atomic_dec_and_test(), so in this case we have the necessary barriers. - */ -static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) -{ - bool success; - - preempt_disable(); - success = rcu_sync_is_idle(&brw->rss); - if (likely(success)) - __this_cpu_add(*brw->fast_read_ctr, val); - preempt_enable(); - - return success; -} - -/* - * Like the normal down_read() this is not recursive, the writer can - * come after the first percpu_down_read() and create the deadlock. - * - * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, - * percpu_up_read() does rwsem_release(). This pairs with the usage - * of ->rw_sem in percpu_down/up_write(). 
- */ -void percpu_down_read(struct percpu_rw_semaphore *brw) -{ - might_sleep(); - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); - - if (likely(update_fast_ctr(brw, +1))) - return; - - /* Avoid rwsem_acquire_read() and rwsem_release() */ - __down_read(&brw->rw_sem); - atomic_inc(&brw->slow_read_ctr); - __up_read(&brw->rw_sem); -} -EXPORT_SYMBOL_GPL(percpu_down_read); - -int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) -{ - if (unlikely(!update_fast_ctr(brw, +1))) { - if (!__down_read_trylock(&brw->rw_sem)) - return 0; - atomic_inc(&brw->slow_read_ctr); - __up_read(&brw->rw_sem); - } - - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); - return 1; -} - -void percpu_up_read(struct percpu_rw_semaphore *brw) -{ - rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); - - if (likely(update_fast_ctr(brw, -1))) - return; - - /* false-positive is possible but harmless */ - if (atomic_dec_and_test(&brw->slow_read_ctr)) - wake_up_all(&brw->write_waitq); -} -EXPORT_SYMBOL_GPL(percpu_up_read); - -static int clear_fast_ctr(struct percpu_rw_semaphore *brw) -{ - unsigned int sum = 0; - int cpu; - - for_each_possible_cpu(cpu) { - sum += per_cpu(*brw->fast_read_ctr, cpu); - per_cpu(*brw->fast_read_ctr, cpu) = 0; - } - - return sum; -} - -void percpu_down_write(struct percpu_rw_semaphore *brw) +int __percpu_down_read(struct percpu_rw_semaphore *sem, int try) { /* - * Make rcu_sync_is_idle() == F and thus disable the fast-path in - * percpu_down_read() and percpu_up_read(), and wait for gp pass. + * Due to having preemption disabled the decrement happens on + * the same CPU as the increment, avoiding the + * increment-on-one-CPU-and-decrement-on-another problem. * - * The latter synchronises us with the preceding readers which used - * the fast-past, so we can not miss the result of __this_cpu_add() - * or anything else inside their criticial sections. + * If the reader misses the writer's assignment of readers_block, then + * the writer is guaranteed to see the reader's increment. + * + * Conversely, any readers that increment their sem->read_count after + * the writer looks are guaranteed to see the readers_block value, + * which in turn means that they are guaranteed to immediately + * decrement their sem->read_count, so that it doesn't matter that the + * writer missed them. */ - rcu_sync_enter(&brw->rss); - /* exclude other writers, and block the new readers completely */ - down_write(&brw->rw_sem); + smp_mb(); /* A matches D */ - /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ - atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); + /* + * If !readers_block the critical section starts here, matched by the + * release in percpu_up_write(). + */ + if (likely(!smp_load_acquire(&sem->readers_block))) + return 1; - /* wait for all readers to complete their percpu_up_read() */ - wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); + /* + * Per the above comment; we still have preemption disabled and + * will thus decrement on the same CPU as we incremented. + */ + __percpu_up_read(sem); + + if (try) + return 0; + + /* + * We either call schedule() in the wait, or we'll fall through + * and reschedule on the preempt_enable() in percpu_down_read(). + */ + preempt_enable_no_resched(); + + /* + * Avoid lockdep for the down/up_read() we already have them. 
+ */ + __down_read(&sem->rw_sem); + this_cpu_inc(*sem->read_count); + __up_read(&sem->rw_sem); + + preempt_disable(); + return 1; +} +EXPORT_SYMBOL_GPL(__percpu_down_read); + +void __percpu_up_read(struct percpu_rw_semaphore *sem) +{ + smp_mb(); /* B matches C */ + /* + * In other words, if they see our decrement (presumably to aggregate + * zero, as that is the only time it matters) they will also see our + * critical section. + */ + __this_cpu_dec(*sem->read_count); + + /* Prod writer to recheck readers_active */ + wake_up(&sem->writer); +} +EXPORT_SYMBOL_GPL(__percpu_up_read); + +#define per_cpu_sum(var) \ +({ \ + typeof(var) __sum = 0; \ + int cpu; \ + compiletime_assert_atomic_type(__sum); \ + for_each_possible_cpu(cpu) \ + __sum += per_cpu(var, cpu); \ + __sum; \ +}) + +/* + * Return true if the modular sum of the sem->read_count per-CPU variable is + * zero. If this sum is zero, then it is stable due to the fact that if any + * newly arriving readers increment a given counter, they will immediately + * decrement that same counter. + */ +static bool readers_active_check(struct percpu_rw_semaphore *sem) +{ + if (per_cpu_sum(*sem->read_count) != 0) + return false; + + /* + * If we observed the decrement; ensure we see the entire critical + * section. + */ + + smp_mb(); /* C matches B */ + + return true; +} + +void percpu_down_write(struct percpu_rw_semaphore *sem) +{ + /* Notify readers to take the slow path. */ + rcu_sync_enter(&sem->rss); + + down_write(&sem->rw_sem); + + /* + * Notify new readers to block; up until now, and thus throughout the + * longish rcu_sync_enter() above, new readers could still come in. + */ + WRITE_ONCE(sem->readers_block, 1); + + smp_mb(); /* D matches A */ + + /* + * If they don't see our writer of readers_block, then we are + * guaranteed to see their sem->read_count increment, and therefore + * will wait for them. + */ + + /* Wait for all now active readers to complete. */ + wait_event(sem->writer, readers_active_check(sem)); } EXPORT_SYMBOL_GPL(percpu_down_write); -void percpu_up_write(struct percpu_rw_semaphore *brw) +void percpu_up_write(struct percpu_rw_semaphore *sem) { - /* release the lock, but the readers can't use the fast-path */ - up_write(&brw->rw_sem); /* - * Enable the fast-path in percpu_down_read() and percpu_up_read() - * but only after another gp pass; this adds the necessary barrier - * to ensure the reader can't miss the changes done by us. + * Signal the writer is done, no fast path yet. + * + * One reason that we cannot just immediately flip to readers_fast is + * that new readers might fail to see the results of this writer's + * critical section. + * + * Therefore we force it through the slow path which guarantees an + * acquire and thereby guarantees the critical section's consistency. */ - rcu_sync_exit(&brw->rss); + smp_store_release(&sem->readers_block, 0); + + /* + * Release the write lock, this will allow readers back in the game. + */ + up_write(&sem->rw_sem); + + /* + * Once this completes (at least one RCU-sched grace period hence) the + * reader fast path will be available again. Safe to use outside the + * exclusive write lock because its counting. 
+ */ + rcu_sync_exit(&sem->rss); } EXPORT_SYMBOL_GPL(percpu_up_write); From 0c3240a1ef2e840aaa17f593326e3642bc857aa7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Aug 2016 18:54:13 +0200 Subject: [PATCH 300/607] RFC: FROMLIST: cgroup: avoid synchronize_sched() in __cgroup_procs_write() The current percpu-rwsem read side is entirely free of serializing insns at the cost of having a synchronize_sched() in the write path. The latency of the synchronize_sched() is too high for cgroups. The commit 1ed1328792ff talks about the write path being a fairly cold path but this is not the case for Android which moves task to the foreground cgroup and back around binder IPC calls from foreground processes to background processes, so it is significantly hotter than human initiated operations. Switch cgroup_threadgroup_rwsem into the slow mode for now to avoid the problem, hopefully it should not be that slow after another commit 80127a39681b ("locking/percpu-rwsem: Optimize readers and reduce global impact"). We could just add rcu_sync_enter() into cgroup_init() but we do not want another synchronize_sched() at boot time, so this patch adds the new helper which doesn't block but currently can only be called before the first use. Cc: Tejun Heo Cc: Paul McKenney Reported-by: John Stultz Reported-by: Dmitry Shmidt Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Oleg Nesterov [jstultz: backported to 4.4] Change-Id: I34aa9c394d3052779b56976693e96d861bd255f2 Mailing-list-URL: https://lkml.org/lkml/2016/8/11/557 Signed-off-by: John Stultz --- include/linux/rcu_sync.h | 1 + kernel/cgroup.c | 6 ++++++ kernel/rcu/sync.c | 12 ++++++++++++ 3 files changed, 19 insertions(+) diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h index a63a33e6196e..ece7ed9a4a70 100644 --- a/include/linux/rcu_sync.h +++ b/include/linux/rcu_sync.h @@ -59,6 +59,7 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) } extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type); +extern void rcu_sync_enter_start(struct rcu_sync *); extern void rcu_sync_enter(struct rcu_sync *); extern void rcu_sync_exit(struct rcu_sync *); extern void rcu_sync_dtor(struct rcu_sync *); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a57b7c86871c..bd541053efc2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5346,6 +5346,12 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + /* + * The latency of the synchronize_sched() is too high for cgroups, + * avoid it at the cost of forcing all readers into the slow path. + */ + rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); + mutex_lock(&cgroup_mutex); /* Add init_css_set to the hash table */ diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index be922c9f3d37..e358313a0d6c 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -82,6 +82,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) rsp->gp_type = type; } +/** + * Must be called after rcu_sync_init() and before first use. + * + * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() + * pairs turn into NO-OPs. 
+ */ +void rcu_sync_enter_start(struct rcu_sync *rsp) +{ + rsp->gp_count++; + rsp->gp_state = GP_PASSED; +} + /** * rcu_sync_enter() - Force readers onto slowpath * @rsp: Pointer to rcu_sync structure to use for synchronization From e91f1799ff2cc3883907b5f3e141507f9716ff0e Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 10 Aug 2016 15:43:06 -0400 Subject: [PATCH 301/607] RFC: FROMLIST: cgroup: reduce read locked section of cgroup_threadgroup_rwsem during fork cgroup_threadgroup_rwsem is acquired in read mode during process exit and fork. It is also grabbed in write mode during __cgroups_proc_write(). I've recently run into a scenario with lots of memory pressure and OOM and I am beginning to see systemd __switch_to+0x1f8/0x350 __schedule+0x30c/0x990 schedule+0x48/0xc0 percpu_down_write+0x114/0x170 __cgroup_procs_write.isra.12+0xb8/0x3c0 cgroup_file_write+0x74/0x1a0 kernfs_fop_write+0x188/0x200 __vfs_write+0x6c/0xe0 vfs_write+0xc0/0x230 SyS_write+0x6c/0x110 system_call+0x38/0xb4 This thread is waiting on the reader of cgroup_threadgroup_rwsem to exit. The reader itself is under memory pressure and has gone into reclaim after fork. There are times the reader also ends up waiting on oom_lock as well. __switch_to+0x1f8/0x350 __schedule+0x30c/0x990 schedule+0x48/0xc0 jbd2_log_wait_commit+0xd4/0x180 ext4_evict_inode+0x88/0x5c0 evict+0xf8/0x2a0 dispose_list+0x50/0x80 prune_icache_sb+0x6c/0x90 super_cache_scan+0x190/0x210 shrink_slab.part.15+0x22c/0x4c0 shrink_zone+0x288/0x3c0 do_try_to_free_pages+0x1dc/0x590 try_to_free_pages+0xdc/0x260 __alloc_pages_nodemask+0x72c/0xc90 alloc_pages_current+0xb4/0x1a0 page_table_alloc+0xc0/0x170 __pte_alloc+0x58/0x1f0 copy_page_range+0x4ec/0x950 copy_process.isra.5+0x15a0/0x1870 _do_fork+0xa8/0x4b0 ppc_clone+0x8/0xc In the meanwhile, all processes exiting/forking are blocked almost stalling the system. This patch moves the threadgroup_change_begin from before cgroup_fork() to just before cgroup_canfork(). There is no nee to worry about threadgroup changes till the task is actually added to the threadgroup. This avoids having to call reclaim with cgroup_threadgroup_rwsem held. tj: Subject and description edits. Signed-off-by: Balbir Singh Acked-by: Zefan Li Cc: Oleg Nesterov Cc: Andrew Morton Cc: stable@vger.kernel.org # v4.2+ Signed-off-by: Tejun Heo [jstultz: Cherry-picked from: git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 568ac888215c7f] Change-Id: Ie8ece84fb613cf6a7b08cea1468473a8df2b9661 Signed-off-by: John Stultz --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 7ec6e9939b2c..d6a6da547e41 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1370,7 +1370,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; - threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1522,6 +1521,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; + threadgroup_change_begin(current); /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. 
It should be noted the the new process's css_set can be changed @@ -1622,6 +1622,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, bad_fork_cancel_cgroup: cgroup_cancel_fork(p, cgrp_ss_priv); bad_fork_free_pid: + threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_io: @@ -1652,7 +1653,6 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: #endif - threadgroup_change_end(current); delayacct_tsk_free(p); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); From ba52437821164deacf9af90d559f2e3f4888ff28 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 14 Jul 2016 11:38:40 -0400 Subject: [PATCH 302/607] BACKPORT: tcp: enable per-socket rate limiting of all 'challenge acks' (cherry picked from commit 083ae308280d13d187512b9babe3454342a7987e) The per-socket rate limit for 'challenge acks' was introduced in the context of limiting ack loops: commit f2b2c582e824 ("tcp: mitigate ACK loops for connections as tcp_sock") And I think it can be extended to rate limit all 'challenge acks' on a per-socket basis. Since we have the global tcp_challenge_ack_limit, this patch allows for tcp_challenge_ack_limit to be set to a large value and effectively rely on the per-socket limit, or set tcp_challenge_ack_limit to a lower value and still prevents a single connections from consuming the entire challenge ack quota. It further moves in the direction of eliminating the global limit at some point, as Eric Dumazet has suggested. This a follow-up to: Subject: tcp: make challenge acks less predictable Cc: Eric Dumazet Cc: David S. Miller Cc: Neal Cardwell Cc: Yuchung Cheng Cc: Yue Cao Signed-off-by: Jason Baron Signed-off-by: David S. Miller Change-Id: I622d5ae96e9387e775a0196c892d8d0e1a5564a7 Signed-off-by: Amit Pundir --- net/ipv4/tcp_input.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2014a701663a..2bf1110aa2ae 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3391,6 +3391,23 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 return flag; } +static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, + u32 *last_oow_ack_time) +{ + if (*last_oow_ack_time) { + s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); + + if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { + NET_INC_STATS_BH(net, mib_idx); + return true; /* rate-limited: don't send yet! */ + } + } + + *last_oow_ack_time = tcp_time_stamp; + + return false; /* not rate-limited: go ahead, send dupack now! */ +} + /* Return true if we're currently rate-limiting out-of-window ACKs and * thus shouldn't send a dupack right now. We rate-limit dupacks in * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS @@ -3404,21 +3421,9 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, /* Data packets without SYNs are not likely part of an ACK loop. */ if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) && !tcp_hdr(skb)->syn) - goto not_rate_limited; + return false; - if (*last_oow_ack_time) { - s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); - - if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { - NET_INC_STATS_BH(net, mib_idx); - return true; /* rate-limited: don't send yet! */ - } - } - - *last_oow_ack_time = tcp_time_stamp; - -not_rate_limited: - return false; /* not rate-limited: go ahead, send dupack now! 
*/ + return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time); } /* RFC 5961 7 [ACK Throttling] */ @@ -3431,9 +3436,9 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) u32 count, now; /* First check our per-socket dupack rate limit. */ - if (tcp_oow_rate_limited(sock_net(sk), skb, - LINUX_MIB_TCPACKSKIPPEDCHALLENGE, - &tp->last_oow_ack_time)) + if (__tcp_oow_rate_limited(sock_net(sk), + LINUX_MIB_TCPACKSKIPPEDCHALLENGE, + &tp->last_oow_ack_time)) return; /* Then check host-wide RFC 5961 rate limit. */ From 258a8f519a47a2b3c0a42940182715fc0de690e7 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 29 Aug 2016 17:31:10 -0700 Subject: [PATCH 303/607] UPSTREAM: Input: powermate - fix oops with malicious USB descriptors The powermate driver expects at least one valid USB endpoint in its probe function. If given malicious descriptors that specify 0 for the number of endpoints, it will crash. Validate the number of endpoints on the interface before using them. The full report for this issue can be found here: http://seclists.org/bugtraq/2016/Mar/85 BUG: 28242610 Reported-by: Ralf Spenneberg Signed-off-by: Josh Boyer Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman Signed-off-by: Badhri Jagan Sridharan Change-Id: I1cb956a35f3bba73324240d5bd0a029f49d3c456 --- drivers/input/misc/powermate.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/input/misc/powermate.c b/drivers/input/misc/powermate.c index 63b539d3daba..84909a12ff36 100644 --- a/drivers/input/misc/powermate.c +++ b/drivers/input/misc/powermate.c @@ -307,6 +307,9 @@ static int powermate_probe(struct usb_interface *intf, const struct usb_device_i int error = -ENOMEM; interface = intf->cur_altsetting; + if (interface->desc.bNumEndpoints < 1) + return -EINVAL; + endpoint = &interface->endpoint[0].desc; if (!usb_endpoint_is_int_in(endpoint)) return -EIO; From 125d90ca2fd862ecb119a085d95f813a6263d218 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 29 Aug 2016 17:33:52 -0700 Subject: [PATCH 304/607] UPSTREAM: USB: cypress_m8: add endpoint sanity check commit c55aee1bf0e6b6feec8b2927b43f7a09a6d5f754 upstream. An attack using missing endpoints exists. 
CVE-2016-3137 BUG: 28242610 Signed-off-by: Oliver Neukum Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman Signed-off-by: Badhri Jagan Sridharan Change-Id: I1cc7957a5924175d24f12fdc41162ece67c907e5 --- drivers/usb/serial/cypress_m8.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/usb/serial/cypress_m8.c b/drivers/usb/serial/cypress_m8.c index 01bf53392819..244acb1299a9 100644 --- a/drivers/usb/serial/cypress_m8.c +++ b/drivers/usb/serial/cypress_m8.c @@ -447,6 +447,11 @@ static int cypress_generic_port_probe(struct usb_serial_port *port) struct usb_serial *serial = port->serial; struct cypress_private *priv; + if (!port->interrupt_out_urb || !port->interrupt_in_urb) { + dev_err(&port->dev, "required endpoint is missing\n"); + return -ENODEV; + } + priv = kzalloc(sizeof(struct cypress_private), GFP_KERNEL); if (!priv) return -ENOMEM; @@ -606,12 +611,6 @@ static int cypress_open(struct tty_struct *tty, struct usb_serial_port *port) cypress_set_termios(tty, port, &priv->tmp_termios); /* setup the port and start reading from the device */ - if (!port->interrupt_in_urb) { - dev_err(&port->dev, "%s - interrupt_in_urb is empty!\n", - __func__); - return -1; - } - usb_fill_int_urb(port->interrupt_in_urb, serial->dev, usb_rcvintpipe(serial->dev, port->interrupt_in_endpointAddress), port->interrupt_in_urb->transfer_buffer, From ee5c7a9739c6ddd0d54385c313b5d6f87652cd63 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 30 Aug 2016 13:33:55 -0700 Subject: [PATCH 305/607] UPSTREAM: USB: mct_u232: add sanity checking in probe commit 4e9a0b05257f29cf4b75f3209243ed71614d062e upstream. An attack using the lack of sanity checking in probe is known. This patch checks for the existence of a second port. CVE-2016-3136 BUG: 28242610 Signed-off-by: Oliver Neukum [johan: add error message ] Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman Signed-off-by: Badhri Jagan Sridharan Change-Id: I284ad648c2087c34a098d67e0cc6d948a568413c --- drivers/usb/serial/mct_u232.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/usb/serial/mct_u232.c b/drivers/usb/serial/mct_u232.c index fd707d6a10e2..89726f702202 100644 --- a/drivers/usb/serial/mct_u232.c +++ b/drivers/usb/serial/mct_u232.c @@ -376,14 +376,21 @@ static void mct_u232_msr_to_state(struct usb_serial_port *port, static int mct_u232_port_probe(struct usb_serial_port *port) { + struct usb_serial *serial = port->serial; struct mct_u232_private *priv; + /* check first to simplify error handling */ + if (!serial->port[1] || !serial->port[1]->interrupt_in_urb) { + dev_err(&port->dev, "expected endpoint missing\n"); + return -ENODEV; + } + priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; /* Use second interrupt-in endpoint for reading. */ - priv->read_urb = port->serial->port[1]->interrupt_in_urb; + priv->read_urb = serial->port[1]->interrupt_in_urb; priv->read_urb->context = port; spin_lock_init(&priv->lock); From ae6790121ceb4608e1a1dec10e285c8d661463d5 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 30 Aug 2016 13:35:32 -0700 Subject: [PATCH 306/607] UPSTREAM: USB: usb_driver_claim_interface: add sanity checking commit 0b818e3956fc1ad976bee791eadcbb3b5fec5bfd upstream. Attacks that trick drivers into passing a NULL pointer to usb_driver_claim_interface() using forged descriptors are known. This thwarts them by sanity checking. 
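For illustration only (a hypothetical driver, not code from this series): the typical way a NULL pointer reaches usb_driver_claim_interface() is a probe routine looking up a sibling interface by number, which usb_ifnum_to_if() fails to find because the forged descriptors lie about how many interfaces exist:

#include <linux/usb.h>

static struct usb_driver example_driver;        /* hypothetical */

static int example_probe(struct usb_interface *intf,
                         const struct usb_device_id *id)
{
        struct usb_device *udev = interface_to_usbdev(intf);
        /* NULL if the (possibly forged) device has no interface 1 */
        struct usb_interface *sibling = usb_ifnum_to_if(udev, 1);

        /* with the added check, a NULL sibling now fails with -ENODEV
         * instead of being dereferenced by the USB core
         */
        return usb_driver_claim_interface(&example_driver, sibling, NULL);
}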
BUG: 28242610 Signed-off-by: Oliver Neukum Signed-off-by: Greg Kroah-Hartman Signed-off-by: Badhri Jagan Sridharan Change-Id: Ib43ec5edb156985a9db941785a313f6801df092a --- drivers/usb/core/driver.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c index 56593a9a8726..2057d91d8336 100644 --- a/drivers/usb/core/driver.c +++ b/drivers/usb/core/driver.c @@ -502,11 +502,15 @@ static int usb_unbind_interface(struct device *dev) int usb_driver_claim_interface(struct usb_driver *driver, struct usb_interface *iface, void *priv) { - struct device *dev = &iface->dev; + struct device *dev; struct usb_device *udev; int retval = 0; int lpm_disable_error; + if (!iface) + return -ENODEV; + + dev = &iface->dev; if (dev->driver) return -EBUSY; From b24e99b3ae1e3860d374d0ed497a371100533e83 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 30 Aug 2016 13:37:07 -0700 Subject: [PATCH 307/607] UPSTREAM: USB: iowarrior: fix oops with malicious USB descriptors commit 4ec0ef3a82125efc36173062a50624550a900ae0 upstream. The iowarrior driver expects at least one valid endpoint. If given malicious descriptors that specify 0 for the number of endpoints, it will crash in the probe function. Ensure there is at least one endpoint on the interface before using it. The full report of this issue can be found here: http://seclists.org/bugtraq/2016/Mar/87 BUG: 28242610 Reported-by: Ralf Spenneberg Signed-off-by: Josh Boyer Signed-off-by: Greg Kroah-Hartman Signed-off-by: Badhri Jagan Sridharan Change-Id: If5161c23928e9ef77cb3359cba9b36622b1908df --- drivers/usb/misc/iowarrior.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index c6bfd13f6c92..1950e87b4219 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -787,6 +787,12 @@ static int iowarrior_probe(struct usb_interface *interface, iface_desc = interface->cur_altsetting; dev->product_id = le16_to_cpu(udev->descriptor.idProduct); + if (iface_desc->desc.bNumEndpoints < 1) { + dev_err(&interface->dev, "Invalid number of endpoints\n"); + retval = -EINVAL; + goto error; + } + /* set up the endpoint information */ for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { endpoint = &iface_desc->endpoint[i].desc; From 6087158f06e3cc66897cb4571cb673ac9fffb12d Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 30 Aug 2016 13:39:02 -0700 Subject: [PATCH 308/607] UPSTREAM: USB: cdc-acm: more sanity checking commit 8835ba4a39cf53f705417b3b3a94eb067673f2c9 upstream. An attack has become available which pretends to be a quirky device circumventing normal sanity checks and crashes the kernel by an insufficient number of interfaces. This patch adds a check to the code path for quirky devices. 
BUG: 28242610 Signed-off-by: Oliver Neukum Signed-off-by: Greg Kroah-Hartman Signed-off-by: Badhri Jagan Sridharan Change-Id: I9a5f7f3c704b65e866335054f470451fcfae9d1c --- drivers/usb/class/cdc-acm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 26ca4f910cb0..a7732f80a912 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -1113,6 +1113,9 @@ static int acm_probe(struct usb_interface *intf, if (quirks == NO_UNION_NORMAL) { data_interface = usb_ifnum_to_if(usb_dev, 1); control_interface = usb_ifnum_to_if(usb_dev, 0); + /* we would crash */ + if (!data_interface || !control_interface) + return -ENODEV; goto skip_normal_probe; } From 4547c0f93518197834f7fc11c6558b566b7e14c8 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 31 Aug 2016 09:52:16 -0700 Subject: [PATCH 309/607] ANDROID: rcu_sync: Export rcu_sync_lockdep_assert x86_64:allmodconfig fails to build with the following error. ERROR: "rcu_sync_lockdep_assert" [kernel/locking/locktorture.ko] undefined! Introduced by commit 3228c5eb7af2 ("RFC: FROMLIST: locking/percpu-rwsem: Optimize readers and reduce global impact"). The applied upstream version exports the missing symbol, so let's do the same. Change-Id: If4e516715c3415fe8c82090f287174857561550d Fixes: 3228c5eb7af2 ("RFC: FROMLIST: locking/percpu-rwsem: Optimize ...") Signed-off-by: Guenter Roeck --- kernel/rcu/sync.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index e358313a0d6c..b49cf3ac2d47 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -68,6 +68,7 @@ void rcu_sync_lockdep_assert(struct rcu_sync *rsp) RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), "suspicious rcu_sync_is_idle() usage"); } +EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert); #endif /** From a49dcf2e745c25a67a76d8a1e9658c16827c9885 Mon Sep 17 00:00:00 2001 From: Yongqin Liu Date: Thu, 1 Sep 2016 22:06:04 +0530 Subject: [PATCH 310/607] ANDROID: base-cfg: enable SECCOMP config Enable following seccomp configs CONFIG_SECCOMP=y CONFIG_SECCOMP_FILTER=y Otherwise we will get mediacode error like this on Android N: E /system/bin/mediaextractor: libminijail: prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER): Invalid argument Change-Id: I2477b6a2cfdded5c0ebf6ffbb6150b0e5fe2ba12 Signed-off-by: Yongqin Liu Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 6496bb3961a2..288bab2d858d 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -141,6 +141,8 @@ CONFIG_PROFILING=y CONFIG_QUOTA=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y +CONFIG_SECCOMP=y +CONFIG_SECCOMP_FILTER=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y From af4104512cdb48d5eede620871d6a964a8680253 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 6 Apr 2016 14:06:48 +0100 Subject: [PATCH 311/607] UPSTREAM: assoc_array: don't call compare_object() on a node (cherry picked from commit 8d4a2ec1e0b41b0cf9a0c5cd4511da7f8e4f3de2) Changes since V1: fixed the description and added KASan warning. In assoc_array_insert_into_terminal_node(), we call the compare_object() method on all non-empty slots, even when they're not leaves, passing a pointer to an unexpected structure to compare_object(). Currently it causes an out-of-bound read access in keyring_compare_object detected by KASan (see below). 
The issue is easily reproduced with keyutils testsuite. Only call compare_object() when the slot is a leave. KASan warning: ================================================================== BUG: KASAN: slab-out-of-bounds in keyring_compare_object+0x213/0x240 at addr ffff880060a6f838 Read of size 8 by task keyctl/1655 ============================================================================= BUG kmalloc-192 (Not tainted): kasan: bad access detected ----------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Allocated in assoc_array_insert+0xfd0/0x3a60 age=69 cpu=1 pid=1647 ___slab_alloc+0x563/0x5c0 __slab_alloc+0x51/0x90 kmem_cache_alloc_trace+0x263/0x300 assoc_array_insert+0xfd0/0x3a60 __key_link_begin+0xfc/0x270 key_create_or_update+0x459/0xaf0 SyS_add_key+0x1ba/0x350 entry_SYSCALL_64_fastpath+0x12/0x76 INFO: Slab 0xffffea0001829b80 objects=16 used=8 fp=0xffff880060a6f550 flags=0x3fff8000004080 INFO: Object 0xffff880060a6f740 @offset=5952 fp=0xffff880060a6e5d1 Bytes b4 ffff880060a6f730: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f740: d1 e5 a6 60 00 88 ff ff 0e 00 00 00 00 00 00 00 ...`............ Object ffff880060a6f750: 02 cf 8e 60 00 88 ff ff 02 c0 8e 60 00 88 ff ff ...`.......`.... Object ffff880060a6f760: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f770: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f780: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f790: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7a0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7b0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7c0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7d0: 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7e0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7f0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ CPU: 0 PID: 1655 Comm: keyctl Tainted: G B 4.5.0-rc4-kasan+ #291 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 0000000000000000 000000001b2800b4 ffff880060a179e0 ffffffff81b60491 ffff88006c802900 ffff880060a6f740 ffff880060a17a10 ffffffff815e2969 ffff88006c802900 ffffea0001829b80 ffff880060a6f740 ffff880060a6e650 Call Trace: [] dump_stack+0x85/0xc4 [] print_trailer+0xf9/0x150 [] object_err+0x34/0x40 [] kasan_report_error+0x230/0x550 [] ? keyring_get_key_chunk+0x13e/0x210 [] __asan_report_load_n_noabort+0x5d/0x70 [] ? keyring_compare_object+0x213/0x240 [] keyring_compare_object+0x213/0x240 [] assoc_array_insert+0x86c/0x3a60 [] ? assoc_array_cancel_edit+0x70/0x70 [] ? __key_link_begin+0x20d/0x270 [] __key_link_begin+0xfc/0x270 [] key_create_or_update+0x459/0xaf0 [] ? trace_hardirqs_on+0xd/0x10 [] ? key_type_lookup+0xc0/0xc0 [] ? lookup_user_key+0x13d/0xcd0 [] ? memdup_user+0x53/0x80 [] SyS_add_key+0x1ba/0x350 [] ? key_get_type_from_user.constprop.6+0xa0/0xa0 [] ? retint_user+0x18/0x23 [] ? trace_hardirqs_on_caller+0x3fe/0x580 [] ? 
trace_hardirqs_on_thunk+0x17/0x19 [] entry_SYSCALL_64_fastpath+0x12/0x76 Memory state around the buggy address: ffff880060a6f700: fc fc fc fc fc fc fc fc 00 00 00 00 00 00 00 00 ffff880060a6f780: 00 00 00 00 00 00 00 00 00 00 00 fc fc fc fc fc >ffff880060a6f800: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ^ ffff880060a6f880: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff880060a6f900: fc fc fc fc fc fc 00 00 00 00 00 00 00 00 00 00 ================================================================== Signed-off-by: Jerome Marchand Signed-off-by: David Howells cc: stable@vger.kernel.org Change-Id: I903935a221a5b9fb14cec14ef64bd2b6fa8eb222 Bug: 30513364 --- lib/assoc_array.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/assoc_array.c b/lib/assoc_array.c index 03dd576e6773..59fd7c0b119c 100644 --- a/lib/assoc_array.c +++ b/lib/assoc_array.c @@ -524,7 +524,9 @@ static bool assoc_array_insert_into_terminal_node(struct assoc_array_edit *edit, free_slot = i; continue; } - if (ops->compare_object(assoc_array_ptr_to_leaf(ptr), index_key)) { + if (assoc_array_ptr_is_leaf(ptr) && + ops->compare_object(assoc_array_ptr_to_leaf(ptr), + index_key)) { pr_devel("replace in slot %d\n", i); edit->leaf_p = &node->slots[i]; edit->dead_leaf = node->slots[i]; From 687549af9e5df34f8071c32aced7efae0b46fb58 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 29 Jul 2016 10:40:31 +0200 Subject: [PATCH 312/607] UPSTREAM: block: fix use-after-free in seq file (cherry picked from commit 77da160530dd1dc94f6ae15a981f24e5f0021e84) I got a KASAN report of use-after-free: ================================================================== BUG: KASAN: use-after-free in klist_iter_exit+0x61/0x70 at addr ffff8800b6581508 Read of size 8 by task trinity-c1/315 ============================================================================= BUG kmalloc-32 (Not tainted): kasan: bad access detected ----------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Allocated in disk_seqf_start+0x66/0x110 age=144 cpu=1 pid=315 ___slab_alloc+0x4f1/0x520 __slab_alloc.isra.58+0x56/0x80 kmem_cache_alloc_trace+0x260/0x2a0 disk_seqf_start+0x66/0x110 traverse+0x176/0x860 seq_read+0x7e3/0x11a0 proc_reg_read+0xbc/0x180 do_loop_readv_writev+0x134/0x210 do_readv_writev+0x565/0x660 vfs_readv+0x67/0xa0 do_preadv+0x126/0x170 SyS_preadv+0xc/0x10 do_syscall_64+0x1a1/0x460 return_from_SYSCALL_64+0x0/0x6a INFO: Freed in disk_seqf_stop+0x42/0x50 age=160 cpu=1 pid=315 __slab_free+0x17a/0x2c0 kfree+0x20a/0x220 disk_seqf_stop+0x42/0x50 traverse+0x3b5/0x860 seq_read+0x7e3/0x11a0 proc_reg_read+0xbc/0x180 do_loop_readv_writev+0x134/0x210 do_readv_writev+0x565/0x660 vfs_readv+0x67/0xa0 do_preadv+0x126/0x170 SyS_preadv+0xc/0x10 do_syscall_64+0x1a1/0x460 return_from_SYSCALL_64+0x0/0x6a CPU: 1 PID: 315 Comm: trinity-c1 Tainted: G B 4.7.0+ #62 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 ffffea0002d96000 ffff880119b9f918 ffffffff81d6ce81 ffff88011a804480 ffff8800b6581500 ffff880119b9f948 ffffffff8146c7bd ffff88011a804480 ffffea0002d96000 ffff8800b6581500 fffffffffffffff4 ffff880119b9f970 Call Trace: [] dump_stack+0x65/0x84 [] print_trailer+0x10d/0x1a0 [] object_err+0x2f/0x40 [] kasan_report_error+0x221/0x520 [] __asan_report_load8_noabort+0x3e/0x40 [] klist_iter_exit+0x61/0x70 [] class_dev_iter_exit+0x9/0x10 [] disk_seqf_stop+0x3a/0x50 [] seq_read+0x4b2/0x11a0 [] proc_reg_read+0xbc/0x180 [] do_loop_readv_writev+0x134/0x210 
[] do_readv_writev+0x565/0x660 [] vfs_readv+0x67/0xa0 [] do_preadv+0x126/0x170 [] SyS_preadv+0xc/0x10 This problem can occur in the following situation: open() - pread() - .seq_start() - iter = kmalloc() // succeeds - seqf->private = iter - .seq_stop() - kfree(seqf->private) - pread() - .seq_start() - iter = kmalloc() // fails - .seq_stop() - class_dev_iter_exit(seqf->private) // boom! old pointer As the comment in disk_seqf_stop() says, stop is called even if start failed, so we need to reinitialise the private pointer to NULL when seq iteration stops. An alternative would be to set the private pointer to NULL when the kmalloc() in disk_seqf_start() fails. Cc: stable@vger.kernel.org Signed-off-by: Vegard Nossum Acked-by: Tejun Heo Signed-off-by: Jens Axboe Change-Id: I07b33f4b38341f60a37806cdd45b0a0c3ab4d84d Bug: 30942273 --- block/genhd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/genhd.c b/block/genhd.c index 817c67c9e45e..82bc52cad1c1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -831,6 +831,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v) if (iter) { class_dev_iter_exit(iter); kfree(iter); + seqf->private = NULL; } } From c7a407d41ed5d2c6aa0c1892e1d9ebee58d5a928 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 28 Jan 2016 09:22:44 -0200 Subject: [PATCH 313/607] UPSTREAM: [media] xc2028: avoid use after free (cherry picked from commit 8dfbcc4351a0b6d2f2d77f367552f48ffefafe18) If struct xc2028_config is passed without a firmware name, the following trouble may happen: [11009.907205] xc2028 5-0061: type set to XCeive xc2028/xc3028 tuner [11009.907491] ================================================================== [11009.907750] BUG: KASAN: use-after-free in strcmp+0x96/0xb0 at addr ffff8803bd78ab40 [11009.907992] Read of size 1 by task modprobe/28992 [11009.907994] ============================================================================= [11009.907997] BUG kmalloc-16 (Tainted: G W ): kasan: bad access detected [11009.907999] ----------------------------------------------------------------------------- [11009.908008] INFO: Allocated in xhci_urb_enqueue+0x214/0x14c0 [xhci_hcd] age=0 cpu=3 pid=28992 [11009.908012] ___slab_alloc+0x581/0x5b0 [11009.908014] __slab_alloc+0x51/0x90 [11009.908017] __kmalloc+0x27b/0x350 [11009.908022] xhci_urb_enqueue+0x214/0x14c0 [xhci_hcd] [11009.908026] usb_hcd_submit_urb+0x1e8/0x1c60 [11009.908029] usb_submit_urb+0xb0e/0x1200 [11009.908032] usb_serial_generic_write_start+0xb6/0x4c0 [11009.908035] usb_serial_generic_write+0x92/0xc0 [11009.908039] usb_console_write+0x38a/0x560 [11009.908045] call_console_drivers.constprop.14+0x1ee/0x2c0 [11009.908051] console_unlock+0x40d/0x900 [11009.908056] vprintk_emit+0x4b4/0x830 [11009.908061] vprintk_default+0x1f/0x30 [11009.908064] printk+0x99/0xb5 [11009.908067] kasan_report_error+0x10a/0x550 [11009.908070] __asan_report_load1_noabort+0x43/0x50 [11009.908074] INFO: Freed in xc2028_set_config+0x90/0x630 [tuner_xc2028] age=1 cpu=3 pid=28992 [11009.908077] __slab_free+0x2ec/0x460 [11009.908080] kfree+0x266/0x280 [11009.908083] xc2028_set_config+0x90/0x630 [tuner_xc2028] [11009.908086] xc2028_attach+0x310/0x8a0 [tuner_xc2028] [11009.908090] em28xx_attach_xc3028.constprop.7+0x1f9/0x30d [em28xx_dvb] [11009.908094] em28xx_dvb_init.part.3+0x8e4/0x5cf4 [em28xx_dvb] [11009.908098] em28xx_dvb_init+0x81/0x8a [em28xx_dvb] [11009.908101] em28xx_register_extension+0xd9/0x190 [em28xx] [11009.908105] em28xx_dvb_register+0x10/0x1000 [em28xx_dvb] [11009.908108] 
do_one_initcall+0x141/0x300 [11009.908111] do_init_module+0x1d0/0x5ad [11009.908114] load_module+0x6666/0x9ba0 [11009.908117] SyS_finit_module+0x108/0x130 [11009.908120] entry_SYSCALL_64_fastpath+0x16/0x76 [11009.908123] INFO: Slab 0xffffea000ef5e280 objects=25 used=25 fp=0x (null) flags=0x2ffff8000004080 [11009.908126] INFO: Object 0xffff8803bd78ab40 @offset=2880 fp=0x0000000000000001 [11009.908130] Bytes b4 ffff8803bd78ab30: 01 00 00 00 2a 07 00 00 9d 28 00 00 01 00 00 00 ....*....(...... [11009.908133] Object ffff8803bd78ab40: 01 00 00 00 00 00 00 00 b0 1d c3 6a 00 88 ff ff ...........j.... [11009.908137] CPU: 3 PID: 28992 Comm: modprobe Tainted: G B W 4.5.0-rc1+ #43 [11009.908140] Hardware name: /NUC5i7RYB, BIOS RYBDWi35.86A.0350.2015.0812.1722 08/12/2015 [11009.908142] ffff8803bd78a000 ffff8802c273f1b8 ffffffff81932007 ffff8803c6407a80 [11009.908148] ffff8802c273f1e8 ffffffff81556759 ffff8803c6407a80 ffffea000ef5e280 [11009.908153] ffff8803bd78ab40 dffffc0000000000 ffff8802c273f210 ffffffff8155ccb4 [11009.908158] Call Trace: [11009.908162] [] dump_stack+0x4b/0x64 [11009.908165] [] print_trailer+0xf9/0x150 [11009.908168] [] object_err+0x34/0x40 [11009.908171] [] kasan_report_error+0x230/0x550 [11009.908175] [] ? trace_hardirqs_off_caller+0x21/0x290 [11009.908179] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908182] [] __asan_report_load1_noabort+0x43/0x50 [11009.908185] [] ? __asan_register_globals+0x50/0xa0 [11009.908189] [] ? strcmp+0x96/0xb0 [11009.908192] [] strcmp+0x96/0xb0 [11009.908196] [] xc2028_set_config+0x15c/0x630 [tuner_xc2028] [11009.908200] [] xc2028_attach+0x310/0x8a0 [tuner_xc2028] [11009.908203] [] ? memset+0x28/0x30 [11009.908206] [] ? xc2028_set_config+0x630/0x630 [tuner_xc2028] [11009.908211] [] em28xx_attach_xc3028.constprop.7+0x1f9/0x30d [em28xx_dvb] [11009.908215] [] ? em28xx_dvb_init.part.3+0x37c/0x5cf4 [em28xx_dvb] [11009.908219] [] ? hauppauge_hvr930c_init+0x487/0x487 [em28xx_dvb] [11009.908222] [] ? lgdt330x_attach+0x1cc/0x370 [lgdt330x] [11009.908226] [] ? i2c_read_demod_bytes.isra.2+0x210/0x210 [lgdt330x] [11009.908230] [] ? ref_module.part.15+0x10/0x10 [11009.908233] [] ? module_assert_mutex_or_preempt+0x80/0x80 [11009.908238] [] em28xx_dvb_init.part.3+0x8e4/0x5cf4 [em28xx_dvb] [11009.908242] [] ? em28xx_attach_xc3028.constprop.7+0x30d/0x30d [em28xx_dvb] [11009.908245] [] ? string+0x14d/0x1f0 [11009.908249] [] ? symbol_string+0xff/0x1a0 [11009.908253] [] ? uuid_string+0x6f0/0x6f0 [11009.908257] [] ? __kernel_text_address+0x7e/0xa0 [11009.908260] [] ? print_context_stack+0x7f/0xf0 [11009.908264] [] ? __module_address+0xb6/0x360 [11009.908268] [] ? is_ftrace_trampoline+0x99/0xe0 [11009.908271] [] ? __kernel_text_address+0x7e/0xa0 [11009.908275] [] ? debug_check_no_locks_freed+0x290/0x290 [11009.908278] [] ? dump_trace+0x11b/0x300 [11009.908282] [] ? em28xx_register_extension+0x23/0x190 [em28xx] [11009.908285] [] ? trace_hardirqs_off_caller+0x21/0x290 [11009.908289] [] ? trace_hardirqs_on_caller+0x16/0x590 [11009.908292] [] ? trace_hardirqs_on+0xd/0x10 [11009.908296] [] ? em28xx_register_extension+0x23/0x190 [em28xx] [11009.908299] [] ? mutex_trylock+0x400/0x400 [11009.908302] [] ? do_one_initcall+0x131/0x300 [11009.908306] [] ? call_rcu_sched+0x17/0x20 [11009.908309] [] ? put_object+0x48/0x70 [11009.908314] [] em28xx_dvb_init+0x81/0x8a [em28xx_dvb] [11009.908317] [] em28xx_register_extension+0xd9/0x190 [em28xx] [11009.908320] [] ? 
0xffffffffa0150000 [11009.908324] [] em28xx_dvb_register+0x10/0x1000 [em28xx_dvb] [11009.908327] [] do_one_initcall+0x141/0x300 [11009.908330] [] ? try_to_run_init_process+0x40/0x40 [11009.908333] [] ? trace_hardirqs_on_caller+0x16/0x590 [11009.908337] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908340] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908343] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908346] [] ? __asan_register_globals+0x87/0xa0 [11009.908350] [] do_init_module+0x1d0/0x5ad [11009.908353] [] load_module+0x6666/0x9ba0 [11009.908356] [] ? symbol_put_addr+0x50/0x50 [11009.908361] [] ? em28xx_dvb_init.part.3+0x5989/0x5cf4 [em28xx_dvb] [11009.908366] [] ? module_frob_arch_sections+0x20/0x20 [11009.908369] [] ? open_exec+0x50/0x50 [11009.908374] [] ? ns_capable+0x5b/0xd0 [11009.908377] [] SyS_finit_module+0x108/0x130 [11009.908379] [] ? SyS_init_module+0x1f0/0x1f0 [11009.908383] [] ? lockdep_sys_exit_thunk+0x12/0x14 [11009.908394] [] entry_SYSCALL_64_fastpath+0x16/0x76 [11009.908396] Memory state around the buggy address: [11009.908398] ffff8803bd78aa00: 00 00 fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908401] ffff8803bd78aa80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908403] >ffff8803bd78ab00: fc fc fc fc fc fc fc fc 00 00 fc fc fc fc fc fc [11009.908405] ^ [11009.908407] ffff8803bd78ab80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908409] ffff8803bd78ac00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908411] ================================================================== In order to avoid it, let's set the cached value of the firmware name to NULL after freeing it. While here, return an error if the memory allocation fails. Signed-off-by: Mauro Carvalho Chehab Change-Id: I945c841dcfb45de2056267e4aa50bbe176b527cf Bug: 30946097 --- drivers/media/tuners/tuner-xc2028.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/tuners/tuner-xc2028.c b/drivers/media/tuners/tuner-xc2028.c index 4e941f00b600..082ff5608455 100644 --- a/drivers/media/tuners/tuner-xc2028.c +++ b/drivers/media/tuners/tuner-xc2028.c @@ -1403,11 +1403,12 @@ static int xc2028_set_config(struct dvb_frontend *fe, void *priv_cfg) * in order to avoid troubles during device release. */ kfree(priv->ctrl.fname); + priv->ctrl.fname = NULL; memcpy(&priv->ctrl, p, sizeof(priv->ctrl)); if (p->fname) { priv->ctrl.fname = kstrdup(p->fname, GFP_KERNEL); if (priv->ctrl.fname == NULL) - rc = -ENOMEM; + return -ENOMEM; } /* From 30439e51fc6f9ab169f7cb9e8e88faf7fb27ae99 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 3 Feb 2016 13:34:00 -0200 Subject: [PATCH 314/607] UPSTREAM: [media] xc2028: unlock on error in xc2028_set_config() (cherry picked from commit 210bd104c6acd31c3c6b8b075b3f12d4a9f6b60d) We have to unlock before returning -ENOMEM. 
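A minimal sketch of the locking rule this fix restores (function and field names below are hypothetical, not the actual tuner-xc2028 code): any error path taken while the mutex is held has to go through the unlock before returning.

    static int set_fw_name(struct priv *priv, const char *src)
    {
            int rc = 0;

            mutex_lock(&priv->lock);
            priv->fname = kstrdup(src, GFP_KERNEL);
            if (!priv->fname) {
                    rc = -ENOMEM;
                    goto unlock;    /* never return with priv->lock still held */
            }
            /* ... further set-up done under the lock ... */
    unlock:
            mutex_unlock(&priv->lock);
            return rc;
    }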
Fixes: 8dfbcc4351a0 ('[media] xc2028: avoid use after free') Signed-off-by: Dan Carpenter Signed-off-by: Mauro Carvalho Chehab Change-Id: I7b6ba9fde5c6e29467e6de23d398af2fe56e2547 Bug: 30946097 --- drivers/media/tuners/tuner-xc2028.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/media/tuners/tuner-xc2028.c b/drivers/media/tuners/tuner-xc2028.c index 082ff5608455..317ef63ee789 100644 --- a/drivers/media/tuners/tuner-xc2028.c +++ b/drivers/media/tuners/tuner-xc2028.c @@ -1407,8 +1407,10 @@ static int xc2028_set_config(struct dvb_frontend *fe, void *priv_cfg) memcpy(&priv->ctrl, p, sizeof(priv->ctrl)); if (p->fname) { priv->ctrl.fname = kstrdup(p->fname, GFP_KERNEL); - if (priv->ctrl.fname == NULL) - return -ENOMEM; + if (priv->ctrl.fname == NULL) { + rc = -ENOMEM; + goto unlock; + } } /* @@ -1440,6 +1442,7 @@ static int xc2028_set_config(struct dvb_frontend *fe, void *priv_cfg) } else priv->state = XC2028_WAITING_FIRMWARE; } +unlock: mutex_unlock(&priv->lock); return rc; From a4e59955acf0ae25d805e0a0d251ac8f4b1808e7 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 5 May 2016 16:22:26 -0700 Subject: [PATCH 315/607] UPSTREAM: proc: prevent accessing /proc//environ until it's ready (cherry picked from commit 8148a73c9901a8794a50f950083c00ccf97d43b3) If /proc//environ gets read before the envp[] array is fully set up in create_{aout,elf,elf_fdpic,flat}_tables(), we might end up trying to read more bytes than are actually written, as env_start will already be set but env_end will still be zero, making the range calculation underflow, allowing to read beyond the end of what has been written. Fix this as it is done for /proc//cmdline by testing env_end for zero. It is, apparently, intentionally set last in create_*_tables(). This bug was found by the PaX size_overflow plugin that detected the arithmetic underflow of 'this_len = env_end - (env_start + src)' when env_end is still zero. The expected consequence is that userland trying to access /proc//environ of a not yet fully set up process may get inconsistent data as we're in the middle of copying in the environment variables. Fixes: https://forums.grsecurity.net/viewtopic.php?f=3&t=4363 Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=116461 Signed-off-by: Mathias Krause Cc: Emese Revfy Cc: Pax Team Cc: Al Viro Cc: Mateusz Guzik Cc: Alexey Dobriyan Cc: Cyrill Gorcunov Cc: Jarod Wilson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Change-Id: Ia2f58d48c15478ed4b6e237b63e704c70ff21e96 Bug: 30951939 --- fs/proc/base.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 49348349e156..df715a095328 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -954,7 +954,8 @@ static ssize_t environ_read(struct file *file, char __user *buf, int ret = 0; struct mm_struct *mm = file->private_data; - if (!mm) + /* Ensure the process spawned far enough to have an environment. */ + if (!mm || !mm->env_end) return 0; page = (char *)__get_free_page(GFP_TEMPORARY); From 8e975e71f5123f4d2567a98f01641f7c02f8d2a2 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Fri, 2 Sep 2016 10:13:21 +0530 Subject: [PATCH 316/607] ANDROID: base-cfg: drop SECCOMP_FILTER config Don't need to set SECCOMP_FILTER explicitly since CONFIG_SECCOMP=y will select that config anyway. 
Fixes: a49dcf2e745c ("ANDROID: base-cfg: enable SECCOMP config") Change-Id: Iff18ed4d2db5a55b9f9480d5ecbeef7b818b3837 Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 288bab2d858d..34fa64a669ac 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -142,7 +142,6 @@ CONFIG_QUOTA=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y CONFIG_SECCOMP=y -CONFIG_SECCOMP_FILTER=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y From d25e3081baa6f22564c1d87b7650c07c8c0b12b4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Aug 2016 05:56:26 -0700 Subject: [PATCH 317/607] UPSTREAM: tcp: fix use after free in tcp_xmit_retransmit_queue() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry picked from commit bb1fceca22492109be12640d49f5ea5a544c6bb4) When tcp_sendmsg() allocates a fresh and empty skb, it puts it at the tail of the write queue using tcp_add_write_queue_tail() Then it attempts to copy user data into this fresh skb. If the copy fails, we undo the work and remove the fresh skb. Unfortunately, this undo lacks the change done to tp->highest_sack and we can leave a dangling pointer (to a freed skb) Later, tcp_xmit_retransmit_queue() can dereference this pointer and access freed memory. For regular kernels where memory is not unmapped, this might cause SACK bugs because tcp_highest_sack_seq() is buggy, returning garbage instead of tp->snd_nxt, but with various debug features like CONFIG_DEBUG_PAGEALLOC, this can crash the kernel. This bug was found by Marco Grassi thanks to syzkaller. Fixes: 6859d49475d4 ("[TCP]: Abstract tp->highest_sack accessing & point to next skb") Reported-by: Marco Grassi Signed-off-by: Eric Dumazet Cc: Ilpo Järvinen Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Reviewed-by: Cong Wang Signed-off-by: David S. 
Miller Change-Id: I58bb02d6e4e399612e8580b9e02d11e661df82f5 Bug: 31183296 --- include/net/tcp.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/net/tcp.h b/include/net/tcp.h index bcce99a951f8..30d1194ff1c5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1513,6 +1513,8 @@ static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unli { if (sk->sk_send_head == skb_unlinked) sk->sk_send_head = NULL; + if (tcp_sk(sk)->highest_sack == skb_unlinked) + tcp_sk(sk)->highest_sack = NULL; } static inline void tcp_init_send_head(struct sock *sk) From 87518a45ae570ada6a1e404331a6cc910014e1a9 Mon Sep 17 00:00:00 2001 From: Mohamad Ayyash Date: Wed, 11 May 2016 13:18:35 -0700 Subject: [PATCH 318/607] BACKPORT: Don't show empty tag stats for unprivileged uids BUG: 27577101 BUG: 27532522 Signed-off-by: Mohamad Ayyash --- net/netfilter/xt_qtaguid.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index e2e7d54f9bb1..3bf0c59dab2f 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1946,7 +1946,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) ); f_count = atomic_long_read( &sock_tag_entry->socket->file->f_count); - seq_printf(m, "sock=%p tag=0x%llx (uid=%u) pid=%u " + seq_printf(m, "sock=%pK tag=0x%llx (uid=%u) pid=%u " "f_count=%lu\n", sock_tag_entry->sk, sock_tag_entry->tag, uid, @@ -2548,8 +2548,7 @@ static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, uid_t stat_uid = get_uid_from_tag(tag); struct proc_print_info *ppi = m->private; /* Detailed tags are not available to everybody */ - if (get_atag_from_tag(tag) && !can_read_other_uid_stats( - make_kuid(&init_user_ns,stat_uid))) { + if (!can_read_other_uid_stats(make_kuid(&init_user_ns,stat_uid))) { CT_DEBUG("qtaguid: stats line: " "%s 0x%llx %u: insufficient priv " "from pid=%u tgid=%u uid=%u stats.gid=%u\n", From 8ac959cc06c8715cdc64aa88993b3a8b1ef61eaa Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 17 Dec 2015 09:57:27 -0800 Subject: [PATCH 319/607] UPSTREAM: Add 'unsafe' user access functions for batched accesses The naming is meant to discourage random use: the helper functions are not really any more "unsafe" than the traditional double-underscore functions (which need the address range checking), but they do need even more infrastructure around them, and should not be used willy-nilly. In addition to checking the access range, these user access functions require that you wrap the user access with a "user_acess_{begin,end}()" around it. That allows architectures that implement kernel user access control (x86: SMAP, arm64: PAN) to do the user access control in the wrapping user_access_begin/end part, and then batch up the actual user space accesses using the new interfaces. The main (and hopefully only) use for these are for core generic access helpers, initially just the generic user string functions (strnlen_user() and strncpy_from_user()). 
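A rough caller-side sketch of the contract as introduced here (the caller below is invented for illustration; note that at this point unsafe_get_user() still reports failure through its return value — a later patch in this series switches it to an error label):

    long read_user_pair(const unsigned long __user *uptr,
                        unsigned long *a, unsigned long *b)
    {
            long ret = -EFAULT;

            if (!access_ok(VERIFY_READ, uptr, 2 * sizeof(*uptr)))
                    return ret;             /* range checking is still the caller's job */

            user_access_begin();            /* e.g. a single STAC on x86/SMAP */
            if (unsafe_get_user(*a, &uptr[0]))
                    goto out;
            if (unsafe_get_user(*b, &uptr[1]))
                    goto out;
            ret = 0;
    out:
            user_access_end();              /* a single CLAC */
            return ret;
    }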
Signed-off-by: Linus Torvalds Change-Id: Ic64efea41f97171bdbdabe3e531489aebd9b6fac (cherry picked from commit 5b24a7a2aa2040c8c50c3b71122901d01661ff78) Signed-off-by: Sami Tolvanen --- arch/x86/include/asm/uaccess.h | 25 +++++++++++++++++++++++++ include/linux/uaccess.h | 7 +++++++ 2 files changed, 32 insertions(+) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 09b1b0ab94b7..6c4c39b137e5 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -745,5 +745,30 @@ copy_to_user(void __user *to, const void *from, unsigned long n) #undef __copy_from_user_overflow #undef __copy_to_user_overflow +/* + * The "unsafe" user accesses aren't really "unsafe", but the naming + * is a big fat warning: you have to not only do the access_ok() + * checking before using them, but you have to surround them with the + * user_access_begin/end() pair. + */ +#define user_access_begin() __uaccess_begin() +#define user_access_end() __uaccess_end() + +#define unsafe_put_user(x, ptr) \ +({ \ + int __pu_err; \ + __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \ + __builtin_expect(__pu_err, 0); \ +}) + +#define unsafe_get_user(x, ptr) \ +({ \ + int __gu_err; \ + unsigned long __gu_val; \ + __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \ + (x) = (__force __typeof__(*(ptr)))__gu_val; \ + __builtin_expect(__gu_err, 0); \ +}) + #endif /* _ASM_X86_UACCESS_H */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 558129af828a..349557825428 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -111,4 +111,11 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); #define probe_kernel_address(addr, retval) \ probe_kernel_read(&retval, addr, sizeof(retval)) +#ifndef user_access_begin +#define user_access_begin() do { } while (0) +#define user_access_end() do { } while (0) +#define unsafe_get_user(x, ptr) __get_user(x, ptr) +#define unsafe_put_user(x, ptr) __put_user(x, ptr) +#endif + #endif /* __LINUX_UACCESS_H__ */ From c1932b10476e4d60f314bc6111101d1d101599b3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 17 Dec 2015 10:05:19 -0800 Subject: [PATCH 320/607] UPSTREAM: Use the new batched user accesses in generic user string handling This converts the generic user string functions to use the batched user access functions. It makes a big difference on Skylake, which is the first x86 microarchitecture to implement SMAP. The STAC/CLAC instructions are not very fast, and doing them for each access inside the loop that copies strings from user space (which is what the pathname handling does for every pathname the kernel uses, for example) is very inefficient. 
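Schematically, the conversion just hoists the begin/end pair out of the per-access loop; a simplified sketch (not the exact lib/strn*_user.c code, assumes the caller already did access_ok() and uses the return-value form of unsafe_get_user() that exists at this point in the series):

    static long sum_user_bytes(const unsigned char __user *src, unsigned long n)
    {
            unsigned long i;
            long sum = 0;
            unsigned char c;

            user_access_begin();            /* one STAC for the whole loop ... */
            for (i = 0; i < n; i++) {
                    if (unlikely(unsafe_get_user(c, src + i))) {
                            sum = -EFAULT;
                            break;
                    }
                    sum += c;
            }
            user_access_end();              /* ... and one CLAC, instead of one pair per byte */
            return sum;
    }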
Signed-off-by: Linus Torvalds Change-Id: Ic39a686b4bb1ad9cd16ad0887bb669beed6fe8aa (cherry picked from commit 9fd4470ff4974c41b1db43c3b355b9085af9c12a) Signed-off-by: Sami Tolvanen --- lib/strncpy_from_user.c | 11 ++++++++--- lib/strnlen_user.c | 18 ++++++++++++++---- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index e0af6ff73d14..33840324138c 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -39,7 +39,7 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long unsigned long c, data; /* Fall back to byte-at-a-time if we get a page fault */ - if (unlikely(__get_user(c,(unsigned long __user *)(src+res)))) + if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) break; *(unsigned long *)(dst+res) = c; if (has_zero(c, &data, &constants)) { @@ -55,7 +55,7 @@ byte_at_a_time: while (max) { char c; - if (unlikely(__get_user(c,src+res))) + if (unlikely(unsafe_get_user(c,src+res))) return -EFAULT; dst[res] = c; if (!c) @@ -107,7 +107,12 @@ long strncpy_from_user(char *dst, const char __user *src, long count) src_addr = (unsigned long)src; if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; - return do_strncpy_from_user(dst, src, count, max); + long retval; + + user_access_begin(); + retval = do_strncpy_from_user(dst, src, count, max); + user_access_end(); + return retval; } return -EFAULT; } diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 3a5f2b366d84..2625943625d7 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -45,7 +45,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, src -= align; max += align; - if (unlikely(__get_user(c,(unsigned long __user *)src))) + if (unlikely(unsafe_get_user(c,(unsigned long __user *)src))) return 0; c |= aligned_byte_mask(align); @@ -61,7 +61,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, if (unlikely(max <= sizeof(unsigned long))) break; max -= sizeof(unsigned long); - if (unlikely(__get_user(c,(unsigned long __user *)(src+res)))) + if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) return 0; } res -= align; @@ -112,7 +112,12 @@ long strnlen_user(const char __user *str, long count) src_addr = (unsigned long)str; if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; - return do_strnlen_user(str, count, max); + long retval; + + user_access_begin(); + retval = do_strnlen_user(str, count, max); + user_access_end(); + return retval; } return 0; } @@ -141,7 +146,12 @@ long strlen_user(const char __user *str) src_addr = (unsigned long)str; if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; - return do_strnlen_user(str, ~0ul, max); + long retval; + + user_access_begin(); + retval = do_strnlen_user(str, ~0ul, max); + user_access_end(); + return retval; } return 0; } From fed752db02277e34e92262daac24ad4216e79c56 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 21:28:47 +0100 Subject: [PATCH 321/607] BACKPORT: ARM: 8583/1: mm: fix location of _etext The _etext position is defined to be the end of the kernel text code, and should not include any part of the data segments. This interferes with things that might check memory ranges and expect executable code up to _etext. Just to be conservative, leave the kernel resource as it was, using __init_begin instead of _etext as the end mark. 
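The boundary matters because range checks of the following shape are common and assume that everything in [_text, _etext) is executable code (illustrative sketch, not a function added by this patch):

    static bool addr_in_kernel_text(unsigned long addr)
    {
            return addr >= (unsigned long)_text &&
                   addr <  (unsigned long)_etext;
    }

With _etext placed after the read-only data, such a check would treat non-executable data as text; moving _etext to the true end of .text restores the expected meaning.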
Signed-off-by: Kees Cook Signed-off-by: Russell King Change-Id: Ida514d1359dbe6f782f562ce29b4ba09ae72bfc0 (cherry picked from commit 14c4a533e0996f95a0a64dfd0b6252d788cebc74) Signed-off-by: Sami Tolvanen --- arch/arm/kernel/setup.c | 2 +- arch/arm/kernel/vmlinux.lds.S | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 20edd349d379..bf63b4693457 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -772,7 +772,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) struct resource *res; kernel_code.start = virt_to_phys(_text); - kernel_code.end = virt_to_phys(_etext - 1); + kernel_code.end = virt_to_phys(__init_begin - 1); kernel_data.start = virt_to_phys(_sdata); kernel_data.end = virt_to_phys(_end - 1); diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 8b60fde5ce48..be2ab6d3b91f 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -120,6 +120,8 @@ SECTIONS #ifdef CONFIG_DEBUG_RODATA . = ALIGN(1< Date: Thu, 23 Jun 2016 15:53:17 +0200 Subject: [PATCH 322/607] BACKPORT: arm64: mm: fix location of _etext As Kees Cook notes in the ARM counterpart of this patch [0]: The _etext position is defined to be the end of the kernel text code, and should not include any part of the data segments. This interferes with things that might check memory ranges and expect executable code up to _etext. In particular, Kees is referring to the HARDENED_USERCOPY patch set [1], which rejects attempts to call copy_to_user() on kernel ranges containing executable code, but does allow access to the .rodata segment. Regardless of whether one may or may not agree with the distinction, it makes sense for _etext to have the same meaning across architectures. So let's put _etext where it belongs, between .text and .rodata, and fix up existing references to use __init_begin instead, which unlike _end_rodata includes the exception and notes sections as well. The _etext references in kaslr.c are left untouched, since its references to [_stext, _etext) are meant to capture potential jump instruction targets, and so disregarding .rodata is actually an improvement here. 
[0] http://article.gmane.org/gmane.linux.kernel/2245084 [1] http://thread.gmane.org/gmane.linux.kernel.hardened.devel/2502 Reported-by: Kees Cook Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Reviewed-by: Kees Cook Signed-off-by: Catalin Marinas Change-Id: I8f6582525217b9ca324f6a382ea52d30ce1d0dbd (cherry picked from commit 9fdc14c55cd6579d619ccd9d40982e0805e62b6d) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/setup.c | 2 +- arch/arm64/kernel/vmlinux.lds.S | 3 ++- arch/arm64/mm/mmu.c | 3 +-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 8119479147db..7f0e7226795e 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -201,7 +201,7 @@ static void __init request_standard_resources(void) struct resource *res; kernel_code.start = virt_to_phys(_text); - kernel_code.end = virt_to_phys(_etext - 1); + kernel_code.end = virt_to_phys(__init_begin - 1); kernel_data.start = virt_to_phys(_sdata); kernel_data.end = virt_to_phys(_end - 1); diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 71426a78db12..f472809a7b45 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -114,11 +114,12 @@ SECTIONS } ALIGN_DEBUG_RO + _etext = .; /* End of text section */ + RO_DATA(PAGE_SIZE) EXCEPTION_TABLE(8) NOTES ALIGN_DEBUG_RO - _etext = .; /* End of text and rodata section */ ALIGN_DEBUG_RO_MIN(PAGE_SIZE) __init_begin = .; diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 873e363048c6..9a8dc4a79d8b 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -337,7 +337,6 @@ static void __init __map_memblock(phys_addr_t start, phys_addr_t end) end - kernel_x_end, PAGE_KERNEL); } - } #else static void __init __map_memblock(phys_addr_t start, phys_addr_t end) @@ -425,7 +424,7 @@ static void __init fixup_executable(void) void mark_rodata_ro(void) { create_mapping_late(__pa(_stext), (unsigned long)_stext, - (unsigned long)_etext - (unsigned long)_stext, + (unsigned long)__init_begin - (unsigned long)_stext, PAGE_KERNEL_ROX); } From 5f1b3400f0ccc2368dc444b5d9a7c93be2e700da Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 8 Aug 2016 13:02:01 -0700 Subject: [PATCH 323/607] UPSTREAM: unsafe_[get|put]_user: change interface to use a error target label When I initially added the unsafe_[get|put]_user() helpers in commit 5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses"), I made the mistake of modeling the interface on our traditional __[get|put]_user() functions, which return zero on success, or -EFAULT on failure. That interface is fairly easy to use, but it's actually fairly nasty for good code generation, since it essentially forces the caller to check the error value for each access. In particular, since the error handling is already internally implemented with an exception handler, and we already use "asm goto" for various other things, we could fairly easily make the error cases just jump directly to an error label instead, and avoid the need for explicit checking after each operation. So switch the interface to pass in an error label, rather than checking the error value in the caller. Best do it now before we start growing more users (the signal handling code in particular would be a good place to use the new interface). So rather than if (unsafe_get_user(x, ptr)) ... handle error .. 
the interface is now unsafe_get_user(x, ptr, label); where an error during the user mode fetch will now just cause a jump to 'label' in the caller. Right now the actual _implementation_ of this all still ends up being a "if (err) goto label", and does not take advantage of any exception label tricks, but for "unsafe_put_user()" in particular it should be fairly straightforward to convert to using the exception table model. Note that "unsafe_get_user()" is much harder to convert to a clever exception table model, because current versions of gcc do not allow the use of "asm goto" (for the exception) with output values (for the actual value to be fetched). But that is hopefully not a limitation in the long term. [ Also note that it might be a good idea to switch unsafe_get_user() to actually _return_ the value it fetches from user space, but this commit only changes the error handling semantics ] Signed-off-by: Linus Torvalds Change-Id: Ib905a84a04d46984320f6fd1056da4d72f3d6b53 (cherry picked from commit 1bd4403d86a1c06cb6cc9ac87664a0c9d3413d51) Signed-off-by: Sami Tolvanen --- arch/x86/include/asm/uaccess.h | 16 ++++++++-------- include/linux/uaccess.h | 4 ++-- lib/strncpy_from_user.c | 8 ++++---- lib/strnlen_user.c | 7 +++---- 4 files changed, 17 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 6c4c39b137e5..5abdd0221fb4 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -754,21 +754,21 @@ copy_to_user(void __user *to, const void *from, unsigned long n) #define user_access_begin() __uaccess_begin() #define user_access_end() __uaccess_end() -#define unsafe_put_user(x, ptr) \ -({ \ +#define unsafe_put_user(x, ptr, err_label) \ +do { \ int __pu_err; \ __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \ - __builtin_expect(__pu_err, 0); \ -}) + if (unlikely(__pu_err)) goto err_label; \ +} while (0) -#define unsafe_get_user(x, ptr) \ -({ \ +#define unsafe_get_user(x, ptr, err_label) \ +do { \ int __gu_err; \ unsigned long __gu_val; \ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - __builtin_expect(__gu_err, 0); \ -}) + if (unlikely(__gu_err)) goto err_label; \ +} while (0) #endif /* _ASM_X86_UACCESS_H */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 349557825428..f30c187ed785 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -114,8 +114,8 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); #ifndef user_access_begin #define user_access_begin() do { } while (0) #define user_access_end() do { } while (0) -#define unsafe_get_user(x, ptr) __get_user(x, ptr) -#define unsafe_put_user(x, ptr) __put_user(x, ptr) +#define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0) +#define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0) #endif #endif /* __LINUX_UACCESS_H__ */ diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 33840324138c..5a003a2ebd96 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -39,8 +39,8 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long unsigned long c, data; /* Fall back to byte-at-a-time if we get a page fault */ - if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) - break; + unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time); + *(unsigned long 
*)(dst+res) = c; if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); @@ -55,8 +55,7 @@ byte_at_a_time: while (max) { char c; - if (unlikely(unsafe_get_user(c,src+res))) - return -EFAULT; + unsafe_get_user(c,src+res, efault); dst[res] = c; if (!c) return res; @@ -75,6 +74,7 @@ byte_at_a_time: * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's an EFAULT. */ +efault: return -EFAULT; } diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 2625943625d7..8e105ed4df12 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -45,8 +45,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, src -= align; max += align; - if (unlikely(unsafe_get_user(c,(unsigned long __user *)src))) - return 0; + unsafe_get_user(c, (unsigned long __user *)src, efault); c |= aligned_byte_mask(align); for (;;) { @@ -61,8 +60,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, if (unlikely(max <= sizeof(unsigned long))) break; max -= sizeof(unsigned long); - if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) - return 0; + unsafe_get_user(c, (unsigned long __user *)(src+res), efault); } res -= align; @@ -77,6 +75,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's 0. */ +efault: return 0; } From 1e701cdc5bbce1f5561ee25ff8709a18dc0b2282 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 19 Jul 2016 15:00:04 -0700 Subject: [PATCH 324/607] UPSTREAM: mm: Add is_migrate_cma_page Code such as hardened user copy[1] needs a way to tell if a page is CMA or not. Add is_migrate_cma_page in a similar way to is_migrate_isolate_page. [1]http://article.gmane.org/gmane.linux.kernel.mm/155238 Signed-off-by: Laura Abbott Signed-off-by: Kees Cook Change-Id: I1f9aa13d8d063038fa70b93282a836648fbb4f6d (cherry picked from commit 7c15d9bb8231f998ae7dc0b72415f5215459f7fb) Signed-off-by: Sami Tolvanen --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e23a9e704536..bab4053fb795 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -65,8 +65,10 @@ enum { #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) +# define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) #else # define is_migrate_cma(migratetype) false +# define is_migrate_cma_page(_page) false #endif #define for_each_migratetype_order(order, type) \ From c30d7340ee0b167a45cd8d6d5c48add8f62db9a5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Jul 2016 16:19:48 -0700 Subject: [PATCH 325/607] BACKPORT: mm: Implement stack frame object validation This creates per-architecture function arch_within_stack_frames() that should validate if a given object is contained by a kernel stack frame. Initial implementation is on x86. This is based on code from PaX. 
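The walk is easier to see in a rough userspace sketch (all names invented; only meaningful when built with frame pointers, e.g. gcc -O0 or -fno-omit-frame-pointer on x86-64, and __builtin_frame_address(1) is itself best-effort):

    #include <stdio.h>

    /* Is [obj, obj+len) wholly inside the caller's frame?  This mirrors the
     * "allow copies only within here" window between two saved frame pointers. */
    static __attribute__((noinline)) int in_callers_frame(const void *obj,
                                                          unsigned long len)
    {
            const char *frame  = __builtin_frame_address(0); /* our saved bp    */
            const char *caller = __builtin_frame_address(1); /* caller's bp     */

            return (const char *)obj >= frame + 2 * sizeof(void *) &&
                   (const char *)obj + len <= caller;
    }

    int main(void)
    {
            char buf[32];

            printf("buf inside main()'s frame: %d\n",
                   in_callers_frame(buf, sizeof(buf)));
            return 0;
    }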
Signed-off-by: Kees Cook Change-Id: I1f3b299bb8991d65dcdac6af85d633d4b7776df1 (cherry picked from commit 0f60a8efe4005ab5e65ce000724b04d4ca04a199) Signed-off-by: Sami Tolvanen --- arch/Kconfig | 9 ++++++ arch/x86/Kconfig | 1 + arch/x86/include/asm/thread_info.h | 44 ++++++++++++++++++++++++++++++ include/linux/thread_info.h | 9 ++++++ 4 files changed, 63 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index 31a318a56d98..98f64ad1caf1 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -423,6 +423,15 @@ config CC_STACKPROTECTOR_STRONG endchoice +config HAVE_ARCH_WITHIN_STACK_FRAMES + bool + help + An architecture should select this if it can walk the kernel stack + frames to determine if an object is part of either the arguments + or local variables (i.e. that it excludes saved return addresses, + and similar) by implementing an inline arch_within_stack_frames(), + which is used by CONFIG_HARDENED_USERCOPY. + config HAVE_CONTEXT_TRACKING bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ae3c83e8bde8..4786de7750b1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -88,6 +88,7 @@ config X86 select HAVE_ARCH_SOFT_DIRTY if X86_64 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_BPF_JIT if X86_64 select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index c7b551028740..0c977fc124a7 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -177,6 +177,50 @@ static inline unsigned long current_stack_pointer(void) return sp; } +/* + * Walks up the stack frames to make sure that the specified object is + * entirely contained by a single stack frame. + * + * Returns: + * 1 if within a frame + * -1 if placed across a frame boundary (or outside stack) + * 0 unable to determine (no frame pointers, etc) + */ +static inline int arch_within_stack_frames(const void * const stack, + const void * const stackend, + const void *obj, unsigned long len) +{ +#if defined(CONFIG_FRAME_POINTER) + const void *frame = NULL; + const void *oldframe; + + oldframe = __builtin_frame_address(1); + if (oldframe) + frame = __builtin_frame_address(2); + /* + * low ----------------------------------------------> high + * [saved bp][saved ip][args][local vars][saved bp][saved ip] + * ^----------------^ + * allow copies only within here + */ + while (stack <= frame && frame < stackend) { + /* + * If obj + len extends past the last frame, this + * check won't pass and the next frame will be 0, + * causing us to bail out and correctly report + * the copy as invalid. + */ + if (obj + len <= frame) + return obj >= oldframe + 2 * sizeof(void *) ? 
1 : -1; + oldframe = frame; + frame = *(const void * const *)frame; + } + return -1; +#else + return 0; +#endif +} + #else /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_64 diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index ff307b548ed3..5ecb68e86968 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -145,6 +145,15 @@ static inline bool test_and_clear_restore_sigmask(void) #error "no set_restore_sigmask() provided and default one won't work" #endif +#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES +static inline int arch_within_stack_frames(const void * const stack, + const void * const stackend, + const void *obj, unsigned long len) +{ + return 0; +} +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */ From d677b3104d3c8e927211f20083a6c2e97e003b87 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 7 Jun 2016 11:05:33 -0700 Subject: [PATCH 326/607] BACKPORT: mm: Hardened usercopy This is the start of porting PAX_USERCOPY into the mainline kernel. This is the first set of features, controlled by CONFIG_HARDENED_USERCOPY. The work is based on code by PaX Team and Brad Spengler, and an earlier port from Casey Schaufler. Additional non-slab page tests are from Rik van Riel. This patch contains the logic for validating several conditions when performing copy_to_user() and copy_from_user() on the kernel object being copied to/from: - address range doesn't wrap around - address range isn't NULL or zero-allocated (with a non-zero copy size) - if on the slab allocator: - object size must be less than or equal to copy size (when check is implemented in the allocator, which appear in subsequent patches) - otherwise, object must not span page allocations (excepting Reserved and CMA ranges) - if on the stack - object must not extend before/after the current process stack - object must be contained by a valid stack frame (when there is arch/build support for identifying stack frames) - object must not overlap with kernel text Signed-off-by: Kees Cook Tested-by: Valdis Kletnieks Tested-by: Michael Ellerman Change-Id: Iff3b5f1ddb04acd99ccf9a9046c7797363962b2a (cherry picked from commit f5509cc18daa7f82bcc553be70df2117c8eedc16) Signed-off-by: Sami Tolvanen --- include/linux/slab.h | 12 ++ include/linux/thread_info.h | 15 ++ mm/Makefile | 4 + mm/usercopy.c | 269 ++++++++++++++++++++++++++++++++++++ security/Kconfig | 28 ++++ 5 files changed, 328 insertions(+) create mode 100644 mm/usercopy.c diff --git a/include/linux/slab.h b/include/linux/slab.h index 2037a861e367..4ef384b172e0 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -144,6 +144,18 @@ void kfree(const void *); void kzfree(const void *); size_t ksize(const void *); +#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR +const char *__check_heap_object(const void *ptr, unsigned long n, + struct page *page); +#else +static inline const char *__check_heap_object(const void *ptr, + unsigned long n, + struct page *page) +{ + return NULL; +} +#endif + /* * Some archs want to perform DMA into kmalloc caches and need a guaranteed * alignment larger than the alignment of a 64-bit integer. 
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 5ecb68e86968..0ae29ff9ccfd 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -154,6 +154,21 @@ static inline int arch_within_stack_frames(const void * const stack, } #endif +#ifdef CONFIG_HARDENED_USERCOPY +extern void __check_object_size(const void *ptr, unsigned long n, + bool to_user); + +static inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) +{ + __check_object_size(ptr, n, to_user); +} +#else +static inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) +{ } +#endif /* CONFIG_HARDENED_USERCOPY */ + #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */ diff --git a/mm/Makefile b/mm/Makefile index 2ed43191fc3b..8b532c94008f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,6 +5,9 @@ KASAN_SANITIZE_slab_common.o := n KASAN_SANITIZE_slub.o := n +# Since __builtin_frame_address does work as used, disable the warning. +CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address) + mmu-y := nommu.o mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ @@ -81,3 +84,4 @@ obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o +obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o diff --git a/mm/usercopy.c b/mm/usercopy.c new file mode 100644 index 000000000000..816feccebeb0 --- /dev/null +++ b/mm/usercopy.c @@ -0,0 +1,269 @@ +/* + * This implements the various checks for CONFIG_HARDENED_USERCOPY*, + * which are designed to protect kernel memory from needless exposure + * and overwrite under many unintended conditions. This code is based + * on PAX_USERCOPY, which is: + * + * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source + * Security Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include + +enum { + BAD_STACK = -1, + NOT_STACK = 0, + GOOD_FRAME, + GOOD_STACK, +}; + +/* + * Checks if a given pointer and length is contained by the current + * stack frame (if possible). + * + * Returns: + * NOT_STACK: not at all on the stack + * GOOD_FRAME: fully within a valid stack frame + * GOOD_STACK: fully on the stack (when can't do frame-checking) + * BAD_STACK: error condition (invalid stack position or bad stack frame) + */ +static noinline int check_stack_object(const void *obj, unsigned long len) +{ + const void * const stack = task_stack_page(current); + const void * const stackend = stack + THREAD_SIZE; + int ret; + + /* Object is not on the stack at all. */ + if (obj + len <= stack || stackend <= obj) + return NOT_STACK; + + /* + * Reject: object partially overlaps the stack (passing the + * the check above means at least one end is within the stack, + * so if this check fails, the other end is outside the stack). + */ + if (obj < stack || stackend < obj + len) + return BAD_STACK; + + /* Check if object is safely within a valid frame. 
*/ + ret = arch_within_stack_frames(stack, stackend, obj, len); + if (ret) + return ret; + + return GOOD_STACK; +} + +static void report_usercopy(const void *ptr, unsigned long len, + bool to_user, const char *type) +{ + pr_emerg("kernel memory %s attempt detected %s %p (%s) (%lu bytes)\n", + to_user ? "exposure" : "overwrite", + to_user ? "from" : "to", ptr, type ? : "unknown", len); + /* + * For greater effect, it would be nice to do do_group_exit(), + * but BUG() actually hooks all the lock-breaking and per-arch + * Oops code, so that is used here instead. + */ + BUG(); +} + +/* Returns true if any portion of [ptr,ptr+n) over laps with [low,high). */ +static bool overlaps(const void *ptr, unsigned long n, unsigned long low, + unsigned long high) +{ + unsigned long check_low = (uintptr_t)ptr; + unsigned long check_high = check_low + n; + + /* Does not overlap if entirely above or entirely below. */ + if (check_low >= high || check_high < low) + return false; + + return true; +} + +/* Is this address range in the kernel text area? */ +static inline const char *check_kernel_text_object(const void *ptr, + unsigned long n) +{ + unsigned long textlow = (unsigned long)_stext; + unsigned long texthigh = (unsigned long)_etext; + unsigned long textlow_linear, texthigh_linear; + + if (overlaps(ptr, n, textlow, texthigh)) + return ""; + + /* + * Some architectures have virtual memory mappings with a secondary + * mapping of the kernel text, i.e. there is more than one virtual + * kernel address that points to the kernel image. It is usually + * when there is a separate linear physical memory mapping, in that + * __pa() is not just the reverse of __va(). This can be detected + * and checked: + */ + textlow_linear = (unsigned long)__va(__pa(textlow)); + /* No different mapping: we're done. */ + if (textlow_linear == textlow) + return NULL; + + /* Check the secondary mapping... */ + texthigh_linear = (unsigned long)__va(__pa(texthigh)); + if (overlaps(ptr, n, textlow_linear, texthigh_linear)) + return ""; + + return NULL; +} + +static inline const char *check_bogus_address(const void *ptr, unsigned long n) +{ + /* Reject if object wraps past end of memory. */ + if (ptr + n < ptr) + return ""; + + /* Reject if NULL or ZERO-allocation. */ + if (ZERO_OR_NULL_PTR(ptr)) + return ""; + + return NULL; +} + +static inline const char *check_heap_object(const void *ptr, unsigned long n, + bool to_user) +{ + struct page *page, *endpage; + const void *end = ptr + n - 1; + bool is_reserved, is_cma; + + /* + * Some architectures (arm64) return true for virt_addr_valid() on + * vmalloced addresses. Work around this by checking for vmalloc + * first. + */ + if (is_vmalloc_addr(ptr)) + return NULL; + + if (!virt_addr_valid(ptr)) + return NULL; + + page = virt_to_head_page(ptr); + + /* Check slab allocator for flags and size. */ + if (PageSlab(page)) + return __check_heap_object(ptr, n, page); + + /* + * Sometimes the kernel data regions are not marked Reserved (see + * check below). And sometimes [_sdata,_edata) does not cover + * rodata and/or bss, so check each range explicitly. + */ + + /* Allow reads of kernel rodata region (if not marked as Reserved). */ + if (ptr >= (const void *)__start_rodata && + end <= (const void *)__end_rodata) { + if (!to_user) + return ""; + return NULL; + } + + /* Allow kernel data region (if not marked as Reserved). */ + if (ptr >= (const void *)_sdata && end <= (const void *)_edata) + return NULL; + + /* Allow kernel bss region (if not marked as Reserved). 
*/ + if (ptr >= (const void *)__bss_start && + end <= (const void *)__bss_stop) + return NULL; + + /* Is the object wholly within one base page? */ + if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) == + ((unsigned long)end & (unsigned long)PAGE_MASK))) + return NULL; + + /* Allow if start and end are inside the same compound page. */ + endpage = virt_to_head_page(end); + if (likely(endpage == page)) + return NULL; + + /* + * Reject if range is entirely either Reserved (i.e. special or + * device memory), or CMA. Otherwise, reject since the object spans + * several independently allocated pages. + */ + is_reserved = PageReserved(page); + is_cma = is_migrate_cma_page(page); + if (!is_reserved && !is_cma) + goto reject; + + for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) { + page = virt_to_head_page(ptr); + if (is_reserved && !PageReserved(page)) + goto reject; + if (is_cma && !is_migrate_cma_page(page)) + goto reject; + } + + return NULL; + +reject: + return ""; +} + +/* + * Validates that the given object is: + * - not bogus address + * - known-safe heap or stack object + * - not in kernel text + */ +void __check_object_size(const void *ptr, unsigned long n, bool to_user) +{ + const char *err; + + /* Skip all tests if size is zero. */ + if (!n) + return; + + /* Check for invalid addresses. */ + err = check_bogus_address(ptr, n); + if (err) + goto report; + + /* Check for bad heap object. */ + err = check_heap_object(ptr, n, to_user); + if (err) + goto report; + + /* Check for bad stack object. */ + switch (check_stack_object(ptr, n)) { + case NOT_STACK: + /* Object is not touching the current process stack. */ + break; + case GOOD_FRAME: + case GOOD_STACK: + /* + * Object is either in the correct frame (when it + * is possible to check) or just generally on the + * process stack (when frame checking not available). + */ + return; + default: + err = ""; + goto report; + } + + /* Check for object in kernel to avoid text exposure. */ + err = check_kernel_text_object(ptr, n); + if (!err) + return; + +report: + report_usercopy(ptr, n, to_user, err); +} +EXPORT_SYMBOL(__check_object_size); diff --git a/security/Kconfig b/security/Kconfig index 30a2603e8c85..aba6a8d4d1f4 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -127,6 +127,34 @@ config LSM_MMAP_MIN_ADDR this low address space will need the permission specific to the systems running LSM. +config HAVE_HARDENED_USERCOPY_ALLOCATOR + bool + help + The heap allocator implements __check_heap_object() for + validating memory ranges against heap object sizes in + support of CONFIG_HARDENED_USERCOPY. + +config HAVE_ARCH_HARDENED_USERCOPY + bool + help + The architecture supports CONFIG_HARDENED_USERCOPY by + calling check_object_size() just before performing the + userspace copies in the low level implementation of + copy_to_user() and copy_from_user(). + +config HARDENED_USERCOPY + bool "Harden memory copies between kernel and userspace" + depends on HAVE_ARCH_HARDENED_USERCOPY + select BUG + help + This option checks for obviously wrong memory regions when + copying memory to/from the kernel (via copy_to_user() and + copy_from_user() functions) by rejecting memory ranges that + are larger than the specified heap object, span multiple + separately allocates pages, are not on the process stack, + or are part of the kernel text. This kills entire classes + of heap overflow exploits and similar kernel memory exposures. 
+ source security/selinux/Kconfig source security/smack/Kconfig source security/tomoyo/Kconfig From 9c61fcc2b39760263436b13b97071ae6e8eeb5f2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:04:01 -0700 Subject: [PATCH 327/607] BACKPORT: x86/uaccess: Enable hardened usercopy Enables CONFIG_HARDENED_USERCOPY checks on x86. This is done both in copy_*_user() and __copy_*_user() because copy_*_user() actually calls down to _copy_*_user() and not __copy_*_user(). Based on code from PaX and grsecurity. Signed-off-by: Kees Cook Tested-by: Valdis Kletnieks Change-Id: I260db1d4572bdd2f779200aca99d03a170658440 (cherry picked from commit 5b710f34e194c6b7710f69fdb5d798fdf35b98c1) Signed-off-by: Sami Tolvanen --- arch/x86/Kconfig | 1 + arch/x86/include/asm/uaccess.h | 10 ++++++---- arch/x86/include/asm/uaccess_32.h | 2 ++ arch/x86/include/asm/uaccess_64.h | 2 ++ 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4786de7750b1..9e3b1e57499a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -79,6 +79,7 @@ config X86 select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_AOUT if X86_32 select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 5abdd0221fb4..32a047ce0806 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -714,9 +714,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n) * case, and do only runtime checking for non-constant sizes. */ - if (likely(sz < 0 || sz >= n)) + if (likely(sz < 0 || sz >= n)) { + check_object_size(to, n, false); n = _copy_from_user(to, from, n); - else if(__builtin_constant_p(n)) + } else if (__builtin_constant_p(n)) copy_from_user_overflow(); else __copy_from_user_overflow(sz, n); @@ -732,9 +733,10 @@ copy_to_user(void __user *to, const void *from, unsigned long n) might_fault(); /* See the comment in copy_from_user() above. 
*/ - if (likely(sz < 0 || sz >= n)) + if (likely(sz < 0 || sz >= n)) { + check_object_size(from, n, true); n = _copy_to_user(to, from, n); - else if(__builtin_constant_p(n)) + } else if (__builtin_constant_p(n)) copy_to_user_overflow(); else __copy_to_user_overflow(sz, n); diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index f5dcb5204dcd..2ffc9188cc70 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -43,6 +43,7 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero static __always_inline unsigned long __must_check __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { + check_object_size(from, n, true); if (__builtin_constant_p(n)) { unsigned long ret; @@ -143,6 +144,7 @@ static __always_inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_fault(); + check_object_size(to, n, false); if (__builtin_constant_p(n)) { unsigned long ret; diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index f2f9b39b274a..512ffd621fc3 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -53,6 +53,7 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) { int ret = 0; + check_object_size(dst, size, false); if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { @@ -103,6 +104,7 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size) { int ret = 0; + check_object_size(src, size, true); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { From 073fcda55b357777cc3edaa2e67060cd8d78dbb3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:06:53 -0700 Subject: [PATCH 328/607] BACKPORT: ARM: uaccess: Enable hardened usercopy Enables CONFIG_HARDENED_USERCOPY checks on arm. Based on code from PaX and grsecurity. 
Signed-off-by: Kees Cook Change-Id: I03a44ca7a8c56832f15a6a74ac32e9330df3ac3b (cherry picked from commit dfd45b6103c973bfcea2341d89e36faf947dbc33) Signed-off-by: Sami Tolvanen --- arch/arm/Kconfig | 1 + arch/arm/include/asm/uaccess.h | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index d3159ff4de5b..586134e01928 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -33,6 +33,7 @@ config ARM select HARDIRQS_SW_RESEND select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT) select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 + select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 select HAVE_ARCH_MMAP_RND_BITS if MMU diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 35c9db857ebe..7fb59199c6bb 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -496,7 +496,10 @@ arm_copy_from_user(void *to, const void __user *from, unsigned long n); static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n) { - unsigned int __ua_flags = uaccess_save_and_enable(); + unsigned int __ua_flags; + + check_object_size(to, n, false); + __ua_flags = uaccess_save_and_enable(); n = arm_copy_from_user(to, from, n); uaccess_restore(__ua_flags); return n; @@ -511,11 +514,15 @@ static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) { #ifndef CONFIG_UACCESS_WITH_MEMCPY - unsigned int __ua_flags = uaccess_save_and_enable(); + unsigned int __ua_flags; + + check_object_size(from, n, true); + __ua_flags = uaccess_save_and_enable(); n = arm_copy_to_user(to, from, n); uaccess_restore(__ua_flags); return n; #else + check_object_size(from, n, true); return arm_copy_to_user(to, from, n); #endif } From 2f40fdd6bc0e1beca522657d3c9242e7b25917a2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:59:42 -0700 Subject: [PATCH 329/607] BACKPORT: arm64/uaccess: Enable hardened usercopy Enables CONFIG_HARDENED_USERCOPY checks on arm64. As done by KASAN in -next, renames the low-level functions to __arch_copy_*_user() so a static inline can do additional work before the copy. 
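For a sense of what these per-architecture hooks buy, here is a hypothetical over-read that check_object_size() is there to stop (illustrative only; assumes 's' points at a small kmalloc'd object, and relies on the slab-side checks added later in this series):

    struct session {
            char name[16];
            u32  id;
    };

    static int dump_session(void __user *ubuf, struct session *s)
    {
            /*
             * 's' backs a small slab object, but the length asks for a whole
             * page: with CONFIG_HARDENED_USERCOPY the copy BUGs instead of
             * leaking whatever follows the object in the slab.
             */
            if (copy_to_user(ubuf, s, PAGE_SIZE))
                    return -EFAULT;
            return 0;
    }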
Signed-off-by: Kees Cook Change-Id: I1286cae8e6ffcf12ea54ddd62f1a6d2ce742c8d0 (cherry picked from commit faf5b63e294151d6ac24ca6906d6f221bd3496cd) Signed-off-by: Sami Tolvanen --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/uaccess.h | 29 ++++++++++++++++++++++------- arch/arm64/kernel/arm64ksyms.c | 4 ++-- arch/arm64/lib/copy_from_user.S | 4 ++-- arch/arm64/lib/copy_to_user.S | 4 ++-- 5 files changed, 29 insertions(+), 13 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 579ab688312d..3ad9d9ea374b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -48,6 +48,7 @@ config ARM64 select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_BITREVERSE + select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48) select HAVE_ARCH_KGDB diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index b2ede967fe7d..2a44b9795532 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -247,24 +247,39 @@ do { \ -EFAULT; \ }) -extern unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n); -extern unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n); +extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n); +extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n); extern unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n); extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n); +static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n) +{ + check_object_size(to, n, false); + return __arch_copy_from_user(to, from, n); +} + +static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) +{ + check_object_size(from, n, true); + return __arch_copy_to_user(to, from, n); +} + static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { - if (access_ok(VERIFY_READ, from, n)) - n = __copy_from_user(to, from, n); - else /* security hole - plug it */ + if (access_ok(VERIFY_READ, from, n)) { + check_object_size(to, n, false); + n = __arch_copy_from_user(to, from, n); + } else /* security hole - plug it */ memset(to, 0, n); return n; } static inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { - if (access_ok(VERIFY_WRITE, to, n)) - n = __copy_to_user(to, from, n); + if (access_ok(VERIFY_WRITE, to, n)) { + check_object_size(from, n, true); + n = __arch_copy_to_user(to, from, n); + } return n; } diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index 3b6d8cc9dfe0..c654df05b7d7 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -33,8 +33,8 @@ EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); /* user mem (segment) */ -EXPORT_SYMBOL(__copy_from_user); -EXPORT_SYMBOL(__copy_to_user); +EXPORT_SYMBOL(__arch_copy_from_user); +EXPORT_SYMBOL(__arch_copy_to_user); EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(__copy_in_user); diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 4699cd74f87e..281e75db899a 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ 
-66,7 +66,7 @@ .endm end .req x5 -ENTRY(__copy_from_user) +ENTRY(__arch_copy_from_user) ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) add end, x0, x2 @@ -75,7 +75,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) mov x0, #0 // Nothing to copy ret -ENDPROC(__copy_from_user) +ENDPROC(__arch_copy_from_user) .section .fixup,"ax" .align 2 diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 7512bbbc07ac..db4d187de61f 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -65,7 +65,7 @@ .endm end .req x5 -ENTRY(__copy_to_user) +ENTRY(__arch_copy_to_user) ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) add end, x0, x2 @@ -74,7 +74,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) mov x0, #0 ret -ENDPROC(__copy_to_user) +ENDPROC(__arch_copy_to_user) .section .fixup,"ax" .align 2 From 103aa7df683e7811e88e82f52c2c3f16e419a7b1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:20:59 -0700 Subject: [PATCH 330/607] UPSTREAM: mm: SLAB hardened usercopy support Under CONFIG_HARDENED_USERCOPY, this adds object size checking to the SLAB allocator to catch any copies that may span objects. Based on code from PaX and grsecurity. Signed-off-by: Kees Cook Tested-by: Valdis Kletnieks Change-Id: Ib910a71fdc2ab808e1a45b6d33e9bae1681a1f4a (cherry picked from commit 04385fc5e8fffed84425d909a783c0f0c587d847) Signed-off-by: Sami Tolvanen --- init/Kconfig | 1 + mm/slab.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index 235c7a2c0d20..fa031a140397 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1719,6 +1719,7 @@ choice config SLAB bool "SLAB" + select HAVE_HARDENED_USERCOPY_ALLOCATOR help The regular slab allocator that is established and known to work well in all environments. It organizes cache hot objects in diff --git a/mm/slab.c b/mm/slab.c index 4765c97ce690..24a615d42d74 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4228,6 +4228,36 @@ static int __init slab_proc_init(void) module_init(slab_proc_init); #endif +#ifdef CONFIG_HARDENED_USERCOPY +/* + * Rejects objects that are incorrectly sized. + * + * Returns NULL if check passes, otherwise const char * to name of cache + * to indicate an error. + */ +const char *__check_heap_object(const void *ptr, unsigned long n, + struct page *page) +{ + struct kmem_cache *cachep; + unsigned int objnr; + unsigned long offset; + + /* Find and validate object. */ + cachep = page->slab_cache; + objnr = obj_to_index(cachep, page, (void *)ptr); + BUG_ON(objnr >= cachep->num); + + /* Find offset within object. */ + offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep); + + /* Allow address range falling entirely within object size. */ + if (offset <= cachep->object_size && n <= cachep->object_size - offset) + return NULL; + + return cachep->name; +} +#endif /* CONFIG_HARDENED_USERCOPY */ + /** * ksize - get the actual amount of memory allocated for a given object * @objp: Pointer to the object From 3a9c8260c66ce387754641134089ba6ea5793bd5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:24:05 -0700 Subject: [PATCH 331/607] UPSTREAM: mm: SLUB hardened usercopy support Under CONFIG_HARDENED_USERCOPY, this adds object size checking to the SLUB allocator to catch any copies that may span objects. Includes a redzone handling fix discovered by Michael Ellerman. 
Based on code from PaX and grsecurity. Signed-off-by: Kees Cook Tested-by: Michael Ellerman Reviwed-by: Laura Abbott Change-Id: I52dc6fb3a3492b937d52b5cf9c046bf03dc40a3a (cherry picked from commit ed18adc1cdd00a5c55a20fbdaed4804660772281) Signed-off-by: Sami Tolvanen --- init/Kconfig | 1 + mm/slub.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index fa031a140397..e1d1d6936f92 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1727,6 +1727,7 @@ config SLAB config SLUB bool "SLUB (Unqueued Allocator)" + select HAVE_HARDENED_USERCOPY_ALLOCATOR help SLUB is a slab allocator that minimizes cache line usage instead of managing queues of cached objects (SLAB approach). diff --git a/mm/slub.c b/mm/slub.c index 46997517406e..470f5e561727 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3585,6 +3585,46 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) EXPORT_SYMBOL(__kmalloc_node); #endif +#ifdef CONFIG_HARDENED_USERCOPY +/* + * Rejects objects that are incorrectly sized. + * + * Returns NULL if check passes, otherwise const char * to name of cache + * to indicate an error. + */ +const char *__check_heap_object(const void *ptr, unsigned long n, + struct page *page) +{ + struct kmem_cache *s; + unsigned long offset; + size_t object_size; + + /* Find object and usable object size. */ + s = page->slab_cache; + object_size = slab_ksize(s); + + /* Reject impossible pointers. */ + if (ptr < page_address(page)) + return s->name; + + /* Find offset within object. */ + offset = (ptr - page_address(page)) % s->size; + + /* Adjust for redzone and reject if within the redzone. */ + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) { + if (offset < s->red_left_pad) + return s->name; + offset -= s->red_left_pad; + } + + /* Allow address range falling entirely within object size. */ + if (offset <= object_size && n <= object_size - offset) + return NULL; + + return s->name; +} +#endif /* CONFIG_HARDENED_USERCOPY */ + static size_t __ksize(const void *object) { struct page *page; From 43243ce74a56007bf71e4fb3310f475fd4441ce4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 19 Aug 2016 12:15:22 -0700 Subject: [PATCH 332/607] UPSTREAM: usercopy: avoid potentially undefined behavior in pointer math check_bogus_address() checked for pointer overflow using this expression, where 'ptr' has type 'const void *': ptr + n < ptr Since pointer wraparound is undefined behavior, gcc at -O2 by default treats it like the following, which would not behave as intended: (long)n < 0 Fortunately, this doesn't currently happen for kernel code because kernel code is compiled with -fno-strict-overflow. But the expression should be fixed anyway to use well-defined integer arithmetic, since it could be treated differently by different compilers in the future or could be reported by tools checking for undefined behavior. Signed-off-by: Eric Biggers Signed-off-by: Kees Cook Change-Id: I73b13be651cf35c03482f2014bf2c3dd291518ab (cherry picked from commit 7329a655875a2f4bd6984fe8a7e00a6981e802f3) Signed-off-by: Sami Tolvanen --- mm/usercopy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index 816feccebeb0..f5c4c3a6f2df 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -125,7 +125,7 @@ static inline const char *check_kernel_text_object(const void *ptr, static inline const char *check_bogus_address(const void *ptr, unsigned long n) { /* Reject if object wraps past end of memory. 
*/ - if (ptr + n < ptr) + if ((unsigned long)ptr + n < (unsigned long)ptr) return ""; /* Reject if NULL or ZERO-allocation. */ From 19787e3f1cd696356b4ec02fa3a345dcb4aaf3dc Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 22 Aug 2016 11:53:59 -0500 Subject: [PATCH 333/607] UPSTREAM: usercopy: fix overlap check for kernel text When running with a local patch which moves the '_stext' symbol to the very beginning of the kernel text area, I got the following panic with CONFIG_HARDENED_USERCOPY: usercopy: kernel memory exposure attempt detected from ffff88103dfff000 () (4096 bytes) ------------[ cut here ]------------ kernel BUG at mm/usercopy.c:79! invalid opcode: 0000 [#1] SMP ... CPU: 0 PID: 4800 Comm: cp Not tainted 4.8.0-rc3.after+ #1 Hardware name: Dell Inc. PowerEdge R720/0X3D66, BIOS 2.5.4 01/22/2016 task: ffff880817444140 task.stack: ffff880816274000 RIP: 0010:[] __check_object_size+0x76/0x413 RSP: 0018:ffff880816277c40 EFLAGS: 00010246 RAX: 000000000000006b RBX: ffff88103dfff000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff88081f80dfa8 RDI: ffff88081f80dfa8 RBP: ffff880816277c90 R08: 000000000000054c R09: 0000000000000000 R10: 0000000000000005 R11: 0000000000000006 R12: 0000000000001000 R13: ffff88103e000000 R14: ffff88103dffffff R15: 0000000000000001 FS: 00007fb9d1750800(0000) GS:ffff88081f800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000021d2000 CR3: 000000081a08f000 CR4: 00000000001406f0 Stack: ffff880816277cc8 0000000000010000 000000043de07000 0000000000000000 0000000000001000 ffff880816277e60 0000000000001000 ffff880816277e28 000000000000c000 0000000000001000 ffff880816277ce8 ffffffff8136c3a6 Call Trace: [] copy_page_to_iter_iovec+0xa6/0x1c0 [] copy_page_to_iter+0x16/0x90 [] generic_file_read_iter+0x3e3/0x7c0 [] ? xfs_file_buffered_aio_write+0xad/0x260 [xfs] [] ? down_read+0x12/0x40 [] xfs_file_buffered_aio_read+0x51/0xc0 [xfs] [] xfs_file_read_iter+0x62/0xb0 [xfs] [] __vfs_read+0xdf/0x130 [] vfs_read+0x8e/0x140 [] SyS_read+0x55/0xc0 [] do_syscall_64+0x67/0x160 [] entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:[<00007fb9d0c33c00>] 0x7fb9d0c33c00 RSP: 002b:00007ffc9c262f28 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: fffffffffff8ffff RCX: 00007fb9d0c33c00 RDX: 0000000000010000 RSI: 00000000021c3000 RDI: 0000000000000004 RBP: 00000000021c3000 R08: 0000000000000000 R09: 00007ffc9c264d6c R10: 00007ffc9c262c50 R11: 0000000000000246 R12: 0000000000010000 R13: 00007ffc9c2630b0 R14: 0000000000000004 R15: 0000000000010000 Code: 81 48 0f 44 d0 48 c7 c6 90 4d a3 81 48 c7 c0 bb b3 a2 81 48 0f 44 f0 4d 89 e1 48 89 d9 48 c7 c7 68 16 a3 81 31 c0 e8 f4 57 f7 ff <0f> 0b 48 8d 90 00 40 00 00 48 39 d3 0f 83 22 01 00 00 48 39 c3 RIP [] __check_object_size+0x76/0x413 RSP The checked object's range [ffff88103dfff000, ffff88103e000000) is valid, so there shouldn't have been a BUG. The hardened usercopy code got confused because the range's ending address is the same as the kernel's text starting address at 0xffff88103e000000. The overlap check is slightly off. 
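The off-by-one is easier to see with the ranges written as half-open intervals: an object that ends exactly where the kernel text begins must not count as overlapping, so the rejection test on the high end has to use '<=' rather than '<'. A small user-space sketch of the corrected logic (standalone illustration, not the kernel helper itself):

  #include <stdio.h>

  /* Treat [check_low, check_low + n) and [low, high) as half-open ranges. */
  static int overlaps(unsigned long check_low, unsigned long n,
                      unsigned long low, unsigned long high)
  {
          unsigned long check_high = check_low + n;

          /* The old test used 'check_high < low', which reported a bogus
           * overlap when check_high == low, as in the panic above. */
          return !(check_low >= high || check_high <= low);
  }

  int main(void)
  {
          /* Object [0x1000, 0x2000) against kernel text at [0x2000, 0x3000). */
          printf("%d\n", overlaps(0x1000, 0x1000, 0x2000, 0x3000));  /* 0: no overlap */
          return 0;
  }
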
Fixes: f5509cc18daa ("mm: Hardened usercopy") Signed-off-by: Josh Poimboeuf Signed-off-by: Kees Cook Change-Id: I839dbf4ddbb4d9874026a42abed557eb9b3f8bef (cherry picked from commit 94cd97af690dd9537818dc9841d0ec68bb1dd877) Signed-off-by: Sami Tolvanen --- mm/usercopy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index f5c4c3a6f2df..f78015e8b1e5 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -84,7 +84,7 @@ static bool overlaps(const void *ptr, unsigned long n, unsigned long low, unsigned long check_high = check_low + n; /* Does not overlap if entirely above or entirely below. */ - if (check_low >= high || check_high < low) + if (check_low >= high || check_high <= low) return false; return true; From 65733da1dd5c3643e310d371bdb2420a6f3d8e97 Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Thu, 25 Aug 2016 18:31:01 -0700 Subject: [PATCH 334/607] Android: MMC/UFS IO Latency Histograms. This patch adds a new sysfs node (latency_hist) and reports IO (svc time) latency histograms. Disabled by default, can be enabled by echoing 0 into latency_hist, stats can be cleared by writing 2 into latency_hist. This commit fixes the 32 bit build breakage in the previous commit. Tested on both 32 bit and 64 bit arm devices. Bug: 30677035 Change-Id: I9a615a16616d80f87e75676ac4d078a5c429dcf9 Signed-off-by: Mohan Srinivasan --- block/blk-core.c | 82 +++++++++++++++++++++++++++++++++++++++ drivers/mmc/core/core.c | 66 ++++++++++++++++++++++++++++++- drivers/mmc/core/host.c | 6 ++- drivers/mmc/core/host.h | 5 +++ drivers/scsi/ufs/ufshcd.c | 81 ++++++++++++++++++++++++++++++++++++++ drivers/scsi/ufs/ufshcd.h | 3 ++ include/linux/blkdev.h | 76 ++++++++++++++++++++++++++++++++++++ include/linux/mmc/core.h | 2 + include/linux/mmc/host.h | 4 ++ 9 files changed, 322 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 33e2f62d5062..840713279726 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -40,6 +40,8 @@ #include "blk.h" #include "blk-mq.h" +#include + EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); @@ -3539,3 +3541,83 @@ int __init blk_dev_init(void) return 0; } + +/* + * Blk IO latency support. We want this to be as cheap as possible, so doing + * this lockless (and avoiding atomics), a few off by a few errors in this + * code is not harmful, and we don't want to do anything that is + * perf-impactful. + * TODO : If necessary, we can make the histograms per-cpu and aggregate + * them when printing them out. 
+ */ +void +blk_zero_latency_hist(struct io_latency_state *s) +{ + memset(s->latency_y_axis_read, 0, + sizeof(s->latency_y_axis_read)); + memset(s->latency_y_axis_write, 0, + sizeof(s->latency_y_axis_write)); + s->latency_reads_elems = 0; + s->latency_writes_elems = 0; +} + +ssize_t +blk_latency_hist_show(struct io_latency_state *s, char *buf) +{ + int i; + int bytes_written = 0; + u_int64_t num_elem, elem; + int pct; + + num_elem = s->latency_reads_elems; + if (num_elem > 0) { + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "IO svc_time Read Latency Histogram (n = %llu):\n", + num_elem); + for (i = 0; + i < ARRAY_SIZE(latency_x_axis_us); + i++) { + elem = s->latency_y_axis_read[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t< %5lluus%15llu%15d%%\n", + latency_x_axis_us[i], + elem, pct); + } + /* Last element in y-axis table is overflow */ + elem = s->latency_y_axis_read[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t> %5dms%15llu%15d%%\n", 10, + elem, pct); + } + num_elem = s->latency_writes_elems; + if (num_elem > 0) { + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "IO svc_time Write Latency Histogram (n = %llu):\n", + num_elem); + for (i = 0; + i < ARRAY_SIZE(latency_x_axis_us); + i++) { + elem = s->latency_y_axis_write[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t< %5lluus%15llu%15d%%\n", + latency_x_axis_us[i], + elem, pct); + } + /* Last element in y-axis table is overflow */ + elem = s->latency_y_axis_write[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t> %5dms%15llu%15d%%\n", 10, + elem, pct); + } + return bytes_written; +} diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 9fab52559a8c..b503249950df 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -183,6 +183,17 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) pr_debug("%s: %d bytes transferred: %d\n", mmc_hostname(host), mrq->data->bytes_xfered, mrq->data->error); + if (mrq->lat_hist_enabled) { + ktime_t completion; + u_int64_t delta_us; + + completion = ktime_get(); + delta_us = ktime_us_delta(completion, + mrq->io_start); + blk_update_latency_hist(&host->io_lat_s, + (mrq->data->flags & MMC_DATA_READ), + delta_us); + } trace_mmc_blk_rw_end(cmd->opcode, cmd->arg, mrq->data); } @@ -627,6 +638,11 @@ struct mmc_async_req *mmc_start_req(struct mmc_host *host, } if (!err && areq) { + if (host->latency_hist_enabled) { + areq->mrq->io_start = ktime_get(); + areq->mrq->lat_hist_enabled = 1; + } else + areq->mrq->lat_hist_enabled = 0; trace_mmc_blk_rw_start(areq->mrq->cmd->opcode, areq->mrq->cmd->arg, areq->mrq->data); @@ -1964,7 +1980,7 @@ void mmc_init_erase(struct mmc_card *card) } static unsigned int mmc_mmc_erase_timeout(struct mmc_card *card, - unsigned int arg, unsigned int qty) + unsigned int arg, unsigned int qty) { unsigned int erase_timeout; @@ -2907,6 +2923,54 @@ static void __exit mmc_exit(void) destroy_workqueue(workqueue); } +static ssize_t +latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct mmc_host *host = cls_dev_to_mmc_host(dev); + + return blk_latency_hist_show(&host->io_lat_s, buf); +} + +/* + * Values permitted 0, 1, 2. 
+ * 0 -> Disable IO latency histograms (default) + * 1 -> Enable IO latency histograms + * 2 -> Zero out IO latency histograms + */ +static ssize_t +latency_hist_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_host *host = cls_dev_to_mmc_host(dev); + long value; + + if (kstrtol(buf, 0, &value)) + return -EINVAL; + if (value == BLK_IO_LAT_HIST_ZERO) + blk_zero_latency_hist(&host->io_lat_s); + else if (value == BLK_IO_LAT_HIST_ENABLE || + value == BLK_IO_LAT_HIST_DISABLE) + host->latency_hist_enabled = value; + return count; +} + +static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, + latency_hist_show, latency_hist_store); + +void +mmc_latency_hist_sysfs_init(struct mmc_host *host) +{ + if (device_create_file(&host->class_dev, &dev_attr_latency_hist)) + dev_err(&host->class_dev, + "Failed to create latency_hist sysfs entry\n"); +} + +void +mmc_latency_hist_sysfs_exit(struct mmc_host *host) +{ + device_remove_file(&host->class_dev, &dev_attr_latency_hist); +} + subsys_initcall(mmc_init); module_exit(mmc_exit); diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index fcf7829c759e..17068839c74b 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -32,8 +32,6 @@ #include "slot-gpio.h" #include "pwrseq.h" -#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) - static DEFINE_IDR(mmc_host_idr); static DEFINE_SPINLOCK(mmc_host_lock); @@ -394,6 +392,8 @@ int mmc_add_host(struct mmc_host *host) mmc_add_host_debugfs(host); #endif + mmc_latency_hist_sysfs_init(host); + mmc_start_host(host); if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY)) register_pm_notifier(&host->pm_notify); @@ -422,6 +422,8 @@ void mmc_remove_host(struct mmc_host *host) mmc_remove_host_debugfs(host); #endif + mmc_latency_hist_sysfs_exit(host); + device_del(&host->class_dev); led_trigger_unregister_simple(host->led); diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h index 992bf5397633..bf38533406fd 100644 --- a/drivers/mmc/core/host.h +++ b/drivers/mmc/core/host.h @@ -12,6 +12,8 @@ #define _MMC_CORE_HOST_H #include +#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) + int mmc_register_host_class(void); void mmc_unregister_host_class(void); @@ -21,5 +23,8 @@ void mmc_retune_hold(struct mmc_host *host); void mmc_retune_release(struct mmc_host *host); int mmc_retune(struct mmc_host *host); +void mmc_latency_hist_sysfs_init(struct mmc_host *host); +void mmc_latency_hist_sysfs_exit(struct mmc_host *host); + #endif diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 85cd2564c157..4167bdbf0ecf 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -39,6 +39,7 @@ #include #include +#include #include "ufshcd.h" #include "unipro.h" @@ -1332,6 +1333,17 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) clear_bit_unlock(tag, &hba->lrb_in_use); goto out; } + + /* IO svc time latency histogram */ + if (hba != NULL && cmd->request != NULL) { + if (hba->latency_hist_enabled && + (cmd->request->cmd_type == REQ_TYPE_FS)) { + cmd->request->lat_hist_io_start = ktime_get(); + cmd->request->lat_hist_enabled = 1; + } else + cmd->request->lat_hist_enabled = 0; + } + WARN_ON(hba->clk_gating.state != CLKS_ON); lrbp = &hba->lrb[tag]; @@ -3160,6 +3172,7 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) u32 tr_doorbell; int result; int index; + struct request *req; /* Resetting interrupt aggregation counters first and reading the * 
DOOR_BELL afterward allows us to handle all the completed requests. @@ -3184,6 +3197,22 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) /* Mark completed command as NULL in LRB */ lrbp->cmd = NULL; clear_bit_unlock(index, &hba->lrb_in_use); + req = cmd->request; + if (req) { + /* Update IO svc time latency histogram */ + if (req->lat_hist_enabled) { + ktime_t completion; + u_int64_t delta_us; + + completion = ktime_get(); + delta_us = ktime_us_delta(completion, + req->lat_hist_io_start); + /* rq_data_dir() => true if WRITE */ + blk_update_latency_hist(&hba->io_lat_s, + (rq_data_dir(req) == READ), + delta_us); + } + } /* Do not touch lrbp after scsi done */ cmd->scsi_done(cmd); __ufshcd_release(hba); @@ -5327,6 +5356,54 @@ out: } EXPORT_SYMBOL(ufshcd_shutdown); +/* + * Values permitted 0, 1, 2. + * 0 -> Disable IO latency histograms (default) + * 1 -> Enable IO latency histograms + * 2 -> Zero out IO latency histograms + */ +static ssize_t +latency_hist_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + long value; + + if (kstrtol(buf, 0, &value)) + return -EINVAL; + if (value == BLK_IO_LAT_HIST_ZERO) + blk_zero_latency_hist(&hba->io_lat_s); + else if (value == BLK_IO_LAT_HIST_ENABLE || + value == BLK_IO_LAT_HIST_DISABLE) + hba->latency_hist_enabled = value; + return count; +} + +ssize_t +latency_hist_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + + return blk_latency_hist_show(&hba->io_lat_s, buf); +} + +static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, + latency_hist_show, latency_hist_store); + +static void +ufshcd_init_latency_hist(struct ufs_hba *hba) +{ + if (device_create_file(hba->dev, &dev_attr_latency_hist)) + dev_err(hba->dev, "Failed to create latency_hist sysfs entry\n"); +} + +static void +ufshcd_exit_latency_hist(struct ufs_hba *hba) +{ + device_create_file(hba->dev, &dev_attr_latency_hist); +} + /** * ufshcd_remove - de-allocate SCSI host and host memory space * data structure memory @@ -5342,6 +5419,7 @@ void ufshcd_remove(struct ufs_hba *hba) scsi_host_put(hba->host); ufshcd_exit_clk_gating(hba); + ufshcd_exit_latency_hist(hba); if (ufshcd_is_clkscaling_enabled(hba)) devfreq_remove_device(hba->devfreq); ufshcd_hba_exit(hba); @@ -5639,6 +5717,8 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) /* Hold auto suspend until async scan completes */ pm_runtime_get_sync(dev); + ufshcd_init_latency_hist(hba); + /* * The device-initialize-sequence hasn't been invoked yet. * Set the device to power-off state @@ -5653,6 +5733,7 @@ out_remove_scsi_host: scsi_remove_host(hba->host); exit_gating: ufshcd_exit_clk_gating(hba); + ufshcd_exit_latency_hist(hba); out_disable: hba->is_irq_enabled = false; scsi_host_put(host); diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index 2570d9477b37..f3780cf7d895 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -532,6 +532,9 @@ struct ufs_hba { struct devfreq *devfreq; struct ufs_clk_scaling clk_scaling; bool is_sys_suspended; + + int latency_hist_enabled; + struct io_latency_state io_lat_s; }; /* Returns true if clocks can be gated. 
Otherwise false */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c70e3588a48c..faf2c0093527 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -197,6 +197,9 @@ struct request { /* for bidi */ struct request *next_rq; + + ktime_t lat_hist_io_start; + int lat_hist_enabled; }; static inline unsigned short req_get_ioprio(struct request *req) @@ -1645,6 +1648,79 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); extern long bdev_direct_access(struct block_device *, sector_t, void __pmem **addr, unsigned long *pfn, long size); + +/* + * X-axis for IO latency histogram support. + */ +static const u_int64_t latency_x_axis_us[] = { + 100, + 200, + 300, + 400, + 500, + 600, + 700, + 800, + 900, + 1000, + 1200, + 1400, + 1600, + 1800, + 2000, + 2500, + 3000, + 4000, + 5000, + 6000, + 7000, + 9000, + 10000 +}; + +#define BLK_IO_LAT_HIST_DISABLE 0 +#define BLK_IO_LAT_HIST_ENABLE 1 +#define BLK_IO_LAT_HIST_ZERO 2 + +struct io_latency_state { + u_int64_t latency_y_axis_read[ARRAY_SIZE(latency_x_axis_us) + 1]; + u_int64_t latency_reads_elems; + u_int64_t latency_y_axis_write[ARRAY_SIZE(latency_x_axis_us) + 1]; + u_int64_t latency_writes_elems; +}; + +static inline void +blk_update_latency_hist(struct io_latency_state *s, + int read, + u_int64_t delta_us) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(latency_x_axis_us); i++) { + if (delta_us < (u_int64_t)latency_x_axis_us[i]) { + if (read) + s->latency_y_axis_read[i]++; + else + s->latency_y_axis_write[i]++; + break; + } + } + if (i == ARRAY_SIZE(latency_x_axis_us)) { + /* Overflowed the histogram */ + if (read) + s->latency_y_axis_read[i]++; + else + s->latency_y_axis_write[i]++; + } + if (read) + s->latency_reads_elems++; + else + s->latency_writes_elems++; +} + +void blk_zero_latency_hist(struct io_latency_state *s); +ssize_t blk_latency_hist_show(struct io_latency_state *s, char *buf); + #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index 37967b6da03c..3349f0676acb 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -136,6 +136,8 @@ struct mmc_request { struct completion completion; void (*done)(struct mmc_request *);/* completion function */ struct mmc_host *host; + ktime_t io_start; + int lat_hist_enabled; }; struct mmc_card; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 40025b28c1fb..e4862f7cdede 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -379,6 +380,9 @@ struct mmc_host { } embedded_sdio_data; #endif + int latency_hist_enabled; + struct io_latency_state io_lat_s; + unsigned long private[0] ____cacheline_aligned; }; From ef1cf0cfecdb16c520e32628ef8e6d9fa86fc08f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 19 Aug 2016 12:47:01 -0700 Subject: [PATCH 335/607] UPSTREAM: Make the hardened user-copy code depend on having a hardened allocator The kernel test robot reported a usercopy failure in the new hardened sanity checks, due to a page-crossing copy of the FPU state into the task structure. This happened because the kernel test robot was testing with SLOB, which doesn't actually do the required book-keeping for slab allocations, and as a result the hardening code didn't realize that the task struct allocation was one single allocation - and the sanity checks fail. 
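The dependency matters because the generic checker can only validate a heap copy by asking the allocator for the object's bounds. Roughly, the dispatch in mm/usercopy.c behaves like the sketch below (simplified and reconstructed from the hardened usercopy series, not verbatim from this tree; the real helper also takes a to_user flag):

  /* Simplified sketch: only slab pages can be checked, and only if the
   * allocator implements __check_heap_object() (SLAB/SLUB do, SLOB does not). */
  static const char *check_heap_object_sketch(const void *ptr, unsigned long n)
  {
          struct page *page;

          if (!virt_addr_valid(ptr))
                  return NULL;

          page = virt_to_head_page(ptr);
          if (PageSlab(page))
                  return __check_heap_object(ptr, n, page);

          return NULL;
  }
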
Since SLOB doesn't even claim to support hardening (and you really shouldn't use it), the straightforward solution is to just make the usercopy hardening code depend on the allocator supporting it. Reported-by: kernel test robot Cc: Kees Cook Signed-off-by: Linus Torvalds Change-Id: I37d51f866f873341bf7d5297249899b852e1c6ce (cherry picked from commit 6040e57658eee6eb1315a26119101ca832d1f854) Signed-off-by: Sami Tolvanen --- security/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/security/Kconfig b/security/Kconfig index aba6a8d4d1f4..2b42c225de28 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -145,6 +145,7 @@ config HAVE_ARCH_HARDENED_USERCOPY config HARDENED_USERCOPY bool "Harden memory copies between kernel and userspace" depends on HAVE_ARCH_HARDENED_USERCOPY + depends on HAVE_HARDENED_USERCOPY_ALLOCATOR select BUG help This option checks for obviously wrong memory regions when From 0715ce3b882b9982b6acdbf9e46fccd1e6f180d7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 17 Dec 2015 09:45:09 -0800 Subject: [PATCH 336/607] UPSTREAM: x86: reorganize SMAP handling in user space accesses This reorganizes how we do the stac/clac instructions in the user access code. Instead of adding the instructions directly to the same inline asm that does the actual user level access and exception handling, add them at a higher level. This is mainly preparation for the next step, where we will expose an interface to allow users to mark several accesses together as being user space accesses, but it does already clean up some code: - the inlined trivial cases of copy_in_user() now do stac/clac just once over the accesses: they used to do one pair around the user space read, and another pair around the write-back. - the {get,put}_user_ex() macros that are used with the catch/try handling don't do any stac/clac at all, because that happens in the try/catch surrounding them. Other than those two cleanups that happened naturally from the re-organization, this should not make any difference. Yet. Signed-off-by: Linus Torvalds Change-Id: Iaad8756bc8e95876e2e2a7d7bbd333fc478ab441 (cherry picked from commit 11f1a4b9755f5dbc3e822a96502ebe9b044b14d8) Signed-off-by: Sami Tolvanen --- arch/x86/include/asm/uaccess.h | 53 +++++++++++------ arch/x86/include/asm/uaccess_64.h | 94 ++++++++++++++++++++++--------- 2 files changed, 101 insertions(+), 46 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 32a047ce0806..be439e246d91 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -134,6 +134,9 @@ extern int __get_user_4(void); extern int __get_user_8(void); extern int __get_user_bad(void); +#define __uaccess_begin() stac() +#define __uaccess_end() clac() + /* * This is a type: either unsigned long, if the argument fits into * that type, or otherwise unsigned long long. 
@@ -193,10 +196,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) #ifdef CONFIG_X86_32 #define __put_user_asm_u64(x, addr, err, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: movl %%eax,0(%2)\n" \ "2: movl %%edx,4(%2)\n" \ - "3: " ASM_CLAC "\n" \ + "3:" \ ".section .fixup,\"ax\"\n" \ "4: movl %3,%0\n" \ " jmp 3b\n" \ @@ -207,10 +210,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) : "A" (x), "r" (addr), "i" (errret), "0" (err)) #define __put_user_asm_ex_u64(x, addr) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ - "3: " ASM_CLAC "\n" \ + "3:" \ _ASM_EXTABLE_EX(1b, 2b) \ _ASM_EXTABLE_EX(2b, 3b) \ : : "A" (x), "r" (addr)) @@ -304,6 +307,10 @@ do { \ } \ } while (0) +/* + * This doesn't do __uaccess_begin/end - the exception handling + * around it must do that. + */ #define __put_user_size_ex(x, ptr, size) \ do { \ __chk_user_ptr(ptr); \ @@ -358,9 +365,9 @@ do { \ } while (0) #define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: mov"itype" %2,%"rtype"1\n" \ - "2: " ASM_CLAC "\n" \ + "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %3,%0\n" \ " xor"itype" %"rtype"1,%"rtype"1\n" \ @@ -370,6 +377,10 @@ do { \ : "=r" (err), ltype(x) \ : "m" (__m(addr)), "i" (errret), "0" (err)) +/* + * This doesn't do __uaccess_begin/end - the exception handling + * around it must do that. + */ #define __get_user_size_ex(x, ptr, size) \ do { \ __chk_user_ptr(ptr); \ @@ -400,7 +411,9 @@ do { \ #define __put_user_nocheck(x, ptr, size) \ ({ \ int __pu_err; \ + __uaccess_begin(); \ __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ + __uaccess_end(); \ __builtin_expect(__pu_err, 0); \ }) @@ -408,7 +421,9 @@ do { \ ({ \ int __gu_err; \ unsigned long __gu_val; \ + __uaccess_begin(); \ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ + __uaccess_end(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ __builtin_expect(__gu_err, 0); \ }) @@ -423,9 +438,9 @@ struct __large_struct { unsigned long buf[100]; }; * aliasing issues. */ #define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: mov"itype" %"rtype"1,%2\n" \ - "2: " ASM_CLAC "\n" \ + "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %3,%0\n" \ " jmp 2b\n" \ @@ -445,11 +460,11 @@ struct __large_struct { unsigned long buf[100]; }; */ #define uaccess_try do { \ current_thread_info()->uaccess_err = 0; \ - stac(); \ + __uaccess_begin(); \ barrier(); #define uaccess_catch(err) \ - clac(); \ + __uaccess_end(); \ (err) |= (current_thread_info()->uaccess_err ? 
-EFAULT : 0); \ } while (0) @@ -547,12 +562,13 @@ extern void __cmpxchg_wrong_size(void) __typeof__(ptr) __uval = (uval); \ __typeof__(*(ptr)) __old = (old); \ __typeof__(*(ptr)) __new = (new); \ + __uaccess_begin(); \ switch (size) { \ case 1: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -566,9 +582,9 @@ extern void __cmpxchg_wrong_size(void) } \ case 2: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -582,9 +598,9 @@ extern void __cmpxchg_wrong_size(void) } \ case 4: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -601,9 +617,9 @@ extern void __cmpxchg_wrong_size(void) if (!IS_ENABLED(CONFIG_X86_64)) \ __cmpxchg_wrong_size(); \ \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -618,6 +634,7 @@ extern void __cmpxchg_wrong_size(void) default: \ __cmpxchg_wrong_size(); \ } \ + __uaccess_end(); \ *__uval = __old; \ __ret; \ }) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 512ffd621fc3..2957c8237c28 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -57,35 +57,49 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { - case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src, + case 1: + __uaccess_begin(); + __get_user_asm(*(u8 *)dst, (u8 __user *)src, ret, "b", "b", "=q", 1); + __uaccess_end(); return ret; - case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src, + case 2: + __uaccess_begin(); + __get_user_asm(*(u16 *)dst, (u16 __user *)src, ret, "w", "w", "=r", 2); + __uaccess_end(); return ret; - case 4:__get_user_asm(*(u32 *)dst, (u32 __user *)src, + case 4: + __uaccess_begin(); + __get_user_asm(*(u32 *)dst, (u32 __user *)src, ret, "l", "k", "=r", 4); + __uaccess_end(); return ret; - case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src, + case 8: + __uaccess_begin(); + __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 8); + __uaccess_end(); return ret; case 10: + __uaccess_begin(); __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 10); - if (unlikely(ret)) - return ret; - __get_user_asm(*(u16 *)(8 + (char *)dst), - (u16 __user *)(8 + (char __user *)src), - ret, "w", "w", "=r", 2); + if (likely(!ret)) + __get_user_asm(*(u16 *)(8 + (char *)dst), + (u16 __user *)(8 + (char __user *)src), + ret, "w", "w", "=r", 2); + __uaccess_end(); return ret; case 16: + __uaccess_begin(); __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 16); - if (unlikely(ret)) - return ret; - __get_user_asm(*(u64 *)(8 + (char *)dst), - (u64 __user *)(8 + (char __user *)src), - ret, "q", "", "=r", 8); + if (likely(!ret)) + __get_user_asm(*(u64 *)(8 + (char *)dst), + (u64 __user *)(8 + (char __user *)src), + ret, "q", "", "=r", 8); + __uaccess_end(); return ret; default: return copy_user_generic(dst, (__force void *)src, size); 
@@ -108,35 +122,51 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size) if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { - case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst, + case 1: + __uaccess_begin(); + __put_user_asm(*(u8 *)src, (u8 __user *)dst, ret, "b", "b", "iq", 1); + __uaccess_end(); return ret; - case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst, + case 2: + __uaccess_begin(); + __put_user_asm(*(u16 *)src, (u16 __user *)dst, ret, "w", "w", "ir", 2); + __uaccess_end(); return ret; - case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst, + case 4: + __uaccess_begin(); + __put_user_asm(*(u32 *)src, (u32 __user *)dst, ret, "l", "k", "ir", 4); + __uaccess_end(); return ret; - case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst, + case 8: + __uaccess_begin(); + __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 8); + __uaccess_end(); return ret; case 10: + __uaccess_begin(); __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 10); - if (unlikely(ret)) - return ret; - asm("":::"memory"); - __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst, - ret, "w", "w", "ir", 2); + if (likely(!ret)) { + asm("":::"memory"); + __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst, + ret, "w", "w", "ir", 2); + } + __uaccess_end(); return ret; case 16: + __uaccess_begin(); __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 16); - if (unlikely(ret)) - return ret; - asm("":::"memory"); - __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, - ret, "q", "", "er", 8); + if (likely(!ret)) { + asm("":::"memory"); + __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, + ret, "q", "", "er", 8); + } + __uaccess_end(); return ret; default: return copy_user_generic((__force void *)dst, src, size); @@ -162,39 +192,47 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) switch (size) { case 1: { u8 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u8 __user *)src, ret, "b", "b", "=q", 1); if (likely(!ret)) __put_user_asm(tmp, (u8 __user *)dst, ret, "b", "b", "iq", 1); + __uaccess_end(); return ret; } case 2: { u16 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u16 __user *)src, ret, "w", "w", "=r", 2); if (likely(!ret)) __put_user_asm(tmp, (u16 __user *)dst, ret, "w", "w", "ir", 2); + __uaccess_end(); return ret; } case 4: { u32 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u32 __user *)src, ret, "l", "k", "=r", 4); if (likely(!ret)) __put_user_asm(tmp, (u32 __user *)dst, ret, "l", "k", "ir", 4); + __uaccess_end(); return ret; } case 8: { u64 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u64 __user *)src, ret, "q", "", "=r", 8); if (likely(!ret)) __put_user_asm(tmp, (u64 __user *)dst, ret, "q", "", "er", 8); + __uaccess_end(); return ret; } default: From a5eeb9b4123eee6aa88bb7a43dba919641abeeb6 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:55:12 -0700 Subject: [PATCH 337/607] UPSTREAM: mm/slub: support left redzone SLUB already has a redzone debugging feature. But it is only positioned at the end of object (aka right redzone) so it cannot catch left oob. Although current object's right redzone acts as left redzone of next object, first object in a slab cannot take advantage of this effect. This patch explicitly adds a left red zone to each object to detect left oob more precisely. Background: Someone complained to me that left OOB doesn't catch even if KASAN is enabled which does page allocation debugging. 
That page is out of our control so it would be allocated when left OOB happens and, in this case, we can't find OOB. Moreover, SLUB debugging feature can be enabled without page allocator debugging and, in this case, we will miss that OOB. Before trying to implement, I expected that changes would be too complex, but, it doesn't look that complex to me now. Almost changes are applied to debug specific functions so I feel okay. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Change-Id: Ib893a17ecabd692e6c402e864196bf89cd6781a5 (cherry picked from commit d86bd1bece6fc41d59253002db5441fe960a37f6) Signed-off-by: Sami Tolvanen --- include/linux/slub_def.h | 1 + mm/slub.c | 100 +++++++++++++++++++++++++++------------ 2 files changed, 72 insertions(+), 29 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 33885118523c..f4e857e920cd 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -81,6 +81,7 @@ struct kmem_cache { int reserved; /* Reserved bytes at the end of slabs */ const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ + int red_left_pad; /* Left redzone padding size */ #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ #endif diff --git a/mm/slub.c b/mm/slub.c index 470f5e561727..cfd3579cbaeb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #endif } +static inline void *fixup_red_left(struct kmem_cache *s, void *p) +{ + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) + p += s->red_left_pad; + + return p; +} + static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) { #ifdef CONFIG_SLUB_CPU_PARTIAL @@ -224,24 +232,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -/* Verify that a pointer has an address that is valid within a slab page */ -static inline int check_valid_pointer(struct kmem_cache *s, - struct page *page, const void *object) -{ - void *base; - - if (!object) - return 1; - - base = page_address(page); - if (object < base || object >= base + page->objects * s->size || - (object - base) % s->size) { - return 0; - } - - return 1; -} - static inline void *get_freepointer(struct kmem_cache *s, void *object) { return *(void **)(object + s->offset); @@ -271,12 +261,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) /* Loop over all objects in a slab */ #define for_each_object(__p, __s, __addr, __objects) \ - for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ - __p += (__s)->size) + for (__p = fixup_red_left(__s, __addr); \ + __p < (__addr) + (__objects) * (__s)->size; \ + __p += (__s)->size) #define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ - for (__p = (__addr), __idx = 1; __idx <= __objects;\ - __p += (__s)->size, __idx++) + for (__p = fixup_red_left(__s, __addr), __idx = 1; \ + __idx <= __objects; \ + __p += (__s)->size, __idx++) /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) @@ -456,6 +448,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) set_bit(slab_index(p, s, addr), map); } +static inline int size_from_object(struct kmem_cache *s) +{ + if (s->flags & SLAB_RED_ZONE) + return s->size - s->red_left_pad; + + 
return s->size; +} + +static inline void *restore_red_left(struct kmem_cache *s, void *p) +{ + if (s->flags & SLAB_RED_ZONE) + p -= s->red_left_pad; + + return p; +} + /* * Debug settings: */ @@ -489,6 +497,26 @@ static inline void metadata_access_disable(void) /* * Object debugging */ + +/* Verify that a pointer has an address that is valid within a slab page */ +static inline int check_valid_pointer(struct kmem_cache *s, + struct page *page, void *object) +{ + void *base; + + if (!object) + return 1; + + base = page_address(page); + object = restore_red_left(s, object); + if (object < base || object >= base + page->objects * s->size || + (object - base) % s->size) { + return 0; + } + + return 1; +} + static void print_section(char *text, u8 *addr, unsigned int length) { metadata_access_enable(); @@ -628,7 +656,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); - if (p > addr + 16) + if (s->flags & SLAB_RED_ZONE) + print_section("Redzone ", p - s->red_left_pad, s->red_left_pad); + else if (p > addr + 16) print_section("Bytes b4 ", p - 16, 16); print_section("Object ", p, min_t(unsigned long, s->object_size, @@ -645,9 +675,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); - if (off != s->size) + if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ - print_section("Padding ", p + off, s->size - off); + print_section("Padding ", p + off, size_from_object(s) - off); dump_stack(); } @@ -677,6 +707,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = object; + if (s->flags & SLAB_RED_ZONE) + memset(p - s->red_left_pad, val, s->red_left_pad); + if (s->flags & __OBJECT_POISON) { memset(p, POISON_FREE, s->object_size - 1); p[s->object_size - 1] = POISON_END; @@ -769,11 +802,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) /* We also have user information there */ off += 2 * sizeof(struct track); - if (s->size == off) + if (size_from_object(s) == off) return 1; return check_bytes_and_report(s, page, p, "Object padding", - p + off, POISON_INUSE, s->size - off); + p + off, POISON_INUSE, size_from_object(s) - off); } /* Check the pad bytes at the end of a slab page */ @@ -817,6 +850,10 @@ static int check_object(struct kmem_cache *s, struct page *page, u8 *endobject = object + s->object_size; if (s->flags & SLAB_RED_ZONE) { + if (!check_bytes_and_report(s, page, object, "Redzone", + object - s->red_left_pad, val, s->red_left_pad)) + return 0; + if (!check_bytes_and_report(s, page, object, "Redzone", endobject, val, s->inuse - s->object_size)) return 0; @@ -1468,7 +1505,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, p, NULL); } - page->freelist = start; + page->freelist = fixup_red_left(s, start); page->inuse = page->objects; page->frozen = 1; @@ -3283,7 +3320,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ size += 2 * sizeof(struct track); - if (flags & SLAB_RED_ZONE) + if (flags & SLAB_RED_ZONE) { /* * Add some empty padding so that we can catch * overwrites from earlier objects rather than let @@ -3292,6 +3329,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * of the object. 
*/ size += sizeof(void *); + + s->red_left_pad = sizeof(void *); + s->red_left_pad = ALIGN(s->red_left_pad, s->align); + size += s->red_left_pad; + } #endif /* From cd55f3c2be7a55bb708ba0f5fb02ae1c08cc1d33 Mon Sep 17 00:00:00 2001 From: Riley Andrews Date: Tue, 6 Sep 2016 15:16:25 -0700 Subject: [PATCH 338/607] cpuset: Make cpusets restore on hotplug This deliberately changes the behavior of the per-cpuset cpus file to not be effected by hotplug. When a cpu is offlined, it will be removed from the cpuset/cpus file. When a cpu is onlined, if the cpuset originally requested that that cpu was part of the cpuset, that cpu will be restored to the cpuset. The cpus files still have to be hierachical, but the ranges no longer have to be out of the currently online cpus, just the physically present cpus. Change-Id: I22cdf33e7d312117bcefba1aeb0125e1ada289a9 Signed-off-by: Dmitry Shmidt --- kernel/cpuset.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 0ff01c2a7222..3b4f27981778 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -99,6 +99,7 @@ struct cpuset { /* user-configured CPUs and Memory Nodes allow to tasks */ cpumask_var_t cpus_allowed; + cpumask_var_t cpus_requested; nodemask_t mems_allowed; /* effective CPUs and Memory Nodes allow to tasks */ @@ -385,7 +386,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) { - return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && + return cpumask_subset(p->cpus_requested, q->cpus_requested) && nodes_subset(p->mems_allowed, q->mems_allowed) && is_cpu_exclusive(p) <= is_cpu_exclusive(q) && is_mem_exclusive(p) <= is_mem_exclusive(q); @@ -485,7 +486,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) cpuset_for_each_child(c, css, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && - cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) + cpumask_intersects(trial->cpus_requested, c->cpus_requested)) goto out; if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && c != cur && @@ -944,17 +945,18 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (!*buf) { cpumask_clear(trialcs->cpus_allowed); } else { - retval = cpulist_parse(buf, trialcs->cpus_allowed); + retval = cpulist_parse(buf, trialcs->cpus_requested); if (retval < 0) return retval; - if (!cpumask_subset(trialcs->cpus_allowed, - top_cpuset.cpus_allowed)) + if (!cpumask_subset(trialcs->cpus_requested, cpu_present_mask)) return -EINVAL; + + cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, cpu_active_mask); } /* Nothing to do if the cpus didn't change */ - if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) + if (cpumask_equal(cs->cpus_requested, trialcs->cpus_requested)) return 0; retval = validate_change(cs, trialcs); @@ -963,6 +965,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); + cpumask_copy(cs->cpus_requested, trialcs->cpus_requested); spin_unlock_irq(&callback_lock); /* use trialcs->cpus_allowed as a temp variable */ @@ -1731,7 +1734,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) switch (type) { case FILE_CPULIST: - seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_requested)); break; case FILE_MEMLIST: seq_printf(sf, 
"%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); @@ -1920,11 +1923,14 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return ERR_PTR(-ENOMEM); if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) goto free_cs; + if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL)) + goto free_allowed; if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) - goto free_cpus; + goto free_requested; set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); + cpumask_clear(cs->cpus_requested); nodes_clear(cs->mems_allowed); cpumask_clear(cs->effective_cpus); nodes_clear(cs->effective_mems); @@ -1933,7 +1939,9 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return &cs->css; -free_cpus: +free_requested: + free_cpumask_var(cs->cpus_requested); +free_allowed: free_cpumask_var(cs->cpus_allowed); free_cs: kfree(cs); @@ -1996,6 +2004,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->mems_allowed = parent->mems_allowed; cs->effective_mems = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + cpumask_copy(cs->cpus_requested, parent->cpus_requested); cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: @@ -2030,6 +2039,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->cpus_allowed); + free_cpumask_var(cs->cpus_requested); kfree(cs); } @@ -2096,8 +2106,11 @@ int __init cpuset_init(void) BUG(); if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) BUG(); + if (!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL)) + BUG(); cpumask_setall(top_cpuset.cpus_allowed); + cpumask_setall(top_cpuset.cpus_requested); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); nodes_setall(top_cpuset.effective_mems); @@ -2231,7 +2244,7 @@ retry: goto retry; } - cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus); + cpumask_and(&new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus); nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems); cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); From 96a970e08bdec910c7780e8f823b3595eef6ba8c Mon Sep 17 00:00:00 2001 From: Phil Turnbull Date: Tue, 2 Feb 2016 13:36:45 -0500 Subject: [PATCH 339/607] UPSTREAM: netfilter: nfnetlink: correctly validate length of batch messages (cherry picked from commit c58d6c93680f28ac58984af61d0a7ebf4319c241) If nlh->nlmsg_len is zero then an infinite loop is triggered because 'skb_pull(skb, msglen);' pulls zero bytes. The calculation in nlmsg_len() underflows if 'nlh->nlmsg_len < NLMSG_HDRLEN' which bypasses the length validation and will later trigger an out-of-bound read. If the length validation does fail then the malformed batch message is copied back to userspace. However, we cannot do this because the nlh->nlmsg_len can be invalid. This leads to an out-of-bounds read in netlink_ack: [ 41.455421] ================================================================== [ 41.456431] BUG: KASAN: slab-out-of-bounds in memcpy+0x1d/0x40 at addr ffff880119e79340 [ 41.456431] Read of size 4294967280 by task a.out/987 [ 41.456431] ============================================================================= [ 41.456431] BUG kmalloc-512 (Not tainted): kasan: bad access detected [ 41.456431] ----------------------------------------------------------------------------- ... 
[ 41.456431] Bytes b4 ffff880119e79310: 00 00 00 00 d5 03 00 00 b0 fb fe ff 00 00 00 00 ................ [ 41.456431] Object ffff880119e79320: 20 00 00 00 10 00 05 00 00 00 00 00 00 00 00 00 ............... [ 41.456431] Object ffff880119e79330: 14 00 0a 00 01 03 fc 40 45 56 11 22 33 10 00 05 .......@EV."3... [ 41.456431] Object ffff880119e79340: f0 ff ff ff 88 99 aa bb 00 14 00 0a 00 06 fe fb ................ ^^ start of batch nlmsg with nlmsg_len=4294967280 ... [ 41.456431] Memory state around the buggy address: [ 41.456431] ffff880119e79400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 41.456431] ffff880119e79480: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 41.456431] >ffff880119e79500: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc [ 41.456431] ^ [ 41.456431] ffff880119e79580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 41.456431] ffff880119e79600: fc fc fc fc fc fc fc fc fc fc fb fb fb fb fb fb [ 41.456431] ================================================================== Fix this with better validation of nlh->nlmsg_len and by setting NFNL_BATCH_FAILURE if any batch message fails length validation. CAP_NET_ADMIN is required to trigger the bugs. Fixes: 9ea2aa8b7dba ("netfilter: nfnetlink: validate nfnetlink header from batch") Signed-off-by: Phil Turnbull Signed-off-by: Pablo Neira Ayuso Change-Id: Id3e15c40cb464bf2791af907c235d8a316b2449c Bug: 30947055 --- net/netfilter/nfnetlink.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 77afe913d03d..9adedba78eea 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -326,10 +326,12 @@ replay: nlh = nlmsg_hdr(skb); err = 0; - if (nlmsg_len(nlh) < sizeof(struct nfgenmsg) || - skb->len < nlh->nlmsg_len) { - err = -EINVAL; - goto ack; + if (nlh->nlmsg_len < NLMSG_HDRLEN || + skb->len < nlh->nlmsg_len || + nlmsg_len(nlh) < sizeof(struct nfgenmsg)) { + nfnl_err_reset(&err_list); + status |= NFNL_BATCH_FAILURE; + goto done; } /* Only requests are handled by the kernel */ From 6b93f8214eabf0f363eab5283c2ad18b5bc33135 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Fri, 27 Nov 2015 14:30:21 -0500 Subject: [PATCH 340/607] UPSTREAM: tty: Prevent ldisc drivers from re-using stale tty fields (cherry picked from commit dd42bf1197144ede075a9d4793123f7689e164bc) Line discipline drivers may mistakenly misuse ldisc-related fields when initializing. For example, a failure to initialize tty->receive_room in the N_GIGASET_M101 line discipline was recently found and fixed [1]. Now, the N_X25 line discipline has been discovered accessing the previous line discipline's already-freed private data [2]. Harden the ldisc interface against misuse by initializing revelant tty fields before instancing the new line discipline. [1] commit fd98e9419d8d622a4de91f76b306af6aa627aa9c Author: Tilman Schmidt Date: Tue Jul 14 00:37:13 2015 +0200 isdn/gigaset: reset tty->receive_room when attaching ser_gigaset [2] Report from Sasha Levin [ 634.336761] ================================================================== [ 634.338226] BUG: KASAN: use-after-free in x25_asy_open_tty+0x13d/0x490 at addr ffff8800a743efd0 [ 634.339558] Read of size 4 by task syzkaller_execu/8981 [ 634.340359] ============================================================================= [ 634.341598] BUG kmalloc-512 (Not tainted): kasan: bad access detected ... 
[ 634.405018] Call Trace: [ 634.405277] dump_stack (lib/dump_stack.c:52) [ 634.405775] print_trailer (mm/slub.c:655) [ 634.406361] object_err (mm/slub.c:662) [ 634.406824] kasan_report_error (mm/kasan/report.c:138 mm/kasan/report.c:236) [ 634.409581] __asan_report_load4_noabort (mm/kasan/report.c:279) [ 634.411355] x25_asy_open_tty (drivers/net/wan/x25_asy.c:559 (discriminator 1)) [ 634.413997] tty_ldisc_open.isra.2 (drivers/tty/tty_ldisc.c:447) [ 634.414549] tty_set_ldisc (drivers/tty/tty_ldisc.c:567) [ 634.415057] tty_ioctl (drivers/tty/tty_io.c:2646 drivers/tty/tty_io.c:2879) [ 634.423524] do_vfs_ioctl (fs/ioctl.c:43 fs/ioctl.c:607) [ 634.427491] SyS_ioctl (fs/ioctl.c:622 fs/ioctl.c:613) [ 634.427945] entry_SYSCALL_64_fastpath (arch/x86/entry/entry_64.S:188) Cc: Tilman Schmidt Cc: Sasha Levin Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman Change-Id: Ibed6feadfb9706d478f93feec3b240aecfc64af3 Bug: 30951112 --- drivers/tty/tty_ldisc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c index 629e3c865072..9bee25cfa0be 100644 --- a/drivers/tty/tty_ldisc.c +++ b/drivers/tty/tty_ldisc.c @@ -417,6 +417,10 @@ EXPORT_SYMBOL_GPL(tty_ldisc_flush); * they are not on hot paths so a little discipline won't do * any harm. * + * The line discipline-related tty_struct fields are reset to + * prevent the ldisc driver from re-using stale information for + * the new ldisc instance. + * * Locking: takes termios_rwsem */ @@ -425,6 +429,9 @@ static void tty_set_termios_ldisc(struct tty_struct *tty, int num) down_write(&tty->termios_rwsem); tty->termios.c_line = num; up_write(&tty->termios_rwsem); + + tty->disc_data = NULL; + tty->receive_room = 0; } /** From 65a2cd6d500d2620e5a5aa156b518d5fd25d492a Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Wed, 7 Sep 2016 17:39:42 -0700 Subject: [PATCH 341/607] Android: Fix build breakages. The IO latency histogram change broke allmodconfig and allnoconfig builds. This fixes those breakages. Change-Id: I9cdae655b40ed155468f3cef25cdb74bb56c4d3e Signed-off-by: Mohan Srinivasan --- block/blk-core.c | 2 ++ include/linux/mmc/host.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index 840713279726..64f3455a83ef 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3560,6 +3560,7 @@ blk_zero_latency_hist(struct io_latency_state *s) s->latency_reads_elems = 0; s->latency_writes_elems = 0; } +EXPORT_SYMBOL(blk_zero_latency_hist); ssize_t blk_latency_hist_show(struct io_latency_state *s, char *buf) @@ -3621,3 +3622,4 @@ blk_latency_hist_show(struct io_latency_state *s, char *buf) } return bytes_written; } +EXPORT_SYMBOL(blk_latency_hist_show); diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index e4862f7cdede..97b2b0b1f99d 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -380,8 +380,10 @@ struct mmc_host { } embedded_sdio_data; #endif +#ifdef CONFIG_BLOCK int latency_hist_enabled; struct io_latency_state io_lat_s; +#endif unsigned long private[0] ____cacheline_aligned; }; From 9f139d106df7b6a1805b0e4f3b234cedf25d2bbb Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 1 Jul 2016 00:39:35 -0700 Subject: [PATCH 342/607] UPSTREAM: block: fix use-after-free in sys_ioprio_get() (cherry picked from commit 8ba8682107ee2ca3347354e018865d8e1967c5f4) get_task_ioprio() accesses the task->io_context without holding the task lock and thus can race with exit_io_context(), leading to a use-after-free. 
The reproducer below hits this within a few seconds on my 4-core QEMU VM: #define _GNU_SOURCE #include #include #include #include int main(int argc, char **argv) { pid_t pid, child; long nproc, i; /* ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); */ syscall(SYS_ioprio_set, 1, 0, 0x6000); nproc = sysconf(_SC_NPROCESSORS_ONLN); for (i = 0; i < nproc; i++) { pid = fork(); assert(pid != -1); if (pid == 0) { for (;;) { pid = fork(); assert(pid != -1); if (pid == 0) { _exit(0); } else { child = wait(NULL); assert(child == pid); } } } pid = fork(); assert(pid != -1); if (pid == 0) { for (;;) { /* ioprio_get(IOPRIO_WHO_PGRP, 0); */ syscall(SYS_ioprio_get, 2, 0); } } } for (;;) { /* ioprio_get(IOPRIO_WHO_PGRP, 0); */ syscall(SYS_ioprio_get, 2, 0); } return 0; } This gets us KASAN dumps like this: [ 35.526914] ================================================================== [ 35.530009] BUG: KASAN: out-of-bounds in get_task_ioprio+0x7b/0x90 at addr ffff880066f34e6c [ 35.530009] Read of size 2 by task ioprio-gpf/363 [ 35.530009] ============================================================================= [ 35.530009] BUG blkdev_ioc (Not tainted): kasan: bad access detected [ 35.530009] ----------------------------------------------------------------------------- [ 35.530009] Disabling lock debugging due to kernel taint [ 35.530009] INFO: Allocated in create_task_io_context+0x2b/0x370 age=0 cpu=0 pid=360 [ 35.530009] ___slab_alloc+0x55d/0x5a0 [ 35.530009] __slab_alloc.isra.20+0x2b/0x40 [ 35.530009] kmem_cache_alloc_node+0x84/0x200 [ 35.530009] create_task_io_context+0x2b/0x370 [ 35.530009] get_task_io_context+0x92/0xb0 [ 35.530009] copy_process.part.8+0x5029/0x5660 [ 35.530009] _do_fork+0x155/0x7e0 [ 35.530009] SyS_clone+0x19/0x20 [ 35.530009] do_syscall_64+0x195/0x3a0 [ 35.530009] return_from_SYSCALL_64+0x0/0x6a [ 35.530009] INFO: Freed in put_io_context+0xe7/0x120 age=0 cpu=0 pid=1060 [ 35.530009] __slab_free+0x27b/0x3d0 [ 35.530009] kmem_cache_free+0x1fb/0x220 [ 35.530009] put_io_context+0xe7/0x120 [ 35.530009] put_io_context_active+0x238/0x380 [ 35.530009] exit_io_context+0x66/0x80 [ 35.530009] do_exit+0x158e/0x2b90 [ 35.530009] do_group_exit+0xe5/0x2b0 [ 35.530009] SyS_exit_group+0x1d/0x20 [ 35.530009] entry_SYSCALL_64_fastpath+0x1a/0xa4 [ 35.530009] INFO: Slab 0xffffea00019bcd00 objects=20 used=4 fp=0xffff880066f34ff0 flags=0x1fffe0000004080 [ 35.530009] INFO: Object 0xffff880066f34e58 @offset=3672 fp=0x0000000000000001 [ 35.530009] ================================================================== Fix it by grabbing the task lock while we poke at the io_context. 
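The window being closed: the exit path detaches and drops the io_context under task_lock(), so a reader that dereferences p->io_context without the same lock can follow a pointer that is concurrently being freed. A simplified sketch of the two sides (illustrative, not verbatim kernel code; the real exit path lives in block/blk-ioc.c):

  /* exit side: io_context is detached under task_lock(), then released */
  void exit_io_context_sketch(struct task_struct *task)
  {
          struct io_context *ioc;

          task_lock(task);
          ioc = task->io_context;
          task->io_context = NULL;
          task_unlock(task);

          put_io_context(ioc);            /* may free ioc */
  }

  /* reader side (get_task_ioprio): take the same lock before dereferencing */
  int get_task_ioprio_sketch(struct task_struct *p)
  {
          int ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM);

          task_lock(p);
          if (p->io_context)
                  ret = p->io_context->ioprio;
          task_unlock(p);

          return ret;
  }
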
Cc: stable@vger.kernel.org Reported-by: Dmitry Vyukov Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe Change-Id: I3f5858cc9a1b9d4124ae7a6578660dec219d2c57 Bug: 30946378 --- block/ioprio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/ioprio.c b/block/ioprio.c index cc7800e9eb44..01b8116298a1 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -150,8 +150,10 @@ static int get_task_ioprio(struct task_struct *p) if (ret) goto out; ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + task_lock(p); if (p->io_context) ret = p->io_context->ioprio; + task_unlock(p); out: return ret; } From d942dd3b7c156b005bf01c33af729ab783182f4c Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 19 Jan 2016 12:34:58 +0100 Subject: [PATCH 343/607] UPSTREAM: HID: core: prevent out-of-bound readings (cherry picked from commit 50220dead1650609206efe91f0cc116132d59b3f) Plugging a Logitech DJ receiver with KASAN activated raises a bunch of out-of-bound readings. The fields are allocated up to MAX_USAGE, meaning that potentially, we do not have enough fields to fit the incoming values. Add checks and silence KASAN. Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina Change-Id: Iaf25e882a6696884439d7091b5fbb0b350d893d3 Bug: 30951261 --- drivers/hid/hid-core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index c6f7a694f67a..fa5f81bbc95f 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1251,6 +1251,7 @@ static void hid_input_field(struct hid_device *hid, struct hid_field *field, /* Ignore report if ErrorRollOver */ if (!(field->flags & HID_MAIN_ITEM_VARIABLE) && value[n] >= min && value[n] <= max && + value[n] - min < field->maxusage && field->usage[value[n] - min].hid == HID_UP_KEYBOARD + 1) goto exit; } @@ -1263,11 +1264,13 @@ static void hid_input_field(struct hid_device *hid, struct hid_field *field, } if (field->value[n] >= min && field->value[n] <= max + && field->value[n] - min < field->maxusage && field->usage[field->value[n] - min].hid && search(value, field->value[n], count)) hid_process_event(hid, field, &field->usage[field->value[n] - min], 0, interrupt); if (value[n] >= min && value[n] <= max + && value[n] - min < field->maxusage && field->usage[value[n] - min].hid && search(field->value, value[n], count)) hid_process_event(hid, field, &field->usage[value[n] - min], 1, interrupt); From 1a4f17ec07cc4a938b6695e92e4f2b2ec7c3d527 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 6 Sep 2016 11:56:01 -0700 Subject: [PATCH 344/607] UPSTREAM: x86/uaccess: force copy_*_user() to be inlined As already done with __copy_*_user(), mark copy_*_user() as __always_inline. Without this, the checks for things like __builtin_const_p() won't work consistently in either hardened usercopy nor the recent adjustments for detecting usercopy overflows at compile time. 
The change in kernel text size is detectable, but very small: text data bss dec hex filename 12118735 5768608 14229504 32116847 1ea106f vmlinux.before 12120207 5768608 14229504 32118319 1ea162f vmlinux.after Signed-off-by: Kees Cook Change-Id: I284c85c2a782145f46655a91d4f83874c90eba61 (cherry picked from commit e6971009a95a74f28c58bbae415c40effad1226c) Signed-off-by: Sami Tolvanen --- arch/x86/include/asm/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index be439e246d91..dbe64f27280e 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -706,7 +706,7 @@ __copy_from_user_overflow(int size, unsigned long count) #endif -static inline unsigned long __must_check +static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { int sz = __compiletime_object_size(to); @@ -742,7 +742,7 @@ copy_from_user(void *to, const void __user *from, unsigned long n) return n; } -static inline unsigned long __must_check +static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { int sz = __compiletime_object_size(from); From 1fd70f71e8f428f0c191a6b86e72e700dbe34bed Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 31 Aug 2016 16:04:21 -0700 Subject: [PATCH 345/607] BACKPORT: usercopy: fold builtin_const check into inline function Instead of having each caller of check_object_size() need to remember to check for a const size parameter, move the check into check_object_size() itself. This actually matches the original implementation in PaX, though this commit cleans up the now-redundant builtin_const() calls in the various architectures. Signed-off-by: Kees Cook Change-Id: I348809399c10ffa051251866063be674d064b9ff (cherry picked from 81409e9e28058811c9ea865345e1753f8f677e44) Signed-off-by: Sami Tolvanen --- include/linux/thread_info.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 0ae29ff9ccfd..eded095fe81e 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -161,7 +161,8 @@ extern void __check_object_size(const void *ptr, unsigned long n, static inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { - __check_object_size(ptr, n, to_user); + if (!__builtin_constant_p(n)) + __check_object_size(ptr, n, to_user); } #else static inline void check_object_size(const void *ptr, unsigned long n, From 93584e7e1b4ea543ed04cff643186946196eaebb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Sep 2016 09:39:32 -0700 Subject: [PATCH 346/607] UPSTREAM: usercopy: force check_object_size() inline Just for good measure, make sure that check_object_size() is always inlined too, as already done for copy_*_user() and __copy_*_user(). 
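Taken together with the previous two usercopy changes, the idea can be sketched as follows (illustrative only; the caller below is hypothetical and not kernel code). __builtin_constant_p(n) can only fold to true once the wrapper body is expanded at a call site where n is a literal, which is exactly what __always_inline guarantees:

static __always_inline void check_object_size(const void *ptr, unsigned long n,
                                              bool to_user)
{
        /* compile-time-constant sizes are already covered by the
         * compile-time checks in copy_*_user(), so skip the runtime walk */
        if (!__builtin_constant_p(n))
                __check_object_size(ptr, n, to_user);
}

/* hypothetical caller: with a literal size the whole check compiles away */
static unsigned long copy_fixed_header(void *dst, const void __user *src)
{
        check_object_size(dst, 64, false);
        return __copy_from_user(dst, src, 64);
}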
Suggested-by: Linus Torvalds Signed-off-by: Kees Cook Change-Id: Ibfdf4790d03fe426e68d9a864c55a0d1bbfb7d61 (cherry picked from commit a85d6b8242dc78ef3f4542a0f979aebcbe77fc4e) Signed-off-by: Sami Tolvanen --- include/linux/thread_info.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index eded095fe81e..4cf89517783a 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -158,8 +158,8 @@ static inline int arch_within_stack_frames(const void * const stack, extern void __check_object_size(const void *ptr, unsigned long n, bool to_user); -static inline void check_object_size(const void *ptr, unsigned long n, - bool to_user) +static __always_inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) { if (!__builtin_constant_p(n)) __check_object_size(ptr, n, to_user); From 92e04f8a13a8bb56a7544e56978663dc2e4f82cc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Sep 2016 09:54:34 -0700 Subject: [PATCH 347/607] UPSTREAM: usercopy: remove page-spanning test for now A custom allocator without __GFP_COMP that copies to userspace has been found in vmw_execbuf_process[1], so this disables the page-span checker by placing it behind a CONFIG for future work where such things can be tracked down later. [1] https://bugzilla.redhat.com/show_bug.cgi?id=1373326 Reported-by: Vinson Lee Fixes: f5509cc18daa ("mm: Hardened usercopy") Signed-off-by: Kees Cook Change-Id: I4177c0fb943f14a5faf5c70f5e54bf782c316f43 (cherry picked from commit 8e1f74ea02cf4562404c48c6882214821552c13f) Signed-off-by: Sami Tolvanen --- mm/usercopy.c | 61 +++++++++++++++++++++++++++--------------------- security/Kconfig | 11 +++++++++ 2 files changed, 46 insertions(+), 26 deletions(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index f78015e8b1e5..b34996a3860b 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -135,30 +135,15 @@ static inline const char *check_bogus_address(const void *ptr, unsigned long n) return NULL; } -static inline const char *check_heap_object(const void *ptr, unsigned long n, - bool to_user) +/* Checks for allocs that are marked in some way as spanning multiple pages. */ +static inline const char *check_page_span(const void *ptr, unsigned long n, + struct page *page, bool to_user) { - struct page *page, *endpage; +#ifdef CONFIG_HARDENED_USERCOPY_PAGESPAN const void *end = ptr + n - 1; + struct page *endpage; bool is_reserved, is_cma; - /* - * Some architectures (arm64) return true for virt_addr_valid() on - * vmalloced addresses. Work around this by checking for vmalloc - * first. - */ - if (is_vmalloc_addr(ptr)) - return NULL; - - if (!virt_addr_valid(ptr)) - return NULL; - - page = virt_to_head_page(ptr); - - /* Check slab allocator for flags and size. */ - if (PageSlab(page)) - return __check_heap_object(ptr, n, page); - /* * Sometimes the kernel data regions are not marked Reserved (see * check below). And sometimes [_sdata,_edata) does not cover @@ -187,7 +172,7 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n, ((unsigned long)end & (unsigned long)PAGE_MASK))) return NULL; - /* Allow if start and end are inside the same compound page. */ + /* Allow if fully inside the same compound (__GFP_COMP) page. 
*/ endpage = virt_to_head_page(end); if (likely(endpage == page)) return NULL; @@ -200,20 +185,44 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n, is_reserved = PageReserved(page); is_cma = is_migrate_cma_page(page); if (!is_reserved && !is_cma) - goto reject; + return ""; for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) { page = virt_to_head_page(ptr); if (is_reserved && !PageReserved(page)) - goto reject; + return ""; if (is_cma && !is_migrate_cma_page(page)) - goto reject; + return ""; } +#endif return NULL; +} -reject: - return ""; +static inline const char *check_heap_object(const void *ptr, unsigned long n, + bool to_user) +{ + struct page *page; + + /* + * Some architectures (arm64) return true for virt_addr_valid() on + * vmalloced addresses. Work around this by checking for vmalloc + * first. + */ + if (is_vmalloc_addr(ptr)) + return NULL; + + if (!virt_addr_valid(ptr)) + return NULL; + + page = virt_to_head_page(ptr); + + /* Check slab allocator for flags and size. */ + if (PageSlab(page)) + return __check_heap_object(ptr, n, page); + + /* Verify object does not incorrectly span multiple pages. */ + return check_page_span(ptr, n, page, to_user); } /* diff --git a/security/Kconfig b/security/Kconfig index 2b42c225de28..3aa60791f84d 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -156,6 +156,17 @@ config HARDENED_USERCOPY or are part of the kernel text. This kills entire classes of heap overflow exploits and similar kernel memory exposures. +config HARDENED_USERCOPY_PAGESPAN + bool "Refuse to copy allocations that span multiple pages" + depends on HARDENED_USERCOPY + depends on !COMPILE_TEST + help + When a multi-page allocation is done without __GFP_COMP, + hardened usercopy will reject attempts to copy it. There are, + however, several cases of this in the kernel that have not all + been removed. This config is intended to be used only while + trying to find such users. + source security/selinux/Kconfig source security/smack/Kconfig source security/tomoyo/Kconfig From a38338f1cd595f6a66870ba46cc6a4ae7378c084 Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Wed, 31 Aug 2016 08:09:04 -0700 Subject: [PATCH 348/607] FROMLIST: pstore: drop pmsg bounce buffer (from https://lkml.org/lkml/2016/9/1/428) (cherry pick from android-3.10 commit b58133100b38f2bf83cad2d7097417a3a196ed0b) Removing a bounce buffer copy operation in the pmsg driver path is always better. We also gain in overall performance by not requesting a vmalloc on every write as this can cause precious RT tasks, such as user facing media operation, to stall while memory is being reclaimed. Added a write_buf_user to the pstore functions, a backup platform write_buf_user that uses the small buffer that is part of the instance, and implemented a ramoops write_buf_user that only supports PSTORE_TYPE_PMSG. 
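The core of the new path is streaming user data through the small per-instance buffer instead of allocating a bounce buffer for every write. A minimal user-space analogue of that pattern (purely illustrative; the names and the 16-byte staging size are made up):

#include <stdio.h>
#include <string.h>

#define STAGING_SZ 16

/* Stream a large source through a small fixed staging area instead of
 * copying the whole thing up front (what the old vmalloc() bounce buffer
 * did).  In the kernel the memcpy() below is a __copy_from_user() and the
 * sink is the persistent-ram zone. */
static size_t write_through_staging(const char *src, size_t len,
                                    int (*sink)(const char *, size_t))
{
        char staging[STAGING_SZ];
        size_t done = 0;

        while (done < len) {
                size_t c = len - done;

                if (c > sizeof(staging))
                        c = sizeof(staging);
                memcpy(staging, src + done, c);
                if (sink(staging, c) < 0)
                        break;
                done += c;
        }
        return done;
}

static int print_sink(const char *buf, size_t n)
{
        return fwrite(buf, 1, n, stdout) == n ? 0 : -1;
}

int main(void)
{
        const char msg[] = "a message longer than the 16-byte staging buffer\n";

        return write_through_staging(msg, sizeof(msg) - 1, print_sink) ==
               sizeof(msg) - 1 ? 0 : 1;
}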
Signed-off-by: Mark Salyzyn Bug: 31057326 Change-Id: I4cdee1cd31467aa3e6c605bce2fbd4de5b0f8caa --- fs/pstore/platform.c | 36 +++++++++++++++++++++++++++++ fs/pstore/pmsg.c | 35 +++++----------------------- fs/pstore/ram.c | 19 +++++++++++++++ fs/pstore/ram_core.c | 47 ++++++++++++++++++++++++++++++++++++-- include/linux/pstore.h | 11 ++++++--- include/linux/pstore_ram.h | 7 ++++-- 6 files changed, 119 insertions(+), 36 deletions(-) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 588461bb2dd4..40a0fe0a4e05 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -431,6 +431,40 @@ static int pstore_write_compat(enum pstore_type_id type, size, psi); } +static int pstore_write_buf_user_compat(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, + const char __user *buf, + bool compressed, size_t size, + struct pstore_info *psi) +{ + unsigned long flags = 0; + size_t i, bufsize = size; + long ret = 0; + + if (unlikely(!access_ok(VERIFY_READ, buf, size))) + return -EFAULT; + if (bufsize > psinfo->bufsize) + bufsize = psinfo->bufsize; + spin_lock_irqsave(&psinfo->buf_lock, flags); + for (i = 0; i < size; ) { + size_t c = min(size - i, bufsize); + + ret = __copy_from_user(psinfo->buf, buf + i, c); + if (unlikely(ret != 0)) { + ret = -EFAULT; + break; + } + ret = psi->write_buf(type, reason, id, part, psinfo->buf, + compressed, c, psi); + if (unlikely(ret < 0)) + break; + i += c; + } + spin_unlock_irqrestore(&psinfo->buf_lock, flags); + return unlikely(ret < 0) ? ret : size; +} + /* * platform specific persistent storage driver registers with * us here. If pstore is already mounted, call the platform @@ -453,6 +487,8 @@ int pstore_register(struct pstore_info *psi) if (!psi->write) psi->write = pstore_write_compat; + if (!psi->write_buf_user) + psi->write_buf_user = pstore_write_buf_user_compat; psinfo = psi; mutex_init(&psinfo->read_mutex); spin_unlock(&pstore_lock); diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c index 7de20cd3797f..78f6176c020f 100644 --- a/fs/pstore/pmsg.c +++ b/fs/pstore/pmsg.c @@ -19,48 +19,25 @@ #include "internal.h" static DEFINE_MUTEX(pmsg_lock); -#define PMSG_MAX_BOUNCE_BUFFER_SIZE (2*PAGE_SIZE) static ssize_t write_pmsg(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - size_t i, buffer_size; - char *buffer; + u64 id; + int ret; if (!count) return 0; + /* check outside lock, page in any data. write_buf_user also checks */ if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; - buffer_size = count; - if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE) - buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE; - buffer = vmalloc(buffer_size); - if (!buffer) - return -ENOMEM; - mutex_lock(&pmsg_lock); - for (i = 0; i < count; ) { - size_t c = min(count - i, buffer_size); - u64 id; - long ret; - - ret = __copy_from_user(buffer, buf + i, c); - if (unlikely(ret != 0)) { - mutex_unlock(&pmsg_lock); - vfree(buffer); - return -EFAULT; - } - psinfo->write_buf(PSTORE_TYPE_PMSG, 0, &id, 0, buffer, 0, c, - psinfo); - - i += c; - } - + ret = psinfo->write_buf_user(PSTORE_TYPE_PMSG, 0, &id, 0, buf, 0, count, + psinfo); mutex_unlock(&pmsg_lock); - vfree(buffer); - return count; + return ret ? 
ret : count; } static const struct file_operations pmsg_fops = { diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 414041342a99..5b10c2b4146c 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -331,6 +331,24 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, return 0; } +static int notrace ramoops_pstore_write_buf_user(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, + const char __user *buf, + bool compressed, size_t size, + struct pstore_info *psi) +{ + if (type == PSTORE_TYPE_PMSG) { + struct ramoops_context *cxt = psi->data; + + if (!cxt->mprz) + return -ENOMEM; + return persistent_ram_write_user(cxt->mprz, buf, size); + } + + return -EINVAL; +} + static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count, struct timespec time, struct pstore_info *psi) { @@ -369,6 +387,7 @@ static struct ramoops_context oops_cxt = { .open = ramoops_pstore_open, .read = ramoops_pstore_read, .write_buf = ramoops_pstore_write_buf, + .write_buf_user = ramoops_pstore_write_buf_user, .erase = ramoops_pstore_erase, }, }; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 76c3f80efdfa..aa9afe573155 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -17,15 +17,16 @@ #include #include #include -#include #include #include +#include #include #include +#include #include #include +#include #include -#include #include struct persistent_ram_buffer { @@ -303,6 +304,16 @@ static void notrace persistent_ram_update(struct persistent_ram_zone *prz, persistent_ram_update_ecc(prz, start, count); } +static int notrace persistent_ram_update_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int start, unsigned int count) +{ + struct persistent_ram_buffer *buffer = prz->buffer; + int ret = unlikely(__copy_from_user(buffer->data + start, s, count)) ? + -EFAULT : 0; + persistent_ram_update_ecc(prz, start, count); + return ret; +} + void persistent_ram_save_old(struct persistent_ram_zone *prz) { struct persistent_ram_buffer *buffer = prz->buffer; @@ -356,6 +367,38 @@ int notrace persistent_ram_write(struct persistent_ram_zone *prz, return count; } +int notrace persistent_ram_write_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int count) +{ + int rem, ret = 0, c = count; + size_t start; + + if (unlikely(!access_ok(VERIFY_READ, s, count))) + return -EFAULT; + if (unlikely(c > prz->buffer_size)) { + s += c - prz->buffer_size; + c = prz->buffer_size; + } + + buffer_size_add(prz, c); + + start = buffer_start_add(prz, c); + + rem = prz->buffer_size - start; + if (unlikely(rem < c)) { + ret = persistent_ram_update_user(prz, s, start, rem); + s += rem; + c -= rem; + start = 0; + } + if (likely(!ret)) + ret = persistent_ram_update_user(prz, s, start, c); + + persistent_ram_update_header_ecc(prz); + + return unlikely(ret) ? 
ret : count; +} + size_t persistent_ram_old_size(struct persistent_ram_zone *prz) { return prz->old_log_size; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 831479f8df8f..5cae2c6c90ad 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -22,12 +22,13 @@ #ifndef _LINUX_PSTORE_H #define _LINUX_PSTORE_H -#include +#include +#include #include #include -#include #include -#include +#include +#include /* types */ enum pstore_type_id { @@ -67,6 +68,10 @@ struct pstore_info { enum kmsg_dump_reason reason, u64 *id, unsigned int part, const char *buf, bool compressed, size_t size, struct pstore_info *psi); + int (*write_buf_user)(enum pstore_type_id type, + enum kmsg_dump_reason reason, u64 *id, + unsigned int part, const char __user *buf, + bool compressed, size_t size, struct pstore_info *psi); int (*erase)(enum pstore_type_id type, u64 id, int count, struct timespec time, struct pstore_info *psi); diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h index 712757f320a4..45ac5a0d29ee 100644 --- a/include/linux/pstore_ram.h +++ b/include/linux/pstore_ram.h @@ -17,11 +17,12 @@ #ifndef __LINUX_PSTORE_RAM_H__ #define __LINUX_PSTORE_RAM_H__ +#include #include +#include #include #include #include -#include struct persistent_ram_buffer; struct rs_control; @@ -59,7 +60,9 @@ void persistent_ram_free(struct persistent_ram_zone *prz); void persistent_ram_zap(struct persistent_ram_zone *prz); int persistent_ram_write(struct persistent_ram_zone *prz, const void *s, - unsigned int count); + unsigned int count); +int persistent_ram_write_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int count); void persistent_ram_save_old(struct persistent_ram_zone *prz); size_t persistent_ram_old_size(struct persistent_ram_zone *prz); From 81330959ee15a682fbefcdd4f5350a979d8e8fdb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 8 Sep 2016 16:43:21 -0700 Subject: [PATCH 349/607] cpufreq: Kconfig: Fixup incorrect selection by CPU_FREQ_DEFAULT_GOV_SCHED The CPU_FREQ_DEFAULT_GOV_SCHED option is incorrectly selecting CPU_FREQ_GOV_INTERACTIVE, when it should be selecting CPU_FREQ_GOV_SCHED. Signed-off-by: John Stultz --- drivers/cpufreq/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index d423f51d3276..a2d63d86bb70 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -105,7 +105,7 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE config CPU_FREQ_DEFAULT_GOV_SCHED bool "sched" - select CPU_FREQ_GOV_INTERACTIVE + select CPU_FREQ_GOV_SCHED help Use the CPUfreq governor 'sched' as default. This scales cpu frequency using CPU utilization estimates from the From 1c81f45725d2cee3c11e1d213eb30cc7f8feea0a Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 25 Aug 2016 11:06:37 +0530 Subject: [PATCH 350/607] sched/walt: include missing header for arm_timer_read_counter() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include clocksource/arm_arch_timer.h to fix implicit function declaration of ‘arch_timer_read_counter’ build error for ARCH=arm. 
Signed-off-by: Amit Pundir [jstultz: Cherry-picked from common/android-3.18] Signed-off-by: John Stultz --- kernel/sched/walt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index d9d09914ce30..07b7f84b37e2 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "sched.h" #include "walt.h" From 0f959e0a3261539eb14420d02a491188cc52f9e8 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 15 Mar 2016 12:14:49 +0100 Subject: [PATCH 351/607] BACKPORT: ALSA: usb-audio: Minor code cleanup in create_fixed_stream_quirk() (cherry picked from commit 902eb7fd1e4af3ac69b9b30f8373f118c92b9729) Just a minor code cleanup: unify the error paths. Signed-off-by: Takashi Iwai Change-Id: I8253a86235df2ac1258153c9e128fa158527567f Bug: 30952477 --- sound/usb/quirks.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index b6c0c8e3b450..a85b1953d83a 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -167,16 +167,12 @@ static int create_fixed_stream_quirk(struct snd_usb_audio *chip, stream = (fp->endpoint & USB_DIR_IN) ? SNDRV_PCM_STREAM_CAPTURE : SNDRV_PCM_STREAM_PLAYBACK; err = snd_usb_add_audio_stream(chip, stream, fp); - if (err < 0) { - kfree(fp); - kfree(rate_table); - return err; - } + if (err < 0) + goto error; if (fp->iface != get_iface_desc(&iface->altsetting[0])->bInterfaceNumber || fp->altset_idx >= iface->num_altsetting) { - kfree(fp); - kfree(rate_table); - return -EINVAL; + err = -EINVAL; + goto error; } alts = &iface->altsetting[fp->altset_idx]; altsd = get_iface_desc(alts); @@ -190,6 +186,11 @@ static int create_fixed_stream_quirk(struct snd_usb_audio *chip, snd_usb_init_pitch(chip, fp->iface, alts, fp); snd_usb_init_sample_rate(chip, fp->iface, alts, fp, fp->rate_max); return 0; + + error: + kfree(fp); + kfree(rate_table); + return err; } static int create_auto_pcm_quirk(struct snd_usb_audio *chip, From 571763be0e7f0aab3aa3eb02bb315b63539cc10c Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Thu, 31 Mar 2016 12:05:43 -0400 Subject: [PATCH 352/607] UPSTREAM: ALSA: usb-audio: Fix double-free in error paths after snd_usb_add_audio_stream() call (cherry picked from commit 836b34a935abc91e13e63053d0a83b24dfb5ea78) create_fixed_stream_quirk(), snd_usb_parse_audio_interface() and create_uaxx_quirk() functions allocate the audioformat object by themselves and free it upon error before returning. However, once the object is linked to a stream, it's freed again in snd_usb_audio_pcm_free(), thus it'll be double-freed, eventually resulting in a memory corruption. This patch fixes these failures in the error paths by unlinking the audioformat object before freeing it. 
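The rule the fix follows can be reduced to a short sketch (illustrative only; struct fmt, the list head and some_setup_fails() are made-up stand-ins, not the ALSA code): once an object has been linked into a list that somebody else will walk and free, every error path must unlink it before freeing it itself.

struct fmt {
        struct list_head list;
        /* ... */
};

static int add_fmt(struct list_head *head, struct fmt *fp)
{
        INIT_LIST_HEAD(&fp->list);      /* makes an early list_del() harmless */
        list_add_tail(&fp->list, head); /* from here the list also "owns" fp  */

        if (some_setup_fails()) {
                list_del(&fp->list);    /* unlink first, otherwise the later  */
                kfree(fp);              /* bulk free of the list hits fp again */
                return -EINVAL;
        }
        return 0;
}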
Based on a patch by Takashi Iwai [Note for stable backports: this patch requires the commit 902eb7fd1e4a ('ALSA: usb-audio: Minor code cleanup in create_fixed_stream_quirk()')] Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1283358 Reported-by: Ralf Spenneberg Cc: # see the note above Signed-off-by: Vladis Dronov Signed-off-by: Takashi Iwai Change-Id: I7073a17d8c99886d2f6ed7981892712ba7dd5873 Bug: 30952477 --- sound/usb/quirks.c | 4 ++++ sound/usb/stream.c | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index a85b1953d83a..79c1cccc606a 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -150,6 +150,7 @@ static int create_fixed_stream_quirk(struct snd_usb_audio *chip, usb_audio_err(chip, "cannot memdup\n"); return -ENOMEM; } + INIT_LIST_HEAD(&fp->list); if (fp->nr_rates > MAX_NR_RATES) { kfree(fp); return -EINVAL; @@ -188,6 +189,7 @@ static int create_fixed_stream_quirk(struct snd_usb_audio *chip, return 0; error: + list_del(&fp->list); /* unlink for avoiding double-free */ kfree(fp); kfree(rate_table); return err; @@ -463,6 +465,7 @@ static int create_uaxx_quirk(struct snd_usb_audio *chip, fp->ep_attr = get_endpoint(alts, 0)->bmAttributes; fp->datainterval = 0; fp->maxpacksize = le16_to_cpu(get_endpoint(alts, 0)->wMaxPacketSize); + INIT_LIST_HEAD(&fp->list); switch (fp->maxpacksize) { case 0x120: @@ -486,6 +489,7 @@ static int create_uaxx_quirk(struct snd_usb_audio *chip, ? SNDRV_PCM_STREAM_CAPTURE : SNDRV_PCM_STREAM_PLAYBACK; err = snd_usb_add_audio_stream(chip, stream, fp); if (err < 0) { + list_del(&fp->list); /* unlink for avoiding double-free */ kfree(fp); return err; } diff --git a/sound/usb/stream.c b/sound/usb/stream.c index 8ee14f2365e7..3b23102230c0 100644 --- a/sound/usb/stream.c +++ b/sound/usb/stream.c @@ -316,7 +316,9 @@ static struct snd_pcm_chmap_elem *convert_chmap(int channels, unsigned int bits, /* * add this endpoint to the chip instance. * if a stream with the same endpoint already exists, append to it. - * if not, create a new pcm stream. + * if not, create a new pcm stream. note, fp is added to the substream + * fmt_list and will be freed on the chip instance release. do not free + * fp or do remove it from the substream fmt_list to avoid double-free. */ int snd_usb_add_audio_stream(struct snd_usb_audio *chip, int stream, @@ -677,6 +679,7 @@ int snd_usb_parse_audio_interface(struct snd_usb_audio *chip, int iface_no) * (fp->maxpacksize & 0x7ff); fp->attributes = parse_uac_endpoint_attributes(chip, alts, protocol, iface_no); fp->clock = clock; + INIT_LIST_HEAD(&fp->list); /* some quirks for attributes here */ @@ -725,6 +728,7 @@ int snd_usb_parse_audio_interface(struct snd_usb_audio *chip, int iface_no) dev_dbg(&dev->dev, "%u:%d: add audio endpoint %#x\n", iface_no, altno, fp->endpoint); err = snd_usb_add_audio_stream(chip, stream, fp); if (err < 0) { + list_del(&fp->list); /* unlink for avoiding double-free */ kfree(fp->rate_table); kfree(fp->chmap); kfree(fp); From 6c7e03dde14e5074134f47d9ec3dea7de24e4e58 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 29 Aug 2016 19:48:17 +0530 Subject: [PATCH 353/607] DEBUG: cpufreq: fix cpu_capacity tracing build for non-smp systems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cpu curr capacity can only be traced for SMP systems. 
Non-SMP builds will fail with: drivers/cpufreq/cpufreq.c: In function ‘cpufreq_freq_transition_begin’: drivers/cpufreq/cpufreq.c:438:22: error: implicit declaration of function ‘capacity_curr_of’ [-Werror=implicit-function-declaration] trace_cpu_capacity(capacity_curr_of(cpu), cpu); ^ Fixes: Change-Id: Icd0930d11068fcb7d2b6a9a48e7ed974904e1081 ("DEBUG: sched,cpufreq: add cpu_capacity change tracepoint") Signed-off-by: Amit Pundir [jstultz: Cherry-picked from common/android-3.18] Signed-off-by: John Stultz --- drivers/cpufreq/cpufreq.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 7264820e6443..7b728143440d 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -29,7 +29,9 @@ #include #include #include +#ifdef CONFIG_SMP #include +#endif #include static LIST_HEAD(cpufreq_policy_list); @@ -474,7 +476,9 @@ static void cpufreq_notify_post_transition(struct cpufreq_policy *policy, void cpufreq_freq_transition_begin(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) { +#ifdef CONFIG_SMP int cpu; +#endif /* * Catch double invocations of _begin() which lead to self-deadlock. @@ -503,8 +507,10 @@ wait: spin_unlock(&policy->transition_lock); scale_freq_capacity(policy, freqs); +#ifdef CONFIG_SMP for_each_cpu(cpu, policy->cpus) trace_cpu_capacity(capacity_curr_of(cpu), cpu); +#endif cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE); } From aeb4a3112e516ef93069acdc04b65e9ae5882d28 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Wed, 24 Aug 2016 11:52:17 +0530 Subject: [PATCH 354/607] sched/walt: use do_div instead of division operator Use do_div() instead of "/" operator to fix undefined references to "__aeabi_uldivmod" build error for ARCH=arm. Also in TP_fast_assign(), along with do_div() usage, replace "," with ";" which would have resulted in a syntax error (!), because '#define TP_fast_assign(args...) args' would have stripped off the "," and left white space between these two assignments after CPP phase. 
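For reference, the pattern looks like this (a hypothetical helper, not the scheduler code): on 32-bit ARM a plain '/' on a u64 is compiled into a call to the libgcc routine __aeabi_uldivmod, which the kernel does not provide, hence the link failure; do_div() instead divides the u64 lvalue in place by a 32-bit divisor and returns the remainder using only in-kernel helpers.

#include <asm/div64.h>          /* do_div() */

static inline u32 scaled_util(u64 runnable_sum, u32 window)
{
        u64 util = runnable_sum << 10;  /* stands in for SCHED_LOAD_SHIFT */

        do_div(util, window);           /* util now holds the quotient */
        return (u32)util;
}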
Signed-off-by: Amit Pundir [jstultz: Cherry-picked from common/android-3.18] Signed-off-by: John Stultz --- include/trace/events/sched.h | 3 ++- kernel/sched/sched.h | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index c50310a7fd6d..dffaffab4bc8 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -1057,7 +1057,8 @@ TRACE_EVENT(walt_update_history, __entry->samples = samples; __entry->evt = evt; __entry->demand = p->ravg.demand; - __entry->walt_avg = (__entry->demand << 10) / walt_ravg_window, + __entry->walt_avg = (__entry->demand << 10); + do_div(__entry->walt_avg, walt_ravg_window); __entry->pelt_avg = p->se.avg.util_avg; memcpy(__entry->hist, p->ravg.sum_history, RAVG_HIST_SIZE_MAX * sizeof(u32)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4e2e44e3cc7b..06f15724a639 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1577,9 +1577,10 @@ static inline unsigned long __cpu_util(int cpu, int delta) unsigned long capacity = capacity_orig_of(cpu); #ifdef CONFIG_SCHED_WALT - if (!walt_disabled && sysctl_sched_use_walt_cpu_util) - util = (cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT) / - walt_ravg_window; + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { + util = cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT; + do_div(util, walt_ravg_window); + } #endif delta += util; if (delta < 0) From ad607cd620b8caeebf3ed3336fa80921c588b27f Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Wed, 4 May 2016 18:56:45 -0700 Subject: [PATCH 355/607] arm: Fix build error "conflicting types for 'scale_cpu_capacity'" Commit "arm: Update arch_scale_cpu_capacity() to reflect change to define" introduced a dependency on struct sched_domain in arch/arm/include/asm/topologoy.h, but that structure is only currently defined if CONFIG_CPU_FREQ is enabled, which causes include/linux/cpufreq.h to get pulled in which defines it. Include regardless of CONFIG_CPU_FREQ so struct sched_domain is always defined. Fixes: Change-Id: I372bd5e4c1e203428d72b18c8a806b06f3567ef6 ("arm: Update arch_scale_cpu_capacity() to reflect change to define") Signed-off-by: Steve Muckle Signed-off-by: Amit Pundir [jstultz: Cherry-picked from android-3.18] Signed-off-by: John Stultz --- arch/arm/include/asm/topology.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index e3e596cbb1a7..d06064120694 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -3,6 +3,7 @@ #ifdef CONFIG_ARM_CPU_TOPOLOGY +#include #include struct cputopo_arm { @@ -25,7 +26,6 @@ void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); #ifdef CONFIG_CPU_FREQ -#include #define arch_scale_freq_capacity cpufreq_scale_freq_capacity #endif #define arch_scale_cpu_capacity scale_cpu_capacity From 23eeab46c8550123a5e66b7b11862118dbb05d69 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Thu, 2 Jun 2016 12:18:08 +0000 Subject: [PATCH 356/607] arm: Fix #if/#ifdef typo in topology.c Probably a typo in arch/arm/kernel/topology.c This patch fixes the warning... 
arch/arm/kernel/topology.c: In function 'scale_cpu_capacity': arch/arm/kernel/topology.c:47:5: warning: "CONFIG_CPU_FREQ" is not defined [-Wundef] Fixes: Change-Id: If5e9e0ba8ff5a5d3236b373dbce8c72ea71b5e18 ("arm: Enable max freq invariant scheduler load-tracking and capacity support") Signed-off-by: Jon Medhurst Signed-off-by: Amit Pundir [jstultz: Cherry-picked from android-3.18] Signed-off-by: John Stultz --- arch/arm/kernel/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index f5941004efba..4f2c51ef162d 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -44,7 +44,7 @@ static DEFINE_PER_CPU(unsigned long, cpu_scale); unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) { -#if CONFIG_CPU_FREQ +#ifdef CONFIG_CPU_FREQ unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu); return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT; From fb3cce0136514c65c2d1a9d2285a357cfb96d54c Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 24 Aug 2016 11:02:29 +0100 Subject: [PATCH 357/607] FIXUP: sched/tune: add fixes missing from a previous patch The previous patch: e7ce26f - FIXUP: sched/tune: fix accounting for runnable tasks squashed together patches of a series to fix SchedTune's accounting issues. However, in the consolidation and cleanup of the series to merge in the Android Common Kernel, we somehow missed a couple of important changes: 1) the schedtune_exit function is not more required, because e7ce26f fixes accounting of exiting tasks in a different way 2) the schedtune_initialized flag was not set at the end of scheddtune_init_cgroup() thus failing to enabled SchedTune at boot. This patch thus is to be considered an integration of e7ce26f. Signed-off-by: Patrick Bellasi [jstultz: Cherry-picked from android-3.18. It should be noted that some of this patch was already applied in the 4.4 patches (schedtune_exit doesn't exist for example), but this patch just ensures things are totally synced up] Signed-off-by: John Stultz --- kernel/sched/tune.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index bd7f319ce53e..505d7b35b0e1 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -715,7 +715,7 @@ schedtune_css_free(struct cgroup_subsys_state *css) struct cgroup_subsys schedtune_cgrp_subsys = { .css_alloc = schedtune_css_alloc, .css_free = schedtune_css_free, -// .allow_attach = schedtune_allow_attach, + .allow_attach = schedtune_allow_attach, .can_attach = schedtune_can_attach, .cancel_attach = schedtune_cancel_attach, .legacy_cftypes = files, @@ -736,6 +736,8 @@ schedtune_init_cgroups(void) pr_info("schedtune: configured to support %d boost groups\n", BOOSTGROUPS_COUNT); + + schedtune_initialized = true; } #else /* CONFIG_CGROUP_SCHEDTUNE */ From a85045c03470e88923ed98bb5917b289fa752030 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 24 Aug 2016 11:27:27 +0100 Subject: [PATCH 358/607] FIXUP: sched/tune: update accouting before CPU capacity The SchedTune tasks accounting is used to identify how many tasks are in a boostgroup and thus to bias the selection of an OPP based on the maximum boost value of the active boostgroups. The current implementation however update the accounting after CPU capacity has been update. 
This has two effects: a) when we enqueue a boosted task, we do not immediately boost its CPU b) when we dequeue a boosted task, we can keep a CPU boosted even if not required This patch change the order of the SchedTune accounting and SchedFreq updated to ensure to have always an updated representation of which boosted tasks are runnable on a CPU before updating its capacity. Reported-by: Leo Yan Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c362be1df195..2cd3e2671f16 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4240,6 +4240,25 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP + /* + * Update SchedTune accounting. + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. + * + * We do it also in the case where we enqueue a throttled task; + * we could argue that a throttled task should not boost a CPU, + * however: + * a) properly implementing CPU boosting considering throttled + * tasks will increase a lot the complexity of the solution + * b) it's not easy to quantify the benefits introduced by + * such a more complex solution. + * Thus, for the time being we go for the simple solution and boost + * also for throttled RQs. + */ + schedtune_enqueue_task(p, cpu_of(rq)); + if (!se) { walt_inc_cumulative_runnable_avg(rq, p); if (!task_new && !rq->rd->overutilized && @@ -4259,9 +4278,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_capacity_of(cpu_of(rq)); } - /* Update SchedTune accouting */ - schedtune_enqueue_task(p, cpu_of(rq)); - #endif /* CONFIG_SMP */ hrtick_update(rq); } @@ -4327,6 +4343,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP + /* + * Update SchedTune accounting + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. + */ + schedtune_dequeue_task(p, cpu_of(rq)); + if (!se) { walt_dec_cumulative_runnable_avg(rq, p); @@ -4346,9 +4371,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } } - /* Update SchedTune accouting */ - schedtune_dequeue_task(p, cpu_of(rq)); - #endif /* CONFIG_SMP */ hrtick_update(rq); From 1510827f29ced43ac686c4625ce709e0dfb1292d Mon Sep 17 00:00:00 2001 From: Jungseung Lee Date: Tue, 29 Dec 2015 04:47:00 +0800 Subject: [PATCH 359/607] UPSTREAM: ARM: 8494/1: mm: Enable PXN when running non-LPAE kernel on LPAE processor The VMSA field of MMFR0 (bottom 4 bits) is incremented for each added feature. PXN is supported if the value is >= 4 and LPAE is supported if it is >= 5. In case a kernel with CONFIG_ARM_LPAE disabled is used on a processor that supports LPAE, we can still use PXN in short descriptors. So check for >= 4 not == 4. Signed-off-by: Jungseung Lee Acked-by: Catalin Marinas Signed-off-by: Ben Hutchings Signed-off-by: Russell King --- arch/arm/mm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 4867f5daf82c..de9f8921e407 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -572,7 +572,7 @@ static void __init build_mem_type_table(void) * in the Short-descriptor translation table format descriptors. 
*/ if (cpu_arch == CPU_ARCH_ARMv7 && - (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) == 4) { + (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) >= 4) { user_pmd_table |= PMD_PXNTABLE; } #endif From 702ea26d1f7d421a31831de98ea02e92c6782c20 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 19 Jul 2016 17:42:57 -0400 Subject: [PATCH 360/607] UPSTREAM: audit: fix a double fetch in audit_log_single_execve_arg() (cherry picked from commit 43761473c254b45883a64441dd0bc85a42f3645c) There is a double fetch problem in audit_log_single_execve_arg() where we first check the execve(2) argumnets for any "bad" characters which would require hex encoding and then re-fetch the arguments for logging in the audit record[1]. Of course this leaves a window of opportunity for an unsavory application to munge with the data. This patch reworks things by only fetching the argument data once[2] into a buffer where it is scanned and logged into the audit records(s). In addition to fixing the double fetch, this patch improves on the original code in a few other ways: better handling of large arguments which require encoding, stricter record length checking, and some performance improvements (completely unverified, but we got rid of some strlen() calls, that's got to be a good thing). As part of the development of this patch, I've also created a basic regression test for the audit-testsuite, the test can be tracked on GitHub at the following link: * https://github.com/linux-audit/audit-testsuite/issues/25 [1] If you pay careful attention, there is actually a triple fetch problem due to a strnlen_user() call at the top of the function. [2] This is a tiny white lie, we do make a call to strnlen_user() prior to fetching the argument data. I don't like it, but due to the way the audit record is structured we really have no choice unless we copy the entire argument at once (which would require a rather wasteful allocation). The good news is that with this patch the kernel no longer relies on this strnlen_user() value for anything beyond recording it in the log, we also update it with a trustworthy value whenever possible. Reported-by: Pengfei Wang Cc: Signed-off-by: Paul Moore Signed-off-by: Sasha Levin Change-Id: I10e979e94605e3cf8d461e3e521f8f9837228aa5 Bug: 30956807 --- kernel/auditsc.c | 332 +++++++++++++++++++++++------------------------ 1 file changed, 164 insertions(+), 168 deletions(-) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b86cc04959de..48f45987dc6c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include "audit.h" @@ -82,7 +83,8 @@ #define AUDITSC_SUCCESS 1 #define AUDITSC_FAILURE 2 -/* no execve audit message should be longer than this (userspace limits) */ +/* no execve audit message should be longer than this (userspace limits), + * see the note near the top of audit_log_execve_info() about this value */ #define MAX_EXECVE_AUDIT_LEN 7500 /* max length to print of cmdline/proctitle value during audit */ @@ -988,184 +990,178 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, return rc; } -/* - * to_send and len_sent accounting are very loose estimates. We aren't - * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being - * within about 500 bytes (next page boundary) - * - * why snprintf? an int is up to 12 digits long. if we just assumed when - * logging that a[%d]= was going to be 16 characters long we would be wasting - * space in every audit message. 
In one 7500 byte message we can log up to - * about 1000 min size arguments. That comes down to about 50% waste of space - * if we didn't do the snprintf to find out how long arg_num_len was. - */ -static int audit_log_single_execve_arg(struct audit_context *context, - struct audit_buffer **ab, - int arg_num, - size_t *len_sent, - const char __user *p, - char *buf) -{ - char arg_num_len_buf[12]; - const char __user *tmp_p = p; - /* how many digits are in arg_num? 5 is the length of ' a=""' */ - size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5; - size_t len, len_left, to_send; - size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; - unsigned int i, has_cntl = 0, too_long = 0; - int ret; - - /* strnlen_user includes the null we don't want to send */ - len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1; - - /* - * We just created this mm, if we can't find the strings - * we just copied into it something is _very_ wrong. Similar - * for strings that are too long, we should not have created - * any. - */ - if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) { - send_sig(SIGKILL, current, 0); - return -1; - } - - /* walk the whole argument looking for non-ascii chars */ - do { - if (len_left > MAX_EXECVE_AUDIT_LEN) - to_send = MAX_EXECVE_AUDIT_LEN; - else - to_send = len_left; - ret = copy_from_user(buf, tmp_p, to_send); - /* - * There is no reason for this copy to be short. We just - * copied them here, and the mm hasn't been exposed to user- - * space yet. - */ - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; - } - buf[to_send] = '\0'; - has_cntl = audit_string_contains_control(buf, to_send); - if (has_cntl) { - /* - * hex messages get logged as 2 bytes, so we can only - * send half as much in each message - */ - max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2; - break; - } - len_left -= to_send; - tmp_p += to_send; - } while (len_left > 0); - - len_left = len; - - if (len > max_execve_audit_len) - too_long = 1; - - /* rewalk the argument actually logging the message */ - for (i = 0; len_left > 0; i++) { - int room_left; - - if (len_left > max_execve_audit_len) - to_send = max_execve_audit_len; - else - to_send = len_left; - - /* do we have space left to send this argument in this ab? */ - room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent; - if (has_cntl) - room_left -= (to_send * 2); - else - room_left -= to_send; - if (room_left < 0) { - *len_sent = 0; - audit_log_end(*ab); - *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE); - if (!*ab) - return 0; - } - - /* - * first record needs to say how long the original string was - * so we can be sure nothing was lost. - */ - if ((i == 0) && (too_long)) - audit_log_format(*ab, " a%d_len=%zu", arg_num, - has_cntl ? 
2*len : len); - - /* - * normally arguments are small enough to fit and we already - * filled buf above when we checked for control characters - * so don't bother with another copy_from_user - */ - if (len >= max_execve_audit_len) - ret = copy_from_user(buf, p, to_send); - else - ret = 0; - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; - } - buf[to_send] = '\0'; - - /* actually log it */ - audit_log_format(*ab, " a%d", arg_num); - if (too_long) - audit_log_format(*ab, "[%d]", i); - audit_log_format(*ab, "="); - if (has_cntl) - audit_log_n_hex(*ab, buf, to_send); - else - audit_log_string(*ab, buf); - - p += to_send; - len_left -= to_send; - *len_sent += arg_num_len; - if (has_cntl) - *len_sent += to_send * 2; - else - *len_sent += to_send; - } - /* include the null we didn't log */ - return len + 1; -} - static void audit_log_execve_info(struct audit_context *context, struct audit_buffer **ab) { - int i, len; - size_t len_sent = 0; - const char __user *p; + long len_max; + long len_rem; + long len_full; + long len_buf; + long len_abuf; + long len_tmp; + bool require_data; + bool encode; + unsigned int iter; + unsigned int arg; + char *buf_head; char *buf; + const char __user *p = (const char __user *)current->mm->arg_start; - p = (const char __user *)current->mm->arg_start; + /* NOTE: this buffer needs to be large enough to hold all the non-arg + * data we put in the audit record for this argument (see the + * code below) ... at this point in time 96 is plenty */ + char abuf[96]; - audit_log_format(*ab, "argc=%d", context->execve.argc); + /* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the + * current value of 7500 is not as important as the fact that it + * is less than 8k, a setting of 7500 gives us plenty of wiggle + * room if we go over a little bit in the logging below */ + WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500); + len_max = MAX_EXECVE_AUDIT_LEN; - /* - * we need some kernel buffer to hold the userspace args. Just - * allocate one big one rather than allocating one of the right size - * for every single argument inside audit_log_single_execve_arg() - * should be <8k allocation so should be pretty safe. - */ - buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); - if (!buf) { + /* scratch buffer to hold the userspace args */ + buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); + if (!buf_head) { audit_panic("out of memory for argv string"); return; } + buf = buf_head; - for (i = 0; i < context->execve.argc; i++) { - len = audit_log_single_execve_arg(context, ab, i, - &len_sent, p, buf); - if (len <= 0) - break; - p += len; - } - kfree(buf); + audit_log_format(*ab, "argc=%d", context->execve.argc); + + len_rem = len_max; + len_buf = 0; + len_full = 0; + require_data = true; + encode = false; + iter = 0; + arg = 0; + do { + /* NOTE: we don't ever want to trust this value for anything + * serious, but the audit record format insists we + * provide an argument length for really long arguments, + * e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but + * to use strncpy_from_user() to obtain this value for + * recording in the log, although we don't use it + * anywhere here to avoid a double-fetch problem */ + if (len_full == 0) + len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1; + + /* read more data from userspace */ + if (require_data) { + /* can we make more room in the buffer? 
*/ + if (buf != buf_head) { + memmove(buf_head, buf, len_buf); + buf = buf_head; + } + + /* fetch as much as we can of the argument */ + len_tmp = strncpy_from_user(&buf_head[len_buf], p, + len_max - len_buf); + if (len_tmp == -EFAULT) { + /* unable to copy from userspace */ + send_sig(SIGKILL, current, 0); + goto out; + } else if (len_tmp == (len_max - len_buf)) { + /* buffer is not large enough */ + require_data = true; + /* NOTE: if we are going to span multiple + * buffers force the encoding so we stand + * a chance at a sane len_full value and + * consistent record encoding */ + encode = true; + len_full = len_full * 2; + p += len_tmp; + } else { + require_data = false; + if (!encode) + encode = audit_string_contains_control( + buf, len_tmp); + /* try to use a trusted value for len_full */ + if (len_full < len_max) + len_full = (encode ? + len_tmp * 2 : len_tmp); + p += len_tmp + 1; + } + len_buf += len_tmp; + buf_head[len_buf] = '\0'; + + /* length of the buffer in the audit record? */ + len_abuf = (encode ? len_buf * 2 : len_buf + 2); + } + + /* write as much as we can to the audit log */ + if (len_buf > 0) { + /* NOTE: some magic numbers here - basically if we + * can't fit a reasonable amount of data into the + * existing audit buffer, flush it and start with + * a new buffer */ + if ((sizeof(abuf) + 8) > len_rem) { + len_rem = len_max; + audit_log_end(*ab); + *ab = audit_log_start(context, + GFP_KERNEL, AUDIT_EXECVE); + if (!*ab) + goto out; + } + + /* create the non-arg portion of the arg record */ + len_tmp = 0; + if (require_data || (iter > 0) || + ((len_abuf + sizeof(abuf)) > len_rem)) { + if (iter == 0) { + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d_len=%lu", + arg, len_full); + } + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d[%d]=", arg, iter++); + } else + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d=", arg); + WARN_ON(len_tmp >= sizeof(abuf)); + abuf[sizeof(abuf) - 1] = '\0'; + + /* log the arg in the audit record */ + audit_log_format(*ab, "%s", abuf); + len_rem -= len_tmp; + len_tmp = len_buf; + if (encode) { + if (len_abuf > len_rem) + len_tmp = len_rem / 2; /* encoding */ + audit_log_n_hex(*ab, buf, len_tmp); + len_rem -= len_tmp * 2; + len_abuf -= len_tmp * 2; + } else { + if (len_abuf > len_rem) + len_tmp = len_rem - 2; /* quotes */ + audit_log_n_string(*ab, buf, len_tmp); + len_rem -= len_tmp + 2; + /* don't subtract the "2" because we still need + * to add quotes to the remaining string */ + len_abuf -= len_tmp; + } + len_buf -= len_tmp; + buf += len_tmp; + } + + /* ready to move to the next argument? */ + if ((len_buf == 0) && !require_data) { + arg++; + iter = 0; + len_full = 0; + require_data = true; + encode = false; + } + } while (arg < context->execve.argc); + + /* NOTE: the caller handles the final audit_log_end() call */ + +out: + kfree(buf_head); } static void show_special(struct audit_context *context, int *call_panic) From 8a627afb7a4ffda3d6f87d9bcaa19b839d247f5d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 23 Feb 2016 14:58:52 -0800 Subject: [PATCH 361/607] UPSTREAM: x86: fix SMAP in 32-bit environments (cherry picked from commit de9e478b9d49f3a0214310d921450cf5bb4a21e6) In commit 11f1a4b9755f ("x86: reorganize SMAP handling in user space accesses") I changed how the stac/clac instructions were generated around the user space accesses, which then made it possible to do batched accesses efficiently for user string copies etc. 
However, in doing so, I completely spaced out, and didn't even think about the 32-bit case. And nobody really even seemed to notice, because SMAP doesn't even exist until modern Skylake processors, and you'd have to be crazy to run 32-bit kernels on a modern CPU. Which brings us to Andy Lutomirski. He actually tested the 32-bit kernel on new hardware, and noticed that it doesn't work. My bad. The trivial fix is to add the required uaccess begin/end markers around the raw accesses in . I feel a bit bad about this patch, just because that header file really should be cleaned up to avoid all the duplicated code in it, and this commit just expands on the problem. But this just fixes the bug without any bigger cleanup surgery. Reported-and-tested-by: Andy Lutomirski Signed-off-by: Linus Torvalds Change-Id: Ic044ebfe658a13179984111d062ca3a0b1404110 Signed-off-by: Amit Pundir --- arch/x86/include/asm/uaccess_32.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 2ffc9188cc70..c3724acd0fe4 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -49,20 +49,28 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) switch (n) { case 1: + __uaccess_begin(); __put_user_size(*(u8 *)from, (u8 __user *)to, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __put_user_size(*(u16 *)from, (u16 __user *)to, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __put_user_size(*(u32 *)from, (u32 __user *)to, 4, ret, 4); + __uaccess_end(); return ret; case 8: + __uaccess_begin(); __put_user_size(*(u64 *)from, (u64 __user *)to, 8, ret, 8); + __uaccess_end(); return ret; } } @@ -104,13 +112,19 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) switch (n) { case 1: + __uaccess_begin(); __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); return ret; } } @@ -150,13 +164,19 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) switch (n) { case 1: + __uaccess_begin(); __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); return ret; } } @@ -172,13 +192,19 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to, switch (n) { case 1: + __uaccess_begin(); __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); return ret; } } From 5b432907fedfa6413bfe5971068853a786c4be16 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Nov 2015 13:26:19 +0000 Subject: [PATCH 362/607] UPSTREAM: arm64: mm: detect bad __create_mapping uses If a caller of __create_mapping provides a PA and VA which have different sub-page offsets, it is not clear which offset they expect to apply to the mapping, and is indicative of a bad caller. 
In some cases, the region we wish to map may validly have a sub-page offset in the physical and virtual addresses. For example, EFI runtime regions have 4K granularity, yet may be mapped by a 64K page kernel. So long as the physical and virtual offsets are the same, the region will be mapped at the expected VAs. Disallow calls with differing sub-page offsets, and WARN when they are encountered, so that we can detect and fix such cases. Cc: Laura Abbott Acked-by: Ard Biesheuvel Acked-by: Catalin Marinas Reviewed-by: Steve Capper Signed-off-by: Mark Rutland Signed-off-by: Will Deacon Bug: 30369029 Patchset: __create_mapping-fixes (cherry picked from commit cc5d2b3b95cdbb3fed4e38e667d17b9ac7250f7a) Signed-off-by: Jeff Vander Stoep Change-Id: I114a1265b10ff76daff385728d2125e618c313a1 --- arch/arm64/mm/mmu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9a8dc4a79d8b..52f03bb07429 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -251,6 +251,13 @@ static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, { unsigned long addr, length, end, next; + /* + * If the virtual and physical address don't have the same offset + * within a page, we cannot map the region as the caller expects. + */ + if (WARN_ON((phys ^ virt) & ~PAGE_MASK)) + return; + addr = virt & PAGE_MASK; length = PAGE_ALIGN(size + (virt & ~PAGE_MASK)); From ab8158d22085047097653a799fc9bae6e9920edf Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Nov 2015 13:26:20 +0000 Subject: [PATCH 363/607] UPSTREAM: arm64: mm: allow sections for unaligned bases Callees of __create_mapping may decide to create section mappings if sufficient low bits of the physical and virtual addresses they were passed are zero. While __create_mapping rounds the virtual base address down, it does not similarly round the physical base address down, and hence non-zero bits in the physical address can prevent use of a section mapping, even where a whole next-level table would be used instead. Round down the physical base address in __create_mapping to enable all callees to always create section mappings when such a mapping is possible. Cc: Laura Abbott Acked-by: Ard Biesheuvel Acked-by: Catalin Marinas Reviewed-by: Steve Capper Signed-off-by: Mark Rutland Signed-off-by: Will Deacon Bug: 30369029 Patchset: __create_mapping-fixes (cherry picked from commit 9c4e08a3022b6df90d31ef4007291faabfce5431) Signed-off-by: Jeff Vander Stoep Change-Id: Ic447350efdba3cd9f9e101c72183e04e39dd28d2 --- arch/arm64/mm/mmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 52f03bb07429..4c381f0e77a7 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -258,6 +258,7 @@ static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, if (WARN_ON((phys ^ virt) & ~PAGE_MASK)) return; + phys &= PAGE_MASK; addr = virt & PAGE_MASK; length = PAGE_ALIGN(size + (virt & ~PAGE_MASK)); From 7152164d7b5fdfe68423ca1568fc227e92d3d6f7 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Oct 2015 18:56:19 +0000 Subject: [PATCH 364/607] UPSTREAM: arm64: pgtable: implement pte_accessible() This patch implements the pte_accessible() macro, which can be used to test whether or not a given pte is a candidate for allocation in the TLB. 
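For context, this is roughly how the generic mm code consumes the macro (simplified from mm/pgtable-generic.c; not part of this patch): the TLB flush after clearing a PTE can be skipped when the old PTE could not have been cached in the TLB in the first place.

pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
                       pte_t *ptep)
{
        struct mm_struct *mm = (vma)->vm_mm;
        pte_t pte;

        pte = ptep_get_and_clear(mm, address, ptep);
        if (pte_accessible(mm, pte))    /* valid+young, or a flush is pending */
                flush_tlb_page(vma, address);
        return pte;
}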
Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit 76c714be0e5e60c935a53b31be58939510ba1d0f) Signed-off-by: Jeff Vander Stoep Change-Id: I249e2d15665870149dd17d1cdb3850008f5a56fd --- arch/arm64/include/asm/pgtable.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 63f52b55defe..824416a219e7 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -167,6 +167,16 @@ extern struct page *empty_zero_page; ((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) #define pte_valid_not_user(pte) \ ((pte_val(pte) & (PTE_VALID | PTE_USER)) == PTE_VALID) +#define pte_valid_young(pte) \ + ((pte_val(pte) & (PTE_VALID | PTE_AF)) == (PTE_VALID | PTE_AF)) + +/* + * Could the pte be present in the TLB? We must check mm_tlb_flush_pending + * so that we don't erroneously return false for pages that have been + * remapped as PROT_NONE but are yet to be flushed from the TLB. + */ +#define pte_accessible(mm, pte) \ + (mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid_young(pte)) static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot) { From 4277d7534ed7af7f103e20504dff3aead7799c99 Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Fri, 2 Sep 2016 09:37:24 +0100 Subject: [PATCH 365/607] UPSTREAM: brcmfmac: avoid potential stack overflow in brcmf_cfg80211_start_ap() commit ded89912156b1a47d940a0c954c43afbabd0c42c upstream User-space can choose to omit NL80211_ATTR_SSID and only provide raw IE TLV data. When doing so it can provide SSID IE with length exceeding the allowed size. The driver further processes this IE copying it into a local variable without checking the length. Hence stack can be corrupted and used as exploit. Cc: stable@vger.kernel.org # v4.4, v4.1 Reported-by: Daxing Guo Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo --- drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c index deb5f78dcacc..786fff3d7466 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c @@ -4099,7 +4099,7 @@ brcmf_cfg80211_start_ap(struct wiphy *wiphy, struct net_device *ndev, (u8 *)&settings->beacon.head[ie_offset], settings->beacon.head_len - ie_offset, WLAN_EID_SSID); - if (!ssid_ie) + if (!ssid_ie || ssid_ie->len > IEEE80211_MAX_SSID_LEN) return -EINVAL; memcpy(ssid_le.SSID, ssid_ie->data, ssid_ie->len); From 1a8cf46bbfcc2eac1db8cfcc9c3b0596b9ee1d66 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Thu, 14 Apr 2016 17:01:17 +0200 Subject: [PATCH 366/607] UPSTREAM: usb: gadget: f_fs: Fix use-after-free (cherry picked from commit 38740a5b87d53ceb89eb2c970150f6e94e00373a) When using asynchronous read or write operations on the USB endpoints the issuer of the IO request is notified by calling the ki_complete() callback of the submitted kiocb when the URB has been completed. Calling this ki_complete() callback will free kiocb. Make sure that the structure is no longer accessed beyond that point, otherwise undefined behaviour might occur. 
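The pattern behind the fix generalises: copy whatever you still need out of an object before invoking a completion hook that may free it. A minimal, self-contained illustration (all names here are hypothetical, not the gadget API):

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct io_request {
            bool wants_signal;
            void (*complete)(struct io_request *req);    /* may free req */
    };

    static void complete_and_free(struct io_request *req)
    {
            free(req);
    }

    static void finish_request(struct io_request *req)
    {
            /* Capture the flag first; req must not be touched after
             * complete() returns, because complete() may free it. */
            bool wants_signal = req->wants_signal;

            req->complete(req);

            if (wants_signal)
                    puts("signal the eventfd");
    }

    int main(void)
    {
            struct io_request *req = malloc(sizeof(*req));

            req->wants_signal = true;
            req->complete = complete_and_free;
            finish_request(req);
            return 0;
    }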
Fixes: 2e4c7553cd6f ("usb: gadget: f_fs: add aio support") Cc: # v3.15+ Signed-off-by: Lars-Peter Clausen Signed-off-by: Felipe Balbi Change-Id: I3c7b643f6440c4fb6160a57c1058523030b46a6c Bug: 30950866 --- drivers/usb/gadget/function/f_fs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index cf43e9e18368..79d895c2dd71 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -646,6 +646,7 @@ static void ffs_user_copy_worker(struct work_struct *work) work); int ret = io_data->req->status ? io_data->req->status : io_data->req->actual; + bool kiocb_has_eventfd = io_data->kiocb->ki_flags & IOCB_EVENTFD; if (io_data->read && ret > 0) { use_mm(io_data->mm); @@ -657,13 +658,11 @@ static void ffs_user_copy_worker(struct work_struct *work) io_data->kiocb->ki_complete(io_data->kiocb, ret, ret); - if (io_data->ffs->ffs_eventfd && - !(io_data->kiocb->ki_flags & IOCB_EVENTFD)) + if (io_data->ffs->ffs_eventfd && !kiocb_has_eventfd) eventfd_signal(io_data->ffs->ffs_eventfd, 1); usb_ep_free_request(io_data->ep, io_data->req); - io_data->kiocb->private = NULL; if (io_data->read) kfree(io_data->to_free); kfree(io_data->buf); From 6b51f3d20c52982bd71028509ebb3a4a2389850b Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 20 Nov 2015 17:59:10 +0800 Subject: [PATCH 367/607] UPSTREAM: arm64: add __init/__initdata section marker to some functions/variables These functions/variables are not needed after booting, so mark them as __init or __initdata. Signed-off-by: Jisheng Zhang Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit a7c61a3452d39078919f0e1f493ff966fb64f0db) Signed-off-by: Jeff Vander Stoep Change-Id: I50a3362e186750e139d2440d2c1e1d49ace896e1 --- arch/arm64/kernel/armv8_deprecated.c | 6 +++--- arch/arm64/kernel/cpufeature.c | 9 +++++---- arch/arm64/kernel/fpsimd.c | 2 +- arch/arm64/mm/dma-mapping.c | 4 ++-- arch/arm64/mm/init.c | 6 +++--- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index 937f5e58a4d3..3e01207917b1 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -62,7 +62,7 @@ struct insn_emulation { }; static LIST_HEAD(insn_emulation); -static int nr_insn_emulated; +static int nr_insn_emulated __initdata; static DEFINE_RAW_SPINLOCK(insn_emulation_lock); static void register_emulation_hooks(struct insn_emulation_ops *ops) @@ -173,7 +173,7 @@ static int update_insn_emulation_mode(struct insn_emulation *insn, return ret; } -static void register_insn_emulation(struct insn_emulation_ops *ops) +static void __init register_insn_emulation(struct insn_emulation_ops *ops) { unsigned long flags; struct insn_emulation *insn; @@ -237,7 +237,7 @@ static struct ctl_table ctl_abi[] = { { } }; -static void register_insn_emulation_sysctl(struct ctl_table *table) +static void __init register_insn_emulation_sysctl(struct ctl_table *table) { unsigned long flags; int i = 0; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 0669c63281ea..5c90aa490a2b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -684,7 +684,7 @@ static const struct arm64_cpu_capabilities arm64_hwcaps[] = { {}, }; -static void cap_set_hwcap(const struct arm64_cpu_capabilities *cap) +static void __init cap_set_hwcap(const struct arm64_cpu_capabilities *cap) { switch (cap->hwcap_type) { case 
CAP_HWCAP: @@ -729,7 +729,7 @@ static bool __maybe_unused cpus_have_hwcap(const struct arm64_cpu_capabilities * return rc; } -static void setup_cpu_hwcaps(void) +static void __init setup_cpu_hwcaps(void) { int i; const struct arm64_cpu_capabilities *hwcaps = arm64_hwcaps; @@ -758,7 +758,8 @@ void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, * Run through the enabled capabilities and enable() it on all active * CPUs */ -static void enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps) +static void __init +enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps) { int i; @@ -897,7 +898,7 @@ static inline void set_sys_caps_initialised(void) #endif /* CONFIG_HOTPLUG_CPU */ -static void setup_feature_capabilities(void) +static void __init setup_feature_capabilities(void) { update_cpu_capabilities(arm64_features, "detected feature:"); enable_cpu_capabilities(arm64_features); diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 4c46c54a3ad7..acc1afd5c749 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -289,7 +289,7 @@ static struct notifier_block fpsimd_cpu_pm_notifier_block = { .notifier_call = fpsimd_cpu_pm_notifier, }; -static void fpsimd_pm_init(void) +static void __init fpsimd_pm_init(void) { cpu_pm_register_notifier(&fpsimd_cpu_pm_notifier_block); } diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 6ad554b01e95..7b71b7c47adb 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -40,7 +40,7 @@ static pgprot_t __get_dma_pgprot(struct dma_attrs *attrs, pgprot_t prot, static struct gen_pool *atomic_pool; #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K -static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE; +static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; static int __init early_coherent_pool(char *p) { @@ -896,7 +896,7 @@ static int __iommu_attach_notifier(struct notifier_block *nb, return 0; } -static int register_iommu_dma_ops_notifier(struct bus_type *bus) +static int __init register_iommu_dma_ops_notifier(struct bus_type *bus) { struct notifier_block *nb = kzalloc(sizeof(*nb), GFP_KERNEL); int ret; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 325f89ff897e..d43715a14383 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -71,7 +71,7 @@ early_param("initrd", early_initrd); * currently assumes that for memory starting above 4G, 32-bit devices will * use a DMA offset. */ -static phys_addr_t max_zone_dma_phys(void) +static phys_addr_t __init max_zone_dma_phys(void) { phys_addr_t offset = memblock_start_of_DRAM() & GENMASK_ULL(63, 32); return min(offset + (1ULL << 32), memblock_end_of_DRAM()); @@ -128,11 +128,11 @@ EXPORT_SYMBOL(pfn_valid); #endif #ifndef CONFIG_SPARSEMEM -static void arm64_memory_present(void) +static void __init arm64_memory_present(void) { } #else -static void arm64_memory_present(void) +static void __init arm64_memory_present(void) { struct memblock_region *reg; From f65f45205e5bae2464be051cb6deca598f62f3e8 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 2 Dec 2015 14:00:10 +0000 Subject: [PATCH 368/607] UPSTREAM: arm64: fix COMPAT_SHMLBA definition for large pages ARM glibc uses (4 * __getpagesize()) for SHMLBA, which is correct for 4KB pages and works fine for 64KB pages, but the kernel uses a hardcoded 16KB that is too small for 64KB page based kernels. This changes the definition to what user space sees when using 64KB pages. 
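For reference, the arithmetic behind the change (standalone; the old constant and the two page sizes are taken from the description above):

    #include <stdio.h>

    int main(void)
    {
            unsigned long old_shmlba = 0x4000;              /* old hardcoded 16KB */
            unsigned long page_sizes[] = { 4096, 65536 };   /* 4K- and 64K-page kernels */

            for (int i = 0; i < 2; i++)
                    printf("PAGE_SIZE=%lu: 4*PAGE_SIZE=%lu (old constant %lu)\n",
                           page_sizes[i], 4 * page_sizes[i], old_shmlba);
            return 0;
    }

With 4K pages the new definition still works out to 16KB, so nothing changes there; with 64K pages it becomes 256KB, matching what 32-bit glibc computes.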
Acked-by: Arnd Bergmann Signed-off-by: Yury Norov Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit b9b7aebb42d1b1392f3111de61136bb6cf3aae3f) Signed-off-by: Jeff Vander Stoep Change-Id: I698814d005a28c7fe3963cded9756f88660d4aa0 --- arch/arm64/include/asm/shmparam.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/shmparam.h b/arch/arm64/include/asm/shmparam.h index 4df608a8459e..e368a55ebd22 100644 --- a/arch/arm64/include/asm/shmparam.h +++ b/arch/arm64/include/asm/shmparam.h @@ -21,7 +21,7 @@ * alignment value. Since we don't have aliasing D-caches, the rest of * the time we can safely use PAGE_SIZE. */ -#define COMPAT_SHMLBA 0x4000 +#define COMPAT_SHMLBA (4 * PAGE_SIZE) #include From 9b49bf531ba02bcdbc6d4182a179a49dd30349d9 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 23 Nov 2015 15:12:59 +0000 Subject: [PATCH 369/607] UPSTREAM: arm64: enable HAVE_IRQ_TIME_ACCOUNTING arm64 relies on the arm_arch_timer for sched_clock, so we can select HAVE_IRQ_TIME_ACCOUNTING and have the core sched-clock code enable the feature at runtime based on the rate. Reported-by: Mario Smarduch Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit 24da208db32ee1e4757ceaba898c47add8e5361e) Signed-off-by: Jeff Vander Stoep Change-Id: I849e010459dbbde0ac0d44d665dadcea9f8bf12d --- Documentation/features/time/irq-time-acct/arch-support.txt | 2 +- arch/arm64/Kconfig | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/features/time/irq-time-acct/arch-support.txt b/Documentation/features/time/irq-time-acct/arch-support.txt index e63316239938..4199ffecc0ff 100644 --- a/Documentation/features/time/irq-time-acct/arch-support.txt +++ b/Documentation/features/time/irq-time-acct/arch-support.txt @@ -9,7 +9,7 @@ | alpha: | .. | | arc: | TODO | | arm: | ok | - | arm64: | .. | + | arm64: | ok | | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3ad9d9ea374b..92f234a5579b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -73,6 +73,7 @@ config ARM64 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_GENERIC_DMA_COHERENT select HAVE_HW_BREAKPOINT if PERF_EVENTS + select HAVE_IRQ_TIME_ACCOUNTING select HAVE_MEMBLOCK select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS From 5bb08d2a552a3a7ba1e84527ea7987018fcc2f69 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 19 Nov 2015 17:48:31 +0000 Subject: [PATCH 370/607] UPSTREAM: arm64: spinlock: serialise spin_unlock_wait against concurrent lockers Boqun Feng reported a rather nasty ordering issue with spin_unlock_wait on architectures implementing spin_lock with LL/SC sequences and acquire semantics: | CPU 1 CPU 2 CPU 3 | ================== ==================== ============== | spin_unlock(&lock); | spin_lock(&lock): | r1 = *lock; // r1 == 0; | o = READ_ONCE(object); // reordered here | object = NULL; | smp_mb(); | spin_unlock_wait(&lock); | *lock = 1; | smp_mb(); | o->dead = true; | if (o) // true | BUG_ON(o->dead); // true!! The crux of the problem is that spin_unlock_wait(&lock) can return on CPU 1 whilst CPU 2 is in the process of taking the lock. 
This can be resolved by upgrading spin_unlock_wait to a LOCK operation, forcing it to serialise against a concurrent locker and giving it acquire semantics in the process (although it is not at all clear whether this is needed - different callers seem to assume different things about the barrier semantics and architectures are similarly disjoint in their implementations of the macro). This patch implements spin_unlock_wait using an LL/SC sequence with acquire semantics on arm64. For v8.1 systems with the LSE atomics, the exclusive writeback is omitted, since the spin_lock operation is indivisible and no intermediate state can be observed. Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit d86b8da04dfa4771a68bdbad6c424d40f22f0d14) Signed-off-by: Jeff Vander Stoep Change-Id: I27d88458c99dfd475d0326bd1d408ec3e575a7dd --- arch/arm64/include/asm/spinlock.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index c85e96d174a5..fc9682bfe002 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -26,9 +26,28 @@ * The memory barriers are implicit with the load-acquire and store-release * instructions. */ +static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) +{ + unsigned int tmp; + arch_spinlock_t lockval; -#define arch_spin_unlock_wait(lock) \ - do { while (arch_spin_is_locked(lock)) cpu_relax(); } while (0) + asm volatile( +" sevl\n" +"1: wfe\n" +"2: ldaxr %w0, %2\n" +" eor %w1, %w0, %w0, ror #16\n" +" cbnz %w1, 1b\n" + ARM64_LSE_ATOMIC_INSN( + /* LL/SC */ +" stxr %w1, %w0, %2\n" +" cbnz %w1, 2b\n", /* Serialise against any concurrent lockers */ + /* LSE atomics */ +" nop\n" +" nop\n") + : "=&r" (lockval), "=&r" (tmp), "+Q" (*lock) + : + : "memory"); +} #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) From b42c14a531b4b887ffca279ad12eb844a99408ae Mon Sep 17 00:00:00 2001 From: Li Bin Date: Fri, 4 Dec 2015 11:38:39 +0800 Subject: [PATCH 371/607] UPSTREAM: arm64: ftrace: stop using kstop_machine to enable/disable tracing For ftrace on arm64, kstop_machine which is hugely disruptive to a running system is not needed to convert nops to ftrace calls or back, because that to be modified instrucions, that NOP, B or BL, are all safe instructions which called "concurrent modification and execution of instructions", that can be executed by one thread of execution as they are being modified by another thread of execution without requiring explicit synchronization. 
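A rough sketch of the resulting patch path, assuming the arm64 insn helpers behave as named here (error handling trimmed; this is not a verbatim copy of ftrace_modify_code()):

    static int modify_ftrace_site(unsigned long pc, u32 old, u32 new, bool validate)
    {
            u32 replaced;

            if (validate) {
                    /* Read back what is currently there and check it is the
                     * instruction we think we are replacing. */
                    if (aarch64_insn_read((void *)pc, &replaced))
                            return -EFAULT;
                    if (replaced != old)
                            return -EINVAL;
            }

            /* NOP <-> B/BL rewrites are safe against concurrent execution,
             * so a plain write with cache maintenance is enough and no
             * stop_machine() is required around it. */
            if (aarch64_insn_patch_text_nosync((void *)pc, new))
                    return -EPERM;

            return 0;
    }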
Signed-off-by: Li Bin Reviewed-by: Steven Rostedt Signed-off-by: Will Deacon Bug: 30369029 Patchset: arm64-ftrace (cherry picked from commit 81a6a146e88eca5d6726569779778d61489d85aa) Signed-off-by: Jeff Vander Stoep Change-Id: I54e2c0d49bd68f9547bd9f0da8b7520e02bb0714 --- arch/arm64/kernel/ftrace.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index c851be795080..9669b331a23b 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -93,6 +93,11 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, return ftrace_modify_code(pc, old, new, true); } +void arch_ftrace_update_code(int command) +{ + ftrace_modify_all_code(command); +} + int __init ftrace_dyn_arch_init(void) { return 0; From 2c21fce80b6fab7304d2081f2b1157033e30ad50 Mon Sep 17 00:00:00 2001 From: Li Bin Date: Fri, 4 Dec 2015 11:38:40 +0800 Subject: [PATCH 372/607] UPSTREAM: arm64: ftrace: fix the comments for ftrace_modify_code There is no need to worry about module and __init text disappearing case, because that ftrace has a module notifier that is called when a module is being unloaded and before the text goes away and this code grabs the ftrace_lock mutex and removes the module functions from the ftrace list, such that it will no longer do any modifications to that module's text, the update to make functions be traced or not is done under the ftrace_lock mutex as well. And by now, __init section codes should not been modified by ftrace, because it is black listed in recordmcount.c and ignored by ftrace. Suggested-by: Steven Rostedt Signed-off-by: Li Bin Signed-off-by: Will Deacon Bug: 30369029 Patchset: arm64-ftrace (cherry picked from commit 004ab584e028093996cf5b8e220b8bc50c5111cf) Signed-off-by: Jeff Vander Stoep Change-Id: I69df7eddbf9e17031920b950312399dc4d36c09e --- arch/arm64/kernel/ftrace.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 9669b331a23b..8f7005bc35bd 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -29,12 +29,11 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new, /* * Note: - * Due to modules and __init, code can disappear and change, - * we need to protect against faulting as well as code changing. - * We do this by aarch64_insn_*() which use the probe_kernel_*(). - * - * No lock is held here because all the modifications are run - * through stop_machine(). + * We are paranoid about modifying text, as if a bug were to happen, it + * could cause us to read or write to someplace that could cause harm. + * Carefully read and modify the code with aarch64_insn_*() which uses + * probe_kernel_*(), and make sure what we read is what we expected it + * to be before modifying it. 
*/ if (validate) { if (aarch64_insn_read((void *)pc, &replaced)) From 611a16d0e5fe5f9f77c62551d94770e2cc825eb5 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 4 Dec 2015 12:42:29 +0000 Subject: [PATCH 373/607] UPSTREAM: arm64: Add trace_hardirqs_off annotation in ret_to_user When a kernel is built with CONFIG_TRACE_IRQFLAGS the following warning is produced when entering userspace for the first time: WARNING: at /work/Linux/linux-2.6-aarch64/kernel/locking/lockdep.c:3519 Modules linked in: CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc3+ #639 Hardware name: Juno (DT) task: ffffffc9768a0000 ti: ffffffc9768a8000 task.ti: ffffffc9768a8000 PC is at check_flags.part.22+0x19c/0x1a8 LR is at check_flags.part.22+0x19c/0x1a8 pc : [] lr : [] pstate: 600001c5 sp : ffffffc9768abe10 x29: ffffffc9768abe10 x28: ffffffc9768a8000 x27: 0000000000000000 x26: 0000000000000001 x25: 00000000000000a6 x24: ffffffc00064be6c x23: ffffffc0009f249e x22: ffffffc9768a0000 x21: ffffffc97fea5480 x20: 00000000000001c0 x19: ffffffc00169a000 x18: 0000005558cc7b58 x17: 0000007fb78e3180 x16: 0000005558d2e238 x15: ffffffffffffffff x14: 0ffffffffffffffd x13: 0000000000000008 x12: 0101010101010101 x11: 7f7f7f7f7f7f7f7f x10: fefefefefefeff63 x9 : 7f7f7f7f7f7f7f7f x8 : 6e655f7371726964 x7 : 0000000000000001 x6 : ffffffc0001079c4 x5 : 0000000000000000 x4 : 0000000000000001 x3 : ffffffc001698438 x2 : 0000000000000000 x1 : ffffffc9768a0000 x0 : 000000000000002e Call trace: [] check_flags.part.22+0x19c/0x1a8 [] lock_is_held+0x80/0x98 [] __schedule+0x404/0x730 [] schedule+0x44/0xb8 [] ret_to_user+0x0/0x24 possible reason: unannotated irqs-off. irq event stamp: 502169 hardirqs last enabled at (502169): [] el0_irq_naked+0x1c/0x24 hardirqs last disabled at (502167): [] __do_softirq+0x17c/0x298 softirqs last enabled at (502168): [] __do_softirq+0x1fc/0x298 softirqs last disabled at (502143): [] irq_exit+0xa0/0xf0 This happens because we disable interrupts in ret_to_user before calling schedule() in work_resched. This patch adds the necessary trace_hardirqs_off annotation. Signed-off-by: Catalin Marinas Reported-by: Mark Rutland Cc: Will Deacon Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit db3899a6477a4dccd26cbfb7f408b6be2cc068e0) Signed-off-by: Jeff Vander Stoep Change-Id: I868c6c64c548f12156467959e2b8df09caae282f --- arch/arm64/kernel/entry.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 7ed3d75f6304..e5b25389c48f 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -634,6 +634,9 @@ work_pending: bl do_notify_resume b ret_to_user work_resched: +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_off // the IRQs are off here, inform the tracing code +#endif bl schedule /* From b7547d55d7a578f62053c86f475eabd68282e6aa Mon Sep 17 00:00:00 2001 From: Jungseok Lee Date: Fri, 4 Dec 2015 11:02:25 +0000 Subject: [PATCH 374/607] UPSTREAM: arm64: Store struct thread_info in sp_el0 There is need for figuring out how to manage struct thread_info data when IRQ stack is introduced. struct thread_info information should be copied to IRQ stack under the current thread_info calculation logic whenever context switching is invoked. This is too expensive to keep supporting the approach. Instead, this patch pays attention to sp_el0 which is an unused scratch register in EL1 context. 
sp_el0 utilization not only simplifies the management, but also prevents text section size from being increased largely due to static allocated IRQ stack as removing masking operation using THREAD_SIZE in many places. Reviewed-by: Catalin Marinas Signed-off-by: Jungseok Lee Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit 6cdf9c7ca687e01840d0215437620a20263012fc) Signed-off-by: Jeff Vander Stoep Change-Id: I53c9f44a0772b8649f302a65a7a6519d8eebcb91 --- arch/arm64/include/asm/thread_info.h | 10 ++++++++-- arch/arm64/kernel/entry.S | 15 ++++++++++++--- arch/arm64/kernel/head.S | 5 +++++ arch/arm64/kernel/sleep.S | 3 +++ 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 90c7ff233735..abd64bd1f6d9 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -73,10 +73,16 @@ register unsigned long current_stack_pointer asm ("sp"); */ static inline struct thread_info *current_thread_info(void) __attribute_const__; +/* + * struct thread_info can be accessed directly via sp_el0. + */ static inline struct thread_info *current_thread_info(void) { - return (struct thread_info *) - (current_stack_pointer & ~(THREAD_SIZE - 1)); + unsigned long sp_el0; + + asm ("mrs %0, sp_el0" : "=r" (sp_el0)); + + return (struct thread_info *)sp_el0; } #define thread_saved_pc(tsk) \ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index e5b25389c48f..245fa6837880 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -88,7 +88,8 @@ .if \el == 0 mrs x21, sp_el0 - get_thread_info tsk // Ensure MDSCR_EL1.SS is clear, + mov tsk, sp + and tsk, tsk, #~(THREAD_SIZE - 1) // Ensure MDSCR_EL1.SS is clear, ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug disable_step_tsk x19, x20 // exceptions when scheduling. .else @@ -107,6 +108,13 @@ str x21, [sp, #S_SYSCALLNO] .endif + /* + * Set sp_el0 to current thread_info. 
+ */ + .if \el == 0 + msr sp_el0, tsk + .endif + /* * Registers that may be useful after this macro is invoked: * @@ -164,8 +172,7 @@ alternative_endif .endm .macro get_thread_info, rd - mov \rd, sp - and \rd, \rd, #~(THREAD_SIZE - 1) // top of stack + mrs \rd, sp_el0 .endm /* @@ -599,6 +606,8 @@ ENTRY(cpu_switch_to) ldp x29, x9, [x8], #16 ldr lr, [x8] mov sp, x9 + and x9, x9, #~(THREAD_SIZE - 1) + msr sp_el0, x9 ret ENDPROC(cpu_switch_to) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 23cfc08fc8ba..b363f340f2c7 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -424,6 +424,9 @@ __mmap_switched: b 1b 2: adr_l sp, initial_sp, x4 + mov x4, sp + and x4, x4, #~(THREAD_SIZE - 1) + msr sp_el0, x4 // Save thread_info str_l x21, __fdt_pointer, x5 // Save FDT pointer str_l x24, memstart_addr, x6 // Save PHYS_OFFSET mov x29, #0 @@ -606,6 +609,8 @@ ENDPROC(secondary_startup) ENTRY(__secondary_switched) ldr x0, [x21] // get secondary_data.stack mov sp, x0 + and x0, x0, #~(THREAD_SIZE - 1) + msr sp_el0, x0 // save thread_info mov x29, #0 b secondary_start_kernel ENDPROC(__secondary_switched) diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index f586f7c875e2..e33fe33876ab 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -173,6 +173,9 @@ ENTRY(cpu_resume) /* load physical address of identity map page table in x1 */ adrp x1, idmap_pg_dir mov sp, x2 + /* save thread_info */ + and x2, x2, #~(THREAD_SIZE - 1) + msr sp_el0, x2 /* * cpu_do_resume expects x0 to contain context physical address * pointer and x1 to contain physical address of 1:1 page tables From 52494a8bb8a274cb31bb241bb92bd2617855f9f8 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Fri, 4 Dec 2015 11:02:26 +0000 Subject: [PATCH 375/607] UPSTREAM: arm64: Modify stack trace and dump for use with irq_stack This patch allows unwind_frame() to traverse from interrupt stack to task stack correctly. It requires data from a dummy stack frame, created during irq_stack_entry(), added by a later patch. A similar approach is taken to modify dump_backtrace(), which expects to find struct pt_regs underneath any call to functions marked __exception. When on an irq_stack, the struct pt_regs is stored on the old task stack, the location of which is stored in the dummy stack frame. 
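For background, a plain AArch64 unwind step (prior to this series) just follows the frame record at fp, which holds the saved x29/x30 pair; a simplified sketch:

    static int unwind_step(struct stackframe *frame)
    {
            unsigned long fp = frame->fp;

            if (fp & 0xf)                   /* frame records are 16-byte aligned */
                    return -EINVAL;

            frame->sp = fp + 0x10;
            frame->fp = *(unsigned long *)(fp);         /* caller's saved x29 */
            frame->pc = *(unsigned long *)(fp + 8);     /* caller's saved x30 */

            return 0;
    }

The change below extends that walk so that, when it reaches the dummy frame at the top of the irq_stack, the saved task stack pointer is read out and the walk carries on down the task stack.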
Reviewed-by: Catalin Marinas Signed-off-by: AKASHI Takahiro [james.morse: merged two patches, reworked for per_cpu irq_stacks, and no alignment guarantees, added irq_stack definitions] Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit 132cd887b5c54758d04bf25c52fa48f45e843a30) Signed-off-by: Jeff Vander Stoep Change-Id: I60b29291620a71ab7b6564730299d29f41ceb199 --- arch/arm64/include/asm/irq.h | 32 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/irq.c | 3 +++ arch/arm64/kernel/stacktrace.c | 29 +++++++++++++++++++++++++++-- arch/arm64/kernel/traps.c | 14 +++++++++++++- 4 files changed, 75 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index 8e8d30684392..e2f3f135a3bc 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -1,10 +1,32 @@ #ifndef __ASM_IRQ_H #define __ASM_IRQ_H +#define IRQ_STACK_SIZE THREAD_SIZE +#define IRQ_STACK_START_SP THREAD_START_SP + +#ifndef __ASSEMBLER__ + +#include + #include +#include struct pt_regs; +DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); + +/* + * The highest address on the stack, and the first to be used. Used to + * find the dummy-stack frame put down by el?_irq() in entry.S. + */ +#define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP) + +/* + * The offset from irq_stack_ptr where entry.S will store the original + * stack pointer. Used by unwind_frame() and dump_backtrace(). + */ +#define IRQ_STACK_TO_TASK_STACK(ptr) *((unsigned long *)(ptr - 0x10)); + extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); static inline int nr_legacy_irqs(void) @@ -12,4 +34,14 @@ static inline int nr_legacy_irqs(void) return 0; } +static inline bool on_irq_stack(unsigned long sp, int cpu) +{ + /* variable names the same as kernel/stacktrace.c */ + unsigned long low = (unsigned long)per_cpu(irq_stack, cpu); + unsigned long high = low + IRQ_STACK_START_SP; + + return (low <= sp && sp <= high); +} + +#endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 9f17ec071ee0..1e3cef578e21 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -30,6 +30,9 @@ unsigned long irq_err_count; +/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned */ +DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16); + int arch_show_interrupts(struct seq_file *p, int prec) { show_ipi_list(p, prec); diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index ccb6078ed9f2..b947eeffa5b2 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -20,6 +20,7 @@ #include #include +#include #include /* @@ -39,17 +40,41 @@ int notrace unwind_frame(struct stackframe *frame) { unsigned long high, low; unsigned long fp = frame->fp; + unsigned long irq_stack_ptr; + + /* + * Use raw_smp_processor_id() to avoid false-positives from + * CONFIG_DEBUG_PREEMPT. get_wchan() calls unwind_frame() on sleeping + * task stacks, we can be pre-empted in this case, so + * {raw_,}smp_processor_id() may give us the wrong value. Sleeping + * tasks can't ever be on an interrupt stack, so regardless of cpu, + * the checks will always fail. 
+ */ + irq_stack_ptr = IRQ_STACK_PTR(raw_smp_processor_id()); low = frame->sp; - high = ALIGN(low, THREAD_SIZE); + /* irq stacks are not THREAD_SIZE aligned */ + if (on_irq_stack(frame->sp, raw_smp_processor_id())) + high = irq_stack_ptr; + else + high = ALIGN(low, THREAD_SIZE) - 0x20; - if (fp < low || fp > high - 0x18 || fp & 0xf) + if (fp < low || fp > high || fp & 0xf) return -EINVAL; frame->sp = fp + 0x10; frame->fp = *(unsigned long *)(fp); frame->pc = *(unsigned long *)(fp + 8); + /* + * Check whether we are going to walk through from interrupt stack + * to task stack. + * If we reach the end of the stack - and its an interrupt stack, + * read the original task stack pointer from the dummy frame. + */ + if (frame->sp == irq_stack_ptr) + frame->sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + return 0; } diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index e9b9b5364393..8a0084541f84 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -146,6 +146,7 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; + unsigned long irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); @@ -180,9 +181,20 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) if (ret < 0) break; stack = frame.sp; - if (in_exception_text(where)) + if (in_exception_text(where)) { + /* + * If we switched to the irq_stack before calling this + * exception handler, then the pt_regs will be on the + * task stack. The easiest way to tell is if the large + * pt_regs would overlap with the end of the irq_stack. + */ + if (stack < irq_stack_ptr && + (stack + sizeof(struct pt_regs)) > irq_stack_ptr) + stack = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + dump_mem("", "Exception stack", stack, stack + sizeof(struct pt_regs), false); + } } } From d8c228e9513c852cd1c6388a8e011eec9120435c Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 4 Dec 2015 11:02:27 +0000 Subject: [PATCH 376/607] UPSTREAM: arm64: Add do_softirq_own_stack() and enable irq_stacks entry.S is modified to switch to the per_cpu irq_stack during el{0,1}_irq. irq_count is used to detect recursive interrupts on the irq_stack, it is updated late by do_softirq_own_stack(), when called on the irq_stack, before __do_softirq() re-enables interrupts to process softirqs. do_softirq_own_stack() is added by this patch, but does not yet switch stack. This patch adds the dummy stack frame and data needed by the previous stack tracing patches. 
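For orientation, with __ARCH_HAS_DO_SOFTIRQ defined the generic softirq path hands pending work to the arch hook rather than running __do_softirq() itself; heavily simplified from kernel/softirq.c:

    asmlinkage __visible void do_softirq(void)
    {
            unsigned long flags;

            if (in_interrupt())
                    return;

            local_irq_save(flags);

            if (local_softirq_pending())
                    do_softirq_own_stack();    /* arch decides which stack to run on */

            local_irq_restore(flags);
    }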
Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit 8e23dacd12a48e58125b84c817da50850b73280a) Signed-off-by: Jeff Vander Stoep Change-Id: I9f79437af3da0398cb12e7afd1aa9f473f59b2e6 --- arch/arm64/include/asm/irq.h | 2 ++ arch/arm64/kernel/entry.S | 42 ++++++++++++++++++++++++++++++++++-- arch/arm64/kernel/irq.c | 38 +++++++++++++++++++++++++++++++- 3 files changed, 79 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index e2f3f135a3bc..fa2a8d0e4792 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -11,6 +11,8 @@ #include #include +#define __ARCH_HAS_DO_SOFTIRQ + struct pt_regs; DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 245fa6837880..8f7e737949fe 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -175,6 +176,42 @@ alternative_endif mrs \rd, sp_el0 .endm + .macro irq_stack_entry, dummy_lr + mov x19, sp // preserve the original sp + + adr_l x25, irq_stack + mrs x26, tpidr_el1 + add x25, x25, x26 + + /* + * Check the lowest address on irq_stack for the irq_count value, + * incremented by do_softirq_own_stack if we have re-enabled irqs + * while on the irq_stack. + */ + ldr x26, [x25] + cbnz x26, 9998f // recursive use? + + /* switch to the irq stack */ + mov x26, #IRQ_STACK_START_SP + add x26, x25, x26 + mov sp, x26 + + /* Add a dummy stack frame */ + stp x29, \dummy_lr, [sp, #-16]! // dummy stack frame + mov x29, sp + stp xzr, x19, [sp, #-16]! + +9998: + .endm + + /* + * x19 should be preserved between irq_stack_entry and + * irq_stack_exit. + */ + .macro irq_stack_exit + mov sp, x19 + .endm + /* * These are the registers used in the syscall handler, and allow us to * have in theory up to 7 arguments to a function - x0 to x6. @@ -190,10 +227,11 @@ tsk .req x28 // current thread_info * Interrupt handling. */ .macro irq_handler - adrp x1, handle_arch_irq - ldr x1, [x1, #:lo12:handle_arch_irq] + ldr_l x1, handle_arch_irq mov x0, sp + irq_stack_entry x22 blr x1 + irq_stack_exit .endm .text diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 1e3cef578e21..ff7ebb710e51 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -25,14 +25,24 @@ #include #include #include +#include #include #include unsigned long irq_err_count; -/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned */ +/* + * irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. + * irq_stack[0] is used as irq_count, a non-zero value indicates the stack + * is in use, and el?_irq() shouldn't switch to it. This is used to detect + * recursive use of the irq_stack, it is lazily updated by + * do_softirq_own_stack(), which is called on the irq_stack, before + * re-enabling interrupts to process softirqs. + */ DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16); +#define IRQ_COUNT() (*per_cpu(irq_stack, smp_processor_id())) + int arch_show_interrupts(struct seq_file *p, int prec) { show_ipi_list(p, prec); @@ -56,3 +66,29 @@ void __init init_IRQ(void) if (!handle_arch_irq) panic("No interrupt controller found."); } + +/* + * do_softirq_own_stack() is called from irq_exit() before __do_softirq() + * re-enables interrupts, at which point we may re-enter el?_irq(). 
We + * increase irq_count here so that el1_irq() knows that it is already on the + * irq stack. + * + * Called with interrupts disabled, so we don't worry about moving cpu, or + * being interrupted while modifying irq_count. + * + * This function doesn't actually switch stack. + */ +void do_softirq_own_stack(void) +{ + int cpu = smp_processor_id(); + + WARN_ON_ONCE(!irqs_disabled()); + + if (on_irq_stack(current_stack_pointer, cpu)) { + IRQ_COUNT()++; + __do_softirq(); + IRQ_COUNT()--; + } else { + __do_softirq(); + } +} From b485dccd9eee43b0739198f53459c6a0088e93a5 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 9 Dec 2015 13:58:42 +0000 Subject: [PATCH 377/607] UPSTREAM: arm64: irq: fix walking from irq stack to task stack Running with CONFIG_DEBUG_SPINLOCK=y can trigger a BUG with the new IRQ stack code: BUG: spinlock lockup suspected on CPU#1 This is due to the IRQ_STACK_TO_TASK_STACK macro incorrectly retrieving the task stack pointer stashed at the top of the IRQ stack. Sayeth James: | Yup, this is what is happening. Its an off-by-one due to broken | thinking about how the stack works. My broken thinking was: | | > top ------------ | > | dummy_lr | <- irq_stack_ptr | > ------------ | > | x29 | | > ------------ | > | x19 | <- irq_stack_ptr - 0x10 | > ------------ | > | xzr | | > ------------ | | But the stack-pointer is decreased before use. So it actually looks | like this: | | > ------------ | > | | <- irq_stack_ptr | > top ------------ | > | dummy_lr | | > ------------ | > | x29 | <- irq_stack_ptr - 0x10 | > ------------ | > | x19 | | > ------------ | > | xzr | <- irq_stack_ptr - 0x20 | > ------------ | | The value being used as the original stack is x29, which in all the | tests is sp but without the current frames data, hence there are no | missing frames in the output. | | Jungseok Lee picked it up with a 32bit user space because aarch32 | can't use x29, so it remains 0 forever. The fix he posted is correct. This patch fixes the macro and adds some of this wisdom to a comment, so that the layout of the IRQ stack is well understood. Cc: James Morse Reported-by: Jungseok Lee Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit 7596abf2e5661d52c4f414f37addeed54e098880) Signed-off-by: Jeff Vander Stoep Change-Id: Ic65c0d0d90a30a5caf8b3791d1e856400bd2b5f5 --- arch/arm64/include/asm/irq.h | 20 ++++++++++++++++++-- arch/arm64/kernel/entry.S | 2 +- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index fa2a8d0e4792..877c7e358384 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -19,7 +19,23 @@ DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); /* * The highest address on the stack, and the first to be used. Used to - * find the dummy-stack frame put down by el?_irq() in entry.S. + * find the dummy-stack frame put down by el?_irq() in entry.S, which + * is structured as follows: + * + * ------------ + * | | <- irq_stack_ptr + * top ------------ + * | elr_el1 | + * ------------ + * | x29 | <- irq_stack_ptr - 0x10 + * ------------ + * | xzr | + * ------------ + * | x19 | <- irq_stack_ptr - 0x20 + * ------------ + * + * where x19 holds a copy of the task stack pointer. 
+ * */ #define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP) @@ -27,7 +43,7 @@ DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); * The offset from irq_stack_ptr where entry.S will store the original * stack pointer. Used by unwind_frame() and dump_backtrace(). */ -#define IRQ_STACK_TO_TASK_STACK(ptr) *((unsigned long *)(ptr - 0x10)); +#define IRQ_STACK_TO_TASK_STACK(ptr) *((unsigned long *)(ptr - 0x20)); extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 8f7e737949fe..be7ec544b540 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -199,7 +199,7 @@ alternative_endif /* Add a dummy stack frame */ stp x29, \dummy_lr, [sp, #-16]! // dummy stack frame mov x29, sp - stp xzr, x19, [sp, #-16]! + stp x19, xzr, [sp, #-16]! 9998: .endm From 722e6114950d4dfdf890c045461932c91a4209d7 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 10 Dec 2015 10:22:39 +0000 Subject: [PATCH 378/607] UPSTREAM: arm64: Add this_cpu_ptr() assembler macro for use in entry.S irq_stack is a per_cpu variable, that needs to be access from entry.S. Use an assembler macro instead of the unreadable details. Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit aa4d5d3cbc258c355151a3903211b27359390ec5) Signed-off-by: Jeff Vander Stoep Change-Id: I2ccc11f442c0303c62e1c3e0a05a088958c922b8 --- arch/arm64/include/asm/assembler.h | 11 +++++++++++ arch/arm64/kernel/entry.S | 4 +--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 12eff928ef8b..bb7b72734c24 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -193,6 +193,17 @@ lr .req x30 // link register str \src, [\tmp, :lo12:\sym] .endm + /* + * @sym: The name of the per-cpu variable + * @reg: Result of per_cpu(sym, smp_processor_id()) + * @tmp: scratch register + */ + .macro this_cpu_ptr, sym, reg, tmp + adr_l \reg, \sym + mrs \tmp, tpidr_el1 + add \reg, \reg, \tmp + .endm + /* * Annotate a function as position independent, i.e., safe to be called before * the kernel virtual mapping is activated. diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index be7ec544b540..e394f8c9595a 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -179,9 +179,7 @@ alternative_endif .macro irq_stack_entry, dummy_lr mov x19, sp // preserve the original sp - adr_l x25, irq_stack - mrs x26, tpidr_el1 - add x25, x25, x26 + this_cpu_ptr irq_stack, x25, x26 /* * Check the lowest address on irq_stack for the irq_count value, From 4ba051d5a5476288cb27912c90de10aaf7c7f151 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 10 Dec 2015 10:22:40 +0000 Subject: [PATCH 379/607] UPSTREAM: arm64: when walking onto the task stack, check sp & fp are in current->stack When unwind_frame() reaches the bottom of the irq_stack, the last fp points to the original task stack. unwind_frame() uses IRQ_STACK_TO_TASK_STACK() to find the sp value. If either values is wrong, we may end up walking a corrupt stack. Check these values are sane by testing if they are both on the stack pointed to by current->stack. 
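The object_is_on_stack() helper used below is, roughly, a bounds check against the current task's stack area (sketch based on include/linux/sched.h of this era):

    static inline int object_is_on_stack(void *obj)
    {
            void *stack = task_stack_page(current);

            return (obj >= stack) && (obj < (stack + THREAD_SIZE));
    }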
Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit 1ffe199b1c9b72a8e752a9ae2a7af10128ab2ca1) Signed-off-by: Jeff Vander Stoep Change-Id: I2e5bf1ce899a1018f1c5b8ccb4f7c816d61bba21 --- arch/arm64/kernel/stacktrace.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index b947eeffa5b2..d916d5b6aef6 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -71,9 +71,17 @@ int notrace unwind_frame(struct stackframe *frame) * to task stack. * If we reach the end of the stack - and its an interrupt stack, * read the original task stack pointer from the dummy frame. + * + * Check the frame->fp we read from the bottom of the irq_stack, + * and the original task stack pointer are both in current->stack. */ - if (frame->sp == irq_stack_ptr) - frame->sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + if (frame->sp == irq_stack_ptr) { + unsigned long orig_sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + + if(object_is_on_stack((void *)orig_sp) && + object_is_on_stack((void *)frame->fp)) + frame->sp = orig_sp; + } return 0; } From 6674ba575b07febb6f025811e6afb64c15834b30 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 10 Dec 2015 10:22:41 +0000 Subject: [PATCH 380/607] UPSTREAM: arm64: don't call C code with el0's fp register On entry from el0, we save all the registers on the kernel stack, and restore them before returning. x29 remains unchanged when we call out to C code, which will store x29 as the frame-pointer on the stack. Instead, write 0 into x29 after entry from el0, to avoid any risk of tracing into user space. Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit 49003a8d6b35e128ef5e51433e60e783a46fbe5f) Signed-off-by: Jeff Vander Stoep Change-Id: Ifae7003018e4088d5de038cef25fa210211a75b6 --- arch/arm64/kernel/entry.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index e394f8c9595a..2284c296e3f7 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -93,6 +93,8 @@ and tsk, tsk, #~(THREAD_SIZE - 1) // Ensure MDSCR_EL1.SS is clear, ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug disable_step_tsk x19, x20 // exceptions when scheduling. + + mov x29, xzr // fp pointed to user-space .else add x21, sp, #S_FRAME_SIZE .endif From eb87372aa93bc9849fe05a86eb0994e2bc755c45 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 23 Jun 2016 18:42:51 -0700 Subject: [PATCH 381/607] net: diag: Add support to filter on device index Add support to inet_diag facility to filter sockets based on device index. If an interface index is in the filter only sockets bound to that index (sk_bound_dev_if) are returned. [cherry-pick of net-next 637c841dd7a5f9bd97b75cbe90b526fa1a52e530] Change-Id: I6b6bcdcf15d3142003f1ee53b4d82f2fabbb8250 Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/uapi/linux/inet_diag.h | 1 + net/ipv4/inet_diag.c | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 68a1f71fde9f..b49bfb6ea22a 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -72,6 +72,7 @@ enum { INET_DIAG_BC_AUTO, INET_DIAG_BC_S_COND, INET_DIAG_BC_D_COND, + INET_DIAG_BC_DEV_COND, /* u32 ifindex */ }; struct inet_diag_hostcond { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index a403a676d452..e532671c3f4f 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -44,6 +44,7 @@ struct inet_diag_entry { u16 dport; u16 family; u16 userlocks; + u32 ifindex; }; static DEFINE_MUTEX(inet_diag_table_mutex); @@ -552,6 +553,14 @@ static int inet_diag_bc_run(const struct nlattr *_bc, yes = 0; break; } + case INET_DIAG_BC_DEV_COND: { + u32 ifindex; + + ifindex = *((const u32 *)(op + 1)); + if (ifindex != entry->ifindex) + yes = 0; + break; + } } if (yes) { @@ -594,6 +603,7 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry_fill_addrs(&entry, sk); entry.sport = inet->inet_num; entry.dport = ntohs(inet->inet_dport); + entry.ifindex = sk->sk_bound_dev_if; entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; return inet_diag_bc_run(bc, &entry); @@ -617,6 +627,17 @@ static int valid_cc(const void *bc, int len, int cc) return 0; } +/* data is u32 ifindex */ +static bool valid_devcond(const struct inet_diag_bc_op *op, int len, + int *min_len) +{ + /* Check ifindex space. */ + *min_len += sizeof(u32); + if (len < *min_len) + return false; + + return true; +} /* Validate an inet_diag_hostcond. */ static bool valid_hostcond(const struct inet_diag_bc_op *op, int len, int *min_len) @@ -681,6 +702,10 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) if (!valid_hostcond(bc, len, &min_len)) return -EINVAL; break; + case INET_DIAG_BC_DEV_COND: + if (!valid_devcond(bc, len, &min_len)) + return -EINVAL; + break; case INET_DIAG_BC_S_GE: case INET_DIAG_BC_S_LE: case INET_DIAG_BC_D_GE: From 5bd0c83ba48ce73eb89bd4308b07a263a07dd21e Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 24 Aug 2016 15:46:25 +0900 Subject: [PATCH 382/607] net: diag: slightly refactor the inet_diag_bc_audit error checks. This simplifies the code a bit and also allows inet_diag_bc_audit to send to userspace an error that isn't EINVAL. [cherry-pick of net-next 627cc4add53c0470bfd118002669205d222d3a54] Change-Id: Iee3d2bbb19f3110d71f0698ffb293f9bdffc8ef1 Signed-off-by: Lorenzo Colitti Acked-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv4/inet_diag.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index e532671c3f4f..75f6d4bde273 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -687,10 +687,16 @@ static bool valid_port_comparison(const struct inet_diag_bc_op *op, return true; } -static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) +static int inet_diag_bc_audit(const struct nlattr *attr) { - const void *bc = bytecode; - int len = bytecode_len; + const void *bytecode, *bc; + int bytecode_len, len; + + if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op)) + return -EINVAL; + + bytecode = bc = nla_data(attr); + len = bytecode_len = nla_len(attr); while (len > 0) { int min_len = sizeof(struct inet_diag_bc_op); @@ -1001,13 +1007,13 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlh->nlmsg_flags & NLM_F_DUMP) { if (nlmsg_attrlen(nlh, hdrlen)) { struct nlattr *attr; + int err; attr = nlmsg_find_attr(nlh, hdrlen, INET_DIAG_REQ_BYTECODE); - if (!attr || - nla_len(attr) < sizeof(struct inet_diag_bc_op) || - inet_diag_bc_audit(nla_data(attr), nla_len(attr))) - return -EINVAL; + err = inet_diag_bc_audit(attr); + if (err) + return err; } { struct netlink_dump_control c = { @@ -1032,13 +1038,13 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) h->nlmsg_flags & NLM_F_DUMP) { if (nlmsg_attrlen(h, hdrlen)) { struct nlattr *attr; + int err; attr = nlmsg_find_attr(h, hdrlen, INET_DIAG_REQ_BYTECODE); - if (!attr || - nla_len(attr) < sizeof(struct inet_diag_bc_op) || - inet_diag_bc_audit(nla_data(attr), nla_len(attr))) - return -EINVAL; + err = inet_diag_bc_audit(attr); + if (err) + return err; } { struct netlink_dump_control c = { From ecd02bbc43641454b940fcf9c16b39ab8179af67 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 24 Aug 2016 15:46:26 +0900 Subject: [PATCH 383/607] net: diag: allow socket bytecode filters to match socket marks This allows a privileged process to filter by socket mark when dumping sockets via INET_DIAG_BY_FAMILY. This is useful on systems that use mark-based routing such as Android. The ability to filter socket marks requires CAP_NET_ADMIN, which is consistent with other privileged operations allowed by the SOCK_DIAG interface such as the ability to destroy sockets and the ability to inspect BPF filters attached to packet sockets. [cherry-pick of a52e95abf772b43c9226e9a72d3c1353903ba96f] Change-Id: I8b90b814264d9808bda050cdba8f104943bdb9a8 Tested: https://android-review.googlesource.com/261350 Signed-off-by: Lorenzo Colitti Acked-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 6 ++++++ net/ipv4/inet_diag.c | 36 +++++++++++++++++++++++++++++++--- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index b49bfb6ea22a..35435c348c60 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -73,6 +73,7 @@ enum { INET_DIAG_BC_S_COND, INET_DIAG_BC_D_COND, INET_DIAG_BC_DEV_COND, /* u32 ifindex */ + INET_DIAG_BC_MARK_COND, }; struct inet_diag_hostcond { @@ -82,6 +83,11 @@ struct inet_diag_hostcond { __be32 addr[0]; }; +struct inet_diag_markcond { + __u32 mark; + __u32 mask; +}; + /* Base info structure. It contains socket identity (addrs/ports/cookie) * and, alas, the information shown by netstat. 
*/ struct inet_diag_msg { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 75f6d4bde273..5202149cbce0 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -45,6 +45,7 @@ struct inet_diag_entry { u16 family; u16 userlocks; u32 ifindex; + u32 mark; }; static DEFINE_MUTEX(inet_diag_table_mutex); @@ -561,6 +562,14 @@ static int inet_diag_bc_run(const struct nlattr *_bc, yes = 0; break; } + case INET_DIAG_BC_MARK_COND: { + struct inet_diag_markcond *cond; + + cond = (struct inet_diag_markcond *)(op + 1); + if ((entry->mark & cond->mask) != cond->mark) + yes = 0; + break; + } } if (yes) { @@ -605,6 +614,12 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry.dport = ntohs(inet->inet_dport); entry.ifindex = sk->sk_bound_dev_if; entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; + if (sk_fullsock(sk)) + entry.mark = sk->sk_mark; + else if (sk->sk_state == TCP_NEW_SYN_RECV) + entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; + else + entry.mark = 0; return inet_diag_bc_run(bc, &entry); } @@ -687,8 +702,17 @@ static bool valid_port_comparison(const struct inet_diag_bc_op *op, return true; } -static int inet_diag_bc_audit(const struct nlattr *attr) +static bool valid_markcond(const struct inet_diag_bc_op *op, int len, + int *min_len) { + *min_len += sizeof(struct inet_diag_markcond); + return len >= *min_len; +} + +static int inet_diag_bc_audit(const struct nlattr *attr, + const struct sk_buff *skb) +{ + bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN); const void *bytecode, *bc; int bytecode_len, len; @@ -719,6 +743,12 @@ static int inet_diag_bc_audit(const struct nlattr *attr) if (!valid_port_comparison(bc, len, &min_len)) return -EINVAL; break; + case INET_DIAG_BC_MARK_COND: + if (!net_admin) + return -EPERM; + if (!valid_markcond(bc, len, &min_len)) + return -EINVAL; + break; case INET_DIAG_BC_AUTO: case INET_DIAG_BC_JMP: case INET_DIAG_BC_NOP: @@ -1011,7 +1041,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) attr = nlmsg_find_attr(nlh, hdrlen, INET_DIAG_REQ_BYTECODE); - err = inet_diag_bc_audit(attr); + err = inet_diag_bc_audit(attr, skb); if (err) return err; } @@ -1042,7 +1072,7 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) attr = nlmsg_find_attr(h, hdrlen, INET_DIAG_REQ_BYTECODE); - err = inet_diag_bc_audit(attr); + err = inet_diag_bc_audit(attr, skb); if (err) return err; } From 2f500d61a4df5764d94abfab7cbc43e4e9b92881 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 23 Aug 2016 21:06:33 -0700 Subject: [PATCH 384/607] net: diag: support SOCK_DESTROY for UDP sockets This implements SOCK_DESTROY for UDP sockets similar to what was done for TCP with commit c1e64e298b8ca ("net: diag: Support destroying TCP sockets.") A process with a UDP socket targeted for destroy is awakened and recvmsg fails with ECONNABORTED. [cherry-pick of 5d77dca82839ef016a93ad7acd7058b14d967752] Change-Id: I4b4862548e6e3c05dde27781e7daa0b18b93bd81 Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/udp.h | 1 + net/ipv4/udp.c | 15 +++++++++ net/ipv4/udp_diag.c | 79 +++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/udp.c | 1 + 4 files changed, 96 insertions(+) diff --git a/include/net/udp.h b/include/net/udp.h index 6d4ed18e1427..e57f50258cda 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -238,6 +238,7 @@ int udp_get_port(struct sock *sk, unsigned short snum, int (*saddr_cmp)(const struct sock *, const struct sock *)); void udp_err(struct sk_buff *, u32); +int udp_abort(struct sock *sk, int err); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int udp_push_pending_frames(struct sock *sk); void udp_flush_pending_frames(struct sock *sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d8946929813e..660933edd2d2 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2257,6 +2257,20 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) } EXPORT_SYMBOL(udp_poll); +int udp_abort(struct sock *sk, int err) +{ + lock_sock(sk); + + sk->sk_err = err; + sk->sk_error_report(sk); + udp_disconnect(sk, 0); + + release_sock(sk); + + return 0; +} +EXPORT_SYMBOL_GPL(udp_abort); + struct proto udp_prot = { .name = "UDP", .owner = THIS_MODULE, @@ -2288,6 +2302,7 @@ struct proto udp_prot = { .compat_getsockopt = compat_udp_getsockopt, #endif .clear_sk = sk_prot_clear_portaddr_nulls, + .diag_destroy = udp_abort, }; EXPORT_SYMBOL(udp_prot); diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 6116604bf6e8..8b9570c84fd5 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -165,12 +165,88 @@ static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, r->idiag_wqueue = sk_wmem_alloc_get(sk); } +#ifdef CONFIG_INET_DIAG_DESTROY +static int __udp_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req, + struct udp_table *tbl) +{ + struct net *net = sock_net(in_skb->sk); + struct sock *sk; + int err; + + rcu_read_lock(); + + if (req->sdiag_family == AF_INET) + sk = __udp4_lib_lookup(net, + req->id.idiag_dst[0], req->id.idiag_dport, + req->id.idiag_src[0], req->id.idiag_sport, + req->id.idiag_if, tbl); +#if IS_ENABLED(CONFIG_IPV6) + else if (req->sdiag_family == AF_INET6) { + if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && + ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) + sk = __udp4_lib_lookup(net, + req->id.idiag_dst[0], req->id.idiag_dport, + req->id.idiag_src[0], req->id.idiag_sport, + req->id.idiag_if, tbl); + + else + sk = __udp6_lib_lookup(net, + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if, tbl); + } +#endif + else { + rcu_read_unlock(); + return -EINVAL; + } + + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + + rcu_read_unlock(); + + if (!sk) + return -ENOENT; + + if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { + sock_put(sk); + return -ENOENT; + } + + err = sock_diag_destroy(sk, ECONNABORTED); + + sock_put(sk); + + return err; +} + +static int udp_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req) +{ + return __udp_diag_destroy(in_skb, req, &udp_table); +} + +static int udplite_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req) +{ + return __udp_diag_destroy(in_skb, req, &udplite_table); +} + +#endif + static const struct inet_diag_handler udp_diag_handler = { .dump = udp_diag_dump, .dump_one = udp_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, 
.idiag_info_size = 0, +#ifdef CONFIG_INET_DIAG_DESTROY + .destroy = udp_diag_destroy, +#endif }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, @@ -192,6 +268,9 @@ static const struct inet_diag_handler udplite_diag_handler = { .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, .idiag_info_size = 0, +#ifdef CONFIG_INET_DIAG_DESTROY + .destroy = udplite_diag_destroy, +#endif }; static int __init udp_diag_init(void) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index a1b6adc20e1e..54235ef177bb 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1552,6 +1552,7 @@ struct proto udpv6_prot = { .compat_getsockopt = compat_udpv6_getsockopt, #endif .clear_sk = udp_v6_clear_sk, + .diag_destroy = udp_abort, }; static struct inet_protosw udpv6_protosw = { From 513e28e1983a31c52f30b4ca1b1e69e4c25b71fe Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 7 Sep 2016 13:38:35 +0900 Subject: [PATCH 385/607] net: diag: make udp_diag_destroy work for mapped addresses. udp_diag_destroy does look up the IPv4 UDP hashtable for mapped addresses, but it gets the IPv4 address to look up from the beginning of the IPv6 address instead of the end. [cherry-pick of net-next f95bf346226b9b79352e05508beececc807cc37a] Change-Id: Ia369482c4645bcade320b2c33a763f1ce4378ff1 Tested: https://android-review.googlesource.com/269874 Fixes: 5d77dca82839 ("net: diag: support SOCK_DESTROY for UDP sockets") Signed-off-by: Lorenzo Colitti Acked-by: Eric Dumazet Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/udp_diag.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 8b9570c84fd5..39e0b8347bd2 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -186,8 +186,8 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) sk = __udp4_lib_lookup(net, - req->id.idiag_dst[0], req->id.idiag_dport, - req->id.idiag_src[0], req->id.idiag_sport, + req->id.idiag_dst[3], req->id.idiag_dport, + req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if, tbl); else From 662afd95b354bcf78a301838f345c59d9e164ad4 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Thu, 8 Sep 2016 00:42:25 +0900 Subject: [PATCH 386/607] net: inet: diag: expose the socket mark to privileged processes. This adds the capability for a process that has CAP_NET_ADMIN on a socket to see the socket mark in socket dumps. Commit a52e95abf772 ("net: diag: allow socket bytecode filters to match socket marks") recently gave privileged processes the ability to filter socket dumps based on mark. This patch is complementary: it ensures that the mark is also passed to userspace in the socket's netlink attributes. It is useful for tools like ss which display information about sockets. [backport of net-next d545caca827b65aab557a9e9dcdcf1e5a3823c2d] Change-Id: I33336ed9c3ee3fb78fe05c4c47b7fd18c6e33ef1 Tested: https://android-review.googlesource.com/270210 Signed-off-by: Lorenzo Colitti Signed-off-by: David S. 
Miller --- include/linux/inet_diag.h | 2 +- include/uapi/linux/inet_diag.h | 6 ++++- net/ipv4/inet_diag.c | 44 +++++++++++++++++++++++----------- net/ipv4/udp_diag.c | 10 ++++---- 4 files changed, 42 insertions(+), 20 deletions(-) diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 7c27fa1030e8..795852dc3434 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -37,7 +37,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 pid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh); + const struct nlmsghdr *unlh, bool net_admin); void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 35435c348c60..c7f189bd5979 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -120,9 +120,13 @@ enum { INET_DIAG_DCTCPINFO, INET_DIAG_PROTOCOL, /* response attribute only */ INET_DIAG_SKV6ONLY, + INET_DIAG_LOCALS, + INET_DIAG_PEERS, + INET_DIAG_PAD, + INET_DIAG_MARK, }; -#define INET_DIAG_MAX INET_DIAG_SKV6ONLY +#define INET_DIAG_MAX INET_DIAG_MARK /* INET_DIAG_MEM */ diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5202149cbce0..fcb83b2a61f0 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -98,6 +98,7 @@ static size_t inet_sk_attr_size(void) + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + nla_total_size(1) /* INET_DIAG_TOS */ + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(4) /* INET_DIAG_MARK */ + nla_total_size(sizeof(struct inet_diag_meminfo)) + nla_total_size(sizeof(struct inet_diag_msg)) + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) @@ -110,7 +111,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, + bool net_admin) { const struct inet_sock *inet = inet_sk(sk); const struct tcp_congestion_ops *ca_ops; @@ -160,6 +162,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, } #endif + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark)) + goto errout; + r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); @@ -258,10 +263,11 @@ static int inet_csk_diag_fill(struct sock *sk, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, + bool net_admin) { - return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, - user_ns, portid, seq, nlmsg_flags, unlh); + return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns, + portid, seq, nlmsg_flags, unlh, net_admin); } static int inet_twsk_diag_fill(struct sock *sk, @@ -303,8 +309,9 @@ static int inet_twsk_diag_fill(struct sock *sk, static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, bool net_admin) { + struct request_sock *reqsk = inet_reqsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; @@ -318,7 +325,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, inet_diag_msg_common_fill(r, sk); r->idiag_state = TCP_SYN_RECV; r->idiag_timer = 1; 
- r->idiag_retrans = inet_reqsk(sk)->num_retrans; + r->idiag_retrans = reqsk->num_retrans; BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != offsetof(struct sock, sk_cookie)); @@ -330,6 +337,10 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, r->idiag_uid = 0; r->idiag_inode = 0; + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + inet_rsk(reqsk)->ir_mark)) + return -EMSGSIZE; + nlmsg_end(skb, nlh); return 0; } @@ -338,7 +349,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, const struct inet_diag_req_v2 *r, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, bool net_admin) { if (sk->sk_state == TCP_TIME_WAIT) return inet_twsk_diag_fill(sk, skb, portid, seq, @@ -346,10 +357,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, if (sk->sk_state == TCP_NEW_SYN_RECV) return inet_req_diag_fill(sk, skb, portid, seq, - nlmsg_flags, unlh); + nlmsg_flags, unlh, net_admin); return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, - nlmsg_flags, unlh); + nlmsg_flags, unlh, net_admin); } struct sock *inet_diag_find_one_icsk(struct net *net, @@ -416,7 +427,8 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, err = sk_diag_fill(sk, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh); + nlh->nlmsg_seq, 0, nlh, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); nlmsg_free(rep); @@ -777,7 +789,8 @@ static int inet_csk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, - const struct nlattr *bc) + const struct nlattr *bc, + bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; @@ -785,7 +798,8 @@ static int inet_csk_diag_dump(struct sock *sk, return inet_csk_diag_fill(sk, skb, r, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, + net_admin); } static void twsk_build_assert(void) @@ -821,6 +835,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct net *net = sock_net(skb->sk); int i, num, s_i, s_num; u32 idiag_states = r->idiag_states; + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); if (idiag_states & TCPF_SYN_RECV) idiag_states |= TCPF_NEW_SYN_RECV; @@ -862,7 +877,8 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, cb->args[3] > 0) goto next_listen; - if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { + if (inet_csk_diag_dump(sk, skb, cb, r, + bc, net_admin) < 0) { spin_unlock_bh(&ilb->lock); goto done; } @@ -930,7 +946,7 @@ skip_listen_ht: sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->nlh); + cb->nlh, net_admin); if (res < 0) { spin_unlock_bh(lock); goto done; diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 39e0b8347bd2..092aa60e8b92 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -20,7 +20,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, - struct nlattr *bc) + struct nlattr *bc, bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; @@ -28,7 +28,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, return inet_sk_diag_fill(sk, NULL, skb, req, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 
NLM_F_MULTI, cb->nlh); + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin); } static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, @@ -75,7 +75,8 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, err = inet_sk_diag_fill(sk, NULL, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh); + nlh->nlmsg_seq, 0, nlh, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(rep); @@ -98,6 +99,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, { int num, s_num, slot, s_slot; struct net *net = sock_net(skb->sk); + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); s_slot = cb->args[0]; num = s_num = cb->args[1]; @@ -132,7 +134,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, r->id.idiag_dport) goto next; - if (sk_diag_dump(sk, skb, cb, r, bc) < 0) { + if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) { spin_unlock_bh(&hslot->lock); goto done; } From c47e4a12f511721dab3d2407d18e0e1a6b7b0e1c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Dec 2015 12:44:36 +0000 Subject: [PATCH 387/607] UPSTREAM: arm64: mm: remove pointless PAGE_MASKing As pgd_offset{,_k} shift the input address by PGDIR_SHIFT, the sub-page bits will always be shifted out. There is no need to apply PAGE_MASK before this. Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit e2c30ee320eb96304896c7ab84499e5bc5e5fb6e) Signed-off-by: Jeff Vander Stoep Change-Id: I86d3438aecf7295d5895e1430c1e19fbc82c9719 --- arch/arm64/mm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 4c381f0e77a7..7b6e58a58bf3 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -288,7 +288,7 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt, &phys, virt); return; } - __create_mapping(&init_mm, pgd_offset_k(virt & PAGE_MASK), phys, virt, + __create_mapping(&init_mm, pgd_offset_k(virt), phys, virt, size, prot, early_alloc); } @@ -309,7 +309,7 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, return; } - return __create_mapping(&init_mm, pgd_offset_k(virt & PAGE_MASK), + return __create_mapping(&init_mm, pgd_offset_k(virt), phys, virt, size, prot, late_alloc); } From 31400ef82e7ae74663e9ca8e041516ede5e617e4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Dec 2015 12:44:37 +0000 Subject: [PATCH 388/607] BACKPORT: arm64: Remove redundant padding from linker script Currently we place an ALIGN_DEBUG_RO between text and data for the .text and .init sections, and depending on configuration each of these may result in up to SECTION_SIZE bytes worth of padding (for DEBUG_RODATA_ALIGN). We make no distinction between the text and data in each of these sections at any point when creating the initial page tables in head.S. We also make no distinction when modifying the tables; __map_memblock, fixup_executable, mark_rodata_ro, and fixup_init only work at section granularity. Thus this padding is unnecessary. For the spit between init text and data we impose a minimum alignment of 16 bytes, but this is also unnecessary. 
The init data is output immediately after the padding before any symbols are defined, so this is not required to keep a symbol for linker a section array correctly associated with the data. Any objects within the section will be given at least their usual alignment regardless. This patch removes the redundant padding. Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 5b28cd9d084eca8ddc46270d2720305bfd40e348) Signed-off-by: Jeff Vander Stoep Change-Id: I5540ba1f4d90e2ae8fafa22e98c389bc4e975ac7 --- arch/arm64/kernel/vmlinux.lds.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index f472809a7b45..1cc195872a1f 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -113,7 +113,6 @@ SECTIONS *(.got) /* Global offset table */ } - ALIGN_DEBUG_RO _etext = .; /* End of text section */ RO_DATA(PAGE_SIZE) @@ -129,7 +128,6 @@ SECTIONS ARM_EXIT_KEEP(EXIT_TEXT) } - ALIGN_DEBUG_RO_MIN(16) .init.data : { INIT_DATA INIT_SETUP(16) From 8ace694d891ede66ac585fa12aa76cb9c5530427 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Dec 2015 12:44:38 +0000 Subject: [PATCH 389/607] UPSTREAM: arm64: mm: fold alternatives into .init Currently we treat the alternatives separately from other data that's only used during initialisation, using separate .altinstructions and .altinstr_replacement linker sections. These are freed for general allocation separately from .init*. This is problematic as: * We do not remove execute permissions, as we do for .init, leaving the memory executable. * We pad between them, making the kernel Image bianry up to PAGE_SIZE bytes larger than necessary. This patch moves the two sections into the contiguous region used for .init*. This saves some memory, ensures that we remove execute permissions, and allows us to remove some code made redundant by this reorganisation. 
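The redundant code is the separate free path for the alternatives. As a condensed before/after sketch of free_initmem(), distilled from the arch/arm64/mm/init.c hunk below rather than an additional change, the effect is:

/* Before (sketch): the alternatives sat outside the init region, were
 * freed separately, and unlike .init never had their execute
 * permissions removed. */
void free_initmem(void)
{
	fixup_init();
	free_initmem_default(0);
	free_alternatives_memory();
}

/* After (sketch): .altinstructions and .altinstr_replacement now live
 * inside the init region that ends at __init_end, so the generic init
 * free covers them and free_alternatives_memory() can be deleted. */
void free_initmem(void)
{
	fixup_init();
	free_initmem_default(0);
}
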
Signed-off-by: Mark Rutland Cc: Andre Przywara Cc: Catalin Marinas Cc: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 9aa4ec1571da62366cfddc20f3b923609604fe63) Signed-off-by: Jeff Vander Stoep Change-Id: I3fee118ead5c73ade50ea10d436881c9424a549c --- arch/arm64/include/asm/alternative.h | 1 - arch/arm64/kernel/alternative.c | 6 ------ arch/arm64/kernel/vmlinux.lds.S | 5 ++--- arch/arm64/mm/init.c | 1 - 4 files changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index d56ec0715157..e4962f04201e 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -19,7 +19,6 @@ struct alt_instr { void __init apply_alternatives_all(void); void apply_alternatives(void *start, size_t length); -void free_alternatives_memory(void); #define ALTINSTR_ENTRY(feature) \ " .word 661b - .\n" /* label */ \ diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index ab9db0e9818c..d2ee1b21a10d 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -158,9 +158,3 @@ void apply_alternatives(void *start, size_t length) __apply_alternatives(®ion); } - -void free_alternatives_memory(void) -{ - free_reserved_area(__alt_instructions, __alt_instructions_end, - 0, "alternatives"); -} diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 1cc195872a1f..1a056bb26633 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -142,9 +142,6 @@ SECTIONS PERCPU_SECTION(L1_CACHE_BYTES) - . = ALIGN(PAGE_SIZE); - __init_end = .; - . = ALIGN(4); .altinstructions : { __alt_instructions = .; @@ -156,6 +153,8 @@ SECTIONS } . = ALIGN(PAGE_SIZE); + __init_end = .; + _data = .; _sdata = .; RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index d43715a14383..f13078bbf66e 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -362,7 +362,6 @@ void free_initmem(void) { fixup_init(); free_initmem_default(0); - free_alternatives_memory(); } #ifdef CONFIG_BLK_DEV_INITRD From c5ca0f872092c8c7fbcc903be244b63a33afd585 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 10 Dec 2015 16:54:32 +0000 Subject: [PATCH 390/607] UPSTREAM: arm64: cmpxchg: Don't incldue linux/mmdebug.h The arm64 asm/cmpxchg.h includes linux/mmdebug.h but doesn't so far as I can tell actually use anything from it. Removing the inclusion reduces spurious header dependency rebuilds and also avoids issues with recursive inclusions of headers causing build breaks due to attempts to use things before they are defined if linux/mmdebug.h starts pulling in more low level headers. 
Such errors have happened in -next recently, for example: In file included from include/linux/completion.h:11:0, from include/linux/rcupdate.h:43, from include/linux/tracepoint.h:19, from include/linux/mmdebug.h:6, from ./arch/arm64/include/asm/cmpxchg.h:22, from ./arch/arm64/include/asm/atomic.h:41, from include/linux/atomic.h:4, from include/linux/spinlock.h:406, from include/linux/seqlock.h:35, from include/linux/time.h:5, from include/uapi/linux/timex.h:56, from include/linux/timex.h:56, from include/linux/sched.h:19, from arch/arm64/kernel/asm-offsets.c:21: include/linux/wait.h: In function 'wait_on_atomic_t': include/linux/wait.h:1218:2: error: implicit declaration of function 'atomic_read' [-Werror=implicit-function-declaration] if (atomic_read(val) == 0) Signed-off-by: Mark Brown Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 4a6ccf30263f4e265c0f171561bf4c40bed5f273) Signed-off-by: Jeff Vander Stoep Change-Id: Idf44176dad0abc11e67b4e416b02a3fba14f3f1b --- arch/arm64/include/asm/cmpxchg.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h index 9ea611ea69df..510c7b404454 100644 --- a/arch/arm64/include/asm/cmpxchg.h +++ b/arch/arm64/include/asm/cmpxchg.h @@ -19,7 +19,6 @@ #define __ASM_CMPXCHG_H #include -#include #include #include From 2393897c7c7e10c2e8fe7d1c32466cbb885c9047 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 11 Dec 2015 11:04:31 +0000 Subject: [PATCH 391/607] UPSTREAM: arm64: mm: place __cpu_setup in .text We drop __cpu_setup in .text.init, which ends up being part of .text. The .text.init section was a legacy section name which has been unused elsewhere for a long time. The ".text.init" name is misleading if read as a synonym for ".init.text". Any CPU may execute __cpu_setup before turning the MMU on, so it should simply live in .text. Remove the pointless section assignment. This will leave __cpu_setup in the .text section. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit f00083cae331e5d3eecade6b4fdc35d0825e73ef) Signed-off-by: Jeff Vander Stoep Change-Id: I2e9b154cd6de92662c70c2b957479448252c661a --- arch/arm64/mm/proc.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index cacecc4ad3e5..b6f9053ab184 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -139,8 +139,6 @@ ENTRY(cpu_do_switch_mm) ret ENDPROC(cpu_do_switch_mm) - .section ".text.init", #alloc, #execinstr - /* * __cpu_setup * From 9e4d16282cb949c1ef3d9736de79c8155cb24d7b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 17 Nov 2015 14:45:47 +0000 Subject: [PATCH 392/607] UPSTREAM: arm64: Documentation: add list of software workarounds for errata It's not immediately obvious which hardware errata are worked around in the Linux kernel for an arbitrary kernel tree, so add a file to keep track of what we're working around. 
Acked-by: Catalin Marinas Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 9cb9c9e5ba8453537e8e645318edf231fe54eaf9) Signed-off-by: Jeff Vander Stoep Change-Id: I139972ff6e10e12c4b4f27cd047f55b3dd8b4118 --- Documentation/arm64/silicon-errata.txt | 58 ++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 Documentation/arm64/silicon-errata.txt diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt new file mode 100644 index 000000000000..58b71ddf9b60 --- /dev/null +++ b/Documentation/arm64/silicon-errata.txt @@ -0,0 +1,58 @@ + Silicon Errata and Software Workarounds + ======================================= + +Author: Will Deacon +Date : 27 November 2015 + +It is an unfortunate fact of life that hardware is often produced with +so-called "errata", which can cause it to deviate from the architecture +under specific circumstances. For hardware produced by ARM, these +errata are broadly classified into the following categories: + + Category A: A critical error without a viable workaround. + Category B: A significant or critical error with an acceptable + workaround. + Category C: A minor error that is not expected to occur under normal + operation. + +For more information, consult one of the "Software Developers Errata +Notice" documents available on infocenter.arm.com (registration +required). + +As far as Linux is concerned, Category B errata may require some special +treatment in the operating system. For example, avoiding a particular +sequence of code, or configuring the processor in a particular way. A +less common situation may require similar actions in order to declassify +a Category A erratum into a Category C erratum. These are collectively +known as "software workarounds" and are only required in the minority of +cases (e.g. those cases that both require a non-secure workaround *and* +can be triggered by Linux). + +For software workarounds that may adversely impact systems unaffected by +the erratum in question, a Kconfig entry is added under "Kernel +Features" -> "ARM errata workarounds via the alternatives framework". +These are enabled by default and patched in at runtime when an affected +CPU is detected. For less-intrusive workarounds, a Kconfig option is not +available and the code is structured (preferably with a comment) in such +a way that the erratum will not be hit. + +This approach can make it slightly onerous to determine exactly which +errata are worked around in an arbitrary kernel source tree, so this +file acts as a registry of software workarounds in the Linux Kernel and +will be updated when new workarounds are committed and backported to +stable kernels. 
+ +| Implementor | Component | Erratum ID | Kconfig | ++----------------+-----------------+-----------------+-------------------------+ +| ARM | Cortex-A53 | #826319 | ARM64_ERRATUM_826319 | +| ARM | Cortex-A53 | #827319 | ARM64_ERRATUM_827319 | +| ARM | Cortex-A53 | #824069 | ARM64_ERRATUM_824069 | +| ARM | Cortex-A53 | #819472 | ARM64_ERRATUM_819472 | +| ARM | Cortex-A53 | #845719 | ARM64_ERRATUM_845719 | +| ARM | Cortex-A53 | #843419 | ARM64_ERRATUM_843419 | +| ARM | Cortex-A57 | #832075 | ARM64_ERRATUM_832075 | +| ARM | Cortex-A57 | #852523 | N/A | +| ARM | Cortex-A57 | #834220 | ARM64_ERRATUM_834220 | +| | | | | +| Cavium | ThunderX ITS | #22375, #24313 | CAVIUM_ERRATUM_22375 | +| Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 | From 41b7e19e9b27266024dc0565426a91d922c3d96b Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 15 Dec 2015 11:21:25 +0000 Subject: [PATCH 393/607] UPSTREAM: arm64: reduce stack use in irq_handler The code for switching to irq_stack stores three pieces of information on the stack, fp+lr, as a fake stack frame (that lets us walk back onto the interrupted tasks stack frame), and the address of the struct pt_regs that contains the register values from kernel entry. (which dump_backtrace() will print in any stack trace). To reduce this, we store fp, and the pointer to the struct pt_regs. unwind_frame() can recognise this as the irq_stack dummy frame, (as it only appears at the top of the irq_stack), and use the struct pt_regs values to find the missing interrupted link-register. Suggested-by: Will Deacon Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 971c67ce37cfeeaf560e792a2c3bc21d8b67163a) Signed-off-by: Jeff Vander Stoep Change-Id: I84cbb04857a441083d331e875c3e228d24ec2276 --- arch/arm64/include/asm/irq.h | 11 ++++------- arch/arm64/kernel/entry.S | 12 +++++++----- arch/arm64/kernel/stacktrace.c | 19 ++++++++++++++++--- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index 877c7e358384..3bece4379bd9 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -25,16 +25,13 @@ DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); * ------------ * | | <- irq_stack_ptr * top ------------ - * | elr_el1 | + * | x19 | <- irq_stack_ptr - 0x08 * ------------ * | x29 | <- irq_stack_ptr - 0x10 * ------------ - * | xzr | - * ------------ - * | x19 | <- irq_stack_ptr - 0x20 - * ------------ * - * where x19 holds a copy of the task stack pointer. + * where x19 holds a copy of the task stack pointer where the struct pt_regs + * from kernel_entry can be found. * */ #define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP) @@ -43,7 +40,7 @@ DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); * The offset from irq_stack_ptr where entry.S will store the original * stack pointer. Used by unwind_frame() and dump_backtrace(). 
*/ -#define IRQ_STACK_TO_TASK_STACK(ptr) *((unsigned long *)(ptr - 0x20)); +#define IRQ_STACK_TO_TASK_STACK(ptr) (*((unsigned long *)((ptr) - 0x08))) extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 2284c296e3f7..0667fb7d8bb1 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -178,7 +178,7 @@ alternative_endif mrs \rd, sp_el0 .endm - .macro irq_stack_entry, dummy_lr + .macro irq_stack_entry mov x19, sp // preserve the original sp this_cpu_ptr irq_stack, x25, x26 @@ -196,10 +196,12 @@ alternative_endif add x26, x25, x26 mov sp, x26 - /* Add a dummy stack frame */ - stp x29, \dummy_lr, [sp, #-16]! // dummy stack frame + /* + * Add a dummy stack frame, this non-standard format is fixed up + * by unwind_frame() + */ + stp x29, x19, [sp, #-16]! mov x29, sp - stp x19, xzr, [sp, #-16]! 9998: .endm @@ -229,7 +231,7 @@ tsk .req x28 // current thread_info .macro irq_handler ldr_l x1, handle_arch_irq mov x0, sp - irq_stack_entry x22 + irq_stack_entry blr x1 irq_stack_exit .endm diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index d916d5b6aef6..b9fd3a8abfc1 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -70,17 +70,30 @@ int notrace unwind_frame(struct stackframe *frame) * Check whether we are going to walk through from interrupt stack * to task stack. * If we reach the end of the stack - and its an interrupt stack, - * read the original task stack pointer from the dummy frame. + * unpack the dummy frame to find the original elr. * * Check the frame->fp we read from the bottom of the irq_stack, * and the original task stack pointer are both in current->stack. */ if (frame->sp == irq_stack_ptr) { + struct pt_regs *irq_args; unsigned long orig_sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); - if(object_is_on_stack((void *)orig_sp) && - object_is_on_stack((void *)frame->fp)) + if (object_is_on_stack((void *)orig_sp) && + object_is_on_stack((void *)frame->fp)) { frame->sp = orig_sp; + + /* orig_sp is the saved pt_regs, find the elr */ + irq_args = (struct pt_regs *)orig_sp; + frame->pc = irq_args->pc; + } else { + /* + * This frame has a non-standard format, and we + * didn't fix it, because the data looked wrong. + * Refuse to output this frame. + */ + return -EINVAL; + } } return 0; From b10551a35399a277afe9b2e28266e2a00a5f6253 Mon Sep 17 00:00:00 2001 From: Ashok Kumar Date: Thu, 17 Dec 2015 01:38:31 -0800 Subject: [PATCH 394/607] UPSTREAM: arm64: Defer dcache flush in __cpu_copy_user_page Defer dcache flushing to __sync_icache_dcache by calling flush_dcache_page which clears PG_dcache_clean flag. 
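To make the deferral concrete, here is an illustrative sketch, not part of this patch, of the arm64 __sync_icache_dcache() path that consumes the cleared PG_dcache_clean bit. It is condensed from the existing flush.c code visible later in this series, with the icache handling omitted; helper names follow that code, and pte_page() is assumed from the standard mm API:

/*
 * Sketch only: the flush that flush_dcache_page() deferred is performed
 * here, the first time the page is mapped for user execution while
 * PG_dcache_clean is still clear.  Icache maintenance and the AIVIVT
 * case are omitted for brevity.
 */
void __sync_icache_dcache(pte_t pte, unsigned long addr)
{
	struct page *page = pte_page(pte);

	/* no flushing needed for anonymous pages */
	if (!page_mapping(page))
		return;

	if (!test_and_set_bit(PG_dcache_clean, &page->flags))
		__flush_dcache_area(page_address(page),
				    PAGE_SIZE << compound_order(page));
}

The idea is that a page copied for a plain data fault no longer pays eagerly for a flush it may never need; the cost is only incurred if the page is later mapped executable.
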
Acked-by: Catalin Marinas Signed-off-by: Ashok Kumar Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit e6b1185f77351aa154e63bd54b05d07ff99d4ffa) Signed-off-by: Jeff Vander Stoep Change-Id: I2e02ce2f43f68287337bed30e3c3455c0eee4164 --- arch/arm64/mm/copypage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c index 13bbc3be6f5a..22e4cb4d6f53 100644 --- a/arch/arm64/mm/copypage.c +++ b/arch/arm64/mm/copypage.c @@ -24,8 +24,9 @@ void __cpu_copy_user_page(void *kto, const void *kfrom, unsigned long vaddr) { + struct page *page = virt_to_page(kto); copy_page(kto, kfrom); - __flush_dcache_area(kto, PAGE_SIZE); + flush_dcache_page(page); } EXPORT_SYMBOL_GPL(__cpu_copy_user_page); From 350a6a0fe0fe70d15a59084c8f4863d10271887f Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Wed, 13 Jan 2016 14:50:03 +0000 Subject: [PATCH 395/607] BACKPORT: arm64: kernel: fix architected PMU registers unconditional access The Performance Monitors extension is an optional feature of the AArch64 architecture, therefore, in order to access Performance Monitors registers safely, the kernel should detect the architected PMU unit presence through the ID_AA64DFR0_EL1 register PMUVer field before accessing them. This patch implements a guard by reading the ID_AA64DFR0_EL1 register PMUVer field to detect the architected PMU presence and prevent accessing PMU system registers if the Performance Monitors extension is not implemented in the core. Cc: Peter Maydell Cc: Mark Rutland Cc: Fixes: 60792ad349f3 ("arm64: kernel: enforce pmuserenr_el0 initialization and restore") Signed-off-by: Lorenzo Pieralisi Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit f436b2ac90a095746beb6729b8ee8ed87c9eaede) Signed-off-by: Jeff Vander Stoep Change-Id: Ie442b9feba5d143bd6b6c82d70190fc8bc95298d --- arch/arm64/kernel/head.S | 5 +++++ arch/arm64/mm/proc-macros.S | 12 ++++++++++++ arch/arm64/mm/proc.S | 2 ++ 3 files changed, 19 insertions(+) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index b363f340f2c7..17ce7285bb12 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -515,9 +515,14 @@ CPU_LE( movk x0, #0x30d0, lsl #16 ) // Clear EE and E0E on LE systems #endif /* EL2 debug */ + mrs x0, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer + sbfx x0, x0, #8, #4 + cmp x0, #1 + b.lt 4f // Skip if no PMU present mrs x0, pmcr_el0 // Disable debug access traps ubfx x0, x0, #11, #5 // to EL2 and allow access to msr mdcr_el2, x0 // all PMU counters from EL1 +4: /* Stage-2 translation */ msr vttbr_el2, xzr diff --git a/arch/arm64/mm/proc-macros.S b/arch/arm64/mm/proc-macros.S index 4c4d93c4bf65..d69dffffaa89 100644 --- a/arch/arm64/mm/proc-macros.S +++ b/arch/arm64/mm/proc-macros.S @@ -62,3 +62,15 @@ bfi \valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH #endif .endm + +/* + * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present + */ + .macro reset_pmuserenr_el0, tmpreg + mrs \tmpreg, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer + sbfx \tmpreg, \tmpreg, #8, #4 + cmp \tmpreg, #1 // Skip if no PMU present + b.lt 9000f + msr pmuserenr_el0, xzr // Disable PMU access from EL0 +9000: + .endm diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index b6f9053ab184..c164d2cb35c0 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -117,6 +117,7 @@ ENTRY(cpu_do_resume) */ ubfx x11, x11, #1, 
#1 msr oslar_el1, x11 + reset_pmuserenr_el0 x0 // Disable PMU access from EL0 mov x0, x12 dsb nsh // Make sure local tlb invalidation completed isb @@ -153,6 +154,7 @@ ENTRY(__cpu_setup) msr cpacr_el1, x0 // Enable FP/ASIMD mov x0, #1 << 12 // Reset mdscr_el1 and disable msr mdscr_el1, x0 // access to the DCC from EL0 + reset_pmuserenr_el0 x0 // Disable PMU access from EL0 /* * Memory region attributes for LPAE: * From d378add20954990566e91f914105e2638564d8ba Mon Sep 17 00:00:00 2001 From: Ashok Kumar Date: Thu, 17 Dec 2015 01:38:32 -0800 Subject: [PATCH 396/607] BACKPORT: arm64: Use PoU cache instr for I/D coherency In systems with three levels of cache(PoU at L1 and PoC at L3), PoC cache flush instructions flushes L2 and L3 caches which could affect performance. For cache flushes for I and D coherency, PoU should suffice. So changing all I and D coherency related cache flushes to PoU. Introduced a new __clean_dcache_area_pou API for dcache flush till PoU and provided a common macro for __flush_dcache_area and __clean_dcache_area_pou. Also, now in __sync_icache_dcache, icache invalidation for non-aliasing VIPT icache is done only for that particular page instead of the earlier __flush_icache_all. Reviewed-by: Catalin Marinas Reviewed-by: Mark Rutland Signed-off-by: Ashok Kumar Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 0a28714c53fd4f7aea709be7577dfbe0095c8c3e) Signed-off-by: Jeff Vander Stoep Change-Id: I64f065140d5e8783e91ed53ae9c7a2e33a3e515a --- arch/arm64/include/asm/cacheflush.h | 1 + arch/arm64/mm/cache.S | 28 ++++++++++++++---------- arch/arm64/mm/flush.c | 33 ++++++++++++++++------------- arch/arm64/mm/proc-macros.S | 22 +++++++++++++++++++ 4 files changed, 58 insertions(+), 26 deletions(-) diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 54efedaf331f..7fc294c3bc5b 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -68,6 +68,7 @@ extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void flush_icache_range(unsigned long start, unsigned long end); extern void __flush_dcache_area(void *addr, size_t len); +extern void __clean_dcache_area_pou(void *addr, size_t len); extern long __flush_cache_user_range(unsigned long start, unsigned long end); static inline void flush_cache_mm(struct mm_struct *mm) diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index cfa44a6adc0a..6df07069a025 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -81,25 +81,31 @@ ENDPROC(__flush_cache_user_range) /* * __flush_dcache_area(kaddr, size) * - * Ensure that the data held in the page kaddr is written back to the - * page in question. + * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * are cleaned and invalidated to the PoC. * * - kaddr - kernel address * - size - size in question */ ENTRY(__flush_dcache_area) - dcache_line_size x2, x3 - add x1, x0, x1 - sub x3, x2, #1 - bic x0, x0, x3 -1: dc civac, x0 // clean & invalidate D line / unified line - add x0, x0, x2 - cmp x0, x1 - b.lo 1b - dsb sy + dcache_by_line_op civac, sy, x0, x1, x2, x3 ret ENDPIPROC(__flush_dcache_area) +/* + * __clean_dcache_area_pou(kaddr, size) + * + * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * are cleaned to the PoU. 
+ * + * - kaddr - kernel address + * - size - size in question + */ +ENTRY(__clean_dcache_area_pou) + dcache_by_line_op cvau, ish, x0, x1, x2, x3 + ret +ENDPROC(__clean_dcache_area_pou) + /* * __inval_cache_range(start, end) * - start - start address of region diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index c26b804015e8..46649d6e6c5a 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -34,19 +34,24 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start, __flush_icache_all(); } +static void sync_icache_aliases(void *kaddr, unsigned long len) +{ + unsigned long addr = (unsigned long)kaddr; + + if (icache_is_aliasing()) { + __clean_dcache_area_pou(kaddr, len); + __flush_icache_all(); + } else { + flush_icache_range(addr, addr + len); + } +} + static void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, unsigned long uaddr, void *kaddr, unsigned long len) { - if (vma->vm_flags & VM_EXEC) { - unsigned long addr = (unsigned long)kaddr; - if (icache_is_aliasing()) { - __flush_dcache_area(kaddr, len); - __flush_icache_all(); - } else { - flush_icache_range(addr, addr + len); - } - } + if (vma->vm_flags & VM_EXEC) + sync_icache_aliases(kaddr, len); } /* @@ -74,13 +79,11 @@ void __sync_icache_dcache(pte_t pte, unsigned long addr) if (!page_mapping(page)) return; - if (!test_and_set_bit(PG_dcache_clean, &page->flags)) { - __flush_dcache_area(page_address(page), - PAGE_SIZE << compound_order(page)); + if (!test_and_set_bit(PG_dcache_clean, &page->flags)) + sync_icache_aliases(page_address(page), + PAGE_SIZE << compound_order(page)); + else if (icache_is_aivivt()) __flush_icache_all(); - } else if (icache_is_aivivt()) { - __flush_icache_all(); - } } /* diff --git a/arch/arm64/mm/proc-macros.S b/arch/arm64/mm/proc-macros.S index d69dffffaa89..984edcda1850 100644 --- a/arch/arm64/mm/proc-macros.S +++ b/arch/arm64/mm/proc-macros.S @@ -74,3 +74,25 @@ msr pmuserenr_el0, xzr // Disable PMU access from EL0 9000: .endm + +/* + * Macro to perform a data cache maintenance for the interval + * [kaddr, kaddr + size) + * + * op: operation passed to dc instruction + * domain: domain used in dsb instruciton + * kaddr: starting virtual address of the region + * size: size of the region + * Corrupts: kaddr, size, tmp1, tmp2 + */ + .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2 + dcache_line_size \tmp1, \tmp2 + add \size, \kaddr, \size + sub \tmp2, \tmp1, #1 + bic \kaddr, \kaddr, \tmp2 +9998: dc \op, \kaddr + add \kaddr, \kaddr, \tmp1 + cmp \kaddr, \size + b.lo 9998b + dsb \domain + .endm From 357838314d69e67ff89db49a7170fa1968638b88 Mon Sep 17 00:00:00 2001 From: David Woods Date: Thu, 17 Dec 2015 14:31:26 -0500 Subject: [PATCH 397/607] UPSTREAM: arm64: hugetlb: add support for PTE contiguous bit The arm64 MMU supports a Contiguous bit which is a hint that the TTE is one of a set of contiguous entries which can be cached in a single TLB entry. Supporting this bit adds new intermediate huge page sizes. The set of huge page sizes available depends on the base page size. Without using contiguous pages the huge page sizes are as follows. 4KB: 2MB 1GB 64KB: 512MB With a 4KB granule, the contiguous bit groups together sets of 16 pages and with a 64KB granule it groups sets of 32 pages. This enables two new huge page sizes in each case, so that the full set of available sizes is as follows. 
4KB: 64KB 2MB 32MB 1GB 64KB: 2MB 512MB 16GB If a 16KB granule is used then the contiguous bit groups 128 pages at the PTE level and 32 pages at the PMD level. If the base page size is set to 64KB then 2MB pages are enabled by default. It is possible in the future to make 2MB the default huge page size for both 4KB and 64KB granules. Reviewed-by: Chris Metcalf Reviewed-by: Steve Capper Signed-off-by: David Woods Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 66b3923a1a0f77a563b43f43f6ad091354abbfe9) Signed-off-by: Jeff Vander Stoep Change-Id: I5e99c5165bc5eb966adf4d4523632fd9eedd9602 --- arch/arm64/Kconfig | 3 - arch/arm64/include/asm/hugetlb.h | 44 ++-- arch/arm64/include/asm/pgtable-hwdef.h | 18 +- arch/arm64/include/asm/pgtable.h | 10 +- arch/arm64/mm/hugetlbpage.c | 274 ++++++++++++++++++++++++- include/linux/hugetlb.h | 2 - 6 files changed, 313 insertions(+), 38 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 92f234a5579b..ac3dbd933064 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -564,9 +564,6 @@ config HW_PERF_EVENTS config SYS_SUPPORTS_HUGETLBFS def_bool y -config ARCH_WANT_GENERAL_HUGETLB - def_bool y - config ARCH_WANT_HUGE_PMD_SHARE def_bool y if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index bb4052e85dba..bbc1e35aa601 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -26,36 +26,7 @@ static inline pte_t huge_ptep_get(pte_t *ptep) return *ptep; } -static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - set_pte_at(mm, addr, ptep, pte); -} -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - ptep_clear_flush(vma, addr, ptep); -} - -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} - -static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - return ptep_get_and_clear(mm, addr, ptep); -} - -static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - pte_t pte, int dirty) -{ - return ptep_set_access_flags(vma, addr, ptep, pte, dirty); -} static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, @@ -97,4 +68,19 @@ static inline void arch_clear_hugepage_flags(struct page *page) clear_bit(PG_dcache_clean, &page->flags); } +extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, + struct page *page, int writable); +#define arch_make_huge_pte arch_make_huge_pte +extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t pte, int dirty); +extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); +extern void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); +extern void huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); + #endif /* __ASM_HUGETLB_H */ diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index d6739e836f7b..5c25b831273d 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h 
@@ -90,7 +90,23 @@ /* * Contiguous page definitions. */ -#define CONT_PTES (_AC(1, UL) << CONT_SHIFT) +#ifdef CONFIG_ARM64_64K_PAGES +#define CONT_PTE_SHIFT 5 +#define CONT_PMD_SHIFT 5 +#elif defined(CONFIG_ARM64_16K_PAGES) +#define CONT_PTE_SHIFT 7 +#define CONT_PMD_SHIFT 5 +#else +#define CONT_PTE_SHIFT 4 +#define CONT_PMD_SHIFT 4 +#endif + +#define CONT_PTES (1 << CONT_PTE_SHIFT) +#define CONT_PTE_SIZE (CONT_PTES * PAGE_SIZE) +#define CONT_PTE_MASK (~(CONT_PTE_SIZE - 1)) +#define CONT_PMDS (1 << CONT_PMD_SHIFT) +#define CONT_PMD_SIZE (CONT_PMDS * PMD_SIZE) +#define CONT_PMD_MASK (~(CONT_PMD_SIZE - 1)) /* the the numerical offset of the PTE within a range of CONT_PTES */ #define CONT_RANGE_OFFSET(addr) (((addr)>>PAGE_SHIFT)&(CONT_PTES-1)) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 824416a219e7..51f382b7e5c7 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -227,7 +227,8 @@ static inline pte_t pte_mkspecial(pte_t pte) static inline pte_t pte_mkcont(pte_t pte) { - return set_pte_bit(pte, __pgprot(PTE_CONT)); + pte = set_pte_bit(pte, __pgprot(PTE_CONT)); + return set_pte_bit(pte, __pgprot(PTE_TYPE_PAGE)); } static inline pte_t pte_mknoncont(pte_t pte) @@ -235,6 +236,11 @@ static inline pte_t pte_mknoncont(pte_t pte) return clear_pte_bit(pte, __pgprot(PTE_CONT)); } +static inline pmd_t pmd_mkcont(pmd_t pmd) +{ + return __pmd(pmd_val(pmd) | PMD_SECT_CONT); +} + static inline void set_pte(pte_t *ptep, pte_t pte) { *ptep = pte; @@ -308,7 +314,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, /* * Hugetlb definitions. */ -#define HUGE_MAX_HSTATE 2 +#define HUGE_MAX_HSTATE 4 #define HPAGE_SHIFT PMD_SHIFT #define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 383b03ff38f8..82d607c3614e 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -41,17 +41,289 @@ int pud_huge(pud_t pud) #endif } +static int find_num_contig(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, size_t *pgsize) +{ + pgd_t *pgd = pgd_offset(mm, addr); + pud_t *pud; + pmd_t *pmd; + + *pgsize = PAGE_SIZE; + if (!pte_cont(pte)) + return 1; + if (!pgd_present(*pgd)) { + VM_BUG_ON(!pgd_present(*pgd)); + return 1; + } + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) { + VM_BUG_ON(!pud_present(*pud)); + return 1; + } + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) { + VM_BUG_ON(!pmd_present(*pmd)); + return 1; + } + if ((pte_t *)pmd == ptep) { + *pgsize = PMD_SIZE; + return CONT_PMDS; + } + return CONT_PTES; +} + +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + size_t pgsize; + int i; + int ncontig = find_num_contig(mm, addr, ptep, pte, &pgsize); + unsigned long pfn; + pgprot_t hugeprot; + + if (ncontig == 1) { + set_pte_at(mm, addr, ptep, pte); + return; + } + + pfn = pte_pfn(pte); + hugeprot = __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); + for (i = 0; i < ncontig; i++) { + pr_debug("%s: set pte %p to 0x%llx\n", __func__, ptep, + pte_val(pfn_pte(pfn, hugeprot))); + set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + ptep++; + pfn += pgsize >> PAGE_SHIFT; + addr += pgsize; + } +} + +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) +{ + pgd_t *pgd; + pud_t *pud; + pte_t *pte = NULL; + + pr_debug("%s: addr:0x%lx sz:0x%lx\n", __func__, addr, sz); + pgd = pgd_offset(mm, 
addr); + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return NULL; + + if (sz == PUD_SIZE) { + pte = (pte_t *)pud; + } else if (sz == (PAGE_SIZE * CONT_PTES)) { + pmd_t *pmd = pmd_alloc(mm, pud, addr); + + WARN_ON(addr & (sz - 1)); + /* + * Note that if this code were ever ported to the + * 32-bit arm platform then it will cause trouble in + * the case where CONFIG_HIGHPTE is set, since there + * will be no pte_unmap() to correspond with this + * pte_alloc_map(). + */ + pte = pte_alloc_map(mm, NULL, pmd, addr); + } else if (sz == PMD_SIZE) { + if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && + pud_none(*pud)) + pte = huge_pmd_share(mm, addr, pud); + else + pte = (pte_t *)pmd_alloc(mm, pud, addr); + } else if (sz == (PMD_SIZE * CONT_PMDS)) { + pmd_t *pmd; + + pmd = pmd_alloc(mm, pud, addr); + WARN_ON(addr & (sz - 1)); + return (pte_t *)pmd; + } + + pr_debug("%s: addr:0x%lx sz:0x%lx ret pte=%p/0x%llx\n", __func__, addr, + sz, pte, pte_val(*pte)); + return pte; +} + +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd = NULL; + pte_t *pte = NULL; + + pgd = pgd_offset(mm, addr); + pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd); + if (!pgd_present(*pgd)) + return NULL; + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + return NULL; + + if (pud_huge(*pud)) + return (pte_t *)pud; + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return NULL; + + if (pte_cont(pmd_pte(*pmd))) { + pmd = pmd_offset( + pud, (addr & CONT_PMD_MASK)); + return (pte_t *)pmd; + } + if (pmd_huge(*pmd)) + return (pte_t *)pmd; + pte = pte_offset_kernel(pmd, addr); + if (pte_present(*pte) && pte_cont(*pte)) { + pte = pte_offset_kernel( + pmd, (addr & CONT_PTE_MASK)); + return pte; + } + return NULL; +} + +pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, + struct page *page, int writable) +{ + size_t pagesize = huge_page_size(hstate_vma(vma)); + + if (pagesize == CONT_PTE_SIZE) { + entry = pte_mkcont(entry); + } else if (pagesize == CONT_PMD_SIZE) { + entry = pmd_pte(pmd_mkcont(pte_pmd(entry))); + } else if (pagesize != PUD_SIZE && pagesize != PMD_SIZE) { + pr_warn("%s: unrecognized huge page size 0x%lx\n", + __func__, pagesize); + } + return entry; +} + +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pte_t pte; + + if (pte_cont(*ptep)) { + int ncontig, i; + size_t pgsize; + pte_t *cpte; + bool is_dirty = false; + + cpte = huge_pte_offset(mm, addr); + ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); + /* save the 1st pte to return */ + pte = ptep_get_and_clear(mm, addr, cpte); + for (i = 1; i < ncontig; ++i) { + /* + * If HW_AFDBM is enabled, then the HW could + * turn on the dirty bit for any of the page + * in the set, so check them all. 
+ */ + ++cpte; + if (pte_dirty(ptep_get_and_clear(mm, addr, cpte))) + is_dirty = true; + } + if (is_dirty) + return pte_mkdirty(pte); + else + return pte; + } else { + return ptep_get_and_clear(mm, addr, ptep); + } +} + +int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t pte, int dirty) +{ + pte_t *cpte; + + if (pte_cont(pte)) { + int ncontig, i, changed = 0; + size_t pgsize = 0; + unsigned long pfn = pte_pfn(pte); + /* Select all bits except the pfn */ + pgprot_t hugeprot = + __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ + pte_val(pte)); + + cpte = huge_pte_offset(vma->vm_mm, addr); + pfn = pte_pfn(*cpte); + ncontig = find_num_contig(vma->vm_mm, addr, cpte, + *cpte, &pgsize); + for (i = 0; i < ncontig; ++i, ++cpte) { + changed = ptep_set_access_flags(vma, addr, cpte, + pfn_pte(pfn, + hugeprot), + dirty); + pfn += pgsize >> PAGE_SHIFT; + } + return changed; + } else { + return ptep_set_access_flags(vma, addr, ptep, pte, dirty); + } +} + +void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + if (pte_cont(*ptep)) { + int ncontig, i; + pte_t *cpte; + size_t pgsize = 0; + + cpte = huge_pte_offset(mm, addr); + ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); + for (i = 0; i < ncontig; ++i, ++cpte) + ptep_set_wrprotect(mm, addr, cpte); + } else { + ptep_set_wrprotect(mm, addr, ptep); + } +} + +void huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + if (pte_cont(*ptep)) { + int ncontig, i; + pte_t *cpte; + size_t pgsize = 0; + + cpte = huge_pte_offset(vma->vm_mm, addr); + ncontig = find_num_contig(vma->vm_mm, addr, cpte, + *cpte, &pgsize); + for (i = 0; i < ncontig; ++i, ++cpte) + ptep_clear_flush(vma, addr, cpte); + } else { + ptep_clear_flush(vma, addr, ptep); + } +} + static __init int setup_hugepagesz(char *opt) { unsigned long ps = memparse(opt, &opt); + if (ps == PMD_SIZE) { hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); } else if (ps == PUD_SIZE) { hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + } else if (ps == (PAGE_SIZE * CONT_PTES)) { + hugetlb_add_hstate(CONT_PTE_SHIFT); + } else if (ps == (PMD_SIZE * CONT_PMDS)) { + hugetlb_add_hstate((PMD_SHIFT + CONT_PMD_SHIFT) - PAGE_SHIFT); } else { - pr_err("hugepagesz: Unsupported page size %lu M\n", ps >> 20); + pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10); return 0; } return 1; } __setup("hugepagesz=", setup_hugepagesz); + +#ifdef CONFIG_ARM64_64K_PAGES +static __init int add_default_hugepagesz(void) +{ + if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL) + hugetlb_add_hstate(CONT_PMD_SHIFT); + return 0; +} +arch_initcall(add_default_hugepagesz); +#endif diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 685c262e0be8..b0eb06423d5e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -96,9 +96,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, struct address_space *mapping, pgoff_t idx, unsigned long address); -#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); -#endif extern int hugepages_treat_as_movable; extern int sysctl_hugetlb_shm_group; From 0c6cdfec1241b011aeecb1cc0b4d69f122a8d075 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 18 Dec 2015 16:01:47 +0000 Subject: [PATCH 398/607] UPSTREAM: arm64: remove irq_count and do_softirq_own_stack() sysrq_handle_reboot() re-enables interrupts while on the irq stack. 
The irq_stack implementation wrongly assumed this would only ever happen via the softirq path, allowing it to update irq_count late, in do_softirq_own_stack(). This means if an irq occurs in sysrq_handle_reboot(), during emergency_restart() the stack will be corrupted, as irq_count wasn't updated. Lose the optimisation, and instead of moving the adding/subtracting of irq_count into irq_stack_entry/irq_stack_exit, remove it, and compare sp_el0 (struct thread_info) with sp & ~(THREAD_SIZE - 1). This tells us if we are on a task stack, if so, we can safely switch to the irq stack. Finally, remove do_softirq_own_stack(), we don't need it anymore. Reported-by: Will Deacon Signed-off-by: James Morse [will: use get_thread_info macro] Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit d224a69e3d80fe08f285d1f41d21b590bae4fa9f) Signed-off-by: Jeff Vander Stoep Change-Id: I1f613279bf875443b10d65b1cd1ed4a6abfcb605 --- arch/arm64/include/asm/irq.h | 2 -- arch/arm64/kernel/entry.S | 19 +++++++++--------- arch/arm64/kernel/irq.c | 38 +----------------------------------- 3 files changed, 11 insertions(+), 48 deletions(-) diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index 3bece4379bd9..b77197d941fc 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -11,8 +11,6 @@ #include #include -#define __ARCH_HAS_DO_SOFTIRQ - struct pt_regs; DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 0667fb7d8bb1..c0db321db7e1 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -181,19 +181,20 @@ alternative_endif .macro irq_stack_entry mov x19, sp // preserve the original sp - this_cpu_ptr irq_stack, x25, x26 - /* - * Check the lowest address on irq_stack for the irq_count value, - * incremented by do_softirq_own_stack if we have re-enabled irqs - * while on the irq_stack. + * Compare sp with the current thread_info, if the top + * ~(THREAD_SIZE - 1) bits match, we are on a task stack, and + * should switch to the irq stack. */ - ldr x26, [x25] - cbnz x26, 9998f // recursive use? + and x25, x19, #~(THREAD_SIZE - 1) + cmp x25, tsk + b.ne 9998f - /* switch to the irq stack */ + this_cpu_ptr irq_stack, x25, x26 mov x26, #IRQ_STACK_START_SP add x26, x25, x26 + + /* switch to the irq stack */ mov sp, x26 /* @@ -405,10 +406,10 @@ el1_irq: bl trace_hardirqs_off #endif + get_thread_info tsk irq_handler #ifdef CONFIG_PREEMPT - get_thread_info tsk ldr w24, [tsk, #TI_PREEMPT] // get preempt count cbnz w24, 1f // preempt count != 0 ldr x0, [tsk, #TI_FLAGS] // get flags diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index ff7ebb710e51..2386b26c0712 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -25,24 +25,14 @@ #include #include #include -#include #include #include unsigned long irq_err_count; -/* - * irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. - * irq_stack[0] is used as irq_count, a non-zero value indicates the stack - * is in use, and el?_irq() shouldn't switch to it. This is used to detect - * recursive use of the irq_stack, it is lazily updated by - * do_softirq_own_stack(), which is called on the irq_stack, before - * re-enabling interrupts to process softirqs. - */ +/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. 
*/ DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16); -#define IRQ_COUNT() (*per_cpu(irq_stack, smp_processor_id())) - int arch_show_interrupts(struct seq_file *p, int prec) { show_ipi_list(p, prec); @@ -66,29 +56,3 @@ void __init init_IRQ(void) if (!handle_arch_irq) panic("No interrupt controller found."); } - -/* - * do_softirq_own_stack() is called from irq_exit() before __do_softirq() - * re-enables interrupts, at which point we may re-enter el?_irq(). We - * increase irq_count here so that el1_irq() knows that it is already on the - * irq stack. - * - * Called with interrupts disabled, so we don't worry about moving cpu, or - * being interrupted while modifying irq_count. - * - * This function doesn't actually switch stack. - */ -void do_softirq_own_stack(void) -{ - int cpu = smp_processor_id(); - - WARN_ON_ONCE(!irqs_disabled()); - - if (on_irq_stack(current_stack_pointer, cpu)) { - IRQ_COUNT()++; - __do_softirq(); - IRQ_COUNT()--; - } else { - __do_softirq(); - } -} From cc6027395e152f2a7aee43623bac1d3bdd8f5da0 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Tue, 15 Dec 2015 17:33:39 +0900 Subject: [PATCH 399/607] UPSTREAM: arm64: ftrace: modify a stack frame in a safe way Function graph tracer modifies a return address (LR) in a stack frame by calling ftrace_prepare_return() in a traced function's function prologue. The current code does this modification before preserving an original address at ftrace_push_return_trace() and there is always a small window of inconsistency when an interrupt occurs. This doesn't matter, as far as an interrupt stack is introduced, because stack tracer won't be invoked in an interrupt context. But it would be better to proactively minimize such a window by moving the LR modification after ftrace_push_return_trace(). Signed-off-by: AKASHI Takahiro Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 79fdee9b6355c9720f14717e1ad66af51bb331b5) Signed-off-by: Jeff Vander Stoep Change-Id: Ief0a136aa01348b4d0d3f447544f21fd77835c67 --- arch/arm64/kernel/ftrace.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 8f7005bc35bd..ebecf9aa33d1 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -129,23 +129,20 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, * on other archs. It's unlikely on AArch64. */ old = *parent; - *parent = return_hooker; trace.func = self_addr; trace.depth = current->curr_ret_stack + 1; /* Only trace if the calling function expects to */ - if (!ftrace_graph_entry(&trace)) { - *parent = old; + if (!ftrace_graph_entry(&trace)) return; - } err = ftrace_push_return_trace(old, self_addr, &trace.depth, frame_pointer); - if (err == -EBUSY) { - *parent = old; + if (err == -EBUSY) return; - } + else + *parent = return_hooker; } #ifdef CONFIG_DYNAMIC_FTRACE From 6adbc95c63d8b5a2873865ee8b6a4092ac95a0e9 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Tue, 15 Dec 2015 17:33:40 +0900 Subject: [PATCH 400/607] UPSTREAM: arm64: pass a task parameter to unwind_frame() Function graph tracer modifies a return address (LR) in a stack frame to hook a function's return. This will result in many useless entries (return_to_handler) showing up in a call stack list. We will fix this problem in a later patch ("arm64: ftrace: fix a stack tracer's output under function graph tracer"). 
But since real return addresses are saved in ret_stack[] array in struct task_struct, unwind functions need to be notified of, in addition to a stack pointer address, which task is being traced in order to find out real return addresses. This patch extends unwind functions' interfaces by adding an extra argument of a pointer to task_struct. Signed-off-by: AKASHI Takahiro Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit fe13f95b720075327a761fe6ddb45b0c90cab504) Signed-off-by: Jeff Vander Stoep Change-Id: I92a9a07468c182d5abbacaa73a90984ab11ad535 --- arch/arm64/include/asm/stacktrace.h | 6 ++++-- arch/arm64/kernel/perf_callchain.c | 2 +- arch/arm64/kernel/process.c | 2 +- arch/arm64/kernel/return_address.c | 2 +- arch/arm64/kernel/stacktrace.c | 8 ++++---- arch/arm64/kernel/time.c | 2 +- arch/arm64/kernel/traps.c | 2 +- 7 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 7318f6d54aa9..6fb61c5090b4 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -16,14 +16,16 @@ #ifndef __ASM_STACKTRACE_H #define __ASM_STACKTRACE_H +struct task_struct; + struct stackframe { unsigned long fp; unsigned long sp; unsigned long pc; }; -extern int unwind_frame(struct stackframe *frame); -extern void walk_stackframe(struct stackframe *frame, +extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame); +extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame, int (*fn)(struct stackframe *, void *), void *data); #endif /* __ASM_STACKTRACE_H */ diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 3aa74830cc69..797220da912b 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -165,7 +165,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, frame.sp = regs->sp; frame.pc = regs->pc; - walk_stackframe(&frame, callchain_trace, entry); + walk_stackframe(current, &frame, callchain_trace, entry); } unsigned long perf_instruction_pointer(struct pt_regs *regs) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 62068ded1e90..51be6356fb03 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -414,7 +414,7 @@ unsigned long get_wchan(struct task_struct *p) do { if (frame.sp < stack_page || frame.sp >= stack_page + THREAD_SIZE || - unwind_frame(&frame)) + unwind_frame(p, &frame)) return 0; if (!in_sched_functions(frame.pc)) return frame.pc; diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 6c4fd2810ecb..07b37ac05be4 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -44,7 +44,7 @@ void *return_address(unsigned int level) frame.sp = current_stack_pointer; frame.pc = (unsigned long)return_address; /* dummy */ - walk_stackframe(&frame, save_return_addr, &data); + walk_stackframe(current, &frame, save_return_addr, &data); if (!data.level) return data.addr; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index b9fd3a8abfc1..f7ee597ec883 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -36,7 +36,7 @@ * ldp x29, x30, [sp] * add sp, sp, #0x10 */ -int notrace unwind_frame(struct stackframe *frame) +int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) { unsigned long high, low; unsigned long fp = frame->fp; @@ -99,7 
+99,7 @@ int notrace unwind_frame(struct stackframe *frame) return 0; } -void notrace walk_stackframe(struct stackframe *frame, +void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame, int (*fn)(struct stackframe *, void *), void *data) { while (1) { @@ -107,7 +107,7 @@ void notrace walk_stackframe(struct stackframe *frame, if (fn(frame, data)) break; - ret = unwind_frame(frame); + ret = unwind_frame(tsk, frame); if (ret < 0) break; } @@ -159,7 +159,7 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) frame.pc = (unsigned long)save_stack_trace_tsk; } - walk_stackframe(&frame, save_trace, &data); + walk_stackframe(tsk, &frame, save_trace, &data); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index 13339b6ffc1a..6e5c521f123a 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -53,7 +53,7 @@ unsigned long profile_pc(struct pt_regs *regs) frame.sp = regs->sp; frame.pc = regs->pc; do { - int ret = unwind_frame(&frame); + int ret = unwind_frame(NULL, &frame); if (ret < 0) return 0; } while (in_lock_functions(frame.pc)); diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 8a0084541f84..937008523fa5 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -177,7 +177,7 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) int ret; dump_backtrace_entry(where); - ret = unwind_frame(&frame); + ret = unwind_frame(tsk, &frame); if (ret < 0) break; stack = frame.sp; From 0c078f6a47bb3de392e16788a119d575441c1fb2 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Tue, 15 Dec 2015 17:33:41 +0900 Subject: [PATCH 401/607] UPSTREAM: arm64: ftrace: fix a stack tracer's output under function graph tracer Function graph tracer modifies a return address (LR) in a stack frame to hook a function return. This will result in many useless entries (return_to_handler) showing up in a) a stack tracer's output b) perf call graph (with perf record -g) c) dump_backtrace (at panic et al.) For example, in case of a), $ echo function_graph > /sys/kernel/debug/tracing/current_tracer $ echo 1 > /proc/sys/kernel/stack_trace_enabled $ cat /sys/kernel/debug/tracing/stack_trace Depth Size Location (54 entries) ----- ---- -------- 0) 4504 16 gic_raise_softirq+0x28/0x150 1) 4488 80 smp_cross_call+0x38/0xb8 2) 4408 48 return_to_handler+0x0/0x40 3) 4360 32 return_to_handler+0x0/0x40 ... In case of b), $ echo function_graph > /sys/kernel/debug/tracing/current_tracer $ perf record -e mem:XXX:x -ag -- sleep 10 $ perf report ... | | |--0.22%-- 0x550f8 | | | 0x10888 | | | el0_svc_naked | | | sys_openat | | | return_to_handler | | | return_to_handler ... In case of c), $ echo function_graph > /sys/kernel/debug/tracing/current_tracer $ echo c > /proc/sysrq-trigger ... Call trace: [] sysrq_handle_crash+0x24/0x30 [] return_to_handler+0x0/0x40 [] return_to_handler+0x0/0x40 ... This patch replaces such entries with real addresses preserved in current->ret_stack[] at unwind_frame(). This way, we can cover all the cases. 
Reviewed-by: Jungseok Lee Signed-off-by: AKASHI Takahiro [will: fixed minor context changes conflicting with irq stack bits] Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 20380bb390a443b2c5c8800cec59743faf8151b4) Signed-off-by: Jeff Vander Stoep Change-Id: I6360182f8d04fdd2e31c0cb6054aefa2adb216e7 --- arch/arm64/include/asm/ftrace.h | 2 ++ arch/arm64/include/asm/stacktrace.h | 3 +++ arch/arm64/kernel/perf_callchain.c | 3 +++ arch/arm64/kernel/process.c | 3 +++ arch/arm64/kernel/return_address.c | 3 +++ arch/arm64/kernel/stacktrace.c | 17 +++++++++++++++++ arch/arm64/kernel/time.c | 3 +++ arch/arm64/kernel/traps.c | 26 ++++++++++++++++++++------ 8 files changed, 54 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h index c5534facf941..3c60f37e48ab 100644 --- a/arch/arm64/include/asm/ftrace.h +++ b/arch/arm64/include/asm/ftrace.h @@ -28,6 +28,8 @@ struct dyn_arch_ftrace { extern unsigned long ftrace_graph_call; +extern void return_to_handler(void); + static inline unsigned long ftrace_call_adjust(unsigned long addr) { /* diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 6fb61c5090b4..801a16dbbdf6 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -22,6 +22,9 @@ struct stackframe { unsigned long fp; unsigned long sp; unsigned long pc; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + unsigned int graph; +#endif }; extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame); diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 797220da912b..ff4665462a02 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -164,6 +164,9 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, frame.fp = regs->regs[29]; frame.sp = regs->sp; frame.pc = regs->pc; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = current->curr_ret_stack; +#endif walk_stackframe(current, &frame, callchain_trace, entry); } diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 51be6356fb03..5f6244526e31 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -410,6 +410,9 @@ unsigned long get_wchan(struct task_struct *p) frame.fp = thread_saved_fp(p); frame.sp = thread_saved_sp(p); frame.pc = thread_saved_pc(p); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = p->curr_ret_stack; +#endif stack_page = (unsigned long)task_stack_page(p); do { if (frame.sp < stack_page || diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 07b37ac05be4..1718706fde83 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -43,6 +43,9 @@ void *return_address(unsigned int level) frame.fp = (unsigned long)__builtin_frame_address(0); frame.sp = current_stack_pointer; frame.pc = (unsigned long)return_address; /* dummy */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = current->curr_ret_stack; +#endif walk_stackframe(current, &frame, save_return_addr, &data); diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index f7ee597ec883..4fad9787ab46 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -17,6 +17,7 @@ */ #include #include +#include #include #include @@ -66,6 +67,19 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) frame->fp = *(unsigned long *)(fp); frame->pc = 
*(unsigned long *)(fp + 8); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (tsk && tsk->ret_stack && + (frame->pc == (unsigned long)return_to_handler)) { + /* + * This is a case where function graph tracer has + * modified a return address (LR) in a stack frame + * to hook a function return. + * So replace it to an original value. + */ + frame->pc = tsk->ret_stack[frame->graph--].ret; + } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + /* * Check whether we are going to walk through from interrupt stack * to task stack. @@ -158,6 +172,9 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) frame.sp = current_stack_pointer; frame.pc = (unsigned long)save_stack_trace_tsk; } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = tsk->curr_ret_stack; +#endif walk_stackframe(tsk, &frame, save_trace, &data); if (trace->nr_entries < trace->max_entries) diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index 6e5c521f123a..59779699a1a4 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -52,6 +52,9 @@ unsigned long profile_pc(struct pt_regs *regs) frame.fp = regs->regs[29]; frame.sp = regs->sp; frame.pc = regs->pc; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = -1; /* no task info */ +#endif do { int ret = unwind_frame(NULL, &frame); if (ret < 0) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 937008523fa5..bdc293f6adc4 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -147,17 +147,14 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; unsigned long irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); + int skip; pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); if (!tsk) tsk = current; - if (regs) { - frame.fp = regs->regs[29]; - frame.sp = regs->sp; - frame.pc = regs->pc; - } else if (tsk == current) { + if (tsk == current) { frame.fp = (unsigned long)__builtin_frame_address(0); frame.sp = current_stack_pointer; frame.pc = (unsigned long)dump_backtrace; @@ -169,14 +166,31 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) frame.sp = thread_saved_sp(tsk); frame.pc = thread_saved_pc(tsk); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = tsk->curr_ret_stack; +#endif + skip = !!regs; pr_emerg("Call trace:\n"); while (1) { unsigned long where = frame.pc; unsigned long stack; int ret; - dump_backtrace_entry(where); + /* skip until specified stack frame */ + if (!skip) { + dump_backtrace_entry(where); + } else if (frame.fp == regs->regs[29]) { + skip = 0; + /* + * Mostly, this is the case where this function is + * called in panic/abort. As exception handler's + * stack frame does not contain the corresponding pc + * at which an exception has taken place, use regs->pc + * instead. + */ + dump_backtrace_entry(regs->pc); + } ret = unwind_frame(tsk, &frame); if (ret < 0) break; From cc68289965af2e593fdf5161f7e6407d48d79191 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 21 Dec 2015 16:44:27 +0000 Subject: [PATCH 402/607] UPSTREAM: arm64: traps: address fallout from printk -> pr_* conversion Commit ac7b406c1a9d ("arm64: Use pr_* instead of printk") was a fairly mindless s/printk/pr_*/ change driven by a complaint from checkpatch. 
As is usual with such changes, this has led to some odd behaviour on arm64: * syslog now picks up the "pr_emerg" line from dump_backtrace, but not the actual trace, which leads to a bunch of "kernel:Call trace:" lines in the log * __{pte,pmd,pgd}_error print at KERN_CRIT, as opposed to KERN_ERR which is used by other architectures. This patch restores the original printk behaviour for dump_backtrace and downgrade the pgtable error macros to KERN_ERR. Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit c9cd0ed925c0b927283d4739bfe689eb9d1e9dfd) Signed-off-by: Jeff Vander Stoep Change-Id: Iaf028e5368df4623f7257ef432a7f9da86261609 --- arch/arm64/kernel/traps.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index bdc293f6adc4..cbedd724f48e 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -171,7 +171,7 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) #endif skip = !!regs; - pr_emerg("Call trace:\n"); + printk("Call trace:\n"); while (1) { unsigned long where = frame.pc; unsigned long stack; @@ -482,22 +482,22 @@ asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr) void __pte_error(const char *file, int line, unsigned long val) { - pr_crit("%s:%d: bad pte %016lx.\n", file, line, val); + pr_err("%s:%d: bad pte %016lx.\n", file, line, val); } void __pmd_error(const char *file, int line, unsigned long val) { - pr_crit("%s:%d: bad pmd %016lx.\n", file, line, val); + pr_err("%s:%d: bad pmd %016lx.\n", file, line, val); } void __pud_error(const char *file, int line, unsigned long val) { - pr_crit("%s:%d: bad pud %016lx.\n", file, line, val); + pr_err("%s:%d: bad pud %016lx.\n", file, line, val); } void __pgd_error(const char *file, int line, unsigned long val) { - pr_crit("%s:%d: bad pgd %016lx.\n", file, line, val); + pr_err("%s:%d: bad pgd %016lx.\n", file, line, val); } /* GENERIC_BUG traps */ From 606d0cab2091ee86fa8dc7c77a53a9e59e9fac05 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 5 Jan 2016 15:36:59 +0000 Subject: [PATCH 403/607] UPSTREAM: arm64: mm: move pgd_cache initialisation to pgtable_cache_init Initialising the suppport for EFI runtime services requires us to allocate a pgd off the back of an early_initcall. On systems where the PGD_SIZE is smaller than PAGE_SIZE (e.g. 64k pages and 48-bit VA), the pgd_cache isn't initialised at this stage, and we panic with a NULL dereference during boot: Unable to handle kernel NULL pointer dereference at virtual address 00000000 __create_mapping.isra.5+0x84/0x350 create_pgd_mapping+0x20/0x28 efi_create_mapping+0x5c/0x6c arm_enable_runtime_services+0x154/0x1e4 do_one_initcall+0x8c/0x190 kernel_init_freeable+0x84/0x1ec kernel_init+0x10/0xe0 ret_from_fork+0x10/0x50 This patch fixes the problem by initialising the pgd_cache earlier, in the pgtable_cache_init callback, which sounds suspiciously like what it was intended for. 
Reported-by: Dennis Chen Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 39b5be9b4233a9f212b98242bddf008f379b5122) Signed-off-by: Jeff Vander Stoep Change-Id: I124f03d19299be93124af35641294bf73c13bb22 --- arch/arm64/include/asm/pgtable.h | 3 ++- arch/arm64/mm/pgd.c | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 51f382b7e5c7..69d2e2f86bce 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -680,7 +680,8 @@ extern int kern_addr_valid(unsigned long addr); #include -#define pgtable_cache_init() do { } while (0) +void pgd_cache_init(void); +#define pgtable_cache_init pgd_cache_init /* * On AArch64, the cache coherency is handled via the set_pte_at() function. diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index cb3ba1b812e7..ae11d4e03d0e 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -46,14 +46,14 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) kmem_cache_free(pgd_cache, pgd); } -static int __init pgd_cache_init(void) +void __init pgd_cache_init(void) { + if (PGD_SIZE == PAGE_SIZE) + return; + /* * Naturally aligned pgds required by the architecture. */ - if (PGD_SIZE != PAGE_SIZE) - pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_SIZE, - SLAB_PANIC, NULL); - return 0; + pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_SIZE, + SLAB_PANIC, NULL); } -core_initcall(pgd_cache_init); From d686cf4829c01ca92e8b10603b9250221b3bdfcb Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 5 Jan 2016 17:33:34 +0000 Subject: [PATCH 404/607] UPSTREAM: arm64: entry: remove pointless SPSR mode check In work_pending, we may skip work if the stacked SPSR value represents anything other than an EL0 context. We then immediately invoke the kernel_exit 0 macro as part of ret_to_user, assuming a return to EL0. This is somewhat confusing. We use work_pending as part of the ret_to_user/ret_fast_syscall state machine. We only use ret_fast_syscall in the return from an SVC issued from EL0. We use ret_to_user for return from EL0 exception handlers and also for return from ret_from_fork in the case the task was not a kernel thread (i.e. it is a user task). Thus in all cases the stacked SPSR value must represent an EL0 context, and the check is redundant. This patch removes it, along with the now unused no_work_pending label. Cc: Chris Metcalf Acked-by: Catalin Marinas Signed-off-by: Mark Rutland Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit ee03353bc04f8e460cc4e3da80d9721d9ecb89f1) Signed-off-by: Jeff Vander Stoep Change-Id: I2738149342ef469e8cde7c5f8f7c65daab93fb2b --- arch/arm64/kernel/entry.S | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index c0db321db7e1..1f7f5a2b61bf 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -676,10 +676,7 @@ ret_fast_syscall_trace: work_pending: tbnz x1, #TIF_NEED_RESCHED, work_resched /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */ - ldr x2, [sp, #S_PSTATE] mov x0, sp // 'regs' - tst x2, #PSR_MODE_MASK // user mode regs? 
- b.ne no_work_pending // returning to kernel enable_irq // enable interrupts for do_notify_resume() bl do_notify_resume b ret_to_user @@ -698,7 +695,6 @@ ret_to_user: and x2, x1, #_TIF_WORK_MASK cbnz x2, work_pending enable_step_tsk x1, x2 -no_work_pending: kernel_exit 0 ENDPROC(ret_to_user) From 3c31c6d7098894f0bf886ebe711fd930855626e5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 23 Dec 2015 10:29:28 +0100 Subject: [PATCH 405/607] UPSTREAM: efi: stub: define DISABLE_BRANCH_PROFILING for all architectures This moves the DISABLE_BRANCH_PROFILING define from the x86 specific to the general CFLAGS definition for the stub. This fixes build errors when building for arm64 with CONFIG_PROFILE_ALL_BRANCHES_ENABLED. Reviewed-by: Matt Fleming Reported-by: Will Deacon Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit b523e185bba36164ca48a190f5468c140d815414) Signed-off-by: Jeff Vander Stoep Change-Id: Id5a2a3986adc60ddf3d247c038667ce9719f3ee0 --- drivers/firmware/efi/libstub/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 3c0467d3688c..c0ddd1b8dca3 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -8,7 +8,7 @@ cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_64) := -mcmodel=small cflags-$(CONFIG_X86) += -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 \ -fPIC -fno-strict-aliasing -mno-red-zone \ - -mno-mmx -mno-sse -DDISABLE_BRANCH_PROFILING + -mno-mmx -mno-sse cflags-$(CONFIG_ARM64) := $(subst -pg,,$(KBUILD_CFLAGS)) cflags-$(CONFIG_ARM) := $(subst -pg,,$(KBUILD_CFLAGS)) \ @@ -16,7 +16,7 @@ cflags-$(CONFIG_ARM) := $(subst -pg,,$(KBUILD_CFLAGS)) \ cflags-$(CONFIG_EFI_ARMSTUB) += -I$(srctree)/scripts/dtc/libfdt -KBUILD_CFLAGS := $(cflags-y) \ +KBUILD_CFLAGS := $(cflags-y) -DDISABLE_BRANCH_PROFILING \ $(call cc-option,-ffreestanding) \ $(call cc-option,-fno-stack-protector) From a725004a74ab543510a8158b206c334756e9d8cf Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 6 Jan 2016 11:05:27 +0000 Subject: [PATCH 406/607] UPSTREAM: arm64: head.S: use memset to clear BSS Currently we use an open-coded memzero to clear the BSS. As it is a trivial implementation, it is sub-optimal. Our optimised memset doesn't use the stack, is position-independent, and for the memzero case can use of DC ZVA to clear large blocks efficiently. In __mmap_switched the MMU is on and there are no live caller-saved registers, so we can safely call an uninstrumented memset. This patch changes __mmap_switched to use memset when clearing the BSS. We use the __pi_memset alias so as to avoid any instrumentation in all kernel configurations. 
Cc: Catalin Marinas Cc: Marc Zyngier Reviewed-by: Ard Biesheuvel Signed-off-by: Mark Rutland Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 2a803c4db615d85126c5c7afd5849a3cfde71422) Signed-off-by: Jeff Vander Stoep Change-Id: I3dc7050fe5566f2126cbea9abfa6063c8e6b029a --- arch/arm64/kernel/head.S | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 17ce7285bb12..917d98108b3f 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -415,14 +415,13 @@ ENDPROC(__create_page_tables) */ .set initial_sp, init_thread_union + THREAD_START_SP __mmap_switched: - adr_l x6, __bss_start - adr_l x7, __bss_stop + // Clear BSS + adr_l x0, __bss_start + mov x1, xzr + adr_l x2, __bss_stop + sub x2, x2, x0 + bl __pi_memset -1: cmp x6, x7 - b.hs 2f - str xzr, [x6], #8 // Clear BSS - b 1b -2: adr_l sp, initial_sp, x4 mov x4, sp and x4, x4, #~(THREAD_SIZE - 1) From 66a58725b83cdc0754f0329465f7169aa8417e3d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 15 Jan 2016 13:28:57 +0100 Subject: [PATCH 407/607] UPSTREAM: arm64: hide __efistub_ aliases from kallsyms Commit e8f3010f7326 ("arm64/efi: isolate EFI stub from the kernel proper") isolated the EFI stub code from the kernel proper by prefixing all of its symbols with __efistub_, and selectively allowing access to core kernel symbols from the stub by emitting __efistub_ aliases for functions and variables that the stub can access legally. As an unintended side effect, these aliases are emitted into the kallsyms symbol table, which means they may turn up in backtraces, e.g., ... PC is at __efistub_memset+0x108/0x200 LR is at fixup_init+0x3c/0x48 ... [] __efistub_memset+0x108/0x200 [] free_initmem+0x2c/0x40 [] kernel_init+0x20/0xe0 [] ret_from_fork+0x10/0x40 The backtrace in question has nothing to do with the EFI stub, but simply returns one of the several aliases of memset() that have been recorded in the kallsyms table. This is undesirable, since it may suggest to people who are not aware of this that the issue they are seeing is somehow EFI related. So hide the __efistub_ aliases from kallsyms, by emitting them as absolute linker symbols explicitly. The distinction between those and section relative symbols is completely irrelevant to these definitions, and to the final link we are performing when these definitions are being taken into account (the distinction is only relevant to symbols defined inside a section definition when performing a partial link), and so the resulting values are identical to the original ones. Since absolute symbols are ignored by kallsyms, this will result in these values to be omitted from its symbol table. After this patch, the backtrace generated from the same address looks like this: ... PC is at __memset+0x108/0x200 LR is at fixup_init+0x3c/0x48 ... 
[] __memset+0x108/0x200 [] free_initmem+0x2c/0x40 [] kernel_init+0x20/0xe0 [] ret_from_fork+0x10/0x40 Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 75feee3d9d51775072d3a04f47d4a439a4c4590e) Signed-off-by: Jeff Vander Stoep Change-Id: Ieeb8c576e31f0dd0b7f82982ffa6362864eed311 --- arch/arm64/kernel/image.h | 40 ++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index bc2abb8b1599..999633bd7294 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -64,6 +64,16 @@ #ifdef CONFIG_EFI +/* + * Prevent the symbol aliases below from being emitted into the kallsyms + * table, by forcing them to be absolute symbols (which are conveniently + * ignored by scripts/kallsyms) rather than section relative symbols. + * The distinction is only relevant for partial linking, and only for symbols + * that are defined within a section declaration (which is not the case for + * the definitions below) so the resulting values will be identical. + */ +#define KALLSYMS_HIDE(sym) ABSOLUTE(sym) + /* * The EFI stub has its own symbol namespace prefixed by __efistub_, to * isolate it from the kernel proper. The following symbols are legally @@ -73,25 +83,25 @@ * linked at. The routines below are all implemented in assembler in a * position independent manner */ -__efistub_memcmp = __pi_memcmp; -__efistub_memchr = __pi_memchr; -__efistub_memcpy = __pi_memcpy; -__efistub_memmove = __pi_memmove; -__efistub_memset = __pi_memset; -__efistub_strlen = __pi_strlen; -__efistub_strcmp = __pi_strcmp; -__efistub_strncmp = __pi_strncmp; -__efistub___flush_dcache_area = __pi___flush_dcache_area; +__efistub_memcmp = KALLSYMS_HIDE(__pi_memcmp); +__efistub_memchr = KALLSYMS_HIDE(__pi_memchr); +__efistub_memcpy = KALLSYMS_HIDE(__pi_memcpy); +__efistub_memmove = KALLSYMS_HIDE(__pi_memmove); +__efistub_memset = KALLSYMS_HIDE(__pi_memset); +__efistub_strlen = KALLSYMS_HIDE(__pi_strlen); +__efistub_strcmp = KALLSYMS_HIDE(__pi_strcmp); +__efistub_strncmp = KALLSYMS_HIDE(__pi_strncmp); +__efistub___flush_dcache_area = KALLSYMS_HIDE(__pi___flush_dcache_area); #ifdef CONFIG_KASAN -__efistub___memcpy = __pi_memcpy; -__efistub___memmove = __pi_memmove; -__efistub___memset = __pi_memset; +__efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy); +__efistub___memmove = KALLSYMS_HIDE(__pi_memmove); +__efistub___memset = KALLSYMS_HIDE(__pi_memset); #endif -__efistub__text = _text; -__efistub__end = _end; -__efistub__edata = _edata; +__efistub__text = KALLSYMS_HIDE(_text); +__efistub__end = KALLSYMS_HIDE(_end); +__efistub__edata = KALLSYMS_HIDE(_edata); #endif From a84dda4e8627f7b6a91a5c2cd17e4cd059ac9cbf Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 15 Jan 2016 16:55:33 -0800 Subject: [PATCH 408/607] UPSTREAM: arch/arm/include/asm/pgtable-3level.h: add pmd_mkclean for THP MADV_FREE needs pmd_dirty and pmd_mkclean for detecting recent overwrite of the contents since MADV_FREE syscall is called for THP page. This patch adds pmd_mkclean for THP page MADV_FREE support. Signed-off-by: Minchan Kim Cc: Catalin Marinas Cc: Will Deacon Cc: Russell King Cc: "James E.J. Bottomley" Cc: "Kirill A. Shutemov" Cc: Shaohua Li Cc: Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Chen Gang Cc: Chris Zankel Cc: Daniel Micay Cc: Darrick J. Wong Cc: David S. 
Miller Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: Jason Evans Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: Kirill A. Shutemov Cc: Matt Turner Cc: Max Filippov Cc: Mel Gorman Cc: Michael Kerrisk Cc: Michal Hocko Cc: Mika Penttil Cc: Ralf Baechle Cc: Richard Henderson Cc: Rik van Riel Cc: Roland Dreier Cc: Shaohua Li Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 44842045e4baaf406db2954dd2e07152fa61528d) Signed-off-by: Jeff Vander Stoep Change-Id: I59d53667aa8c40dea4f18fc58acc7d27f4a85a04 --- arch/arm/include/asm/pgtable-3level.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index a745a2a53853..6d6012a320b2 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -249,6 +249,7 @@ PMD_BIT_FUNC(mkold, &= ~PMD_SECT_AF); PMD_BIT_FUNC(mksplitting, |= L_PMD_SECT_SPLITTING); PMD_BIT_FUNC(mkwrite, &= ~L_PMD_SECT_RDONLY); PMD_BIT_FUNC(mkdirty, |= L_PMD_SECT_DIRTY); +PMD_BIT_FUNC(mkclean, &= ~L_PMD_SECT_DIRTY); PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF); #define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT)) From fd89b55b8391791a0cd49258a8922b4982d19f47 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 11 Jan 2016 14:50:21 +0100 Subject: [PATCH 409/607] UPSTREAM: arm64: kasan: ensure that the KASAN zero page is mapped read-only When switching from the early KASAN shadow region, which maps the entire shadow space read-write, to the permanent KASAN shadow region, which uses a zero page to shadow regions that are not subject to instrumentation, the lowest level table kasan_zero_pte[] may be reused unmodified, which means that the mappings of the zero page that it contains will still be read-write. So update it explicitly to map the zero page read only when we activate the permanent mapping. Acked-by: Andrey Ryabinin Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 7b1af9795773d745c2a8c7d4ca5f2936e8b6adfb) Signed-off-by: Jeff Vander Stoep Change-Id: I4745dc91236c2612ad3e13be1cc176ffd923f7da --- arch/arm64/mm/kasan_init.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index cf038c7d9fa9..cab7a5be40aa 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -120,6 +120,7 @@ static void __init cpu_set_ttbr1(unsigned long ttbr1) void __init kasan_init(void) { struct memblock_region *reg; + int i; /* * We are going to perform proper setup of shadow memory. @@ -155,6 +156,14 @@ void __init kasan_init(void) pfn_to_nid(virt_to_pfn(start))); } + /* + * KAsan may reuse the contents of kasan_zero_pte directly, so we + * should make sure that it maps the zero page read-only. + */ + for (i = 0; i < PTRS_PER_PTE; i++) + set_pte(&kasan_zero_pte[i], + pfn_pte(virt_to_pfn(kasan_zero_page), PAGE_KERNEL_RO)); + memset(kasan_zero_page, 0, PAGE_SIZE); cpu_set_ttbr1(__pa(swapper_pg_dir)); flush_tlb_all(); From da4712caec8748615aa17eeb9d6c3cd3b0f4f910 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Sun, 24 Jan 2016 15:24:12 +0900 Subject: [PATCH 410/607] UPSTREAM: arm64: Fix an enum typo in mm/dump.c This patch fixes a typo in mm/dump.c: "MODUELS_END_NR" should be "MODULES_END_NR". 
Signed-off-by: Masanari Iida Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit b3122023df935cf14bf951da98ca598d71b9f826) Signed-off-by: Jeff Vander Stoep Change-Id: I5e73d18fd70f20f200a974f5f2ba22cb7ae64952 --- arch/arm64/mm/dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 5a22a119a74c..0adbebbc2803 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -46,7 +46,7 @@ enum address_markers_idx { PCI_START_NR, PCI_END_NR, MODULES_START_NR, - MODUELS_END_NR, + MODULES_END_NR, KERNEL_SPACE_NR, }; From 720345213019ec25bc43f6b78592f4c39e2c1731 Mon Sep 17 00:00:00 2001 From: William Cohen Date: Thu, 21 Jan 2016 22:56:26 -0500 Subject: [PATCH 411/607] BACKPORT: Eliminate the .eh_frame sections from the aarch64 vmlinux and kernel modules By default the aarch64 gcc generates .eh_frame sections. Unlike .debug_frame sections, the .eh_frame sections are loaded into memory when the associated code is loaded. On an example kernel being built with this default the .eh_frame section in vmlinux used an extra 1.7MB of memory. The x86 disables the creation of the .eh_frame section. The aarch64 should probably do the same to save some memory. Signed-off-by: William Cohen Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 728dabd6d1751cf5e0f8e0535891393da62396e9) Signed-off-by: Jeff Vander Stoep Change-Id: I697baae2209d6d11f2cc447459d935f7200eb7b1 --- arch/arm64/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 4e77fd165f38..693a4dc6220c 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -28,6 +28,7 @@ endif KBUILD_CFLAGS += -mgeneral-regs-only $(lseinstr) KBUILD_CFLAGS += -fno-pic +KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_AFLAGS += $(lseinstr) ifeq ($(CONFIG_CPU_BIG_ENDIAN), y) From 81d07628d337df8070830a82fbca2f52015e0ebd Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:44:55 +0000 Subject: [PATCH 412/607] UPSTREAM: asm-generic: Fix local variable shadow in __set_fixmap_offset Currently __set_fixmap_offset is a macro function which has a local variable called 'addr'. If a caller passes a 'phys' parameter which is derived from a variable also called 'addr', the local variable will shadow this, and the compiler will complain about the use of an uninitialized variable. To avoid the issue with namespace clashes, 'addr' is prefixed with a liberal sprinkling of underscores. Turning __set_fixmap_offset into a static inline breaks the build for several architectures. Fixing this properly requires updates to a number of architectures to make them agree on the prototype of __set_fixmap (it could be done as a subsequent patch series). 
Signed-off-by: Mark Rutland Cc: Arnd Bergmann [catalin.marinas@arm.com: squashed the original function patch and macro fixup] Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 3694bd76781b76c4f8d2ecd85018feeb1609f0e5) Signed-off-by: Jeff Vander Stoep Change-Id: Iec27cb36dfca39e333de9f7318e76da0670d0156 --- include/asm-generic/fixmap.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/asm-generic/fixmap.h b/include/asm-generic/fixmap.h index 1cbb8338edf3..827e4d3bbc7a 100644 --- a/include/asm-generic/fixmap.h +++ b/include/asm-generic/fixmap.h @@ -70,12 +70,12 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr) #endif /* Return a pointer with offset calculated */ -#define __set_fixmap_offset(idx, phys, flags) \ -({ \ - unsigned long addr; \ - __set_fixmap(idx, phys, flags); \ - addr = fix_to_virt(idx) + ((phys) & (PAGE_SIZE - 1)); \ - addr; \ +#define __set_fixmap_offset(idx, phys, flags) \ +({ \ + unsigned long ________addr; \ + __set_fixmap(idx, phys, flags); \ + ________addr = fix_to_virt(idx) + ((phys) & (PAGE_SIZE - 1)); \ + ________addr; \ }) #define set_fixmap_offset(idx, phys) \ From 0c8db8d0d4c615dd53b703fd632d4481e2bb18d6 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:44:56 +0000 Subject: [PATCH 413/607] UPSTREAM: arm64: mm: specialise pagetable allocators We pass a size parameter to early_alloc and late_alloc, but these are only ever used to allocate single pages. In late_alloc we always allocate a single page. Both allocators provide us with zeroed pages (such that all entries are invalid), but we have no barriers between allocating a page and adding that page to existing (live) tables. A concurrent page table walk may see stale data, leading to a number of issues. This patch specialises the two allocators for page tables. The size parameter is removed and the necessary dsb(ishst) is folded into each. To make it clear that the functions are intended for use for page table allocation, they are renamed to {early,late}_pgtable_alloc, with the related function pointed renamed to pgtable_alloc. As the dsb(ishst) is now in the allocator, the existing barrier for the zero page is redundant and thus is removed. The previously missing include of barrier.h is added. 
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 21ab99c289d350f4ae454bc069870009db6df20e) Signed-off-by: Jeff Vander Stoep Change-Id: Ib88fafc146506943122aa1ca6ee5a6c331ddb26c --- arch/arm64/mm/mmu.c | 49 +++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 7b6e58a58bf3..ad8c076d0aef 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -30,6 +30,7 @@ #include #include +#include #include #include #include @@ -62,15 +63,18 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, } EXPORT_SYMBOL(phys_mem_access_prot); -static void __init *early_alloc(unsigned long sz) +static void __init *early_pgtable_alloc(void) { phys_addr_t phys; void *ptr; - phys = memblock_alloc(sz, sz); + phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE); BUG_ON(!phys); ptr = __va(phys); - memset(ptr, 0, sz); + memset(ptr, 0, PAGE_SIZE); + + /* Ensure the zeroed page is visible to the page table walker */ + dsb(ishst); return ptr; } @@ -95,12 +99,12 @@ static void split_pmd(pmd_t *pmd, pte_t *pte) static void alloc_init_pte(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot, - void *(*alloc)(unsigned long size)) + void *(*pgtable_alloc)(void)) { pte_t *pte; if (pmd_none(*pmd) || pmd_sect(*pmd)) { - pte = alloc(PTRS_PER_PTE * sizeof(pte_t)); + pte = pgtable_alloc(); if (pmd_sect(*pmd)) split_pmd(pmd, pte); __pmd_populate(pmd, __pa(pte), PMD_TYPE_TABLE); @@ -130,7 +134,7 @@ static void split_pud(pud_t *old_pud, pmd_t *pmd) static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - void *(*alloc)(unsigned long size)) + void *(*pgtable_alloc)(void)) { pmd_t *pmd; unsigned long next; @@ -139,7 +143,7 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, * Check for initial section mappings in the pgd/pud and remove them. 
*/ if (pud_none(*pud) || pud_sect(*pud)) { - pmd = alloc(PTRS_PER_PMD * sizeof(pmd_t)); + pmd = pgtable_alloc(); if (pud_sect(*pud)) { /* * need to have the 1G of mappings continue to be @@ -174,7 +178,7 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, } } else { alloc_init_pte(pmd, addr, next, __phys_to_pfn(phys), - prot, alloc); + prot, pgtable_alloc); } phys += next - addr; } while (pmd++, addr = next, addr != end); @@ -195,13 +199,13 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - void *(*alloc)(unsigned long size)) + void *(*pgtable_alloc)(void)) { pud_t *pud; unsigned long next; if (pgd_none(*pgd)) { - pud = alloc(PTRS_PER_PUD * sizeof(pud_t)); + pud = pgtable_alloc(); pgd_populate(mm, pgd, pud); } BUG_ON(pgd_bad(*pgd)); @@ -234,7 +238,8 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, } } } else { - alloc_init_pmd(mm, pud, addr, next, phys, prot, alloc); + alloc_init_pmd(mm, pud, addr, next, phys, prot, + pgtable_alloc); } phys += next - addr; } while (pud++, addr = next, addr != end); @@ -247,7 +252,7 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - void *(*alloc)(unsigned long size)) + void *(*pgtable_alloc)(void)) { unsigned long addr, length, end, next; @@ -265,18 +270,18 @@ static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, end = addr + length; do { next = pgd_addr_end(addr, end); - alloc_init_pud(mm, pgd, addr, next, phys, prot, alloc); + alloc_init_pud(mm, pgd, addr, next, phys, prot, pgtable_alloc); phys += next - addr; } while (pgd++, addr = next, addr != end); } -static void *late_alloc(unsigned long size) +static void *late_pgtable_alloc(void) { - void *ptr; - - BUG_ON(size > PAGE_SIZE); - ptr = (void *)__get_free_page(PGALLOC_GFP); + void *ptr = (void *)__get_free_page(PGALLOC_GFP); BUG_ON(!ptr); + + /* Ensure the zeroed page is visible to the page table walker */ + dsb(ishst); return ptr; } @@ -289,7 +294,7 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt, return; } __create_mapping(&init_mm, pgd_offset_k(virt), phys, virt, - size, prot, early_alloc); + size, prot, early_pgtable_alloc); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, @@ -297,7 +302,7 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, pgprot_t prot) { __create_mapping(mm, pgd_offset(mm, virt), phys, virt, size, prot, - late_alloc); + late_pgtable_alloc); } static void create_mapping_late(phys_addr_t phys, unsigned long virt, @@ -310,7 +315,7 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, } return __create_mapping(&init_mm, pgd_offset_k(virt), - phys, virt, size, prot, late_alloc); + phys, virt, size, prot, late_pgtable_alloc); } #ifdef CONFIG_DEBUG_RODATA @@ -457,7 +462,7 @@ void __init paging_init(void) fixup_executable(); /* allocate the zero page. */ - zero_page = early_alloc(PAGE_SIZE); + zero_page = early_pgtable_alloc(); bootmem_init(); From 2563a08010f2a945a39c19853297ed980c8d469f Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:44:57 +0000 Subject: [PATCH 414/607] UPSTREAM: arm64: mm: place empty_zero_page in bss Currently the zero page is set up in paging_init, and thus we cannot use the zero page earlier. 
We use the zero page as a reserved TTBR value from which no TLB entries may be allocated (e.g. when uninstalling the idmap). To enable such usage earlier (as may be required for invasive changes to the kernel page tables), and to minimise the time that the idmap is active, we need to be able to use the zero page before paging_init. This patch follows the example set by x86, by allocating the zero page at compile time, in .bss. This means that the zero page itself is available immediately upon entry to start_kernel (as we zero .bss before this), and also means that the zero page takes up no space in the raw Image binary. The associated struct page is allocated in bootmem_init, and remains unavailable until this time. Outside of arch code, the only users of empty_zero_page assume that the empty_zero_page symbol refers to the zeroed memory itself, and that ZERO_PAGE(x) must be used to acquire the associated struct page, following the example of x86. This patch also brings arm64 inline with these assumptions. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 5227cfa71f9e8574373f4d0e9e754942d76cdf67) Signed-off-by: Jeff Vander Stoep Change-Id: I43005dd8533d9968d2cef48bd2821d4507cae5ad --- arch/arm64/include/asm/mmu_context.h | 2 +- arch/arm64/include/asm/pgtable.h | 4 ++-- arch/arm64/kernel/head.S | 1 + arch/arm64/mm/mmu.c | 9 +-------- 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 24165784b803..600eacb9f7d5 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -48,7 +48,7 @@ static inline void contextidr_thread_switch(struct task_struct *next) */ static inline void cpu_set_reserved_ttbr0(void) { - unsigned long ttbr = page_to_phys(empty_zero_page); + unsigned long ttbr = virt_to_phys(empty_zero_page); asm( " msr ttbr0_el1, %0 // set TTBR0\n" diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 69d2e2f86bce..e11dc90fbf56 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -121,8 +121,8 @@ extern void __pgd_error(const char *file, int line, unsigned long val); * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. */ -extern struct page *empty_zero_page; -#define ZERO_PAGE(vaddr) (empty_zero_page) +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; +#define ZERO_PAGE(vaddr) virt_to_page(empty_zero_page) #define pte_ERROR(pte) __pte_error(__FILE__, __LINE__, pte_val(pte)) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 917d98108b3f..53b9f9f128c2 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -421,6 +421,7 @@ __mmap_switched: adr_l x2, __bss_stop sub x2, x2, x0 bl __pi_memset + dsb ishst // Make zero page visible to PTW adr_l sp, initial_sp, x4 mov x4, sp diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ad8c076d0aef..acf0442b9759 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -49,7 +49,7 @@ u64 idmap_t0sz = TCR_T0SZ(VA_BITS); * Empty_zero_page is a special page that is used for zero-initialized data * and COW. 
*/ -struct page *empty_zero_page; +unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, @@ -456,18 +456,11 @@ void fixup_init(void) */ void __init paging_init(void) { - void *zero_page; - map_mem(); fixup_executable(); - /* allocate the zero page. */ - zero_page = early_pgtable_alloc(); - bootmem_init(); - empty_zero_page = virt_to_page(zero_page); - /* * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. From c66ef5f947144ca420c7483226e3ac64ad8fd813 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:44:58 +0000 Subject: [PATCH 415/607] UPSTREAM: arm64: unify idmap removal We currently open-code the removal of the idmap and restoration of the current task's MMU state in a few places. Before introducing yet more copies of this sequence, unify these to call a new helper, cpu_uninstall_idmap. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Lorenzo Pieralisi Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 9e8e865bbe294a69666a1996bda3e87825b258c0) Signed-off-by: Jeff Vander Stoep Change-Id: I6e9cb0253a1d2d63232f8fa0b3f39f8f6987b239 --- arch/arm64/include/asm/mmu_context.h | 25 +++++++++++++++++++++++++ arch/arm64/kernel/setup.c | 1 + arch/arm64/kernel/smp.c | 4 +--- arch/arm64/kernel/suspend.c | 20 ++++---------------- arch/arm64/mm/mmu.c | 4 +--- 5 files changed, 32 insertions(+), 22 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 600eacb9f7d5..b1b2514d8883 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -27,6 +27,7 @@ #include #include #include +#include #ifdef CONFIG_PID_IN_CONTEXTIDR static inline void contextidr_thread_switch(struct task_struct *next) @@ -89,6 +90,30 @@ static inline void cpu_set_default_tcr_t0sz(void) : "r"(TCR_T0SZ(VA_BITS)), "I"(TCR_T0SZ_OFFSET), "I"(TCR_TxSZ_WIDTH)); } +/* + * Remove the idmap from TTBR0_EL1 and install the pgd of the active mm. + * + * The idmap lives in the same VA range as userspace, but uses global entries + * and may use a different TCR_EL1.T0SZ. To avoid issues resulting from + * speculative TLB fetches, we must temporarily install the reserved page + * tables while we invalidate the TLBs and set up the correct TCR_EL1.T0SZ. + * + * If current is a not a user task, the mm covers the TTBR1_EL1 page tables, + * which should not be installed in TTBR0_EL1. In this case we can leave the + * reserved page tables in place. 
+ */ +static inline void cpu_uninstall_idmap(void) +{ + struct mm_struct *mm = current->active_mm; + + cpu_set_reserved_ttbr0(); + local_flush_tlb_all(); + cpu_set_default_tcr_t0sz(); + + if (mm != &init_mm) + cpu_switch_mm(mm->pgd, mm); +} + /* * It would be nice to return ASIDs back to the allocator, but unfortunately * that introduces a race with a generation rollover where we could erroneously diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 7f0e7226795e..de8e187ec8c6 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -62,6 +62,7 @@ #include #include #include +#include phys_addr_t __fdt_pointer __initdata; diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index b1adc51b2c2e..68e7f79630d4 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -149,9 +149,7 @@ asmlinkage void secondary_start_kernel(void) * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. */ - cpu_set_reserved_ttbr0(); - local_flush_tlb_all(); - cpu_set_default_tcr_t0sz(); + cpu_uninstall_idmap(); preempt_disable(); trace_hardirqs_off(); diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 1095aa483a1c..66055392f445 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -60,7 +60,6 @@ void __init cpu_suspend_set_dbg_restorer(void (*hw_bp_restore)(void *)) */ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) { - struct mm_struct *mm = current->active_mm; int ret; unsigned long flags; @@ -87,22 +86,11 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) ret = __cpu_suspend_enter(arg, fn); if (ret == 0) { /* - * We are resuming from reset with TTBR0_EL1 set to the - * idmap to enable the MMU; set the TTBR0 to the reserved - * page tables to prevent speculative TLB allocations, flush - * the local tlb and set the default tcr_el1.t0sz so that - * the TTBR0 address space set-up is properly restored. - * If the current active_mm != &init_mm we entered cpu_suspend - * with mappings in TTBR0 that must be restored, so we switch - * them back to complete the address space configuration - * restoration before returning. + * We are resuming from reset with the idmap active in TTBR0_EL1. + * We must uninstall the idmap and restore the expected MMU + * state before we can possibly return to userspace. */ - cpu_set_reserved_ttbr0(); - local_flush_tlb_all(); - cpu_set_default_tcr_t0sz(); - - if (mm != &init_mm) - cpu_switch_mm(mm->pgd, mm); + cpu_uninstall_idmap(); /* * Restore per-cpu offset before any kernel diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index acf0442b9759..b00462172705 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -465,9 +465,7 @@ void __init paging_init(void) * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. */ - cpu_set_reserved_ttbr0(); - local_flush_tlb_all(); - cpu_set_default_tcr_t0sz(); + cpu_uninstall_idmap(); } /* From 8ac5abfecf99f0c1c065c83a0ff7481d29173e03 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:44:59 +0000 Subject: [PATCH 416/607] UPSTREAM: arm64: unmap idmap earlier During boot we leave the idmap in place until paging_init, as we previously had to wait for the zero page to become allocated and accessible. 
Now that we have a statically-allocated zero page, we can uninstall the idmap much earlier in the boot process, making it far easier to spot accidental use of physical addresses. This also brings the cold boot path in line with the secondary boot path. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 86ccce896cb0aa800a7a6dcd29b41ffc4eeb1a75) Signed-off-by: Jeff Vander Stoep Change-Id: I6375bd9855e45727790697875b7cd19f84a4dd7f --- arch/arm64/kernel/setup.c | 6 ++++++ arch/arm64/mm/mmu.c | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index de8e187ec8c6..079db3ba21c4 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -314,6 +314,12 @@ void __init setup_arch(char **cmdline_p) */ local_async_enable(); + /* + * TTBR0 is only used for the identity mapping at this stage. Make it + * point to zero page to avoid speculatively fetching new entries. + */ + cpu_uninstall_idmap(); + efi_init(); arm64_memblock_init(); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index b00462172705..c0a4160331f8 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -460,12 +460,6 @@ void __init paging_init(void) fixup_executable(); bootmem_init(); - - /* - * TTBR0 is only used for the identity mapping at this stage. Make it - * point to zero page to avoid speculatively fetching new entries. - */ - cpu_uninstall_idmap(); } /* From 4ba7b986cf3bbc2e3e66f2a7805356a30f3b203c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:00 +0000 Subject: [PATCH 417/607] UPSTREAM: arm64: add function to install the idmap In some cases (e.g. when making invasive changes to the kernel page tables) we will need to execute code from the idmap. Add a new helper which may be used to install the idmap, complementing the existing cpu_uninstall_idmap. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 609116d202a8c5fd3fe393eb85373cbee906df68) Signed-off-by: Jeff Vander Stoep Change-Id: I1aac9e85e3d49add72c8aab7d80777a39f5fdd8e --- arch/arm64/include/asm/mmu_context.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index b1b2514d8883..944f2730a940 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -74,7 +74,7 @@ static inline bool __cpu_uses_extended_idmap(void) /* * Set TCR.T0SZ to its default value (based on VA_BITS) */ -static inline void cpu_set_default_tcr_t0sz(void) +static inline void __cpu_set_tcr_t0sz(unsigned long t0sz) { unsigned long tcr; @@ -87,9 +87,12 @@ static inline void cpu_set_default_tcr_t0sz(void) " msr tcr_el1, %0 ;" " isb" : "=&r" (tcr) - : "r"(TCR_T0SZ(VA_BITS)), "I"(TCR_T0SZ_OFFSET), "I"(TCR_TxSZ_WIDTH)); + : "r"(t0sz), "I"(TCR_T0SZ_OFFSET), "I"(TCR_TxSZ_WIDTH)); } +#define cpu_set_default_tcr_t0sz() __cpu_set_tcr_t0sz(TCR_T0SZ(VA_BITS)) +#define cpu_set_idmap_tcr_t0sz() __cpu_set_tcr_t0sz(idmap_t0sz) + /* * Remove the idmap from TTBR0_EL1 and install the pgd of the active mm. 
* @@ -114,6 +117,15 @@ static inline void cpu_uninstall_idmap(void) cpu_switch_mm(mm->pgd, mm); } +static inline void cpu_install_idmap(void) +{ + cpu_set_reserved_ttbr0(); + local_flush_tlb_all(); + cpu_set_idmap_tcr_t0sz(); + + cpu_switch_mm(idmap_pg_dir, &init_mm); +} + /* * It would be nice to return ASIDs back to the allocator, but unfortunately * that introduces a race with a generation rollover where we could erroneously From 214ae8984d113452991df67aa2820a28755219c3 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:01 +0000 Subject: [PATCH 418/607] UPSTREAM: arm64: mm: add code to safely replace TTBR1_EL1 If page tables are modified without suitable TLB maintenance, the ARM architecture permits multiple TLB entries to be allocated for the same VA. When this occurs, it is permitted that TLB conflict aborts are raised in response to synchronous data/instruction accesses, and/or and amalgamation of the TLB entries may be used as a result of a TLB lookup. The presence of conflicting TLB entries may result in a variety of behaviours detrimental to the system (e.g. erroneous physical addresses may be used by I-cache fetches and/or page table walks). Some of these cases may result in unexpected changes of hardware state, and/or result in the (asynchronous) delivery of SError. To avoid these issues, we must avoid situations where conflicting entries may be allocated into TLBs. For user and module mappings we can follow a strict break-before-make approach, but this cannot work for modifications to the swapper page tables that cover the kernel text and data. Instead, this patch adds code which is intended to be executed from the idmap, which can safely unmap the swapper page tables as it only requires the idmap to be active. This enables us to uninstall the active TTBR1_EL1 entry, invalidate TLBs, then install a new TTBR1_EL1 entry without potentially unmapping code or data required for the sequence. This avoids the risk of conflict, but requires that updates are staged in a copy of the swapper page tables prior to being installed. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 50e1881ddde2a986c7d0d2150985239e5e3d7d96) Signed-off-by: Jeff Vander Stoep Change-Id: Ibd0a7c802a1abe0ba08a99819fd62ac646186005 --- arch/arm64/include/asm/mmu_context.h | 19 +++++++++++++++++++ arch/arm64/mm/proc.S | 28 ++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 944f2730a940..a00f7cf35bbd 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -126,6 +126,25 @@ static inline void cpu_install_idmap(void) cpu_switch_mm(idmap_pg_dir, &init_mm); } +/* + * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD, + * avoiding the possibility of conflicting TLB entries being allocated. 
+ */ +static inline void cpu_replace_ttbr1(pgd_t *pgd) +{ + typedef void (ttbr_replace_func)(phys_addr_t); + extern ttbr_replace_func idmap_cpu_replace_ttbr1; + ttbr_replace_func *replace_phys; + + phys_addr_t pgd_phys = virt_to_phys(pgd); + + replace_phys = (void *)virt_to_phys(idmap_cpu_replace_ttbr1); + + cpu_install_idmap(); + replace_phys(pgd_phys); + cpu_uninstall_idmap(); +} + /* * It would be nice to return ASIDs back to the allocator, but unfortunately * that introduces a race with a generation rollover where we could erroneously diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index c164d2cb35c0..0c19534a901e 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -140,6 +140,34 @@ ENTRY(cpu_do_switch_mm) ret ENDPROC(cpu_do_switch_mm) + .pushsection ".idmap.text", "ax" +/* + * void idmap_cpu_replace_ttbr1(phys_addr_t new_pgd) + * + * This is the low-level counterpart to cpu_replace_ttbr1, and should not be + * called by anything else. It can only be executed from a TTBR0 mapping. + */ +ENTRY(idmap_cpu_replace_ttbr1) + mrs x2, daif + msr daifset, #0xf + + adrp x1, empty_zero_page + msr ttbr1_el1, x1 + isb + + tlbi vmalle1 + dsb nsh + isb + + msr ttbr1_el1, x0 + isb + + msr daif, x2 + + ret +ENDPROC(idmap_cpu_replace_ttbr1) + .popsection + /* * __cpu_setup * From 0991212c3218e24c047ce32fa462931a5a53024e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:02 +0000 Subject: [PATCH 419/607] UPSTREAM: arm64: kasan: avoid TLB conflicts The page table modification performed during the KASAN init risks the allocation of conflicting TLB entries, as it swaps a set of valid global entries for another without suitable TLB maintenance. The presence of conflicting TLB entries can result in the delivery of synchronous TLB conflict aborts, or may result in the use of erroneous data being returned in response to a TLB lookup. This can affect explicit data accesses from software as well as translations performed asynchronously (e.g. as part of page table walks or speculative I-cache fetches), and can therefore result in a wide variety of problems. To avoid this, use cpu_replace_ttbr1 to swap the page tables. This ensures that when the new tables are installed there are no stale entries from the old tables which may conflict. As all updates are made to the tables while they are not active, the updates themselves are safe. At the same time, add the missing barrier to ensure that the tmp_pg_dir entries updated via memcpy are visible to the page table walkers at the point the tmp_pg_dir is installed. All other page table updates made as part of KASAN initialisation have the requisite barriers due to the use of the standard page table accessors. 
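As a rough sketch (not part of the patch itself), the swap performed in
kasan_init() reduces to the ordering below; the shadow population between the
two switches and all error handling are elided, and every symbol is one
already used in the hunk that follows:

    memcpy(tmp_pg_dir, swapper_pg_dir, sizeof(tmp_pg_dir));
    dsb(ishst);                        /* make the copied entries visible to the walker */
    cpu_replace_ttbr1(tmp_pg_dir);     /* run from the temporary tables */

    /* ... build the real shadow mappings in swapper_pg_dir ... */

    cpu_replace_ttbr1(swapper_pg_dir); /* switch back with no conflict window */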
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Cc: Andrey Ryabinin Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit c1a88e9124a499939ebd8069d5e4d3937f019157) Signed-off-by: Jeff Vander Stoep Change-Id: I0934b236e25e015d6946dcedcca14ba9512418f7 --- arch/arm64/mm/kasan_init.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index cab7a5be40aa..263b59020500 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -108,15 +109,6 @@ static void __init clear_pgds(unsigned long start, set_pgd(pgd_offset_k(start), __pgd(0)); } -static void __init cpu_set_ttbr1(unsigned long ttbr1) -{ - asm( - " msr ttbr1_el1, %0\n" - " isb" - : - : "r" (ttbr1)); -} - void __init kasan_init(void) { struct memblock_region *reg; @@ -130,8 +122,8 @@ void __init kasan_init(void) * setup will be finished. */ memcpy(tmp_pg_dir, swapper_pg_dir, sizeof(tmp_pg_dir)); - cpu_set_ttbr1(__pa(tmp_pg_dir)); - flush_tlb_all(); + dsb(ishst); + cpu_replace_ttbr1(tmp_pg_dir); clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); @@ -165,8 +157,7 @@ void __init kasan_init(void) pfn_pte(virt_to_pfn(kasan_zero_page), PAGE_KERNEL_RO)); memset(kasan_zero_page, 0, PAGE_SIZE); - cpu_set_ttbr1(__pa(swapper_pg_dir)); - flush_tlb_all(); + cpu_replace_ttbr1(swapper_pg_dir); /* At this point kasan is fully initialized. Enable error messages */ init_task.kasan_depth = 0; From ded0130fa93036b518b16b5af7ef7c318b67e3e4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:03 +0000 Subject: [PATCH 420/607] UPSTREAM: arm64: mm: move pte_* macros For pmd, pud, and pgd levels of table, functions including p?d_index and p?d_offset are defined after the p?d_page_vaddr function for the immediately higher level of table. The pte functions however are defined much earlier, even though several rely on the later definition of pmd_page_vaddr. While this isn't currently a problem as these are macros, it prevents the logical grouping of later C functions (which cannot rely on prototypes for functions not yet defined). Move these definitions after pmd_page_vaddr, for consistency with the placement of these functions for other levels of table. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 053520f7d3923cc6d37afb28f9887cb1e7d77454) Signed-off-by: Jeff Vander Stoep Change-Id: I44c7327dc0f257188c0c2204ec7da0e7fdca7f64 --- arch/arm64/include/asm/pgtable.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index e11dc90fbf56..1d55520c3561 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -134,16 +134,6 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define pte_clear(mm,addr,ptep) set_pte(ptep, __pte(0)) #define pte_page(pte) (pfn_to_page(pte_pfn(pte))) -/* Find an entry in the third-level page table. 
*/ -#define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) - -#define pte_offset_kernel(dir,addr) (pmd_page_vaddr(*(dir)) + pte_index(addr)) - -#define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) -#define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) -#define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) - /* * The following only work if pte_present(). Undefined behaviour otherwise. */ @@ -445,6 +435,16 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd) return __va(pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK); } +/* Find an entry in the third-level page table. */ +#define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + +#define pte_offset_kernel(dir,addr) (pmd_page_vaddr(*(dir)) + pte_index(addr)) + +#define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) +#define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) +#define pte_unmap(pte) do { } while (0) +#define pte_unmap_nested(pte) do { } while (0) + #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) /* From 4d41b2a8a8412b07033ddcd56cc0b31b38889cdb Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:04 +0000 Subject: [PATCH 421/607] UPSTREAM: arm64: mm: add functions to walk page tables by PA To allow us to walk tables allocated into the fixmap, we need to acquire the physical address of a page, rather than the virtual address in the linear map. This patch adds new p??_page_paddr and p??_offset_phys functions to acquire the physical address of a next-level table, and changes p??_offset* into macros which simply convert this to a linear map VA. This renders p??_page_vaddr unused, and hence they are removed. At the pgd level, a new pgd_offset_raw function is added to find the relevant PGD entry given the base of a PGD and a virtual address. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit dca56dca7124709f3dfca81afe61b4d98eb9cacf) Signed-off-by: Jeff Vander Stoep Change-Id: Ie43d00014322e29aec70617db3af242ed8545738 --- arch/arm64/include/asm/pgtable.h | 39 +++++++++++++++++++------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 1d55520c3561..837fc7b8a9a4 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -430,15 +430,16 @@ static inline void pmd_clear(pmd_t *pmdp) set_pmd(pmdp, __pmd(0)); } -static inline pte_t *pmd_page_vaddr(pmd_t pmd) +static inline phys_addr_t pmd_page_paddr(pmd_t pmd) { - return __va(pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK); + return pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK; } /* Find an entry in the third-level page table. 
*/ #define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset_kernel(dir,addr) (pmd_page_vaddr(*(dir)) + pte_index(addr)) +#define pte_offset_phys(dir,addr) (pmd_page_paddr(*(dir)) + pte_index(addr) * sizeof(pte_t)) +#define pte_offset_kernel(dir,addr) ((pte_t *)__va(pte_offset_phys((dir), (addr)))) #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) #define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) @@ -473,21 +474,23 @@ static inline void pud_clear(pud_t *pudp) set_pud(pudp, __pud(0)); } -static inline pmd_t *pud_page_vaddr(pud_t pud) +static inline phys_addr_t pud_page_paddr(pud_t pud) { - return __va(pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK); + return pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK; } /* Find an entry in the second-level page table. */ #define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) -static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr) -{ - return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(addr); -} +#define pmd_offset_phys(dir, addr) (pud_page_paddr(*(dir)) + pmd_index(addr) * sizeof(pmd_t)) +#define pmd_offset(dir, addr) ((pmd_t *)__va(pmd_offset_phys((dir), (addr)))) #define pud_page(pud) pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK)) +#else + +#define pud_page_paddr(pud) ({ BUILD_BUG(); 0; }) + #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -509,21 +512,23 @@ static inline void pgd_clear(pgd_t *pgdp) set_pgd(pgdp, __pgd(0)); } -static inline pud_t *pgd_page_vaddr(pgd_t pgd) +static inline phys_addr_t pgd_page_paddr(pgd_t pgd) { - return __va(pgd_val(pgd) & PHYS_MASK & (s32)PAGE_MASK); + return pgd_val(pgd) & PHYS_MASK & (s32)PAGE_MASK; } /* Find an entry in the frst-level page table. */ #define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) -static inline pud_t *pud_offset(pgd_t *pgd, unsigned long addr) -{ - return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(addr); -} +#define pud_offset_phys(dir, addr) (pgd_page_paddr(*(dir)) + pud_index(addr) * sizeof(pud_t)) +#define pud_offset(dir, addr) ((pud_t *)__va(pud_offset_phys((dir), (addr)))) #define pgd_page(pgd) pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK)) +#else + +#define pgd_page_paddr(pgd) ({ BUILD_BUG(); 0;}) + #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) @@ -531,7 +536,9 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long addr) /* to find an entry in a page-table-directory */ #define pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) -#define pgd_offset(mm, addr) ((mm)->pgd+pgd_index(addr)) +#define pgd_offset_raw(pgd, addr) ((pgd) + pgd_index(addr)) + +#define pgd_offset(mm, addr) (pgd_offset_raw((mm)->pgd, (addr))) /* to find an entry in a kernel page-table-directory */ #define pgd_offset_k(addr) pgd_offset(&init_mm, addr) From 1845f62c4eb95f0dd288ac54bba784e0acb53171 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:05 +0000 Subject: [PATCH 422/607] UPSTREAM: arm64: mm: avoid redundant __pa(__va(x)) When we "upgrade" to a section mapping, we free any table we made redundant by giving it back to memblock. To get the PA, we acquire the physical address and convert this to a VA, then subsequently convert this back to a PA. This works currently, but will not work if the tables are not accessed via linear map VAs (e.g. is we use fixmap slots). This patch uses {pmd,pud}_page_paddr to acquire the PA. 
This avoids the __pa(__va()) round trip, saving some work and avoiding reliance on the linear mapping. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 316b39db06718d59d82736df9fc65cf05b467cc7) Signed-off-by: Jeff Vander Stoep Change-Id: I26ed77790b5b00a15ea09a5a23a176de5b73a002 --- arch/arm64/mm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index c0a4160331f8..fa2471ac191a 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -171,7 +171,7 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, if (!pmd_none(old_pmd)) { flush_tlb_all(); if (pmd_table(old_pmd)) { - phys_addr_t table = __pa(pte_offset_map(&old_pmd, 0)); + phys_addr_t table = pmd_page_paddr(old_pmd); if (!WARN_ON_ONCE(slab_is_available())) memblock_free(table, PAGE_SIZE); } @@ -232,7 +232,7 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, if (!pud_none(old_pud)) { flush_tlb_all(); if (pud_table(old_pud)) { - phys_addr_t table = __pa(pmd_offset(&old_pud, 0)); + phys_addr_t table = pud_page_paddr(old_pud); if (!WARN_ON_ONCE(slab_is_available())) memblock_free(table, PAGE_SIZE); } From 04390d5e9950ee522e1f6f829615a14c2368a242 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:06 +0000 Subject: [PATCH 423/607] UPSTREAM: arm64: mm: add __{pud,pgd}_populate We currently have __pmd_populate for creating a pmd table entry given the physical address of a pte, but don't have equivalents for the pud or pgd levels of table. To enable us to manipulate tables which are mapped outside of the linear mapping (where we have a PA, but not a linear map VA), it is useful to have these functions. This patch adds __{pud,pgd}_populate. As these should not be called when the kernel uses folded {pmd,pud}s, in these cases they expand to BUILD_BUG(). So long as the appropriate checks are made on the {pud,pgd} entry prior to attempting population, these should be optimized out at compile time. 
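For illustration only, a hypothetical caller (the function name below is made
up and is not added by this patch) that has just allocated a pmd table from
memblock, and therefore only holds a physical address, can now populate the
pud entry directly, assuming a configuration where the pud level is not
folded:

    /* Hypothetical example; error handling kept minimal. */
    static void example_populate_pud(pud_t *pud)
    {
            phys_addr_t pmd_phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE);

            BUG_ON(!pmd_phys);
            __pud_populate(pud, pmd_phys, PMD_TYPE_TABLE);
    }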
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 1e531cce68c92b46c7d29f36a72f9a3e5886678f) Signed-off-by: Jeff Vander Stoep Change-Id: I4bb3b67d0e8b63dfbd1292a93bdde42ead23a5c2 --- arch/arm64/include/asm/pgalloc.h | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index c15053902942..ff98585d085a 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -42,11 +42,20 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) free_page((unsigned long)pmd); } -static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +static inline void __pud_populate(pud_t *pud, phys_addr_t pmd, pudval_t prot) { - set_pud(pud, __pud(__pa(pmd) | PMD_TYPE_TABLE)); + set_pud(pud, __pud(pmd | prot)); } +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + __pud_populate(pud, __pa(pmd), PMD_TYPE_TABLE); +} +#else +static inline void __pud_populate(pud_t *pud, phys_addr_t pmd, pudval_t prot) +{ + BUILD_BUG(); +} #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -62,11 +71,20 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) free_page((unsigned long)pud); } -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pud, pgdval_t prot) { - set_pgd(pgd, __pgd(__pa(pud) | PUD_TYPE_TABLE)); + set_pgd(pgdp, __pgd(pud | prot)); } +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +{ + __pgd_populate(pgd, __pa(pud), PUD_TYPE_TABLE); +} +#else +static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pud, pgdval_t prot) +{ + BUILD_BUG(); +} #endif /* CONFIG_PGTABLE_LEVELS > 3 */ extern pgd_t *pgd_alloc(struct mm_struct *mm); From d548784c60aa9b5be69218bcbbdaf5f7a767c659 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:07 +0000 Subject: [PATCH 424/607] UPSTREAM: arm64: mm: add functions to walk tables in fixmap As a preparatory step to allow us to allocate early page tables from unmapped memory using memblock_alloc, add new p??_{set,clear}_fixmap* functions which can be used to walk page tables outside of the linear mapping by using fixmap slots. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 961faac114819a01e627fe9c9c82b830bb3849d4) Signed-off-by: Jeff Vander Stoep Change-Id: I58a0f54d2c637ef0fda8e0b9187ade969aecd347 --- arch/arm64/include/asm/fixmap.h | 10 ++++++++++ arch/arm64/include/asm/pgtable.h | 26 ++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index 309704544d22..1a617d46fce9 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -62,6 +62,16 @@ enum fixed_addresses { FIX_BTMAP_END = __end_of_permanent_fixed_addresses, FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1, + + /* + * Used for kernel page table creation, so unmapped memory may be used + * for tables. 
+ */ + FIX_PTE, + FIX_PMD, + FIX_PUD, + FIX_PGD, + __end_of_fixed_addresses }; diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 837fc7b8a9a4..6a58dd48663f 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -57,6 +57,7 @@ #ifndef __ASSEMBLY__ +#include #include extern void __pte_error(const char *file, int line, unsigned long val); @@ -446,6 +447,10 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd) #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) +#define pte_set_fixmap(addr) ((pte_t *)set_fixmap_offset(FIX_PTE, addr)) +#define pte_set_fixmap_offset(pmd, addr) pte_set_fixmap(pte_offset_phys(pmd, addr)) +#define pte_clear_fixmap() clear_fixmap(FIX_PTE) + #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) /* @@ -485,12 +490,21 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) #define pmd_offset_phys(dir, addr) (pud_page_paddr(*(dir)) + pmd_index(addr) * sizeof(pmd_t)) #define pmd_offset(dir, addr) ((pmd_t *)__va(pmd_offset_phys((dir), (addr)))) +#define pmd_set_fixmap(addr) ((pmd_t *)set_fixmap_offset(FIX_PMD, addr)) +#define pmd_set_fixmap_offset(pud, addr) pmd_set_fixmap(pmd_offset_phys(pud, addr)) +#define pmd_clear_fixmap() clear_fixmap(FIX_PMD) + #define pud_page(pud) pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK)) #else #define pud_page_paddr(pud) ({ BUILD_BUG(); 0; }) +/* Match pmd_offset folding in */ +#define pmd_set_fixmap(addr) NULL +#define pmd_set_fixmap_offset(pudp, addr) ((pmd_t *)pudp) +#define pmd_clear_fixmap() + #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -523,12 +537,21 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) #define pud_offset_phys(dir, addr) (pgd_page_paddr(*(dir)) + pud_index(addr) * sizeof(pud_t)) #define pud_offset(dir, addr) ((pud_t *)__va(pud_offset_phys((dir), (addr)))) +#define pud_set_fixmap(addr) ((pud_t *)set_fixmap_offset(FIX_PUD, addr)) +#define pud_set_fixmap_offset(pgd, addr) pud_set_fixmap(pud_offset_phys(pgd, addr)) +#define pud_clear_fixmap() clear_fixmap(FIX_PUD) + #define pgd_page(pgd) pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK)) #else #define pgd_page_paddr(pgd) ({ BUILD_BUG(); 0;}) +/* Match pud_offset folding in */ +#define pud_set_fixmap(addr) NULL +#define pud_set_fixmap_offset(pgdp, addr) ((pud_t *)pgdp) +#define pud_clear_fixmap() + #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) @@ -543,6 +566,9 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) /* to find an entry in a kernel page-table-directory */ #define pgd_offset_k(addr) pgd_offset(&init_mm, addr) +#define pgd_set_fixmap(addr) ((pgd_t *)set_fixmap_offset(FIX_PGD, addr)) +#define pgd_clear_fixmap() clear_fixmap(FIX_PGD) + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY | From 7739d1799b628e8a52402f25034c960ee2e27a9a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:08 +0000 Subject: [PATCH 425/607] UPSTREAM: arm64: mm: use fixmap when creating page tables As a preparatory step to allow us to allocate early page tables from unmapped memory using memblock_alloc, modify the __create_mapping callees to map and unmap the tables they modify using fixmap entries. All but the top-level pgd initialisation is performed via the fixmap. Subsequent patches will inject the pgd physical address, and migrate to using the FIX_PGD slot. 
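As a minimal sketch of the pattern this enables (the helper name below is
hypothetical; the macros are those introduced in the previous patch, and the
same sequence appears in alloc_init_pte() in the hunk that follows), a table
is mapped through its fixmap slot only for the duration of the update:

    /* Illustrative only: write one pte via the FIX_PTE slot. */
    static void example_set_one_pte(pmd_t *pmd, unsigned long addr,
                                    unsigned long pfn, pgprot_t prot)
    {
            pte_t *pte = pte_set_fixmap_offset(pmd, addr);  /* temporary VA */

            set_pte(pte, pfn_pte(pfn, prot));

            pte_clear_fixmap();                             /* drop the mapping */
    }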
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit f4710445458c0a1bd1c3c014ada2e7d7dc7b882f) Signed-off-by: Jeff Vander Stoep Change-Id: I71d2fb02d009915b5aab7f3467c4a3c658e898de --- arch/arm64/mm/mmu.c | 61 ++++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index fa2471ac191a..d7fa7f52f446 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -63,19 +63,30 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, } EXPORT_SYMBOL(phys_mem_access_prot); -static void __init *early_pgtable_alloc(void) +static phys_addr_t __init early_pgtable_alloc(void) { phys_addr_t phys; void *ptr; phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE); BUG_ON(!phys); - ptr = __va(phys); + + /* + * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE + * slot will be free, so we can (ab)use the FIX_PTE slot to initialise + * any level of table. + */ + ptr = pte_set_fixmap(phys); + memset(ptr, 0, PAGE_SIZE); - /* Ensure the zeroed page is visible to the page table walker */ - dsb(ishst); - return ptr; + /* + * Implicit barriers also ensure the zeroed page is visible to the page + * table walker + */ + pte_clear_fixmap(); + + return phys; } /* @@ -99,24 +110,28 @@ static void split_pmd(pmd_t *pmd, pte_t *pte) static void alloc_init_pte(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot, - void *(*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void)) { pte_t *pte; if (pmd_none(*pmd) || pmd_sect(*pmd)) { - pte = pgtable_alloc(); + phys_addr_t pte_phys = pgtable_alloc(); + pte = pte_set_fixmap(pte_phys); if (pmd_sect(*pmd)) split_pmd(pmd, pte); - __pmd_populate(pmd, __pa(pte), PMD_TYPE_TABLE); + __pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE); flush_tlb_all(); + pte_clear_fixmap(); } BUG_ON(pmd_bad(*pmd)); - pte = pte_offset_kernel(pmd, addr); + pte = pte_set_fixmap_offset(pmd, addr); do { set_pte(pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); + + pte_clear_fixmap(); } static void split_pud(pud_t *old_pud, pmd_t *pmd) @@ -134,7 +149,7 @@ static void split_pud(pud_t *old_pud, pmd_t *pmd) static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - void *(*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void)) { pmd_t *pmd; unsigned long next; @@ -143,7 +158,8 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, * Check for initial section mappings in the pgd/pud and remove them. 
*/ if (pud_none(*pud) || pud_sect(*pud)) { - pmd = pgtable_alloc(); + phys_addr_t pmd_phys = pgtable_alloc(); + pmd = pmd_set_fixmap(pmd_phys); if (pud_sect(*pud)) { /* * need to have the 1G of mappings continue to be @@ -151,12 +167,13 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, */ split_pud(pud, pmd); } - pud_populate(mm, pud, pmd); + __pud_populate(pud, pmd_phys, PUD_TYPE_TABLE); flush_tlb_all(); + pmd_clear_fixmap(); } BUG_ON(pud_bad(*pud)); - pmd = pmd_offset(pud, addr); + pmd = pmd_set_fixmap_offset(pud, addr); do { next = pmd_addr_end(addr, end); /* try section mapping first */ @@ -182,6 +199,8 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, } phys += next - addr; } while (pmd++, addr = next, addr != end); + + pmd_clear_fixmap(); } static inline bool use_1G_block(unsigned long addr, unsigned long next, @@ -199,18 +218,18 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - void *(*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void)) { pud_t *pud; unsigned long next; if (pgd_none(*pgd)) { - pud = pgtable_alloc(); - pgd_populate(mm, pgd, pud); + phys_addr_t pud_phys = pgtable_alloc(); + __pgd_populate(pgd, pud_phys, PUD_TYPE_TABLE); } BUG_ON(pgd_bad(*pgd)); - pud = pud_offset(pgd, addr); + pud = pud_set_fixmap_offset(pgd, addr); do { next = pud_addr_end(addr, end); @@ -243,6 +262,8 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, } phys += next - addr; } while (pud++, addr = next, addr != end); + + pud_clear_fixmap(); } /* @@ -252,7 +273,7 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - void *(*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void)) { unsigned long addr, length, end, next; @@ -275,14 +296,14 @@ static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, } while (pgd++, addr = next, addr != end); } -static void *late_pgtable_alloc(void) +static phys_addr_t late_pgtable_alloc(void) { void *ptr = (void *)__get_free_page(PGALLOC_GFP); BUG_ON(!ptr); /* Ensure the zeroed page is visible to the page table walker */ dsb(ishst); - return ptr; + return __pa(ptr); } static void __init create_mapping(phys_addr_t phys, unsigned long virt, From caf0cdb7ae7d5b5a4da584ff37228c3f26a55aa0 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:09 +0000 Subject: [PATCH 426/607] UPSTREAM: arm64: mm: allocate pagetables anywhere Now that create_mapping uses fixmap slots to modify pte, pmd, and pud entries, we can access page tables anywhere in physical memory, regardless of the extent of the linear mapping. Given that, we no longer need to limit memblock allocations during page table creation, and can leave the limit as its default MEMBLOCK_ALLOC_ANYWHERE. We never add memory which will fall outside of the linear map range given phys_offset and MAX_MEMBLOCK_ADDR are configured appropriately, so any tables we create will fall in the linear map of the final tables. 
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit cdef5f6e9e0e5ee397759b664a9f875ff59ccf01) Signed-off-by: Jeff Vander Stoep Change-Id: I9b6487491f47351303a147147c3d274da20def59 --- arch/arm64/mm/mmu.c | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index d7fa7f52f446..e41f30b35928 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -383,20 +383,6 @@ static void __init __map_memblock(phys_addr_t start, phys_addr_t end) static void __init map_mem(void) { struct memblock_region *reg; - phys_addr_t limit; - - /* - * Temporarily limit the memblock range. We need to do this as - * create_mapping requires puds, pmds and ptes to be allocated from - * memory addressable from the initial direct kernel mapping. - * - * The initial direct kernel mapping, located at swapper_pg_dir, gives - * us PUD_SIZE (with SECTION maps) or PMD_SIZE (without SECTION maps, - * memory starting from PHYS_OFFSET (which must be aligned to 2MB as - * per Documentation/arm64/booting.txt). - */ - limit = PHYS_OFFSET + SWAPPER_INIT_MAP_SIZE; - memblock_set_current_limit(limit); /* map all the memory banks */ for_each_memblock(memory, reg) { @@ -406,29 +392,8 @@ static void __init map_mem(void) if (start >= end) break; - if (ARM64_SWAPPER_USES_SECTION_MAPS) { - /* - * For the first memory bank align the start address and - * current memblock limit to prevent create_mapping() from - * allocating pte page tables from unmapped memory. With - * the section maps, if the first block doesn't end on section - * size boundary, create_mapping() will try to allocate a pte - * page, which may be returned from an unmapped area. - * When section maps are not used, the pte page table for the - * current limit is already present in swapper_pg_dir. - */ - if (start < limit) - start = ALIGN(start, SECTION_SIZE); - if (end < limit) { - limit = end & SECTION_MASK; - memblock_set_current_limit(limit); - } - } __map_memblock(start, end); } - - /* Limit no longer required. */ - memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); } static void __init fixup_executable(void) From 880bee2ed6ea6051d1c387a8eb2cd81140b2ec64 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:10 +0000 Subject: [PATCH 427/607] UPSTREAM: arm64: mm: allow passing a pgdir to alloc_init_* To allow us to initialise pgdirs which are fixmapped, allow explicitly passing a pgdir rather than an mm. A new __create_pgd_mapping function is added for this, with existing __create_mapping callers migrated to this. The mm argument was previously only used at the top level. Now that it is redundant at all levels, it is removed. To indicate its new found similarity to alloc_init_{pud,pmd,pte}, __create_mapping is renamed to init_pgd. 
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 11509a306bb6ea595878b2d246d2d56b1783e040) Signed-off-by: Jeff Vander Stoep Change-Id: I554f90b986a0fce6c96c0a0a9da1d6e61602d0a9 --- arch/arm64/mm/mmu.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index e41f30b35928..f949fe238643 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -146,8 +146,7 @@ static void split_pud(pud_t *old_pud, pmd_t *pmd) } while (pmd++, i++, i < PTRS_PER_PMD); } -static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, - unsigned long addr, unsigned long end, +static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void)) { @@ -215,8 +214,7 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, return true; } -static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, - unsigned long addr, unsigned long end, +static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void)) { @@ -257,7 +255,7 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, } } } else { - alloc_init_pmd(mm, pud, addr, next, phys, prot, + alloc_init_pmd(pud, addr, next, phys, prot, pgtable_alloc); } phys += next - addr; @@ -270,8 +268,7 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, * Create the page directory entries and any necessary page tables for the * mapping specified by 'md'. 
*/ -static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, - phys_addr_t phys, unsigned long virt, +static void init_pgd(pgd_t *pgd, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void)) { @@ -291,7 +288,7 @@ static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, end = addr + length; do { next = pgd_addr_end(addr, end); - alloc_init_pud(mm, pgd, addr, next, phys, prot, pgtable_alloc); + alloc_init_pud(pgd, addr, next, phys, prot, pgtable_alloc); phys += next - addr; } while (pgd++, addr = next, addr != end); } @@ -306,6 +303,14 @@ static phys_addr_t late_pgtable_alloc(void) return __pa(ptr); } +static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, + unsigned long virt, phys_addr_t size, + pgprot_t prot, + phys_addr_t (*alloc)(void)) +{ + init_pgd(pgd_offset_raw(pgdir, virt), phys, virt, size, prot, alloc); +} + static void __init create_mapping(phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot) { @@ -314,16 +319,16 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt, &phys, virt); return; } - __create_mapping(&init_mm, pgd_offset_k(virt), phys, virt, - size, prot, early_pgtable_alloc); + __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, + early_pgtable_alloc); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot) { - __create_mapping(mm, pgd_offset(mm, virt), phys, virt, size, prot, - late_pgtable_alloc); + __create_pgd_mapping(mm->pgd, phys, virt, size, prot, + late_pgtable_alloc); } static void create_mapping_late(phys_addr_t phys, unsigned long virt, @@ -335,8 +340,8 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, return; } - return __create_mapping(&init_mm, pgd_offset_k(virt), - phys, virt, size, prot, late_pgtable_alloc); + __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, + late_pgtable_alloc); } #ifdef CONFIG_DEBUG_RODATA From 9f31360553319ee4e8fc3a1c97ef4788c024d6de Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:11 +0000 Subject: [PATCH 428/607] BACKPORT: arm64: ensure _stext and _etext are page-aligned Currently we have separate ALIGN_DEBUG_RO{,_MIN} directives to align _etext and __init_begin. While we ensure that __init_begin is page-aligned, we do not provide the same guarantee for _etext. This is not problematic currently as the alignment of __init_begin is sufficient to prevent issues when we modify permissions. Subsequent patches will assume page alignment of segments of the kernel we wish to map with different permissions. To ensure this, move _etext after the ALIGN_DEBUG_RO_MIN for the init section. This renders the prior ALIGN_DEBUG_RO irrelevant, and hence it is removed. Likewise, upgrade to ALIGN_DEBUG_RO_MIN(PAGE_SIZE) for _stext. 
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit fca082bfb543ccaaff864fc0892379ccaa1711cd) Signed-off-by: Jeff Vander Stoep Change-Id: If1a829aa5c363f2c76eb1a18209c3c00ee3ffd0a --- arch/arm64/kernel/vmlinux.lds.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 1a056bb26633..763e5aab5752 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -95,7 +95,7 @@ SECTIONS _text = .; HEAD_TEXT } - ALIGN_DEBUG_RO + ALIGN_DEBUG_RO_MIN(PAGE_SIZE) .text : { /* Real text segment */ _stext = .; /* Text and read-only data */ __exception_text_start = .; @@ -118,9 +118,9 @@ SECTIONS RO_DATA(PAGE_SIZE) EXCEPTION_TABLE(8) NOTES - ALIGN_DEBUG_RO ALIGN_DEBUG_RO_MIN(PAGE_SIZE) + _etext = .; /* End of text and rodata section */ __init_begin = .; INIT_TEXT_SECTION(8) From 63048c3fad6dd5219008c7d842cf1a5e4bac5508 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:12 +0000 Subject: [PATCH 429/607] BACKPORT: arm64: mm: create new fine-grained mappings at boot At boot we may change the granularity of the tables mapping the kernel (by splitting or making sections). This may happen when we create the linear mapping (in __map_memblock), or at any point we try to apply fine-grained permissions to the kernel (e.g. fixup_executable, mark_rodata_ro, fixup_init). Changing the active page tables in this manner may result in multiple entries for the same address being allocated into TLBs, risking problems such as TLB conflict aborts or issues derived from the amalgamation of TLB entries. Generally, a break-before-make (BBM) approach is necessary to avoid conflicts, but we cannot do this for the kernel tables as it risks unmapping text or data being used to do so. Instead, we can create a new set of tables from scratch in the safety of the existing mappings, and subsequently migrate over to these using the new cpu_replace_ttbr1 helper, which avoids the two sets of tables being active simultaneously. To avoid issues when we later modify permissions of the page tables (e.g. in fixup_init), we must create the page tables at a granularity such that later modification does not result in splitting of tables. This patch applies this strategy, creating a new set of fine-grained page tables from scratch, and safely migrating to them. The existing fixmap and kasan shadow page tables are reused in the new fine-grained tables. 
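Condensed for illustration (the complete sequence is in the paging_init() hunk
below; nothing here is new code), the migration is roughly:

    phys_addr_t pgd_phys = early_pgtable_alloc();
    pgd_t *pgd = pgd_set_fixmap(pgd_phys);    /* build in a scratch pgd */

    map_kernel(pgd);                          /* fine-grained kernel mappings */
    map_mem(pgd);                             /* linear map, skipping the kernel image */

    cpu_replace_ttbr1(__va(pgd_phys));        /* run from the scratch tables */
    memcpy(swapper_pg_dir, pgd, PAGE_SIZE);   /* copy the top level back */
    cpu_replace_ttbr1(swapper_pg_dir);        /* resume on swapper_pg_dir */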
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Cc: Andrey Ryabinin Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 068a17a5805dfbca4bbf03e664ca6b19709cc7a8) Signed-off-by: Jeff Vander Stoep Change-Id: I7ddaa296b94106846ebf8392d91186df8673df64 --- arch/arm64/include/asm/kasan.h | 3 + arch/arm64/mm/kasan_init.c | 15 ++++ arch/arm64/mm/mmu.c | 154 ++++++++++++++++++++------------- 3 files changed, 110 insertions(+), 62 deletions(-) diff --git a/arch/arm64/include/asm/kasan.h b/arch/arm64/include/asm/kasan.h index 2774fa384c47..de0d21211c34 100644 --- a/arch/arm64/include/asm/kasan.h +++ b/arch/arm64/include/asm/kasan.h @@ -7,6 +7,7 @@ #include #include +#include /* * KASAN_SHADOW_START: beginning of the kernel virtual addresses. @@ -28,10 +29,12 @@ #define KASAN_SHADOW_OFFSET (KASAN_SHADOW_END - (1ULL << (64 - 3))) void kasan_init(void); +void kasan_copy_shadow(pgd_t *pgdir); asmlinkage void kasan_early_init(void); #else static inline void kasan_init(void) { } +static inline void kasan_copy_shadow(pgd_t *pgdir) { } #endif #endif diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 263b59020500..cc569a38bc76 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -97,6 +97,21 @@ asmlinkage void __init kasan_early_init(void) kasan_map_early_shadow(); } +/* + * Copy the current shadow region into a new pgdir. + */ +void __init kasan_copy_shadow(pgd_t *pgdir) +{ + pgd_t *pgd, *pgd_new, *pgd_end; + + pgd = pgd_offset_k(KASAN_SHADOW_START); + pgd_end = pgd_offset_k(KASAN_SHADOW_END); + pgd_new = pgd_offset_raw(pgdir, KASAN_SHADOW_START); + do { + set_pgd(pgd_new, *pgd); + } while (pgd++, pgd_new++, pgd != pgd_end); +} + static void __init clear_pgds(unsigned long start, unsigned long end) { diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index f949fe238643..6f5770a1b140 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -344,48 +345,42 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, late_pgtable_alloc); } -#ifdef CONFIG_DEBUG_RODATA -static void __init __map_memblock(phys_addr_t start, phys_addr_t end) +static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) { + + unsigned long kernel_start = __pa(_stext); + unsigned long kernel_end = __pa(_end); + /* - * Set up the executable regions using the existing section mappings - * for now. This will get more fine grained later once all memory - * is mapped + * The kernel itself is mapped at page granularity. Map all other + * memory, making sure we don't overwrite the existing kernel mappings. 
*/ - unsigned long kernel_x_start = round_down(__pa(_stext), SWAPPER_BLOCK_SIZE); - unsigned long kernel_x_end = round_up(__pa(__init_end), SWAPPER_BLOCK_SIZE); - if (end < kernel_x_start) { - create_mapping(start, __phys_to_virt(start), - end - start, PAGE_KERNEL); - } else if (start >= kernel_x_end) { - create_mapping(start, __phys_to_virt(start), - end - start, PAGE_KERNEL); - } else { - if (start < kernel_x_start) - create_mapping(start, __phys_to_virt(start), - kernel_x_start - start, - PAGE_KERNEL); - create_mapping(kernel_x_start, - __phys_to_virt(kernel_x_start), - kernel_x_end - kernel_x_start, - PAGE_KERNEL_EXEC); - if (kernel_x_end < end) - create_mapping(kernel_x_end, - __phys_to_virt(kernel_x_end), - end - kernel_x_end, - PAGE_KERNEL); + /* No overlap with the kernel. */ + if (end < kernel_start || start >= kernel_end) { + __create_pgd_mapping(pgd, start, __phys_to_virt(start), + end - start, PAGE_KERNEL, + early_pgtable_alloc); + return; } -} -#else -static void __init __map_memblock(phys_addr_t start, phys_addr_t end) -{ - create_mapping(start, __phys_to_virt(start), end - start, - PAGE_KERNEL_EXEC); -} -#endif -static void __init map_mem(void) + /* + * This block overlaps the kernel mapping. Map the portion(s) which + * don't overlap. + */ + if (start < kernel_start) + __create_pgd_mapping(pgd, start, + __phys_to_virt(start), + kernel_start - start, PAGE_KERNEL, + early_pgtable_alloc); + if (kernel_end < end) + __create_pgd_mapping(pgd, kernel_end, + __phys_to_virt(kernel_end), + end - kernel_end, PAGE_KERNEL, + early_pgtable_alloc); +} + +static void __init map_mem(pgd_t *pgd) { struct memblock_region *reg; @@ -397,33 +392,10 @@ static void __init map_mem(void) if (start >= end) break; - __map_memblock(start, end); + __map_memblock(pgd, start, end); } } -static void __init fixup_executable(void) -{ -#ifdef CONFIG_DEBUG_RODATA - /* now that we are actually fully mapped, make the start/end more fine grained */ - if (!IS_ALIGNED((unsigned long)_stext, SWAPPER_BLOCK_SIZE)) { - unsigned long aligned_start = round_down(__pa(_stext), - SWAPPER_BLOCK_SIZE); - - create_mapping(aligned_start, __phys_to_virt(aligned_start), - __pa(_stext) - aligned_start, - PAGE_KERNEL); - } - - if (!IS_ALIGNED((unsigned long)__init_end, SWAPPER_BLOCK_SIZE)) { - unsigned long aligned_end = round_up(__pa(__init_end), - SWAPPER_BLOCK_SIZE); - create_mapping(__pa(__init_end), (unsigned long)__init_end, - aligned_end - __pa(__init_end), - PAGE_KERNEL); - } -#endif -} - #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void) { @@ -441,14 +413,72 @@ void fixup_init(void) PAGE_KERNEL); } +static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, + pgprot_t prot) +{ + phys_addr_t pa_start = __pa(va_start); + unsigned long size = va_end - va_start; + + BUG_ON(!PAGE_ALIGNED(pa_start)); + BUG_ON(!PAGE_ALIGNED(size)); + + __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot, + early_pgtable_alloc); +} + +/* + * Create fine-grained mappings for the kernel. + */ +static void __init map_kernel(pgd_t *pgd) +{ + + map_kernel_chunk(pgd, _stext, _etext, PAGE_KERNEL_EXEC); + map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC); + map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL); + + /* + * The fixmap falls in a separate pgd to the kernel, and doesn't live + * in the carveout for the swapper_pg_dir. We can simply re-use the + * existing dir for the fixmap. 
+ */ + set_pgd(pgd_offset_raw(pgd, FIXADDR_START), *pgd_offset_k(FIXADDR_START)); + + kasan_copy_shadow(pgd); +} + /* * paging_init() sets up the page tables, initialises the zone memory * maps and sets up the zero page. */ void __init paging_init(void) { - map_mem(); - fixup_executable(); + phys_addr_t pgd_phys = early_pgtable_alloc(); + pgd_t *pgd = pgd_set_fixmap(pgd_phys); + + map_kernel(pgd); + map_mem(pgd); + + /* + * We want to reuse the original swapper_pg_dir so we don't have to + * communicate the new address to non-coherent secondaries in + * secondary_entry, and so cpu_switch_mm can generate the address with + * adrp+add rather than a load from some global variable. + * + * To do this we need to go via a temporary pgd. + */ + cpu_replace_ttbr1(__va(pgd_phys)); + memcpy(swapper_pg_dir, pgd, PAGE_SIZE); + cpu_replace_ttbr1(swapper_pg_dir); + + pgd_clear_fixmap(); + memblock_free(pgd_phys, PAGE_SIZE); + + /* + * We only reuse the PGD from the swapper_pg_dir, not the pud + pmd + * allocated with it. + */ + memblock_free(__pa(swapper_pg_dir) + PAGE_SIZE, + SWAPPER_DIR_SIZE - PAGE_SIZE); bootmem_init(); } From 17e35cbf1863e5cf74425594056ef28b18f762bf Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Tue, 23 Aug 2016 18:22:33 -0400 Subject: [PATCH 430/607] UPSTREAM: tun: fix transmit timestamp support Instead of using sock_tx_timestamp, use skb_tx_timestamp to record software transmit timestamp of a packet. sock_tx_timestamp resets and overrides the tx_flags of the skb. The function is intended to be called from within the protocol layer when creating the skb, not from a device driver. This is inconsistent with other drivers and will cause issues for TCP. In TCP, we intend to sample the timestamps for the last byte for each sendmsg/sendpage. For that reason, tcp_sendmsg calls tcp_tx_timestamp only with the last skb that it generates. For example, if a 128KB message is split into two 64KB packets we want to sample the SND timestamp of the last packet. The current code in the tun driver, however, will result in sampling the SND timestamp for both packets. Also, when the last packet is split into smaller packets for retranmission (see tcp_fragment), the tun driver will record timestamps for all of the retransmitted packets and not only the last packet. Change-Id: If7458ab31de52aa15a12364b6c1ac2a8f93f17a7 Fixes: eda297729171 (tun: Support software transmit time stamping.) Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Francis Yan Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/tun.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 2228a539184a..7b4c65effe5e 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -858,10 +858,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) goto drop; - if (skb->sk && sk_fullsock(skb->sk)) { - sock_tx_timestamp(skb->sk, &skb_shinfo(skb)->tx_flags); - sw_tx_timestamp(skb); - } + skb_tx_timestamp(skb); /* Orphan the skb - required as we might hang on to it * for indefinite time. From fa42b29f530e15078b4e3070fd62e310211ee3de Mon Sep 17 00:00:00 2001 From: Eric Ernst Date: Fri, 2 Sep 2016 16:12:06 -0700 Subject: [PATCH 431/607] input: keyreset: switch to orderly_reboot Prior restart function would make a call to sys_sync and then execute a kernel reset. 
Rather than call the sync directly, thus necessitating this driver to be builtin, call orderly_reboot, which will take care of the file system sync. Note: since CONFIG_INPUT Kconfig is tristate, this driver can be built as module, despite being marked bool. Signed-off-by: Eric Ernst --- drivers/input/keyreset.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/input/keyreset.c b/drivers/input/keyreset.c index 7fbf7247e65f..7e5222aec7c1 100644 --- a/drivers/input/keyreset.c +++ b/drivers/input/keyreset.c @@ -32,8 +32,7 @@ struct keyreset_state { static void do_restart(struct work_struct *unused) { - sys_sync(); - kernel_restart(NULL); + orderly_reboot(); } static void do_reset_fn(void *priv) From 5449876ad748ec9eae64267f3d7aab93c82a464d Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Mon, 19 Sep 2016 14:37:34 -0700 Subject: [PATCH 432/607] eas/sched/fair: Fixing comments in find_best_target. Change-Id: I83f5b9887e98f9fdb81318cde45408e7ebfc4b13 Signed-off-by: Srinath Sridharan --- kernel/sched/fair.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2cd3e2671f16..7e42cd11e430 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5674,17 +5674,19 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre if (new_util < cur_capacity) { if (cpu_rq(i)->nr_running) { - if(prefer_idle) { - // Find a target cpu with lowest - // utilization. + if (prefer_idle) { + /* Find a target cpu with highest + * utilization. + */ if (target_util == 0 || target_util < new_util) { target_cpu = i; target_util = new_util; } } else { - // Find a target cpu with highest - // utilization. + /* Find a target cpu with lowest + * utilization. + */ if (target_util == 0 || target_util > new_util) { target_cpu = i; From b7c491d2c40fde253ec06a63b3f93ea60c0467a9 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Sun, 18 Sep 2016 21:39:28 -0700 Subject: [PATCH 433/607] ANDROID: fiq_debugger: Pass task parameter to unwind_frame() Fixes: fe13f95b7200 ("arm64: pass a task parameter to unwind_frame()") Bug: 30369029 Patchset: rework-pagetable Change-Id: I9a4ab50ef61532d27282f189f063c938c196ec08 Signed-off-by: Jeff Vander Stoep --- drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c b/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c index 99c6584fcfa5..97246bcbcd62 100644 --- a/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c +++ b/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c @@ -197,6 +197,6 @@ void fiq_debugger_dump_stacktrace(struct fiq_debugger_output *output, frame.sp = regs->sp; frame.pc = regs->pc; output->printf(output, "\n"); - walk_stackframe(&frame, report_trace, &sts); + walk_stackframe(current, &frame, report_trace, &sts); } } From cf43809d7aa07868ffacd83892745c85f91c8a53 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 20 Sep 2016 17:00:47 +0100 Subject: [PATCH 434/607] sched/walt: Drop arch-specific timer access On at least one platform, occasionally the timer providing the wallclock was able to be reset/go backwards for at least some time after wakeup. Accept that this might happen and warn the first time, but otherwise just carry on. 
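In short (a condensed view of the hunk below, not additional logic), a
negative delta is now clamped and reported once instead of hitting BUG_ON():

    delta = wallclock - rq->window_start;
    if (delta < 0) {
            delta = 0;
            WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n");
    }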
Change-Id: Id3164477ba79049561af7f0889cbeebc199ead4e Signed-off-by: Chris Redpath --- kernel/sched/walt.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 07b7f84b37e2..2ffb1680b380 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -22,7 +22,6 @@ #include #include #include -#include #include "sched.h" #include "walt.h" @@ -188,10 +187,8 @@ update_window_start(struct rq *rq, u64 wallclock) delta = wallclock - rq->window_start; /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */ if (delta < 0) { - if (arch_timer_read_counter() == 0) - delta = 0; - else - BUG_ON(1); + delta = 0; + WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n"); } if (delta < walt_ravg_window) From 32cbbe59538d2dd0b77822cc27ce32ca487c467d Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Mon, 19 Sep 2016 17:33:50 -0700 Subject: [PATCH 435/607] ANDROID: fs: FS tracepoints to track IO. Adds tracepoints in ext4/f2fs/mpage to track readpages/buffered write()s. This allows us to track files that are being read/written to PIDs. Change-Id: I26bd36f933108927d6903da04d8cb42fd9c3ef3d Signed-off-by: Mohan Srinivasan --- fs/ext4/inline.c | 6 ++ fs/ext4/inode.c | 27 ++++++++ fs/ext4/readpage.c | 41 +++++++++-- fs/f2fs/data.c | 21 ++++++ fs/f2fs/inline.c | 11 +++ fs/mpage.c | 30 ++++++++ include/trace/events/android_fs.h | 31 +++++++++ include/trace/events/android_fs_template.h | 79 ++++++++++++++++++++++ 8 files changed, 242 insertions(+), 4 deletions(-) create mode 100644 include/trace/events/android_fs.h create mode 100644 include/trace/events/android_fs_template.h diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d884989cc83d..af34979684a4 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -18,6 +18,7 @@ #include "ext4.h" #include "xattr.h" #include "truncate.h" +#include #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) @@ -500,6 +501,9 @@ int ext4_readpage_inline(struct inode *inode, struct page *page) return -EAGAIN; } + trace_android_fs_dataread_start(inode, page_offset(page), PAGE_SIZE, + current->pid, current->comm); + /* * Current inline data can only exist in the 1st page, * So for all the other pages, just set them uptodate. 
@@ -511,6 +515,8 @@ int ext4_readpage_inline(struct inode *inode, struct page *page) SetPageUptodate(page); } + trace_android_fs_dataread_end(inode, page_offset(page), PAGE_SIZE); + up_read(&EXT4_I(inode)->xattr_sem); unlock_page(page); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ea433a7f4bca..a7208a662ee6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -44,6 +44,7 @@ #include "truncate.h" #include +#include #define MPAGE_DA_EXTENT_TAIL 0x01 @@ -982,6 +983,8 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; unsigned from, to; + trace_android_fs_datawrite_start(inode, pos, len, + current->pid, current->comm); trace_ext4_write_begin(inode, pos, len, flags); /* * Reserve one block more for addition to orphan list in case @@ -1118,6 +1121,7 @@ static int ext4_write_end(struct file *file, int ret = 0, ret2; int i_size_changed = 0; + trace_android_fs_datawrite_end(inode, pos, len); trace_ext4_write_end(inode, pos, len, copied); if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { ret = ext4_jbd2_file_inode(handle, inode); @@ -1226,6 +1230,7 @@ static int ext4_journalled_write_end(struct file *file, unsigned from, to; int size_changed = 0; + trace_android_fs_datawrite_end(inode, pos, len); trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -2670,6 +2675,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, len, flags, pagep, fsdata); } *fsdata = (void *)0; + trace_android_fs_datawrite_start(inode, pos, len, + current->pid, current->comm); trace_ext4_da_write_begin(inode, pos, len, flags); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { @@ -2788,6 +2795,7 @@ static int ext4_da_write_end(struct file *file, return ext4_write_end(file, mapping, pos, len, copied, page, fsdata); + trace_android_fs_datawrite_end(inode, pos, len); trace_ext4_da_write_end(inode, pos, len, copied); start = pos & (PAGE_CACHE_SIZE - 1); end = start + copied - 1; @@ -3276,12 +3284,31 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (ext4_has_inline_data(inode)) return 0; + if (trace_android_fs_dataread_start_enabled() && + (iov_iter_rw(iter) == READ)) + trace_android_fs_dataread_start(inode, offset, count, + current->pid, + current->comm); + if (trace_android_fs_datawrite_start_enabled() && + (iov_iter_rw(iter) == WRITE)) + trace_android_fs_datawrite_start(inode, offset, count, + current->pid, + current->comm); + trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_direct_IO(iocb, iter, offset); else ret = ext4_ind_direct_IO(iocb, iter, offset); trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); + + if (trace_android_fs_dataread_start_enabled() && + (iov_iter_rw(iter) == READ)) + trace_android_fs_dataread_end(inode, offset, count); + if (trace_android_fs_datawrite_start_enabled() && + (iov_iter_rw(iter) == WRITE)) + trace_android_fs_datawrite_end(inode, offset, count); + return ret; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 5dc5e95063de..1ce24a6759a0 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -45,6 +45,7 @@ #include #include "ext4.h" +#include /* * Call ext4_decrypt on every single page, reusing the encryption @@ -86,6 +87,17 @@ static inline bool ext4_bio_encrypted(struct bio *bio) #endif } +static void +ext4_trace_read_completion(struct bio *bio) +{ + struct page *first_page = 
bio->bi_io_vec[0].bv_page; + + if (first_page != NULL) + trace_android_fs_dataread_end(first_page->mapping->host, + page_offset(first_page), + bio->bi_iter.bi_size); +} + /* * I/O completion handler for multipage BIOs. * @@ -103,6 +115,9 @@ static void mpage_end_io(struct bio *bio) struct bio_vec *bv; int i; + if (trace_android_fs_dataread_start_enabled()) + ext4_trace_read_completion(bio); + if (ext4_bio_encrypted(bio)) { struct ext4_crypto_ctx *ctx = bio->bi_private; @@ -130,6 +145,24 @@ static void mpage_end_io(struct bio *bio) bio_put(bio); } +static void +ext4_submit_bio_read(struct bio *bio) +{ + if (trace_android_fs_dataread_start_enabled()) { + struct page *first_page = bio->bi_io_vec[0].bv_page; + + if (first_page != NULL) { + trace_android_fs_dataread_start( + first_page->mapping->host, + page_offset(first_page), + bio->bi_iter.bi_size, + current->pid, + current->comm); + } + } + submit_bio(READ, bio); +} + int ext4_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, unsigned nr_pages) @@ -271,7 +304,7 @@ int ext4_mpage_readpages(struct address_space *mapping, */ if (bio && (last_block_in_bio != blocks[0] - 1)) { submit_and_realloc: - submit_bio(READ, bio); + ext4_submit_bio_read(bio); bio = NULL; } if (bio == NULL) { @@ -303,14 +336,14 @@ int ext4_mpage_readpages(struct address_space *mapping, if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || (first_hole != blocks_per_page)) { - submit_bio(READ, bio); + ext4_submit_bio_read(bio); bio = NULL; } else last_block_in_bio = blocks[blocks_per_page - 1]; goto next_page; confused: if (bio) { - submit_bio(READ, bio); + ext4_submit_bio_read(bio); bio = NULL; } if (!PageUptodate(page)) @@ -323,6 +356,6 @@ int ext4_mpage_readpages(struct address_space *mapping, } BUG_ON(pages && !list_empty(pages)); if (bio) - submit_bio(READ, bio); + ext4_submit_bio_read(bio); return 0; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 972eab7ac071..e692958d6e78 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -26,6 +26,7 @@ #include "segment.h" #include "trace.h" #include +#include static void f2fs_read_end_io(struct bio *bio) { @@ -1401,6 +1402,8 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct dnode_of_data dn; int err = 0; + trace_android_fs_datawrite_start(inode, pos, len, + current->pid, current->comm); trace_f2fs_write_begin(inode, pos, len, flags); f2fs_balance_fs(sbi); @@ -1529,6 +1532,7 @@ static int f2fs_write_end(struct file *file, { struct inode *inode = page->mapping->host; + trace_android_fs_datawrite_end(inode, pos, len); trace_f2fs_write_end(inode, pos, len, copied); set_page_dirty(page); @@ -1582,6 +1586,16 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); + if (trace_android_fs_dataread_start_enabled() && + (iov_iter_rw(iter) == READ)) + trace_android_fs_dataread_start(inode, offset, + count, current->pid, + current->comm); + if (trace_android_fs_datawrite_start_enabled() && + (iov_iter_rw(iter) == WRITE)) + trace_android_fs_datawrite_start(inode, offset, count, + current->pid, current->comm); + if (iov_iter_rw(iter) == WRITE) { __allocate_data_blocks(inode, offset, count); if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { @@ -1595,6 +1609,13 @@ out: if (err < 0 && iov_iter_rw(iter) == WRITE) f2fs_write_failed(mapping, offset + count); + if (trace_android_fs_dataread_start_enabled() && + (iov_iter_rw(iter) == READ)) + 
trace_android_fs_dataread_end(inode, offset, count); + if (trace_android_fs_datawrite_start_enabled() && + (iov_iter_rw(iter) == WRITE)) + trace_android_fs_datawrite_end(inode, offset, count); + trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err); return err; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index bda7126466c0..d2c5d69ba0b1 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -13,6 +13,7 @@ #include "f2fs.h" #include "node.h" +#include bool f2fs_may_inline_data(struct inode *inode) { @@ -84,14 +85,22 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) { struct page *ipage; + trace_android_fs_dataread_start(inode, page_offset(page), + PAGE_SIZE, current->pid, + current->comm); + ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) { + trace_android_fs_dataread_end(inode, page_offset(page), + PAGE_SIZE); unlock_page(page); return PTR_ERR(ipage); } if (!f2fs_has_inline_data(inode)) { f2fs_put_page(ipage, 1); + trace_android_fs_dataread_end(inode, page_offset(page), + PAGE_SIZE); return -EAGAIN; } @@ -102,6 +111,8 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) SetPageUptodate(page); f2fs_put_page(ipage, 1); + trace_android_fs_dataread_end(inode, page_offset(page), + PAGE_SIZE); unlock_page(page); return 0; } diff --git a/fs/mpage.c b/fs/mpage.c index 1480d3a18037..5c65d8942692 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -30,6 +30,14 @@ #include #include "internal.h" +#define CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL(android_fs_datawrite_start); +EXPORT_TRACEPOINT_SYMBOL(android_fs_datawrite_end); +EXPORT_TRACEPOINT_SYMBOL(android_fs_dataread_start); +EXPORT_TRACEPOINT_SYMBOL(android_fs_dataread_end); + /* * I/O completion handler for multipage BIOs. 
* @@ -47,6 +55,16 @@ static void mpage_end_io(struct bio *bio) struct bio_vec *bv; int i; + if (trace_android_fs_dataread_end_enabled() && + (bio_data_dir(bio) == READ)) { + struct page *first_page = bio->bi_io_vec[0].bv_page; + + if (first_page != NULL) + trace_android_fs_dataread_end(first_page->mapping->host, + page_offset(first_page), + bio->bi_iter.bi_size); + } + bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; page_endio(page, bio_data_dir(bio), bio->bi_error); @@ -57,6 +75,18 @@ static void mpage_end_io(struct bio *bio) static struct bio *mpage_bio_submit(int rw, struct bio *bio) { + if (trace_android_fs_dataread_start_enabled() && (rw == READ)) { + struct page *first_page = bio->bi_io_vec[0].bv_page; + + if (first_page != NULL) { + trace_android_fs_dataread_start( + first_page->mapping->host, + page_offset(first_page), + bio->bi_iter.bi_size, + current->pid, + current->comm); + } + } bio->bi_end_io = mpage_end_io; guard_bio_eod(rw, bio); submit_bio(rw, bio); diff --git a/include/trace/events/android_fs.h b/include/trace/events/android_fs.h new file mode 100644 index 000000000000..531da433a7bc --- /dev/null +++ b/include/trace/events/android_fs.h @@ -0,0 +1,31 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM android_fs + +#if !defined(_TRACE_ANDROID_FS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ANDROID_FS_H + +#include +#include + +DEFINE_EVENT(android_fs_data_start_template, android_fs_dataread_start, + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *command), + TP_ARGS(inode, offset, bytes, pid, command)); + +DEFINE_EVENT(android_fs_data_end_template, android_fs_dataread_end, + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + TP_ARGS(inode, offset, bytes)); + +DEFINE_EVENT(android_fs_data_start_template, android_fs_datawrite_start, + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *command), + TP_ARGS(inode, offset, bytes, pid, command)); + +DEFINE_EVENT(android_fs_data_end_template, android_fs_datawrite_end, + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + TP_ARGS(inode, offset, bytes)); + +#endif /* _TRACE_ANDROID_FS_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/android_fs_template.h b/include/trace/events/android_fs_template.h new file mode 100644 index 000000000000..618988b047c1 --- /dev/null +++ b/include/trace/events/android_fs_template.h @@ -0,0 +1,79 @@ +#if !defined(_TRACE_ANDROID_FS_TEMPLATE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ANDROID_FS_TEMPLATE_H + +#include + +DECLARE_EVENT_CLASS(android_fs_data_start_template, + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *command), + TP_ARGS(inode, offset, bytes, pid, command), + TP_STRUCT__entry( + __array(char, path, MAX_FILTER_STR_VAL); + __field(char *, pathname); + __field(loff_t, offset); + __field(int, bytes); + __field(loff_t, i_size); + __string(cmdline, command); + __field(pid_t, pid); + __field(ino_t, ino); + ), + TP_fast_assign( + { + struct dentry *d; + + /* + * Grab a reference to the inode here because + * d_obtain_alias() will either drop the inode + * reference if it locates an existing dentry + * or transfer the reference to the new dentry + * created. In our case, the file is still open, + * so the dentry is guaranteed to exist (connected), + * so d_obtain_alias() drops the reference we + * grabbed here. 
+ */ + ihold(inode); + d = d_obtain_alias(inode); + if (!IS_ERR(d)) { + __entry->pathname = dentry_path(d, + __entry->path, + MAX_FILTER_STR_VAL); + dput(d); + } else + __entry->pathname = ERR_PTR(-EINVAL); + __entry->offset = offset; + __entry->bytes = bytes; + __entry->i_size = i_size_read(inode); + __assign_str(cmdline, command); + __entry->pid = pid; + __entry->ino = inode->i_ino; + } + ), + TP_printk("entry_name %s, offset %llu, bytes %d, cmdline %s," + " pid %d, i_size %llu, ino %lu", + (IS_ERR(__entry->pathname) ? "ERROR" : __entry->pathname), + __entry->offset, __entry->bytes, __get_str(cmdline), + __entry->pid, __entry->i_size, + (unsigned long) __entry->ino) +); + +DECLARE_EVENT_CLASS(android_fs_data_end_template, + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + TP_ARGS(inode, offset, bytes), + TP_STRUCT__entry( + __field(ino_t, ino); + __field(loff_t, offset); + __field(int, bytes); + ), + TP_fast_assign( + { + __entry->ino = inode->i_ino; + __entry->offset = offset; + __entry->bytes = bytes; + } + ), + TP_printk("ino %lu, offset %llu, bytes %d", + (unsigned long) __entry->ino, + __entry->offset, __entry->bytes) +); + +#endif /* _TRACE_ANDROID_FS_TEMPLATE_H */ From df232437710122fcb4e4a0484a1eded5aec29a6a Mon Sep 17 00:00:00 2001 From: Caesar Wang Date: Tue, 23 Aug 2016 11:47:02 +0100 Subject: [PATCH 436/607] sched/fair: remove printk while schedule is in progress It will cause deadlock and while(1) if call printk while schedule is in progress. The block state like as below: cpu0(hold the console sem): printk->console_unlock->up_sem->spin_lock(&sem->lock)->wake_up_process(cpu1) ->try_to_wake_up(cpu1)->while(p->on_cpu). cpu1(request console sem): console_lock->down_sem->schedule->idle_banlance->update_cpu_capacity-> printk->console_trylock->spin_lock(&sem->lock). p->on_cpu will be 1 forever, because the task is still running on cpu1, so cpu0 is blocked in while(p->on_cpu), but cpu1 could not get spin_lock(&sem->lock), it is blocked too, it means the task will running on cpu1 forever. Signed-off-by: Caesar Wang --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7e42cd11e430..6b881cf81d79 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7089,7 +7089,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) mcc->cpu = cpu; #ifdef CONFIG_SCHED_DEBUG raw_spin_unlock_irqrestore(&mcc->lock, flags); - pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity); + printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n", + cpu, capacity); goto skip_unlock; #endif } From 53c63d3b4f06e31269d66173d327164f273e4233 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 20 Sep 2016 18:42:22 -0700 Subject: [PATCH 437/607] sched: Add Kconfig option DEFAULT_USE_ENERGY_AWARE to set ENERGY_AWARE feature flag The ENERGY_AWARE sched feature flag cannot be set unless CONFIG_SCHED_DEBUG is enabled. So this patch allows the flag to default to true at build time if the config is set. Change-Id: I8835a571fdb7a8f8ee6a54af1e11a69f3b5ce8e6 Signed-off-by: John Stultz --- init/Kconfig | 10 ++++++++++ kernel/sched/features.h | 4 ++++ 2 files changed, 14 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index e71e35cf723c..acb6645ffda5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1290,6 +1290,16 @@ config SCHED_TUNE If unsure, say N. 
+config DEFAULT_USE_ENERGY_AWARE + bool "Default to enabling the Energy Aware Scheduler feature" + default n + help + This option defaults the ENERGY_AWARE scheduling feature to true, + as without SCHED_DEBUG set this feature can't be enabled or disabled + via sysctl. + + Say N if unsure. + config SYSFS_DEPRECATED bool "Enable deprecated sysfs features to support old userspace tools" depends on SYSFS diff --git a/kernel/sched/features.h b/kernel/sched/features.h index b634151ce286..55e461055332 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -73,4 +73,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true) * Energy aware scheduling. Use platform energy model to guide scheduling * decisions optimizing for energy efficiency. */ +#ifdef CONFIG_DEFAULT_USE_ENERGY_AWARE +SCHED_FEAT(ENERGY_AWARE, true) +#else SCHED_FEAT(ENERGY_AWARE, false) +#endif From 55b6a243332cde20882ba331a429d4ce5ec93ae7 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Tue, 26 Jan 2016 11:10:38 +0000 Subject: [PATCH 438/607] BACKPORT: arm64: kernel: implement ACPI parking protocol The SBBR and ACPI specifications allow ACPI based systems that do not implement PSCI (eg systems with no EL3) to boot through the ACPI parking protocol specification[1]. This patch implements the ACPI parking protocol CPU operations, and adds code that eases parsing the parking protocol data structures to the ARM64 SMP initializion carried out at the same time as cpus enumeration. To wake-up the CPUs from the parked state, this patch implements a wakeup IPI for ARM64 (ie arch_send_wakeup_ipi_mask()) that mirrors the ARM one, so that a specific IPI is sent for wake-up purpose in order to distinguish it from other IPI sources. Given the current ACPI MADT parsing API, the patch implements a glue layer that helps passing MADT GICC data structure from SMP initialization code to the parking protocol implementation somewhat overriding the CPU operations interfaces. This to avoid creating a completely trasparent DT/ACPI CPU operations layer that would require creating opaque structure handling for CPUs data (DT represents CPU through DT nodes, ACPI through static MADT table entries), which seems overkill given that ACPI on ARM64 mandates only two booting protocols (PSCI and parking protocol), so there is no need for further protocol additions. 
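For reference, the firmware/boot-loader half of this handshake (the half that the acpi_parking_protocol_cpu_boot()/cpu_postboot() code added further down in this patch has to assume) looks roughly like the sketch below. It is illustrative only and not part of the patch: wfi(), jump_to() and my_gic_cpu_id are placeholder names, and the normative description is the parking protocol document referenced as [1].

  #include <stdint.h>

  extern void wfi(void);            /* placeholder: wait-for-interrupt */
  extern void jump_to(uint64_t pc); /* placeholder: branch to the OS entry point */

  /* Mailbox layout as the kernel-side code in this patch expects it. */
  struct parking_mailbox {
          volatile uint32_t cpu_id;       /* all-ones while this CPU is still parked */
          uint32_t reserved;
          volatile uint64_t entry_point;  /* written (little-endian) by the OS */
  };

  static void parked_cpu_loop(struct parking_mailbox *mb, uint32_t my_gic_cpu_id)
  {
          for (;;) {
                  wfi();                          /* woken by the IPI_WAKEUP IPI */

                  if (mb->cpu_id != my_gic_cpu_id)
                          continue;               /* not released yet, keep parking */

                  uint64_t pc = mb->entry_point;
                  mb->entry_point = 0;            /* cpu_postboot() WARNs if this stays set */
                  jump_to(pc);                    /* enter the kernel at secondary_entry */
          }
  }
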
Based on the original work by Mark Salter [1] https://acpica.org/sites/acpica/files/MP%20Startup%20for%20ARM%20platforms.docx Signed-off-by: Lorenzo Pieralisi Tested-by: Loc Ho Cc: Will Deacon Cc: Hanjun Guo Cc: Sudeep Holla Cc: Mark Rutland Cc: Mark Salter Cc: Al Stone [catalin.marinas@arm.com: Added WARN_ONCE(!acpi_parking_protocol_valid() on the IPI] Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 5e89c55e4ed81d7abb1ce8828db35fa389dc0e90) Signed-off-by: Jeff Vander Stoep Change-Id: Ie9a872bff124ca6b3551c812df768cc378658bcc --- arch/arm64/Kconfig | 9 ++ arch/arm64/include/asm/acpi.h | 19 ++- arch/arm64/include/asm/hardirq.h | 2 +- arch/arm64/include/asm/smp.h | 9 ++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/acpi_parking_protocol.c | 153 ++++++++++++++++++++++ arch/arm64/kernel/cpu_ops.c | 27 +++- arch/arm64/kernel/smp.c | 28 ++++ 8 files changed, 242 insertions(+), 6 deletions(-) create mode 100644 arch/arm64/kernel/acpi_parking_protocol.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ac3dbd933064..1405a1bcc26d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -740,6 +740,15 @@ endmenu menu "Boot options" +config ARM64_ACPI_PARKING_PROTOCOL + bool "Enable support for the ARM64 ACPI parking protocol" + depends on ACPI + help + Enable support for the ARM64 ACPI parking protocol. If disabled + the kernel will not allow booting through the ARM64 ACPI parking + protocol even if the corresponding data is present in the ACPI + MADT table. + config CMDLINE string "Default kernel command string" default "" diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h index caafd63b8092..aee323b13802 100644 --- a/arch/arm64/include/asm/acpi.h +++ b/arch/arm64/include/asm/acpi.h @@ -87,9 +87,26 @@ void __init acpi_init_cpus(void); static inline void acpi_init_cpus(void) { } #endif /* CONFIG_ACPI */ +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL +bool acpi_parking_protocol_valid(int cpu); +void __init +acpi_set_mailbox_entry(int cpu, struct acpi_madt_generic_interrupt *processor); +#else +static inline bool acpi_parking_protocol_valid(int cpu) { return false; } +static inline void +acpi_set_mailbox_entry(int cpu, struct acpi_madt_generic_interrupt *processor) +{} +#endif + static inline const char *acpi_get_enable_method(int cpu) { - return acpi_psci_present() ? 
"psci" : NULL; + if (acpi_psci_present()) + return "psci"; + + if (acpi_parking_protocol_valid(cpu)) + return "parking-protocol"; + + return NULL; } #ifdef CONFIG_ACPI_APEI diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h index a57601f9d17c..8740297dac77 100644 --- a/arch/arm64/include/asm/hardirq.h +++ b/arch/arm64/include/asm/hardirq.h @@ -20,7 +20,7 @@ #include #include -#define NR_IPI 5 +#define NR_IPI 6 typedef struct { unsigned int __softirq_pending; diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index d9c3d6a6100a..2013a4dc5124 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -64,6 +64,15 @@ extern void secondary_entry(void); extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL +extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask); +#else +static inline void arch_send_wakeup_ipi_mask(const struct cpumask *mask) +{ + BUILD_BUG(); +} +#endif + extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 474691f8b13a..c4e2f70c0aa0 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -41,6 +41,7 @@ arm64-obj-$(CONFIG_EFI) += efi.o efi-entry.stub.o arm64-obj-$(CONFIG_PCI) += pci.o arm64-obj-$(CONFIG_ARMV8_DEPRECATED) += armv8_deprecated.o arm64-obj-$(CONFIG_ACPI) += acpi.o +arm64-obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o obj-y += $(arm64-obj-y) vdso/ obj-m += $(arm64-obj-m) diff --git a/arch/arm64/kernel/acpi_parking_protocol.c b/arch/arm64/kernel/acpi_parking_protocol.c new file mode 100644 index 000000000000..4b1e5a7a98da --- /dev/null +++ b/arch/arm64/kernel/acpi_parking_protocol.c @@ -0,0 +1,153 @@ +/* + * ARM64 ACPI Parking Protocol implementation + * + * Authors: Lorenzo Pieralisi + * Mark Salter + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +#include +#include + +#include + +struct cpu_mailbox_entry { + phys_addr_t mailbox_addr; + u8 version; + u8 gic_cpu_id; +}; + +static struct cpu_mailbox_entry cpu_mailbox_entries[NR_CPUS]; + +void __init acpi_set_mailbox_entry(int cpu, + struct acpi_madt_generic_interrupt *p) +{ + struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu]; + + cpu_entry->mailbox_addr = p->parked_address; + cpu_entry->version = p->parking_version; + cpu_entry->gic_cpu_id = p->cpu_interface_number; +} + +bool acpi_parking_protocol_valid(int cpu) +{ + struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu]; + + return cpu_entry->mailbox_addr && cpu_entry->version; +} + +static int acpi_parking_protocol_cpu_init(unsigned int cpu) +{ + pr_debug("%s: ACPI parked addr=%llx\n", __func__, + cpu_mailbox_entries[cpu].mailbox_addr); + + return 0; +} + +static int acpi_parking_protocol_cpu_prepare(unsigned int cpu) +{ + return 0; +} + +struct parking_protocol_mailbox { + __le32 cpu_id; + __le32 reserved; + __le64 entry_point; +}; + +static int acpi_parking_protocol_cpu_boot(unsigned int cpu) +{ + struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu]; + struct parking_protocol_mailbox __iomem *mailbox; + __le32 cpu_id; + + /* + * Map mailbox memory with attribute device nGnRE (ie ioremap - + * this deviates from the parking protocol specifications since + * the mailboxes are required to be mapped nGnRnE; the attribute + * discrepancy is harmless insofar as the protocol specification + * is concerned). + * If the mailbox is mistakenly allocated in the linear mapping + * by FW ioremap will fail since the mapping will be prevented + * by the kernel (it clashes with the linear mapping attributes + * specifications). + */ + mailbox = ioremap(cpu_entry->mailbox_addr, sizeof(*mailbox)); + if (!mailbox) + return -EIO; + + cpu_id = readl_relaxed(&mailbox->cpu_id); + /* + * Check if firmware has set-up the mailbox entry properly + * before kickstarting the respective cpu. + */ + if (cpu_id != ~0U) { + iounmap(mailbox); + return -ENXIO; + } + + /* + * We write the entry point and cpu id as LE regardless of the + * native endianness of the kernel. Therefore, any boot-loaders + * that read this address need to convert this address to the + * Boot-Loader's endianness before jumping. + */ + writeq_relaxed(__pa(secondary_entry), &mailbox->entry_point); + writel_relaxed(cpu_entry->gic_cpu_id, &mailbox->cpu_id); + + arch_send_wakeup_ipi_mask(cpumask_of(cpu)); + + iounmap(mailbox); + + return 0; +} + +static void acpi_parking_protocol_cpu_postboot(void) +{ + int cpu = smp_processor_id(); + struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu]; + struct parking_protocol_mailbox __iomem *mailbox; + __le64 entry_point; + + /* + * Map mailbox memory with attribute device nGnRE (ie ioremap - + * this deviates from the parking protocol specifications since + * the mailboxes are required to be mapped nGnRnE; the attribute + * discrepancy is harmless insofar as the protocol specification + * is concerned). + * If the mailbox is mistakenly allocated in the linear mapping + * by FW ioremap will fail since the mapping will be prevented + * by the kernel (it clashes with the linear mapping attributes + * specifications). + */ + mailbox = ioremap(cpu_entry->mailbox_addr, sizeof(*mailbox)); + if (!mailbox) + return; + + entry_point = readl_relaxed(&mailbox->entry_point); + /* + * Check if firmware has cleared the entry_point as expected + * by the protocol specification. 
+ */ + WARN_ON(entry_point); + + iounmap(mailbox); +} + +const struct cpu_operations acpi_parking_protocol_ops = { + .name = "parking-protocol", + .cpu_init = acpi_parking_protocol_cpu_init, + .cpu_prepare = acpi_parking_protocol_cpu_prepare, + .cpu_boot = acpi_parking_protocol_cpu_boot, + .cpu_postboot = acpi_parking_protocol_cpu_postboot +}; diff --git a/arch/arm64/kernel/cpu_ops.c b/arch/arm64/kernel/cpu_ops.c index b6bd7d447768..c7cfb8fe06f9 100644 --- a/arch/arm64/kernel/cpu_ops.c +++ b/arch/arm64/kernel/cpu_ops.c @@ -25,19 +25,30 @@ #include extern const struct cpu_operations smp_spin_table_ops; +extern const struct cpu_operations acpi_parking_protocol_ops; extern const struct cpu_operations cpu_psci_ops; const struct cpu_operations *cpu_ops[NR_CPUS]; -static const struct cpu_operations *supported_cpu_ops[] __initconst = { +static const struct cpu_operations *dt_supported_cpu_ops[] __initconst = { &smp_spin_table_ops, &cpu_psci_ops, NULL, }; +static const struct cpu_operations *acpi_supported_cpu_ops[] __initconst = { +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL + &acpi_parking_protocol_ops, +#endif + &cpu_psci_ops, + NULL, +}; + static const struct cpu_operations * __init cpu_get_ops(const char *name) { - const struct cpu_operations **ops = supported_cpu_ops; + const struct cpu_operations **ops; + + ops = acpi_disabled ? dt_supported_cpu_ops : acpi_supported_cpu_ops; while (*ops) { if (!strcmp(name, (*ops)->name)) @@ -75,8 +86,16 @@ static const char *__init cpu_read_enable_method(int cpu) } } else { enable_method = acpi_get_enable_method(cpu); - if (!enable_method) - pr_err("Unsupported ACPI enable-method\n"); + if (!enable_method) { + /* + * In ACPI systems the boot CPU does not require + * checking the enable method since for some + * boot protocol (ie parking protocol) it need not + * be initialized. Don't warn spuriously. + */ + if (cpu != 0) + pr_err("Unsupported ACPI enable-method\n"); + } } return enable_method; diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 68e7f79630d4..24cb4f800033 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -70,6 +70,7 @@ enum ipi_msg_type { IPI_CPU_STOP, IPI_TIMER, IPI_IRQ_WORK, + IPI_WAKEUP }; /* @@ -443,6 +444,17 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor) /* map the logical cpu id to cpu MPIDR */ cpu_logical_map(cpu_count) = hwid; + /* + * Set-up the ACPI parking protocol cpu entries + * while initializing the cpu_logical_map to + * avoid parsing MADT entries multiple times for + * nothing (ie a valid cpu_logical_map entry should + * contain a valid parking protocol data set to + * initialize the cpu if the parking protocol is + * the only available enable method). 
+ */ + acpi_set_mailbox_entry(cpu_count, processor); + cpu_count++; } @@ -625,6 +637,7 @@ static const char *ipi_types[NR_IPI] __tracepoint_string = { S(IPI_CPU_STOP, "CPU stop interrupts"), S(IPI_TIMER, "Timer broadcast interrupts"), S(IPI_IRQ_WORK, "IRQ work interrupts"), + S(IPI_WAKEUP, "CPU wake-up interrupts"), }; static void smp_cross_call(const struct cpumask *target, unsigned int ipinr) @@ -668,6 +681,13 @@ void arch_send_call_function_single_ipi(int cpu) smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC); } +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL +void arch_send_wakeup_ipi_mask(const struct cpumask *mask) +{ + smp_cross_call(mask, IPI_WAKEUP); +} +#endif + #ifdef CONFIG_IRQ_WORK void arch_irq_work_raise(void) { @@ -745,6 +765,14 @@ void handle_IPI(int ipinr, struct pt_regs *regs) break; #endif +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL + case IPI_WAKEUP: + WARN_ONCE(!acpi_parking_protocol_valid(cpu), + "CPU%u: Wake-up IPI outside the ACPI parking protocol\n", + cpu); + break; +#endif + default: pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr); break; From 0b0254ded773d1f241b1ae7c7fdb137a72f60911 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 27 Jan 2016 10:50:19 +0100 Subject: [PATCH 439/607] UPSTREAM: arm64: allow vmalloc regions to be set with set_memory_* The range of set_memory_* is currently restricted to the module address range because of difficulties in breaking down larger block sizes. vmalloc maps PAGE_SIZE pages so it is safe to use as well. Update the function ranges and add a comment explaining why the range is restricted the way it is. Suggested-by: Laura Abbott Acked-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 95f5c80050ad723163aa80dc8bffd48ef4afc6d5) Signed-off-by: Jeff Vander Stoep Change-Id: I80117bfd93ffe90a8edc89cbf6f456d9423bcf73 --- arch/arm64/mm/pageattr.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 3571c7309c5e..29ee7d85ca4c 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -44,6 +45,7 @@ static int change_memory_common(unsigned long addr, int numpages, unsigned long end = start + size; int ret; struct page_change_data data; + struct vm_struct *area; if (!PAGE_ALIGNED(addr)) { start &= PAGE_MASK; @@ -51,10 +53,23 @@ static int change_memory_common(unsigned long addr, int numpages, WARN_ON_ONCE(1); } - if (start < MODULES_VADDR || start >= MODULES_END) - return -EINVAL; - - if (end < MODULES_VADDR || end >= MODULES_END) + /* + * Kernel VA mappings are always live, and splitting live section + * mappings into page mappings may cause TLB conflicts. This means + * we have to ensure that changing the permission bits of the range + * we are operating on does not result in such splitting. + * + * Let's restrict ourselves to mappings created by vmalloc (or vmap). + * Those are guaranteed to consist entirely of page mappings, and + * splitting is never needed. + * + * So check whether the [addr, addr + size) interval is entirely + * covered by precisely one VM area that has the VM_ALLOC flag set. 
+ */ + area = find_vm_area((void *)addr); + if (!area || + end > (unsigned long)area->addr + area->size || + !(area->flags & VM_ALLOC)) return -EINVAL; data.set_mask = set_mask; From 58ffa07081e1aabd092257eb6f2fb718dcd3d486 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Feb 2016 12:46:23 +0000 Subject: [PATCH 440/607] UPSTREAM: arm64: prefetch: don't provide spin_lock_prefetch with LSE The LSE atomics rely on us not dirtying data at L1 if we can avoid it, otherwise many of the potential scalability benefits are lost. This patch replaces spin_lock_prefetch with a nop when the LSE atomics are in use, so that users don't shoot themselves in the foot by causing needless coherence traffic at L1. Signed-off-by: Will Deacon Tested-by: Andrew Pinski Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit cd5e10bdf3795d22f10787bb1991c43798c885d5) Signed-off-by: Jeff Vander Stoep Change-Id: Ib8bc3f38d9306c13e017139ae4f2a7c8d6b61e18 --- arch/arm64/include/asm/processor.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 4acb7ca94fcd..31b76fce4477 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -29,6 +29,7 @@ #include +#include #include #include #include @@ -177,9 +178,11 @@ static inline void prefetchw(const void *ptr) } #define ARCH_HAS_SPINLOCK_PREFETCH -static inline void spin_lock_prefetch(const void *x) +static inline void spin_lock_prefetch(const void *ptr) { - prefetchw(x); + asm volatile(ARM64_LSE_ATOMIC_INSN( + "prfm pstl1strm, %a0", + "nop") : : "p" (ptr)); } #define HAVE_ARCH_PICK_MMAP_LAYOUT From b5924f376a221ffa0ee64a4410aff54416c2fd2d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Feb 2016 12:46:24 +0000 Subject: [PATCH 441/607] UPSTREAM: arm64: prefetch: add alternative pattern for CPUs without a prefetcher Most CPUs have a hardware prefetcher which generally performs better without explicit prefetch instructions issued by software, however some CPUs (e.g. Cavium ThunderX) rely solely on explicit prefetch instructions. This patch adds an alternative pattern (ARM64_HAS_NO_HW_PREFETCH) to allow our library code to make use of explicit prefetch instructions during things like copy routines only when the CPU does not have the capability to perform the prefetching itself. 
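A usage sketch, not part of this patch: the series itself consumes the capability from assembly in the copy_page patch a little further on, but C code could get the same effect through the alternatives framework, mirroring the spin_lock_prefetch change in the previous patch. The helper name below is hypothetical.

  #include <asm/alternative.h>
  #include <asm/cpufeature.h>

  /*
   * "nop" is what runs on CPUs with a hardware prefetcher (capability not
   * set); cores flagged with ARM64_HAS_NO_HW_PREFETCH get the explicit
   * prfm patched in at boot by the alternatives framework.
   */
  static inline void maybe_prefetch(const void *ptr)
  {
          asm volatile(ALTERNATIVE("nop",
                                   "prfm pldl1strm, %a0",
                                   ARM64_HAS_NO_HW_PREFETCH)
                       : : "p" (ptr));
  }
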
Signed-off-by: Will Deacon Tested-by: Andrew Pinski Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit d5370f754875460662abe8561388e019d90dd0c4) Signed-off-by: Jeff Vander Stoep Change-Id: Ie33097c9b7786922ff8c457d16515c3188b8e94b --- arch/arm64/include/asm/cpufeature.h | 3 ++- arch/arm64/include/asm/cputype.h | 17 ++++++++++++++++- arch/arm64/kernel/cpu_errata.c | 18 +++--------------- arch/arm64/kernel/cpufeature.c | 17 +++++++++++++++++ 4 files changed, 38 insertions(+), 17 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 8f271b83f910..8d56bd8550dc 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -30,8 +30,9 @@ #define ARM64_HAS_LSE_ATOMICS 5 #define ARM64_WORKAROUND_CAVIUM_23154 6 #define ARM64_WORKAROUND_834220 7 +#define ARM64_HAS_NO_HW_PREFETCH 8 -#define ARM64_NCAPS 8 +#define ARM64_NCAPS 9 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 1a5949364ed0..7540284a17fe 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -57,11 +57,22 @@ #define MIDR_IMPLEMENTOR(midr) \ (((midr) & MIDR_IMPLEMENTOR_MASK) >> MIDR_IMPLEMENTOR_SHIFT) -#define MIDR_CPU_PART(imp, partnum) \ +#define MIDR_CPU_MODEL(imp, partnum) \ (((imp) << MIDR_IMPLEMENTOR_SHIFT) | \ (0xf << MIDR_ARCHITECTURE_SHIFT) | \ ((partnum) << MIDR_PARTNUM_SHIFT)) +#define MIDR_CPU_MODEL_MASK (MIDR_IMPLEMENTOR_MASK | MIDR_PARTNUM_MASK | \ + MIDR_ARCHITECTURE_MASK) + +#define MIDR_IS_CPU_MODEL_RANGE(midr, model, rv_min, rv_max) \ +({ \ + u32 _model = (midr) & MIDR_CPU_MODEL_MASK; \ + u32 rv = (midr) & (MIDR_REVISION_MASK | MIDR_VARIANT_MASK); \ + \ + _model == (model) && rv >= (rv_min) && rv <= (rv_max); \ + }) + #define ARM_CPU_IMP_ARM 0x41 #define ARM_CPU_IMP_APM 0x50 #define ARM_CPU_IMP_CAVIUM 0x43 @@ -75,6 +86,10 @@ #define CAVIUM_CPU_PART_THUNDERX 0x0A1 +#define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) +#define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) +#define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) + #ifndef __ASSEMBLY__ /* diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index feb6b4efa641..e6bc988e8dbf 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -21,24 +21,12 @@ #include #include -#define MIDR_CORTEX_A53 MIDR_CPU_PART(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) -#define MIDR_CORTEX_A57 MIDR_CPU_PART(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) -#define MIDR_THUNDERX MIDR_CPU_PART(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) - -#define CPU_MODEL_MASK (MIDR_IMPLEMENTOR_MASK | MIDR_PARTNUM_MASK | \ - MIDR_ARCHITECTURE_MASK) - static bool __maybe_unused is_affected_midr_range(const struct arm64_cpu_capabilities *entry) { - u32 midr = read_cpuid_id(); - - if ((midr & CPU_MODEL_MASK) != entry->midr_model) - return false; - - midr &= MIDR_REVISION_MASK | MIDR_VARIANT_MASK; - - return (midr >= entry->midr_range_min && midr <= entry->midr_range_max); + return MIDR_IS_CPU_MODEL_RANGE(read_cpuid_id(), entry->midr_model, + entry->midr_range_min, + entry->midr_range_max); } #define MIDR_RANGE(model, min, max) \ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 5c90aa490a2b..3615d7d7c9af 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -621,6 +621,18 @@ static bool 
has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry) return has_sre; } +static bool has_no_hw_prefetch(const struct arm64_cpu_capabilities *entry) +{ + u32 midr = read_cpuid_id(); + u32 rv_min, rv_max; + + /* Cavium ThunderX pass 1.x and 2.x */ + rv_min = 0; + rv_max = (1 << MIDR_VARIANT_SHIFT) | MIDR_REVISION_MASK; + + return MIDR_IS_CPU_MODEL_RANGE(midr, MIDR_THUNDERX, rv_min, rv_max); +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", @@ -651,6 +663,11 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .min_field_value = 2, }, #endif /* CONFIG_AS_LSE && CONFIG_ARM64_LSE_ATOMICS */ + { + .desc = "Software prefetching using PRFM", + .capability = ARM64_HAS_NO_HW_PREFETCH, + .matches = has_no_hw_prefetch, + }, {}, }; From 297bf83fa4e7804c28c6d950f39dbd3860552cb9 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Feb 2016 12:46:25 +0000 Subject: [PATCH 442/607] UPSTREAM: arm64: lib: improve copy_page to deal with 128 bytes at a time We want to avoid lots of different copy_page implementations, settling for something that is "good enough" everywhere and hopefully easy to understand and maintain whilst we're at it. This patch reworks our copy_page implementation based on discussions with Cavium on the list and benchmarking on Cortex-A processors so that: - The loop is unrolled to copy 128 bytes per iteration - The reads are offset so that we read from the next 128-byte block in the same iteration that we store the previous block - Explicit prefetch instructions are removed for now, since they hurt performance on CPUs with hardware prefetching - The loop exit condition is calculated at the start of the loop Signed-off-by: Will Deacon Tested-by: Andrew Pinski Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 223e23e8aa26b0bb62c597637e77295e14f6a62c) Signed-off-by: Jeff Vander Stoep Change-Id: Icabd86bbecc60ad0d730ab796e33b8762cecb1fb --- arch/arm64/lib/copy_page.S | 46 +++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S index 512b9a7b980e..2534533ceb1d 100644 --- a/arch/arm64/lib/copy_page.S +++ b/arch/arm64/lib/copy_page.S @@ -27,20 +27,50 @@ * x1 - src */ ENTRY(copy_page) - /* Assume cache line size is 64 bytes. 
*/ - prfm pldl1strm, [x1, #64] -1: ldp x2, x3, [x1] + ldp x2, x3, [x1] ldp x4, x5, [x1, #16] ldp x6, x7, [x1, #32] ldp x8, x9, [x1, #48] - add x1, x1, #64 - prfm pldl1strm, [x1, #64] + ldp x10, x11, [x1, #64] + ldp x12, x13, [x1, #80] + ldp x14, x15, [x1, #96] + ldp x16, x17, [x1, #112] + + mov x18, #(PAGE_SIZE - 128) + add x1, x1, #128 +1: + subs x18, x18, #128 + + stnp x2, x3, [x0] + ldp x2, x3, [x1] + stnp x4, x5, [x0, #16] + ldp x4, x5, [x1, #16] + stnp x6, x7, [x0, #32] + ldp x6, x7, [x1, #32] + stnp x8, x9, [x0, #48] + ldp x8, x9, [x1, #48] + stnp x10, x11, [x0, #64] + ldp x10, x11, [x1, #64] + stnp x12, x13, [x0, #80] + ldp x12, x13, [x1, #80] + stnp x14, x15, [x0, #96] + ldp x14, x15, [x1, #96] + stnp x16, x17, [x0, #112] + ldp x16, x17, [x1, #112] + + add x0, x0, #128 + add x1, x1, #128 + + b.gt 1b + stnp x2, x3, [x0] stnp x4, x5, [x0, #16] stnp x6, x7, [x0, #32] stnp x8, x9, [x0, #48] - add x0, x0, #64 - tst x1, #(PAGE_SIZE - 1) - b.ne 1b + stnp x10, x11, [x0, #64] + stnp x12, x13, [x0, #80] + stnp x14, x15, [x0, #96] + stnp x16, x17, [x0, #112] + ret ENDPROC(copy_page) From 6e1f8e3c0fb86a90fe98667cfa1be71957534c66 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Tue, 2 Feb 2016 12:46:26 +0000 Subject: [PATCH 443/607] UPSTREAM: arm64: lib: patch in prfm for copy_page if requested On ThunderX T88 pass 1 and pass 2, there is no hardware prefetching so we need to patch in explicit software prefetching instructions Prefetching improves this code by 60% over the original code and 2x over the code without prefetching for the affected hardware using the benchmark code at https://github.com/apinski-cavium/copy_page_benchmark Signed-off-by: Andrew Pinski Signed-off-by: Will Deacon Tested-by: Andrew Pinski Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 60e0a09db24adc8809696307e5d97cc4ba7cb3e0) Signed-off-by: Jeff Vander Stoep Change-Id: I3821a4d3a7b6fd68b4b0aca31478ec960e4e5172 --- arch/arm64/lib/copy_page.S | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S index 2534533ceb1d..4c1e700840b6 100644 --- a/arch/arm64/lib/copy_page.S +++ b/arch/arm64/lib/copy_page.S @@ -18,6 +18,8 @@ #include #include #include +#include +#include /* * Copy a page from src to dest (both are page aligned) @@ -27,6 +29,15 @@ * x1 - src */ ENTRY(copy_page) +alternative_if_not ARM64_HAS_NO_HW_PREFETCH + nop + nop +alternative_else + # Prefetch two cache lines ahead. + prfm pldl1strm, [x1, #128] + prfm pldl1strm, [x1, #256] +alternative_endif + ldp x2, x3, [x1] ldp x4, x5, [x1, #16] ldp x6, x7, [x1, #32] @@ -41,6 +52,12 @@ ENTRY(copy_page) 1: subs x18, x18, #128 +alternative_if_not ARM64_HAS_NO_HW_PREFETCH + nop +alternative_else + prfm pldl1strm, [x1, #384] +alternative_endif + stnp x2, x3, [x0] ldp x2, x3, [x1] stnp x4, x5, [x0, #16] From f54fba19ed7891dc214085f55d73990f19e9c756 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 10 Feb 2016 10:07:30 +0000 Subject: [PATCH 444/607] UPSTREAM: arm64: prefetch: add missing #include for spin_lock_prefetch As of 52e662326e1e ("arm64: prefetch: don't provide spin_lock_prefetch with LSE"), spin_lock_prefetch is patched at runtime when the LSE atomics are in use. This relies on the ARM64_LSE_ATOMIC_INSN macro to drive the alternatives framework, but that macro is only available via asm/lse.h, which isn't explicitly included in processor.h. 
Consequently, drivers can run into build failures such as: In file included from include/linux/prefetch.h:14:0, from drivers/net/ethernet/intel/i40e/i40e_txrx.c:27: arch/arm64/include/asm/processor.h: In function 'spin_lock_prefetch': arch/arm64/include/asm/processor.h:183:15: error: expected string literal before 'ARM64_LSE_ATOMIC_INSN' asm volatile(ARM64_LSE_ATOMIC_INSN( This patch add the missing include and gets things building again. Reported-by: kbuild test robot Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit afb83cc3f0e4f86ea0e1cc3db7a90f58f1abd4d5) Signed-off-by: Jeff Vander Stoep Change-Id: I9a9e6b920f96b060781a01bff50a0ca8db78cf7f --- arch/arm64/include/asm/processor.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 31b76fce4477..5bb1d763d17a 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include From a8a3d94dbb8ef81a8710b4fec7dfd1a07f2647f9 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Fri, 5 Feb 2016 16:24:46 -0800 Subject: [PATCH 445/607] UPSTREAM: arm64: Drop alloc function from create_mapping create_mapping is only used in fixmap_remap_fdt. All the create_mapping calls need to happen on existing translation table pages without additional allocations. Rather than have an alloc function be called and fail, just set it to NULL and catch its use. Also change the name to create_mapping_noalloc to better capture what exactly is going on. Reviewed-by: Ard Biesheuvel Reviewed-by: Mark Rutland Signed-off-by: Laura Abbott Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 132233a759580f5ce9b1bfaac9073e47d03c460d) Signed-off-by: Jeff Vander Stoep Change-Id: I48db9adae540e79d47328b4045f26f22ecebc482 --- arch/arm64/mm/mmu.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 6f5770a1b140..b4788d3e1551 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -116,7 +116,9 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr, pte_t *pte; if (pmd_none(*pmd) || pmd_sect(*pmd)) { - phys_addr_t pte_phys = pgtable_alloc(); + phys_addr_t pte_phys; + BUG_ON(!pgtable_alloc); + pte_phys = pgtable_alloc(); pte = pte_set_fixmap(pte_phys); if (pmd_sect(*pmd)) split_pmd(pmd, pte); @@ -158,7 +160,9 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, * Check for initial section mappings in the pgd/pud and remove them. 
*/ if (pud_none(*pud) || pud_sect(*pud)) { - phys_addr_t pmd_phys = pgtable_alloc(); + phys_addr_t pmd_phys; + BUG_ON(!pgtable_alloc); + pmd_phys = pgtable_alloc(); pmd = pmd_set_fixmap(pmd_phys); if (pud_sect(*pud)) { /* @@ -223,7 +227,9 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long next; if (pgd_none(*pgd)) { - phys_addr_t pud_phys = pgtable_alloc(); + phys_addr_t pud_phys; + BUG_ON(!pgtable_alloc); + pud_phys = pgtable_alloc(); __pgd_populate(pgd, pud_phys, PUD_TYPE_TABLE); } BUG_ON(pgd_bad(*pgd)); @@ -312,7 +318,12 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, init_pgd(pgd_offset_raw(pgdir, virt), phys, virt, size, prot, alloc); } -static void __init create_mapping(phys_addr_t phys, unsigned long virt, +/* + * This function can only be used to modify existing table entries, + * without allocating new levels of table. Note that this permits the + * creation of new section or page entries. + */ +static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot) { if (virt < VMALLOC_START) { @@ -321,7 +332,7 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt, return; } __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, - early_pgtable_alloc); + NULL); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, @@ -678,7 +689,7 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) /* * Make sure that the FDT region can be mapped without the need to * allocate additional translation table pages, so that it is safe - * to call create_mapping() this early. + * to call create_mapping_noalloc() this early. * * On 64k pages, the FDT will be mapped using PTEs, so we need to * be in the same PMD as the rest of the fixmap. @@ -694,8 +705,8 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) dt_virt = (void *)dt_virt_base + offset; /* map the first chunk so we can read the size from the header */ - create_mapping(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base, - SWAPPER_BLOCK_SIZE, prot); + create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), + dt_virt_base, SWAPPER_BLOCK_SIZE, prot); if (fdt_check_header(dt_virt) != 0) return NULL; @@ -705,7 +716,7 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) return NULL; if (offset + size > SWAPPER_BLOCK_SIZE) - create_mapping(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base, + create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base, round_up(offset + size, SWAPPER_BLOCK_SIZE), prot); memblock_reserve(dt_phys, size); From 644210f3f890d1e5481378d28d150d1c5f4191fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Penttil=C3=A4?= Date: Tue, 26 Jan 2016 15:47:25 +0000 Subject: [PATCH 446/607] UPSTREAM: arm64: mm: avoid calling apply_to_page_range on empty range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calling apply_to_page_range with an empty range results in a BUG_ON from the core code. This can be triggered by trying to load the st_drv module with CONFIG_DEBUG_SET_MODULE_RONX enabled: kernel BUG at mm/memory.c:1874! 
Internal error: Oops - BUG: 0 [#1] PREEMPT SMP Modules linked in: CPU: 3 PID: 1764 Comm: insmod Not tainted 4.5.0-rc1+ #2 Hardware name: ARM Juno development board (r0) (DT) task: ffffffc9763b8000 ti: ffffffc975af8000 task.ti: ffffffc975af8000 PC is at apply_to_page_range+0x2cc/0x2d0 LR is at change_memory_common+0x80/0x108 This patch fixes the issue by making change_memory_common (called by the set_memory_* functions) a NOP when numpages == 0, therefore avoiding the erroneous call to apply_to_page_range and bringing us into line with x86 and s390. Cc: Reviewed-by: Laura Abbott Acked-by: David Rientjes Signed-off-by: Mika Penttilä Signed-off-by: Will Deacon Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 57adec866c0440976c96a4b8f5b59fb411b1cacb) Signed-off-by: Jeff Vander Stoep Change-Id: Ia107d3b324cc8237f669778a7c9c3abae8637501 --- arch/arm64/mm/pageattr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 29ee7d85ca4c..0795c3a36d8f 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -72,6 +72,9 @@ static int change_memory_common(unsigned long addr, int numpages, !(area->flags & VM_ALLOC)) return -EINVAL; + if (!numpages) + return 0; + data.set_mask = set_mask; data.clear_mask = clear_mask; From 664e159255fe7afc89145b4eed47b370d01489fa Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Fri, 5 Feb 2016 16:24:47 -0800 Subject: [PATCH 447/607] UPSTREAM: arm64: Add support for ARCH_SUPPORTS_DEBUG_PAGEALLOC ARCH_SUPPORTS_DEBUG_PAGEALLOC provides a hook to map and unmap pages for debugging purposes. This requires memory be mapped with PAGE_SIZE mappings since breaking down larger mappings at runtime will lead to TLB conflicts. Check if debug_pagealloc is enabled at runtime and if so, map everyting with PAGE_SIZE pages. Implement the functions to actually map/unmap the pages at runtime. Reviewed-by: Ard Biesheuvel Reviewed-by: Mark Rutland Tested-by: Mark Rutland Signed-off-by: Laura Abbott [catalin.marinas@arm.com: static annotation block_mappings_allowed() and #ifdef] Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 83863f25e4b8214e994ef8b5647aad614d74b45d) Signed-off-by: Jeff Vander Stoep Change-Id: Ia9b9bc5fcc1938bafbc2930f08d27973cfc3d038 --- arch/arm64/Kconfig | 3 +++ arch/arm64/mm/mmu.c | 26 +++++++++++++++++++++-- arch/arm64/mm/pageattr.c | 46 +++++++++++++++++++++++++++++++--------- 3 files changed, 63 insertions(+), 12 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1405a1bcc26d..49988da6f342 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -541,6 +541,9 @@ config HOTPLUG_CPU source kernel/Kconfig.preempt source kernel/Kconfig.hz +config ARCH_SUPPORTS_DEBUG_PAGEALLOC + def_bool y + config ARCH_HAS_HOLES_MEMORYMODEL def_bool y if SPARSEMEM diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index b4788d3e1551..8d0e49d2cba9 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -149,6 +149,26 @@ static void split_pud(pud_t *old_pud, pmd_t *pmd) } while (pmd++, i++, i < PTRS_PER_PMD); } +#ifdef CONFIG_DEBUG_PAGEALLOC +static bool block_mappings_allowed(phys_addr_t (*pgtable_alloc)(void)) +{ + + /* + * If debug_page_alloc is enabled we must map the linear map + * using pages. However, other mappings created by + * create_mapping_noalloc must use sections in some cases. Allow + * sections to be used in those cases, where no pgtable_alloc + * function is provided. 
+ */ + return !pgtable_alloc || !debug_pagealloc_enabled(); +} +#else +static bool block_mappings_allowed(phys_addr_t (*pgtable_alloc)(void)) +{ + return true; +} +#endif + static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void)) @@ -181,7 +201,8 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, do { next = pmd_addr_end(addr, end); /* try section mapping first */ - if (((addr | next | phys) & ~SECTION_MASK) == 0) { + if (((addr | next | phys) & ~SECTION_MASK) == 0 && + block_mappings_allowed(pgtable_alloc)) { pmd_t old_pmd =*pmd; set_pmd(pmd, __pmd(phys | pgprot_val(mk_sect_prot(prot)))); @@ -241,7 +262,8 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, /* * For 4K granule only, attempt to put down a 1GB block */ - if (use_1G_block(addr, next, phys)) { + if (use_1G_block(addr, next, phys) && + block_mappings_allowed(pgtable_alloc)) { pud_t old_pud = *pud; set_pud(pud, __pud(phys | pgprot_val(mk_sect_prot(prot)))); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 0795c3a36d8f..ca6d268e3313 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -37,14 +37,31 @@ static int change_page_range(pte_t *ptep, pgtable_t token, unsigned long addr, return 0; } +/* + * This function assumes that the range is mapped with PAGE_SIZE pages. + */ +static int __change_memory_common(unsigned long start, unsigned long size, + pgprot_t set_mask, pgprot_t clear_mask) +{ + struct page_change_data data; + int ret; + + data.set_mask = set_mask; + data.clear_mask = clear_mask; + + ret = apply_to_page_range(&init_mm, start, size, change_page_range, + &data); + + flush_tlb_kernel_range(start, start + size); + return ret; +} + static int change_memory_common(unsigned long addr, int numpages, pgprot_t set_mask, pgprot_t clear_mask) { unsigned long start = addr; unsigned long size = PAGE_SIZE*numpages; unsigned long end = start + size; - int ret; - struct page_change_data data; struct vm_struct *area; if (!PAGE_ALIGNED(addr)) { @@ -75,14 +92,7 @@ static int change_memory_common(unsigned long addr, int numpages, if (!numpages) return 0; - data.set_mask = set_mask; - data.clear_mask = clear_mask; - - ret = apply_to_page_range(&init_mm, start, size, change_page_range, - &data); - - flush_tlb_kernel_range(start, end); - return ret; + return __change_memory_common(start, size, set_mask, clear_mask); } int set_memory_ro(unsigned long addr, int numpages) @@ -114,3 +124,19 @@ int set_memory_x(unsigned long addr, int numpages) __pgprot(PTE_PXN)); } EXPORT_SYMBOL_GPL(set_memory_x); + +#ifdef CONFIG_DEBUG_PAGEALLOC +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + unsigned long addr = (unsigned long) page_address(page); + + if (enable) + __change_memory_common(addr, PAGE_SIZE * numpages, + __pgprot(PTE_VALID), + __pgprot(0)); + else + __change_memory_common(addr, PAGE_SIZE * numpages, + __pgprot(0), + __pgprot(PTE_VALID)); +} +#endif From 4d4c2259aa3270aceb9c55562afdf586c8328eee Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Fri, 5 Feb 2016 16:24:48 -0800 Subject: [PATCH 448/607] UPSTREAM: arm64: ptdump: Indicate whether memory should be faulting With CONFIG_DEBUG_PAGEALLOC, pages do not have the valid bit set when free in the buddy allocator. Add an indiciation to the page table dumping code that the valid bit is not set, 'F' for fault, to make this easier to understand. 
Reviewed-by: Ard Biesheuvel Reviewed-by: Mark Rutland Tested-by: Mark Rutland Signed-off-by: Laura Abbott Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit d7e9d59494a9a5d83274f5af2148b82ca22dff3f) Signed-off-by: Jeff Vander Stoep Change-Id: I7e8200549e22cb3a01b5e827bf12b872b9cefc58 --- arch/arm64/mm/dump.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 0adbebbc2803..0841b2bf0e6a 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -90,6 +90,11 @@ struct prot_bits { static const struct prot_bits pte_bits[] = { { + .mask = PTE_VALID, + .val = PTE_VALID, + .set = " ", + .clear = "F", + }, { .mask = PTE_USER, .val = PTE_USER, .set = "USR", From 3aa18069d0038c0475ca598d2a9e9533ab6c28f7 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 5 Feb 2016 15:50:18 -0800 Subject: [PATCH 449/607] UPSTREAM: arm64: ubsan: select ARCH_HAS_UBSAN_SANITIZE_ALL To enable UBSAN on arm64, ARCH_HAS_UBSAN_SANITIZE_ALL need to be selected. Basic kernel bootup test is passed on arm64 with CONFIG_UBSAN_SANITIZE_ALL enabled. Signed-off-by: Yang Shi Acked-by: Andrey Ryabinin Tested-by: Mark Rutland Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit f0b7f8a4b44657386273a67179dd901c81cd11a6) Signed-off-by: Jeff Vander Stoep Change-Id: I640f50e70e3562bf1caf2ce164d6ed5640e041f6 --- arch/arm64/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 49988da6f342..71ce487d7b78 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -13,6 +13,7 @@ config ARM64 select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_COMPAT_IPC_PARSE_VERSION select ARCH_WANT_FRAME_POINTERS + select ARCH_HAS_UBSAN_SANITIZE_ALL select ARM_AMBA select ARM_ARCH_TIMER select ARM_GIC From a275e00e2833f58216ed58baae6de790a4dc27cb Mon Sep 17 00:00:00 2001 From: David Brown Date: Wed, 10 Feb 2016 13:52:22 -0800 Subject: [PATCH 450/607] UPSTREAM: arm64: vdso: Mark vDSO code as read-only Although the arm64 vDSO is cleanly separated by code/data with the code being read-only in userspace mappings, the code page is still writable from the kernel. There have been exploits (such as http://itszn.com/blog/?p=21) that take advantage of this on x86 to go from a bad kernel write to full root. Prevent this specific exploit on arm64 by putting the vDSO code page in read-only memory as well. 
Before the change: [ 3.138366] vdso: 2 pages (1 code @ ffffffc000a71000, 1 data @ ffffffc000a70000) ---[ Kernel Mapping ]--- 0xffffffc000000000-0xffffffc000082000 520K RW NX SHD AF UXN MEM/NORMAL 0xffffffc000082000-0xffffffc000200000 1528K ro x SHD AF UXN MEM/NORMAL 0xffffffc000200000-0xffffffc000800000 6M ro x SHD AF BLK UXN MEM/NORMAL 0xffffffc000800000-0xffffffc0009b6000 1752K ro x SHD AF UXN MEM/NORMAL 0xffffffc0009b6000-0xffffffc000c00000 2344K RW NX SHD AF UXN MEM/NORMAL 0xffffffc000c00000-0xffffffc008000000 116M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc00c000000-0xffffffc07f000000 1840M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc800000000-0xffffffc840000000 1G RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc840000000-0xffffffc87ae00000 942M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc87ae00000-0xffffffc87ae70000 448K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87af80000-0xffffffc87af8a000 40K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87af8b000-0xffffffc87b000000 468K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87b000000-0xffffffc87fe00000 78M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc87fe00000-0xffffffc87ff50000 1344K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87ff90000-0xffffffc87ffa0000 64K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87fff0000-0xffffffc880000000 64K RW NX SHD AF UXN MEM/NORMAL After: [ 3.138368] vdso: 2 pages (1 code @ ffffffc0006de000, 1 data @ ffffffc000a74000) ---[ Kernel Mapping ]--- 0xffffffc000000000-0xffffffc000082000 520K RW NX SHD AF UXN MEM/NORMAL 0xffffffc000082000-0xffffffc000200000 1528K ro x SHD AF UXN MEM/NORMAL 0xffffffc000200000-0xffffffc000800000 6M ro x SHD AF BLK UXN MEM/NORMAL 0xffffffc000800000-0xffffffc0009b8000 1760K ro x SHD AF UXN MEM/NORMAL 0xffffffc0009b8000-0xffffffc000c00000 2336K RW NX SHD AF UXN MEM/NORMAL 0xffffffc000c00000-0xffffffc008000000 116M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc00c000000-0xffffffc07f000000 1840M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc800000000-0xffffffc840000000 1G RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc840000000-0xffffffc87ae00000 942M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc87ae00000-0xffffffc87ae70000 448K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87af80000-0xffffffc87af8a000 40K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87af8b000-0xffffffc87b000000 468K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87b000000-0xffffffc87fe00000 78M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc87fe00000-0xffffffc87ff50000 1344K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87ff90000-0xffffffc87ffa0000 64K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87fff0000-0xffffffc880000000 64K RW NX SHD AF UXN MEM/NORMAL Inspired by https://lkml.org/lkml/2016/1/19/494 based on work by the PaX Team, Brad Spengler, and Kees Cook. 
Signed-off-by: David Brown Acked-by: Will Deacon Acked-by: Ard Biesheuvel [catalin.marinas@arm.com: removed superfluous __PAGE_ALIGNED_DATA] Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 88d8a7994e564d209d4b2583496631c2357d386b) Signed-off-by: Jeff Vander Stoep Change-Id: I3fe4b48df8b27313ac61c947746805442757932c --- arch/arm64/kernel/vdso/vdso.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/kernel/vdso/vdso.S b/arch/arm64/kernel/vdso/vdso.S index 60c1db54b41a..82379a70ef03 100644 --- a/arch/arm64/kernel/vdso/vdso.S +++ b/arch/arm64/kernel/vdso/vdso.S @@ -21,9 +21,8 @@ #include #include - __PAGE_ALIGNED_DATA - .globl vdso_start, vdso_end + .section .rodata .balign PAGE_SIZE vdso_start: .incbin "arch/arm64/kernel/vdso/vdso.so" From de30ecfe314d3aca506ed45e0892a942ce523bd4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 15 Feb 2016 09:51:49 +0100 Subject: [PATCH 451/607] UPSTREAM: arm64: use local label prefixes for __reg_num symbols The __reg_num_xNN symbols that are used to implement the msr_s and mrs_s macros are recorded in the ELF metadata of each object file. This does not affect the size of the final binary, but it does clutter the output of tools like readelf, i.e., $ readelf -a vmlinux |grep -c __reg_num_x 50976 So let's use symbols with the .L prefix, these are strictly local, and don't end up in the object files. $ readelf -a vmlinux |grep -c __reg_num_x 0 Acked-by: Will Deacon Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 7abc7d833c9eb16efc8a59239d3771a6e30be367) Signed-off-by: Jeff Vander Stoep Change-Id: Idefe9841ef7d1ddcc5161fc8de14153cfadaf4f3 --- arch/arm64/include/asm/sysreg.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index d48ab5b41f52..76907c94b11f 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -194,32 +194,32 @@ #ifdef __ASSEMBLY__ .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 - .equ __reg_num_x\num, \num + .equ .L__reg_num_x\num, \num .endr - .equ __reg_num_xzr, 31 + .equ .L__reg_num_xzr, 31 .macro mrs_s, rt, sreg - .inst 0xd5200000|(\sreg)|(__reg_num_\rt) + .inst 0xd5200000|(\sreg)|(.L__reg_num_\rt) .endm .macro msr_s, sreg, rt - .inst 0xd5000000|(\sreg)|(__reg_num_\rt) + .inst 0xd5000000|(\sreg)|(.L__reg_num_\rt) .endm #else asm( " .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n" -" .equ __reg_num_x\\num, \\num\n" +" .equ .L__reg_num_x\\num, \\num\n" " .endr\n" -" .equ __reg_num_xzr, 31\n" +" .equ .L__reg_num_xzr, 31\n" "\n" " .macro mrs_s, rt, sreg\n" -" .inst 0xd5200000|(\\sreg)|(__reg_num_\\rt)\n" +" .inst 0xd5200000|(\\sreg)|(.L__reg_num_\\rt)\n" " .endm\n" "\n" " .macro msr_s, sreg, rt\n" -" .inst 0xd5000000|(\\sreg)|(__reg_num_\\rt)\n" +" .inst 0xd5000000|(\\sreg)|(.L__reg_num_\\rt)\n" " .endm\n" ); From ac3060aed4ea0173b093c474fbd8981c4e1dc274 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Feb 2016 14:58:46 +0000 Subject: [PATCH 452/607] UPSTREAM: arm64: cpufeature: Change read_cpuid() to use sysreg's mrs_s macro Older assemblers may not have support for newer feature registers. To get round this, sysreg.h provides a 'mrs_s' macro that takes a register encoding and generates the raw instruction. 
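As an aside (illustrative only, not part of this patch), a read through that macro ends up looking like the sketch below; the assembler only ever sees a raw .inst word built from the sys_reg() encoding, so it needs no knowledge of the register name:

	#include <linux/stringify.h>
	#include <linux/types.h>
	#include <asm/sysreg.h>

	/* Sketch: read ID_AA64MMFR0_EL1 via the mrs_s assembler macro. The
	 * function name is illustrative; the kernel's read_cpuid() does the
	 * same thing generically. */
	static inline u64 read_mmfr0_sketch(void)
	{
		u64 val;

		asm("mrs_s %0, " __stringify(SYS_ID_AA64MMFR0_EL1) : "=r" (val));
		return val;
	}
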
Change read_cpuid() to use mrs_s in all cases so that new registers don't have to be a special case. Including sysreg.h means we need to move the include and definition of read_cpuid() after the #ifndef __ASSEMBLY__ to avoid syntax errors in vmlinux.lds. Signed-off-by: James Morse Acked-by: Mark Rutland Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 0f54b14e76f5302afe164dc911b049b5df836ff5) Signed-off-by: Jeff Vander Stoep Change-Id: Ic59c81a97b4585b3c0964f293ddee08f7cb594ac --- arch/arm64/include/asm/cpufeature.h | 2 +- arch/arm64/include/asm/cputype.h | 20 ++++++----- arch/arm64/kernel/cpufeature.c | 54 ++++++++++++++--------------- arch/arm64/kernel/cpuinfo.c | 50 +++++++++++++------------- arch/arm64/mm/context.c | 2 +- 5 files changed, 65 insertions(+), 63 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 8d56bd8550dc..8131abfabb0a 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -177,7 +177,7 @@ u64 read_system_reg(u32 id); static inline bool cpu_supports_mixed_endian_el0(void) { - return id_aa64mmfr0_mixed_endian_el0(read_cpuid(ID_AA64MMFR0_EL1)); + return id_aa64mmfr0_mixed_endian_el0(read_cpuid(SYS_ID_AA64MMFR0_EL1)); } static inline bool system_supports_mixed_endian_el0(void) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 7540284a17fe..b3a83da152a7 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -32,12 +32,6 @@ #define MPIDR_AFFINITY_LEVEL(mpidr, level) \ ((mpidr >> MPIDR_LEVEL_SHIFT(level)) & MPIDR_LEVEL_MASK) -#define read_cpuid(reg) ({ \ - u64 __val; \ - asm("mrs %0, " #reg : "=r" (__val)); \ - __val; \ -}) - #define MIDR_REVISION_MASK 0xf #define MIDR_REVISION(midr) ((midr) & MIDR_REVISION_MASK) #define MIDR_PARTNUM_SHIFT 4 @@ -92,6 +86,14 @@ #ifndef __ASSEMBLY__ +#include + +#define read_cpuid(reg) ({ \ + u64 __val; \ + asm("mrs_s %0, " __stringify(reg) : "=r" (__val)); \ + __val; \ +}) + /* * The CPU ID never changes at run time, so we might as well tell the * compiler that it's constant. 
Use this function to read the CPU ID @@ -99,12 +101,12 @@ */ static inline u32 __attribute_const__ read_cpuid_id(void) { - return read_cpuid(MIDR_EL1); + return read_cpuid(SYS_MIDR_EL1); } static inline u64 __attribute_const__ read_cpuid_mpidr(void) { - return read_cpuid(MPIDR_EL1); + return read_cpuid(SYS_MPIDR_EL1); } static inline unsigned int __attribute_const__ read_cpuid_implementor(void) @@ -119,7 +121,7 @@ static inline unsigned int __attribute_const__ read_cpuid_part_number(void) static inline u32 __attribute_const__ read_cpuid_cachetype(void) { - return read_cpuid(CTR_EL0); + return read_cpuid(SYS_CTR_EL0); } #endif /* __ASSEMBLY__ */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 3615d7d7c9af..1ef10e784031 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -808,35 +808,35 @@ static inline void set_sys_caps_initialised(void) static u64 __raw_read_system_reg(u32 sys_id) { switch (sys_id) { - case SYS_ID_PFR0_EL1: return (u64)read_cpuid(ID_PFR0_EL1); - case SYS_ID_PFR1_EL1: return (u64)read_cpuid(ID_PFR1_EL1); - case SYS_ID_DFR0_EL1: return (u64)read_cpuid(ID_DFR0_EL1); - case SYS_ID_MMFR0_EL1: return (u64)read_cpuid(ID_MMFR0_EL1); - case SYS_ID_MMFR1_EL1: return (u64)read_cpuid(ID_MMFR1_EL1); - case SYS_ID_MMFR2_EL1: return (u64)read_cpuid(ID_MMFR2_EL1); - case SYS_ID_MMFR3_EL1: return (u64)read_cpuid(ID_MMFR3_EL1); - case SYS_ID_ISAR0_EL1: return (u64)read_cpuid(ID_ISAR0_EL1); - case SYS_ID_ISAR1_EL1: return (u64)read_cpuid(ID_ISAR1_EL1); - case SYS_ID_ISAR2_EL1: return (u64)read_cpuid(ID_ISAR2_EL1); - case SYS_ID_ISAR3_EL1: return (u64)read_cpuid(ID_ISAR3_EL1); - case SYS_ID_ISAR4_EL1: return (u64)read_cpuid(ID_ISAR4_EL1); - case SYS_ID_ISAR5_EL1: return (u64)read_cpuid(ID_ISAR4_EL1); - case SYS_MVFR0_EL1: return (u64)read_cpuid(MVFR0_EL1); - case SYS_MVFR1_EL1: return (u64)read_cpuid(MVFR1_EL1); - case SYS_MVFR2_EL1: return (u64)read_cpuid(MVFR2_EL1); + case SYS_ID_PFR0_EL1: return read_cpuid(SYS_ID_PFR0_EL1); + case SYS_ID_PFR1_EL1: return read_cpuid(SYS_ID_PFR1_EL1); + case SYS_ID_DFR0_EL1: return read_cpuid(SYS_ID_DFR0_EL1); + case SYS_ID_MMFR0_EL1: return read_cpuid(SYS_ID_MMFR0_EL1); + case SYS_ID_MMFR1_EL1: return read_cpuid(SYS_ID_MMFR1_EL1); + case SYS_ID_MMFR2_EL1: return read_cpuid(SYS_ID_MMFR2_EL1); + case SYS_ID_MMFR3_EL1: return read_cpuid(SYS_ID_MMFR3_EL1); + case SYS_ID_ISAR0_EL1: return read_cpuid(SYS_ID_ISAR0_EL1); + case SYS_ID_ISAR1_EL1: return read_cpuid(SYS_ID_ISAR1_EL1); + case SYS_ID_ISAR2_EL1: return read_cpuid(SYS_ID_ISAR2_EL1); + case SYS_ID_ISAR3_EL1: return read_cpuid(SYS_ID_ISAR3_EL1); + case SYS_ID_ISAR4_EL1: return read_cpuid(SYS_ID_ISAR4_EL1); + case SYS_ID_ISAR5_EL1: return read_cpuid(SYS_ID_ISAR4_EL1); + case SYS_MVFR0_EL1: return read_cpuid(SYS_MVFR0_EL1); + case SYS_MVFR1_EL1: return read_cpuid(SYS_MVFR1_EL1); + case SYS_MVFR2_EL1: return read_cpuid(SYS_MVFR2_EL1); - case SYS_ID_AA64PFR0_EL1: return (u64)read_cpuid(ID_AA64PFR0_EL1); - case SYS_ID_AA64PFR1_EL1: return (u64)read_cpuid(ID_AA64PFR0_EL1); - case SYS_ID_AA64DFR0_EL1: return (u64)read_cpuid(ID_AA64DFR0_EL1); - case SYS_ID_AA64DFR1_EL1: return (u64)read_cpuid(ID_AA64DFR0_EL1); - case SYS_ID_AA64MMFR0_EL1: return (u64)read_cpuid(ID_AA64MMFR0_EL1); - case SYS_ID_AA64MMFR1_EL1: return (u64)read_cpuid(ID_AA64MMFR1_EL1); - case SYS_ID_AA64ISAR0_EL1: return (u64)read_cpuid(ID_AA64ISAR0_EL1); - case SYS_ID_AA64ISAR1_EL1: return (u64)read_cpuid(ID_AA64ISAR1_EL1); + case SYS_ID_AA64PFR0_EL1: return 
read_cpuid(SYS_ID_AA64PFR0_EL1); + case SYS_ID_AA64PFR1_EL1: return read_cpuid(SYS_ID_AA64PFR0_EL1); + case SYS_ID_AA64DFR0_EL1: return read_cpuid(SYS_ID_AA64DFR0_EL1); + case SYS_ID_AA64DFR1_EL1: return read_cpuid(SYS_ID_AA64DFR0_EL1); + case SYS_ID_AA64MMFR0_EL1: return read_cpuid(SYS_ID_AA64MMFR0_EL1); + case SYS_ID_AA64MMFR1_EL1: return read_cpuid(SYS_ID_AA64MMFR1_EL1); + case SYS_ID_AA64ISAR0_EL1: return read_cpuid(SYS_ID_AA64ISAR0_EL1); + case SYS_ID_AA64ISAR1_EL1: return read_cpuid(SYS_ID_AA64ISAR1_EL1); - case SYS_CNTFRQ_EL0: return (u64)read_cpuid(CNTFRQ_EL0); - case SYS_CTR_EL0: return (u64)read_cpuid(CTR_EL0); - case SYS_DCZID_EL0: return (u64)read_cpuid(DCZID_EL0); + case SYS_CNTFRQ_EL0: return read_cpuid(SYS_CNTFRQ_EL0); + case SYS_CTR_EL0: return read_cpuid(SYS_CTR_EL0); + case SYS_DCZID_EL0: return read_cpuid(SYS_DCZID_EL0); default: BUG(); return 0; diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 212ae6361d8b..76df22272804 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -201,35 +201,35 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) { info->reg_cntfrq = arch_timer_get_cntfrq(); info->reg_ctr = read_cpuid_cachetype(); - info->reg_dczid = read_cpuid(DCZID_EL0); + info->reg_dczid = read_cpuid(SYS_DCZID_EL0); info->reg_midr = read_cpuid_id(); - info->reg_id_aa64dfr0 = read_cpuid(ID_AA64DFR0_EL1); - info->reg_id_aa64dfr1 = read_cpuid(ID_AA64DFR1_EL1); - info->reg_id_aa64isar0 = read_cpuid(ID_AA64ISAR0_EL1); - info->reg_id_aa64isar1 = read_cpuid(ID_AA64ISAR1_EL1); - info->reg_id_aa64mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); - info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); - info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1); - info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1); + info->reg_id_aa64dfr0 = read_cpuid(SYS_ID_AA64DFR0_EL1); + info->reg_id_aa64dfr1 = read_cpuid(SYS_ID_AA64DFR1_EL1); + info->reg_id_aa64isar0 = read_cpuid(SYS_ID_AA64ISAR0_EL1); + info->reg_id_aa64isar1 = read_cpuid(SYS_ID_AA64ISAR1_EL1); + info->reg_id_aa64mmfr0 = read_cpuid(SYS_ID_AA64MMFR0_EL1); + info->reg_id_aa64mmfr1 = read_cpuid(SYS_ID_AA64MMFR1_EL1); + info->reg_id_aa64pfr0 = read_cpuid(SYS_ID_AA64PFR0_EL1); + info->reg_id_aa64pfr1 = read_cpuid(SYS_ID_AA64PFR1_EL1); - info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1); - info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1); - info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1); - info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1); - info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1); - info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1); - info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1); - info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1); - info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1); - info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1); - info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1); - info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1); - info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1); + info->reg_id_dfr0 = read_cpuid(SYS_ID_DFR0_EL1); + info->reg_id_isar0 = read_cpuid(SYS_ID_ISAR0_EL1); + info->reg_id_isar1 = read_cpuid(SYS_ID_ISAR1_EL1); + info->reg_id_isar2 = read_cpuid(SYS_ID_ISAR2_EL1); + info->reg_id_isar3 = read_cpuid(SYS_ID_ISAR3_EL1); + info->reg_id_isar4 = read_cpuid(SYS_ID_ISAR4_EL1); + info->reg_id_isar5 = read_cpuid(SYS_ID_ISAR5_EL1); + info->reg_id_mmfr0 = read_cpuid(SYS_ID_MMFR0_EL1); + info->reg_id_mmfr1 = read_cpuid(SYS_ID_MMFR1_EL1); + info->reg_id_mmfr2 = read_cpuid(SYS_ID_MMFR2_EL1); + info->reg_id_mmfr3 = read_cpuid(SYS_ID_MMFR3_EL1); + info->reg_id_pfr0 = read_cpuid(SYS_ID_PFR0_EL1); + 
info->reg_id_pfr1 = read_cpuid(SYS_ID_PFR1_EL1); - info->reg_mvfr0 = read_cpuid(MVFR0_EL1); - info->reg_mvfr1 = read_cpuid(MVFR1_EL1); - info->reg_mvfr2 = read_cpuid(MVFR2_EL1); + info->reg_mvfr0 = read_cpuid(SYS_MVFR0_EL1); + info->reg_mvfr1 = read_cpuid(SYS_MVFR1_EL1); + info->reg_mvfr2 = read_cpuid(SYS_MVFR2_EL1); cpuinfo_detect_icache_policy(info); diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index e87f53ff5f58..7275628ba59f 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -187,7 +187,7 @@ switch_mm_fastpath: static int asids_init(void) { - int fld = cpuid_feature_extract_field(read_cpuid(ID_AA64MMFR0_EL1), 4); + int fld = cpuid_feature_extract_field(read_cpuid(SYS_ID_AA64MMFR0_EL1), 4); switch (fld) { default: From 8d6623808d9b75778dce5afba7fbbbd387eb4160 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Feb 2016 14:58:47 +0000 Subject: [PATCH 453/607] UPSTREAM: arm64: add ARMv8.2 id_aa64mmfr2 boiler plate ARMv8.2 adds a new feature register id_aa64mmfr2. This patch adds the cpu feature boiler plate used by the actual features in later patches. Signed-off-by: James Morse Reviewed-by: Suzuki K Poulose Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 406e308770a92bd33995b2e5b681e86358328bb0) Signed-off-by: Jeff Vander Stoep Change-Id: I51db326696ba1e18e9a4c667acbeb3e25e0b151e --- arch/arm64/include/asm/cpu.h | 1 + arch/arm64/include/asm/sysreg.h | 4 ++++ arch/arm64/kernel/cpufeature.c | 10 ++++++++++ arch/arm64/kernel/cpuinfo.c | 1 + 4 files changed, 16 insertions(+) diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index b5e9cee4b5f8..13a6103130cd 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -36,6 +36,7 @@ struct cpuinfo_arm64 { u64 reg_id_aa64isar1; u64 reg_id_aa64mmfr0; u64 reg_id_aa64mmfr1; + u64 reg_id_aa64mmfr2; u64 reg_id_aa64pfr0; u64 reg_id_aa64pfr1; diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 76907c94b11f..4bc8655529df 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -70,6 +70,7 @@ #define SYS_ID_AA64MMFR0_EL1 sys_reg(3, 0, 0, 7, 0) #define SYS_ID_AA64MMFR1_EL1 sys_reg(3, 0, 0, 7, 1) +#define SYS_ID_AA64MMFR2_EL1 sys_reg(3, 0, 0, 7, 2) #define SYS_CNTFRQ_EL0 sys_reg(3, 3, 14, 0, 0) #define SYS_CTR_EL0 sys_reg(3, 3, 0, 0, 1) @@ -135,6 +136,9 @@ #define ID_AA64MMFR1_VMIDBITS_SHIFT 4 #define ID_AA64MMFR1_HADBS_SHIFT 0 +/* id_aa64mmfr2 */ +#define ID_AA64MMFR2_UAO_SHIFT 4 + /* id_aa64dfr0 */ #define ID_AA64DFR0_CTX_CMPS_SHIFT 28 #define ID_AA64DFR0_WRPS_SHIFT 20 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 1ef10e784031..42918c797e8e 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -123,6 +123,11 @@ static struct arm64_ftr_bits ftr_id_aa64mmfr1[] = { ARM64_FTR_END, }; +static struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { + ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_UAO_SHIFT, 4, 0), + ARM64_FTR_END, +}; + static struct arm64_ftr_bits ftr_ctr[] = { U_ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RAO */ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 28, 3, 0), @@ -284,6 +289,7 @@ static struct arm64_ftr_reg arm64_ftr_regs[] = { /* Op1 = 0, CRn = 0, CRm = 7 */ ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0), ARM64_FTR_REG(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1), + ARM64_FTR_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2), /* Op1 = 3, CRn = 0, CRm = 0 */ ARM64_FTR_REG(SYS_CTR_EL0, 
ftr_ctr), @@ -408,6 +414,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) init_cpu_ftr_reg(SYS_ID_AA64ISAR1_EL1, info->reg_id_aa64isar1); init_cpu_ftr_reg(SYS_ID_AA64MMFR0_EL1, info->reg_id_aa64mmfr0); init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1); + init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2); init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0); init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1); init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0); @@ -517,6 +524,8 @@ void update_cpu_features(int cpu, info->reg_id_aa64mmfr0, boot->reg_id_aa64mmfr0); taint |= check_update_ftr_reg(SYS_ID_AA64MMFR1_EL1, cpu, info->reg_id_aa64mmfr1, boot->reg_id_aa64mmfr1); + taint |= check_update_ftr_reg(SYS_ID_AA64MMFR2_EL1, cpu, + info->reg_id_aa64mmfr2, boot->reg_id_aa64mmfr2); /* * EL3 is not our concern. @@ -831,6 +840,7 @@ static u64 __raw_read_system_reg(u32 sys_id) case SYS_ID_AA64DFR1_EL1: return read_cpuid(SYS_ID_AA64DFR0_EL1); case SYS_ID_AA64MMFR0_EL1: return read_cpuid(SYS_ID_AA64MMFR0_EL1); case SYS_ID_AA64MMFR1_EL1: return read_cpuid(SYS_ID_AA64MMFR1_EL1); + case SYS_ID_AA64MMFR2_EL1: return read_cpuid(SYS_ID_AA64MMFR2_EL1); case SYS_ID_AA64ISAR0_EL1: return read_cpuid(SYS_ID_AA64ISAR0_EL1); case SYS_ID_AA64ISAR1_EL1: return read_cpuid(SYS_ID_AA64ISAR1_EL1); diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 76df22272804..966fbd52550b 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -210,6 +210,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_id_aa64isar1 = read_cpuid(SYS_ID_AA64ISAR1_EL1); info->reg_id_aa64mmfr0 = read_cpuid(SYS_ID_AA64MMFR0_EL1); info->reg_id_aa64mmfr1 = read_cpuid(SYS_ID_AA64MMFR1_EL1); + info->reg_id_aa64mmfr2 = read_cpuid(SYS_ID_AA64MMFR2_EL1); info->reg_id_aa64pfr0 = read_cpuid(SYS_ID_AA64PFR0_EL1); info->reg_id_aa64pfr1 = read_cpuid(SYS_ID_AA64PFR1_EL1); From dba95a8428b15b2cff37f5b07e4539ccb82a330e Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Feb 2016 14:58:48 +0000 Subject: [PATCH 454/607] UPSTREAM: arm64: kernel: Add support for User Access Override 'User Access Override' is a new ARMv8.2 feature which allows the unprivileged load and store instructions to be overridden to behave in the normal way. This patch converts {get,put}_user() and friends to use ldtr*/sttr* instructions - so that they can only access EL0 memory, then enables UAO when fs==KERNEL_DS so that these functions can access kernel memory. 
This allows user space's read/write permissions to be checked against the page tables, instead of testing addr [catalin.marinas@arm.com: move uao_thread_switch() above dsb()] Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 57f4959bad0a154aeca125b7d38d1d9471a12422) Signed-off-by: Jeff Vander Stoep Change-Id: I1a6a74a1f33b92d54368bd99387b55cf62930903 --- arch/arm64/Kconfig | 21 ++++++++ arch/arm64/include/asm/alternative.h | 72 ++++++++++++++++++++++++++++ arch/arm64/include/asm/cpufeature.h | 3 +- arch/arm64/include/asm/processor.h | 1 + arch/arm64/include/asm/sysreg.h | 3 ++ arch/arm64/include/asm/thread_info.h | 6 +++ arch/arm64/include/asm/uaccess.h | 44 ++++++++++++----- arch/arm64/include/uapi/asm/ptrace.h | 1 + arch/arm64/kernel/cpufeature.c | 11 +++++ arch/arm64/kernel/process.c | 19 ++++++++ arch/arm64/lib/clear_user.S | 8 ++-- arch/arm64/lib/copy_from_user.S | 8 ++-- arch/arm64/lib/copy_in_user.S | 16 +++---- arch/arm64/lib/copy_to_user.S | 8 ++-- arch/arm64/mm/fault.c | 31 +++++++++--- 15 files changed, 213 insertions(+), 39 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 71ce487d7b78..ae51e1b18337 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -740,6 +740,27 @@ config ARM64_LSE_ATOMICS endmenu +config ARM64_UAO + bool "Enable support for User Access Override (UAO)" + default y + help + User Access Override (UAO; part of the ARMv8.2 Extensions) + causes the 'unprivileged' variant of the load/store instructions to + be overriden to be privileged. + + This option changes get_user() and friends to use the 'unprivileged' + variant of the load/store instructions. This ensures that user-space + really did have access to the supplied memory. When addr_limit is + set to kernel memory the UAO bit will be set, allowing privileged + access to kernel memory. + + Choosing this option will cause copy_to_user() et al to use user-space + memory permissions. + + The feature is detected at runtime, the kernel will use the + regular load/store instructions if the cpu does not implement the + feature. + endmenu menu "Boot options" diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index e4962f04201e..a9fc24ec1aa9 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -1,6 +1,8 @@ #ifndef __ASM_ALTERNATIVE_H #define __ASM_ALTERNATIVE_H +#include + #ifndef __ASSEMBLY__ #include @@ -63,6 +65,8 @@ void apply_alternatives(void *start, size_t length); #else +#include + .macro altinstruction_entry orig_offset alt_offset feature orig_len alt_len .word \orig_offset - . .word \alt_offset - . @@ -136,6 +140,74 @@ void apply_alternatives(void *start, size_t length); alternative_insn insn1, insn2, cap, IS_ENABLED(cfg) +/* + * Generate the assembly for UAO alternatives with exception table entries. + * This is complicated as there is no post-increment or pair versions of the + * unprivileged instructions, and USER() only works for single instructions. 
+ */ +#ifdef CONFIG_ARM64_UAO + .macro uao_ldp l, reg1, reg2, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: ldp \reg1, \reg2, [\addr], \post_inc; +8889: nop; + nop; + alternative_else + ldtr \reg1, [\addr]; + ldtr \reg2, [\addr, #8]; + add \addr, \addr, \post_inc; + alternative_endif + + .section __ex_table,"a"; + .align 3; + .quad 8888b,\l; + .quad 8889b,\l; + .previous; + .endm + + .macro uao_stp l, reg1, reg2, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: stp \reg1, \reg2, [\addr], \post_inc; +8889: nop; + nop; + alternative_else + sttr \reg1, [\addr]; + sttr \reg2, [\addr, #8]; + add \addr, \addr, \post_inc; + alternative_endif + + .section __ex_table,"a"; + .align 3; + .quad 8888b,\l; + .quad 8889b,\l; + .previous + .endm + + .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: \inst \reg, [\addr], \post_inc; + nop; + alternative_else + \alt_inst \reg, [\addr]; + add \addr, \addr, \post_inc; + alternative_endif + + .section __ex_table,"a"; + .align 3; + .quad 8888b,\l; + .previous + .endm +#else + .macro uao_ldp l, reg1, reg2, addr, post_inc + USER(\l, ldp \reg1, \reg2, [\addr], \post_inc) + .endm + .macro uao_stp l, reg1, reg2, addr, post_inc + USER(\l, stp \reg1, \reg2, [\addr], \post_inc) + .endm + .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc + USER(\l, \inst \reg, [\addr], \post_inc) + .endm +#endif + #endif /* __ASSEMBLY__ */ /* diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 8131abfabb0a..a5df7cde616b 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -31,8 +31,9 @@ #define ARM64_WORKAROUND_CAVIUM_23154 6 #define ARM64_WORKAROUND_834220 7 #define ARM64_HAS_NO_HW_PREFETCH 8 +#define ARM64_HAS_UAO 9 -#define ARM64_NCAPS 9 +#define ARM64_NCAPS 10 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 5bb1d763d17a..cef1cf398356 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -191,5 +191,6 @@ static inline void spin_lock_prefetch(const void *ptr) #endif void cpu_enable_pan(void *__unused); +void cpu_enable_uao(void *__unused); #endif /* __ASM_PROCESSOR_H */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 4bc8655529df..b9fd8ec79033 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -77,9 +77,12 @@ #define SYS_DCZID_EL0 sys_reg(3, 3, 0, 0, 7) #define REG_PSTATE_PAN_IMM sys_reg(0, 0, 4, 0, 4) +#define REG_PSTATE_UAO_IMM sys_reg(0, 0, 4, 0, 3) #define SET_PSTATE_PAN(x) __inst_arm(0xd5000000 | REG_PSTATE_PAN_IMM |\ (!!x)<<8 | 0x1f) +#define SET_PSTATE_UAO(x) __inst_arm(0xd5000000 | REG_PSTATE_UAO_IMM |\ + (!!x)<<8 | 0x1f) /* SCTLR_EL1 */ #define SCTLR_EL1_CP15BEN (0x1 << 5) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index abd64bd1f6d9..eba8db6838af 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -85,6 +85,12 @@ static inline struct thread_info *current_thread_info(void) return (struct thread_info *)sp_el0; } +/* Access struct thread_info of another thread */ +static inline struct thread_info *get_thread_info(unsigned long thread_stack) +{ + return (struct thread_info *)(thread_stack & ~(THREAD_SIZE - 1)); +} + #define thread_saved_pc(tsk) \ ((unsigned long)(tsk->thread.cpu_context.pc)) #define thread_saved_sp(tsk) \ diff --git 
a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 2a44b9795532..a227ba376f23 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -64,6 +64,16 @@ extern int fixup_exception(struct pt_regs *regs); static inline void set_fs(mm_segment_t fs) { current_thread_info()->addr_limit = fs; + + /* + * Enable/disable UAO so that copy_to_user() etc can access + * kernel memory with the unprivileged instructions. + */ + if (IS_ENABLED(CONFIG_ARM64_UAO) && fs == KERNEL_DS) + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO)); + else + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO, + CONFIG_ARM64_UAO)); } #define segment_eq(a, b) ((a) == (b)) @@ -113,9 +123,10 @@ static inline void set_fs(mm_segment_t fs) * The "__xxx_error" versions set the third argument to -EFAULT if an error * occurs, and leave it unchanged on success. */ -#define __get_user_asm(instr, reg, x, addr, err) \ +#define __get_user_asm(instr, alt_instr, reg, x, addr, err, feature) \ asm volatile( \ - "1: " instr " " reg "1, [%2]\n" \ + "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \ + alt_instr " " reg "1, [%2]\n", feature) \ "2:\n" \ " .section .fixup, \"ax\"\n" \ " .align 2\n" \ @@ -138,16 +149,20 @@ do { \ CONFIG_ARM64_PAN)); \ switch (sizeof(*(ptr))) { \ case 1: \ - __get_user_asm("ldrb", "%w", __gu_val, (ptr), (err)); \ + __get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 2: \ - __get_user_asm("ldrh", "%w", __gu_val, (ptr), (err)); \ + __get_user_asm("ldrh", "ldtrh", "%w", __gu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 4: \ - __get_user_asm("ldr", "%w", __gu_val, (ptr), (err)); \ + __get_user_asm("ldr", "ldtr", "%w", __gu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 8: \ - __get_user_asm("ldr", "%", __gu_val, (ptr), (err)); \ + __get_user_asm("ldr", "ldtr", "%", __gu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ default: \ BUILD_BUG(); \ @@ -181,9 +196,10 @@ do { \ ((x) = 0, -EFAULT); \ }) -#define __put_user_asm(instr, reg, x, addr, err) \ +#define __put_user_asm(instr, alt_instr, reg, x, addr, err, feature) \ asm volatile( \ - "1: " instr " " reg "1, [%2]\n" \ + "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \ + alt_instr " " reg "1, [%2]\n", feature) \ "2:\n" \ " .section .fixup,\"ax\"\n" \ " .align 2\n" \ @@ -205,16 +221,20 @@ do { \ CONFIG_ARM64_PAN)); \ switch (sizeof(*(ptr))) { \ case 1: \ - __put_user_asm("strb", "%w", __pu_val, (ptr), (err)); \ + __put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 2: \ - __put_user_asm("strh", "%w", __pu_val, (ptr), (err)); \ + __put_user_asm("strh", "sttrh", "%w", __pu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 4: \ - __put_user_asm("str", "%w", __pu_val, (ptr), (err)); \ + __put_user_asm("str", "sttr", "%w", __pu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 8: \ - __put_user_asm("str", "%", __pu_val, (ptr), (err)); \ + __put_user_asm("str", "sttr", "%", __pu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ default: \ BUILD_BUG(); \ diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index 208db3df135a..b5c3933ed441 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -45,6 +45,7 @@ #define PSR_A_BIT 0x00000100 #define PSR_D_BIT 0x00000200 #define PSR_PAN_BIT 0x00400000 +#define PSR_UAO_BIT 0x00800000 #define PSR_Q_BIT 0x08000000 #define PSR_V_BIT 0x10000000 #define PSR_C_BIT 0x20000000 diff --git 
a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 42918c797e8e..ae22edf9d3c9 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -677,6 +677,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_NO_HW_PREFETCH, .matches = has_no_hw_prefetch, }, +#ifdef CONFIG_ARM64_UAO + { + .desc = "User Access Override", + .capability = ARM64_HAS_UAO, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64MMFR2_EL1, + .field_pos = ID_AA64MMFR2_UAO_SHIFT, + .min_field_value = 1, + .enable = cpu_enable_uao, + }, +#endif /* CONFIG_ARM64_UAO */ {}, }; diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 5f6244526e31..4431e8364881 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -46,6 +46,7 @@ #include #include +#include #include #include #include @@ -346,6 +347,9 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, } else { memset(childregs, 0, sizeof(struct pt_regs)); childregs->pstate = PSR_MODE_EL1h; + if (IS_ENABLED(CONFIG_ARM64_UAO) && + cpus_have_cap(ARM64_HAS_UAO)) + childregs->pstate |= PSR_UAO_BIT; p->thread.cpu_context.x19 = stack_start; p->thread.cpu_context.x20 = stk_sz; } @@ -374,6 +378,20 @@ static void tls_thread_switch(struct task_struct *next) : : "r" (tpidr), "r" (tpidrro)); } +/* Restore the UAO state depending on next's addr_limit */ +static void uao_thread_switch(struct task_struct *next) +{ + unsigned long next_sp = next->thread.cpu_context.sp; + + if (IS_ENABLED(CONFIG_ARM64_UAO) && + get_thread_info(next_sp)->addr_limit == KERNEL_DS) + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO, + CONFIG_ARM64_UAO)); + else + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO, + CONFIG_ARM64_UAO)); +} + /* * Thread switching. 
*/ @@ -386,6 +404,7 @@ struct task_struct *__switch_to(struct task_struct *prev, tls_thread_switch(next); hw_breakpoint_thread_switch(next); contextidr_thread_switch(next); + uao_thread_switch(next); /* * Complete any pending TLB or cache maintenance on this CPU in case diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index a9723c71c52b..3f950b677c07 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -39,20 +39,20 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ subs x1, x1, #8 b.mi 2f 1: -USER(9f, str xzr, [x0], #8 ) +uao_user_alternative 9f, str, sttr, xzr, x0, 8 subs x1, x1, #8 b.pl 1b 2: adds x1, x1, #4 b.mi 3f -USER(9f, str wzr, [x0], #4 ) +uao_user_alternative 9f, str, sttr, wzr, x0, 4 sub x1, x1, #4 3: adds x1, x1, #2 b.mi 4f -USER(9f, strh wzr, [x0], #2 ) +uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 sub x1, x1, #2 4: adds x1, x1, #1 b.mi 5f -USER(9f, strb wzr, [x0] ) +uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 281e75db899a..be09fda09986 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -34,7 +34,7 @@ */ .macro ldrb1 ptr, regB, val - USER(9998f, ldrb \ptr, [\regB], \val) + uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val .endm .macro strb1 ptr, regB, val @@ -42,7 +42,7 @@ .endm .macro ldrh1 ptr, regB, val - USER(9998f, ldrh \ptr, [\regB], \val) + uao_user_alternative 9998f, ldrh, ldtrh, \ptr, \regB, \val .endm .macro strh1 ptr, regB, val @@ -50,7 +50,7 @@ .endm .macro ldr1 ptr, regB, val - USER(9998f, ldr \ptr, [\regB], \val) + uao_user_alternative 9998f, ldr, ldtr, \ptr, \regB, \val .endm .macro str1 ptr, regB, val @@ -58,7 +58,7 @@ .endm .macro ldp1 ptr, regB, regC, val - USER(9998f, ldp \ptr, \regB, [\regC], \val) + uao_ldp 9998f, \ptr, \regB, \regC, \val .endm .macro stp1 ptr, regB, regC, val diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index 81c8fc93c100..feaad1520dc1 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -35,35 +35,35 @@ * x0 - bytes not copied */ .macro ldrb1 ptr, regB, val - USER(9998f, ldrb \ptr, [\regB], \val) + uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val .endm .macro strb1 ptr, regB, val - USER(9998f, strb \ptr, [\regB], \val) + uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val .endm .macro ldrh1 ptr, regB, val - USER(9998f, ldrh \ptr, [\regB], \val) + uao_user_alternative 9998f, ldrh, ldtrh, \ptr, \regB, \val .endm .macro strh1 ptr, regB, val - USER(9998f, strh \ptr, [\regB], \val) + uao_user_alternative 9998f, strh, sttrh, \ptr, \regB, \val .endm .macro ldr1 ptr, regB, val - USER(9998f, ldr \ptr, [\regB], \val) + uao_user_alternative 9998f, ldr, ldtr, \ptr, \regB, \val .endm .macro str1 ptr, regB, val - USER(9998f, str \ptr, [\regB], \val) + uao_user_alternative 9998f, str, sttr, \ptr, \regB, \val .endm .macro ldp1 ptr, regB, regC, val - USER(9998f, ldp \ptr, \regB, [\regC], \val) + uao_ldp 9998f, \ptr, \regB, \regC, \val .endm .macro stp1 ptr, regB, regC, val - USER(9998f, stp \ptr, \regB, [\regC], \val) + uao_stp 9998f, \ptr, \regB, \regC, \val .endm end .req x5 diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index db4d187de61f..b4b9178aa36b 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -37,7 +37,7 @@ 
.endm .macro strb1 ptr, regB, val - USER(9998f, strb \ptr, [\regB], \val) + uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val .endm .macro ldrh1 ptr, regB, val @@ -45,7 +45,7 @@ .endm .macro strh1 ptr, regB, val - USER(9998f, strh \ptr, [\regB], \val) + uao_user_alternative 9998f, strh, sttrh, \ptr, \regB, \val .endm .macro ldr1 ptr, regB, val @@ -53,7 +53,7 @@ .endm .macro str1 ptr, regB, val - USER(9998f, str \ptr, [\regB], \val) + uao_user_alternative 9998f, str, sttr, \ptr, \regB, \val .endm .macro ldp1 ptr, regB, regC, val @@ -61,7 +61,7 @@ .endm .macro stp1 ptr, regB, regC, val - USER(9998f, stp \ptr, \regB, [\regC], \val) + uao_stp 9998f, \ptr, \regB, \regC, \val .endm end .req x5 diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 92ddac1e8ca2..820d47353cf0 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -192,6 +192,14 @@ out: return fault; } +static inline int permission_fault(unsigned int esr) +{ + unsigned int ec = (esr & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT; + unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; + + return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM); +} + static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, struct pt_regs *regs) { @@ -225,12 +233,10 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, mm_flags |= FAULT_FLAG_WRITE; } - /* - * PAN bit set implies the fault happened in kernel space, but not - * in the arch's user access functions. - */ - if (IS_ENABLED(CONFIG_ARM64_PAN) && (regs->pstate & PSR_PAN_BIT)) - goto no_context; + if (permission_fault(esr) && (addr < USER_DS)) { + if (!search_exception_tables(regs->pc)) + panic("Accessing user space memory outside uaccess.h routines"); + } /* * As per x86, we may deadlock here. However, since the kernel only @@ -561,3 +567,16 @@ void cpu_enable_pan(void *__unused) config_sctlr_el1(SCTLR_EL1_SPAN, 0); } #endif /* CONFIG_ARM64_PAN */ + +#ifdef CONFIG_ARM64_UAO +/* + * Kernel threads have fs=KERNEL_DS by default, and don't need to call + * set_fs(), devtmpfs in particular relies on this behaviour. + * We need to enable the feature at runtime (instead of adding it to + * PSR_MODE_EL1h) as the feature may not be implemented by the cpu. + */ +void cpu_enable_uao(void *__unused) +{ + asm(SET_PSTATE_UAO(1)); +} +#endif /* CONFIG_ARM64_UAO */ From f7a32945ec495b6b88c251bd240b0d54f6636f06 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Feb 2016 14:58:49 +0000 Subject: [PATCH 455/607] UPSTREAM: arm64: cpufeature: Test 'matches' pointer to find the end of the list CPU feature code uses the desc field as a test to find the end of the list, this means every entry must have a description. This generates noise for entries in the list that aren't really features, but combinations of them. e.g. > CPU features: detected feature: Privileged Access Never > CPU features: detected feature: PAN and not UAO These combination features are needed for corner cases with alternatives, where cpu features interact. Change all walkers of the arm64_features[] and arm64_hwcaps[] lists to test 'matches' not 'desc', and only print 'desc' if it is non-NULL. 
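A minimal sketch of the resulting convention (simplified names, not the kernel's exact structures): the walker stops on a NULL 'matches' pointer, and 'desc' only controls whether a log line is printed:

	#include <linux/types.h>
	#include <linux/printk.h>

	struct cap_entry {
		const char *desc;			/* NULL for combination entries */
		bool (*matches)(const struct cap_entry *e);
	};

	static void detect_caps_sketch(const struct cap_entry *caps)
	{
		int i;

		for (i = 0; caps[i].matches; i++) {	/* 'matches' terminates the list */
			if (!caps[i].matches(&caps[i]))
				continue;
			if (caps[i].desc)		/* only real features get logged */
				pr_info("detected feature: %s\n", caps[i].desc);
		}
	}
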
Signed-off-by: James Morse Reviewed-by : Suzuki K Poulose Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 644c2ae198412c956700e55a2acf80b2541f6aa5) Signed-off-by: Jeff Vander Stoep Change-Id: I4500bb7c547e2e67ea56e242a8621df539f6fd67 --- arch/arm64/kernel/cpufeature.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index ae22edf9d3c9..9cc8186cd14b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -771,7 +771,7 @@ static void __init setup_cpu_hwcaps(void) int i; const struct arm64_cpu_capabilities *hwcaps = arm64_hwcaps; - for (i = 0; hwcaps[i].desc; i++) + for (i = 0; hwcaps[i].matches; i++) if (hwcaps[i].matches(&hwcaps[i])) cap_set_hwcap(&hwcaps[i]); } @@ -781,11 +781,11 @@ void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, { int i; - for (i = 0; caps[i].desc; i++) { + for (i = 0; caps[i].matches; i++) { if (!caps[i].matches(&caps[i])) continue; - if (!cpus_have_cap(caps[i].capability)) + if (!cpus_have_cap(caps[i].capability) && caps[i].desc) pr_info("%s %s\n", info, caps[i].desc); cpus_set_cap(caps[i].capability); } @@ -800,7 +800,7 @@ enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps) { int i; - for (i = 0; caps[i].desc; i++) + for (i = 0; caps[i].matches; i++) if (caps[i].enable && cpus_have_cap(caps[i].capability)) on_each_cpu(caps[i].enable, NULL, true); } @@ -907,7 +907,7 @@ void verify_local_cpu_capabilities(void) return; caps = arm64_features; - for (i = 0; caps[i].desc; i++) { + for (i = 0; caps[i].matches; i++) { if (!cpus_have_cap(caps[i].capability) || !caps[i].sys_reg) continue; /* @@ -920,7 +920,7 @@ void verify_local_cpu_capabilities(void) caps[i].enable(NULL); } - for (i = 0, caps = arm64_hwcaps; caps[i].desc; i++) { + for (i = 0, caps = arm64_hwcaps; caps[i].matches; i++) { if (!cpus_have_hwcap(&caps[i])) continue; if (!feature_matches(__raw_read_system_reg(caps[i].sys_reg), &caps[i])) From 612055f383a023e12042f75d6d951e76ee4c5748 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Feb 2016 14:58:50 +0000 Subject: [PATCH 456/607] BACKPORT: arm64: kernel: Don't toggle PAN on systems with UAO If a CPU supports both Privileged Access Never (PAN) and User Access Override (UAO), we don't need to disable/re-enable PAN round all copy_to_user() like calls. UAO alternatives cause these calls to use the 'unprivileged' load/store instructions, which are overridden to be the privileged kind when fs==KERNEL_DS. This patch changes the copy_to_user() calls to have their PAN toggling depend on a new composite 'feature' ARM64_ALT_PAN_NOT_UAO. If both features are detected, PAN will be enabled, but the copy_to_user() alternatives will not be applied. This means PAN will be enabled all the time for these functions. If only PAN is detected, the toggling will be enabled as normal. This will save the time taken to disable/re-enable PAN, and allow us to catch copy_to_user() accesses that occur with fs==KERNEL_DS. Futex and swp-emulation code continue to hang their PAN toggling code on ARM64_HAS_PAN. 
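The resulting policy can be summarised in a small sketch (illustrative only): the uaccess PAN toggle is patched in exactly when PAN is implemented and UAO is not:

	#include <linux/types.h>

	/*
	 * PAN  UAO   toggle PAN around copy_to_user() and friends?
	 *  no   -    no  (nothing to toggle)
	 *  yes  no   yes (ARM64_ALT_PAN_NOT_UAO)
	 *  yes  yes  no  (PAN stays set; UAO instructions honour user permissions)
	 */
	static bool alt_pan_not_uao_sketch(bool has_pan, bool has_uao)
	{
		return has_pan && !has_uao;
	}
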
Signed-off-by: James Morse Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 705441960033e66b63524521f153fbb28c99ddbd) Signed-off-by: Jeff Vander Stoep Change-Id: I3fa35ebacaf401e1344e76932a26fdd14a8a3cdb --- arch/arm64/include/asm/cpufeature.h | 3 ++- arch/arm64/include/asm/uaccess.h | 8 ++++---- arch/arm64/kernel/cpufeature.c | 16 ++++++++++++++++ arch/arm64/lib/clear_user.S | 4 ++-- arch/arm64/lib/copy_from_user.S | 4 ++-- arch/arm64/lib/copy_in_user.S | 4 ++-- arch/arm64/lib/copy_to_user.S | 4 ++-- arch/arm64/mm/fault.c | 3 +++ 8 files changed, 33 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index a5df7cde616b..37a53fc6b384 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -32,8 +32,9 @@ #define ARM64_WORKAROUND_834220 7 #define ARM64_HAS_NO_HW_PREFETCH 8 #define ARM64_HAS_UAO 9 +#define ARM64_ALT_PAN_NOT_UAO 10 -#define ARM64_NCAPS 10 +#define ARM64_NCAPS 11 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index a227ba376f23..7b07e73b6316 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -145,7 +145,7 @@ static inline void set_fs(mm_segment_t fs) do { \ unsigned long __gu_val; \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ CONFIG_ARM64_PAN)); \ switch (sizeof(*(ptr))) { \ case 1: \ @@ -168,7 +168,7 @@ do { \ BUILD_BUG(); \ } \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ CONFIG_ARM64_PAN)); \ } while (0) @@ -217,7 +217,7 @@ do { \ do { \ __typeof__(*(ptr)) __pu_val = (x); \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ CONFIG_ARM64_PAN)); \ switch (sizeof(*(ptr))) { \ case 1: \ @@ -239,7 +239,7 @@ do { \ default: \ BUILD_BUG(); \ } \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ CONFIG_ARM64_PAN)); \ } while (0) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9cc8186cd14b..7566cad9fa1d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -67,6 +67,10 @@ DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); .width = 0, \ } +/* meta feature for alternatives */ +static bool __maybe_unused +cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry); + static struct arm64_ftr_bits ftr_id_aa64isar0[] = { ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 32, 0), ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64ISAR0_RDM_SHIFT, 4, 0), @@ -688,6 +692,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .enable = cpu_enable_uao, }, #endif /* CONFIG_ARM64_UAO */ +#ifdef CONFIG_ARM64_PAN + { + .capability = ARM64_ALT_PAN_NOT_UAO, + .matches = cpufeature_pan_not_uao, + }, +#endif /* CONFIG_ARM64_PAN */ {}, }; @@ -966,3 +976,9 @@ void __init setup_cpu_features(void) pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback Granule (%d < %d)\n", L1_CACHE_BYTES, cls); } + +static bool __maybe_unused +cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry) +{ + return (cpus_have_cap(ARM64_HAS_PAN) && !cpus_have_cap(ARM64_HAS_UAO)); +} diff --git 
a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index 3f950b677c07..5d1cad3ce6d6 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -33,7 +33,7 @@ * Alignment fixed up by hardware. */ ENTRY(__clear_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x2, x1 // save the size for fixup return subs x1, x1, #8 @@ -54,7 +54,7 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 b.mi 5f uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) ret ENDPROC(__clear_user) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index be09fda09986..0b90497d4424 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -67,11 +67,11 @@ end .req x5 ENTRY(__arch_copy_from_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x0, #0 // Nothing to copy ret diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index feaad1520dc1..f7292dd08c84 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -68,11 +68,11 @@ end .req x5 ENTRY(__copy_in_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x0, #0 ret diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index b4b9178aa36b..7a7efe255034 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -66,11 +66,11 @@ end .req x5 ENTRY(__arch_copy_to_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x0, #0 ret diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 820d47353cf0..d0762a729d01 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -234,6 +234,9 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, } if (permission_fault(esr) && (addr < USER_DS)) { + if (get_thread_info(regs->sp)->addr_limit == KERNEL_DS) + panic("Accessing user space memory with fs=KERNEL_DS"); + if (!search_exception_tables(regs->pc)) panic("Accessing user space memory outside uaccess.h routines"); } From c39e2026d6c2daef475d66cc0656769f93b42fc4 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 18 Feb 2016 15:50:04 +0000 Subject: [PATCH 457/607] UPSTREAM: arm64: Remove the get_thread_info() function This function was introduced by previous commits implementing UAO. 
However, it can be replaced with task_thread_info() in uao_thread_switch() or get_fs() in do_page_fault() (the latter being called only on the current context, so no need for using the saved pt_regs). Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit e950631e84e7e38892ffbeee5e1816b270026b0e) Signed-off-by: Jeff Vander Stoep Change-Id: Ic6e9b6af7314fa83d9b0773ae3fac5a2ff34e67a --- arch/arm64/include/asm/thread_info.h | 6 ------ arch/arm64/kernel/process.c | 15 ++++++--------- arch/arm64/mm/fault.c | 2 +- 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index eba8db6838af..abd64bd1f6d9 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -85,12 +85,6 @@ static inline struct thread_info *current_thread_info(void) return (struct thread_info *)sp_el0; } -/* Access struct thread_info of another thread */ -static inline struct thread_info *get_thread_info(unsigned long thread_stack) -{ - return (struct thread_info *)(thread_stack & ~(THREAD_SIZE - 1)); -} - #define thread_saved_pc(tsk) \ ((unsigned long)(tsk->thread.cpu_context.pc)) #define thread_saved_sp(tsk) \ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 4431e8364881..6f3fb46170bf 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -381,15 +381,12 @@ static void tls_thread_switch(struct task_struct *next) /* Restore the UAO state depending on next's addr_limit */ static void uao_thread_switch(struct task_struct *next) { - unsigned long next_sp = next->thread.cpu_context.sp; - - if (IS_ENABLED(CONFIG_ARM64_UAO) && - get_thread_info(next_sp)->addr_limit == KERNEL_DS) - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO, - CONFIG_ARM64_UAO)); - else - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO, - CONFIG_ARM64_UAO)); + if (IS_ENABLED(CONFIG_ARM64_UAO)) { + if (task_thread_info(next)->addr_limit == KERNEL_DS) + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO)); + else + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO)); + } } /* diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index d0762a729d01..a8eafeceb08a 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -234,7 +234,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, } if (permission_fault(esr) && (addr < USER_DS)) { - if (get_thread_info(regs->sp)->addr_limit == KERNEL_DS) + if (get_fs() == KERNEL_DS) panic("Accessing user space memory with fs=KERNEL_DS"); if (!search_exception_tables(regs->pc)) From 6bfaecfc5801909762309f4ec831f789581f279b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:32 +0100 Subject: [PATCH 458/607] UPSTREAM: of/fdt: make memblock minimum physical address arch configurable By default, early_init_dt_add_memory_arch() ignores memory below the base of the kernel image since it won't be addressable via the linear mapping. However, this is not appropriate anymore once we decouple the kernel text mapping from the linear mapping, so archs may want to drop the low limit entirely. So allow the minimum to be overridden by setting MIN_MEMBLOCK_ADDR. 
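As a purely hypothetical illustration of the new hook (the real arm64 override arrives in a later patch, and the header it would live in is an assumption here), an architecture that can address memory below its kernel image could drop the limit entirely:

	/* Hypothetical arch override, placed in a header that is visible to
	 * drivers/of/fdt.c; the value 0 is illustrative only. */
	#define MIN_MEMBLOCK_ADDR	0	/* accept DT memory below the kernel image */
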
Acked-by: Mark Rutland Acked-by: Rob Herring Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 270522a04f7a9911983878fa37da467f9ff1c938) Signed-off-by: Jeff Vander Stoep Change-Id: I4bb2626a87493262a64584b3d808de260129127e --- drivers/of/fdt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 86fff625f6d5..628f436e16f6 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -1006,13 +1006,16 @@ int __init early_init_dt_scan_chosen(unsigned long node, const char *uname, } #ifdef CONFIG_HAVE_MEMBLOCK +#ifndef MIN_MEMBLOCK_ADDR +#define MIN_MEMBLOCK_ADDR __pa(PAGE_OFFSET) +#endif #ifndef MAX_MEMBLOCK_ADDR #define MAX_MEMBLOCK_ADDR ((phys_addr_t)~0) #endif void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size) { - const u64 phys_offset = __pa(PAGE_OFFSET); + const u64 phys_offset = MIN_MEMBLOCK_ADDR; if (!PAGE_ALIGNED(base)) { if (size < PAGE_SIZE - (base & ~PAGE_MASK)) { From 6f77d1a3662212ea72e5c37c381bf0d2a09a463e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:33 +0100 Subject: [PATCH 459/607] UPSTREAM: of/fdt: factor out assignment of initrd_start/initrd_end Since architectures may not yet have their linear mapping up and running when the initrd address is discovered from the DT, factor out the assignment of initrd_start and initrd_end, so that an architecture can override it and use the translation it needs. Signed-off-by: Ard Biesheuvel Acked-by: Rob Herring Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 369bc9abf22bf026e8645a4dd746b90649a2f6ee) Signed-off-by: Jeff Vander Stoep Change-Id: I8c258bdc4367955314e9a5223dc4c7751a06a98d --- drivers/of/fdt.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 628f436e16f6..c6d196188bc9 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -760,6 +760,16 @@ const void * __init of_flat_dt_match_machine(const void *default_match, } #ifdef CONFIG_BLK_DEV_INITRD +#ifndef __early_init_dt_declare_initrd +static void __early_init_dt_declare_initrd(unsigned long start, + unsigned long end) +{ + initrd_start = (unsigned long)__va(start); + initrd_end = (unsigned long)__va(end); + initrd_below_start_ok = 1; +} +#endif + /** * early_init_dt_check_for_initrd - Decode initrd location from flat tree * @node: reference to node containing initrd location ('chosen') @@ -782,9 +792,7 @@ static void __init early_init_dt_check_for_initrd(unsigned long node) return; end = of_read_number(prop, len/4); - initrd_start = (unsigned long)__va(start); - initrd_end = (unsigned long)__va(end); - initrd_below_start_ok = 1; + __early_init_dt_declare_initrd(start, end); pr_debug("initrd_start=0x%llx initrd_end=0x%llx\n", (unsigned long long)start, (unsigned long long)end); From 6727d505317c6032c58063c46fe317045f3dacb4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:34 +0100 Subject: [PATCH 460/607] BACKPORT: arm64: prevent potential circular header dependencies in asm/bug.h Currently, using BUG_ON() in header files is cumbersome, due to the fact that asm/bug.h transitively includes a lot of other header files, resulting in the actual BUG_ON() invocation appearing before its definition in the preprocessor input. 
So let's reverse the #include dependency between asm/bug.h and asm/debug-monitors.h, by moving the definition of BUG_BRK_IMM from the latter to the former. Also fix up one user of asm/debug-monitors.h which relied on a transitive include. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 03336b1df9929e5d9c28fd9768948b6151cb046c) Signed-off-by: Jeff Vander Stoep Change-Id: I71470b7db9ef858a5a8368a872f931936c723a25 --- arch/arm64/include/asm/bug.h | 2 +- arch/arm64/include/asm/debug-monitors.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h index 4a748ce9ba1a..679d49221998 100644 --- a/arch/arm64/include/asm/bug.h +++ b/arch/arm64/include/asm/bug.h @@ -18,7 +18,7 @@ #ifndef _ARCH_ARM64_ASM_BUG_H #define _ARCH_ARM64_ASM_BUG_H -#include +#define BUG_BRK_IMM 0x800 #ifdef CONFIG_GENERIC_BUG #define HAVE_ARCH_BUG diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index 279c85b5ec09..e893a1fca9c2 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -57,7 +58,6 @@ #define FAULT_BRK_IMM 0x100 #define KGDB_DYN_DBG_BRK_IMM 0x400 #define KGDB_COMPILED_DBG_BRK_IMM 0x401 -#define BUG_BRK_IMM 0x800 /* * BRK instruction encoding From e3ec18326ffe69b6736232499ddf8634ed40601c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:35 +0100 Subject: [PATCH 461/607] BACKPORT: arm64: add support for ioremap() block mappings This wires up the existing generic huge-vmap feature, which allows ioremap() to use PMD or PUD sized block mappings. It also adds support to the unmap path for dealing with block mappings, which will allow us to unmap the __init region using unmap_kernel_range() in a subsequent patch. 
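A rough model of how the hooks added below get used by the generic ioremap path (simplified, not the actual lib/ioremap.c code): when the arch reports PMD support and the range is PMD-sized and PMD-aligned, a single block entry is installed instead of a run of PTEs:

	/* Simplified sketch of the block-mapping decision; the helper name is
	 * illustrative. */
	static int ioremap_try_pmd_block(pmd_t *pmdp, unsigned long addr,
					 unsigned long end, phys_addr_t phys,
					 pgprot_t prot)
	{
		if (!arch_ioremap_pmd_supported())
			return 0;
		if ((end - addr) < PMD_SIZE)
			return 0;
		if ((addr | phys) & ~PMD_MASK)		/* need PMD alignment */
			return 0;

		return pmd_set_huge(pmdp, phys, prot);	/* one section entry */
	}
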
Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 324420bf91f60582bb481133db9547111768ef17) Signed-off-by: Jeff Vander Stoep Change-Id: I4765ae77f7d67c3972b7e5b19d43db434e8b777c --- .../features/vm/huge-vmap/arch-support.txt | 2 +- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/memory.h | 6 +++ arch/arm64/mm/mmu.c | 41 +++++++++++++++++++ 4 files changed, 49 insertions(+), 1 deletion(-) diff --git a/Documentation/features/vm/huge-vmap/arch-support.txt b/Documentation/features/vm/huge-vmap/arch-support.txt index af6816bccb43..df1d1f3c9af2 100644 --- a/Documentation/features/vm/huge-vmap/arch-support.txt +++ b/Documentation/features/vm/huge-vmap/arch-support.txt @@ -9,7 +9,7 @@ | alpha: | TODO | | arc: | TODO | | arm: | TODO | - | arm64: | TODO | + | arm64: | ok | | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ae51e1b18337..14866e975277 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -50,6 +50,7 @@ config ARM64 select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_BITREVERSE select HAVE_ARCH_HARDENED_USERCOPY + select HAVE_ARCH_HUGE_VMAP select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48) select HAVE_ARCH_KGDB diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 853953cd1f08..c65aad7b13dc 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -100,6 +100,12 @@ #define MT_S2_NORMAL 0xf #define MT_S2_DEVICE_nGnRE 0x1 +#ifdef CONFIG_ARM64_4K_PAGES +#define IOREMAP_MAX_ORDER (PUD_SHIFT) +#else +#define IOREMAP_MAX_ORDER (PMD_SHIFT) +#endif + #ifndef __ASSEMBLY__ extern phys_addr_t memstart_addr; diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 8d0e49d2cba9..1f4f4d21457f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -745,3 +745,44 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) return dt_virt; } + +int __init arch_ioremap_pud_supported(void) +{ + /* only 4k granule supports level 1 block mappings */ + return IS_ENABLED(CONFIG_ARM64_4K_PAGES); +} + +int __init arch_ioremap_pmd_supported(void) +{ + return 1; +} + +int pud_set_huge(pud_t *pud, phys_addr_t phys, pgprot_t prot) +{ + BUG_ON(phys & ~PUD_MASK); + set_pud(pud, __pud(phys | PUD_TYPE_SECT | pgprot_val(mk_sect_prot(prot)))); + return 1; +} + +int pmd_set_huge(pmd_t *pmd, phys_addr_t phys, pgprot_t prot) +{ + BUG_ON(phys & ~PMD_MASK); + set_pmd(pmd, __pmd(phys | PMD_TYPE_SECT | pgprot_val(mk_sect_prot(prot)))); + return 1; +} + +int pud_clear_huge(pud_t *pud) +{ + if (!pud_sect(*pud)) + return 0; + pud_clear(pud); + return 1; +} + +int pmd_clear_huge(pmd_t *pmd) +{ + if (!pmd_sect(*pmd)) + return 0; + pmd_clear(pmd); + return 1; +} From c8d7bc708e8d0c197c3a0e3197bfb7f283ef352a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:36 +0100 Subject: [PATCH 462/607] UPSTREAM: arm64: introduce KIMAGE_VADDR as the virtual base of the kernel region This introduces the preprocessor symbol KIMAGE_VADDR which will serve as the symbolic virtual base of the kernel region, i.e., the kernel's virtual offset will be KIMAGE_VADDR + TEXT_OFFSET. For now, we define it as being equal to PAGE_OFFSET, but in the future, it will be moved below it once we move the kernel virtual mapping out of the linear mapping. 
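Written out as a function (equivalent to the new __virt_to_phys() above, purely for illustration): linear-map addresses keep translating via PAGE_OFFSET, while kernel-image addresses translate via KIMAGE_VADDR, which only makes a difference once the two bases diverge:

	static inline phys_addr_t virt_to_phys_sketch(unsigned long x)
	{
		if (x >= PAGE_OFFSET)			/* linear mapping */
			return x - PAGE_OFFSET + PHYS_OFFSET;

		return x - KIMAGE_VADDR + PHYS_OFFSET;	/* kernel image */
	}
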
Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit ab893fb9f1b17f02139bce547bb4b69e96b9ae16) Signed-off-by: Jeff Vander Stoep Change-Id: I31427bd2b948a22bb8ce1d22109682fc66efb98d --- arch/arm64/include/asm/memory.h | 10 ++++++++-- arch/arm64/kernel/head.S | 2 +- arch/arm64/kernel/vmlinux.lds.S | 4 ++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index c65aad7b13dc..aebc739f5a11 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -51,7 +51,8 @@ #define VA_BITS (CONFIG_ARM64_VA_BITS) #define VA_START (UL(0xffffffffffffffff) << VA_BITS) #define PAGE_OFFSET (UL(0xffffffffffffffff) << (VA_BITS - 1)) -#define MODULES_END (PAGE_OFFSET) +#define KIMAGE_VADDR (PAGE_OFFSET) +#define MODULES_END (KIMAGE_VADDR) #define MODULES_VADDR (MODULES_END - SZ_64M) #define PCI_IO_END (MODULES_VADDR - SZ_2M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) @@ -75,8 +76,13 @@ * private definitions which should NOT be used outside memory.h * files. Use virt_to_phys/phys_to_virt/__pa/__va instead. */ -#define __virt_to_phys(x) (((phys_addr_t)(x) - PAGE_OFFSET + PHYS_OFFSET)) +#define __virt_to_phys(x) ({ \ + phys_addr_t __x = (phys_addr_t)(x); \ + __x >= PAGE_OFFSET ? (__x - PAGE_OFFSET + PHYS_OFFSET) : \ + (__x - KIMAGE_VADDR + PHYS_OFFSET); }) + #define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET + PAGE_OFFSET)) +#define __phys_to_kimg(x) ((unsigned long)((x) - PHYS_OFFSET + KIMAGE_VADDR)) /* * Convert a page to/from a physical address diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 53b9f9f128c2..04d38a058b19 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -389,7 +389,7 @@ __create_page_tables: * Map the kernel image (starting with PHYS_OFFSET). */ mov x0, x26 // swapper_pg_dir - mov x5, #PAGE_OFFSET + ldr x5, =KIMAGE_VADDR create_pgd_entry x0, x5, x3, x6 ldr x6, =KERNEL_END // __va(KERNEL_END) mov x3, x24 // phys offset diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 763e5aab5752..7703feba23d6 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -89,7 +89,7 @@ SECTIONS *(.discard.*) } - . = PAGE_OFFSET + TEXT_OFFSET; + . = KIMAGE_VADDR + TEXT_OFFSET; .head.text : { _text = .; @@ -188,4 +188,4 @@ ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, /* * If padding is applied before .head.text, virt<->phys conversions will fail. */ -ASSERT(_text == (PAGE_OFFSET + TEXT_OFFSET), "HEAD is misaligned") +ASSERT(_text == (KIMAGE_VADDR + TEXT_OFFSET), "HEAD is misaligned") From 58539890b74ae227da611e91965c2e5ee2f57645 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:37 +0100 Subject: [PATCH 463/607] UPSTREAM: arm64: pgtable: implement static [pte|pmd|pud]_offset variants The page table accessors pte_offset(), pud_offset() and pmd_offset() rely on __va translations, so they can only be used after the linear mapping has been installed. For the early fixmap and kasan init routines, whose page tables are allocated statically in the kernel image, these functions will return bogus values. So implement pte_offset_kimg(), pmd_offset_kimg() and pud_offset_kimg(), which can be used instead before any page tables have been allocated dynamically. 
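In effect, the new helpers differ from the regular accessors only in which virtual alias of the next-level table they hand back, roughly as in this sketch (the two function names are invented for illustration):

    /* needs the linear map, i.e. a valid memstart_addr */
    static pte_t *pte_via_linear(pmd_t *pmd, unsigned long addr)
    {
            return (pte_t *)__va(pte_offset_phys(pmd, addr));
    }

    /* only needs the kernel image mapping, so it is safe in early boot */
    static pte_t *pte_via_kimg(pmd_t *pmd, unsigned long addr)
    {
            return (pte_t *)__phys_to_kimg(pte_offset_phys(pmd, addr));
    }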
Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 6533945a32c762c5db70d7a3ec251a040b2d9661) Signed-off-by: Jeff Vander Stoep Change-Id: Ibea400f0938db568524fb83eb2d22d8658bbb56b --- arch/arm64/include/asm/pgtable.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 6a58dd48663f..f3e887d5376c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -453,6 +453,9 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd) #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) +/* use ONLY for statically allocated translation tables */ +#define pte_offset_kimg(dir,addr) ((pte_t *)__phys_to_kimg(pte_offset_phys((dir), (addr)))) + /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. @@ -496,6 +499,9 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) #define pud_page(pud) pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK)) +/* use ONLY for statically allocated translation tables */ +#define pmd_offset_kimg(dir,addr) ((pmd_t *)__phys_to_kimg(pmd_offset_phys((dir), (addr)))) + #else #define pud_page_paddr(pud) ({ BUILD_BUG(); 0; }) @@ -505,6 +511,8 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) #define pmd_set_fixmap_offset(pudp, addr) ((pmd_t *)pudp) #define pmd_clear_fixmap() +#define pmd_offset_kimg(dir,addr) ((pmd_t *)dir) + #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -543,6 +551,9 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) #define pgd_page(pgd) pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK)) +/* use ONLY for statically allocated translation tables */ +#define pud_offset_kimg(dir,addr) ((pud_t *)__phys_to_kimg(pud_offset_phys((dir), (addr)))) + #else #define pgd_page_paddr(pgd) ({ BUILD_BUG(); 0;}) @@ -552,6 +563,8 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) #define pud_set_fixmap_offset(pgdp, addr) ((pud_t *)pgdp) #define pud_clear_fixmap() +#define pud_offset_kimg(dir,addr) ((pud_t *)dir) + #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) From 3b177382c499864e0fd6a6a11236a616a7105be5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:38 +0100 Subject: [PATCH 464/607] UPSTREAM: arm64: decouple early fixmap init from linear mapping Since the early fixmap page tables are populated using pages that are part of the static footprint of the kernel, they are covered by the initial kernel mapping, and we can refer to them without using __va/__pa translations, which are tied to the linear mapping. Since the fixmap page tables are disjoint from the kernel mapping up to the top level pgd entry, we can refer to bm_pte[] directly, and there is no need to walk the page tables and perform __pa()/__va() translations at each step. 
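The shortcut relies on the whole fixmap window sharing a single last-level table, so looking up a PTE becomes a plain index computation, roughly as below; the constants are example values for 4 KB pages.

    #define PAGE_SHIFT   12
    #define PTRS_PER_PTE 512

    /* page index of addr within the 2 MB window covered by bm_pte[] */
    static unsigned long pte_index(unsigned long addr)
    {
            return (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
    }
    /* fixmap_pte(addr) then reduces to &bm_pte[pte_index(addr)] */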
Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 157962f5a8f236cab898b68bdaa69ce68922f0bf) Signed-off-by: Jeff Vander Stoep Change-Id: I49221a199962aec6d4f3712bfb3dd041d64ba99b --- arch/arm64/mm/mmu.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 1f4f4d21457f..84ef01d7aaf8 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -614,7 +614,7 @@ static inline pud_t * fixmap_pud(unsigned long addr) BUG_ON(pgd_none(*pgd) || pgd_bad(*pgd)); - return pud_offset(pgd, addr); + return pud_offset_kimg(pgd, addr); } static inline pmd_t * fixmap_pmd(unsigned long addr) @@ -623,16 +623,12 @@ static inline pmd_t * fixmap_pmd(unsigned long addr) BUG_ON(pud_none(*pud) || pud_bad(*pud)); - return pmd_offset(pud, addr); + return pmd_offset_kimg(pud, addr); } static inline pte_t * fixmap_pte(unsigned long addr) { - pmd_t *pmd = fixmap_pmd(addr); - - BUG_ON(pmd_none(*pmd) || pmd_bad(*pmd)); - - return pte_offset_kernel(pmd, addr); + return &bm_pte[pte_index(addr)]; } void __init early_fixmap_init(void) @@ -644,14 +640,14 @@ void __init early_fixmap_init(void) pgd = pgd_offset_k(addr); pgd_populate(&init_mm, pgd, bm_pud); - pud = pud_offset(pgd, addr); + pud = fixmap_pud(addr); pud_populate(&init_mm, pud, bm_pmd); - pmd = pmd_offset(pud, addr); + pmd = fixmap_pmd(addr); pmd_populate_kernel(&init_mm, pmd, bm_pte); /* * The boot-ioremap range spans multiple pmds, for which - * we are not preparted: + * we are not prepared: */ BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); From 1ca6787a7310da94e9bf0846b0a2e1a9e9a91291 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:39 +0100 Subject: [PATCH 465/607] BACKPORT: arm64: kvm: deal with kernel symbols outside of linear mapping KVM on arm64 uses a fixed offset between the linear mapping at EL1 and the HYP mapping at EL2. Before we can move the kernel virtual mapping out of the linear mapping, we have to make sure that references to kernel symbols that are accessed via the HYP mapping are translated to their linear equivalent. 
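Put differently, a reference to a kernel symbol now goes through two steps before it is usable at EL2, roughly as below. hyp_va_of() is an invented name for illustration; kvm_ksym_ref() is the helper added here and KERN_TO_HYP() the existing linear-to-HYP conversion used by the mapping code.

    static unsigned long hyp_va_of(void *sym)
    {
            void *linear = kvm_ksym_ref(sym);          /* image VA -> linear VA */
            return KERN_TO_HYP((unsigned long)linear); /* linear VA -> HYP VA   */
    }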
Reviewed-by: Mark Rutland Acked-by: Marc Zyngier Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit a0bf9776cd0be4490d4675d4108e13379849fc7f) Signed-off-by: Jeff Vander Stoep Change-Id: I316f029d22a16773c168a151dba59bed7921fa7e --- arch/arm/include/asm/kvm_asm.h | 2 ++ arch/arm/kvm/arm.c | 5 +++-- arch/arm64/include/asm/kvm_asm.h | 17 +++++++++++++++++ arch/arm64/include/asm/kvm_host.h | 8 +++++--- arch/arm64/kvm/hyp.S | 6 +++--- 5 files changed, 30 insertions(+), 8 deletions(-) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 194c91b610ff..c35c349da069 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -79,6 +79,8 @@ #define rr_lo_hi(a1, a2) a1, a2 #endif +#define kvm_ksym_ref(kva) (kva) + #ifndef __ASSEMBLY__ struct kvm; struct kvm_vcpu; diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index e06fd299de08..70e6d557c75f 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -969,7 +969,7 @@ static void cpu_init_hyp_mode(void *dummy) pgd_ptr = kvm_mmu_get_httbr(); stack_page = __this_cpu_read(kvm_arm_hyp_stack_page); hyp_stack_ptr = stack_page + PAGE_SIZE; - vector_ptr = (unsigned long)__kvm_hyp_vector; + vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector); __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr); @@ -1061,7 +1061,8 @@ static int init_hyp_mode(void) /* * Map the Hyp-code called directly from the host */ - err = create_hyp_mappings(__kvm_hyp_code_start, __kvm_hyp_code_end); + err = create_hyp_mappings(kvm_ksym_ref(__kvm_hyp_code_start), + kvm_ksym_ref(__kvm_hyp_code_end)); if (err) { kvm_err("Cannot map world-switch code\n"); goto out_free_mappings; diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 5e377101f919..e95c39543629 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -102,7 +102,24 @@ #define KVM_ARM64_DEBUG_DIRTY_SHIFT 0 #define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT) +#define kvm_ksym_ref(sym) ((void *)&sym + kvm_ksym_shift) + #ifndef __ASSEMBLY__ +#if __GNUC__ > 4 +#define kvm_ksym_shift (PAGE_OFFSET - KIMAGE_VADDR) +#else +/* + * GCC versions 4.9 and older will fold the constant below into the addend of + * the reference to 'sym' above if kvm_ksym_shift is declared static or if the + * constant is used directly. However, since we use the small code model for + * the core kernel, the reference to 'sym' will be emitted as a adrp/add pair, + * with a +/- 4 GB range, resulting in linker relocation errors if the shift + * is sufficiently large. So prevent the compiler from folding the shift into + * the addend, by making the shift a variable with external linkage. 
+ */ +__weak u64 kvm_ksym_shift = PAGE_OFFSET - KIMAGE_VADDR; +#endif + struct kvm; struct kvm_vcpu; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index a35ce7266aac..90c6368ad7c8 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -222,7 +222,7 @@ static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, struct kvm_vcpu *kvm_arm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); -u64 kvm_call_hyp(void *hypfn, ...); +u64 __kvm_call_hyp(void *hypfn, ...); void force_vm_exit(const cpumask_t *mask); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); @@ -243,8 +243,8 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr, * Call initialization code, and switch to the full blown * HYP code. */ - kvm_call_hyp((void *)boot_pgd_ptr, pgd_ptr, - hyp_stack_ptr, vector_ptr); + __kvm_call_hyp((void *)boot_pgd_ptr, pgd_ptr, + hyp_stack_ptr, vector_ptr); } static inline void kvm_arch_hardware_disable(void) {} @@ -258,4 +258,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); +#define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__) + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index 86c289832272..309e3479dc2c 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -923,7 +923,7 @@ __hyp_panic_str: .align 2 /* - * u64 kvm_call_hyp(void *hypfn, ...); + * u64 __kvm_call_hyp(void *hypfn, ...); * * This is not really a variadic function in the classic C-way and care must * be taken when calling this to ensure parameters are passed in registers @@ -940,10 +940,10 @@ __hyp_panic_str: * used to implement __hyp_get_vectors in the same way as in * arch/arm64/kernel/hyp_stub.S. */ -ENTRY(kvm_call_hyp) +ENTRY(__kvm_call_hyp) hvc #0 ret -ENDPROC(kvm_call_hyp) +ENDPROC(__kvm_call_hyp) .macro invalid_vector label, target .align 2 From c3a32b2934bd3190c651fb53ad734b296a45f15e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:40 +0100 Subject: [PATCH 466/607] UPSTREAM: arm64: move kernel image to base of vmalloc area This moves the module area to right before the vmalloc area, and moves the kernel image to the base of the vmalloc area. This is an intermediate step towards implementing KASLR, which allows the kernel image to be located anywhere in the vmalloc area. Since other subsystems such as hibernate may still need to refer to the kernel text or data segments via their linears addresses, both are mapped in the linear region as well. The linear alias of the text region is mapped read-only/non-executable to prevent inadvertent modification or execution. 
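For a feel of the resulting layout, the new constants in the diff below can be evaluated outside the kernel; the figures assume VA_BITS=48 and no KASAN, which is an example configuration rather than anything mandated by the patch.

    #include <stdint.h>
    #include <stdio.h>

    #define SZ_64M            UINT64_C(0x04000000)
    #define VA_BITS           48
    #define VA_START          (UINT64_C(0xffffffffffffffff) << VA_BITS)
    #define PAGE_OFFSET       (UINT64_C(0xffffffffffffffff) << (VA_BITS - 1))
    #define KASAN_SHADOW_SIZE UINT64_C(0)               /* !CONFIG_KASAN */
    #define MODULES_VADDR     (VA_START + KASAN_SHADOW_SIZE)
    #define MODULES_VSIZE     SZ_64M
    #define MODULES_END       (MODULES_VADDR + MODULES_VSIZE)
    #define KIMAGE_VADDR      MODULES_END

    int main(void)
    {
            printf("modules : %#llx - %#llx\n",
                   (unsigned long long)MODULES_VADDR,
                   (unsigned long long)MODULES_END);
            printf("kimage  : from %#llx\n", (unsigned long long)KIMAGE_VADDR);
            printf("linear  : from %#llx\n", (unsigned long long)PAGE_OFFSET);
            return 0;
    }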
Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit f9040773b7bbbd9e98eb6184a263512a7cfc133f) Signed-off-by: Jeff Vander Stoep Change-Id: I698faed47bb7cfc256a1b5b5407a7c586bdc63b3 --- arch/arm64/include/asm/kasan.h | 2 +- arch/arm64/include/asm/memory.h | 21 ++++-- arch/arm64/include/asm/pgtable.h | 10 +-- arch/arm64/mm/dump.c | 12 ++-- arch/arm64/mm/init.c | 23 +++---- arch/arm64/mm/kasan_init.c | 27 +++++++- arch/arm64/mm/mmu.c | 110 +++++++++++++++++++++---------- 7 files changed, 137 insertions(+), 68 deletions(-) diff --git a/arch/arm64/include/asm/kasan.h b/arch/arm64/include/asm/kasan.h index de0d21211c34..71ad0f93eb71 100644 --- a/arch/arm64/include/asm/kasan.h +++ b/arch/arm64/include/asm/kasan.h @@ -14,7 +14,7 @@ * KASAN_SHADOW_END: KASAN_SHADOW_START + 1/8 of kernel virtual addresses. */ #define KASAN_SHADOW_START (VA_START) -#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1UL << (VA_BITS - 3))) +#define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) /* * This value is used to map an address to the corresponding shadow diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index aebc739f5a11..4388651d1f0d 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -45,16 +45,15 @@ * VA_START - the first kernel virtual address. * TASK_SIZE - the maximum size of a user space task. * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area. - * The module space lives between the addresses given by TASK_SIZE - * and PAGE_OFFSET - it must be within 128MB of the kernel text. */ #define VA_BITS (CONFIG_ARM64_VA_BITS) #define VA_START (UL(0xffffffffffffffff) << VA_BITS) #define PAGE_OFFSET (UL(0xffffffffffffffff) << (VA_BITS - 1)) -#define KIMAGE_VADDR (PAGE_OFFSET) -#define MODULES_END (KIMAGE_VADDR) -#define MODULES_VADDR (MODULES_END - SZ_64M) -#define PCI_IO_END (MODULES_VADDR - SZ_2M) +#define KIMAGE_VADDR (MODULES_END) +#define MODULES_END (MODULES_VADDR + MODULES_VSIZE) +#define MODULES_VADDR (VA_START + KASAN_SHADOW_SIZE) +#define MODULES_VSIZE (SZ_64M) +#define PCI_IO_END (PAGE_OFFSET - SZ_2M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) #define FIXADDR_TOP (PCI_IO_START - SZ_2M) #define TASK_SIZE_64 (UL(1) << VA_BITS) @@ -71,6 +70,16 @@ #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 4)) +/* + * The size of the KASAN shadow region. This should be 1/8th of the + * size of the entire kernel virtual address space. + */ +#ifdef CONFIG_KASAN +#define KASAN_SHADOW_SIZE (UL(1) << (VA_BITS - 3)) +#else +#define KASAN_SHADOW_SIZE (0) +#endif + /* * Physical vs virtual RAM address space conversion. These are * private definitions which should NOT be used outside memory.h diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index f3e887d5376c..9bf72b72b76d 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -36,19 +36,13 @@ * * VMEMAP_SIZE: allows the whole VA space to be covered by a struct page array * (rounded up to PUD_SIZE). 
- * VMALLOC_START: beginning of the kernel VA space + * VMALLOC_START: beginning of the kernel vmalloc space * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space, * fixed mappings and modules */ #define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE) -#ifndef CONFIG_KASAN -#define VMALLOC_START (VA_START) -#else -#include -#define VMALLOC_START (KASAN_SHADOW_END + SZ_64K) -#endif - +#define VMALLOC_START (MODULES_END) #define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K) #define vmemmap ((struct page *)(VMALLOC_END + SZ_64K)) diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 0841b2bf0e6a..6be918478f85 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -35,7 +35,9 @@ struct addr_marker { }; enum address_markers_idx { - VMALLOC_START_NR = 0, + MODULES_START_NR = 0, + MODULES_END_NR, + VMALLOC_START_NR, VMALLOC_END_NR, #ifdef CONFIG_SPARSEMEM_VMEMMAP VMEMMAP_START_NR, @@ -45,12 +47,12 @@ enum address_markers_idx { FIXADDR_END_NR, PCI_START_NR, PCI_END_NR, - MODULES_START_NR, - MODULES_END_NR, KERNEL_SPACE_NR, }; static struct addr_marker address_markers[] = { + { MODULES_VADDR, "Modules start" }, + { MODULES_END, "Modules end" }, { VMALLOC_START, "vmalloc() Area" }, { VMALLOC_END, "vmalloc() End" }, #ifdef CONFIG_SPARSEMEM_VMEMMAP @@ -61,9 +63,7 @@ static struct addr_marker address_markers[] = { { FIXADDR_TOP, "Fixmap end" }, { PCI_IO_START, "PCI I/O start" }, { PCI_IO_END, "PCI I/O end" }, - { MODULES_VADDR, "Modules start" }, - { MODULES_END, "Modules end" }, - { PAGE_OFFSET, "Kernel Mapping" }, + { PAGE_OFFSET, "Linear Mapping" }, { -1, NULL }, }; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index f13078bbf66e..46ae88e0744e 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -304,22 +305,26 @@ void __init mem_init(void) #ifdef CONFIG_KASAN " kasan : 0x%16lx - 0x%16lx (%6ld GB)\n" #endif + " modules : 0x%16lx - 0x%16lx (%6ld MB)\n" " vmalloc : 0x%16lx - 0x%16lx (%6ld GB)\n" + " .init : 0x%p" " - 0x%p" " (%6ld KB)\n" + " .text : 0x%p" " - 0x%p" " (%6ld KB)\n" + " .data : 0x%p" " - 0x%p" " (%6ld KB)\n" #ifdef CONFIG_SPARSEMEM_VMEMMAP " vmemmap : 0x%16lx - 0x%16lx (%6ld GB maximum)\n" " 0x%16lx - 0x%16lx (%6ld MB actual)\n" #endif " fixed : 0x%16lx - 0x%16lx (%6ld KB)\n" " PCI I/O : 0x%16lx - 0x%16lx (%6ld MB)\n" - " modules : 0x%16lx - 0x%16lx (%6ld MB)\n" - " memory : 0x%16lx - 0x%16lx (%6ld MB)\n" - " .init : 0x%p" " - 0x%p" " (%6ld KB)\n" - " .text : 0x%p" " - 0x%p" " (%6ld KB)\n" - " .data : 0x%p" " - 0x%p" " (%6ld KB)\n", + " memory : 0x%16lx - 0x%16lx (%6ld MB)\n", #ifdef CONFIG_KASAN MLG(KASAN_SHADOW_START, KASAN_SHADOW_END), #endif + MLM(MODULES_VADDR, MODULES_END), MLG(VMALLOC_START, VMALLOC_END), + MLK_ROUNDUP(__init_begin, __init_end), + MLK_ROUNDUP(_text, _etext), + MLK_ROUNDUP(_sdata, _edata), #ifdef CONFIG_SPARSEMEM_VMEMMAP MLG((unsigned long)vmemmap, (unsigned long)vmemmap + VMEMMAP_SIZE), @@ -328,11 +333,7 @@ void __init mem_init(void) #endif MLK(FIXADDR_START, FIXADDR_TOP), MLM(PCI_IO_START, PCI_IO_END), - MLM(MODULES_VADDR, MODULES_END), - MLM(PAGE_OFFSET, (unsigned long)high_memory), - MLK_ROUNDUP(__init_begin, __init_end), - MLK_ROUNDUP(_text, _etext), - MLK_ROUNDUP(_sdata, _edata)); + MLM(PAGE_OFFSET, (unsigned long)high_memory)); #undef MLK #undef MLM @@ -360,8 +361,8 @@ void __init mem_init(void) void free_initmem(void) { - fixup_init(); free_initmem_default(0); + 
fixup_init(); } #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index cc569a38bc76..7f10cc91fa8a 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -17,9 +17,11 @@ #include #include +#include #include #include #include +#include #include static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE); @@ -33,7 +35,7 @@ static void __init kasan_early_pte_populate(pmd_t *pmd, unsigned long addr, if (pmd_none(*pmd)) pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte); - pte = pte_offset_kernel(pmd, addr); + pte = pte_offset_kimg(pmd, addr); do { next = addr + PAGE_SIZE; set_pte(pte, pfn_pte(virt_to_pfn(kasan_zero_page), @@ -51,7 +53,7 @@ static void __init kasan_early_pmd_populate(pud_t *pud, if (pud_none(*pud)) pud_populate(&init_mm, pud, kasan_zero_pmd); - pmd = pmd_offset(pud, addr); + pmd = pmd_offset_kimg(pud, addr); do { next = pmd_addr_end(addr, end); kasan_early_pte_populate(pmd, addr, next); @@ -68,7 +70,7 @@ static void __init kasan_early_pud_populate(pgd_t *pgd, if (pgd_none(*pgd)) pgd_populate(&init_mm, pgd, kasan_zero_pud); - pud = pud_offset(pgd, addr); + pud = pud_offset_kimg(pgd, addr); do { next = pud_addr_end(addr, end); kasan_early_pmd_populate(pud, addr, next); @@ -126,9 +128,13 @@ static void __init clear_pgds(unsigned long start, void __init kasan_init(void) { + u64 kimg_shadow_start, kimg_shadow_end; struct memblock_region *reg; int i; + kimg_shadow_start = (u64)kasan_mem_to_shadow(_text); + kimg_shadow_end = (u64)kasan_mem_to_shadow(_end); + /* * We are going to perform proper setup of shadow memory. * At first we should unmap early shadow (clear_pgds() call bellow). @@ -142,8 +148,23 @@ void __init kasan_init(void) clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); + vmemmap_populate(kimg_shadow_start, kimg_shadow_end, NUMA_NO_NODE); + + /* + * vmemmap_populate() has populated the shadow region that covers the + * kernel image with SWAPPER_BLOCK_SIZE mappings, so we have to round + * the start and end addresses to SWAPPER_BLOCK_SIZE as well, to prevent + * kasan_populate_zero_shadow() from replacing the PMD block mappings + * with PMD table mappings at the edges of the shadow region for the + * kernel image. 
+ */ + if (ARM64_SWAPPER_USES_SECTION_MAPS) + kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE); + kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, kasan_mem_to_shadow((void *)MODULES_VADDR)); + kasan_populate_zero_shadow((void *)kimg_shadow_end, + kasan_mem_to_shadow((void *)PAGE_OFFSET)); for_each_memblock(memory, reg) { void *start = (void *)__phys_to_virt(reg->base); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 84ef01d7aaf8..de7a8627928a 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -53,6 +53,10 @@ u64 idmap_t0sz = TCR_T0SZ(VA_BITS); unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); +static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss; +static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused; +static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused; + pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { @@ -380,16 +384,15 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) { - unsigned long kernel_start = __pa(_stext); - unsigned long kernel_end = __pa(_end); + unsigned long kernel_end = __pa(_etext); /* - * The kernel itself is mapped at page granularity. Map all other - * memory, making sure we don't overwrite the existing kernel mappings. + * Take care not to create a writable alias for the + * read-only text and rodata sections of the kernel image. */ - /* No overlap with the kernel. */ + /* No overlap with the kernel text */ if (end < kernel_start || start >= kernel_end) { __create_pgd_mapping(pgd, start, __phys_to_virt(start), end - start, PAGE_KERNEL, @@ -398,8 +401,8 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end } /* - * This block overlaps the kernel mapping. Map the portion(s) which - * don't overlap. + * This block overlaps the kernel text mapping. + * Map the portion(s) which don't overlap. */ if (start < kernel_start) __create_pgd_mapping(pgd, start, @@ -411,6 +414,16 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end __phys_to_virt(kernel_end), end - kernel_end, PAGE_KERNEL, early_pgtable_alloc); + + /* + * Map the linear alias of the [_stext, _etext) interval as + * read-only/non-executable. This makes the contents of the + * region accessible to subsystems such as hibernate, but + * protects it from inadvertent modification or execution. + */ + __create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start), + kernel_end - kernel_start, PAGE_KERNEL_RO, + early_pgtable_alloc); } static void __init map_mem(pgd_t *pgd) @@ -429,25 +442,28 @@ static void __init map_mem(pgd_t *pgd) } } -#ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void) { + if (!IS_ENABLED(CONFIG_DEBUG_RODATA)) + return; + create_mapping_late(__pa(_stext), (unsigned long)_stext, (unsigned long)__init_begin - (unsigned long)_stext, PAGE_KERNEL_ROX); - } -#endif void fixup_init(void) { - create_mapping_late(__pa(__init_begin), (unsigned long)__init_begin, - (unsigned long)__init_end - (unsigned long)__init_begin, - PAGE_KERNEL); + /* + * Unmap the __init region but leave the VM area in place. This + * prevents the region from being reused for kernel modules, which + * is not supported by kallsyms. 
+ */ + unmap_kernel_range((u64)__init_begin, (u64)(__init_end - __init_begin)); } static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, - pgprot_t prot) + pgprot_t prot, struct vm_struct *vma) { phys_addr_t pa_start = __pa(va_start); unsigned long size = va_end - va_start; @@ -457,6 +473,14 @@ static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot, early_pgtable_alloc); + + vma->addr = va_start; + vma->phys_addr = pa_start; + vma->size = size; + vma->flags = VM_MAP; + vma->caller = __builtin_return_address(0); + + vm_area_add_early(vma); } /* @@ -464,17 +488,35 @@ static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, */ static void __init map_kernel(pgd_t *pgd) { + static struct vm_struct vmlinux_text, vmlinux_init, vmlinux_data; - map_kernel_chunk(pgd, _stext, _etext, PAGE_KERNEL_EXEC); - map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC); - map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL); + map_kernel_chunk(pgd, _stext, _etext, PAGE_KERNEL_EXEC, &vmlinux_text); + map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC, + &vmlinux_init); + map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data); - /* - * The fixmap falls in a separate pgd to the kernel, and doesn't live - * in the carveout for the swapper_pg_dir. We can simply re-use the - * existing dir for the fixmap. - */ - set_pgd(pgd_offset_raw(pgd, FIXADDR_START), *pgd_offset_k(FIXADDR_START)); + if (!pgd_val(*pgd_offset_raw(pgd, FIXADDR_START))) { + /* + * The fixmap falls in a separate pgd to the kernel, and doesn't + * live in the carveout for the swapper_pg_dir. We can simply + * re-use the existing dir for the fixmap. + */ + set_pgd(pgd_offset_raw(pgd, FIXADDR_START), + *pgd_offset_k(FIXADDR_START)); + } else if (CONFIG_PGTABLE_LEVELS > 3) { + /* + * The fixmap shares its top level pgd entry with the kernel + * mapping. This can really only occur when we are running + * with 16k/4 levels, so we can simply reuse the pud level + * entry instead. + */ + BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); + set_pud(pud_set_fixmap_offset(pgd, FIXADDR_START), + __pud(__pa(bm_pmd) | PUD_TYPE_TABLE)); + pud_clear_fixmap(); + } else { + BUG(); + } kasan_copy_shadow(pgd); } @@ -600,14 +642,6 @@ void vmemmap_free(unsigned long start, unsigned long end) } #endif /* CONFIG_SPARSEMEM_VMEMMAP */ -static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss; -#if CONFIG_PGTABLE_LEVELS > 2 -static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss; -#endif -#if CONFIG_PGTABLE_LEVELS > 3 -static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss; -#endif - static inline pud_t * fixmap_pud(unsigned long addr) { pgd_t *pgd = pgd_offset_k(addr); @@ -639,8 +673,18 @@ void __init early_fixmap_init(void) unsigned long addr = FIXADDR_START; pgd = pgd_offset_k(addr); - pgd_populate(&init_mm, pgd, bm_pud); - pud = fixmap_pud(addr); + if (CONFIG_PGTABLE_LEVELS > 3 && !pgd_none(*pgd)) { + /* + * We only end up here if the kernel mapping and the fixmap + * share the top level pgd entry, which should only happen on + * 16k/4 levels configurations. 
+ */ + BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); + pud = pud_offset_kimg(pgd, addr); + } else { + pgd_populate(&init_mm, pgd, bm_pud); + pud = fixmap_pud(addr); + } pud_populate(&init_mm, pud, bm_pmd); pmd = fixmap_pmd(addr); pmd_populate_kernel(&init_mm, pmd, bm_pte); From 23b3715e4cc4262ba8fedabfe67e898549696126 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:41 +0100 Subject: [PATCH 467/607] UPSTREAM: arm64: defer __va translation of initrd_start and initrd_end Before deferring the assignment of memstart_addr in a subsequent patch, to the moment where all memory has been discovered and possibly clipped based on the size of the linear region and the presence of a mem= command line parameter, we need to ensure that memstart_addr is not used to perform __va translations before it is assigned. One such use is in the generic early DT discovery of the initrd location, which is recorded as a virtual address in the globals initrd_start and initrd_end. So wire up the generic support to declare the initrd addresses, and implement it without __va() translations, and perform the translation after memstart_addr has been assigned. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit a89dea585371a9d5d85499db47c93f129be8e0c4) Signed-off-by: Jeff Vander Stoep Change-Id: I7d0b3dd7adcf069d4e7c1f58fd12e59c4cb62017 --- arch/arm64/include/asm/memory.h | 8 ++++++++ arch/arm64/mm/init.c | 13 +++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 4388651d1f0d..18b7e77c7495 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -121,6 +121,14 @@ #define IOREMAP_MAX_ORDER (PMD_SHIFT) #endif +#ifdef CONFIG_BLK_DEV_INITRD +#define __early_init_dt_declare_initrd(__start, __end) \ + do { \ + initrd_start = (__start); \ + initrd_end = (__end); \ + } while (0) +#endif + #ifndef __ASSEMBLY__ extern phys_addr_t memstart_addr; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 46ae88e0744e..16e18d35eb04 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -59,8 +59,8 @@ static int __init early_initrd(char *p) if (*endp == ',') { size = memparse(endp + 1, NULL); - initrd_start = (unsigned long)__va(start); - initrd_end = (unsigned long)__va(start + size); + initrd_start = start; + initrd_end = start + size; } return 0; } @@ -170,8 +170,13 @@ void __init arm64_memblock_init(void) */ memblock_reserve(__pa(_text), _end - _text); #ifdef CONFIG_BLK_DEV_INITRD - if (initrd_start) - memblock_reserve(__virt_to_phys(initrd_start), initrd_end - initrd_start); + if (initrd_start) { + memblock_reserve(initrd_start, initrd_end - initrd_start); + + /* the generic initrd code expects virtual addresses */ + initrd_start = __phys_to_virt(initrd_start); + initrd_end = __phys_to_virt(initrd_end); + } #endif early_init_fdt_scan_reserved_mem(); From cb4d47d9a2e863d38c1a2c81b4f2c5181cf1c8df Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:42 +0100 Subject: [PATCH 468/607] UPSTREAM: arm64: allow kernel Image to be loaded anywhere in physical memory This relaxes the kernel Image placement requirements, so that it may be placed at any 2 MB aligned offset in physical memory. This is accomplished by ignoring PHYS_OFFSET when installing memblocks, and accounting for the apparent virtual offset of the kernel Image. 
As a result, virtual address references below PAGE_OFFSET are correctly mapped onto physical references into the kernel Image regardless of where it sits in memory. Special care needs to be taken for dealing with memory limits passed via mem=, since the generic implementation clips memory top down, which may clip the kernel image itself if it is loaded high up in memory. To deal with this case, we simply add back the memory covering the kernel image, which may result in more memory to be retained than was passed as a mem= parameter. Since mem= should not be considered a production feature, a panic notifier handler is installed that dumps the memory limit at panic time if one was set. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit a7f8de168ace487fa7b88cb154e413cf40e87fc6) Signed-off-by: Jeff Vander Stoep Change-Id: I1d28cb66b658ef89f9648918565ddc07df4660f8 --- Documentation/arm64/booting.txt | 20 +++++--- arch/arm64/include/asm/boot.h | 6 +++ arch/arm64/include/asm/kernel-pgtable.h | 12 +++++ arch/arm64/include/asm/kvm_asm.h | 17 +------ arch/arm64/include/asm/memory.h | 18 +++---- arch/arm64/kernel/head.S | 6 ++- arch/arm64/kernel/image.h | 13 +++-- arch/arm64/mm/init.c | 63 ++++++++++++++++++++++++- arch/arm64/mm/mmu.c | 3 ++ 9 files changed, 119 insertions(+), 39 deletions(-) diff --git a/Documentation/arm64/booting.txt b/Documentation/arm64/booting.txt index 701d39d3171a..56d6d8b796db 100644 --- a/Documentation/arm64/booting.txt +++ b/Documentation/arm64/booting.txt @@ -109,7 +109,13 @@ Header notes: 1 - 4K 2 - 16K 3 - 64K - Bits 3-63: Reserved. + Bit 3: Kernel physical placement + 0 - 2MB aligned base should be as close as possible + to the base of DRAM, since memory below it is not + accessible via the linear mapping + 1 - 2MB aligned base may be anywhere in physical + memory + Bits 4-63: Reserved. - When image_size is zero, a bootloader should attempt to keep as much memory as possible free for use by the kernel immediately after the @@ -117,14 +123,14 @@ Header notes: depending on selected features, and is effectively unbound. The Image must be placed text_offset bytes from a 2MB aligned base -address near the start of usable system RAM and called there. Memory -below that base address is currently unusable by Linux, and therefore it -is strongly recommended that this location is the start of system RAM. -The region between the 2 MB aligned base address and the start of the -image has no special significance to the kernel, and may be used for -other purposes. +address anywhere in usable system RAM and called there. The region +between the 2 MB aligned base address and the start of the image has no +special significance to the kernel, and may be used for other purposes. At least image_size bytes from the start of the image must be free for use by the kernel. +NOTE: versions prior to v4.6 cannot make use of memory below the +physical offset of the Image so it is recommended that the Image be +placed as close as possible to the start of system RAM. 
Any memory described to the kernel (even that below the start of the image) which is not marked as reserved from the kernel (e.g., with a diff --git a/arch/arm64/include/asm/boot.h b/arch/arm64/include/asm/boot.h index 81151b67b26b..ebf2481889c3 100644 --- a/arch/arm64/include/asm/boot.h +++ b/arch/arm64/include/asm/boot.h @@ -11,4 +11,10 @@ #define MIN_FDT_ALIGN 8 #define MAX_FDT_SIZE SZ_2M +/* + * arm64 requires the kernel image to placed + * TEXT_OFFSET bytes beyond a 2 MB aligned base + */ +#define MIN_KIMG_ALIGN SZ_2M + #endif diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index a459714ee29e..5c6375d8528b 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -79,5 +79,17 @@ #define SWAPPER_MM_MMUFLAGS (PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS) #endif +/* + * To make optimal use of block mappings when laying out the linear + * mapping, round down the base of physical memory to a size that can + * be mapped efficiently, i.e., either PUD_SIZE (4k granule) or PMD_SIZE + * (64k granule), or a multiple that can be mapped using contiguous bits + * in the page tables: 32 * PMD_SIZE (16k granule) + */ +#ifdef CONFIG_ARM64_64K_PAGES +#define ARM64_MEMSTART_ALIGN SZ_512M +#else +#define ARM64_MEMSTART_ALIGN SZ_1G +#endif #endif /* __ASM_KERNEL_PGTABLE_H */ diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index e95c39543629..419bc6661b5c 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -102,24 +102,9 @@ #define KVM_ARM64_DEBUG_DIRTY_SHIFT 0 #define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT) -#define kvm_ksym_ref(sym) ((void *)&sym + kvm_ksym_shift) +#define kvm_ksym_ref(sym) phys_to_virt((u64)&sym - kimage_voffset) #ifndef __ASSEMBLY__ -#if __GNUC__ > 4 -#define kvm_ksym_shift (PAGE_OFFSET - KIMAGE_VADDR) -#else -/* - * GCC versions 4.9 and older will fold the constant below into the addend of - * the reference to 'sym' above if kvm_ksym_shift is declared static or if the - * constant is used directly. However, since we use the small code model for - * the core kernel, the reference to 'sym' will be emitted as a adrp/add pair, - * with a +/- 4 GB range, resulting in linker relocation errors if the shift - * is sufficiently large. So prevent the compiler from folding the shift into - * the addend, by making the shift a variable with external linkage. - */ -__weak u64 kvm_ksym_shift = PAGE_OFFSET - KIMAGE_VADDR; -#endif - struct kvm; struct kvm_vcpu; diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 18b7e77c7495..3239e4d78e0d 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -24,6 +24,7 @@ #include #include #include +#include #include /* @@ -88,10 +89,10 @@ #define __virt_to_phys(x) ({ \ phys_addr_t __x = (phys_addr_t)(x); \ __x >= PAGE_OFFSET ? (__x - PAGE_OFFSET + PHYS_OFFSET) : \ - (__x - KIMAGE_VADDR + PHYS_OFFSET); }) + (__x - kimage_voffset); }) #define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET + PAGE_OFFSET)) -#define __phys_to_kimg(x) ((unsigned long)((x) - PHYS_OFFSET + KIMAGE_VADDR)) +#define __phys_to_kimg(x) ((unsigned long)((x) + kimage_voffset)) /* * Convert a page to/from a physical address @@ -133,15 +134,16 @@ extern phys_addr_t memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. 
*/ -#define PHYS_OFFSET ({ memstart_addr; }) +#define PHYS_OFFSET ({ BUG_ON(memstart_addr & 1); memstart_addr; }) + +/* the offset between the kernel virtual and physical mappings */ +extern u64 kimage_voffset; /* - * The maximum physical address that the linear direct mapping - * of system RAM can cover. (PAGE_OFFSET can be interpreted as - * a 2's complement signed quantity and negated to derive the - * maximum size of the linear mapping.) + * Allow all memory at the discovery stage. We will clip it later. */ -#define MAX_MEMBLOCK_ADDR ({ memstart_addr - PAGE_OFFSET - 1; }) +#define MIN_MEMBLOCK_ADDR 0 +#define MAX_MEMBLOCK_ADDR U64_MAX /* * PFNs are used to describe any physical page; this means diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 04d38a058b19..05b98289093e 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -428,7 +428,11 @@ __mmap_switched: and x4, x4, #~(THREAD_SIZE - 1) msr sp_el0, x4 // Save thread_info str_l x21, __fdt_pointer, x5 // Save FDT pointer - str_l x24, memstart_addr, x6 // Save PHYS_OFFSET + + ldr x4, =KIMAGE_VADDR // Save the offset between + sub x4, x4, x24 // the kernel virtual and + str_l x4, kimage_voffset, x5 // physical mappings + mov x29, #0 #ifdef CONFIG_KASAN bl kasan_early_init diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index 999633bd7294..c9c62cab25a4 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -42,15 +42,18 @@ #endif #ifdef CONFIG_CPU_BIG_ENDIAN -#define __HEAD_FLAG_BE 1 +#define __HEAD_FLAG_BE 1 #else -#define __HEAD_FLAG_BE 0 +#define __HEAD_FLAG_BE 0 #endif -#define __HEAD_FLAG_PAGE_SIZE ((PAGE_SHIFT - 10) / 2) +#define __HEAD_FLAG_PAGE_SIZE ((PAGE_SHIFT - 10) / 2) -#define __HEAD_FLAGS ((__HEAD_FLAG_BE << 0) | \ - (__HEAD_FLAG_PAGE_SIZE << 1)) +#define __HEAD_FLAG_PHYS_BASE 1 + +#define __HEAD_FLAGS ((__HEAD_FLAG_BE << 0) | \ + (__HEAD_FLAG_PAGE_SIZE << 1) | \ + (__HEAD_FLAG_PHYS_BASE << 3)) /* * These will output as part of the Image header, which should be little-endian diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 16e18d35eb04..e333f7f0b4f2 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -35,8 +35,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -46,7 +48,13 @@ #include "mm.h" -phys_addr_t memstart_addr __read_mostly = 0; +/* + * We need to be able to catch inadvertent references to memstart_addr + * that occur (potentially in generic code) before arm64_memblock_init() + * executes, which assigns it its actual value. So use a default value + * that cannot be mistaken for a real physical address. + */ +phys_addr_t memstart_addr __read_mostly = ~0ULL; phys_addr_t arm64_dma_phys_limit __read_mostly; #ifdef CONFIG_BLK_DEV_INITRD @@ -162,7 +170,33 @@ early_param("mem", early_mem); void __init arm64_memblock_init(void) { - memblock_enforce_memory_limit(memory_limit); + const s64 linear_region_size = -(s64)PAGE_OFFSET; + + /* + * Select a suitable value for the base of physical memory. + */ + memstart_addr = round_down(memblock_start_of_DRAM(), + ARM64_MEMSTART_ALIGN); + + /* + * Remove the memory that we will not be able to cover with the + * linear mapping. Take care not to clip the kernel which may be + * high in memory. 
+ */ + memblock_remove(max(memstart_addr + linear_region_size, __pa(_end)), + ULLONG_MAX); + if (memblock_end_of_DRAM() > linear_region_size) + memblock_remove(0, memblock_end_of_DRAM() - linear_region_size); + + /* + * Apply the memory limit if it was set. Since the kernel may be loaded + * high up in memory, add back the kernel region that must be accessible + * via the linear mapping. + */ + if (memory_limit != (phys_addr_t)ULLONG_MAX) { + memblock_enforce_memory_limit(memory_limit); + memblock_add(__pa(_text), (u64)(_end - _text)); + } /* * Register the kernel text, kernel data, initrd, and initial @@ -388,3 +422,28 @@ static int __init keepinitrd_setup(char *__unused) __setup("keepinitrd", keepinitrd_setup); #endif + +/* + * Dump out memory limit information on panic. + */ +static int dump_mem_limit(struct notifier_block *self, unsigned long v, void *p) +{ + if (memory_limit != (phys_addr_t)ULLONG_MAX) { + pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20); + } else { + pr_emerg("Memory Limit: none\n"); + } + return 0; +} + +static struct notifier_block mem_limit_notifier = { + .notifier_call = dump_mem_limit, +}; + +static int __init register_mem_limit_dumper(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, + &mem_limit_notifier); + return 0; +} +__initcall(register_mem_limit_dumper); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index de7a8627928a..8577437b6c0a 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -46,6 +46,9 @@ u64 idmap_t0sz = TCR_T0SZ(VA_BITS); +u64 kimage_voffset __read_mostly; +EXPORT_SYMBOL(kimage_voffset); + /* * Empty_zero_page is a special page that is used for zero-initialized data * and COW. From 298b730c260f6766a31ac4b3fa203d7d251a9cbd Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 19 Feb 2016 14:28:58 +0000 Subject: [PATCH 469/607] UPSTREAM: arm64: User die() instead of panic() in do_page_fault() The former gives better error reporting on unhandled permission faults (introduced by the UAO patches). Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 70c8abc28762d04e36c92e07eee2ce6ab41049cb) Signed-off-by: Jeff Vander Stoep Change-Id: Ia419eccf1554a32fa4131ac15b277d4d2d4eb508 --- arch/arm64/mm/fault.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index a8eafeceb08a..44e56de23f79 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -235,10 +235,10 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (permission_fault(esr) && (addr < USER_DS)) { if (get_fs() == KERNEL_DS) - panic("Accessing user space memory with fs=KERNEL_DS"); + die("Accessing user space memory with fs=KERNEL_DS", regs, esr); if (!search_exception_tables(regs->pc)) - panic("Accessing user space memory outside uaccess.h routines"); + die("Accessing user space memory outside uaccess.h routines", regs, esr); } /* From 477376b452f0d5fc564dfe2d0fc0fe8156051bb1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 22 Feb 2016 18:46:03 +0100 Subject: [PATCH 470/607] UPSTREAM: arm64: mm: only perform memstart_addr sanity check if DEBUG_VM Checking whether memstart_addr has been assigned every time it is referenced adds a branch instruction that may hurt performance if the reference in question occurs on a hot path. So only perform the check if CONFIG_DEBUG_VM=y. 
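The check is cheap because it only needs the low bit: the sentinel installed before arm64_memblock_init() runs is ~0ULL (odd), whereas any real physical base is at least page-aligned (even). Condensed, the two pieces from the diffs above and below fit together like this:

    phys_addr_t memstart_addr __read_mostly = ~0ULL;   /* sentinel: odd */

    /*
     * Any use of PHYS_OFFSET before the real value is assigned now trips
     * VM_BUG_ON() in CONFIG_DEBUG_VM builds and costs nothing otherwise.
     */
    #define PHYS_OFFSET ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; })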
Signed-off-by: Ard Biesheuvel [catalin.marinas@arm.com: replaced #ifdef with VM_BUG_ON] Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit a92405f082d43267575444a6927085e4c8a69e4e) Signed-off-by: Jeff Vander Stoep Change-Id: Ia5f206d9a2dbbdbfc3f05fe985d4eca309f0d889 --- arch/arm64/include/asm/memory.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 3239e4d78e0d..460d09bf9442 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -132,9 +132,11 @@ #ifndef __ASSEMBLY__ +#include + extern phys_addr_t memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. */ -#define PHYS_OFFSET ({ BUG_ON(memstart_addr & 1); memstart_addr; }) +#define PHYS_OFFSET ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; }) /* the offset between the kernel virtual and physical mappings */ extern u64 kimage_voffset; From b4fd0619fedde9b2849e844db1bde3929c336a82 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 22 Feb 2016 18:46:04 +0100 Subject: [PATCH 471/607] UPSTREAM: arm64: mm: use bit ops rather than arithmetic in pa/va translations Since PAGE_OFFSET is chosen such that it cuts the kernel VA space right in half, and since the size of the kernel VA space itself is always a power of 2, we can treat PAGE_OFFSET as a bitmask and replace the additions/subtractions with 'or' and 'and-not' operations. For the comparison against PAGE_OFFSET, a mov/cmp/branch sequence ends up getting replaced with a single tbz instruction. For the additions and subtractions, we save a mov instruction since the mask is folded into the instruction's immediate field. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 8439e62a15614e8fcd43835d57b7245cd9870dc5) Signed-off-by: Jeff Vander Stoep Change-Id: I1ea4ef654dd7b7693f8713dab28ca0739b8a2c62 --- arch/arm64/include/asm/memory.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 460d09bf9442..eb798156cf56 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -88,10 +88,10 @@ */ #define __virt_to_phys(x) ({ \ phys_addr_t __x = (phys_addr_t)(x); \ - __x >= PAGE_OFFSET ? (__x - PAGE_OFFSET + PHYS_OFFSET) : \ - (__x - kimage_voffset); }) + __x & BIT(VA_BITS - 1) ? (__x & ~PAGE_OFFSET) + PHYS_OFFSET : \ + (__x - kimage_voffset); }) -#define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET + PAGE_OFFSET)) +#define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET) | PAGE_OFFSET) #define __phys_to_kimg(x) ((unsigned long)((x) + kimage_voffset)) /* @@ -132,6 +132,7 @@ #ifndef __ASSEMBLY__ +#include #include extern phys_addr_t memstart_addr; From 0983513a290d07016a2b9c097157bec9685800ce Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 23 Feb 2016 08:56:45 +0100 Subject: [PATCH 472/607] UPSTREAM: arm64: move brk immediate argument definitions to separate header Instead of reversing the header dependency between asm/bug.h and asm/debug-monitors.h, split off the brk instruction immediate value defines into a new header asm/brk-imm.h, and include it from both. This solves the circular dependency issue that prevents BUG() from being used in some header files, and keeps the definitions together. 
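As for the pa/va translation patch just above, the identity it relies on is easy to sanity-check in isolation; the values below are examples only and stand in for any address within the linear region.

    #include <assert.h>
    #include <stdint.h>

    #define VA_BITS     48
    #define PAGE_OFFSET (UINT64_C(0xffffffffffffffff) << (VA_BITS - 1))

    int main(void)
    {
            uint64_t va  = PAGE_OFFSET + UINT64_C(0x12345678); /* a linear VA */
            uint64_t off = va - PAGE_OFFSET;

            assert(off == (va & ~PAGE_OFFSET));  /* subtract == mask off   */
            assert(va  == (off | PAGE_OFFSET));  /* add back == bitwise or */
            return 0;
    }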
Signed-off-by: Ard Biesheuvel Acked-by: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit f98deee9a9f8c47d05a0f64d86440882dca772ff) Signed-off-by: Jeff Vander Stoep Change-Id: Id4827af98ab3d413828c589bc379acecabeff108 --- arch/arm64/include/asm/brk-imm.h | 25 +++++++++++++++++++++++++ arch/arm64/include/asm/bug.h | 2 +- arch/arm64/include/asm/debug-monitors.h | 14 +------------- 3 files changed, 27 insertions(+), 14 deletions(-) create mode 100644 arch/arm64/include/asm/brk-imm.h diff --git a/arch/arm64/include/asm/brk-imm.h b/arch/arm64/include/asm/brk-imm.h new file mode 100644 index 000000000000..ed693c5bcec0 --- /dev/null +++ b/arch/arm64/include/asm/brk-imm.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2012 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __ASM_BRK_IMM_H +#define __ASM_BRK_IMM_H + +/* + * #imm16 values used for BRK instruction generation + * Allowed values for kgdb are 0x400 - 0x7ff + * 0x100: for triggering a fault on purpose (reserved) + * 0x400: for dynamic BRK instruction + * 0x401: for compile time BRK instruction + * 0x800: kernel-mode BUG() and WARN() traps + */ +#define FAULT_BRK_IMM 0x100 +#define KGDB_DYN_DBG_BRK_IMM 0x400 +#define KGDB_COMPILED_DBG_BRK_IMM 0x401 +#define BUG_BRK_IMM 0x800 + +#endif diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h index 679d49221998..561190d15881 100644 --- a/arch/arm64/include/asm/bug.h +++ b/arch/arm64/include/asm/bug.h @@ -18,7 +18,7 @@ #ifndef _ARCH_ARM64_ASM_BUG_H #define _ARCH_ARM64_ASM_BUG_H -#define BUG_BRK_IMM 0x800 +#include #ifdef CONFIG_GENERIC_BUG #define HAVE_ARCH_BUG diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index e893a1fca9c2..2fcb9b7c876c 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include #include @@ -47,18 +47,6 @@ */ #define BREAK_INSTR_SIZE AARCH64_INSN_SIZE -/* - * #imm16 values used for BRK instruction generation - * Allowed values for kgbd are 0x400 - 0x7ff - * 0x100: for triggering a fault on purpose (reserved) - * 0x400: for dynamic BRK instruction - * 0x401: for compile time BRK instruction - * 0x800: kernel-mode BUG() and WARN() traps - */ -#define FAULT_BRK_IMM 0x100 -#define KGDB_DYN_DBG_BRK_IMM 0x400 -#define KGDB_COMPILED_DBG_BRK_IMM 0x401 - /* * BRK instruction encoding * The #imm16 value should be placed at bits[20:5] within BRK ins From 4166edc437c98ef74d6a1cf3a2b2fabeabca2b18 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 24 Nov 2015 12:37:35 +0100 Subject: [PATCH 473/607] UPSTREAM: arm64: add support for module PLTs This adds support for emitting PLTs at module load time for relative branches that are out of range. This is a prerequisite for KASLR, which may place the kernel and the modules anywhere in the vmalloc area, making it more likely that branch target offsets exceed the maximum range of +/- 128 MB. In this version, I removed the distinction between relocations against .init executable sections and ordinary executable sections. 
The reason is that it is hardly worth the trouble, given that .init.text usually does not contain that many far branches, and this version now only reserves PLT entry space for jump and call relocations against undefined symbols (since symbols defined in the same module can be assumed to be within +/- 128 MB) For example, the mac80211.ko module (which is fairly sizable at ~400 KB) built with -mcmodel=large gives the following relocation counts: relocs branches unique !local .text 3925 3347 518 219 .init.text 11 8 7 1 .exit.text 4 4 4 1 .text.unlikely 81 67 36 17 ('unique' means branches to unique type/symbol/addend combos, of which !local is the subset referring to undefined symbols) IOW, we are only emitting a single PLT entry for the .init sections, and we are better off just adding it to the core PLT section instead. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit fd045f6cd98ec4953147b318418bd45e441e52a3) Signed-off-by: Jeff Vander Stoep Change-Id: I1b46bb817e7d16a1b9a394b100c9e5de46c0837c --- arch/arm64/Kconfig | 9 ++ arch/arm64/Makefile | 6 +- arch/arm64/include/asm/module.h | 11 ++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/module-plts.c | 201 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/module.c | 22 ++++ arch/arm64/kernel/module.lds | 3 + 7 files changed, 252 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/kernel/module-plts.c create mode 100644 arch/arm64/kernel/module.lds diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 14866e975277..65a8e7c9fdd1 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -399,6 +399,7 @@ config ARM64_ERRATUM_843419 bool "Cortex-A53: 843419: A load or store might access an incorrect address" depends on MODULES default y + select ARM64_MODULE_CMODEL_LARGE help This option builds kernel modules using the large memory model in order to avoid the use of the ADRP instruction, which can cause @@ -762,6 +763,14 @@ config ARM64_UAO regular load/store instructions if the cpu does not implement the feature. 
+config ARM64_MODULE_CMODEL_LARGE + bool + +config ARM64_MODULE_PLTS + bool + select ARM64_MODULE_CMODEL_LARGE + select HAVE_MOD_ARCH_SPECIFIC + endmenu menu "Boot options" diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 693a4dc6220c..763509ff6f24 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -43,10 +43,14 @@ endif CHECKFLAGS += -D__aarch64__ -ifeq ($(CONFIG_ARM64_ERRATUM_843419), y) +ifeq ($(CONFIG_ARM64_MODULE_CMODEL_LARGE), y) KBUILD_CFLAGS_MODULE += -mcmodel=large endif +ifeq ($(CONFIG_ARM64_MODULE_PLTS),y) +KBUILD_LDFLAGS_MODULE += -T $(srctree)/arch/arm64/kernel/module.lds +endif + # Default value head-y := arch/arm64/kernel/head.o diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h index e80e232b730e..8652fb613304 100644 --- a/arch/arm64/include/asm/module.h +++ b/arch/arm64/include/asm/module.h @@ -20,4 +20,15 @@ #define MODULE_ARCH_VERMAGIC "aarch64" +#ifdef CONFIG_ARM64_MODULE_PLTS +struct mod_arch_specific { + struct elf64_shdr *plt; + int plt_num_entries; + int plt_max_entries; +}; +#endif + +u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela, + Elf64_Sym *sym); + #endif /* __ASM_MODULE_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index c4e2f70c0aa0..8d971f9c6ed5 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -30,6 +30,7 @@ arm64-obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o \ ../../arm/kernel/opcodes.o arm64-obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o arm64-obj-$(CONFIG_MODULES) += arm64ksyms.o module.o +arm64-obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o arm64-obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o arm64-obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o arm64-obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c new file mode 100644 index 000000000000..1ce90d8450ae --- /dev/null +++ b/arch/arm64/kernel/module-plts.c @@ -0,0 +1,201 @@ +/* + * Copyright (C) 2014-2016 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +struct plt_entry { + /* + * A program that conforms to the AArch64 Procedure Call Standard + * (AAPCS64) must assume that a veneer that alters IP0 (x16) and/or + * IP1 (x17) may be inserted at any branch instruction that is + * exposed to a relocation that supports long branches. Since that + * is exactly what we are dealing with here, we are free to use x16 + * as a scratch register in the PLT veneers. + */ + __le32 mov0; /* movn x16, #0x.... */ + __le32 mov1; /* movk x16, #0x...., lsl #16 */ + __le32 mov2; /* movk x16, #0x...., lsl #32 */ + __le32 br; /* br x16 */ +}; + +u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela, + Elf64_Sym *sym) +{ + struct plt_entry *plt = (struct plt_entry *)mod->arch.plt->sh_addr; + int i = mod->arch.plt_num_entries; + u64 val = sym->st_value + rela->r_addend; + + /* + * We only emit PLT entries against undefined (SHN_UNDEF) symbols, + * which are listed in the ELF symtab section, but without a type + * or a size. 
+ * So, similar to how the module loader uses the Elf64_Sym::st_value + * field to store the resolved addresses of undefined symbols, let's + * borrow the Elf64_Sym::st_size field (whose value is never used by + * the module loader, even for symbols that are defined) to record + * the address of a symbol's associated PLT entry as we emit it for a + * zero addend relocation (which is the only kind we have to deal with + * in practice). This allows us to find duplicates without having to + * go through the table every time. + */ + if (rela->r_addend == 0 && sym->st_size != 0) { + BUG_ON(sym->st_size < (u64)plt || sym->st_size >= (u64)&plt[i]); + return sym->st_size; + } + + mod->arch.plt_num_entries++; + BUG_ON(mod->arch.plt_num_entries > mod->arch.plt_max_entries); + + /* + * MOVK/MOVN/MOVZ opcode: + * +--------+------------+--------+-----------+-------------+---------+ + * | sf[31] | opc[30:29] | 100101 | hw[22:21] | imm16[20:5] | Rd[4:0] | + * +--------+------------+--------+-----------+-------------+---------+ + * + * Rd := 0x10 (x16) + * hw := 0b00 (no shift), 0b01 (lsl #16), 0b10 (lsl #32) + * opc := 0b11 (MOVK), 0b00 (MOVN), 0b10 (MOVZ) + * sf := 1 (64-bit variant) + */ + plt[i] = (struct plt_entry){ + cpu_to_le32(0x92800010 | (((~val ) & 0xffff)) << 5), + cpu_to_le32(0xf2a00010 | ((( val >> 16) & 0xffff)) << 5), + cpu_to_le32(0xf2c00010 | ((( val >> 32) & 0xffff)) << 5), + cpu_to_le32(0xd61f0200) + }; + + if (rela->r_addend == 0) + sym->st_size = (u64)&plt[i]; + + return (u64)&plt[i]; +} + +#define cmp_3way(a,b) ((a) < (b) ? -1 : (a) > (b)) + +static int cmp_rela(const void *a, const void *b) +{ + const Elf64_Rela *x = a, *y = b; + int i; + + /* sort by type, symbol index and addend */ + i = cmp_3way(ELF64_R_TYPE(x->r_info), ELF64_R_TYPE(y->r_info)); + if (i == 0) + i = cmp_3way(ELF64_R_SYM(x->r_info), ELF64_R_SYM(y->r_info)); + if (i == 0) + i = cmp_3way(x->r_addend, y->r_addend); + return i; +} + +static bool duplicate_rel(const Elf64_Rela *rela, int num) +{ + /* + * Entries are sorted by type, symbol index and addend. That means + * that, if a duplicate entry exists, it must be in the preceding + * slot. + */ + return num > 0 && cmp_rela(rela + num, rela + num - 1) == 0; +} + +static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num) +{ + unsigned int ret = 0; + Elf64_Sym *s; + int i; + + for (i = 0; i < num; i++) { + switch (ELF64_R_TYPE(rela[i].r_info)) { + case R_AARCH64_JUMP26: + case R_AARCH64_CALL26: + /* + * We only have to consider branch targets that resolve + * to undefined symbols. This is not simply a heuristic, + * it is a fundamental limitation, since the PLT itself + * is part of the module, and needs to be within 128 MB + * as well, so modules can never grow beyond that limit. + */ + s = syms + ELF64_R_SYM(rela[i].r_info); + if (s->st_shndx != SHN_UNDEF) + break; + + /* + * Jump relocations with non-zero addends against + * undefined symbols are supported by the ELF spec, but + * do not occur in practice (e.g., 'jump n bytes past + * the entry point of undefined function symbol f'). + * So we need to support them, but there is no need to + * take them into consideration when trying to optimize + * this code. So let's only check for duplicates when + * the addend is zero: this allows us to record the PLT + * entry address in the symbol table itself, rather than + * having to search the list for duplicates each time we + * emit one. 
+ */ + if (rela[i].r_addend != 0 || !duplicate_rel(rela, i)) + ret++; + break; + } + } + return ret; +} + +int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod) +{ + unsigned long plt_max_entries = 0; + Elf64_Sym *syms = NULL; + int i; + + /* + * Find the empty .plt section so we can expand it to store the PLT + * entries. Record the symtab address as well. + */ + for (i = 0; i < ehdr->e_shnum; i++) { + if (strcmp(".plt", secstrings + sechdrs[i].sh_name) == 0) + mod->arch.plt = sechdrs + i; + else if (sechdrs[i].sh_type == SHT_SYMTAB) + syms = (Elf64_Sym *)sechdrs[i].sh_addr; + } + + if (!mod->arch.plt) { + pr_err("%s: module PLT section missing\n", mod->name); + return -ENOEXEC; + } + if (!syms) { + pr_err("%s: module symtab section missing\n", mod->name); + return -ENOEXEC; + } + + for (i = 0; i < ehdr->e_shnum; i++) { + Elf64_Rela *rels = (void *)ehdr + sechdrs[i].sh_offset; + int numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela); + Elf64_Shdr *dstsec = sechdrs + sechdrs[i].sh_info; + + if (sechdrs[i].sh_type != SHT_RELA) + continue; + + /* ignore relocations that operate on non-exec sections */ + if (!(dstsec->sh_flags & SHF_EXECINSTR)) + continue; + + /* sort by type, symbol index and addend */ + sort(rels, numrels, sizeof(Elf64_Rela), cmp_rela, NULL); + + plt_max_entries += count_plts(syms, rels, numrels); + } + + mod->arch.plt->sh_type = SHT_NOBITS; + mod->arch.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + mod->arch.plt->sh_addralign = L1_CACHE_BYTES; + mod->arch.plt->sh_size = plt_max_entries * sizeof(struct plt_entry); + mod->arch.plt_num_entries = 0; + mod->arch.plt_max_entries = plt_max_entries; + return 0; +} diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 93e970231ca9..a9dde97f5ca5 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -38,6 +38,21 @@ void *module_alloc(unsigned long size) GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); + if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && + !IS_ENABLED(CONFIG_KASAN)) + /* + * KASAN can only deal with module allocations being served + * from the reserved module region, since the remainder of + * the vmalloc region is already backed by zero shadow pages, + * and punching holes into it is non-trivial. Since the module + * region is not randomized when KASAN is enabled, it is even + * less likely that the module region gets exhausted, so we + * can simply omit this fallback in that case. 
+ */ + p = __vmalloc_node_range(size, MODULE_ALIGN, VMALLOC_START, + VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_EXEC, 0, + NUMA_NO_NODE, __builtin_return_address(0)); + if (p && (kasan_module_alloc(p, size) < 0)) { vfree(p); return NULL; @@ -361,6 +376,13 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_CALL26: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 26, AARCH64_INSN_IMM_26); + + if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && + ovf == -ERANGE) { + val = module_emit_plt_entry(me, &rel[i], sym); + ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, + 26, AARCH64_INSN_IMM_26); + } break; default: diff --git a/arch/arm64/kernel/module.lds b/arch/arm64/kernel/module.lds new file mode 100644 index 000000000000..8949f6c6f729 --- /dev/null +++ b/arch/arm64/kernel/module.lds @@ -0,0 +1,3 @@ +SECTIONS { + .plt (NOLOAD) : { BYTE(0) } +} From ee1ef5eefa116dd23b1311c019e6d4d115bb1f4d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 26 Dec 2015 13:48:02 +0100 Subject: [PATCH 474/607] UPSTREAM: arm64: avoid R_AARCH64_ABS64 relocations for Image header fields Unfortunately, the current way of using the linker to emit build time constants into the Image header will no longer work once we switch to the use of PIE executables. The reason is that such constants are emitted into the binary using R_AARCH64_ABS64 relocations, which are resolved at runtime, not at build time, and the places targeted by those relocations will contain zeroes before that. So refactor the endian swapping linker script constant generation code so that it emits the upper and lower 32-bit words separately. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 6ad1fe5d9077a1ab40bf74b61994d2e770b00b14) Signed-off-by: Jeff Vander Stoep Change-Id: Iaa809a0b5fcf628e1e49cd6aaa0f31f31ce95c23 --- arch/arm64/include/asm/assembler.h | 11 ++++++++++ arch/arm64/kernel/head.S | 6 +++--- arch/arm64/kernel/image.h | 32 ++++++++++++++++++------------ 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index bb7b72734c24..ba5aff6c830e 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -215,4 +215,15 @@ lr .req x30 // link register .size __pi_##x, . - x; \ ENDPROC(x) + /* + * Emit a 64-bit absolute little endian symbol reference in a way that + * ensures that it will be resolved at build time, even when building a + * PIE binary. This requires cooperation from the linker script, which + * must emit the lo32/hi32 halves individually. 
+ */ + .macro le64sym, sym + .long \sym\()_lo32 + .long \sym\()_hi32 + .endm + #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 05b98289093e..f076debf392d 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -83,9 +83,9 @@ efi_head: b stext // branch to kernel start, magic .long 0 // reserved #endif - .quad _kernel_offset_le // Image load offset from start of RAM, little-endian - .quad _kernel_size_le // Effective size of kernel image, little-endian - .quad _kernel_flags_le // Informative flags, little-endian + le64sym _kernel_offset_le // Image load offset from start of RAM, little-endian + le64sym _kernel_size_le // Effective size of kernel image, little-endian + le64sym _kernel_flags_le // Informative flags, little-endian .quad 0 // reserved .quad 0 // reserved .quad 0 // reserved diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index c9c62cab25a4..db1bf57948f1 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -26,21 +26,27 @@ * There aren't any ELF relocations we can use to endian-swap values known only * at link time (e.g. the subtraction of two symbol addresses), so we must get * the linker to endian-swap certain values before emitting them. + * + * Note that, in order for this to work when building the ELF64 PIE executable + * (for KASLR), these values should not be referenced via R_AARCH64_ABS64 + * relocations, since these are fixed up at runtime rather than at build time + * when PIE is in effect. So we need to split them up in 32-bit high and low + * words. */ #ifdef CONFIG_CPU_BIG_ENDIAN -#define DATA_LE64(data) \ - ((((data) & 0x00000000000000ff) << 56) | \ - (((data) & 0x000000000000ff00) << 40) | \ - (((data) & 0x0000000000ff0000) << 24) | \ - (((data) & 0x00000000ff000000) << 8) | \ - (((data) & 0x000000ff00000000) >> 8) | \ - (((data) & 0x0000ff0000000000) >> 24) | \ - (((data) & 0x00ff000000000000) >> 40) | \ - (((data) & 0xff00000000000000) >> 56)) +#define DATA_LE32(data) \ + ((((data) & 0x000000ff) << 24) | \ + (((data) & 0x0000ff00) << 8) | \ + (((data) & 0x00ff0000) >> 8) | \ + (((data) & 0xff000000) >> 24)) #else -#define DATA_LE64(data) ((data) & 0xffffffffffffffff) +#define DATA_LE32(data) ((data) & 0xffffffff) #endif +#define DEFINE_IMAGE_LE64(sym, data) \ + sym##_lo32 = DATA_LE32((data) & 0xffffffff); \ + sym##_hi32 = DATA_LE32((data) >> 32) + #ifdef CONFIG_CPU_BIG_ENDIAN #define __HEAD_FLAG_BE 1 #else @@ -61,9 +67,9 @@ * endian swapped in head.S, all are done here for consistency. */ #define HEAD_SYMBOLS \ - _kernel_size_le = DATA_LE64(_end - _text); \ - _kernel_offset_le = DATA_LE64(TEXT_OFFSET); \ - _kernel_flags_le = DATA_LE64(__HEAD_FLAGS); + DEFINE_IMAGE_LE64(_kernel_size_le, _end - _text); \ + DEFINE_IMAGE_LE64(_kernel_offset_le, TEXT_OFFSET); \ + DEFINE_IMAGE_LE64(_kernel_flags_le, __HEAD_FLAGS); #ifdef CONFIG_EFI From 68c7746e3ca999b2d3c75af8ac714ad28c1a4288 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 26 Dec 2015 12:46:40 +0100 Subject: [PATCH 475/607] UPSTREAM: arm64: avoid dynamic relocations in early boot code Before implementing KASLR for arm64 by building a self-relocating PIE executable, we have to ensure that values we use before the relocation routine is executed are not subject to dynamic relocation themselves. This applies not only to virtual addresses, but also to values that are supplied by the linker at build time and relocated using R_AARCH64_ABS64 relocations. 
So instead, use assemble time constants, or force the use of static relocations by folding the constants into the instructions. Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 2bf31a4a05f5b00f37d65ba029d36a0230286cb7) Signed-off-by: Jeff Vander Stoep Change-Id: Icce0176591e3c0ae444e1ea54258efe677933c5b --- arch/arm64/kernel/efi-entry.S | 2 +- arch/arm64/kernel/head.S | 39 +++++++++++++++++++++++------------ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S index a773db92908b..f82036e02485 100644 --- a/arch/arm64/kernel/efi-entry.S +++ b/arch/arm64/kernel/efi-entry.S @@ -61,7 +61,7 @@ ENTRY(entry) */ mov x20, x0 // DTB address ldr x0, [sp, #16] // relocated _text address - ldr x21, =stext_offset + movz x21, #:abs_g0:stext_offset add x21, x0, x21 /* diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index f076debf392d..4cad8f9f2268 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -67,12 +67,11 @@ * in the entry routines. */ __HEAD - +_head: /* * DO NOT MODIFY. Image header expected by Linux boot-loaders. */ #ifdef CONFIG_EFI -efi_head: /* * This add instruction has no meaningful effect except that * its opcode forms the magic "MZ" signature required by UEFI. @@ -94,14 +93,14 @@ efi_head: .byte 0x4d .byte 0x64 #ifdef CONFIG_EFI - .long pe_header - efi_head // Offset to the PE header. + .long pe_header - _head // Offset to the PE header. #else .word 0 // reserved #endif #ifdef CONFIG_EFI .globl __efistub_stext_offset - .set __efistub_stext_offset, stext - efi_head + .set __efistub_stext_offset, stext - _head .align 3 pe_header: .ascii "PE" @@ -124,7 +123,7 @@ optional_header: .long _end - stext // SizeOfCode .long 0 // SizeOfInitializedData .long 0 // SizeOfUninitializedData - .long __efistub_entry - efi_head // AddressOfEntryPoint + .long __efistub_entry - _head // AddressOfEntryPoint .long __efistub_stext_offset // BaseOfCode extra_header_fields: @@ -139,7 +138,7 @@ extra_header_fields: .short 0 // MinorSubsystemVersion .long 0 // Win32VersionValue - .long _end - efi_head // SizeOfImage + .long _end - _head // SizeOfImage // Everything before the kernel image is considered part of the header .long __efistub_stext_offset // SizeOfHeaders @@ -219,11 +218,13 @@ ENTRY(stext) * On return, the CPU will be ready for the MMU to be turned on and * the TCR will have been set. */ - ldr x27, =__mmap_switched // address to jump to after + ldr x27, 0f // address to jump to after // MMU has been enabled adr_l lr, __enable_mmu // return (PIC) address b __cpu_setup // initialise processor ENDPROC(stext) + .align 3 +0: .quad __mmap_switched - (_head - TEXT_OFFSET) + KIMAGE_VADDR /* * Preserve the arguments passed by the bootloader in x0 .. 
x3 @@ -391,7 +392,8 @@ __create_page_tables: mov x0, x26 // swapper_pg_dir ldr x5, =KIMAGE_VADDR create_pgd_entry x0, x5, x3, x6 - ldr x6, =KERNEL_END // __va(KERNEL_END) + ldr w6, kernel_img_size + add x6, x6, x5 mov x3, x24 // phys offset create_block_map x0, x7, x3, x5, x6 @@ -408,6 +410,9 @@ __create_page_tables: mov lr, x27 ret ENDPROC(__create_page_tables) + +kernel_img_size: + .long _end - (_head - TEXT_OFFSET) .ltorg /* @@ -415,6 +420,10 @@ ENDPROC(__create_page_tables) */ .set initial_sp, init_thread_union + THREAD_START_SP __mmap_switched: + adr_l x8, vectors // load VBAR_EL1 with virtual + msr vbar_el1, x8 // vector table address + isb + // Clear BSS adr_l x0, __bss_start mov x1, xzr @@ -610,13 +619,19 @@ ENTRY(secondary_startup) adrp x26, swapper_pg_dir bl __cpu_setup // initialise processor - ldr x21, =secondary_data - ldr x27, =__secondary_switched // address to jump to after enabling the MMU + ldr x8, =KIMAGE_VADDR + ldr w9, 0f + sub x27, x8, w9, sxtw // address to jump to after enabling the MMU b __enable_mmu ENDPROC(secondary_startup) +0: .long (_text - TEXT_OFFSET) - __secondary_switched ENTRY(__secondary_switched) - ldr x0, [x21] // get secondary_data.stack + adr_l x5, vectors + msr vbar_el1, x5 + isb + + ldr_l x0, secondary_data // get secondary_data.stack mov sp, x0 and x0, x0, #~(THREAD_SIZE - 1) msr sp_el0, x0 // save thread_info @@ -641,8 +656,6 @@ __enable_mmu: ubfx x2, x1, #ID_AA64MMFR0_TGRAN_SHIFT, 4 cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED b.ne __no_granule_support - ldr x5, =vectors - msr vbar_el1, x5 msr ttbr0_el1, x25 // load TTBR0 msr ttbr1_el1, x26 // load TTBR1 isb From d78098f62acbb27b7b222b8e3a517581d0842a9f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 11 Jan 2016 17:08:26 +0100 Subject: [PATCH 476/607] UPSTREAM: arm64: make asm/elf.h available to asm files This reshuffles some code in asm/elf.h and puts a #ifndef __ASSEMBLY__ around its C definitions so that the CPP defines can be used in asm source files as well. Acked-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 4a2e034e5cdadde4c712f79bdd57d1455c76a3db) Signed-off-by: Jeff Vander Stoep Change-Id: Ic499e950d2ef297d10848862a6dfa07b90887f4c --- arch/arm64/include/asm/elf.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index faad6df49e5b..435f55952e1f 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -24,15 +24,6 @@ #include #include -typedef unsigned long elf_greg_t; - -#define ELF_NGREG (sizeof(struct user_pt_regs) / sizeof(elf_greg_t)) -#define ELF_CORE_COPY_REGS(dest, regs) \ - *(struct user_pt_regs *)&(dest) = (regs)->user_regs; - -typedef elf_greg_t elf_gregset_t[ELF_NGREG]; -typedef struct user_fpsimd_state elf_fpregset_t; - /* * AArch64 static relocation types. */ @@ -127,6 +118,17 @@ typedef struct user_fpsimd_state elf_fpregset_t; */ #define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) +#ifndef __ASSEMBLY__ + +typedef unsigned long elf_greg_t; + +#define ELF_NGREG (sizeof(struct user_pt_regs) / sizeof(elf_greg_t)) +#define ELF_CORE_COPY_REGS(dest, regs) \ + *(struct user_pt_regs *)&(dest) = (regs)->user_regs; + +typedef elf_greg_t elf_gregset_t[ELF_NGREG]; +typedef struct user_fpsimd_state elf_fpregset_t; + /* * When the program starts, a1 contains a pointer to a function to be * registered with atexit, as per the SVR4 ABI. 
A value of 0 means we have no @@ -186,4 +188,6 @@ extern int aarch32_setup_vectors_page(struct linux_binprm *bprm, #endif /* CONFIG_COMPAT */ +#endif /* !__ASSEMBLY__ */ + #endif From e0357208fbf9e2c9c33af1900c52c2cbaefbe910 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 2 Feb 2016 15:53:59 +0000 Subject: [PATCH 477/607] UPSTREAM: arm64: futex.h: Add missing PAN toggling futex.h's futex_atomic_cmpxchg_inatomic() does not use the __futex_atomic_op() macro and needs its own PAN toggling. This was missed when the feature was implemented. Fixes: 338d4f49d6f ("arm64: kernel: Add support for Privileged Access Never") Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 811d61e384e24759372bb3f01772f3744b0a8327) Signed-off-by: Jeff Vander Stoep Change-Id: I6e7b338a1af17b784d4196101422c3acee3b88ed --- arch/arm64/include/asm/futex.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index 007a69fc4f40..5f3ab8c1db55 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -121,6 +121,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, return -EFAULT; asm volatile("// futex_atomic_cmpxchg_inatomic\n" +ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " prfm pstl1strm, %2\n" "1: ldxr %w1, %2\n" " sub %w3, %w1, %w4\n" @@ -137,6 +138,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, " .align 3\n" " .quad 1b, 4b, 2b, 4b\n" " .popsection\n" +ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) : "r" (oldval), "r" (newval), "Ir" (-EFAULT) : "memory"); From eebed210c5e62cfe0f9f8de4173099f2d32faae9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 10 Jan 2016 11:42:28 +0100 Subject: [PATCH 478/607] UPSTREAM: scripts/sortextable: add support for ET_DYN binaries Add support to scripts/sortextable for handling relocatable (PIE) executables, whose ELF type is ET_DYN, not ET_EXEC. Other than adding support for the new type, no changes are needed. 
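As a rough illustration, the relaxed header check only needs to accept either ELF type; a minimal userspace sketch using plain <elf.h> (native endianness assumed here, whereas the real sortextable code goes through its r2()/r8() endian helpers):

  #include <elf.h>
  #include <string.h>

  /* Accept both fixed-address (ET_EXEC) and relocatable/PIE (ET_DYN)
   * images; anything else is rejected as before. */
  static int ehdr_type_ok(const Elf64_Ehdr *ehdr)
  {
          if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
                  return 0;
          return ehdr->e_type == ET_EXEC || ehdr->e_type == ET_DYN;
  }
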
Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 7b957b6e603623ef8b2e8222fa94b976df613fa2) Signed-off-by: Jeff Vander Stoep Change-Id: If55296ef4934b99c38ceb5acbd7c4a7fb23f24c1 --- scripts/sortextable.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/sortextable.c b/scripts/sortextable.c index c2423d913b46..ecefa0a634f8 100644 --- a/scripts/sortextable.c +++ b/scripts/sortextable.c @@ -266,9 +266,9 @@ do_file(char const *const fname) break; } /* end switch */ if (memcmp(ELFMAG, ehdr->e_ident, SELFMAG) != 0 - || r2(&ehdr->e_type) != ET_EXEC + || (r2(&ehdr->e_type) != ET_EXEC && r2(&ehdr->e_type) != ET_DYN) || ehdr->e_ident[EI_VERSION] != EV_CURRENT) { - fprintf(stderr, "unrecognized ET_EXEC file %s\n", fname); + fprintf(stderr, "unrecognized ET_EXEC/ET_DYN file %s\n", fname); fail_file(); } @@ -304,7 +304,7 @@ do_file(char const *const fname) if (r2(&ehdr->e_ehsize) != sizeof(Elf32_Ehdr) || r2(&ehdr->e_shentsize) != sizeof(Elf32_Shdr)) { fprintf(stderr, - "unrecognized ET_EXEC file: %s\n", fname); + "unrecognized ET_EXEC/ET_DYN file: %s\n", fname); fail_file(); } do32(ehdr, fname, custom_sort); @@ -314,7 +314,7 @@ do_file(char const *const fname) if (r2(&ghdr->e_ehsize) != sizeof(Elf64_Ehdr) || r2(&ghdr->e_shentsize) != sizeof(Elf64_Shdr)) { fprintf(stderr, - "unrecognized ET_EXEC file: %s\n", fname); + "unrecognized ET_EXEC/ET_DYN file: %s\n", fname); fail_file(); } do64(ghdr, fname, custom_sort); From 27375cf76085992a805dc5ad1db397e7227717c9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 1 Jan 2016 12:39:09 +0100 Subject: [PATCH 479/607] UPSTREAM: extable: add support for relative extables to search and sort routines This adds support to the generic search_extable() and sort_extable() implementations for dealing with exception table entries whose fields contain relative offsets rather than absolute addresses. Acked-by: Helge Deller Acked-by: Heiko Carstens Acked-by: H. Peter Anvin Acked-by: Tony Luck Acked-by: Will Deacon Acked-by: Richard Henderson Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit a272858a3c1ecd4a935ba23c66668f81214bd110) Signed-off-by: Jeff Vander Stoep Change-Id: I9d144d351d547c49bf3203a69dfff3cb71a51177 --- lib/extable.c | 50 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/lib/extable.c b/lib/extable.c index 4cac81ec225e..0be02ad561e9 100644 --- a/lib/extable.c +++ b/lib/extable.c @@ -14,7 +14,37 @@ #include #include +#ifndef ARCH_HAS_RELATIVE_EXTABLE +#define ex_to_insn(x) ((x)->insn) +#else +static inline unsigned long ex_to_insn(const struct exception_table_entry *x) +{ + return (unsigned long)&x->insn + x->insn; +} +#endif + #ifndef ARCH_HAS_SORT_EXTABLE +#ifndef ARCH_HAS_RELATIVE_EXTABLE +#define swap_ex NULL +#else +static void swap_ex(void *a, void *b, int size) +{ + struct exception_table_entry *x = a, *y = b, tmp; + int delta = b - a; + + tmp = *x; + x->insn = y->insn + delta; + y->insn = tmp.insn - delta; + +#ifdef swap_ex_entry_fixup + swap_ex_entry_fixup(x, y, tmp, delta); +#else + x->fixup = y->fixup + delta; + y->fixup = tmp.fixup - delta; +#endif +} +#endif /* ARCH_HAS_RELATIVE_EXTABLE */ + /* * The exception table needs to be sorted so that the binary * search that we use to find entries in it works properly. 
@@ -26,9 +56,9 @@ static int cmp_ex(const void *a, const void *b) const struct exception_table_entry *x = a, *y = b; /* avoid overflow */ - if (x->insn > y->insn) + if (ex_to_insn(x) > ex_to_insn(y)) return 1; - if (x->insn < y->insn) + if (ex_to_insn(x) < ex_to_insn(y)) return -1; return 0; } @@ -37,7 +67,7 @@ void sort_extable(struct exception_table_entry *start, struct exception_table_entry *finish) { sort(start, finish - start, sizeof(struct exception_table_entry), - cmp_ex, NULL); + cmp_ex, swap_ex); } #ifdef CONFIG_MODULES @@ -48,13 +78,15 @@ void sort_extable(struct exception_table_entry *start, void trim_init_extable(struct module *m) { /*trim the beginning*/ - while (m->num_exentries && within_module_init(m->extable[0].insn, m)) { + while (m->num_exentries && + within_module_init(ex_to_insn(&m->extable[0]), m)) { m->extable++; m->num_exentries--; } /*trim the end*/ while (m->num_exentries && - within_module_init(m->extable[m->num_exentries-1].insn, m)) + within_module_init(ex_to_insn(&m->extable[m->num_exentries - 1]), + m)) m->num_exentries--; } #endif /* CONFIG_MODULES */ @@ -81,13 +113,13 @@ search_extable(const struct exception_table_entry *first, * careful, the distance between value and insn * can be larger than MAX_LONG: */ - if (mid->insn < value) + if (ex_to_insn(mid) < value) first = mid + 1; - else if (mid->insn > value) + else if (ex_to_insn(mid) > value) last = mid - 1; else return mid; - } - return NULL; + } + return NULL; } #endif From 161994d19e72907ec95e545d1129675c9c966f1a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 1 Jan 2016 15:02:12 +0100 Subject: [PATCH 480/607] UPSTREAM: arm64: switch to relative exception tables Instead of using absolute addresses for both the exception location and the fixup, use offsets relative to the exception table entry values. Not only does this cut the size of the exception table in half, it is also a prerequisite for KASLR, since absolute exception table entries are subject to dynamic relocation, which is incompatible with the sorting of the exception table that occurs at build time. This patch also introduces the _ASM_EXTABLE preprocessor macro (which exists on x86 as well) and its _asm_extable assembly counterpart, as shorthands to emit exception table entries. 
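The net effect is that both fields now hold 32-bit offsets relative to the field itself; a hedged sketch of how such an entry is turned back into absolute addresses (mirroring the ex_to_insn() helper and the fixup_exception() change below, not a drop-in kernel snippet):

  struct exception_table_entry {
          int insn, fixup;        /* offsets relative to the field itself */
  };

  static inline unsigned long ex_insn_addr(const struct exception_table_entry *e)
  {
          return (unsigned long)&e->insn + e->insn;
  }

  static inline unsigned long ex_fixup_addr(const struct exception_table_entry *e)
  {
          return (unsigned long)&e->fixup + e->fixup;
  }
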
Acked-by: Will Deacon Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 6c94f27ac847ff8ef15b3da5b200574923bd6287) Signed-off-by: Jeff Vander Stoep Change-Id: Icedda8ee8c32843c439765783816d7d71ca0073a --- arch/arm64/include/asm/alternative.h | 19 +++++----------- arch/arm64/include/asm/assembler.h | 15 +++++++++---- arch/arm64/include/asm/futex.h | 12 ++++------ arch/arm64/include/asm/uaccess.h | 30 +++++++++++++------------ arch/arm64/include/asm/word-at-a-time.h | 7 +++--- arch/arm64/kernel/armv8_deprecated.c | 7 ++---- arch/arm64/mm/extable.c | 2 +- scripts/sortextable.c | 2 +- 8 files changed, 43 insertions(+), 51 deletions(-) diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index a9fc24ec1aa9..beccbdefa106 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -157,11 +157,8 @@ void apply_alternatives(void *start, size_t length); add \addr, \addr, \post_inc; alternative_endif - .section __ex_table,"a"; - .align 3; - .quad 8888b,\l; - .quad 8889b,\l; - .previous; + _asm_extable 8888b,\l; + _asm_extable 8889b,\l; .endm .macro uao_stp l, reg1, reg2, addr, post_inc @@ -175,11 +172,8 @@ void apply_alternatives(void *start, size_t length); add \addr, \addr, \post_inc; alternative_endif - .section __ex_table,"a"; - .align 3; - .quad 8888b,\l; - .quad 8889b,\l; - .previous + _asm_extable 8888b,\l; + _asm_extable 8889b,\l; .endm .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc @@ -191,10 +185,7 @@ void apply_alternatives(void *start, size_t length); add \addr, \addr, \post_inc; alternative_endif - .section __ex_table,"a"; - .align 3; - .quad 8888b,\l; - .previous + _asm_extable 8888b,\l; .endm #else .macro uao_ldp l, reg1, reg2, addr, post_inc diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index ba5aff6c830e..70f7b9e04598 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -94,12 +94,19 @@ dmb \opt .endm +/* + * Emit an entry into the exception table + */ + .macro _asm_extable, from, to + .pushsection __ex_table, "a" + .align 3 + .long (\from - .), (\to - .) + .popsection + .endm + #define USER(l, x...) \ 9999: x; \ - .section __ex_table,"a"; \ - .align 3; \ - .quad 9999b,l; \ - .previous + _asm_extable 9999b, l /* * Register aliases. 
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index 5f3ab8c1db55..f2585cdd32c2 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -42,10 +42,8 @@ "4: mov %w0, %w5\n" \ " b 3b\n" \ " .popsection\n" \ -" .pushsection __ex_table,\"a\"\n" \ -" .align 3\n" \ -" .quad 1b, 4b, 2b, 4b\n" \ -" .popsection\n" \ + _ASM_EXTABLE(1b, 4b) \ + _ASM_EXTABLE(2b, 4b) \ ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) \ : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \ @@ -134,10 +132,8 @@ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) "4: mov %w0, %w6\n" " b 3b\n" " .popsection\n" -" .pushsection __ex_table,\"a\"\n" -" .align 3\n" -" .quad 1b, 4b, 2b, 4b\n" -" .popsection\n" + _ASM_EXTABLE(1b, 4b) + _ASM_EXTABLE(2b, 4b) ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) : "r" (oldval), "r" (newval), "Ir" (-EFAULT) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 7b07e73b6316..c3d445b42351 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -36,11 +36,11 @@ #define VERIFY_WRITE 1 /* - * The exception table consists of pairs of addresses: the first is the - * address of an instruction that is allowed to fault, and the second is - * the address at which the program should continue. No registers are - * modified, so it is entirely up to the continuation code to figure out - * what to do. + * The exception table consists of pairs of relative offsets: the first + * is the relative offset to an instruction that is allowed to fault, + * and the second is the relative offset at which the program should + * continue. No registers are modified, so it is entirely up to the + * continuation code to figure out what to do. * * All the routines below use bits of fixup code that are out of line * with the main instruction path. 
This means when everything is well, @@ -50,9 +50,11 @@ struct exception_table_entry { - unsigned long insn, fixup; + int insn, fixup; }; +#define ARCH_HAS_RELATIVE_EXTABLE + extern int fixup_exception(struct pt_regs *regs); #define KERNEL_DS (-1UL) @@ -115,6 +117,12 @@ static inline void set_fs(mm_segment_t fs) #define access_ok(type, addr, size) __range_ok(addr, size) #define user_addr_max get_fs +#define _ASM_EXTABLE(from, to) \ + " .pushsection __ex_table, \"a\"\n" \ + " .align 3\n" \ + " .long (" #from " - .), (" #to " - .)\n" \ + " .popsection\n" + /* * The "__xxx" versions of the user access functions do not verify the address * space - it must have been done previously with a separate "access_ok()" @@ -134,10 +142,7 @@ static inline void set_fs(mm_segment_t fs) " mov %1, #0\n" \ " b 2b\n" \ " .previous\n" \ - " .section __ex_table,\"a\"\n" \ - " .align 3\n" \ - " .quad 1b, 3b\n" \ - " .previous" \ + _ASM_EXTABLE(1b, 3b) \ : "+r" (err), "=&r" (x) \ : "r" (addr), "i" (-EFAULT)) @@ -206,10 +211,7 @@ do { \ "3: mov %w0, %3\n" \ " b 2b\n" \ " .previous\n" \ - " .section __ex_table,\"a\"\n" \ - " .align 3\n" \ - " .quad 1b, 3b\n" \ - " .previous" \ + _ASM_EXTABLE(1b, 3b) \ : "+r" (err) \ : "r" (x), "r" (addr), "i" (-EFAULT)) diff --git a/arch/arm64/include/asm/word-at-a-time.h b/arch/arm64/include/asm/word-at-a-time.h index aab5bf09e9d9..2b79b8a89457 100644 --- a/arch/arm64/include/asm/word-at-a-time.h +++ b/arch/arm64/include/asm/word-at-a-time.h @@ -16,6 +16,8 @@ #ifndef __ASM_WORD_AT_A_TIME_H #define __ASM_WORD_AT_A_TIME_H +#include + #ifndef __AARCH64EB__ #include @@ -81,10 +83,7 @@ static inline unsigned long load_unaligned_zeropad(const void *addr) #endif " b 2b\n" " .popsection\n" - " .pushsection __ex_table,\"a\"\n" - " .align 3\n" - " .quad 1b, 3b\n" - " .popsection" + _ASM_EXTABLE(1b, 3b) : "=&r" (ret), "=&r" (offset) : "r" (addr), "Q" (*(unsigned long *)addr)); diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index 3e01207917b1..c37202c0c838 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -297,11 +297,8 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) "4: mov %w0, %w5\n" \ " b 3b\n" \ " .popsection" \ - " .pushsection __ex_table,\"a\"\n" \ - " .align 3\n" \ - " .quad 0b, 4b\n" \ - " .quad 1b, 4b\n" \ - " .popsection\n" \ + _ASM_EXTABLE(0b, 4b) \ + _ASM_EXTABLE(1b, 4b) \ ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) \ : "=&r" (res), "+r" (data), "=&r" (temp) \ diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c index 79444279ba8c..81acd4706878 100644 --- a/arch/arm64/mm/extable.c +++ b/arch/arm64/mm/extable.c @@ -11,7 +11,7 @@ int fixup_exception(struct pt_regs *regs) fixup = search_exception_tables(instruction_pointer(regs)); if (fixup) - regs->pc = fixup->fixup; + regs->pc = (unsigned long)&fixup->fixup + fixup->fixup; return fixup != NULL; } diff --git a/scripts/sortextable.c b/scripts/sortextable.c index ecefa0a634f8..19d83647846c 100644 --- a/scripts/sortextable.c +++ b/scripts/sortextable.c @@ -282,12 +282,12 @@ do_file(char const *const fname) case EM_386: case EM_X86_64: case EM_S390: + case EM_AARCH64: custom_sort = sort_relative_table; break; case EM_ARCOMPACT: case EM_ARCV2: case EM_ARM: - case EM_AARCH64: case EM_MICROBLAZE: case EM_MIPS: case EM_XTENSA: From 01684359230797235a0bdbd9768241c41f990105 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 26 Jan 2016 09:13:44 +0100 Subject: [PATCH 481/607] 
UPSTREAM: arm64: add support for building vmlinux as a relocatable PIE binary This implements CONFIG_RELOCATABLE, which links the final vmlinux image with a dynamic relocation section, allowing the early boot code to perform a relocation to a different virtual address at runtime. This is a prerequisite for KASLR (CONFIG_RANDOMIZE_BASE). Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 1e48ef7fcc374051730381a2a05da77eb4eafdb0) Signed-off-by: Jeff Vander Stoep Change-Id: If02e065722d438f85feb62240fc230e16f58e912 --- arch/arm64/Kconfig | 11 +++++++++++ arch/arm64/Makefile | 4 ++++ arch/arm64/include/asm/elf.h | 2 ++ arch/arm64/kernel/head.S | 32 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/vmlinux.lds.S | 16 ++++++++++++++++ 5 files changed, 65 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 65a8e7c9fdd1..1faf7905f0fa 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -771,6 +771,17 @@ config ARM64_MODULE_PLTS select ARM64_MODULE_CMODEL_LARGE select HAVE_MOD_ARCH_SPECIFIC +config RELOCATABLE + bool + help + This builds the kernel as a Position Independent Executable (PIE), + which retains all relocation metadata required to relocate the + kernel binary at runtime to a different virtual address than the + address it was linked at. + Since AArch64 uses the RELA relocation format, this requires a + relocation pass at runtime even if the kernel is loaded at the + same address it was linked at. + endmenu menu "Boot options" diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 763509ff6f24..47f61e9b8fec 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -15,6 +15,10 @@ CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET) OBJCOPYFLAGS :=-O binary -R .note -R .note.gnu.build-id -R .comment -S GZFLAGS :=-9 +ifneq ($(CONFIG_RELOCATABLE),) +LDFLAGS_vmlinux += -pie +endif + KBUILD_DEFCONFIG := defconfig # Check for binutils support for specific extensions diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 435f55952e1f..24ed037f09fd 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -77,6 +77,8 @@ #define R_AARCH64_MOVW_PREL_G2_NC 292 #define R_AARCH64_MOVW_PREL_G3 293 +#define R_AARCH64_RELATIVE 1027 + /* * These are used to set parameters in the core dumps. */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 4cad8f9f2268..4e69412a7323 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -432,6 +433,37 @@ __mmap_switched: bl __pi_memset dsb ishst // Make zero page visible to PTW +#ifdef CONFIG_RELOCATABLE + + /* + * Iterate over each entry in the relocation table, and apply the + * relocations in place. + */ + adr_l x8, __dynsym_start // start of symbol table + adr_l x9, __reloc_start // start of reloc table + adr_l x10, __reloc_end // end of reloc table + +0: cmp x9, x10 + b.hs 2f + ldp x11, x12, [x9], #24 + ldr x13, [x9, #-8] + cmp w12, #R_AARCH64_RELATIVE + b.ne 1f + str x13, [x11] + b 0b + +1: cmp w12, #R_AARCH64_ABS64 + b.ne 0b + add x12, x12, x12, lsl #1 // symtab offset: 24x top word + add x12, x8, x12, lsr #(32 - 3) // ... 
shifted into bottom word + ldr x15, [x12, #8] // Elf64_Sym::st_value + add x15, x13, x15 + str x15, [x11] + b 0b + +2: +#endif + adr_l sp, initial_sp, x4 mov x4, sp and x4, x4, #~(THREAD_SIZE - 1) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 7703feba23d6..14813a6ca13b 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -87,6 +87,7 @@ SECTIONS EXIT_CALL *(.discard) *(.discard.*) + *(.interp .dynamic) } . = KIMAGE_VADDR + TEXT_OFFSET; @@ -151,6 +152,21 @@ SECTIONS .altinstr_replacement : { *(.altinstr_replacement) } + .rela : ALIGN(8) { + __reloc_start = .; + *(.rela .rela*) + __reloc_end = .; + } + .dynsym : ALIGN(8) { + __dynsym_start = .; + *(.dynsym) + } + .dynstr : { + *(.dynstr) + } + .hash : { + *(.hash) + } . = ALIGN(PAGE_SIZE); __init_end = .; From a91a00b66c794c81d7afd04f9d2a5385952a24e1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 26 Jan 2016 14:12:01 +0100 Subject: [PATCH 482/607] BACKPORT: arm64: add support for kernel ASLR This adds support for KASLR is implemented, based on entropy provided by the bootloader in the /chosen/kaslr-seed DT property. Depending on the size of the address space (VA_BITS) and the page size, the entropy in the virtual displacement is up to 13 bits (16k/2 levels) and up to 25 bits (all 4 levels), with the sidenote that displacements that result in the kernel image straddling a 1GB/32MB/512MB alignment boundary (for 4KB/16KB/64KB granule kernels, respectively) are not allowed, and will be rounded up to an acceptable value. If CONFIG_RANDOMIZE_MODULE_REGION_FULL is enabled, the module region is randomized independently from the core kernel. This makes it less likely that the location of core kernel data structures can be determined by an adversary, but causes all function calls from modules into the core kernel to be resolved via entries in the module PLTs. If CONFIG_RANDOMIZE_MODULE_REGION_FULL is not enabled, the module region is randomized by choosing a page aligned 128 MB region inside the interval [_etext - 128 MB, _stext + 128 MB). This gives between 10 and 14 bits of entropy (depending on page size), independently of the kernel randomization, but still guarantees that modules are within the range of relative branch and jump instructions (with the caveat that, since the module region is shared with other uses of the vmalloc area, modules may need to be loaded further away if the module region is exhausted) Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit f80fb3a3d50843a401dac4b566b3b131da8077a2) Signed-off-by: Jeff Vander Stoep Change-Id: I3f5fafa4e92e5ff39259d57065541366237eb021 --- arch/arm64/Kconfig | 29 ++++++ arch/arm64/include/asm/memory.h | 5 +- arch/arm64/include/asm/module.h | 6 ++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/head.S | 59 +++++++++-- arch/arm64/kernel/kaslr.c | 173 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/module.c | 3 +- arch/arm64/kernel/setup.c | 29 ++++++ arch/arm64/mm/kasan_init.c | 17 +++- arch/arm64/mm/mmu.c | 29 ++++-- 10 files changed, 329 insertions(+), 22 deletions(-) create mode 100644 arch/arm64/kernel/kaslr.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1faf7905f0fa..a42e3235484b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -782,6 +782,35 @@ config RELOCATABLE relocation pass at runtime even if the kernel is loaded at the same address it was linked at. 
+config RANDOMIZE_BASE + bool "Randomize the address of the kernel image" + select ARM64_MODULE_PLTS + select RELOCATABLE + help + Randomizes the virtual address at which the kernel image is + loaded, as a security feature that deters exploit attempts + relying on knowledge of the location of kernel internals. + + It is the bootloader's job to provide entropy, by passing a + random u64 value in /chosen/kaslr-seed at kernel entry. + + If unsure, say N. + +config RANDOMIZE_MODULE_REGION_FULL + bool "Randomize the module region independently from the core kernel" + depends on RANDOMIZE_BASE + default y + help + Randomizes the location of the module region without considering the + location of the core kernel. This way, it is impossible for modules + to leak information about the location of core kernel data structures + but it does imply that function calls between modules and the core + kernel will need to be resolved via veneers in the module PLT. + + When this option is not set, the module region will be randomized over + a limited range that contains the [_stext, _etext] interval of the + core kernel, so branch relocations are always in range. + endmenu menu "Boot options" diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index eb798156cf56..5f8667a99e41 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -53,7 +53,7 @@ #define KIMAGE_VADDR (MODULES_END) #define MODULES_END (MODULES_VADDR + MODULES_VSIZE) #define MODULES_VADDR (VA_START + KASAN_SHADOW_SIZE) -#define MODULES_VSIZE (SZ_64M) +#define MODULES_VSIZE (SZ_128M) #define PCI_IO_END (PAGE_OFFSET - SZ_2M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) #define FIXADDR_TOP (PCI_IO_START - SZ_2M) @@ -139,6 +139,9 @@ extern phys_addr_t memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. 
*/ #define PHYS_OFFSET ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; }) +/* the virtual base of the kernel image (minus TEXT_OFFSET) */ +extern u64 kimage_vaddr; + /* the offset between the kernel virtual and physical mappings */ extern u64 kimage_voffset; diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h index 8652fb613304..e12af6754634 100644 --- a/arch/arm64/include/asm/module.h +++ b/arch/arm64/include/asm/module.h @@ -31,4 +31,10 @@ struct mod_arch_specific { u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela, Elf64_Sym *sym); +#ifdef CONFIG_RANDOMIZE_BASE +extern u64 module_alloc_base; +#else +#define module_alloc_base ((u64)_etext - MODULES_VSIZE) +#endif + #endif /* __ASM_MODULE_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 8d971f9c6ed5..49a2430b0786 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -43,6 +43,7 @@ arm64-obj-$(CONFIG_PCI) += pci.o arm64-obj-$(CONFIG_ARMV8_DEPRECATED) += armv8_deprecated.o arm64-obj-$(CONFIG_ACPI) += acpi.o arm64-obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o +arm64-obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-y += $(arm64-obj-y) vdso/ obj-m += $(arm64-obj-m) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 4e69412a7323..319f896c6e74 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -210,6 +210,7 @@ section_table: ENTRY(stext) bl preserve_boot_args bl el2_setup // Drop to EL1, w20=cpu_boot_mode + mov x23, xzr // KASLR offset, defaults to 0 adrp x24, __PHYS_OFFSET bl set_cpu_boot_mode_flag bl __create_page_tables // x25=TTBR0, x26=TTBR1 @@ -313,7 +314,7 @@ ENDPROC(preserve_boot_args) __create_page_tables: adrp x25, idmap_pg_dir adrp x26, swapper_pg_dir - mov x27, lr + mov x28, lr /* * Invalidate the idmap and swapper page tables to avoid potential @@ -392,6 +393,7 @@ __create_page_tables: */ mov x0, x26 // swapper_pg_dir ldr x5, =KIMAGE_VADDR + add x5, x5, x23 // add KASLR displacement create_pgd_entry x0, x5, x3, x6 ldr w6, kernel_img_size add x6, x6, x5 @@ -408,8 +410,7 @@ __create_page_tables: dmb sy bl __inval_cache_range - mov lr, x27 - ret + ret x28 ENDPROC(__create_page_tables) kernel_img_size: @@ -421,6 +422,7 @@ kernel_img_size: */ .set initial_sp, init_thread_union + THREAD_START_SP __mmap_switched: + mov x28, lr // preserve LR adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address isb @@ -449,19 +451,26 @@ __mmap_switched: ldr x13, [x9, #-8] cmp w12, #R_AARCH64_RELATIVE b.ne 1f - str x13, [x11] + add x13, x13, x23 // relocate + str x13, [x11, x23] b 0b 1: cmp w12, #R_AARCH64_ABS64 b.ne 0b add x12, x12, x12, lsl #1 // symtab offset: 24x top word add x12, x8, x12, lsr #(32 - 3) // ... shifted into bottom word + ldrsh w14, [x12, #6] // Elf64_Sym::st_shndx ldr x15, [x12, #8] // Elf64_Sym::st_value + cmp w14, #-0xf // SHN_ABS (0xfff1) ? 
+ add x14, x15, x23 // relocate + csel x15, x14, x15, ne add x15, x13, x15 - str x15, [x11] + str x15, [x11, x23] b 0b -2: +2: adr_l x8, kimage_vaddr // make relocated kimage_vaddr + dc cvac, x8 // value visible to secondaries + dsb sy // with MMU off #endif adr_l sp, initial_sp, x4 @@ -470,13 +479,23 @@ __mmap_switched: msr sp_el0, x4 // Save thread_info str_l x21, __fdt_pointer, x5 // Save FDT pointer - ldr x4, =KIMAGE_VADDR // Save the offset between + ldr_l x4, kimage_vaddr // Save the offset between sub x4, x4, x24 // the kernel virtual and str_l x4, kimage_voffset, x5 // physical mappings mov x29, #0 #ifdef CONFIG_KASAN bl kasan_early_init +#endif +#ifdef CONFIG_RANDOMIZE_BASE + cbnz x23, 0f // already running randomized? + mov x0, x21 // pass FDT address in x0 + bl kaslr_early_init // parse FDT for KASLR options + cbz x0, 0f // KASLR disabled? just proceed + mov x23, x0 // record KASLR offset + ret x28 // we must enable KASLR, return + // to __enable_mmu() +0: #endif b start_kernel ENDPROC(__mmap_switched) @@ -486,6 +505,10 @@ ENDPROC(__mmap_switched) * hotplug and needs to have the same protections as the text region */ .section ".text","ax" + +ENTRY(kimage_vaddr) + .quad _text - TEXT_OFFSET + /* * If we're fortunate enough to boot at EL2, ensure that the world is * sane before dropping to EL1. @@ -651,7 +674,7 @@ ENTRY(secondary_startup) adrp x26, swapper_pg_dir bl __cpu_setup // initialise processor - ldr x8, =KIMAGE_VADDR + ldr x8, kimage_vaddr ldr w9, 0f sub x27, x8, w9, sxtw // address to jump to after enabling the MMU b __enable_mmu @@ -684,6 +707,7 @@ ENDPROC(__secondary_switched) */ .section ".idmap.text", "ax" __enable_mmu: + mrs x18, sctlr_el1 // preserve old SCTLR_EL1 value mrs x1, ID_AA64MMFR0_EL1 ubfx x2, x1, #ID_AA64MMFR0_TGRAN_SHIFT, 4 cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED @@ -701,6 +725,25 @@ __enable_mmu: ic iallu dsb nsh isb +#ifdef CONFIG_RANDOMIZE_BASE + mov x19, x0 // preserve new SCTLR_EL1 value + blr x27 + + /* + * If we return here, we have a KASLR displacement in x23 which we need + * to take into account by discarding the current kernel mapping and + * creating a new one. + */ + msr sctlr_el1, x18 // disable the MMU + isb + bl __create_page_tables // recreate kernel mapping + + msr sctlr_el1, x19 // re-enable the MMU + isb + ic ialluis // flush instructions fetched + isb // via old mapping + add x27, x27, x23 // relocated __mmap_switched +#endif br x27 ENDPROC(__enable_mmu) diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c new file mode 100644 index 000000000000..8b32a1f8f09f --- /dev/null +++ b/arch/arm64/kernel/kaslr.c @@ -0,0 +1,173 @@ +/* + * Copyright (C) 2016 Linaro Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +u64 __read_mostly module_alloc_base; + +static __init u64 get_kaslr_seed(void *fdt) +{ + int node, len; + u64 *prop; + u64 ret; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + return 0; + + prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len); + if (!prop || len != sizeof(u64)) + return 0; + + ret = fdt64_to_cpu(*prop); + *prop = 0; + return ret; +} + +static __init const u8 *get_cmdline(void *fdt) +{ + static __initconst const u8 default_cmdline[] = CONFIG_CMDLINE; + + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { + int node; + const u8 *prop; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + goto out; + + prop = fdt_getprop(fdt, node, "bootargs", NULL); + if (!prop) + goto out; + return prop; + } +out: + return default_cmdline; +} + +extern void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, + pgprot_t prot); + +/* + * This routine will be executed with the kernel mapped at its default virtual + * address, and if it returns successfully, the kernel will be remapped, and + * start_kernel() will be executed from a randomized virtual offset. The + * relocation will result in all absolute references (e.g., static variables + * containing function pointers) to be reinitialized, and zero-initialized + * .bss variables will be reset to 0. + */ +u64 __init kaslr_early_init(u64 dt_phys) +{ + void *fdt; + u64 seed, offset, mask, module_range; + const u8 *cmdline, *str; + int size; + + /* + * Set a reasonable default for module_alloc_base in case + * we end up running with module randomization disabled. + */ + module_alloc_base = (u64)_etext - MODULES_VSIZE; + + /* + * Try to map the FDT early. If this fails, we simply bail, + * and proceed with KASLR disabled. We will make another + * attempt at mapping the FDT in setup_machine() + */ + early_fixmap_init(); + fdt = __fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL); + if (!fdt) + return 0; + + /* + * Retrieve (and wipe) the seed from the FDT + */ + seed = get_kaslr_seed(fdt); + if (!seed) + return 0; + + /* + * Check if 'nokaslr' appears on the command line, and + * return 0 if that is the case. + */ + cmdline = get_cmdline(fdt); + str = strstr(cmdline, "nokaslr"); + if (str == cmdline || (str > cmdline && *(str - 1) == ' ')) + return 0; + + /* + * OK, so we are proceeding with KASLR enabled. Calculate a suitable + * kernel image offset from the seed. Let's place the kernel in the + * lower half of the VMALLOC area (VA_BITS - 2). + * Even if we could randomize at page granularity for 16k and 64k pages, + * let's always round to 2 MB so we don't interfere with the ability to + * map using contiguous PTEs + */ + mask = ((1UL << (VA_BITS - 2)) - 1) & ~(SZ_2M - 1); + offset = seed & mask; + + /* + * The kernel Image should not extend across a 1GB/32MB/512MB alignment + * boundary (for 4KB/16KB/64KB granule kernels, respectively). If this + * happens, increase the KASLR offset by the size of the kernel image. + */ + if ((((u64)_text + offset) >> SWAPPER_TABLE_SHIFT) != + (((u64)_end + offset) >> SWAPPER_TABLE_SHIFT)) + offset = (offset + (u64)(_end - _text)) & mask; + + if (IS_ENABLED(CONFIG_KASAN)) + /* + * KASAN does not expect the module region to intersect the + * vmalloc region, since shadow memory is allocated for each + * module at load time, whereas the vmalloc region is shadowed + * by KASAN zero pages. So keep modules out of the vmalloc + * region if KASAN is enabled. 
+ */ + return offset; + + if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) { + /* + * Randomize the module region independently from the core + * kernel. This prevents modules from leaking any information + * about the address of the kernel itself, but results in + * branches between modules and the core kernel that are + * resolved via PLTs. (Branches between modules will be + * resolved normally.) + */ + module_range = VMALLOC_END - VMALLOC_START - MODULES_VSIZE; + module_alloc_base = VMALLOC_START; + } else { + /* + * Randomize the module region by setting module_alloc_base to + * a PAGE_SIZE multiple in the range [_etext - MODULES_VSIZE, + * _stext) . This guarantees that the resulting region still + * covers [_stext, _etext], and that all relative branches can + * be resolved without veneers. + */ + module_range = MODULES_VSIZE - (u64)(_etext - _stext); + module_alloc_base = (u64)_etext + offset - MODULES_VSIZE; + } + + /* use the lower 21 bits to randomize the base of the module region */ + module_alloc_base += (module_range * (seed & ((1 << 21) - 1))) >> 21; + module_alloc_base &= PAGE_MASK; + + return offset; +} diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index a9dde97f5ca5..7f316982ce00 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -34,7 +34,8 @@ void *module_alloc(unsigned long size) { void *p; - p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, + p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, + module_alloc_base + MODULES_VSIZE, GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 079db3ba21c4..29b8c247d56f 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -388,3 +388,32 @@ static int __init topology_init(void) return 0; } subsys_initcall(topology_init); + +/* + * Dump out kernel offset information on panic. + */ +static int dump_kernel_offset(struct notifier_block *self, unsigned long v, + void *p) +{ + u64 const kaslr_offset = kimage_vaddr - KIMAGE_VADDR; + + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_offset > 0) { + pr_emerg("Kernel Offset: 0x%llx from 0x%lx\n", + kaslr_offset, KIMAGE_VADDR); + } else { + pr_emerg("Kernel Offset: disabled\n"); + } + return 0; +} + +static struct notifier_block kernel_offset_notifier = { + .notifier_call = dump_kernel_offset +}; + +static int __init register_kernel_offset_dumper(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, + &kernel_offset_notifier); + return 0; +} +__initcall(register_kernel_offset_dumper); diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 7f10cc91fa8a..56e19d150c21 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -129,12 +129,16 @@ static void __init clear_pgds(unsigned long start, void __init kasan_init(void) { u64 kimg_shadow_start, kimg_shadow_end; + u64 mod_shadow_start, mod_shadow_end; struct memblock_region *reg; int i; kimg_shadow_start = (u64)kasan_mem_to_shadow(_text); kimg_shadow_end = (u64)kasan_mem_to_shadow(_end); + mod_shadow_start = (u64)kasan_mem_to_shadow((void *)MODULES_VADDR); + mod_shadow_end = (u64)kasan_mem_to_shadow((void *)MODULES_END); + /* * We are going to perform proper setup of shadow memory. * At first we should unmap early shadow (clear_pgds() call bellow). @@ -158,13 +162,20 @@ void __init kasan_init(void) * with PMD table mappings at the edges of the shadow region for the * kernel image. 
*/ - if (ARM64_SWAPPER_USES_SECTION_MAPS) + if (ARM64_SWAPPER_USES_SECTION_MAPS) { + kimg_shadow_start = round_down(kimg_shadow_start, + SWAPPER_BLOCK_SIZE); kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE); + } kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, - kasan_mem_to_shadow((void *)MODULES_VADDR)); + (void *)mod_shadow_start); kasan_populate_zero_shadow((void *)kimg_shadow_end, - kasan_mem_to_shadow((void *)PAGE_OFFSET)); + kasan_mem_to_shadow((void *)PAGE_OFFSET)); + + if (kimg_shadow_start > mod_shadow_end) + kasan_populate_zero_shadow((void *)mod_shadow_end, + (void *)kimg_shadow_start); for_each_memblock(memory, reg) { void *start = (void *)__phys_to_virt(reg->base); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 8577437b6c0a..f4d1da0c1531 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -676,7 +676,8 @@ void __init early_fixmap_init(void) unsigned long addr = FIXADDR_START; pgd = pgd_offset_k(addr); - if (CONFIG_PGTABLE_LEVELS > 3 && !pgd_none(*pgd)) { + if (CONFIG_PGTABLE_LEVELS > 3 && + !(pgd_none(*pgd) || pgd_page_paddr(*pgd) == __pa(bm_pud))) { /* * We only end up here if the kernel mapping and the fixmap * share the top level pgd entry, which should only happen on @@ -733,11 +734,10 @@ void __set_fixmap(enum fixed_addresses idx, } } -void *__init fixmap_remap_fdt(phys_addr_t dt_phys) +void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot) { const u64 dt_virt_base = __fix_to_virt(FIX_FDT); - pgprot_t prot = PAGE_KERNEL_RO; - int size, offset; + int offset; void *dt_virt; /* @@ -776,16 +776,27 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) if (fdt_check_header(dt_virt) != 0) return NULL; - size = fdt_totalsize(dt_virt); - if (size > MAX_FDT_SIZE) + *size = fdt_totalsize(dt_virt); + if (*size > MAX_FDT_SIZE) return NULL; - if (offset + size > SWAPPER_BLOCK_SIZE) + if (offset + *size > SWAPPER_BLOCK_SIZE) create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base, - round_up(offset + size, SWAPPER_BLOCK_SIZE), prot); + round_up(offset + *size, SWAPPER_BLOCK_SIZE), prot); + + return dt_virt; +} + +void *__init fixmap_remap_fdt(phys_addr_t dt_phys) +{ + void *dt_virt; + int size; + + dt_virt = __fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL_RO); + if (!dt_virt) + return NULL; memblock_reserve(dt_phys, size); - return dt_virt; } From 09cbceb939d85d7d25b8e424445a2f83cac97c9d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 26 Feb 2016 17:57:13 +0100 Subject: [PATCH 483/607] UPSTREAM: arm64: vmemmap: use virtual projection of linear region Commit dd006da21646 ("arm64: mm: increase VA range of identity map") made some changes to the memory mapping code to allow physical memory to reside at an offset that exceeds the size of the virtual mapping. However, since the size of the vmemmap area is proportional to the size of the VA area, but it is populated relative to the physical space, we may end up with the struct page array being mapped outside of the vmemmap region. For instance, on my Seattle A0 box, I can see the following output in the dmesg log. vmemmap : 0xffffffbdc0000000 - 0xffffffbfc0000000 ( 8 GB maximum) 0xffffffbfc0000000 - 0xffffffbfd0000000 ( 256 MB actual) We can fix this by deciding that the vmemmap region is not a projection of the physical space, but of the virtual space above PAGE_OFFSET, i.e., the linear region. This way, we are guaranteed that the vmemmap region is of sufficient size, and we can even reduce the size by half. 
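To make the size reduction concrete, a back-of-the-envelope calculation (assuming 4 KB pages, VA_BITS == 39 and a 64-byte struct page, consistent with the 8 GB maximum shown in the dmesg output above):

  #include <stdio.h>

  int main(void)
  {
          unsigned long va_bits = 39, page_shift = 12, page_sz = 64;

          /* old: covers the whole VA space; new: only the linear half */
          unsigned long old = (1UL << (va_bits - page_shift)) * page_sz;
          unsigned long new = (1UL << (va_bits - page_shift - 1)) * page_sz;

          printf("old vmemmap maximum: %lu GB\n", old >> 30);  /* 8 GB */
          printf("new vmemmap maximum: %lu GB\n", new >> 30);  /* 4 GB */
          return 0;
  }
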
Cc: Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit dfd55ad85e4a7fbaa82df12467515ac3c81e8a3e) Signed-off-by: Jeff Vander Stoep Change-Id: I8112d910f9659941dab6de5b3791f395150c77f1 --- arch/arm64/include/asm/pgtable.h | 7 ++++--- arch/arm64/mm/init.c | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 9bf72b72b76d..d48ea8fc789d 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -34,18 +34,19 @@ /* * VMALLOC and SPARSEMEM_VMEMMAP ranges. * - * VMEMAP_SIZE: allows the whole VA space to be covered by a struct page array + * VMEMAP_SIZE: allows the whole linear region to be covered by a struct page array * (rounded up to PUD_SIZE). * VMALLOC_START: beginning of the kernel vmalloc space * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space, * fixed mappings and modules */ -#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE) +#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT - 1)) * sizeof(struct page), PUD_SIZE) #define VMALLOC_START (MODULES_END) #define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K) -#define vmemmap ((struct page *)(VMALLOC_END + SZ_64K)) +#define VMEMMAP_START (VMALLOC_END + SZ_64K) +#define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) #define FIRST_USER_ADDRESS 0UL diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index e333f7f0b4f2..04c7e8ba47ce 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -365,8 +365,8 @@ void __init mem_init(void) MLK_ROUNDUP(_text, _etext), MLK_ROUNDUP(_sdata, _edata), #ifdef CONFIG_SPARSEMEM_VMEMMAP - MLG((unsigned long)vmemmap, - (unsigned long)vmemmap + VMEMMAP_SIZE), + MLG(VMEMMAP_START, + VMEMMAP_START + VMEMMAP_SIZE), MLM((unsigned long)virt_to_page(PAGE_OFFSET), (unsigned long)virt_to_page(high_memory)), #endif From ee97096e89bbb5f39614638bd32fded97eb08691 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 26 Feb 2016 17:57:14 +0100 Subject: [PATCH 484/607] UPSTREAM: arm64: mm: treat memstart_addr as a signed quantity Commit c031a4213c11 ("arm64: kaslr: randomize the linear region") implements randomization of the linear region, by subtracting a random multiple of PUD_SIZE from memstart_addr. This causes the virtual mapping of system RAM to move upwards in the linear region, and at the same time causes memstart_addr to assume a value which may be negative if the offset of system RAM in the physical space is smaller than its offset relative to PAGE_OFFSET in the virtual space. Since memstart_addr is effectively an offset now, redefine its type as s64 so that expressions involving shifting or division preserve its sign. 
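A minimal sketch (not part of the patch) of why the signedness matters; the -1 GB value below is an arbitrary example offset, and the snippet assumes an arithmetic right shift for signed types, as GCC provides and the kernel relies on.

#include <stdio.h>

int main(void)
{
	long long s64_start = -0x40000000LL;	/* example: memstart_addr of -1 GB */
	unsigned long long u64_start = (unsigned long long)s64_start;

	/* with s64, shifting preserves the sign of the offset... */
	printf("s64 memstart_addr >> 12 = %lld\n", s64_start >> 12);	/* -262144 */
	/* ...with the old phys_addr_t/u64 type, the same bits become a huge bogus PFN */
	printf("u64 memstart_addr >> 12 = %llu\n", u64_start >> 12);	/* 4503599627108352 */
	return 0;
}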
Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 020d044f66874eba058ce8264fc550f3eca67879) Signed-off-by: Jeff Vander Stoep Change-Id: I0482ebc13baaa9005cf372795e656c2417be9d1c --- arch/arm64/include/asm/memory.h | 2 +- arch/arm64/mm/init.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 5f8667a99e41..12f8a00fb3f1 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -135,7 +135,7 @@ #include #include -extern phys_addr_t memstart_addr; +extern s64 memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. */ #define PHYS_OFFSET ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; }) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 04c7e8ba47ce..3d75ff40c5ff 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -54,7 +54,7 @@ * executes, which assigns it its actual value. So use a default value * that cannot be mistaken for a real physical address. */ -phys_addr_t memstart_addr __read_mostly = ~0ULL; +s64 memstart_addr __read_mostly = -1; phys_addr_t arm64_dma_phys_limit __read_mostly; #ifdef CONFIG_BLK_DEV_INITRD @@ -183,7 +183,7 @@ void __init arm64_memblock_init(void) * linear mapping. Take care not to clip the kernel which may be * high in memory. */ - memblock_remove(max(memstart_addr + linear_region_size, __pa(_end)), + memblock_remove(max_t(u64, memstart_addr + linear_region_size, __pa(_end)), ULLONG_MAX); if (memblock_end_of_DRAM() > linear_region_size) memblock_remove(0, memblock_end_of_DRAM() - linear_region_size); From 22346f1e99e4ed88ed8f68d95875a92395c3fea4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 29 Jan 2016 11:59:03 +0100 Subject: [PATCH 485/607] BACKPORT: arm64: kaslr: randomize the linear region When KASLR is enabled (CONFIG_RANDOMIZE_BASE=y), and entropy has been provided by the bootloader, randomize the placement of RAM inside the linear region if sufficient space is available. For instance, on a 4KB granule/3 levels kernel, the linear region is 256 GB in size, and we can choose any 1 GB aligned offset that is far enough from the top of the address space to fit the distance between the start of the lowest memblock and the top of the highest memblock. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 031a4213c11a5db475f528c182f7b3858df11db) Signed-off-by: Jeff Vander Stoep Change-Id: I272de8ee358351d95eacc7dc5f47600adec3e813 --- arch/arm64/kernel/kaslr.c | 4 ++++ arch/arm64/mm/init.c | 22 ++++++++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 8b32a1f8f09f..582983920054 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -21,6 +21,7 @@ #include u64 __read_mostly module_alloc_base; +u16 __initdata memstart_offset_seed; static __init u64 get_kaslr_seed(void *fdt) { @@ -123,6 +124,9 @@ u64 __init kaslr_early_init(u64 dt_phys) mask = ((1UL << (VA_BITS - 2)) - 1) & ~(SZ_2M - 1); offset = seed & mask; + /* use the top 16 bits to randomize the linear region */ + memstart_offset_seed = seed >> 48; + /* * The kernel Image should not extend across a 1GB/32MB/512MB alignment * boundary (for 4KB/16KB/64KB granule kernels, respectively). 
If this diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 3d75ff40c5ff..8e3cf19e4f00 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -198,6 +198,23 @@ void __init arm64_memblock_init(void) memblock_add(__pa(_text), (u64)(_end - _text)); } + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + extern u16 memstart_offset_seed; + u64 range = linear_region_size - + (memblock_end_of_DRAM() - memblock_start_of_DRAM()); + + /* + * If the size of the linear region exceeds, by a sufficient + * margin, the size of the region that the available physical + * memory spans, randomize the linear region as well. + */ + if (memstart_offset_seed > 0 && range >= ARM64_MEMSTART_ALIGN) { + range = range / ARM64_MEMSTART_ALIGN + 1; + memstart_addr -= ARM64_MEMSTART_ALIGN * + ((range * memstart_offset_seed) >> 16); + } + } + /* * Register the kernel text, kernel data, initrd, and initial * pagetables with memblock. @@ -367,12 +384,13 @@ void __init mem_init(void) #ifdef CONFIG_SPARSEMEM_VMEMMAP MLG(VMEMMAP_START, VMEMMAP_START + VMEMMAP_SIZE), - MLM((unsigned long)virt_to_page(PAGE_OFFSET), + MLM((unsigned long)phys_to_page(memblock_start_of_DRAM()), (unsigned long)virt_to_page(high_memory)), #endif MLK(FIXADDR_START, FIXADDR_TOP), MLM(PCI_IO_START, PCI_IO_END), - MLM(PAGE_OFFSET, (unsigned long)high_memory)); + MLM(__phys_to_virt(memblock_start_of_DRAM()), + (unsigned long)high_memory)); #undef MLK #undef MLM From 39b0815e4872fe6eb20a7d1b42b551e341cb7167 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 10 Jan 2016 11:29:07 +0100 Subject: [PATCH 486/607] BACKPORT: efi: stub: implement efi_get_random_bytes() based on EFI_RNG_PROTOCOL This exposes the firmware's implementation of EFI_RNG_PROTOCOL via a new function efi_get_random_bytes(). Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit e4fbf4767440472f9d23b0f25a2b905e1c63b6a8) Signed-off-by: Jeff Vander Stoep Change-Id: Id46036b78c2efd223b6cd5488e512fd93e8f597d --- drivers/firmware/efi/libstub/Makefile | 2 +- drivers/firmware/efi/libstub/efistub.h | 3 +++ drivers/firmware/efi/libstub/random.c | 35 ++++++++++++++++++++++++++ include/linux/efi.h | 6 ++++- 4 files changed, 44 insertions(+), 2 deletions(-) create mode 100644 drivers/firmware/efi/libstub/random.c diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index c0ddd1b8dca3..68bdd9f23e13 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -34,7 +34,7 @@ $(obj)/lib-%.o: $(srctree)/lib/%.c FORCE lib-$(CONFIG_EFI_ARMSTUB) += arm-stub.o fdt.o string.o \ $(patsubst %.c,lib-%.o,$(arm-deps)) -lib-$(CONFIG_ARM64) += arm64-stub.o +lib-$(CONFIG_ARM64) += arm64-stub.o random.o CFLAGS_arm64-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET) # diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 6b6548fda089..206b7252b9d1 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -43,4 +43,7 @@ void efi_get_virtmap(efi_memory_desc_t *memory_map, unsigned long map_size, unsigned long desc_size, efi_memory_desc_t *runtime_map, int *count); +efi_status_t efi_get_random_bytes(efi_system_table_t *sys_table, + unsigned long size, u8 *out); + #endif diff --git a/drivers/firmware/efi/libstub/random.c b/drivers/firmware/efi/libstub/random.c new file mode 100644 index 000000000000..97941ee5954f --- /dev/null +++ 
b/drivers/firmware/efi/libstub/random.c @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2016 Linaro Ltd; + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include + +#include "efistub.h" + +struct efi_rng_protocol { + efi_status_t (*get_info)(struct efi_rng_protocol *, + unsigned long *, efi_guid_t *); + efi_status_t (*get_rng)(struct efi_rng_protocol *, + efi_guid_t *, unsigned long, u8 *out); +}; + +efi_status_t efi_get_random_bytes(efi_system_table_t *sys_table_arg, + unsigned long size, u8 *out) +{ + efi_guid_t rng_proto = EFI_RNG_PROTOCOL_GUID; + efi_status_t status; + struct efi_rng_protocol *rng; + + status = efi_call_early(locate_protocol, &rng_proto, NULL, + (void **)&rng); + if (status != EFI_SUCCESS) + return status; + + return rng->get_rng(rng, NULL, size, out); +} diff --git a/include/linux/efi.h b/include/linux/efi.h index 569b5a866bb1..e747eb08b2be 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -299,7 +299,7 @@ typedef struct { void *open_protocol_information; void *protocols_per_handle; void *locate_handle_buffer; - void *locate_protocol; + efi_status_t (*locate_protocol)(efi_guid_t *, void *, void **); void *install_multiple_protocol_interfaces; void *uninstall_multiple_protocol_interfaces; void *calculate_crc32; @@ -599,6 +599,10 @@ void efi_native_runtime_setup(void); #define EFI_PROPERTIES_TABLE_GUID \ EFI_GUID( 0x880aaca3, 0x4adc, 0x4a04, 0x90, 0x79, 0xb7, 0x47, 0x34, 0x08, 0x25, 0xe5 ) +#define EFI_RNG_PROTOCOL_GUID \ + EFI_GUID(0x3152bca5, 0xeade, 0x433d, \ + 0x86, 0x2e, 0xc0, 0x1c, 0xdc, 0x29, 0x1f, 0x44) + typedef struct { efi_guid_t guid; u64 table; From 718645ea36329edb7d1c01871c13385fe1a2f03a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 11 Jan 2016 10:43:16 +0100 Subject: [PATCH 487/607] UPSTREAM: efi: stub: add implementation of efi_random_alloc() This implements efi_random_alloc(), which allocates a chunk of memory of a certain size at a certain alignment, and uses the random_seed argument it receives to randomize the address of the allocation. This is implemented by iterating over the UEFI memory map, counting the number of suitable slots (aligned offsets) within each region, and picking a random number between 0 and 'number of slots - 1' to select the slot, This should guarantee that each possible offset is chosen equally likely. 
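The seed-to-slot scaling described here can be exercised on its own; the following standalone sketch (not part of the patch) mimics the '(total_slots * (u16)random_seed) >> 16' step from efi_random_alloc() with a made-up slot count.

#include <stdio.h>

int main(void)
{
	unsigned long total_slots = 1000;	/* example value */
	unsigned long seeds[] = { 0x0000, 0x4000, 0x8000, 0xffff };
	int i;

	for (i = 0; i < 4; i++) {
		/* same fixed-point scaling as the stub code: no division needed */
		unsigned long target_slot = (total_slots * (seeds[i] & 0xffff)) >> 16;

		printf("seed 0x%04lx -> slot %lu\n", seeds[i], target_slot);
	}
	return 0;	/* prints slots 0, 250, 500 and 999 */
}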
Suggested-by: Kees Cook Reviewed-by: Matt Fleming Reviewed-by: Kees Cook Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 2ddbfc81eac84a299cb4747a8764bc43f23e9008) Signed-off-by: Jeff Vander Stoep Change-Id: I8f59e3e91a71c752d69fd08ca43a890977c82919 --- drivers/firmware/efi/libstub/efistub.h | 4 + drivers/firmware/efi/libstub/random.c | 100 +++++++++++++++++++++++++ 2 files changed, 104 insertions(+) diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 206b7252b9d1..5ed3d3f38166 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -46,4 +46,8 @@ void efi_get_virtmap(efi_memory_desc_t *memory_map, unsigned long map_size, efi_status_t efi_get_random_bytes(efi_system_table_t *sys_table, unsigned long size, u8 *out); +efi_status_t efi_random_alloc(efi_system_table_t *sys_table_arg, + unsigned long size, unsigned long align, + unsigned long *addr, unsigned long random_seed); + #endif diff --git a/drivers/firmware/efi/libstub/random.c b/drivers/firmware/efi/libstub/random.c index 97941ee5954f..53f6d3fe6d86 100644 --- a/drivers/firmware/efi/libstub/random.c +++ b/drivers/firmware/efi/libstub/random.c @@ -33,3 +33,103 @@ efi_status_t efi_get_random_bytes(efi_system_table_t *sys_table_arg, return rng->get_rng(rng, NULL, size, out); } + +/* + * Return the number of slots covered by this entry, i.e., the number of + * addresses it covers that are suitably aligned and supply enough room + * for the allocation. + */ +static unsigned long get_entry_num_slots(efi_memory_desc_t *md, + unsigned long size, + unsigned long align) +{ + u64 start, end; + + if (md->type != EFI_CONVENTIONAL_MEMORY) + return 0; + + start = round_up(md->phys_addr, align); + end = round_down(md->phys_addr + md->num_pages * EFI_PAGE_SIZE - size, + align); + + if (start > end) + return 0; + + return (end - start + 1) / align; +} + +/* + * The UEFI memory descriptors have a virtual address field that is only used + * when installing the virtual mapping using SetVirtualAddressMap(). Since it + * is unused here, we can reuse it to keep track of each descriptor's slot + * count. + */ +#define MD_NUM_SLOTS(md) ((md)->virt_addr) + +efi_status_t efi_random_alloc(efi_system_table_t *sys_table_arg, + unsigned long size, + unsigned long align, + unsigned long *addr, + unsigned long random_seed) +{ + unsigned long map_size, desc_size, total_slots = 0, target_slot; + efi_status_t status; + efi_memory_desc_t *memory_map; + int map_offset; + + status = efi_get_memory_map(sys_table_arg, &memory_map, &map_size, + &desc_size, NULL, NULL); + if (status != EFI_SUCCESS) + return status; + + if (align < EFI_ALLOC_ALIGN) + align = EFI_ALLOC_ALIGN; + + /* count the suitable slots in each memory map entry */ + for (map_offset = 0; map_offset < map_size; map_offset += desc_size) { + efi_memory_desc_t *md = (void *)memory_map + map_offset; + unsigned long slots; + + slots = get_entry_num_slots(md, size, align); + MD_NUM_SLOTS(md) = slots; + total_slots += slots; + } + + /* find a random number between 0 and total_slots */ + target_slot = (total_slots * (u16)random_seed) >> 16; + + /* + * target_slot is now a value in the range [0, total_slots), and so + * it corresponds with exactly one of the suitable slots we recorded + * when iterating over the memory map the first time around. 
+ * + * So iterate over the memory map again, subtracting the number of + * slots of each entry at each iteration, until we have found the entry + * that covers our chosen slot. Use the residual value of target_slot + * to calculate the randomly chosen address, and allocate it directly + * using EFI_ALLOCATE_ADDRESS. + */ + for (map_offset = 0; map_offset < map_size; map_offset += desc_size) { + efi_memory_desc_t *md = (void *)memory_map + map_offset; + efi_physical_addr_t target; + unsigned long pages; + + if (target_slot >= MD_NUM_SLOTS(md)) { + target_slot -= MD_NUM_SLOTS(md); + continue; + } + + target = round_up(md->phys_addr, align) + target_slot * align; + pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; + + status = efi_call_early(allocate_pages, EFI_ALLOCATE_ADDRESS, + EFI_LOADER_DATA, pages, &target); + if (status == EFI_SUCCESS) + *addr = target; + break; + } + + efi_call_early(free_pool, memory_map); + + return status; +} From 3fafb1a67d5261cd8a0d9a9ad0316baf98d0bba8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 11 Jan 2016 11:47:49 +0100 Subject: [PATCH 488/607] UPSTREAM: efi: stub: use high allocation for converted command line Before we can move the command line processing before the allocation of the kernel, which is required for detecting the 'nokaslr' option which controls that allocation, move the converted command line higher up in memory, to prevent it from interfering with the kernel itself. Since x86 needs the address to fit in 32 bits, use UINT_MAX as the upper bound there. Otherwise, use ULONG_MAX (i.e., no limit) Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 48fcb2d0216103d15306caa4814e2381104df6d8) Signed-off-by: Jeff Vander Stoep Change-Id: Ie959355658d3f2f1819bee842c77cc5eef54b8e7 --- arch/x86/include/asm/efi.h | 2 ++ drivers/firmware/efi/libstub/efi-stub-helper.c | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 0010c78c4998..08b1f2f6ea50 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -25,6 +25,8 @@ #define EFI32_LOADER_SIGNATURE "EL32" #define EFI64_LOADER_SIGNATURE "EL64" +#define MAX_CMDLINE_ADDRESS UINT_MAX + #ifdef CONFIG_X86_32 diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index f07d4a67fa76..29ed2f9b218c 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -649,6 +649,10 @@ static u8 *efi_utf16_to_utf8(u8 *dst, const u16 *src, int n) return dst; } +#ifndef MAX_CMDLINE_ADDRESS +#define MAX_CMDLINE_ADDRESS ULONG_MAX +#endif + /* * Convert the unicode UEFI command line to ASCII to pass to kernel. * Size of memory allocated return in *cmd_line_len. 
@@ -684,7 +688,8 @@ char *efi_convert_cmdline(efi_system_table_t *sys_table_arg, options_bytes++; /* NUL termination */ - status = efi_low_alloc(sys_table_arg, options_bytes, 0, &cmdline_addr); + status = efi_high_alloc(sys_table_arg, options_bytes, 0, + &cmdline_addr, MAX_CMDLINE_ADDRESS); if (status != EFI_SUCCESS) return NULL; From 4de18b705de63c094a0c23754337c750d072ade9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 26 Jan 2016 14:48:29 +0100 Subject: [PATCH 489/607] UPSTREAM: arm64: efi: invoke EFI_RNG_PROTOCOL to supply KASLR randomness Since arm64 does not use a decompressor that supplies an execution environment where it is feasible to some extent to provide a source of randomness, the arm64 KASLR kernel depends on the bootloader to supply some random bits in the /chosen/kaslr-seed DT property upon kernel entry. On UEFI systems, we can use the EFI_RNG_PROTOCOL, if supplied, to obtain some random bits. At the same time, use it to randomize the offset of the kernel Image in physical memory. Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 2b5fe07a78a09a32002642b8a823428ade611f16) Signed-off-by: Jeff Vander Stoep Change-Id: I9cb7ae5727dfdf3726b1c9544bce74722ec77bbd --- arch/arm64/Kconfig | 5 ++ drivers/firmware/efi/libstub/arm-stub.c | 40 +++++++---- drivers/firmware/efi/libstub/arm64-stub.c | 84 ++++++++++++++++------- drivers/firmware/efi/libstub/fdt.c | 14 ++++ 4 files changed, 105 insertions(+), 38 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a42e3235484b..0b5ec89a8ec4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -794,6 +794,11 @@ config RANDOMIZE_BASE It is the bootloader's job to provide entropy, by passing a random u64 value in /chosen/kaslr-seed at kernel entry. + When booting via the UEFI stub, it will invoke the firmware's + EFI_RNG_PROTOCOL implementation (if available) to supply entropy + to the kernel proper. In addition, it will randomise the physical + location of the kernel Image as well. + If unsure, say N. 
config RANDOMIZE_MODULE_REGION_FULL diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c index 950c87f5d279..d5aa1d16154f 100644 --- a/drivers/firmware/efi/libstub/arm-stub.c +++ b/drivers/firmware/efi/libstub/arm-stub.c @@ -18,6 +18,8 @@ #include "efistub.h" +bool __nokaslr; + static int efi_secureboot_enabled(efi_system_table_t *sys_table_arg) { static efi_guid_t const var_guid = EFI_GLOBAL_VARIABLE_GUID; @@ -207,14 +209,6 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, pr_efi_err(sys_table, "Failed to find DRAM base\n"); goto fail; } - status = handle_kernel_image(sys_table, image_addr, &image_size, - &reserve_addr, - &reserve_size, - dram_base, image); - if (status != EFI_SUCCESS) { - pr_efi_err(sys_table, "Failed to relocate kernel\n"); - goto fail; - } /* * Get the command line from EFI, using the LOADED_IMAGE @@ -224,7 +218,28 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, cmdline_ptr = efi_convert_cmdline(sys_table, image, &cmdline_size); if (!cmdline_ptr) { pr_efi_err(sys_table, "getting command line via LOADED_IMAGE_PROTOCOL\n"); - goto fail_free_image; + goto fail; + } + + /* check whether 'nokaslr' was passed on the command line */ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + static const u8 default_cmdline[] = CONFIG_CMDLINE; + const u8 *str, *cmdline = cmdline_ptr; + + if (IS_ENABLED(CONFIG_CMDLINE_FORCE)) + cmdline = default_cmdline; + str = strstr(cmdline, "nokaslr"); + if (str == cmdline || (str > cmdline && *(str - 1) == ' ')) + __nokaslr = true; + } + + status = handle_kernel_image(sys_table, image_addr, &image_size, + &reserve_addr, + &reserve_size, + dram_base, image); + if (status != EFI_SUCCESS) { + pr_efi_err(sys_table, "Failed to relocate kernel\n"); + goto fail_free_cmdline; } status = efi_parse_options(cmdline_ptr); @@ -244,7 +259,7 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, if (status != EFI_SUCCESS) { pr_efi_err(sys_table, "Failed to load device tree!\n"); - goto fail_free_cmdline; + goto fail_free_image; } } @@ -286,12 +301,11 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, efi_free(sys_table, initrd_size, initrd_addr); efi_free(sys_table, fdt_size, fdt_addr); -fail_free_cmdline: - efi_free(sys_table, cmdline_size, (unsigned long)cmdline_ptr); - fail_free_image: efi_free(sys_table, image_size, *image_addr); efi_free(sys_table, reserve_size, reserve_addr); +fail_free_cmdline: + efi_free(sys_table, cmdline_size, (unsigned long)cmdline_ptr); fail: return EFI_ERROR; } diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 78dfbd34b6bf..e0e6b74fef8f 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -13,6 +13,10 @@ #include #include +#include "efistub.h" + +extern bool __nokaslr; + efi_status_t __init handle_kernel_image(efi_system_table_t *sys_table_arg, unsigned long *image_addr, unsigned long *image_size, @@ -23,26 +27,52 @@ efi_status_t __init handle_kernel_image(efi_system_table_t *sys_table_arg, { efi_status_t status; unsigned long kernel_size, kernel_memsize = 0; - unsigned long nr_pages; void *old_image_addr = (void *)*image_addr; unsigned long preferred_offset; + u64 phys_seed = 0; + + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + if (!__nokaslr) { + status = efi_get_random_bytes(sys_table_arg, + sizeof(phys_seed), + (u8 *)&phys_seed); + if (status == EFI_NOT_FOUND) { + pr_efi(sys_table_arg, "EFI_RNG_PROTOCOL 
unavailable, no randomness supplied\n"); + } else if (status != EFI_SUCCESS) { + pr_efi_err(sys_table_arg, "efi_get_random_bytes() failed\n"); + return status; + } + } else { + pr_efi(sys_table_arg, "KASLR disabled on kernel command line\n"); + } + } /* * The preferred offset of the kernel Image is TEXT_OFFSET bytes beyond * a 2 MB aligned base, which itself may be lower than dram_base, as * long as the resulting offset equals or exceeds it. */ - preferred_offset = round_down(dram_base, SZ_2M) + TEXT_OFFSET; + preferred_offset = round_down(dram_base, MIN_KIMG_ALIGN) + TEXT_OFFSET; if (preferred_offset < dram_base) - preferred_offset += SZ_2M; + preferred_offset += MIN_KIMG_ALIGN; - /* Relocate the image, if required. */ kernel_size = _edata - _text; - if (*image_addr != preferred_offset) { - kernel_memsize = kernel_size + (_end - _edata); + kernel_memsize = kernel_size + (_end - _edata); + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && phys_seed != 0) { /* - * First, try a straight allocation at the preferred offset. + * If KASLR is enabled, and we have some randomness available, + * locate the kernel at a randomized offset in physical memory. + */ + *reserve_size = kernel_memsize + TEXT_OFFSET; + status = efi_random_alloc(sys_table_arg, *reserve_size, + MIN_KIMG_ALIGN, reserve_addr, + phys_seed); + + *image_addr = *reserve_addr + TEXT_OFFSET; + } else { + /* + * Else, try a straight allocation at the preferred offset. * This will work around the issue where, if dram_base == 0x0, * efi_low_alloc() refuses to allocate at 0x0 (to prevent the * address of the allocation to be mistaken for a FAIL return @@ -52,27 +82,31 @@ efi_status_t __init handle_kernel_image(efi_system_table_t *sys_table_arg, * Mustang), we can still place the kernel at the address * 'dram_base + TEXT_OFFSET'. 
*/ - *image_addr = *reserve_addr = preferred_offset; - nr_pages = round_up(kernel_memsize, EFI_ALLOC_ALIGN) / - EFI_PAGE_SIZE; - status = efi_call_early(allocate_pages, EFI_ALLOCATE_ADDRESS, - EFI_LOADER_DATA, nr_pages, - (efi_physical_addr_t *)reserve_addr); - if (status != EFI_SUCCESS) { - kernel_memsize += TEXT_OFFSET; - status = efi_low_alloc(sys_table_arg, kernel_memsize, - SZ_2M, reserve_addr); + if (*image_addr == preferred_offset) + return EFI_SUCCESS; - if (status != EFI_SUCCESS) { - pr_efi_err(sys_table_arg, "Failed to relocate kernel\n"); - return status; - } - *image_addr = *reserve_addr + TEXT_OFFSET; - } - memcpy((void *)*image_addr, old_image_addr, kernel_size); - *reserve_size = kernel_memsize; + *image_addr = *reserve_addr = preferred_offset; + *reserve_size = round_up(kernel_memsize, EFI_ALLOC_ALIGN); + + status = efi_call_early(allocate_pages, EFI_ALLOCATE_ADDRESS, + EFI_LOADER_DATA, + *reserve_size / EFI_PAGE_SIZE, + (efi_physical_addr_t *)reserve_addr); } + if (status != EFI_SUCCESS) { + *reserve_size = kernel_memsize + TEXT_OFFSET; + status = efi_low_alloc(sys_table_arg, *reserve_size, + MIN_KIMG_ALIGN, reserve_addr); + + if (status != EFI_SUCCESS) { + pr_efi_err(sys_table_arg, "Failed to relocate kernel\n"); + *reserve_size = 0; + return status; + } + *image_addr = *reserve_addr + TEXT_OFFSET; + } + memcpy((void *)*image_addr, old_image_addr, kernel_size); return EFI_SUCCESS; } diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c index b62e2f5dcab3..b1c22cf18f7d 100644 --- a/drivers/firmware/efi/libstub/fdt.c +++ b/drivers/firmware/efi/libstub/fdt.c @@ -147,6 +147,20 @@ efi_status_t update_fdt(efi_system_table_t *sys_table, void *orig_fdt, if (status) goto fdt_set_fail; + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + efi_status_t efi_status; + + efi_status = efi_get_random_bytes(sys_table, sizeof(fdt_val64), + (u8 *)&fdt_val64); + if (efi_status == EFI_SUCCESS) { + status = fdt_setprop(fdt, node, "kaslr-seed", + &fdt_val64, sizeof(fdt_val64)); + if (status) + goto fdt_set_fail; + } else if (efi_status != EFI_NOT_FOUND) { + return efi_status; + } + } return EFI_SUCCESS; fdt_set_fail: From a2f0639adbc63164137683f6d0c4deb88406290d Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 11 Feb 2016 13:53:10 -0800 Subject: [PATCH 490/607] UPSTREAM: arm64: make irq_stack_ptr more robust Switching between stacks is only valid if we are tracing ourselves while on the irq_stack, so it is only valid when in current and non-preemptible context, otherwise is is just zeroed off. Fixes: 132cd887b5c5 ("arm64: Modify stack trace and dump for use with irq_stack") Acked-by: James Morse Tested-by: James Morse Signed-off-by: Yang Shi Signed-off-by: Will Deacon Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit a80a0eb70c358f8c7dda4bb62b2278dc6285217b) Signed-off-by: Jeff Vander Stoep Change-Id: I431d3d5e8e1f556ddfef283af88dd2f63b825f7c --- arch/arm64/kernel/stacktrace.c | 13 ++++++------- arch/arm64/kernel/traps.c | 11 ++++++++++- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 4fad9787ab46..cfd46c227c8c 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -44,14 +44,13 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) unsigned long irq_stack_ptr; /* - * Use raw_smp_processor_id() to avoid false-positives from - * CONFIG_DEBUG_PREEMPT. 
get_wchan() calls unwind_frame() on sleeping - * task stacks, we can be pre-empted in this case, so - * {raw_,}smp_processor_id() may give us the wrong value. Sleeping - * tasks can't ever be on an interrupt stack, so regardless of cpu, - * the checks will always fail. + * Switching between stacks is valid when tracing current and in + * non-preemptible context. */ - irq_stack_ptr = IRQ_STACK_PTR(raw_smp_processor_id()); + if (tsk == current && !preemptible()) + irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); + else + irq_stack_ptr = 0; low = frame->sp; /* irq stacks are not THREAD_SIZE aligned */ diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index cbedd724f48e..c5392081b49b 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -146,9 +146,18 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; - unsigned long irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); + unsigned long irq_stack_ptr; int skip; + /* + * Switching between stacks is valid when tracing current and in + * non-preemptible context. + */ + if (tsk == current && !preemptible()) + irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); + else + irq_stack_ptr = 0; + pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); if (!tsk) From 46debe0769bb9597eaeb460f8dd0c3a85ad76af1 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 9 Mar 2016 15:22:55 +0000 Subject: [PATCH 491/607] UPSTREAM: arm64: hugetlb: partial revert of 66b3923a1a0f Commit 66b3923a1a0f ("arm64: hugetlb: add support for PTE contiguous bit") introduced support for huge pages using the contiguous bit in the PTE as opposed to block mappings, which may be slightly unwieldy (512M) in 64k page configurations. Unfortunately, this support has resulted in some late regressions when running the libhugetlbfs test suite with 64k pages and CONFIG_DEBUG_VM as a result of a BUG: | readback (2M: 64): ------------[ cut here ]------------ | kernel BUG at fs/hugetlbfs/inode.c:446! | Internal error: Oops - BUG: 0 [#1] SMP | Modules linked in: | CPU: 7 PID: 1448 Comm: readback Not tainted 4.5.0-rc7 #148 | Hardware name: linux,dummy-virt (DT) | task: fffffe0040964b00 ti: fffffe00c2668000 task.ti: fffffe00c2668000 | PC is at remove_inode_hugepages+0x44c/0x480 | LR is at remove_inode_hugepages+0x264/0x480 Rather than revert the entire patch, simply avoid advertising the contiguous huge page sizes for now while people are actively working on a fix. This patch can then be reverted once things have been sorted out. 
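For reference, the huge page sizes involved can be tabulated with a short standalone sketch (not part of the patch); the CONT_PTES/CONT_PMDS values below are assumptions taken from the original contiguous-bit support in 66b3923a1a0f.

#include <stdio.h>

int main(void)
{
	unsigned long long k = 1024, m = 1024 * k;

	/* assumed: 4KB granule has CONT_PTES = 16, PMD_SIZE = 2M, CONT_PMDS = 16 */
	printf("4KB  granule: stops advertising %lluK and %lluM, keeps 2M and 1G\n",
	       (16 * 4 * k) / k, (16 * 2 * m) / m);
	/* assumed: 64KB granule has CONT_PTES = 32, PMD_SIZE = 512M, CONT_PMDS = 32 */
	printf("64KB granule: stops advertising %lluM and %lluG, keeps 512M\n",
	       (32 * 64 * k) / m, (32 * 512 * m) / (1024 * m));
	return 0;
}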
Cc: David Woods Reported-by: Steve Capper Signed-off-by: Will Deacon Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit ff7925848b50050732ac0401e0acf27e8b241d7b) Signed-off-by: Jeff Vander Stoep Change-Id: I3a9751fa79b2d2871dbdc06ea1aa3d1336bb4f4f --- arch/arm64/mm/hugetlbpage.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 82d607c3614e..da30529bb1f6 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -306,10 +306,6 @@ static __init int setup_hugepagesz(char *opt) hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); } else if (ps == PUD_SIZE) { hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); - } else if (ps == (PAGE_SIZE * CONT_PTES)) { - hugetlb_add_hstate(CONT_PTE_SHIFT); - } else if (ps == (PMD_SIZE * CONT_PMDS)) { - hugetlb_add_hstate((PMD_SHIFT + CONT_PMD_SHIFT) - PAGE_SHIFT); } else { pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10); return 0; @@ -317,13 +313,3 @@ static __init int setup_hugepagesz(char *opt) return 1; } __setup("hugepagesz=", setup_hugepagesz); - -#ifdef CONFIG_ARM64_64K_PAGES -static __init int add_default_hugepagesz(void) -{ - if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL) - hugetlb_add_hstate(CONT_PMD_SHIFT); - return 0; -} -arch_initcall(add_default_hugepagesz); -#endif From b6d54c720c294dd973ddff2d111a4f3a0d35dc74 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 15 Mar 2016 11:22:57 +0000 Subject: [PATCH 492/607] UPSTREAM: arm64: fix KASLR boot-time I-cache maintenance Commit f80fb3a3d50843a4 ("arm64: add support for kernel ASLR") missed a DSB necessary to complete I-cache maintenance in the primary boot path, and hence stale instructions may still be present in the I-cache and may be executed until the I-cache maintenance naturally completes. Since commit 8ec41987436d566f ("arm64: mm: ensure patched kernel text is fetched from PoU"), all CPUs invalidate their I-caches after their MMU is enabled. Prior a CPU's MMU having been enabled, arbitrary lines may have been fetched from the PoC into I-caches. We never patch text expected to be executed with the MMU off. Thus, it is unnecessary to perform broadcast I-cache maintenance in the primary boot path. This patch reduces the scope of the I-cache maintenance to the local CPU, and adds the missing DSB with similar scope, matching prior maintenance in the primary boot path. 
Signed-off-by: Mark Rutland Acked-by: Ard Biesehvuel Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit b90b4a608ea2401cc491828f7a385edd2e236e37) Signed-off-by: Jeff Vander Stoep Change-Id: Ic66b5fec29867b86782ad6c3243642afc1f40080 --- arch/arm64/kernel/head.S | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 319f896c6e74..a88a15447c3b 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -740,8 +740,9 @@ __enable_mmu: msr sctlr_el1, x19 // re-enable the MMU isb - ic ialluis // flush instructions fetched - isb // via old mapping + ic iallu // flush instructions fetched + dsb nsh // via old mapping + isb add x27, x27, x23 // relocated __mmap_switched #endif br x27 From 25fd795692e05141f36099f778c054a86d586812 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 22 Mar 2016 10:11:45 +0000 Subject: [PATCH 493/607] UPSTREAM: arm64: consistently use p?d_set_huge Commit 324420bf91f60582 ("arm64: add support for ioremap() block mappings") added new p?d_set_huge functions which do the hard work to generate and set a correct block entry. These differ from open-coded huge page creation in the early page table code by explicitly setting the P?D_TYPE_SECT bits (which are implicitly retained by mk_sect_prot() for any valid prot), but are otherwise identical (and cannot fail on arm64). For simplicity and consistency, make use of these in the initial page table creation code. Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit c661cb1c537e2364bfdabb298fb934fd77445e98) Signed-off-by: Jeff Vander Stoep Change-Id: I25e58a1626831c2c709abcded989d1770fea851c --- arch/arm64/mm/mmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index f4d1da0c1531..72e634533a3f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -211,8 +211,7 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, if (((addr | next | phys) & ~SECTION_MASK) == 0 && block_mappings_allowed(pgtable_alloc)) { pmd_t old_pmd =*pmd; - set_pmd(pmd, __pmd(phys | - pgprot_val(mk_sect_prot(prot)))); + pmd_set_huge(pmd, phys, prot); /* * Check for previous table entries created during * boot (__create_page_tables) and flush them. @@ -272,8 +271,7 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, if (use_1G_block(addr, next, phys) && block_mappings_allowed(pgtable_alloc)) { pud_t old_pud = *pud; - set_pud(pud, __pud(phys | - pgprot_val(mk_sect_prot(prot)))); + pud_set_huge(pud, phys, prot); /* * If we have an old value for a pud, it will From 1c4143d40cec48cac343bf1e8a426746ac240f92 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 10 Mar 2016 18:41:16 +0000 Subject: [PATCH 494/607] UPSTREAM: arm64: kasan: Fix zero shadow mapping overriding kernel image shadow With the 16KB and 64KB page size configurations, SWAPPER_BLOCK_SIZE is PAGE_SIZE and ARM64_SWAPPER_USES_SECTION_MAPS is 0. 
Since kimg_shadow_end is not page aligned (_end shifted by KASAN_SHADOW_SCALE_SHIFT), the edges of previously mapped kernel image shadow via vmemmap_populate() may be overridden by subsequent calls to kasan_populate_zero_shadow(), leading to kernel panics like below: ------------------------------------------------------------------------------ Unable to handle kernel paging request at virtual address fffffc100135068c pgd = fffffc8009ac0000 [fffffc100135068c] *pgd=00000009ffee0003, *pud=00000009ffee0003, *pmd=00000009ffee0003, *pte=00e0000081a00793 Internal error: Oops: 9600004f [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.5.0-rc4+ #1984 Hardware name: Juno (DT) task: fffffe09001a0000 ti: fffffe0900200000 task.ti: fffffe0900200000 PC is at __memset+0x4c/0x200 LR is at kasan_unpoison_shadow+0x34/0x50 pc : [] lr : [] pstate: 00000245 sp : fffffe0900203db0 x29: fffffe0900203db0 x28: 0000000000000000 x27: 0000000000000000 x26: 0000000000000000 x25: fffffc80099b69d0 x24: 0000000000000001 x23: 0000000000000000 x22: 0000000000002000 x21: dffffc8000000000 x20: 1fffff9001350a8c x19: 0000000000002000 x18: 0000000000000008 x17: 0000000000000147 x16: ffffffffffffffff x15: 79746972100e041d x14: ffffff0000000000 x13: ffff000000000000 x12: 0000000000000000 x11: 0101010101010101 x10: 1fffffc11c000000 x9 : 0000000000000000 x8 : fffffc100135068c x7 : 0000000000000000 x6 : 000000000000003f x5 : 0000000000000040 x4 : 0000000000000004 x3 : fffffc100134f651 x2 : 0000000000000400 x1 : 0000000000000000 x0 : fffffc100135068c Process swapper/0 (pid: 1, stack limit = 0xfffffe0900200020) Call trace: [] __memset+0x4c/0x200 [] __asan_register_globals+0x5c/0xb0 [] _GLOBAL__sub_I_65535_1_sunrpc_cache_lookup+0x1c/0x28 [] kernel_init_freeable+0x104/0x274 [] kernel_init+0x10/0xf8 [] ret_from_fork+0x10/0x50 ------------------------------------------------------------------------------ This patch aligns kimg_shadow_start and kimg_shadow_end to SWAPPER_BLOCK_SIZE in all configurations. Fixes: f9040773b7bb ("arm64: move kernel image to base of vmalloc area") Signed-off-by: Catalin Marinas Acked-by: Mark Rutland Acked-by: Ard Biesheuvel Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 2776e0e8ef683a42fe3e9a5facf576b73579700e) Signed-off-by: Jeff Vander Stoep Change-Id: I13a6b38aefbeddd20bc87cb1382f2787bbc5cf9c --- arch/arm64/mm/kasan_init.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 56e19d150c21..206dd95ea292 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -158,15 +158,12 @@ void __init kasan_init(void) * vmemmap_populate() has populated the shadow region that covers the * kernel image with SWAPPER_BLOCK_SIZE mappings, so we have to round * the start and end addresses to SWAPPER_BLOCK_SIZE as well, to prevent - * kasan_populate_zero_shadow() from replacing the PMD block mappings - * with PMD table mappings at the edges of the shadow region for the - * kernel image. + * kasan_populate_zero_shadow() from replacing the page table entries + * (PMD or PTE) at the edges of the shadow region for the kernel + * image. 
*/ - if (ARM64_SWAPPER_USES_SECTION_MAPS) { - kimg_shadow_start = round_down(kimg_shadow_start, - SWAPPER_BLOCK_SIZE); - kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE); - } + kimg_shadow_start = round_down(kimg_shadow_start, SWAPPER_BLOCK_SIZE); + kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE); kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, (void *)mod_shadow_start); From f41890295d6496ed50bbe8606d23fb10661a844f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 2 Mar 2016 09:47:13 +0100 Subject: [PATCH 495/607] UPSTREAM: arm64: mm: check at build time that PAGE_OFFSET divides the VA space evenly Commit 8439e62a1561 ("arm64: mm: use bit ops rather than arithmetic in pa/va translations") changed the boundary check against PAGE_OFFSET from an arithmetic comparison to a bit test. This means we now silently assume that PAGE_OFFSET is a power of 2 that divides the kernel virtual address space into two equal halves. So make that assumption explicit. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 6d2aa549de1fc998581d216de3853aa131aa4446) Signed-off-by: Jeff Vander Stoep Change-Id: I8c3bc8cdb7d7f7dea092fd1a208b04583a141054 --- arch/arm64/mm/init.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 8e3cf19e4f00..2c38be3df4c8 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -172,6 +172,13 @@ void __init arm64_memblock_init(void) { const s64 linear_region_size = -(s64)PAGE_OFFSET; + /* + * Ensure that the linear region takes up exactly half of the kernel + * virtual address space. This way, we can distinguish a linear address + * from a kernel/module/vmalloc address by testing a single bit. + */ + BUILD_BUG_ON(linear_region_size != BIT(VA_BITS - 1)); + /* * Select a suitable value for the base of physical memory. */ From 391ba6bf3e866b9deb4b0f1d2f52c0ede0acce5a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 25 Feb 2016 20:48:53 +0100 Subject: [PATCH 496/607] UPSTREAM: arm64: lse: deal with clobbered IP registers after branch via PLT The LSE atomics implementation uses runtime patching to patch in calls to out of line non-LSE atomics implementations on cores that lack hardware support for LSE. To avoid paying the overhead cost of a function call even if no call ends up being made, the bl instruction is kept invisible to the compiler, and the out of line implementations preserve all registers, not just the ones that they are required to preserve as per the AAPCS64. However, commit fd045f6cd98e ("arm64: add support for module PLTs") added support for routing branch instructions via veneers if the branch target offset exceeds the range of the ordinary relative branch instructions. Since this deals with jump and call instructions that are exposed to ELF relocations, the PLT code uses x16 to hold the address of the branch target when it performs an indirect branch-to-register, something which is explicitly allowed by the AAPCS64 (and ordinary compiler generated code does not expect register x16 or x17 to retain their values across a bl instruction). Since the lse runtime patched bl instructions don't adhere to the AAPCS64, they don't deal with this clobbering of registers x16 and x17. So add them to the clobber list of the asm() statements that perform the call instructions, and drop x16 and x17 from the list of registers that are callee saved in the out of line non-LSE implementations. 
In addition, since we have given these functions two scratch registers, they no longer need to stack/unstack temp registers. Signed-off-by: Ard Biesheuvel [will: factored clobber list into #define, updated Makefile comment] Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 5be8b70af1ca78cefb8b756d157532360a5fd663) Signed-off-by: Jeff Vander Stoep Change-Id: Ia44a54eba315a47a6b8aaa2259b444e0139162c0 --- arch/arm64/include/asm/atomic_lse.h | 38 ++++++++++++++--------------- arch/arm64/include/asm/lse.h | 1 + arch/arm64/lib/Makefile | 13 +++++----- 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h index 197e06afbf71..39c1d340fec5 100644 --- a/arch/arm64/include/asm/atomic_lse.h +++ b/arch/arm64/include/asm/atomic_lse.h @@ -36,7 +36,7 @@ static inline void atomic_andnot(int i, atomic_t *v) " stclr %w[i], %[v]\n") : [i] "+r" (w0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic_or(int i, atomic_t *v) @@ -48,7 +48,7 @@ static inline void atomic_or(int i, atomic_t *v) " stset %w[i], %[v]\n") : [i] "+r" (w0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic_xor(int i, atomic_t *v) @@ -60,7 +60,7 @@ static inline void atomic_xor(int i, atomic_t *v) " steor %w[i], %[v]\n") : [i] "+r" (w0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic_add(int i, atomic_t *v) @@ -72,7 +72,7 @@ static inline void atomic_add(int i, atomic_t *v) " stadd %w[i], %[v]\n") : [i] "+r" (w0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } #define ATOMIC_OP_ADD_RETURN(name, mb, cl...) \ @@ -90,7 +90,7 @@ static inline int atomic_add_return##name(int i, atomic_t *v) \ " add %w[i], %w[i], w30") \ : [i] "+r" (w0), [v] "+Q" (v->counter) \ : "r" (x1) \ - : "x30" , ##cl); \ + : __LL_SC_CLOBBERS, ##cl); \ \ return w0; \ } @@ -116,7 +116,7 @@ static inline void atomic_and(int i, atomic_t *v) " stclr %w[i], %[v]") : [i] "+r" (w0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic_sub(int i, atomic_t *v) @@ -133,7 +133,7 @@ static inline void atomic_sub(int i, atomic_t *v) " stadd %w[i], %[v]") : [i] "+r" (w0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } #define ATOMIC_OP_SUB_RETURN(name, mb, cl...) 
\ @@ -153,7 +153,7 @@ static inline int atomic_sub_return##name(int i, atomic_t *v) \ " add %w[i], %w[i], w30") \ : [i] "+r" (w0), [v] "+Q" (v->counter) \ : "r" (x1) \ - : "x30" , ##cl); \ + : __LL_SC_CLOBBERS , ##cl); \ \ return w0; \ } @@ -177,7 +177,7 @@ static inline void atomic64_andnot(long i, atomic64_t *v) " stclr %[i], %[v]\n") : [i] "+r" (x0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic64_or(long i, atomic64_t *v) @@ -189,7 +189,7 @@ static inline void atomic64_or(long i, atomic64_t *v) " stset %[i], %[v]\n") : [i] "+r" (x0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic64_xor(long i, atomic64_t *v) @@ -201,7 +201,7 @@ static inline void atomic64_xor(long i, atomic64_t *v) " steor %[i], %[v]\n") : [i] "+r" (x0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic64_add(long i, atomic64_t *v) @@ -213,7 +213,7 @@ static inline void atomic64_add(long i, atomic64_t *v) " stadd %[i], %[v]\n") : [i] "+r" (x0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } #define ATOMIC64_OP_ADD_RETURN(name, mb, cl...) \ @@ -231,7 +231,7 @@ static inline long atomic64_add_return##name(long i, atomic64_t *v) \ " add %[i], %[i], x30") \ : [i] "+r" (x0), [v] "+Q" (v->counter) \ : "r" (x1) \ - : "x30" , ##cl); \ + : __LL_SC_CLOBBERS, ##cl); \ \ return x0; \ } @@ -257,7 +257,7 @@ static inline void atomic64_and(long i, atomic64_t *v) " stclr %[i], %[v]") : [i] "+r" (x0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic64_sub(long i, atomic64_t *v) @@ -274,7 +274,7 @@ static inline void atomic64_sub(long i, atomic64_t *v) " stadd %[i], %[v]") : [i] "+r" (x0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } #define ATOMIC64_OP_SUB_RETURN(name, mb, cl...) 
\ @@ -294,7 +294,7 @@ static inline long atomic64_sub_return##name(long i, atomic64_t *v) \ " add %[i], %[i], x30") \ : [i] "+r" (x0), [v] "+Q" (v->counter) \ : "r" (x1) \ - : "x30" , ##cl); \ + : __LL_SC_CLOBBERS, ##cl); \ \ return x0; \ } @@ -330,7 +330,7 @@ static inline long atomic64_dec_if_positive(atomic64_t *v) "2:") : [ret] "+&r" (x0), [v] "+Q" (v->counter) : - : "x30", "cc", "memory"); + : __LL_SC_CLOBBERS, "cc", "memory"); return x0; } @@ -359,7 +359,7 @@ static inline unsigned long __cmpxchg_case_##name(volatile void *ptr, \ " mov %" #w "[ret], " #w "30") \ : [ret] "+r" (x0), [v] "+Q" (*(unsigned long *)ptr) \ : [old] "r" (x1), [new] "r" (x2) \ - : "x30" , ##cl); \ + : __LL_SC_CLOBBERS, ##cl); \ \ return x0; \ } @@ -416,7 +416,7 @@ static inline long __cmpxchg_double##name(unsigned long old1, \ [v] "+Q" (*(unsigned long *)ptr) \ : [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4), \ [oldval1] "r" (oldval1), [oldval2] "r" (oldval2) \ - : "x30" , ##cl); \ + : __LL_SC_CLOBBERS, ##cl); \ \ return x0; \ } diff --git a/arch/arm64/include/asm/lse.h b/arch/arm64/include/asm/lse.h index 3de42d68611d..23acc00be32d 100644 --- a/arch/arm64/include/asm/lse.h +++ b/arch/arm64/include/asm/lse.h @@ -26,6 +26,7 @@ __asm__(".arch_extension lse"); /* Macro for constructing calls to out-of-line ll/sc atomics */ #define __LL_SC_CALL(op) "bl\t" __stringify(__LL_SC_PREFIX(op)) "\n" +#define __LL_SC_CLOBBERS "x16", "x17", "x30" /* In-line patching at runtime */ #define ARM64_LSE_ATOMIC_INSN(llsc, lse) \ diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 1a811ecf71da..c86b7909ef31 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -4,15 +4,16 @@ lib-y := bitops.o clear_user.o delay.o copy_from_user.o \ memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \ strchr.o strrchr.o -# Tell the compiler to treat all general purpose registers as -# callee-saved, which allows for efficient runtime patching of the bl -# instruction in the caller with an atomic instruction when supported by -# the CPU. Result and argument registers are handled correctly, based on -# the function prototype. +# Tell the compiler to treat all general purpose registers (with the +# exception of the IP registers, which are already handled by the caller +# in case of a PLT) as callee-saved, which allows for efficient runtime +# patching of the bl instruction in the caller with an atomic instruction +# when supported by the CPU. Result and argument registers are handled +# correctly, based on the function prototype. lib-$(CONFIG_ARM64_LSE_ATOMICS) += atomic_ll_sc.o CFLAGS_atomic_ll_sc.o := -fcall-used-x0 -ffixed-x1 -ffixed-x2 \ -ffixed-x3 -ffixed-x4 -ffixed-x5 -ffixed-x6 \ -ffixed-x7 -fcall-saved-x8 -fcall-saved-x9 \ -fcall-saved-x10 -fcall-saved-x11 -fcall-saved-x12 \ -fcall-saved-x13 -fcall-saved-x14 -fcall-saved-x15 \ - -fcall-saved-x16 -fcall-saved-x17 -fcall-saved-x18 + -fcall-saved-x18 From 1d2e2e3c0f8f343e4e74cdd89fb94fc341578e64 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 10 Mar 2016 18:30:56 +0000 Subject: [PATCH 497/607] UPSTREAM: arm64: kasan: Use actual memory node when populating the kernel image shadow With the 16KB or 64KB page configurations, the generic vmemmap_populate() implementation warns on potential offnode page_structs via vmemmap_verify() because the arm64 kasan_init() passes NUMA_NO_NODE instead of the actual node for the kernel image memory. 
Fixes: f9040773b7bb ("arm64: move kernel image to base of vmalloc area") Signed-off-by: Catalin Marinas Reported-by: James Morse Acked-by: Ard Biesheuvel Acked-by: Mark Rutland Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 2f76969f2eef051bdd63d38b08d78e790440b0ad) Signed-off-by: Jeff Vander Stoep Change-Id: I8985e5b4628a9c7076767d4565f7633635813b5c --- arch/arm64/mm/kasan_init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 206dd95ea292..757009daa9ed 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -152,7 +152,8 @@ void __init kasan_init(void) clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); - vmemmap_populate(kimg_shadow_start, kimg_shadow_end, NUMA_NO_NODE); + vmemmap_populate(kimg_shadow_start, kimg_shadow_end, + pfn_to_nid(virt_to_pfn(_text))); /* * vmemmap_populate() has populated the shadow region that covers the From 42f010ac16ad91ca029ecb77dbf18ca95d2b9598 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 26 Jul 2016 10:16:55 -0700 Subject: [PATCH 498/607] UPSTREAM: arm64: Only select ARM64_MODULE_PLTS if MODULES=y MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Selecting CONFIG_RANDOMIZE_BASE=y and CONFIG_MODULES=n fails to build the module PLTs support: CC arch/arm64/kernel/module-plts.o /work/Linux/linux-2.6-aarch64/arch/arm64/kernel/module-plts.c: In function ‘module_emit_plt_entry’: /work/Linux/linux-2.6-aarch64/arch/arm64/kernel/module-plts.c:32:49: error: dereferencing pointer to incomplete type ‘struct module’ This patch selects ARM64_MODULE_PLTS conditionally only if MODULES is enabled. Fixes: f80fb3a3d508 ("arm64: add support for kernel ASLR") Cc: # 4.6+ Reported-by: Jeff Vander Stoep Acked-by: Ard Biesheuvel Acked-by: Mark Rutland Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit b9c220b589daaf140f5b8ebe502c98745b94e65c) Signed-off-by: Jeff Vander Stoep Change-Id: I446cb3aa78f1c64b5aa1e2e90fda13f7d46cac33 --- arch/arm64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 0b5ec89a8ec4..bb92d3875791 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -784,7 +784,7 @@ config RELOCATABLE config RANDOMIZE_BASE bool "Randomize the address of the kernel image" - select ARM64_MODULE_PLTS + select ARM64_MODULE_PLTS if MODULES select RELOCATABLE help Randomizes the virtual address at which the kernel image is From 1488d59cb5fd80dea0abb78bafb5130b64f561db Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 24 Aug 2016 18:02:08 +0100 Subject: [PATCH 499/607] UPSTREAM: arm64: avoid TLB conflict with CONFIG_RANDOMIZE_BASE When CONFIG_RANDOMIZE_BASE is selected, we modify the page tables to remap the kernel at a newly-chosen VA range. We do this with the MMU disabled, but do not invalidate TLBs prior to re-enabling the MMU with the new tables. Thus the old mappings entries may still live in TLBs, and we risk violating Break-Before-Make requirements, leading to TLB conflicts and/or other issues. We invalidate TLBs when we uninsall the idmap in early setup code, but prior to this we are subject to issues relating to the Break-Before-Make violation. Avoid these issues by invalidating the TLBs before the new mappings can be used by the hardware. 
Fixes: f80fb3a3d508 ("arm64: add support for kernel ASLR") Cc: # 4.6+ Acked-by: Ard Biesheuvel Acked-by: Will Deacon Signed-off-by: Mark Rutland Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit fd363bd417ddb6103564c69cfcbd92d9a7877431) Signed-off-by: Jeff Vander Stoep Change-Id: I6c23ce55cdd8b66587b6787b8f28df8535e39f24 --- arch/arm64/kernel/head.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index a88a15447c3b..54fd1e2590a2 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -738,6 +738,9 @@ __enable_mmu: isb bl __create_page_tables // recreate kernel mapping + tlbi vmalle1 // Remove any stale TLB entries + dsb nsh + msr sctlr_el1, x19 // re-enable the MMU isb ic iallu // flush instructions fetched From 542ecd89801bcab7306825c0fbc6231cd67e170f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 8 Jun 2016 15:10:57 +0100 Subject: [PATCH 500/607] UPSTREAM: arm64: spinlock: fix spin_unlock_wait for LSE atomics Commit d86b8da04dfa ("arm64: spinlock: serialise spin_unlock_wait against concurrent lockers") fixed spin_unlock_wait for LL/SC-based atomics under the premise that the LSE atomics (in particular, the LDADDA instruction) are indivisible. Unfortunately, these instructions are only indivisible when used with the -AL (full ordering) suffix and, consequently, the same issue can theoretically be observed with LSE atomics, where a later (in program order) load can be speculated before the write portion of the atomic operation. This patch fixes the issue by performing a CAS of the lock once we've established that it's unlocked, in much the same way as the LL/SC code. Fixes: d86b8da04dfa ("arm64: spinlock: serialise spin_unlock_wait against concurrent lockers") Signed-off-by: Will Deacon Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 3a5facd09da848193f5bcb0dea098a298bc1a29d) Signed-off-by: Jeff Vander Stoep Change-Id: Icedaa4c508784bf43d0b5787586480fd668ccc49 --- arch/arm64/include/asm/spinlock.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index fc9682bfe002..deb6e5ac891f 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -37,13 +37,17 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) "2: ldaxr %w0, %2\n" " eor %w1, %w0, %w0, ror #16\n" " cbnz %w1, 1b\n" + /* Serialise against any concurrent lockers */ ARM64_LSE_ATOMIC_INSN( /* LL/SC */ " stxr %w1, %w0, %2\n" -" cbnz %w1, 2b\n", /* Serialise against any concurrent lockers */ - /* LSE atomics */ " nop\n" -" nop\n") +" nop\n", + /* LSE atomics */ +" mov %w1, %w0\n" +" cas %w0, %w0, %2\n" +" eor %w1, %w1, %w0\n") +" cbnz %w1, 2b\n" : "=&r" (lockval), "=&r" (tmp), "+Q" (*lock) : : "memory"); From 8ab85055db1054611473cfd1d72512e45b37e251 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:12 -0800 Subject: [PATCH 501/607] UPSTREAM: asm-generic: Consolidate mark_rodata_ro() Instead of defining mark_rodata_ro() in each architecture, consolidate it. Signed-off-by: Kees Cook Acked-by: Will Deacon Cc: Andrew Morton Cc: Andy Gross Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Ashok Kumar Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Catalin Marinas Cc: Dan Williams Cc: David Brown Cc: David Hildenbrand Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. Peter Anvin Cc: Helge Deller Cc: James E.J. 
Bottomley Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Marc Zyngier Cc: Mark Rutland Cc: Mathias Krause Cc: Michael Ellerman Cc: Nicolas Pitre Cc: PaX Team Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Ross Zwisler Cc: Russell King Cc: Rusty Russell Cc: Stephen Boyd Cc: Thomas Gleixner Cc: Toshi Kani Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: linux-parisc@vger.kernel.org Link: http://lkml.kernel.org/r/1455748879-21872-2-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: Iec0c44b3f5d7948954da93fba6cb57888a2709de (cherry picked from commit e267d97b83d9cecc16c54825f9f3ac7f72dc1e1e) Signed-off-by: Sami Tolvanen --- arch/arm/include/asm/cacheflush.h | 1 - arch/arm64/include/asm/cacheflush.h | 4 ---- arch/parisc/include/asm/cacheflush.h | 4 ---- arch/x86/include/asm/cacheflush.h | 1 - include/linux/init.h | 4 ++++ 5 files changed, 4 insertions(+), 10 deletions(-) diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index d5525bfc7e3e..9156fc303afd 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -491,7 +491,6 @@ static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } #endif #ifdef CONFIG_DEBUG_RODATA -void mark_rodata_ro(void); void set_kernel_text_rw(void); void set_kernel_text_ro(void); #else diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 7fc294c3bc5b..22dda613f9c9 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -156,8 +156,4 @@ int set_memory_rw(unsigned long addr, int numpages); int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); -#ifdef CONFIG_DEBUG_RODATA -void mark_rodata_ro(void); -#endif - #endif diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index 845272ce9cc5..7bd69bd43a01 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -121,10 +121,6 @@ flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vma } } -#ifdef CONFIG_DEBUG_RODATA -void mark_rodata_ro(void); -#endif - #include #define ARCH_HAS_KMAP diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index e63aa38e85fb..c8cff75c5b21 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -92,7 +92,6 @@ void clflush_cache_range(void *addr, unsigned int size); #define mmio_flush_range(addr, size) clflush_cache_range(addr, size) #ifdef CONFIG_DEBUG_RODATA -void mark_rodata_ro(void); extern const int rodata_test_data; extern int kernel_set_to_readonly; void set_kernel_text_rw(void); diff --git a/include/linux/init.h b/include/linux/init.h index b449f378f995..aedb254abc37 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -142,6 +142,10 @@ void prepare_namespace(void); void __init load_default_modules(void); int __init init_rootfs(void); +#ifdef CONFIG_DEBUG_RODATA +void mark_rodata_ro(void); +#endif + extern void (*late_time_init)(void); extern bool initcall_debug; From 36e8afda0dc5ac915ad9a6742e5a8708ed656249 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:13 -0800 Subject: [PATCH 502/607] UPSTREAM: mm/init: Add 'rodata=off' boot cmdline parameter to disable read-only kernel mappings It may be useful to debug writes to the readonly sections of memory, so provide a cmdline 
"rodata=off" to allow for this. This can be expanded in the future to support "log" and "write" modes, but that will need to be architecture-specific. This also makes KDB software breakpoints more usable, as read-only mappings can now be disabled on any kernel. Suggested-by: H. Peter Anvin Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-3-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: I67b818ca390afdd42ab1c27cb4f8ac64bbdb3b65 (cherry picked from commit d2aa1acad22f1bdd0cfa67b3861800e392254454) Signed-off-by: Sami Tolvanen --- Documentation/kernel-parameters.txt | 4 ++++ init/main.c | 27 +++++++++++++++++++++++---- kernel/debug/kdb/kdb_bp.c | 4 +--- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index b02f027cba72..6bb9f95e7690 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3415,6 +3415,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. ro [KNL] Mount root device read-only on boot + rodata= [KNL] + on Mark read-only kernel memory as read-only (default). + off Leave read-only kernel memory writable for debugging. + root= [KNL] Root filesystem See name_to_dev_t comment in init/do_mounts.c. diff --git a/init/main.c b/init/main.c index 9e64d7097f1a..fbafa271531c 100644 --- a/init/main.c +++ b/init/main.c @@ -93,9 +93,6 @@ static int kernel_init(void *); extern void init_IRQ(void); extern void fork_init(void); extern void radix_tree_init(void); -#ifndef CONFIG_DEBUG_RODATA -static inline void mark_rodata_ro(void) { } -#endif /* * Debug helper: via this flag we know that we are in 'early bootup code' @@ -929,6 +926,28 @@ static int try_to_run_init_process(const char *init_filename) static noinline void __init kernel_init_freeable(void); +#ifdef CONFIG_DEBUG_RODATA +static bool rodata_enabled = true; +static int __init set_debug_rodata(char *str) +{ + return strtobool(str, &rodata_enabled); +} +__setup("rodata=", set_debug_rodata); + +static void mark_readonly(void) +{ + if (rodata_enabled) + mark_rodata_ro(); + else + pr_info("Kernel memory protection disabled.\n"); +} +#else +static inline void mark_readonly(void) +{ + pr_warn("This architecture does not have kernel memory protection.\n"); +} +#endif + static int __ref kernel_init(void *unused) { int ret; @@ -937,7 +956,7 @@ static int __ref kernel_init(void *unused) /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); free_initmem(); - mark_rodata_ro(); + mark_readonly(); system_state = SYSTEM_RUNNING; numa_default_policy(); diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index e1dbf4a2c69e..90ff129c88a2 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -153,13 +153,11 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp) } else { kdb_printf("%s: failed to set breakpoint at 0x%lx\n", __func__, bp->bp_addr); -#ifdef CONFIG_DEBUG_RODATA if (!bp->bp_type) { kdb_printf("Software breakpoints are unavailable.\n" - " Change the kernel CONFIG_DEBUG_RODATA=n\n" + " Boot the kernel with rodata=off\n" " OR use hw breaks: help bph\n"); } -#endif 
return 1; } return 0; From fa88d4cf695b9c3a0de87afaa32d59a3913cfd52 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:14 -0800 Subject: [PATCH 503/607] UPSTREAM: x86/mm: Always enable CONFIG_DEBUG_RODATA and remove the Kconfig option This removes the CONFIG_DEBUG_RODATA option and makes it always enabled. This simplifies the code and also makes it clearer that read-only mapped memory is just as fundamental a security feature in kernel-space as it is in user-space. Suggested-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-4-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: I3e79c7c4ead79a81c1445f1b3dd28003517faf18 (cherry picked from commit 9ccaf77cf05915f51231d158abfd5448aedde758) Signed-off-by: Sami Tolvanen --- arch/x86/Kconfig | 3 +++ arch/x86/Kconfig.debug | 18 +++--------------- arch/x86/include/asm/cacheflush.h | 5 ----- arch/x86/include/asm/kvm_para.h | 7 ------- arch/x86/include/asm/sections.h | 2 +- arch/x86/kernel/ftrace.c | 6 +++--- arch/x86/kernel/kgdb.c | 8 ++------ arch/x86/kernel/test_nx.c | 2 -- arch/x86/kernel/test_rodata.c | 2 +- arch/x86/kernel/vmlinux.lds.S | 25 +++++++++++-------------- arch/x86/mm/init_32.c | 3 --- arch/x86/mm/init_64.c | 3 --- arch/x86/mm/pageattr.c | 2 +- 13 files changed, 25 insertions(+), 61 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9e3b1e57499a..7fa0a1830ef6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -307,6 +307,9 @@ config ARCH_SUPPORTS_UPROBES config FIX_EARLYCON_MEM def_bool y +config DEBUG_RODATA + def_bool y + config PGTABLE_LEVELS int default 4 if X86_64 diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 137dfa96aa14..1f6c306a9a00 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -91,28 +91,16 @@ config EFI_PGT_DUMP issues with the mapping of the EFI runtime regions into that table. -config DEBUG_RODATA - bool "Write protect kernel read-only data structures" - default y - depends on DEBUG_KERNEL - ---help--- - Mark the kernel read-only data as write-protected in the pagetables, - in order to catch accidental (and incorrect) writes to such const - data. This is recommended so that we can catch kernel bugs sooner. - If in doubt, say "Y". - config DEBUG_RODATA_TEST - bool "Testcase for the DEBUG_RODATA feature" - depends on DEBUG_RODATA + bool "Testcase for the marking rodata read-only" default y ---help--- - This option enables a testcase for the DEBUG_RODATA - feature as well as for the change_page_attr() infrastructure. + This option enables a testcase for the setting rodata read-only + as well as for the change_page_attr() infrastructure. If in doubt, say "N" config DEBUG_WX bool "Warn on W+X mappings at boot" - depends on DEBUG_RODATA select X86_PTDUMP_CORE ---help--- Generate a warning if any W+X mappings are found at boot. 
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index c8cff75c5b21..61518cf79437 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -91,15 +91,10 @@ void clflush_cache_range(void *addr, unsigned int size); #define mmio_flush_range(addr, size) clflush_cache_range(addr, size) -#ifdef CONFIG_DEBUG_RODATA extern const int rodata_test_data; extern int kernel_set_to_readonly; void set_kernel_text_rw(void); void set_kernel_text_ro(void); -#else -static inline void set_kernel_text_rw(void) { } -static inline void set_kernel_text_ro(void) { } -#endif #ifdef CONFIG_DEBUG_RODATA_TEST int rodata_test(void); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index c1adf33fdd0d..bc62e7cbf1b1 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -17,15 +17,8 @@ static inline bool kvm_check_and_clear_guest_paused(void) } #endif /* CONFIG_KVM_GUEST */ -#ifdef CONFIG_DEBUG_RODATA #define KVM_HYPERCALL \ ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL) -#else -/* On AMD processors, vmcall will generate a trap that we will - * then rewrite to the appropriate instruction. - */ -#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" -#endif /* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall * instruction. The hypervisor may replace it with something else but only the diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 0a5242428659..13b6cdd0af57 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -7,7 +7,7 @@ extern char __brk_base[], __brk_limit[]; extern struct exception_table_entry __stop___ex_table[]; -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) extern char __end_rodata_hpage_align[]; #endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 311bcf338f07..eb6bd34582c6 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -81,9 +81,9 @@ within(unsigned long addr, unsigned long start, unsigned long end) static unsigned long text_ip_addr(unsigned long ip) { /* - * On x86_64, kernel text mappings are mapped read-only with - * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead - * of the kernel text mapping to modify the kernel text. + * On x86_64, kernel text mappings are mapped read-only, so we use + * the kernel identity mapping instead of the kernel text mapping + * to modify the kernel text. * * For 32bit kernels, these mappings are same and we can use * kernel identity mapping to modify code. 
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 44256a62702b..ed15cd486d06 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -750,9 +750,7 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) { int err; -#ifdef CONFIG_DEBUG_RODATA char opc[BREAK_INSTR_SIZE]; -#endif /* CONFIG_DEBUG_RODATA */ bpt->type = BP_BREAKPOINT; err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, @@ -761,7 +759,6 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) return err; err = probe_kernel_write((char *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); -#ifdef CONFIG_DEBUG_RODATA if (!err) return err; /* @@ -778,13 +775,12 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE)) return -EINVAL; bpt->type = BP_POKE_BREAKPOINT; -#endif /* CONFIG_DEBUG_RODATA */ + return err; } int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) { -#ifdef CONFIG_DEBUG_RODATA int err; char opc[BREAK_INSTR_SIZE]; @@ -801,8 +797,8 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE)) goto knl_write; return err; + knl_write: -#endif /* CONFIG_DEBUG_RODATA */ return probe_kernel_write((char *)bpt->bpt_addr, (char *)bpt->saved_instr, BREAK_INSTR_SIZE); } diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c index 3f92ce07e525..27538f183c3b 100644 --- a/arch/x86/kernel/test_nx.c +++ b/arch/x86/kernel/test_nx.c @@ -142,7 +142,6 @@ static int test_NX(void) * by the error message */ -#ifdef CONFIG_DEBUG_RODATA /* Test 3: Check if the .rodata section is executable */ if (rodata_test_data != 0xC3) { printk(KERN_ERR "test_nx: .rodata marker has invalid value\n"); @@ -151,7 +150,6 @@ static int test_NX(void) printk(KERN_ERR "test_nx: .rodata section is executable\n"); ret = -ENODEV; } -#endif #if 0 /* Test 4: Check if the .data section of a module is executable */ diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c index 5ecbfe5099da..cb4a01b41e27 100644 --- a/arch/x86/kernel/test_rodata.c +++ b/arch/x86/kernel/test_rodata.c @@ -76,5 +76,5 @@ int rodata_test(void) } MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure"); +MODULE_DESCRIPTION("Testcase for marking rodata as read-only"); MODULE_AUTHOR("Arjan van de Ven "); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 74e4bf11f562..fe133b710bef 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -41,29 +41,28 @@ ENTRY(phys_startup_64) jiffies_64 = jiffies; #endif -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) /* - * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA - * we retain large page mappings for boundaries spanning kernel text, rodata - * and data sections. + * On 64-bit, align RODATA to 2MB so we retain large page mappings for + * boundaries spanning kernel text, rodata and data sections. * * However, kernel identity mappings will have different RWX permissions * to the pages mapping to text and to the pages padding (which are freed) the * text section. Hence kernel identity mappings will be broken to smaller * pages. For 64-bit, kernel text and kernel identity mappings are different, - * so we can enable protection checks that come with CONFIG_DEBUG_RODATA, - * as well as retain 2MB large page mappings for kernel text. 
+ * so we can enable protection checks as well as retain 2MB large page + * mappings for kernel text. */ -#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); +#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); -#define X64_ALIGN_DEBUG_RODATA_END \ +#define X64_ALIGN_RODATA_END \ . = ALIGN(HPAGE_SIZE); \ __end_rodata_hpage_align = .; #else -#define X64_ALIGN_DEBUG_RODATA_BEGIN -#define X64_ALIGN_DEBUG_RODATA_END +#define X64_ALIGN_RODATA_BEGIN +#define X64_ALIGN_RODATA_END #endif @@ -112,13 +111,11 @@ SECTIONS EXCEPTION_TABLE(16) :text = 0x9090 -#if defined(CONFIG_DEBUG_RODATA) /* .text should occupy whole number of pages */ . = ALIGN(PAGE_SIZE); -#endif - X64_ALIGN_DEBUG_RODATA_BEGIN + X64_ALIGN_RODATA_BEGIN RO_DATA(PAGE_SIZE) - X64_ALIGN_DEBUG_RODATA_END + X64_ALIGN_RODATA_END /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cb4ef3de61f9..2ebfbaf61142 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -871,7 +871,6 @@ static noinline int do_test_wp_bit(void) return flag; } -#ifdef CONFIG_DEBUG_RODATA const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); @@ -960,5 +959,3 @@ void mark_rodata_ro(void) if (__supported_pte_mask & _PAGE_NX) debug_checkwx(); } -#endif - diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ec081fe0ce2c..e08d141844ee 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1062,7 +1062,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -#ifdef CONFIG_DEBUG_RODATA const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); @@ -1154,8 +1153,6 @@ void mark_rodata_ro(void) debug_checkwx(); } -#endif - int kern_addr_valid(unsigned long addr) { unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a3137a4feed1..8d77c7f7a614 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -278,7 +278,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, __pa_symbol(__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) /* * Once the kernel maps the text as RO (kernel_set_to_readonly is set), * kernel text mappings for the large page aligned text, rodata sections From bfd3de058f94daa2a2751e7660bb9561fa23d7cd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:15 -0800 Subject: [PATCH 504/607] UPSTREAM: arch: Introduce post-init read-only memory One of the easiest ways to protect the kernel from attack is to reduce the internal attack surface exposed when a "write" flaw is available. By making as much of the kernel read-only as possible, we reduce the attack surface. Many things are written to only during __init, and never changed again. These cannot be made "const" since the compiler will do the wrong thing (we do actually need to write to them). Instead, move these items into a memory region that will be made read-only during mark_rodata_ro() which happens after all kernel __init code has finished. This introduces __ro_after_init as a way to mark such memory, and adds some documentation about the existing __read_mostly marking. This improves the security of the Linux kernel by marking formerly read-write memory regions as read-only on a fully booted up system. Based on work by PaX Team and Brad Spengler. 
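As a rough illustration of the intended usage (this example is not part of the patch; the variable and initcall names are invented), a value that is computed once during boot and never changed again might be annotated like this:

    #include <linux/cache.h>
    #include <linux/init.h>

    /* Hypothetical example: chosen once at boot, then effectively const. */
    static unsigned long boot_chosen_limit __ro_after_init;

    static int __init pick_limit(void)
    {
            /* Writing here is fine: initcalls run before mark_rodata_ro(). */
            boot_chosen_limit = 42;
            return 0;
    }
    early_initcall(pick_limit);

Once mark_rodata_ro() has run, any later write to boot_chosen_limit (for example through an exploited kernel bug) faults instead of silently succeeding.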
Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brad Spengler Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-5-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: I640f6d858d9770a5e480d12a1c716adf8842feb0 (cherry picked from commit c74ba8b3480da6ddaea17df2263ec09b869ac496) Signed-off-by: Sami Tolvanen --- arch/parisc/include/asm/cache.h | 3 +++ include/asm-generic/vmlinux.lds.h | 1 + include/linux/cache.h | 14 ++++++++++++++ 3 files changed, 18 insertions(+) diff --git a/arch/parisc/include/asm/cache.h b/arch/parisc/include/asm/cache.h index 3d0e17bcc8e9..df0f52bd18b4 100644 --- a/arch/parisc/include/asm/cache.h +++ b/arch/parisc/include/asm/cache.h @@ -22,6 +22,9 @@ #define __read_mostly __attribute__((__section__(".data..read_mostly"))) +/* Read-only memory is marked before mark_rodata_ro() is called. */ +#define __ro_after_init __read_mostly + void parisc_cache_init(void); /* initializes cache-flushing */ void disable_sr_hashing_asm(int); /* low level support for above */ void disable_sr_hashing(void); /* turns off space register hashing */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c4bd0e2c173c..772c784ba763 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -256,6 +256,7 @@ .rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_rodata) = .; \ *(.rodata) *(.rodata.*) \ + *(.data..ro_after_init) /* Read only after init */ \ *(__vermagic) /* Kernel version magic */ \ . = ALIGN(8); \ VMLINUX_SYMBOL(__start___tracepoints_ptrs) = .; \ diff --git a/include/linux/cache.h b/include/linux/cache.h index 17e7e82d2aa7..1be04f8c563a 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -12,10 +12,24 @@ #define SMP_CACHE_BYTES L1_CACHE_BYTES #endif +/* + * __read_mostly is used to keep rarely changing variables out of frequently + * updated cachelines. If an architecture doesn't support it, ignore the + * hint. + */ #ifndef __read_mostly #define __read_mostly #endif +/* + * __ro_after_init is used to mark things that are read-only after init (i.e. + * after mark_rodata_ro() has been called). These are effectively read-only, + * but may get written to during init, so can't live in .rodata (via "const"). + */ +#ifndef __ro_after_init +#define __ro_after_init __attribute__((__section__(".data..ro_after_init"))) +#endif + #ifndef ____cacheline_aligned #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) #endif From ba24621d7a79e3335f478fead5a84d6ee1a212ce Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:16 -0800 Subject: [PATCH 505/607] UPSTREAM: lkdtm: Verify that '__ro_after_init' works correctly The new __ro_after_init section should be writable before init, but not after. Validate that it gets updated at init and can't be written to afterwards. Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-6-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: I75301d0497fde49a02f13b4e75300111ddadda9d (cherry picked from commit 7cca071ccbd2a293ea69168ace6abbcdce53098e) Signed-off-by: Sami Tolvanen --- drivers/misc/lkdtm.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/misc/lkdtm.c b/drivers/misc/lkdtm.c index 11fdadc68e53..2a6eaf1122b4 100644 --- a/drivers/misc/lkdtm.c +++ b/drivers/misc/lkdtm.c @@ -103,6 +103,7 @@ enum ctype { CT_EXEC_USERSPACE, CT_ACCESS_USERSPACE, CT_WRITE_RO, + CT_WRITE_RO_AFTER_INIT, CT_WRITE_KERN, }; @@ -140,6 +141,7 @@ static char* cp_type[] = { "EXEC_USERSPACE", "ACCESS_USERSPACE", "WRITE_RO", + "WRITE_RO_AFTER_INIT", "WRITE_KERN", }; @@ -162,6 +164,7 @@ static DEFINE_SPINLOCK(lock_me_up); static u8 data_area[EXEC_SIZE]; static const unsigned long rodata = 0xAA55AA55; +static unsigned long ro_after_init __ro_after_init = 0x55AA5500; module_param(recur_count, int, 0644); MODULE_PARM_DESC(recur_count, " Recursion level for the stack overflow test"); @@ -503,11 +506,28 @@ static void lkdtm_do_action(enum ctype which) break; } case CT_WRITE_RO: { - unsigned long *ptr; + /* Explicitly cast away "const" for the test. */ + unsigned long *ptr = (unsigned long *)&rodata; - ptr = (unsigned long *)&rodata; + pr_info("attempting bad rodata write at %p\n", ptr); + *ptr ^= 0xabcd1234; - pr_info("attempting bad write at %p\n", ptr); + break; + } + case CT_WRITE_RO_AFTER_INIT: { + unsigned long *ptr = &ro_after_init; + + /* + * Verify we were written to during init. Since an Oops + * is considered a "success", a failure is to just skip the + * real test. + */ + if ((*ptr & 0xAA) != 0xAA) { + pr_info("%p was NOT written during init!?\n", ptr); + break; + } + + pr_info("attempting bad ro_after_init write at %p\n", ptr); *ptr ^= 0xabcd1234; break; @@ -817,6 +837,9 @@ static int __init lkdtm_module_init(void) int n_debugfs_entries = 1; /* Assume only the direct entry */ int i; + /* Make sure we can write to __ro_after_init values during __init */ + ro_after_init |= 0xAA; + /* Register debugfs interface */ lkdtm_debugfs_root = debugfs_create_dir("provoke-crash", NULL); if (!lkdtm_debugfs_root) { From 60e8ff20653571ccab43f9f4554baaa26436e34b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:17 -0800 Subject: [PATCH 506/607] UPSTREAM: x86/vdso: Mark the vDSO code read-only after init The vDSO does not need to be writable after __init, so mark it as __ro_after_init. 
The result kills the exploit method of writing to the vDSO from kernel space resulting in userspace executing the modified code, as shown here to bypass SMEP restrictions: http://itszn.com/blog/?p=21 The memory map (with added vDSO address reporting) shows the vDSO moving into read-only memory: Before: [ 0.143067] vDSO @ ffffffff82004000 [ 0.143551] vDSO @ ffffffff82006000 ---[ High Kernel Mapping ]--- 0xffffffff80000000-0xffffffff81000000 16M pmd 0xffffffff81000000-0xffffffff81800000 8M ro PSE GLB x pmd 0xffffffff81800000-0xffffffff819f3000 1996K ro GLB x pte 0xffffffff819f3000-0xffffffff81a00000 52K ro NX pte 0xffffffff81a00000-0xffffffff81e00000 4M ro PSE GLB NX pmd 0xffffffff81e00000-0xffffffff81e05000 20K ro GLB NX pte 0xffffffff81e05000-0xffffffff82000000 2028K ro NX pte 0xffffffff82000000-0xffffffff8214f000 1340K RW GLB NX pte 0xffffffff8214f000-0xffffffff82281000 1224K RW NX pte 0xffffffff82281000-0xffffffff82400000 1532K RW GLB NX pte 0xffffffff82400000-0xffffffff83200000 14M RW PSE GLB NX pmd 0xffffffff83200000-0xffffffffc0000000 974M pmd After: [ 0.145062] vDSO @ ffffffff81da1000 [ 0.146057] vDSO @ ffffffff81da4000 ---[ High Kernel Mapping ]--- 0xffffffff80000000-0xffffffff81000000 16M pmd 0xffffffff81000000-0xffffffff81800000 8M ro PSE GLB x pmd 0xffffffff81800000-0xffffffff819f3000 1996K ro GLB x pte 0xffffffff819f3000-0xffffffff81a00000 52K ro NX pte 0xffffffff81a00000-0xffffffff81e00000 4M ro PSE GLB NX pmd 0xffffffff81e00000-0xffffffff81e0b000 44K ro GLB NX pte 0xffffffff81e0b000-0xffffffff82000000 2004K ro NX pte 0xffffffff82000000-0xffffffff8214c000 1328K RW GLB NX pte 0xffffffff8214c000-0xffffffff8227e000 1224K RW NX pte 0xffffffff8227e000-0xffffffff82400000 1544K RW GLB NX pte 0xffffffff82400000-0xffffffff83200000 14M RW PSE GLB NX pmd 0xffffffff83200000-0xffffffffc0000000 974M pmd Based on work by PaX Team and Brad Spengler. Signed-off-by: Kees Cook Acked-by: Andy Lutomirski Acked-by: H. Peter Anvin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brad Spengler Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-7-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: Iafbf7314a1106c297ea883031ee96c4f53c04a2b (cherry picked from commit 018ef8dcf3de5f62e2cc1a9273cc27e1c6ba8de5) Signed-off-by: Sami Tolvanen --- arch/x86/entry/vdso/vdso2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 0224987556ce..3f69326ed545 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -140,7 +140,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, fprintf(outfile, "#include \n"); fprintf(outfile, "\n"); fprintf(outfile, - "static unsigned char raw_data[%lu] __page_aligned_data = {", + "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {", mapping_size); for (j = 0; j < stripped_len; j++) { if (j % 10 == 0) From 3cfedbe672ad7fbd73cb5a073ee6a6f60f8130fa Mon Sep 17 00:00:00 2001 From: David Brown Date: Wed, 17 Feb 2016 14:41:18 -0800 Subject: [PATCH 507/607] UPSTREAM: ARM/vdso: Mark the vDSO code read-only after init Although the ARM vDSO is cleanly separated by code/data with the code being read-only in userspace mappings, the code page is still writable from the kernel. There have been exploits (such as http://itszn.com/blog/?p=21) that take advantage of this on x86 to go from a bad kernel write to full root. Prevent this specific exploit class on ARM as well by putting the vDSO code page in post-init read-only memory as well. Before: vdso: 1 text pages at base 80927000 root@Vexpress:/ cat /sys/kernel/debug/kernel_page_tables ---[ Modules ]--- ---[ Kernel Mapping ]--- 0x80000000-0x80100000 1M RW NX SHD 0x80100000-0x80600000 5M ro x SHD 0x80600000-0x80800000 2M ro NX SHD 0x80800000-0xbe000000 984M RW NX SHD After: vdso: 1 text pages at base 8072b000 root@Vexpress:/ cat /sys/kernel/debug/kernel_page_tables ---[ Modules ]--- ---[ Kernel Mapping ]--- 0x80000000-0x80100000 1M RW NX SHD 0x80100000-0x80600000 5M ro x SHD 0x80600000-0x80800000 2M ro NX SHD 0x80800000-0xbe000000 984M RW NX SHD Inspired by https://lkml.org/lkml/2016/1/19/494 based on work by the PaX Team, Brad Spengler, and Kees Cook. Signed-off-by: David Brown Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brad Spengler Cc: Brian Gerst Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: Nathan Lynch Cc: PaX Team Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1455748879-21872-8-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: I8d3cb7707644343aa907b2d584312ccdad63e270 (cherry picked from commit 11bf9b865898961cee60a41c483c9f27ec76e12e) Signed-off-by: Sami Tolvanen --- arch/arm/vdso/vdso.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm/vdso/vdso.S b/arch/arm/vdso/vdso.S index b2b97e3e7bab..a62a7b64f49c 100644 --- a/arch/arm/vdso/vdso.S +++ b/arch/arm/vdso/vdso.S @@ -23,9 +23,8 @@ #include #include - __PAGE_ALIGNED_DATA - .globl vdso_start, vdso_end + .section .data..ro_after_init .balign PAGE_SIZE vdso_start: .incbin "arch/arm/vdso/vdso.so" From 609abcd0c940b2fa0b7a548c58665bfb5c02d22b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 7 Jun 2016 12:20:51 +0200 Subject: [PATCH 508/607] UPSTREAM: vmlinux.lds.h: allow arch specific handling of ro_after_init data section commit c74ba8b3480d ("arch: Introduce post-init read-only memory") introduced the __ro_after_init attribute, which allows variables to be added to the ro_after_init data section. This new section was added to rodata, even though it contains writable data. This in turn causes problems on architectures which mark the page table entries read-only that point to rodata very early. This patch allows architectures to implement their own handling of the .data..ro_after_init section. Usually that would be: - mark the rodata section read-only very early - mark the ro_after_init section read-only within mark_rodata_ro Signed-off-by: Heiko Carstens Reviewed-by: Kees Cook Signed-off-by: Martin Schwidefsky Bug: 31660652 Change-Id: If68cb4d86f88678c9bac8c47072775ab85ef5770 (cherry picked from commit 32fb2fc5c357fb99616bbe100dbcb27bc7f5d045) Signed-off-by: Sami Tolvanen --- include/asm-generic/vmlinux.lds.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 772c784ba763..33eb2fe23071 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -248,6 +248,14 @@ . = ALIGN(align); \ *(.data..init_task) +/* + * Allow architectures to handle ro_after_init data on their + * own by defining an empty RO_AFTER_INIT_DATA. + */ +#ifndef RO_AFTER_INIT_DATA +#define RO_AFTER_INIT_DATA *(.data..ro_after_init) +#endif + /* * Read only Data */ @@ -256,7 +264,7 @@ .rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_rodata) = .; \ *(.rodata) *(.rodata.*) \ - *(.data..ro_after_init) /* Read only after init */ \ + RO_AFTER_INIT_DATA /* Read only after init */ \ *(__vermagic) /* Kernel version magic */ \ . 
= ALIGN(8); \ VMLINUX_SYMBOL(__start___tracepoints_ptrs) = .; \ From 4055d6b56ac1de5b900ce6f0b484996aa357ee66 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Fri, 23 Sep 2016 10:44:37 -0700 Subject: [PATCH 509/607] android-base.cfg: Enable kernel ASLR Bug: 30369029 Change-Id: I0c1c932255866f308d67de1df2ad52c9c19c4799 --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 34fa64a669ac..1ebef5c69fb6 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -139,6 +139,7 @@ CONFIG_PPP_MPPE=y CONFIG_PREEMPT=y CONFIG_PROFILING=y CONFIG_QUOTA=y +CONFIG_RANDOMIZE_BASE=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y CONFIG_SECCOMP=y From 494a59a68340b0244cd6edf7a367e9233b006a3f Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 30 Aug 2016 17:19:13 -0400 Subject: [PATCH 510/607] BACKPORT: audit: consistently record PIDs with task_tgid_nr() Unfortunately we record PIDs in audit records using a variety of methods despite the correct way being the use of task_tgid_nr(). This patch converts all of these callers, except for the case of AUDIT_SET in audit_receive_msg() (see the comment in the code). Reported-by: Jeff Vander Stoep Signed-off-by: Paul Moore Bug: 28952093 (cherry picked from commit fa2bea2f5cca5b8d4a3e5520d2e8c0ede67ac108) Signed-off-by: Jeff Vander Stoep Change-Id: If6645f9de8bc58ed9755f28dc6af5fbf08d72a00 --- kernel/audit.c | 8 +++++++- kernel/auditsc.c | 12 ++++++------ security/lsm_audit.c | 4 ++-- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/kernel/audit.c b/kernel/audit.c index 5ffcbd354a52..34f690b9213a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -870,6 +870,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } if (s.mask & AUDIT_STATUS_PID) { + /* NOTE: we are using task_tgid_vnr() below because + * the s.pid value is relative to the namespace + * of the caller; at present this doesn't matter + * much since you can really only run auditd + * from the initial pid namespace, but something + * to keep in mind if this changes */ int new_pid = s.pid; if ((!new_pid) && (task_tgid_vnr(current) != audit_pid)) @@ -1896,7 +1902,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", task_ppid_nr(tsk), - task_pid_nr(tsk), + task_tgid_nr(tsk), from_kuid(&init_user_ns, audit_get_loginuid(tsk)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 48f45987dc6c..63f0e495f517 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -458,7 +458,7 @@ static int audit_filter_rules(struct task_struct *tsk, switch (f->type) { case AUDIT_PID: - pid = task_pid_nr(tsk); + pid = task_tgid_nr(tsk); result = audit_comparator(pid, f->op, f->val); break; case AUDIT_PPID: @@ -1987,7 +1987,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; - audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); + audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); audit_log_task_context(ab); audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d", oldloginuid, loginuid, oldsessionid, sessionid, !rc); @@ -2212,7 +2212,7 @@ void __audit_ptrace(struct task_struct *t) { struct audit_context *context = current->audit_context; - 
context->target_pid = task_pid_nr(t); + context->target_pid = task_tgid_nr(t); context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); @@ -2237,7 +2237,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { - audit_sig_pid = task_pid_nr(tsk); + audit_sig_pid = task_tgid_nr(tsk); if (uid_valid(tsk->loginuid)) audit_sig_uid = tsk->loginuid; else @@ -2337,7 +2337,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, void __audit_log_capset(const struct cred *new, const struct cred *old) { struct audit_context *context = current->audit_context; - context->capset.pid = task_pid_nr(current); + context->capset.pid = task_tgid_nr(current); context->capset.cap.effective = new->cap_effective; context->capset.cap.inheritable = new->cap_effective; context->capset.cap.permitted = new->cap_permitted; @@ -2369,7 +2369,7 @@ static void audit_log_task(struct audit_buffer *ab) from_kgid(&init_user_ns, gid), sessionid); audit_log_task_context(ab); - audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); + audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); } diff --git a/security/lsm_audit.c b/security/lsm_audit.c index cccbf3068cdc..45d927ab807d 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -220,7 +220,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, */ BUILD_BUG_ON(sizeof(a->u) > sizeof(void *)*2); - audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); + audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, memcpy(comm, current->comm, sizeof(comm))); switch (a->type) { @@ -294,7 +294,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, case LSM_AUDIT_DATA_TASK: { struct task_struct *tsk = a->u.tsk; if (tsk) { - pid_t pid = task_pid_nr(tsk); + pid_t pid = task_tgid_nr(tsk); if (pid) { char comm[sizeof(tsk->comm)]; audit_log_format(ab, " opid=%d ocomm=", pid); From 1daecfe215ea3379fdc842f264607d37dd8fe091 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 27 Sep 2016 13:48:29 -0700 Subject: [PATCH 511/607] ANDROID: dm: android-verity: Remove fec_header location constraint This CL removes the mandate of the fec_header being located right after the ECC data. 
(Cherry-picked from https://android-review.googlesource.com/#/c/280401) Bug: 28865197 Signed-off-by: Badhri Jagan Sridharan Change-Id: Ie04c8cf2dd755f54d02dbdc4e734a13d6f6507b5 --- drivers/md/dm-android-verity.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 15ce2a81c1f4..bb6c1285e499 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -266,10 +266,7 @@ static inline int validate_fec_header(struct fec_header *header, u64 offset) le32_to_cpu(header->version) != FEC_VERSION || le32_to_cpu(header->size) != sizeof(struct fec_header) || le32_to_cpu(header->roots) == 0 || - le32_to_cpu(header->roots) >= FEC_RSM || - offset < le32_to_cpu(header->fec_size) || - offset - le32_to_cpu(header->fec_size) != - le64_to_cpu(header->inp_size)) + le32_to_cpu(header->roots) >= FEC_RSM) return -EINVAL; return 0; From bb849676d5a9351c7d263c20b5ef9e67ec463412 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Nov 2015 13:28:15 +0100 Subject: [PATCH 512/607] UPSTREAM: mm/memblock: add MEMBLOCK_NOMAP attribute to memblock memory table This introduces the MEMBLOCK_NOMAP attribute and the required plumbing to make it usable as an indicator that some parts of normal memory should not be covered by the kernel direct mapping. It is up to the arch to actually honor the attribute when laying out this mapping, but the memblock code itself is modified to disregard these regions for allocations and other general use. Cc: linux-mm@kvack.org Cc: Alexander Kuleshov Cc: Andrew Morton Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: I55cd3abdf514ac54c071fa0037d8dac73bda798d (cherry picked from commit bf3d3cc580f9960883ebf9ea05868f336d9491c2) Signed-off-by: Sami Tolvanen --- include/linux/memblock.h | 8 ++++++++ mm/memblock.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 24daf8fc4d7c..fec66f86eeff 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -25,6 +25,7 @@ enum { MEMBLOCK_NONE = 0x0, /* No special request */ MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */ MEMBLOCK_MIRROR = 0x2, /* mirrored region */ + MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */ }; struct memblock_region { @@ -82,6 +83,7 @@ bool memblock_overlaps_region(struct memblock_type *type, int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); +int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); ulong choose_memblock_flags(void); /* Low level functions */ @@ -184,6 +186,11 @@ static inline bool memblock_is_mirror(struct memblock_region *m) return m->flags & MEMBLOCK_MIRROR; } +static inline bool memblock_is_nomap(struct memblock_region *m) +{ + return m->flags & MEMBLOCK_NOMAP; +} + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); @@ -319,6 +326,7 @@ phys_addr_t memblock_start_of_DRAM(void); phys_addr_t memblock_end_of_DRAM(void); void memblock_enforce_memory_limit(phys_addr_t memory_limit); int memblock_is_memory(phys_addr_t addr); +int memblock_is_map_memory(phys_addr_t addr); int memblock_is_region_memory(phys_addr_t base, phys_addr_t size); int memblock_is_reserved(phys_addr_t addr); bool memblock_is_region_reserved(phys_addr_t 
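As a hedged sketch of how the new attribute is meant to be consumed (the function names below are invented for illustration; the real plumbing is in the diff that follows), arch code marks a range NOMAP instead of merely reserving it, and later code can ask whether an address may be touched through the direct mapping:

    #include <linux/init.h>
    #include <linux/memblock.h>
    #include <linux/types.h>

    /* Hypothetical arch hook: exclude a firmware-owned range from the
     * kernel direct mapping rather than only reserving it. */
    static void __init firmware_mem_exclude(phys_addr_t base, phys_addr_t size)
    {
            memblock_mark_nomap(base, size);
    }

    /* The range still shows up as memory, but not as mappable memory. */
    static bool __init addr_in_linear_map(phys_addr_t addr)
    {
            return memblock_is_map_memory(addr);
    }
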
base, phys_addr_t size); diff --git a/mm/memblock.c b/mm/memblock.c index d300f1329814..07ff069fef25 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -822,6 +822,17 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR); } +/** + * memblock_mark_nomap - Mark a memory region with flag MEMBLOCK_NOMAP. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return 0 on success, -errno on failure. + */ +int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(base, size, 1, MEMBLOCK_NOMAP); +} /** * __next_reserved_mem_region - next function for for_each_reserved_region() @@ -913,6 +924,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) continue; + /* skip nomap memory unless we were asked for it explicitly */ + if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; @@ -1022,6 +1037,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) continue; + /* skip nomap memory unless we were asked for it explicitly */ + if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; @@ -1519,6 +1538,15 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) return memblock_search(&memblock.memory, addr) != -1; } +int __init_memblock memblock_is_map_memory(phys_addr_t addr) +{ + int i = memblock_search(&memblock.memory, addr); + + if (i == -1) + return false; + return !memblock_is_nomap(&memblock.memory.regions[i]); +} + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int __init_memblock memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn) From b6535fe666034be99b777a0dfad672636e17de35 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Nov 2015 13:28:16 +0100 Subject: [PATCH 513/607] BACKPORT: arm64: only consider memblocks with NOMAP cleared for linear mapping Take the new memblock attribute MEMBLOCK_NOMAP into account when deciding whether a certain region is or should be covered by the kernel direct mapping. 
Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: Id7346a09bb3aee5e9a5ef8812251f80cf8265532 (cherry picked from commit 68709f45385aeddb0ca96a060c0c8259944f321b) Signed-off-by: Sami Tolvanen --- arch/arm64/mm/init.c | 2 +- arch/arm64/mm/mmu.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 2c38be3df4c8..08aa45ff05b6 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -131,7 +131,7 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) int pfn_valid(unsigned long pfn) { - return (pfn & PFN_MASK) == pfn && memblock_is_memory(pfn << PAGE_SHIFT); + return (pfn & PFN_MASK) == pfn && memblock_is_map_memory(pfn << PAGE_SHIFT); } EXPORT_SYMBOL(pfn_valid); #endif diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 72e634533a3f..9dafcfde7c26 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -438,6 +438,8 @@ static void __init map_mem(pgd_t *pgd) if (start >= end) break; + if (memblock_is_nomap(reg)) + continue; __map_memblock(pgd, start, end); } From 4ff6ae812864e5122da39ca70072987ce047f374 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Nov 2015 13:28:17 +0100 Subject: [PATCH 514/607] UPSTREAM: arm64/efi: mark UEFI reserved regions as MEMBLOCK_NOMAP Change the EFI memory reservation logic to use memblock_mark_nomap() rather than memblock_reserve() to mark UEFI reserved regions as occupied. In addition to reserving them against allocations done by memblock, this will also prevent them from being covered by the linear mapping. Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: Ia3ce78f40f8d41a9afdd42238fe9cbfd81bbff08 (cherry picked from commit 4dffbfc48d65e5d8157a634fd670065d237a9377) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/efi.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 4eeb17198cfa..04531d35f1df 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -187,7 +187,7 @@ static __init void reserve_regions(void) early_init_dt_add_memory_arch(paddr, size); if (is_reserve_region(md)) { - memblock_reserve(paddr, size); + memblock_mark_nomap(paddr, size); if (efi_enabled(EFI_DBG)) pr_cont("*"); } @@ -209,8 +209,6 @@ void __init efi_init(void) efi_system_table = params.system_table; - memblock_reserve(params.mmap & PAGE_MASK, - PAGE_ALIGN(params.mmap_size + (params.mmap & ~PAGE_MASK))); memmap.phys_map = params.mmap; memmap.map = early_memremap(params.mmap, params.mmap_size); if (memmap.map == NULL) { @@ -230,6 +228,9 @@ void __init efi_init(void) reserve_regions(); early_memunmap(memmap.map, params.mmap_size); + memblock_mark_nomap(params.mmap & PAGE_MASK, + PAGE_ALIGN(params.mmap_size + + (params.mmap & ~PAGE_MASK))); } static bool __init efi_virtmap_init(void) From 0467f02fb679b2de7c583af61cb6e48aca8237fc Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Nov 2015 13:28:18 +0100 Subject: [PATCH 515/607] UPSTREAM: arm64/efi: split off EFI init and runtime code for reuse by 32-bit ARM This splits off the early EFI init and runtime code that - discovers the EFI params and the memory map from the FDT, and installs the memblocks and config tables. - prepares and installs the EFI page tables so that UEFI Runtime Services can be invoked at the virtual address installed by the stub. This will allow it to be reused for 32-bit ARM. 
Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: I143e4b38a5426f70027eff6cc5f732ac370ae69d (cherry picked from commit e5bc22a42e4d46cc203fdfb6d2c76202b08666a0) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/efi.c | 326 +---------------------------- drivers/firmware/efi/Makefile | 3 + drivers/firmware/efi/arm-init.c | 208 ++++++++++++++++++ drivers/firmware/efi/arm-runtime.c | 151 +++++++++++++ 4 files changed, 363 insertions(+), 325 deletions(-) create mode 100644 drivers/firmware/efi/arm-init.c create mode 100644 drivers/firmware/efi/arm-runtime.c diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 04531d35f1df..bd3b2f5adf0c 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -11,318 +11,11 @@ * */ -#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include -#include #include -#include -#include -#include -#include - -struct efi_memory_map memmap; - -static u64 efi_system_table; - -static pgd_t efi_pgd[PTRS_PER_PGD] __page_aligned_bss; - -static struct mm_struct efi_mm = { - .mm_rb = RB_ROOT, - .pgd = efi_pgd, - .mm_users = ATOMIC_INIT(2), - .mm_count = ATOMIC_INIT(1), - .mmap_sem = __RWSEM_INITIALIZER(efi_mm.mmap_sem), - .page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), - .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), -}; - -static int __init is_normal_ram(efi_memory_desc_t *md) -{ - if (md->attribute & EFI_MEMORY_WB) - return 1; - return 0; -} - -/* - * Translate a EFI virtual address into a physical address: this is necessary, - * as some data members of the EFI system table are virtually remapped after - * SetVirtualAddressMap() has been called. - */ -static phys_addr_t efi_to_phys(unsigned long addr) -{ - efi_memory_desc_t *md; - - for_each_efi_memory_desc(&memmap, md) { - if (!(md->attribute & EFI_MEMORY_RUNTIME)) - continue; - if (md->virt_addr == 0) - /* no virtual mapping has been installed by the stub */ - break; - if (md->virt_addr <= addr && - (addr - md->virt_addr) < (md->num_pages << EFI_PAGE_SHIFT)) - return md->phys_addr + addr - md->virt_addr; - } - return addr; -} - -static int __init uefi_init(void) -{ - efi_char16_t *c16; - void *config_tables; - u64 table_size; - char vendor[100] = "unknown"; - int i, retval; - - efi.systab = early_memremap(efi_system_table, - sizeof(efi_system_table_t)); - if (efi.systab == NULL) { - pr_warn("Unable to map EFI system table.\n"); - return -ENOMEM; - } - - set_bit(EFI_BOOT, &efi.flags); - set_bit(EFI_64BIT, &efi.flags); - - /* - * Verify the EFI Table - */ - if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) { - pr_err("System table signature incorrect\n"); - retval = -EINVAL; - goto out; - } - if ((efi.systab->hdr.revision >> 16) < 2) - pr_warn("Warning: EFI system table version %d.%02d, expected 2.00 or greater\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff); - - /* Show what we know for posterity */ - c16 = early_memremap(efi_to_phys(efi.systab->fw_vendor), - sizeof(vendor) * sizeof(efi_char16_t)); - if (c16) { - for (i = 0; i < (int) sizeof(vendor) - 1 && *c16; ++i) - vendor[i] = c16[i]; - vendor[i] = '\0'; - early_memunmap(c16, sizeof(vendor) * sizeof(efi_char16_t)); - } - - pr_info("EFI v%u.%.02u by %s\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff, vendor); - - table_size = sizeof(efi_config_table_64_t) * efi.systab->nr_tables; - config_tables = 
early_memremap(efi_to_phys(efi.systab->tables), - table_size); - if (config_tables == NULL) { - pr_warn("Unable to map EFI config table array.\n"); - retval = -ENOMEM; - goto out; - } - retval = efi_config_parse_tables(config_tables, efi.systab->nr_tables, - sizeof(efi_config_table_64_t), NULL); - - early_memunmap(config_tables, table_size); -out: - early_memunmap(efi.systab, sizeof(efi_system_table_t)); - return retval; -} - -/* - * Return true for RAM regions we want to permanently reserve. - */ -static __init int is_reserve_region(efi_memory_desc_t *md) -{ - switch (md->type) { - case EFI_LOADER_CODE: - case EFI_LOADER_DATA: - case EFI_BOOT_SERVICES_CODE: - case EFI_BOOT_SERVICES_DATA: - case EFI_CONVENTIONAL_MEMORY: - case EFI_PERSISTENT_MEMORY: - return 0; - default: - break; - } - return is_normal_ram(md); -} - -static __init void reserve_regions(void) -{ - efi_memory_desc_t *md; - u64 paddr, npages, size; - - if (efi_enabled(EFI_DBG)) - pr_info("Processing EFI memory map:\n"); - - for_each_efi_memory_desc(&memmap, md) { - paddr = md->phys_addr; - npages = md->num_pages; - - if (efi_enabled(EFI_DBG)) { - char buf[64]; - - pr_info(" 0x%012llx-0x%012llx %s", - paddr, paddr + (npages << EFI_PAGE_SHIFT) - 1, - efi_md_typeattr_format(buf, sizeof(buf), md)); - } - - memrange_efi_to_native(&paddr, &npages); - size = npages << PAGE_SHIFT; - - if (is_normal_ram(md)) - early_init_dt_add_memory_arch(paddr, size); - - if (is_reserve_region(md)) { - memblock_mark_nomap(paddr, size); - if (efi_enabled(EFI_DBG)) - pr_cont("*"); - } - - if (efi_enabled(EFI_DBG)) - pr_cont("\n"); - } - - set_bit(EFI_MEMMAP, &efi.flags); -} - -void __init efi_init(void) -{ - struct efi_fdt_params params; - - /* Grab UEFI information placed in FDT by stub */ - if (!efi_get_fdt_params(¶ms)) - return; - - efi_system_table = params.system_table; - - memmap.phys_map = params.mmap; - memmap.map = early_memremap(params.mmap, params.mmap_size); - if (memmap.map == NULL) { - /* - * If we are booting via UEFI, the UEFI memory map is the only - * description of memory we have, so there is little point in - * proceeding if we cannot access it. - */ - panic("Unable to map EFI memory map.\n"); - } - memmap.map_end = memmap.map + params.mmap_size; - memmap.desc_size = params.desc_size; - memmap.desc_version = params.desc_ver; - - if (uefi_init() < 0) - return; - - reserve_regions(); - early_memunmap(memmap.map, params.mmap_size); - memblock_mark_nomap(params.mmap & PAGE_MASK, - PAGE_ALIGN(params.mmap_size + - (params.mmap & ~PAGE_MASK))); -} - -static bool __init efi_virtmap_init(void) -{ - efi_memory_desc_t *md; - - init_new_context(NULL, &efi_mm); - - for_each_efi_memory_desc(&memmap, md) { - pgprot_t prot; - - if (!(md->attribute & EFI_MEMORY_RUNTIME)) - continue; - if (md->virt_addr == 0) - return false; - - pr_info(" EFI remap 0x%016llx => %p\n", - md->phys_addr, (void *)md->virt_addr); - - /* - * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be - * executable, everything else can be mapped with the XN bits - * set. 
- */ - if (!is_normal_ram(md)) - prot = __pgprot(PROT_DEVICE_nGnRE); - else if (md->type == EFI_RUNTIME_SERVICES_CODE || - !PAGE_ALIGNED(md->phys_addr)) - prot = PAGE_KERNEL_EXEC; - else - prot = PAGE_KERNEL; - - create_pgd_mapping(&efi_mm, md->phys_addr, md->virt_addr, - md->num_pages << EFI_PAGE_SHIFT, - __pgprot(pgprot_val(prot) | PTE_NG)); - } - return true; -} - -/* - * Enable the UEFI Runtime Services if all prerequisites are in place, i.e., - * non-early mapping of the UEFI system table and virtual mappings for all - * EFI_MEMORY_RUNTIME regions. - */ -static int __init arm64_enable_runtime_services(void) -{ - u64 mapsize; - - if (!efi_enabled(EFI_BOOT)) { - pr_info("EFI services will not be available.\n"); - return 0; - } - - if (efi_runtime_disabled()) { - pr_info("EFI runtime services will be disabled.\n"); - return 0; - } - - pr_info("Remapping and enabling EFI services.\n"); - - mapsize = memmap.map_end - memmap.map; - memmap.map = (__force void *)ioremap_cache(memmap.phys_map, - mapsize); - if (!memmap.map) { - pr_err("Failed to remap EFI memory map\n"); - return -ENOMEM; - } - memmap.map_end = memmap.map + mapsize; - efi.memmap = &memmap; - - efi.systab = (__force void *)ioremap_cache(efi_system_table, - sizeof(efi_system_table_t)); - if (!efi.systab) { - pr_err("Failed to remap EFI System Table\n"); - return -ENOMEM; - } - set_bit(EFI_SYSTEM_TABLES, &efi.flags); - - if (!efi_virtmap_init()) { - pr_err("No UEFI virtual mapping was installed -- runtime services will not be available\n"); - return -ENOMEM; - } - - /* Set up runtime services function pointers */ - efi_native_runtime_setup(); - set_bit(EFI_RUNTIME_SERVICES, &efi.flags); - - efi.runtime_version = efi.systab->hdr.revision; - - return 0; -} -early_initcall(arm64_enable_runtime_services); static int __init arm64_dmi_init(void) { @@ -338,23 +31,6 @@ static int __init arm64_dmi_init(void) } core_initcall(arm64_dmi_init); -static void efi_set_pgd(struct mm_struct *mm) -{ - switch_mm(NULL, mm, NULL); -} - -void efi_virtmap_load(void) -{ - preempt_disable(); - efi_set_pgd(&efi_mm); -} - -void efi_virtmap_unload(void) -{ - efi_set_pgd(current->active_mm); - preempt_enable(); -} - /* * UpdateCapsule() depends on the system being shutdown via * ResetSystem(). diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index ec379a4164cc..f292917b00e7 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -18,3 +18,6 @@ obj-$(CONFIG_EFI_RUNTIME_MAP) += runtime-map.o obj-$(CONFIG_EFI_RUNTIME_WRAPPERS) += runtime-wrappers.o obj-$(CONFIG_EFI_STUB) += libstub/ obj-$(CONFIG_EFI_FAKE_MEMMAP) += fake_mem.o + +arm-obj-$(CONFIG_EFI) := arm-init.o arm-runtime.o +obj-$(CONFIG_ARM64) += $(arm-obj-y) diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c new file mode 100644 index 000000000000..ffdd76a51929 --- /dev/null +++ b/drivers/firmware/efi/arm-init.c @@ -0,0 +1,208 @@ +/* + * Extensible Firmware Interface + * + * Based on Extensible Firmware Interface Specification version 2.4 + * + * Copyright (C) 2013 - 2015 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ + +#include +#include +#include +#include +#include +#include + +#include + +struct efi_memory_map memmap; + +u64 efi_system_table; + +static int __init is_normal_ram(efi_memory_desc_t *md) +{ + if (md->attribute & EFI_MEMORY_WB) + return 1; + return 0; +} + +/* + * Translate a EFI virtual address into a physical address: this is necessary, + * as some data members of the EFI system table are virtually remapped after + * SetVirtualAddressMap() has been called. + */ +static phys_addr_t efi_to_phys(unsigned long addr) +{ + efi_memory_desc_t *md; + + for_each_efi_memory_desc(&memmap, md) { + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; + if (md->virt_addr == 0) + /* no virtual mapping has been installed by the stub */ + break; + if (md->virt_addr <= addr && + (addr - md->virt_addr) < (md->num_pages << EFI_PAGE_SHIFT)) + return md->phys_addr + addr - md->virt_addr; + } + return addr; +} + +static int __init uefi_init(void) +{ + efi_char16_t *c16; + void *config_tables; + u64 table_size; + char vendor[100] = "unknown"; + int i, retval; + + efi.systab = early_memremap(efi_system_table, + sizeof(efi_system_table_t)); + if (efi.systab == NULL) { + pr_warn("Unable to map EFI system table.\n"); + return -ENOMEM; + } + + set_bit(EFI_BOOT, &efi.flags); + set_bit(EFI_64BIT, &efi.flags); + + /* + * Verify the EFI Table + */ + if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) { + pr_err("System table signature incorrect\n"); + retval = -EINVAL; + goto out; + } + if ((efi.systab->hdr.revision >> 16) < 2) + pr_warn("Warning: EFI system table version %d.%02d, expected 2.00 or greater\n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff); + + /* Show what we know for posterity */ + c16 = early_memremap(efi_to_phys(efi.systab->fw_vendor), + sizeof(vendor) * sizeof(efi_char16_t)); + if (c16) { + for (i = 0; i < (int) sizeof(vendor) - 1 && *c16; ++i) + vendor[i] = c16[i]; + vendor[i] = '\0'; + early_memunmap(c16, sizeof(vendor) * sizeof(efi_char16_t)); + } + + pr_info("EFI v%u.%.02u by %s\n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff, vendor); + + table_size = sizeof(efi_config_table_64_t) * efi.systab->nr_tables; + config_tables = early_memremap(efi_to_phys(efi.systab->tables), + table_size); + if (config_tables == NULL) { + pr_warn("Unable to map EFI config table array.\n"); + retval = -ENOMEM; + goto out; + } + retval = efi_config_parse_tables(config_tables, efi.systab->nr_tables, + sizeof(efi_config_table_64_t), NULL); + + early_memunmap(config_tables, table_size); +out: + early_memunmap(efi.systab, sizeof(efi_system_table_t)); + return retval; +} + +/* + * Return true for RAM regions we want to permanently reserve. 
+ */ +static __init int is_reserve_region(efi_memory_desc_t *md) +{ + switch (md->type) { + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + case EFI_PERSISTENT_MEMORY: + return 0; + default: + break; + } + return is_normal_ram(md); +} + +static __init void reserve_regions(void) +{ + efi_memory_desc_t *md; + u64 paddr, npages, size; + + if (efi_enabled(EFI_DBG)) + pr_info("Processing EFI memory map:\n"); + + for_each_efi_memory_desc(&memmap, md) { + paddr = md->phys_addr; + npages = md->num_pages; + + if (efi_enabled(EFI_DBG)) { + char buf[64]; + + pr_info(" 0x%012llx-0x%012llx %s", + paddr, paddr + (npages << EFI_PAGE_SHIFT) - 1, + efi_md_typeattr_format(buf, sizeof(buf), md)); + } + + memrange_efi_to_native(&paddr, &npages); + size = npages << PAGE_SHIFT; + + if (is_normal_ram(md)) + early_init_dt_add_memory_arch(paddr, size); + + if (is_reserve_region(md)) { + memblock_mark_nomap(paddr, size); + if (efi_enabled(EFI_DBG)) + pr_cont("*"); + } + + if (efi_enabled(EFI_DBG)) + pr_cont("\n"); + } + + set_bit(EFI_MEMMAP, &efi.flags); +} + +void __init efi_init(void) +{ + struct efi_fdt_params params; + + /* Grab UEFI information placed in FDT by stub */ + if (!efi_get_fdt_params(¶ms)) + return; + + efi_system_table = params.system_table; + + memmap.phys_map = params.mmap; + memmap.map = early_memremap(params.mmap, params.mmap_size); + if (memmap.map == NULL) { + /* + * If we are booting via UEFI, the UEFI memory map is the only + * description of memory we have, so there is little point in + * proceeding if we cannot access it. + */ + panic("Unable to map EFI memory map.\n"); + } + memmap.map_end = memmap.map + params.mmap_size; + memmap.desc_size = params.desc_size; + memmap.desc_version = params.desc_ver; + + if (uefi_init() < 0) + return; + + reserve_regions(); + early_memunmap(memmap.map, params.mmap_size); + memblock_mark_nomap(params.mmap & PAGE_MASK, + PAGE_ALIGN(params.mmap_size + + (params.mmap & ~PAGE_MASK))); +} diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c new file mode 100644 index 000000000000..974743e13a4d --- /dev/null +++ b/drivers/firmware/efi/arm-runtime.c @@ -0,0 +1,151 @@ +/* + * Extensible Firmware Interface + * + * Based on Extensible Firmware Interface Specification version 2.4 + * + * Copyright (C) 2013, 2014 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static pgd_t efi_pgd[PTRS_PER_PGD] __page_aligned_bss; + +extern u64 efi_system_table; + +static struct mm_struct efi_mm = { + .mm_rb = RB_ROOT, + .pgd = efi_pgd, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + .mmap_sem = __RWSEM_INITIALIZER(efi_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), +}; + +static bool __init efi_virtmap_init(void) +{ + efi_memory_desc_t *md; + + init_new_context(NULL, &efi_mm); + + for_each_efi_memory_desc(&memmap, md) { + pgprot_t prot; + + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; + if (md->virt_addr == 0) + return false; + + pr_info(" EFI remap 0x%016llx => %p\n", + md->phys_addr, (void *)md->virt_addr); + + /* + * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be + * executable, everything else can be mapped with the XN bits + * set. + */ + if ((md->attribute & EFI_MEMORY_WB) == 0) + prot = __pgprot(PROT_DEVICE_nGnRE); + else if (md->type == EFI_RUNTIME_SERVICES_CODE || + !PAGE_ALIGNED(md->phys_addr)) + prot = PAGE_KERNEL_EXEC; + else + prot = PAGE_KERNEL; + + create_pgd_mapping(&efi_mm, md->phys_addr, md->virt_addr, + md->num_pages << EFI_PAGE_SHIFT, + __pgprot(pgprot_val(prot) | PTE_NG)); + } + return true; +} + +/* + * Enable the UEFI Runtime Services if all prerequisites are in place, i.e., + * non-early mapping of the UEFI system table and virtual mappings for all + * EFI_MEMORY_RUNTIME regions. + */ +static int __init arm64_enable_runtime_services(void) +{ + u64 mapsize; + + if (!efi_enabled(EFI_BOOT)) { + pr_info("EFI services will not be available.\n"); + return 0; + } + + if (efi_runtime_disabled()) { + pr_info("EFI runtime services will be disabled.\n"); + return 0; + } + + pr_info("Remapping and enabling EFI services.\n"); + + mapsize = memmap.map_end - memmap.map; + memmap.map = (__force void *)ioremap_cache(memmap.phys_map, + mapsize); + if (!memmap.map) { + pr_err("Failed to remap EFI memory map\n"); + return -ENOMEM; + } + memmap.map_end = memmap.map + mapsize; + efi.memmap = &memmap; + + efi.systab = (__force void *)ioremap_cache(efi_system_table, + sizeof(efi_system_table_t)); + if (!efi.systab) { + pr_err("Failed to remap EFI System Table\n"); + return -ENOMEM; + } + set_bit(EFI_SYSTEM_TABLES, &efi.flags); + + if (!efi_virtmap_init()) { + pr_err("No UEFI virtual mapping was installed -- runtime services will not be available\n"); + return -ENOMEM; + } + + /* Set up runtime services function pointers */ + efi_native_runtime_setup(); + set_bit(EFI_RUNTIME_SERVICES, &efi.flags); + + efi.runtime_version = efi.systab->hdr.revision; + + return 0; +} +early_initcall(arm64_enable_runtime_services); + +static void efi_set_pgd(struct mm_struct *mm) +{ + switch_mm(NULL, mm, NULL); +} + +void efi_virtmap_load(void) +{ + preempt_disable(); + efi_set_pgd(&efi_mm); +} + +void efi_virtmap_unload(void) +{ + efi_set_pgd(current->active_mm); + preempt_enable(); +} From e5ef8437e39123daae78dbd9010c45226681d076 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Nov 2015 13:28:19 +0100 Subject: [PATCH 516/607] UPSTREAM: arm64/efi: refactor EFI init and runtime code for reuse by 32-bit ARM This refactors the EFI init and runtime code that will be shared between arm64 and ARM so that it can be built for both archs. 
Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: Ieee70bbe117170d2054a9c82c4f1a8143b7e302b (cherry picked from commit f7d924894265794f447ea799dd853400749b5a22) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/efi.h | 9 ++++++ arch/arm64/kernel/efi.c | 23 ++++++++++++++ drivers/firmware/efi/arm-init.c | 7 +++-- drivers/firmware/efi/arm-runtime.c | 48 ++++++++++-------------------- drivers/firmware/efi/efi.c | 2 ++ 5 files changed, 54 insertions(+), 35 deletions(-) diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index ef572206f1c3..8e88a696c9cb 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -2,7 +2,9 @@ #define _ASM_EFI_H #include +#include #include +#include #ifdef CONFIG_EFI extern void efi_init(void); @@ -10,6 +12,8 @@ extern void efi_init(void); #define efi_init() #endif +int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); + #define efi_call_virt(f, ...) \ ({ \ efi_##f##_t *__f; \ @@ -63,6 +67,11 @@ extern void efi_init(void); * Services are enabled and the EFI_RUNTIME_SERVICES bit set. */ +static inline void efi_set_pgd(struct mm_struct *mm) +{ + switch_mm(NULL, mm, NULL); +} + void efi_virtmap_load(void); void efi_virtmap_unload(void); diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index bd3b2f5adf0c..b6abc852f2a1 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -17,6 +17,29 @@ #include +int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) +{ + pteval_t prot_val; + + /* + * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be + * executable, everything else can be mapped with the XN bits + * set. + */ + if ((md->attribute & EFI_MEMORY_WB) == 0) + prot_val = PROT_DEVICE_nGnRE; + else if (md->type == EFI_RUNTIME_SERVICES_CODE || + !PAGE_ALIGNED(md->phys_addr)) + prot_val = pgprot_val(PAGE_KERNEL_EXEC); + else + prot_val = pgprot_val(PAGE_KERNEL); + + create_pgd_mapping(mm, md->phys_addr, md->virt_addr, + md->num_pages << EFI_PAGE_SHIFT, + __pgprot(prot_val | PTE_NG)); + return 0; +} + static int __init arm64_dmi_init(void) { /* diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index ffdd76a51929..9e15d571b53c 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -57,7 +57,7 @@ static int __init uefi_init(void) { efi_char16_t *c16; void *config_tables; - u64 table_size; + size_t table_size; char vendor[100] = "unknown"; int i, retval; @@ -69,7 +69,8 @@ static int __init uefi_init(void) } set_bit(EFI_BOOT, &efi.flags); - set_bit(EFI_64BIT, &efi.flags); + if (IS_ENABLED(CONFIG_64BIT)) + set_bit(EFI_64BIT, &efi.flags); /* * Verify the EFI Table @@ -107,7 +108,7 @@ static int __init uefi_init(void) goto out; } retval = efi_config_parse_tables(config_tables, efi.systab->nr_tables, - sizeof(efi_config_table_64_t), NULL); + sizeof(efi_config_table_t), NULL); early_memunmap(config_tables, table_size); out: diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 974743e13a4d..6ae21e41a429 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include @@ -23,18 +24,14 @@ #include #include -#include -#include #include +#include #include -static pgd_t efi_pgd[PTRS_PER_PGD] __page_aligned_bss; - extern u64 efi_system_table; static struct mm_struct efi_mm = { .mm_rb = RB_ROOT, - .pgd = efi_pgd, .mm_users = ATOMIC_INIT(2), 
.mm_count = ATOMIC_INIT(1), .mmap_sem = __RWSEM_INITIALIZER(efi_mm.mmap_sem), @@ -46,35 +43,27 @@ static bool __init efi_virtmap_init(void) { efi_memory_desc_t *md; + efi_mm.pgd = pgd_alloc(&efi_mm); init_new_context(NULL, &efi_mm); for_each_efi_memory_desc(&memmap, md) { - pgprot_t prot; + phys_addr_t phys = md->phys_addr; + int ret; if (!(md->attribute & EFI_MEMORY_RUNTIME)) continue; if (md->virt_addr == 0) return false; - pr_info(" EFI remap 0x%016llx => %p\n", - md->phys_addr, (void *)md->virt_addr); - - /* - * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be - * executable, everything else can be mapped with the XN bits - * set. - */ - if ((md->attribute & EFI_MEMORY_WB) == 0) - prot = __pgprot(PROT_DEVICE_nGnRE); - else if (md->type == EFI_RUNTIME_SERVICES_CODE || - !PAGE_ALIGNED(md->phys_addr)) - prot = PAGE_KERNEL_EXEC; - else - prot = PAGE_KERNEL; - - create_pgd_mapping(&efi_mm, md->phys_addr, md->virt_addr, - md->num_pages << EFI_PAGE_SHIFT, - __pgprot(pgprot_val(prot) | PTE_NG)); + ret = efi_create_mapping(&efi_mm, md); + if (!ret) { + pr_info(" EFI remap %pa => %p\n", + &phys, (void *)(unsigned long)md->virt_addr); + } else { + pr_warn(" EFI remap %pa: failed to create mapping (%d)\n", + &phys, ret); + return false; + } } return true; } @@ -84,7 +73,7 @@ static bool __init efi_virtmap_init(void) * non-early mapping of the UEFI system table and virtual mappings for all * EFI_MEMORY_RUNTIME regions. */ -static int __init arm64_enable_runtime_services(void) +static int __init arm_enable_runtime_services(void) { u64 mapsize; @@ -131,12 +120,7 @@ static int __init arm64_enable_runtime_services(void) return 0; } -early_initcall(arm64_enable_runtime_services); - -static void efi_set_pgd(struct mm_struct *mm) -{ - switch_mm(NULL, mm, NULL); -} +early_initcall(arm_enable_runtime_services); void efi_virtmap_load(void) { diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 027ca212179f..cffa89b3317b 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -25,6 +25,8 @@ #include #include +#include + struct efi __read_mostly efi = { .mps = EFI_INVALID_TABLE_ADDR, .acpi = EFI_INVALID_TABLE_ADDR, From d5f46e4b247971c9c10f902151dcf7df8c750b1c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 5 Nov 2015 15:09:17 +0000 Subject: [PATCH 517/607] UPSTREAM: arm64: Add macros to read/write system registers Rather than crafting custom macros for reading/writing each system register provide generics accessors, read_sysreg and write_sysreg, for this purpose. 
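For illustration only (not part of the patch), a caller in C can then replace an open-coded mrs/msr sequence with the new accessors; the register name is passed as a bare token and stringified inside the macro:

	u64 midr = read_sysreg(midr_el1);   /* was: asm("mrs %0, midr_el1" : "=r" (midr)); */

	write_sysreg(val, sctlr_el1);       /* was: asm volatile("msr sctlr_el1, %0" : : "r" (val)); */
	isb();                              /* synchronize the context after updating a control register */

As the comment added by the patch notes, these reads are never expected to be optimized away or replaced with synthetic values, unlike read_cpuid().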
Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Cc: Suzuki Poulose Cc: Will Deacon Signed-off-by: Marc Zyngier Change-Id: I1d6cf948bc6660dfd096ff5a18eba682941098c1 (cherry picked from commit 3600c2fdc09a43a30909743569e35a29121602ed) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/sysreg.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index b9fd8ec79033..1a78d6e2a78b 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -20,6 +20,8 @@ #ifndef __ASM_SYSREG_H #define __ASM_SYSREG_H +#include + #include /* @@ -215,6 +217,8 @@ #else +#include + asm( " .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n" " .equ .L__reg_num_x\\num, \\num\n" @@ -239,6 +243,23 @@ static inline void config_sctlr_el1(u32 clear, u32 set) val |= set; asm volatile("msr sctlr_el1, %0" : : "r" (val)); } + +/* + * Unlike read_cpuid, calls to read_sysreg are never expected to be + * optimized away or replaced with synthetic values. + */ +#define read_sysreg(r) ({ \ + u64 __val; \ + asm volatile("mrs %0, " __stringify(r) : "=r" (__val)); \ + __val; \ +}) + +#define write_sysreg(v, r) do { \ + u64 __val = (u64)v; \ + asm volatile("msr " __stringify(r) ", %0" \ + : : "r" (__val)); \ +} while (0) + #endif #endif /* __ASM_SYSREG_H */ From 3e64a8612dfa8738b95ad904a454b859ec9d9174 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Wed, 24 Feb 2016 17:44:57 -0800 Subject: [PATCH 518/607] UPSTREAM: arm64: Add workaround for Cavium erratum 27456 On ThunderX T88 pass 1.x through 2.1 parts, broadcast TLBI instructions may cause the icache to become corrupted if it contains data for a non-current ASID. This patch implements the workaround (which invalidates the local icache when switching the mm) by using code patching. Signed-off-by: Andrew Pinski Signed-off-by: David Daney Reviewed-by: Will Deacon Signed-off-by: Catalin Marinas Change-Id: I60e6d17926b067a4e022d7b159e239114303a547 (cherry picked from commit 104a0c02e8b1936c049e18a6d4e4ab040fb61213) Signed-off-by: Sami Tolvanen --- Documentation/arm64/silicon-errata.txt | 1 + arch/arm64/Kconfig | 11 +++++++++++ arch/arm64/include/asm/cpufeature.h | 3 ++- arch/arm64/kernel/cpu_errata.c | 9 +++++++++ arch/arm64/mm/proc.S | 12 ++++++++++++ 5 files changed, 35 insertions(+), 1 deletion(-) diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt index 58b71ddf9b60..ba4b6acfc545 100644 --- a/Documentation/arm64/silicon-errata.txt +++ b/Documentation/arm64/silicon-errata.txt @@ -56,3 +56,4 @@ stable kernels. | | | | | | Cavium | ThunderX ITS | #22375, #24313 | CAVIUM_ERRATUM_22375 | | Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 | +| Cavium | ThunderX Core | #27456 | CAVIUM_ERRATUM_27456 | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index bb92d3875791..bf24d67a1ff1 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -439,6 +439,17 @@ config CAVIUM_ERRATUM_23154 If unsure, say Y. +config CAVIUM_ERRATUM_27456 + bool "Cavium erratum 27456: Broadcast TLBI instructions may cause icache corruption" + default y + help + On ThunderX T88 pass 1.x through 2.1 parts, broadcast TLBI + instructions may cause the icache to become corrupted if it + contains data for a non-current ASID. The fix is to + invalidate the icache when changing the mm context. + + If unsure, say Y. 
+ endmenu diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 37a53fc6b384..727e594ac5c2 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -33,8 +33,9 @@ #define ARM64_HAS_NO_HW_PREFETCH 8 #define ARM64_HAS_UAO 9 #define ARM64_ALT_PAN_NOT_UAO 10 +#define ARM64_WORKAROUND_CAVIUM_27456 12 -#define ARM64_NCAPS 11 +#define ARM64_NCAPS 13 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index e6bc988e8dbf..06afd04e02c0 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -87,6 +87,15 @@ const struct arm64_cpu_capabilities arm64_errata[] = { .capability = ARM64_WORKAROUND_CAVIUM_23154, MIDR_RANGE(MIDR_THUNDERX, 0x00, 0x01), }, +#endif +#ifdef CONFIG_CAVIUM_ERRATUM_27456 + { + /* Cavium ThunderX, T88 pass 1.x - 2.1 */ + .desc = "Cavium erratum 27456", + .capability = ARM64_WORKAROUND_CAVIUM_27456, + MIDR_RANGE(MIDR_THUNDERX, 0x00, + (1 << MIDR_VARIANT_SHIFT) | 1), + }, #endif { } diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 0c19534a901e..543f5198005a 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include "proc-macros.S" @@ -137,7 +139,17 @@ ENTRY(cpu_do_switch_mm) bfi x0, x1, #48, #16 // set the ASID msr ttbr0_el1, x0 // set TTBR0 isb +alternative_if_not ARM64_WORKAROUND_CAVIUM_27456 ret + nop + nop + nop +alternative_else + ic iallu + dsb nsh + isb + ret +alternative_endif ENDPROC(cpu_do_switch_mm) .pushsection ".idmap.text", "ax" From b4ad81e8d05dc75ee527c56f6936ebc7fa3f46a4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Mar 2016 10:58:09 +0100 Subject: [PATCH 519/607] UPSTREAM: arm64/kernel: fix incorrect EL0 check in inv_entry macro The implementation of macro inv_entry refers to its 'el' argument without the required leading backslash, which results in an undefined symbol 'el' to be passed into the kernel_entry macro rather than the index of the exception level as intended. This undefined symbol strangely enough does not result in build failures, although it is visible in vmlinux: $ nm -n vmlinux |head U el 0000000000000000 A _kernel_flags_le_hi32 0000000000000000 A _kernel_offset_le_hi32 0000000000000000 A _kernel_size_le_hi32 000000000000000a A _kernel_flags_le_lo32 ..... However, it does result in incorrect code being generated for invalid exceptions taken from EL0, since the argument check in kernel_entry assumes EL1 if its argument does not equal '0'. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Change-Id: I406c1207682a4dff3054a019c26fdf1310b08ed1 (cherry picked from commit b660950c60a7278f9d8deb7c32a162031207c758) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/entry.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 1f7f5a2b61bf..12e8d2bcb3f9 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -277,7 +277,7 @@ END(vectors) * Invalid mode handlers */ .macro inv_entry, el, reason, regsize = 64 - kernel_entry el, \regsize + kernel_entry \el, \regsize mov x0, sp mov x1, #\reason mrs x2, esr_el1 From c1ee3c7d19d2735996d99f6c94a50ac77a9ea338 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 14:25:46 +0200 Subject: [PATCH 520/607] UPSTREAM: arm64/mm: ensure memstart_addr remains sufficiently aligned After choosing memstart_addr to be the highest multiple of ARM64_MEMSTART_ALIGN less than or equal to the first usable physical memory address, we clip the memblocks to the maximum size of the linear region. Since the kernel may be high up in memory, we take care not to clip the kernel itself, which means we have to clip some memory from the bottom if this occurs, to ensure that the distance between the first and the last usable physical memory address can be covered by the linear region. However, we fail to update memstart_addr if this clipping from the bottom occurs, which means that we may still end up with virtual addresses that wrap into the userland range. So increment memstart_addr as appropriate to prevent this from happening. Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: I72306207cc46a30b780f5e00b9ef23aa8409867e (cherry picked from commit 2958987f5da2ebcf6a237c5f154d7e3340e60945) Signed-off-by: Sami Tolvanen --- arch/arm64/mm/init.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 08aa45ff05b6..0db12e9e8765 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -192,8 +192,12 @@ void __init arm64_memblock_init(void) */ memblock_remove(max_t(u64, memstart_addr + linear_region_size, __pa(_end)), ULLONG_MAX); - if (memblock_end_of_DRAM() > linear_region_size) - memblock_remove(0, memblock_end_of_DRAM() - linear_region_size); + if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) { + /* ensure that memstart_addr remains sufficiently aligned */ + memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size, + ARM64_MEMSTART_ALIGN); + memblock_remove(0, memstart_addr); + } /* * Apply the memory limit if it was set. Since the kernel may be loaded From 9297d46fb6479dad23622f61760dd9fb1700442d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 14:25:47 +0200 Subject: [PATCH 521/607] UPSTREAM: arm64: choose memstart_addr based on minimum sparsemem section alignment This redefines ARM64_MEMSTART_ALIGN in terms of the minimal alignment required by sparsemem vmemmap. This comes down to using 1 GB for all translation granules if CONFIG_SPARSEMEM_VMEMMAP is enabled. 
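As a worked example (illustrative only, and assuming the SECTION_SIZE_BITS value of 30 that arm64 used at the time), the resulting alignment for each granule is:

	4K pages:   ARM64_MEMSTART_SHIFT = PUD_SHIFT      = 30  ->  1 GB
	16K pages:  ARM64_MEMSTART_SHIFT = PMD_SHIFT + 5  = 30  ->  1 GB
	64K pages:  ARM64_MEMSTART_SHIFT = PMD_SHIFT      = 29  ->  512 MB,
	            raised to 1 << SECTION_SIZE_BITS (1 GB) when CONFIG_SPARSEMEM_VMEMMAP is set

so memstart_addr ends up 1 GB aligned in every configuration once vmemmap is enabled, matching the commit's claim.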
Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: I05b8bc6ab24f677f263b09d7c31fcce4f21269b1 (cherry picked from commit 06e9bf2fd9b372bc1c757995b6ca1cfab0720acb) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/kernel-pgtable.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 5c6375d8528b..7e51d1b57c0c 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -19,6 +19,7 @@ #ifndef __ASM_KERNEL_PGTABLE_H #define __ASM_KERNEL_PGTABLE_H +#include /* * The linear mapping and the start of memory are both 2M aligned (per @@ -86,10 +87,24 @@ * (64k granule), or a multiple that can be mapped using contiguous bits * in the page tables: 32 * PMD_SIZE (16k granule) */ -#ifdef CONFIG_ARM64_64K_PAGES -#define ARM64_MEMSTART_ALIGN SZ_512M +#if defined(CONFIG_ARM64_4K_PAGES) +#define ARM64_MEMSTART_SHIFT PUD_SHIFT +#elif defined(CONFIG_ARM64_16K_PAGES) +#define ARM64_MEMSTART_SHIFT (PMD_SHIFT + 5) #else -#define ARM64_MEMSTART_ALIGN SZ_1G +#define ARM64_MEMSTART_SHIFT PMD_SHIFT +#endif + +/* + * sparsemem vmemmap imposes an additional requirement on the alignment of + * memstart_addr, due to the fact that the base of the vmemmap region + * has a direct correspondence, and needs to appear sufficiently aligned + * in the virtual address space. + */ +#if defined(CONFIG_SPARSEMEM_VMEMMAP) && ARM64_MEMSTART_SHIFT < SECTION_SIZE_BITS +#define ARM64_MEMSTART_ALIGN (1UL << SECTION_SIZE_BITS) +#else +#define ARM64_MEMSTART_ALIGN (1UL << ARM64_MEMSTART_SHIFT) #endif #endif /* __ASM_KERNEL_PGTABLE_H */ From 29089dcc2c5272437ce9bbdae4ea249ed25e8281 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 13 Apr 2016 16:01:22 +0100 Subject: [PATCH 522/607] UPSTREAM: arm64: Implement ptep_set_access_flags() for hardware AF/DBM When hardware updates of the access and dirty states are enabled, the default ptep_set_access_flags() implementation based on calling set_pte_at() directly is potentially racy. This triggers the "racy dirty state clearing" warning in set_pte_at() because an existing writable PTE is overridden with a clean entry. There are two main scenarios for this situation: 1. The CPU getting an access fault does not support hardware updates of the access/dirty flags. However, a different agent in the system (e.g. SMMU) can do this, therefore overriding a writable entry with a clean one could potentially lose the automatically updated dirty status 2. A more complex situation is possible when all CPUs support hardware AF/DBM: a) Initial state: shareable + writable vma and pte_none(pte) b) Read fault taken by two threads of the same process on different CPUs c) CPU0 takes the mmap_sem and proceeds to handling the fault. It eventually reaches do_set_pte() which sets a writable + clean pte. CPU0 releases the mmap_sem d) CPU1 acquires the mmap_sem and proceeds to handle_pte_fault(). The pte entry it reads is present, writable and clean and it continues to pte_mkyoung() e) CPU1 calls ptep_set_access_flags() If between (d) and (e) the hardware (another CPU) updates the dirty state (clears PTE_RDONLY), CPU1 will override the PTR_RDONLY bit marking the entry clean again. This patch implements an arm64-specific ptep_set_access_flags() function to perform an atomic update of the PTE flags. 
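For context, a rough sketch (not part of the patch) of how the generic fault path of that era invokes this hook:

	/* mm/memory.c, simplified: mark the PTE young (and possibly dirty) after a minor fault */
	entry = pte_mkyoung(*ptep);
	if (dirty)
		entry = pte_mkdirty(entry);
	if (ptep_set_access_flags(vma, address, ptep, entry, dirty))
		update_mmu_cache(vma, address, ptep);

The arm64 override added below turns the final PTE update into an ldxr/stxr loop, so a concurrent hardware clearing of PTE_RDONLY (i.e. a hardware dirty-state update) cannot be lost between reading the old entry and writing the new one.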
Fixes: 2f4b829c625e ("arm64: Add support for hardware updates of the access and dirty pte bits") Signed-off-by: Catalin Marinas Reported-by: Ming Lei Tested-by: Julien Grall Cc: Will Deacon Cc: # 4.3+ [will: reworded comment] Signed-off-by: Will Deacon Change-Id: Id2a0b0d8eb6e7df6325ecb48b88b8401a5dd09e5 (cherry picked from commit 66dbd6e61a526ae7d11a208238ae2c17e5cacb6b) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/pgtable.h | 5 ++++ arch/arm64/mm/fault.c | 50 ++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d48ea8fc789d..0c82a1d9f31e 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -594,6 +594,11 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) } #ifdef CONFIG_ARM64_HW_AFDBM +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +extern int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty); + /* * Atomic pte/pmd modifications. */ diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 44e56de23f79..9bb45de46085 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -81,6 +81,56 @@ void show_pte(struct mm_struct *mm, unsigned long addr) printk("\n"); } +#ifdef CONFIG_ARM64_HW_AFDBM +/* + * This function sets the access flags (dirty, accessed), as well as write + * permission, and only to a more permissive setting. + * + * It needs to cope with hardware update of the accessed/dirty state by other + * agents in the system and can safely skip the __sync_icache_dcache() call as, + * like set_pte_at(), the PTE is never changed from no-exec to exec here. + * + * Returns whether or not the PTE actually changed. + */ +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + pteval_t old_pteval; + unsigned int tmp; + + if (pte_same(*ptep, entry)) + return 0; + + /* only preserve the access flags and write permission */ + pte_val(entry) &= PTE_AF | PTE_WRITE | PTE_DIRTY; + + /* + * PTE_RDONLY is cleared by default in the asm below, so set it in + * back if necessary (read-only or clean PTE). + */ + if (!pte_write(entry) || !dirty) + pte_val(entry) |= PTE_RDONLY; + + /* + * Setting the flags must be done atomically to avoid racing with the + * hardware update of the access/dirty state. + */ + asm volatile("// ptep_set_access_flags\n" + " prfm pstl1strm, %2\n" + "1: ldxr %0, %2\n" + " and %0, %0, %3 // clear PTE_RDONLY\n" + " orr %0, %0, %4 // set flags\n" + " stxr %w1, %0, %2\n" + " cbnz %w1, 1b\n" + : "=&r" (old_pteval), "=&r" (tmp), "+Q" (pte_val(*ptep)) + : "L" (~PTE_RDONLY), "r" (pte_val(entry))); + + flush_tlb_fix_spurious_fault(vma, address); + return 1; +} +#endif + /* * The kernel tried to access some page that wasn't present. 
*/ From 2515432e66e01f632e49069e7c04b969dc13c3d5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 18 Apr 2016 17:09:44 +0200 Subject: [PATCH 523/607] UPSTREAM: arm64: introduce mov_q macro to move a constant into a 64-bit register Implement a macro mov_q that can be used to move an immediate constant into a 64-bit register, using between 2 and 4 movz/movk instructions (depending on the operand) Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: I7e6b684e46cad5df79e6b8bc28d72b9e37daedd6 (cherry picked from commit 30b5ba5cf333cc650e474eaf2cc1ae91bc7cf89f) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 70f7b9e04598..9ea846ded55c 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -233,4 +233,24 @@ lr .req x30 // link register .long \sym\()_hi32 .endm + /* + * mov_q - move an immediate constant into a 64-bit register using + * between 2 and 4 movz/movk instructions (depending on the + * magnitude and sign of the operand) + */ + .macro mov_q, reg, val + .if (((\val) >> 31) == 0 || ((\val) >> 31) == 0x1ffffffff) + movz \reg, :abs_g1_s:\val + .else + .if (((\val) >> 47) == 0 || ((\val) >> 47) == 0x1ffff) + movz \reg, :abs_g2_s:\val + .else + movz \reg, :abs_g3:\val + movk \reg, :abs_g2_nc:\val + .endif + movk \reg, :abs_g1_nc:\val + .endif + movk \reg, :abs_g0_nc:\val + .endm + #endif /* __ASM_ASSEMBLER_H */ From 9c8a3a98f3c33eb7665dd5f101f28c46fedb4b75 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Wed, 27 Apr 2016 17:47:00 +0100 Subject: [PATCH 524/607] BACKPORT: arm64: Fold proc-macros.S into assembler.h To allow the assembler macros defined in arch/arm64/mm/proc-macros.S to be used outside the mm code move the contents of proc-macros.S to asm/assembler.h. Also, delete proc-macros.S, and fix up all references to proc-macros.S. Signed-off-by: Geoff Levand Acked-by: Pavel Machek [rebased, included dcache_by_line_op] Signed-off-by: James Morse Acked-by: Catalin Marinas Signed-off-by: Will Deacon Change-Id: I09e694442ffd25dcac864216d0369c9727ad0090 (cherry picked from commit 7b7293ae3dbd0a1965bf310b77fed5f9bb37bb93) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 82 ++++++++++++++++++++++++- arch/arm64/mm/cache.S | 2 - arch/arm64/mm/proc-macros.S | 98 ------------------------------ arch/arm64/mm/proc.S | 3 - 4 files changed, 81 insertions(+), 104 deletions(-) delete mode 100644 arch/arm64/mm/proc-macros.S diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 9ea846ded55c..33bbb7ba7fb5 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -1,5 +1,5 @@ /* - * Based on arch/arm/include/asm/assembler.h + * Based on arch/arm/include/asm/assembler.h, arch/arm/mm/proc-macros.S * * Copyright (C) 1996-2000 Russell King * Copyright (C) 2012 ARM Ltd. 
@@ -23,6 +23,8 @@ #ifndef __ASM_ASSEMBLER_H #define __ASM_ASSEMBLER_H +#include +#include #include #include @@ -211,6 +213,84 @@ lr .req x30 // link register add \reg, \reg, \tmp .endm +/* + * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm) + */ + .macro vma_vm_mm, rd, rn + ldr \rd, [\rn, #VMA_VM_MM] + .endm + +/* + * mmid - get context id from mm pointer (mm->context.id) + */ + .macro mmid, rd, rn + ldr \rd, [\rn, #MM_CONTEXT_ID] + .endm + +/* + * dcache_line_size - get the minimum D-cache line size from the CTR register. + */ + .macro dcache_line_size, reg, tmp + mrs \tmp, ctr_el0 // read CTR + ubfm \tmp, \tmp, #16, #19 // cache line size encoding + mov \reg, #4 // bytes per word + lsl \reg, \reg, \tmp // actual cache line size + .endm + +/* + * icache_line_size - get the minimum I-cache line size from the CTR register. + */ + .macro icache_line_size, reg, tmp + mrs \tmp, ctr_el0 // read CTR + and \tmp, \tmp, #0xf // cache line size encoding + mov \reg, #4 // bytes per word + lsl \reg, \reg, \tmp // actual cache line size + .endm + +/* + * tcr_set_idmap_t0sz - update TCR.T0SZ so that we can load the ID map + */ + .macro tcr_set_idmap_t0sz, valreg, tmpreg +#ifndef CONFIG_ARM64_VA_BITS_48 + ldr_l \tmpreg, idmap_t0sz + bfi \valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH +#endif + .endm + +/* + * Macro to perform a data cache maintenance for the interval + * [kaddr, kaddr + size) + * + * op: operation passed to dc instruction + * domain: domain used in dsb instruciton + * kaddr: starting virtual address of the region + * size: size of the region + * Corrupts: kaddr, size, tmp1, tmp2 + */ + .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2 + dcache_line_size \tmp1, \tmp2 + add \size, \kaddr, \size + sub \tmp2, \tmp1, #1 + bic \kaddr, \kaddr, \tmp2 +9998: dc \op, \kaddr + add \kaddr, \kaddr, \tmp1 + cmp \kaddr, \size + b.lo 9998b + dsb \domain + .endm + +/* + * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present + */ + .macro reset_pmuserenr_el0, tmpreg + mrs \tmpreg, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer + sbfx \tmpreg, \tmpreg, #8, #4 + cmp \tmpreg, #1 // Skip if no PMU present + b.lt 9000f + msr pmuserenr_el0, xzr // Disable PMU access from EL0 +9000: + .endm + /* * Annotate a function as position independent, i.e., safe to be called before * the kernel virtual mapping is activated. diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 6df07069a025..50ff9ba3a236 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -24,8 +24,6 @@ #include #include -#include "proc-macros.S" - /* * flush_icache_range(start,end) * diff --git a/arch/arm64/mm/proc-macros.S b/arch/arm64/mm/proc-macros.S deleted file mode 100644 index 984edcda1850..000000000000 --- a/arch/arm64/mm/proc-macros.S +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Based on arch/arm/mm/proc-macros.S - * - * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- */ - -#include -#include - -/* - * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm) - */ - .macro vma_vm_mm, rd, rn - ldr \rd, [\rn, #VMA_VM_MM] - .endm - -/* - * mmid - get context id from mm pointer (mm->context.id) - */ - .macro mmid, rd, rn - ldr \rd, [\rn, #MM_CONTEXT_ID] - .endm - -/* - * dcache_line_size - get the minimum D-cache line size from the CTR register. - */ - .macro dcache_line_size, reg, tmp - mrs \tmp, ctr_el0 // read CTR - ubfm \tmp, \tmp, #16, #19 // cache line size encoding - mov \reg, #4 // bytes per word - lsl \reg, \reg, \tmp // actual cache line size - .endm - -/* - * icache_line_size - get the minimum I-cache line size from the CTR register. - */ - .macro icache_line_size, reg, tmp - mrs \tmp, ctr_el0 // read CTR - and \tmp, \tmp, #0xf // cache line size encoding - mov \reg, #4 // bytes per word - lsl \reg, \reg, \tmp // actual cache line size - .endm - -/* - * tcr_set_idmap_t0sz - update TCR.T0SZ so that we can load the ID map - */ - .macro tcr_set_idmap_t0sz, valreg, tmpreg -#ifndef CONFIG_ARM64_VA_BITS_48 - ldr_l \tmpreg, idmap_t0sz - bfi \valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH -#endif - .endm - -/* - * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present - */ - .macro reset_pmuserenr_el0, tmpreg - mrs \tmpreg, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer - sbfx \tmpreg, \tmpreg, #8, #4 - cmp \tmpreg, #1 // Skip if no PMU present - b.lt 9000f - msr pmuserenr_el0, xzr // Disable PMU access from EL0 -9000: - .endm - -/* - * Macro to perform a data cache maintenance for the interval - * [kaddr, kaddr + size) - * - * op: operation passed to dc instruction - * domain: domain used in dsb instruciton - * kaddr: starting virtual address of the region - * size: size of the region - * Corrupts: kaddr, size, tmp1, tmp2 - */ - .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2 - dcache_line_size \tmp1, \tmp2 - add \size, \kaddr, \size - sub \tmp2, \tmp1, #1 - bic \kaddr, \kaddr, \tmp2 -9998: dc \op, \kaddr - add \kaddr, \kaddr, \tmp1 - cmp \kaddr, \size - b.lo 9998b - dsb \domain - .endm diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 543f5198005a..9f6deacf41d2 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -23,13 +23,10 @@ #include #include #include -#include #include #include #include -#include "proc-macros.S" - #ifdef CONFIG_ARM64_64K_PAGES #define TCR_TG_FLAGS TCR_TG0_64K | TCR_TG1_64K #elif defined(CONFIG_ARM64_16K_PAGES) From fdc4c4805c5afb51fdd0e03ee55281d9b10f25c1 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 13 Jun 2016 11:15:14 +0100 Subject: [PATCH 525/607] UPSTREAM: arm64: fix dump_instr when PAN and UAO are in use If the kernel is set to show unhandled signals, and a user task does not handle a SIGILL as a result of an instruction abort, we will attempt to log the offending instruction with dump_instr before killing the task. We use dump_instr to log the encoding of the offending userspace instruction. However, dump_instr is also used to dump instructions from kernel space, and internally always switches to KERNEL_DS before dumping the instruction with get_user. When both PAN and UAO are in use, reading a user instruction via get_user while in KERNEL_DS will result in a permission fault, which leads to an Oops. As we have regs corresponding to the context of the original instruction abort, we can inspect this and only flip to KERNEL_DS if the original abort was taken from the kernel, avoiding this issue. 
At the same time, remove the redundant (and incorrect) comments regarding the order dump_mem and dump_instr are called in. Cc: Catalin Marinas Cc: James Morse Cc: Robin Murphy Cc: #4.6+ Signed-off-by: Mark Rutland Reported-by: Vladimir Murzin Tested-by: Vladimir Murzin Fixes: 57f4959bad0a154a ("arm64: kernel: Add support for User Access Override") Signed-off-by: Will Deacon Change-Id: I54c00f3598d227a7e2767b357cb453075dcce7bd (cherry picked from commit c5cea06be060f38e5400d796e61cfc8c36e52924) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/traps.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index c5392081b49b..58651a9dfcf8 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -64,8 +64,7 @@ static void dump_mem(const char *lvl, const char *str, unsigned long bottom, /* * We need to switch to kernel mode so that we can use __get_user - * to safely read from kernel space. Note that we now dump the - * code first, just in case the backtrace kills us. + * to safely read from kernel space. */ fs = get_fs(); set_fs(KERNEL_DS); @@ -111,21 +110,12 @@ static void dump_backtrace_entry(unsigned long where) print_ip_sym(where); } -static void dump_instr(const char *lvl, struct pt_regs *regs) +static void __dump_instr(const char *lvl, struct pt_regs *regs) { unsigned long addr = instruction_pointer(regs); - mm_segment_t fs; char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str; int i; - /* - * We need to switch to kernel mode so that we can use __get_user - * to safely read from kernel space. Note that we now dump the - * code first, just in case the backtrace kills us. - */ - fs = get_fs(); - set_fs(KERNEL_DS); - for (i = -4; i < 1; i++) { unsigned int val, bad; @@ -139,8 +129,18 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) } } printk("%sCode: %s\n", lvl, str); +} - set_fs(fs); +static void dump_instr(const char *lvl, struct pt_regs *regs) +{ + if (!user_mode(regs)) { + mm_segment_t fs = get_fs(); + set_fs(KERNEL_DS); + __dump_instr(lvl, regs); + set_fs(fs); + } else { + __dump_instr(lvl, regs); + } } static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) From 916a99af4c1bf19207dd4589b1704ab144407c77 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 13 Jun 2016 17:57:02 +0100 Subject: [PATCH 526/607] UPSTREAM: arm64: mm: mark fault_info table const Unlike the debug_fault_info table, we never intentionally alter the fault_info table at runtime, and all derived pointers are treated as const currently. Make the table const so that it can be placed in .rodata and protected from unintentional writes, as we do for the syscall tables. 
Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Will Deacon Change-Id: I3fb0bb55427835c165cc377d8dc2a3fa9e6e950d (cherry picked from commit bbb1681ee3653bdcfc6a4ba31902738118311fd4) Signed-off-by: Sami Tolvanen --- arch/arm64/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 9bb45de46085..b5773944d1d3 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -438,7 +438,7 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs) return 1; } -static struct fault_info { +static const struct fault_info { int (*fn)(unsigned long addr, unsigned int esr, struct pt_regs *regs); int sig; int code; From 32054524cb03d884215a6051ad8c21077c241250 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 31 May 2016 12:33:01 +0100 Subject: [PATCH 527/607] UPSTREAM: arm64: add macro to extract ESR_ELx.EC Several places open-code extraction of the EC field from an ESR_ELx value, in subtly different ways. This is unfortunate duplication and variation, and the precise logic used to extract the field is a distraction. This patch adds a new macro, ESR_ELx_EC(), to extract the EC field from an ESR_ELx value in a consistent fashion. Existing open-coded extractions in core arm64 code are moved over to the new helper. KVM code is left as-is for the moment. Signed-off-by: Mark Rutland Tested-by: Huang Shijie Cc: Dave P Martin Cc: James Morse Cc: Marc Zyngier Cc: Will Deacon Signed-off-by: Catalin Marinas Change-Id: Ib634a4795277d243fce5dd30b139e2ec1465bee9 (cherry picked from commit 275f344bec51e9100bae81f3cc8c6940bbfb24c0) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/esr.h | 1 + arch/arm64/kernel/traps.c | 2 +- arch/arm64/mm/fault.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 77eeb2cc648f..f772e15c4766 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -74,6 +74,7 @@ #define ESR_ELx_EC_SHIFT (26) #define ESR_ELx_EC_MASK (UL(0x3F) << ESR_ELx_EC_SHIFT) +#define ESR_ELx_EC(esr) (((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT) #define ESR_ELx_IL (UL(1) << 25) #define ESR_ELx_ISS_MASK (ESR_ELx_IL - 1) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 58651a9dfcf8..29d7b68f63f0 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -465,7 +465,7 @@ static const char *esr_class_str[] = { const char *esr_get_class_string(u32 esr) { - return esr_class_str[esr >> ESR_ELx_EC_SHIFT]; + return esr_class_str[ESR_ELx_EC(esr)]; } /* diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index b5773944d1d3..59b8e1db6338 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -244,7 +244,7 @@ out: static inline int permission_fault(unsigned int esr) { - unsigned int ec = (esr & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT; + unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM); From b8f39d900e1581dd99788c1e38989b93d114dced Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 31 May 2016 12:33:03 +0100 Subject: [PATCH 528/607] UPSTREAM: arm64: kill ESR_LNX_EXEC Currently we treat ESR_EL1 bit 24 as software-defined for distinguishing instruction aborts from data aborts, but this bit is architecturally RES0 for instruction aborts, and could be allocated for an arbitrary purpose in future. 
Additionally, we hard-code the value in entry.S without the mnemonic, making the code difficult to understand. Instead, remove ESR_LNX_EXEC, and distinguish aborts based on the esr, which we already pass to the sole use of ESR_LNX_EXEC. A new helper, is_el0_instruction_abort() is added to make the logic clear. Any instruction aborts taken from EL1 will already have been handled by bad_mode, so we need not handle that case in the helper. For consistency, the existing permission_fault helper is renamed to is_permission_fault, and the return type is changed to bool. There should be no functional changes as the return value was a boolean expression, and the result is only used in another boolean expression. Signed-off-by: Mark Rutland Cc: Dave P Martin Cc: Huang Shijie Cc: James Morse Cc: Marc Zyngier Cc: Will Deacon Signed-off-by: Catalin Marinas Change-Id: Iaf66fa5f3b13cf985b11a3b0a40c4333fe9ef833 (cherry picked from commit 541ec870ef31433018d245614254bd9d810a9ac3) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/entry.S | 2 +- arch/arm64/mm/fault.c | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 12e8d2bcb3f9..eefffa81c6df 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -532,7 +532,7 @@ el0_ia: enable_dbg_and_irq ct_user_exit mov x0, x26 - orr x1, x25, #1 << 24 // use reserved ISS bit for instruction aborts + mov x1, x25 mov x2, sp bl do_mem_abort b ret_to_user diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 59b8e1db6338..56c6bfbd19b4 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -202,8 +202,6 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re #define VM_FAULT_BADMAP 0x010000 #define VM_FAULT_BADACCESS 0x020000 -#define ESR_LNX_EXEC (1 << 24) - static int __do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int mm_flags, unsigned long vm_flags, struct task_struct *tsk) @@ -242,7 +240,7 @@ out: return fault; } -static inline int permission_fault(unsigned int esr) +static inline bool is_permission_fault(unsigned int esr) { unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; @@ -250,6 +248,11 @@ static inline int permission_fault(unsigned int esr) return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM); } +static bool is_el0_instruction_abort(unsigned int esr) +{ + return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW; +} + static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, struct pt_regs *regs) { @@ -276,14 +279,14 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (user_mode(regs)) mm_flags |= FAULT_FLAG_USER; - if (esr & ESR_LNX_EXEC) { + if (is_el0_instruction_abort(esr)) { vm_flags = VM_EXEC; } else if ((esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM)) { vm_flags = VM_WRITE; mm_flags |= FAULT_FLAG_WRITE; } - if (permission_fault(esr) && (addr < USER_DS)) { + if (is_permission_fault(esr) && (addr < USER_DS)) { if (get_fs() == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); From 9887915a4ecf3697d056f01f8e7fe09c853f93bb Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Wed, 27 Apr 2016 17:47:10 +0100 Subject: [PATCH 529/607] UPSTREAM: arm64: Add new asm macro copy_page Kexec and hibernate need to copy pages of memory, but may not have all of the kernel mapped, and are unable to call copy_page(). Add a simplistic copy_page() macro, that can be inlined in these situations. 
lib/copy_page.S provides a bigger better version, but uses more registers. Signed-off-by: Geoff Levand [Changed asm label to 9998, added commit message] Signed-off-by: James Morse Acked-by: Catalin Marinas Signed-off-by: Will Deacon Change-Id: If23a454e211b1f57f8ba1a2a00b44dabf4b82932 (cherry picked from commit 5003dbde45961dd7ab3d8a09ab9ad8bcb604db40) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 33bbb7ba7fb5..290e13428f4a 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -24,6 +24,7 @@ #define __ASM_ASSEMBLER_H #include +#include #include #include #include @@ -291,6 +292,24 @@ lr .req x30 // link register 9000: .endm +/* + * copy_page - copy src to dest using temp registers t1-t8 + */ + .macro copy_page dest:req src:req t1:req t2:req t3:req t4:req t5:req t6:req t7:req t8:req +9998: ldp \t1, \t2, [\src] + ldp \t3, \t4, [\src, #16] + ldp \t5, \t6, [\src, #32] + ldp \t7, \t8, [\src, #48] + add \src, \src, #64 + stnp \t1, \t2, [\dest] + stnp \t3, \t4, [\dest, #16] + stnp \t5, \t6, [\dest, #32] + stnp \t7, \t8, [\dest, #48] + add \dest, \dest, #64 + tst \src, #(PAGE_SIZE - 1) + b.ne 9998b + .endm + /* * Annotate a function as position independent, i.e., safe to be called before * the kernel virtual mapping is activated. From 1d8922da229f3b2b4b6a9e75ad7dd75f862462bb Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Tue, 28 Jun 2016 18:07:27 +0100 Subject: [PATCH 530/607] UPSTREAM: Revert "arm64: alternatives: add enable parameter to conditional asm macros" Commit 77ee306c0aea9 ("arm64: alternatives: add enable parameter to conditional asm macros") extended the alternative assembly macros. Unfortunately this does not really work as one would expect, as the enable parameter in fact correctly protects the alternative section magic, but not the actual code sequences. This results in having both the original instruction(s) _and_ the alternative ones, if enable if false. Since there is no user of this macros anyway, just revert it. This reverts commit 77ee306c0aea9a219daec256ad25982944affef8. Signed-off-by: Andre Przywara Signed-off-by: Catalin Marinas Change-Id: I608104891335dfa2dacdb364754ae2658088ddf2 (cherry picked from commit b82bfa4793cd0f8fde49b85e0ad66906682e7447) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/alternative.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index beccbdefa106..7288071195e1 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -95,13 +95,11 @@ void apply_alternatives(void *start, size_t length); * The code that follows this macro will be assembled and linked as * normal. There are no restrictions on this code. */ -.macro alternative_if_not cap, enable = 1 - .if \enable +.macro alternative_if_not cap .pushsection .altinstructions, "a" altinstruction_entry 661f, 663f, \cap, 662f-661f, 664f-663f .popsection 661: - .endif .endm /* @@ -118,22 +116,18 @@ void apply_alternatives(void *start, size_t length); * alternative sequence it is defined in (branches into an * alternative sequence are not fixed up). */ -.macro alternative_else, enable = 1 - .if \enable +.macro alternative_else 662: .pushsection .altinstr_replacement, "ax" 663: - .endif .endm /* * Complete an alternative code sequence. 
*/ -.macro alternative_endif, enable = 1 - .if \enable +.macro alternative_endif 664: .popsection .org . - (664b-663b) + (662b-661b) .org . - (662b-661b) + (664b-663b) - .endif .endm #define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...) \ From 405893f77e25abf7584f8ea65567c032849e41dd Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Tue, 28 Jun 2016 18:07:28 +0100 Subject: [PATCH 531/607] UPSTREAM: arm64: fix "dc cvau" cache operation on errata-affected core The ARM errata 819472, 826319, 827319 and 824069 for affected Cortex-A53 cores demand to promote "dc cvau" instructions to "dc civac" as well. Attribute the usage of the instruction in __flush_cache_user_range to also be covered by our alternative patching efforts. For that we introduce an assembly macro which both deals with alternatives while still tagging the instructions as USER. Signed-off-by: Andre Przywara Signed-off-by: Catalin Marinas Change-Id: If5e7933ba32331b2aa28fc5d9e019649452f0f6c (cherry picked from commit 290622efc76ece22ef76a30bf117755891ab27f6) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/alternative.h | 4 ++++ arch/arm64/mm/cache.S | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index 7288071195e1..8746ff6abd77 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -133,6 +133,10 @@ void apply_alternatives(void *start, size_t length); #define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...) \ alternative_insn insn1, insn2, cap, IS_ENABLED(cfg) +.macro user_alt, label, oldinstr, newinstr, cond +9999: alternative_insn "\oldinstr", "\newinstr", \cond + _ASM_EXTABLE 9999b, \label +.endm /* * Generate the assembly for UAO alternatives with exception table entries. diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 50ff9ba3a236..07d7352d7c38 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -52,7 +52,7 @@ ENTRY(__flush_cache_user_range) sub x3, x2, #1 bic x4, x0, x3 1: -USER(9f, dc cvau, x4 ) // clean D line to PoU +user_alt 9f, "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE add x4, x4, x2 cmp x4, x1 b.lo 1b From 76440ecb4dfc5f76582199a32b6c33ee9f0aa221 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Tue, 28 Jun 2016 18:07:29 +0100 Subject: [PATCH 532/607] UPSTREAM: arm64: include alternative handling in dcache_by_line_op The newly introduced dcache_by_line_op macro is used at least in one occassion at the moment to issue a "dc cvau" instruction, which is affected by ARM errata 819472, 826319, 827319 and 824069. Change the macro to allow for alternative patching in there to protect affected Cortex-A53 cores. 
Signed-off-by: Andre Przywara [catalin.marinas@arm.com: indentation fixups] Signed-off-by: Catalin Marinas Change-Id: I450594dc311b09b6b832b707a9abb357608cc6e4 (cherry picked from commit 823066d9edcdfe4cedb06216c2b1f91efaf68a87) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 290e13428f4a..9e8ac1e73457 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -24,6 +24,7 @@ #define __ASM_ASSEMBLER_H #include +#include #include #include #include @@ -273,7 +274,16 @@ lr .req x30 // link register add \size, \kaddr, \size sub \tmp2, \tmp1, #1 bic \kaddr, \kaddr, \tmp2 -9998: dc \op, \kaddr +9998: + .if (\op == cvau || \op == cvac) +alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE + dc \op, \kaddr +alternative_else + dc civac, \kaddr +alternative_endif + .else + dc \op, \kaddr + .endif add \kaddr, \kaddr, \tmp1 cmp \kaddr, \size b.lo 9998b From 4caec81662608819e904586b172abc85945b4d65 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 20 Jun 2016 18:28:01 +0100 Subject: [PATCH 533/607] BACKPORT: arm64: kernel: Save and restore UAO and addr_limit on exception entry If we take an exception while at EL1, the exception handler inherits the original context's addr_limit and PSTATE.UAO values. To be consistent always reset addr_limit and PSTATE.UAO on (re-)entry to EL1. This prevents accidental re-use of the original context's addr_limit. Based on a similar patch for arm from Russell King. Cc: # 4.6- Acked-by: Will Deacon Reviewed-by: Mark Rutland Signed-off-by: James Morse Signed-off-by: Will Deacon Change-Id: Iab453201c6e08bc6e22500b7c5570dd0fe2d1b74 (cherry picked from commit e19a6ee2460bdd0d0055a6029383422773f9999a) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/ptrace.h | 2 ++ arch/arm64/kernel/asm-offsets.c | 1 + arch/arm64/kernel/entry.S | 19 +++++++++++++++++-- arch/arm64/mm/fault.c | 3 ++- 4 files changed, 22 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index e9e5467e0bf4..200d5c32b38f 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -116,6 +116,8 @@ struct pt_regs { }; u64 orig_x0; u64 syscallno; + u64 orig_addr_limit; + u64 unused; // maintain 16 byte alignment }; #define arch_has_single_step() (1) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 25de8b244961..087cf9a65359 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -58,6 +58,7 @@ int main(void) DEFINE(S_PC, offsetof(struct pt_regs, pc)); DEFINE(S_ORIG_X0, offsetof(struct pt_regs, orig_x0)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); + DEFINE(S_ORIG_ADDR_LIMIT, offsetof(struct pt_regs, orig_addr_limit)); DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); BLANK(); DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index eefffa81c6df..9fabed5dd2bf 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -97,7 +98,14 @@ mov x29, xzr // fp pointed to user-space .else add x21, sp, #S_FRAME_SIZE - .endif + get_thread_info tsk + /* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */ + ldr x20, [tsk, #TI_ADDR_LIMIT] + str x20, [sp, #S_ORIG_ADDR_LIMIT] + 
mov x20, #TASK_SIZE_64 + str x20, [tsk, #TI_ADDR_LIMIT] + ALTERNATIVE(nop, SET_PSTATE_UAO(0), ARM64_HAS_UAO, CONFIG_ARM64_UAO) + .endif /* \el == 0 */ mrs x22, elr_el1 mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] @@ -128,6 +136,14 @@ .endm .macro kernel_exit, el + .if \el != 0 + /* Restore the task's original addr_limit. */ + ldr x20, [sp, #S_ORIG_ADDR_LIMIT] + str x20, [tsk, #TI_ADDR_LIMIT] + + /* No need to restore UAO, it will be restored from SPSR_EL1 */ + .endif + ldp x21, x22, [sp, #S_PC] // load ELR, SPSR .if \el == 0 ct_user_enter @@ -406,7 +422,6 @@ el1_irq: bl trace_hardirqs_off #endif - get_thread_info tsk irq_handler #ifdef CONFIG_PREEMPT diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 56c6bfbd19b4..03d44388ae7b 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -287,7 +287,8 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, } if (is_permission_fault(esr) && (addr < USER_DS)) { - if (get_fs() == KERNEL_DS) + /* regs->orig_addr_limit may be 0 if we entered from EL0 */ + if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); if (!search_exception_tables(regs->pc)) From d907eb252977469ed9bded38bd76c2212065c2cf Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 9 Aug 2016 18:25:26 -0700 Subject: [PATCH 534/607] UPSTREAM: arm64: Handle el1 synchronous instruction aborts cleanly Executing from a non-executable area gives an ugly message: lkdtm: Performing direct entry EXEC_RODATA lkdtm: attempting ok execution at ffff0000084c0e08 lkdtm: attempting bad execution at ffff000008880700 Bad mode in Synchronous Abort handler detected on CPU2, code 0x8400000e -- IABT (current EL) CPU: 2 PID: 998 Comm: sh Not tainted 4.7.0-rc2+ #13 Hardware name: linux,dummy-virt (DT) task: ffff800077e35780 ti: ffff800077970000 task.ti: ffff800077970000 PC is at lkdtm_rodata_do_nothing+0x0/0x8 LR is at execute_location+0x74/0x88 The 'IABT (current EL)' indicates the error but it's a bit cryptic without knowledge of the ARM ARM. There is also no indication of the specific address which triggered the fault. The increase in kernel page permissions makes hitting this case more likely as well. 
Handling the case in the vectors gives a much more familiar looking error message: lkdtm: Performing direct entry EXEC_RODATA lkdtm: attempting ok execution at ffff0000084c0840 lkdtm: attempting bad execution at ffff000008880680 Unable to handle kernel paging request at virtual address ffff000008880680 pgd = ffff8000089b2000 [ffff000008880680] *pgd=00000000489b4003, *pud=0000000048904003, *pmd=0000000000000000 Internal error: Oops: 8400000e [#1] PREEMPT SMP Modules linked in: CPU: 1 PID: 997 Comm: sh Not tainted 4.7.0-rc1+ #24 Hardware name: linux,dummy-virt (DT) task: ffff800077f9f080 ti: ffff800008a1c000 task.ti: ffff800008a1c000 PC is at lkdtm_rodata_do_nothing+0x0/0x8 LR is at execute_location+0x74/0x88 Acked-by: Mark Rutland Signed-off-by: Laura Abbott Signed-off-by: Catalin Marinas Change-Id: Ifba74589ba2cf05b28335d4fd3e3140ef73668db (cherry picked from commit 9adeb8e72dbfe976709df01e259ed556ee60e779) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/entry.S | 7 +++++++ arch/arm64/mm/fault.c | 14 ++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 9fabed5dd2bf..533e1c9fd5a6 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -352,6 +352,8 @@ el1_sync: lsr x24, x1, #ESR_ELx_EC_SHIFT // exception class cmp x24, #ESR_ELx_EC_DABT_CUR // data abort in EL1 b.eq el1_da + cmp x24, #ESR_ELx_EC_IABT_CUR // instruction abort in EL1 + b.eq el1_ia cmp x24, #ESR_ELx_EC_SYS64 // configurable trap b.eq el1_undef cmp x24, #ESR_ELx_EC_SP_ALIGN // stack alignment exception @@ -363,6 +365,11 @@ el1_sync: cmp x24, #ESR_ELx_EC_BREAKPT_CUR // debug exception in EL1 b.ge el1_dbg b el1_inv + +el1_ia: + /* + * Fall through to the Data abort case + */ el1_da: /* * Data abort handling diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 03d44388ae7b..e328430dbcec 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -131,6 +131,11 @@ int ptep_set_access_flags(struct vm_area_struct *vma, } #endif +static bool is_el1_instruction_abort(unsigned int esr) +{ + return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR; +} + /* * The kernel tried to access some page that wasn't present. */ @@ -139,8 +144,9 @@ static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr, { /* * Are we prepared to handle this kernel fault? + * We are almost certainly not prepared to handle instruction faults. 
*/ - if (fixup_exception(regs)) + if (!is_el1_instruction_abort(esr) && fixup_exception(regs)) return; /* @@ -245,7 +251,8 @@ static inline bool is_permission_fault(unsigned int esr) unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; - return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM); + return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM) || + (ec == ESR_ELx_EC_IABT_CUR && fsc_type == ESR_ELx_FSC_PERM); } static bool is_el0_instruction_abort(unsigned int esr) @@ -291,6 +298,9 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); + if (is_el1_instruction_abort(esr)) + die("Attempting to execute userspace memory", regs, esr); + if (!search_exception_tables(regs->pc)) die("Accessing user space memory outside uaccess.h routines", regs, esr); } From 23368b642deb01ac6ce668ec1dedfcc0cab25c71 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 14:58:21 +0100 Subject: [PATCH 535/607] FROMLIST: arm64: Factor out PAN enabling/disabling into separate uaccess_* macros This patch moves the directly coded alternatives for turning PAN on/off into separate uaccess_{enable,disable} macros or functions. The asm macros take a few arguments which will be used in subsequent patches. Note that any (unlikely) access that the compiler might generate between uaccess_enable() and uaccess_disable(), other than those explicitly specified by the user access code, will not be protected by PAN. Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: Ic3fddd706400c8798f57456c56361d84d234f6ef (cherry picked from commit a4820644c627b82cbc865f2425bb788c94743b16) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/futex.h | 14 ++--- arch/arm64/include/asm/uaccess.h | 79 +++++++++++++++++++++++++--- arch/arm64/kernel/armv8_deprecated.c | 10 ++-- arch/arm64/lib/clear_user.S | 8 ++- arch/arm64/lib/copy_from_user.S | 8 ++- arch/arm64/lib/copy_in_user.S | 8 ++- arch/arm64/lib/copy_to_user.S | 8 ++- 7 files changed, 95 insertions(+), 40 deletions(-) diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index f2585cdd32c2..71dfa3b42313 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -27,9 +27,9 @@ #include #define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \ +do { \ + uaccess_enable(); \ asm volatile( \ - ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ " prfm pstl1strm, %2\n" \ "1: ldxr %w1, %2\n" \ insn "\n" \ @@ -44,11 +44,11 @@ " .popsection\n" \ _ASM_EXTABLE(1b, 4b) \ _ASM_EXTABLE(2b, 4b) \ - ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \ : "r" (oparg), "Ir" (-EFAULT) \ - : "memory") + : "memory"); \ + uaccess_disable(); \ +} while (0) static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) @@ -118,8 +118,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; + uaccess_enable(); asm volatile("// futex_atomic_cmpxchg_inatomic\n" -ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " prfm pstl1strm, %2\n" "1: ldxr %w1, %2\n" " sub %w3, %w1, %w4\n" @@ -134,10 +134,10 @@ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " .popsection\n" _ASM_EXTABLE(1b, 
4b) _ASM_EXTABLE(2b, 4b) -ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) : "r" (oldval), "r" (newval), "Ir" (-EFAULT) : "memory"); + uaccess_disable(); *uval = val; return ret; diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index c3d445b42351..c8ef22a9a83b 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -18,6 +18,8 @@ #ifndef __ASM_UACCESS_H #define __ASM_UACCESS_H +#ifndef __ASSEMBLY__ + /* * User space memory access functions */ @@ -123,6 +125,44 @@ static inline void set_fs(mm_segment_t fs) " .long (" #from " - .), (" #to " - .)\n" \ " .popsection\n" +/* + * User access enabling/disabling. + */ +#define __uaccess_disable(alt) \ +do { \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ + CONFIG_ARM64_PAN)); \ +} while (0) + +#define __uaccess_enable(alt) \ +do { \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ + CONFIG_ARM64_PAN)); \ +} while (0) + +static inline void uaccess_disable(void) +{ + __uaccess_disable(ARM64_HAS_PAN); +} + +static inline void uaccess_enable(void) +{ + __uaccess_enable(ARM64_HAS_PAN); +} + +/* + * These functions are no-ops when UAO is present. + */ +static inline void uaccess_disable_not_uao(void) +{ + __uaccess_disable(ARM64_ALT_PAN_NOT_UAO); +} + +static inline void uaccess_enable_not_uao(void) +{ + __uaccess_enable(ARM64_ALT_PAN_NOT_UAO); +} + /* * The "__xxx" versions of the user access functions do not verify the address * space - it must have been done previously with a separate "access_ok()" @@ -150,8 +190,7 @@ static inline void set_fs(mm_segment_t fs) do { \ unsigned long __gu_val; \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_enable_not_uao(); \ switch (sizeof(*(ptr))) { \ case 1: \ __get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \ @@ -172,9 +211,8 @@ do { \ default: \ BUILD_BUG(); \ } \ + uaccess_disable_not_uao(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ } while (0) #define __get_user(x, ptr) \ @@ -219,8 +257,7 @@ do { \ do { \ __typeof__(*(ptr)) __pu_val = (x); \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_enable_not_uao(); \ switch (sizeof(*(ptr))) { \ case 1: \ __put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \ @@ -241,8 +278,7 @@ do { \ default: \ BUILD_BUG(); \ } \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_disable_not_uao(); \ } while (0) #define __put_user(x, ptr) \ @@ -327,4 +363,31 @@ extern long strncpy_from_user(char *dest, const char __user *src, long count); extern __must_check long strlen_user(const char __user *str); extern __must_check long strnlen_user(const char __user *str, long n); +#else /* __ASSEMBLY__ */ + +#include +#include + +/* + * User access enabling/disabling macros. These are no-ops when UAO is + * present. 
+ */ + .macro uaccess_disable_not_uao, tmp1 +alternative_if_not ARM64_ALT_PAN_NOT_UAO + nop +alternative_else + SET_PSTATE_PAN(1) +alternative_endif + .endm + + .macro uaccess_enable_not_uao, tmp1, tmp2 +alternative_if_not ARM64_ALT_PAN_NOT_UAO + nop +alternative_else + SET_PSTATE_PAN(0) +alternative_endif + .endm + +#endif /* __ASSEMBLY__ */ + #endif /* __ASM_UACCESS_H */ diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index c37202c0c838..562f7ddb158b 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -281,9 +281,9 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) * Error-checking SWP macros implemented using ldxr{b}/stxr{b} */ #define __user_swpX_asm(data, addr, res, temp, B) \ +do { \ + uaccess_enable(); \ __asm__ __volatile__( \ - ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ "0: ldxr"B" %w2, [%3]\n" \ "1: stxr"B" %w0, %w1, [%3]\n" \ " cbz %w0, 2f\n" \ @@ -299,11 +299,11 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) " .popsection" \ _ASM_EXTABLE(0b, 4b) \ _ASM_EXTABLE(1b, 4b) \ - ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ : "=&r" (res), "+r" (data), "=&r" (temp) \ : "r" (addr), "i" (-EAGAIN), "i" (-EFAULT) \ - : "memory") + : "memory"); \ + uaccess_disable(); \ +} while (0) #define __user_swp_asm(data, addr, res, temp) \ __user_swpX_asm(data, addr, res, temp, "") diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index 5d1cad3ce6d6..08b5f18ba604 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -17,10 +17,10 @@ */ #include -#include #include #include #include +#include .text @@ -33,8 +33,7 @@ * Alignment fixed up by hardware. 
*/ ENTRY(__clear_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x2, x3 mov x2, x1 // save the size for fixup return subs x1, x1, #8 b.mi 2f @@ -54,8 +53,7 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 b.mi 5f uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x2 ret ENDPROC(__clear_user) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 0b90497d4424..6505ec81f1da 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -16,11 +16,11 @@ #include -#include #include #include #include #include +#include /* * Copy from user space to a kernel buffer (alignment handled by the hardware) @@ -67,12 +67,10 @@ end .req x5 ENTRY(__arch_copy_from_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 // Nothing to copy ret ENDPROC(__arch_copy_from_user) diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index f7292dd08c84..9b04ff3ab610 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -18,11 +18,11 @@ #include -#include #include #include #include #include +#include /* * Copy from user space to user space (alignment handled by the hardware) @@ -68,12 +68,10 @@ end .req x5 ENTRY(__copy_in_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 ret ENDPROC(__copy_in_user) diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 7a7efe255034..8077e4f34d56 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -16,11 +16,11 @@ #include -#include #include #include #include #include +#include /* * Copy to user space from a kernel buffer (alignment handled by the hardware) @@ -66,12 +66,10 @@ end .req x5 ENTRY(__arch_copy_to_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 ret ENDPROC(__arch_copy_to_user) From 3b66929169de053042d47e482dd5748794756153 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 15:48:55 +0100 Subject: [PATCH 536/607] FROMLIST: arm64: Factor out TTBR0_EL1 post-update workaround into a specific asm macro This patch takes the errata workaround code out of cpu_do_switch_mm into a dedicated post_ttbr0_update_workaround macro which will be reused in a subsequent patch. 
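For reference, a conceptual C equivalent of the new macro would look roughly as follows (illustrative sketch only: the real code remains an assembly macro driven by the alternatives framework, and cpus_have_cap() is used here merely as a stand-in for that patching):

	static inline void post_ttbr0_update_workaround_sketch(void)
	{
		/* Cavium erratum 27456 (broadcast TLBI may corrupt the I-cache):
		 * invalidate the I-cache after the TTBR0_EL1 update. */
		if (cpus_have_cap(ARM64_WORKAROUND_CAVIUM_27456))
			asm volatile("ic iallu\n\tdsb nsh\n\tisb" ::: "memory");
	}
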
Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: I69f94e4c41046bd52ca9340b72d97bfcf955b586 (cherry picked from commit 4398e6a1644373a4c2f535f4153c8378d0914630) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 17 +++++++++++++++++ arch/arm64/mm/proc.S | 11 +---------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 9e8ac1e73457..9d3e77a5cf07 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -362,4 +362,21 @@ alternative_endif movk \reg, :abs_g0_nc:\val .endm +/* + * Errata workaround post TTBR0_EL1 update. + */ + .macro post_ttbr0_update_workaround +#ifdef CONFIG_CAVIUM_ERRATUM_27456 +alternative_if_not ARM64_WORKAROUND_CAVIUM_27456 + nop + nop + nop +alternative_else + ic iallu + dsb nsh + isb +alternative_endif +#endif + .endm + #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 9f6deacf41d2..765713702625 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -136,17 +136,8 @@ ENTRY(cpu_do_switch_mm) bfi x0, x1, #48, #16 // set the ASID msr ttbr0_el1, x0 // set TTBR0 isb -alternative_if_not ARM64_WORKAROUND_CAVIUM_27456 + post_ttbr0_update_workaround ret - nop - nop - nop -alternative_else - ic iallu - dsb nsh - isb - ret -alternative_endif ENDPROC(cpu_do_switch_mm) .pushsection ".idmap.text", "ax" From 1911d36b27ba58ee18592df25b7ee636d4d4c41d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 16:53:00 +0100 Subject: [PATCH 537/607] FROMLIST: arm64: Introduce uaccess_{disable,enable} functionality based on TTBR0_EL1 This patch adds the uaccess macros/functions to disable access to user space by setting TTBR0_EL1 to a reserved zeroed page. Since the value written to TTBR0_EL1 must be a physical address, for simplicity this patch introduces a reserved_ttbr0 page at a constant offset from swapper_pg_dir. The uaccess_disable code uses the ttbr1_el1 value adjusted by the reserved_ttbr0 offset. Enabling access to user is done by restoring TTBR0_EL1 with the value from the struct thread_info ttbr0 variable. Interrupts must be disabled during the uaccess_ttbr0_enable code to ensure the atomicity of the thread_info.ttbr0 read and TTBR0_EL1 write. This patch also moves the get_thread_info asm macro from entry.S to assembler.h for reuse in the uaccess_ttbr0_* macros. 
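In C terms the mechanism boils down to the following (simplified sketch of what this patch implements; the real helpers also cover the !CONFIG_ARM64_SW_TTBR0_PAN case and have assembly-side equivalents for use in entry code):

	/* Point TTBR0_EL1 at the zeroed reserved_ttbr0 page: user accesses now fault. */
	static inline void uaccess_ttbr0_disable_sketch(void)
	{
		/* reserved_ttbr0 sits immediately after swapper_pg_dir */
		write_sysreg(read_sysreg(ttbr1_el1) + SWAPPER_DIR_SIZE, ttbr0_el1);
		isb();
	}

	/* Restore the per-thread saved TTBR0_EL1. IRQs are disabled so the saved
	 * value cannot change under us via an ASID roll-over on context switch. */
	static inline void uaccess_ttbr0_enable_sketch(void)
	{
		unsigned long flags;

		local_irq_save(flags);
		write_sysreg(current_thread_info()->ttbr0, ttbr0_el1);
		isb();
		local_irq_restore(flags);
	}
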
Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: Idf09a870b8612dce23215bce90d88781f0c0c3aa (cherry picked from commit 940d37234182d2675ab8ab46084840212d735018) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 16 +++++ arch/arm64/include/asm/cpufeature.h | 6 ++ arch/arm64/include/asm/kernel-pgtable.h | 7 ++ arch/arm64/include/asm/thread_info.h | 3 + arch/arm64/include/asm/uaccess.h | 96 +++++++++++++++++++++++-- arch/arm64/kernel/asm-offsets.c | 3 + arch/arm64/kernel/cpufeature.c | 1 + arch/arm64/kernel/entry.S | 4 -- arch/arm64/kernel/head.S | 6 +- arch/arm64/kernel/vmlinux.lds.S | 5 ++ 10 files changed, 134 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 9d3e77a5cf07..aeb4554b3af3 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -53,6 +53,15 @@ msr daifclr, #2 .endm + .macro save_and_disable_irq, flags + mrs \flags, daif + msr daifset, #2 + .endm + + .macro restore_irq, flags + msr daif, \flags + .endm + /* * Enable and disable debug exceptions. */ @@ -362,6 +371,13 @@ alternative_endif movk \reg, :abs_g0_nc:\val .endm +/* + * Return the current thread_info. + */ + .macro get_thread_info, rd + mrs \rd, sp_el0 + .endm + /* * Errata workaround post TTBR0_EL1 update. */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 727e594ac5c2..f125c03ab2e1 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -188,6 +188,12 @@ static inline bool system_supports_mixed_endian_el0(void) return id_aa64mmfr0_mixed_endian_el0(read_system_reg(SYS_ID_AA64MMFR0_EL1)); } +static inline bool system_uses_ttbr0_pan(void) +{ + return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) && + !cpus_have_cap(ARM64_HAS_PAN); +} + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 7e51d1b57c0c..7803343e5881 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -19,6 +19,7 @@ #ifndef __ASM_KERNEL_PGTABLE_H #define __ASM_KERNEL_PGTABLE_H +#include #include /* @@ -54,6 +55,12 @@ #define SWAPPER_DIR_SIZE (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE) #define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE) +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +#define RESERVED_TTBR0_SIZE (PAGE_SIZE) +#else +#define RESERVED_TTBR0_SIZE (0) +#endif + /* Initial memory map size */ #if ARM64_SWAPPER_USES_SECTION_MAPS #define SWAPPER_BLOCK_SHIFT SECTION_SHIFT diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index abd64bd1f6d9..b3325a9cb90f 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -47,6 +47,9 @@ typedef unsigned long mm_segment_t; struct thread_info { unsigned long flags; /* low level flags */ mm_segment_t addr_limit; /* address limit */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + u64 ttbr0; /* saved TTBR0_EL1 */ +#endif struct task_struct *task; /* main task structure */ int preempt_count; /* 0 => preemptable, <0 => bug */ int cpu; /* cpu */ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index c8ef22a9a83b..c37c064d7cdd 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -128,16 +129,57 @@ static inline void set_fs(mm_segment_t fs) /* * User 
access enabling/disabling. */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +static inline void uaccess_ttbr0_disable(void) +{ + unsigned long ttbr; + + /* reserved_ttbr0 placed at the end of swapper_pg_dir */ + ttbr = read_sysreg(ttbr1_el1) + SWAPPER_DIR_SIZE; + write_sysreg(ttbr, ttbr0_el1); + isb(); +} + +static inline void uaccess_ttbr0_enable(void) +{ + unsigned long flags; + + /* + * Disable interrupts to avoid preemption between reading the 'ttbr0' + * variable and the MSR. A context switch could trigger an ASID + * roll-over and an update of 'ttbr0'. + */ + local_irq_save(flags); + write_sysreg(current_thread_info()->ttbr0, ttbr0_el1); + isb(); + local_irq_restore(flags); +} +#else +static inline void uaccess_ttbr0_disable(void) +{ +} + +static inline void uaccess_ttbr0_enable(void) +{ +} +#endif + #define __uaccess_disable(alt) \ do { \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ - CONFIG_ARM64_PAN)); \ + if (system_uses_ttbr0_pan()) \ + uaccess_ttbr0_disable(); \ + else \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ + CONFIG_ARM64_PAN)); \ } while (0) #define __uaccess_enable(alt) \ do { \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ - CONFIG_ARM64_PAN)); \ + if (system_uses_ttbr0_pan()) \ + uaccess_ttbr0_enable(); \ + else \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ + CONFIG_ARM64_PAN)); \ } while (0) static inline void uaccess_disable(void) @@ -367,12 +409,39 @@ extern __must_check long strnlen_user(const char __user *str, long n); #include #include +#include /* - * User access enabling/disabling macros. These are no-ops when UAO is - * present. + * User access enabling/disabling macros. + */ + .macro uaccess_ttbr0_disable, tmp1 + mrs \tmp1, ttbr1_el1 // swapper_pg_dir + add \tmp1, \tmp1, #SWAPPER_DIR_SIZE // reserved_ttbr0 at the end of swapper_pg_dir + msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 + isb + .endm + + .macro uaccess_ttbr0_enable, tmp1 + get_thread_info \tmp1 + ldr \tmp1, [\tmp1, #TI_TTBR0] // load saved TTBR0_EL1 + msr ttbr0_el1, \tmp1 // set the non-PAN TTBR0_EL1 + isb + .endm + +/* + * These macros are no-ops when UAO is present. 
*/ .macro uaccess_disable_not_uao, tmp1 +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +alternative_if_not ARM64_HAS_PAN + uaccess_ttbr0_disable \tmp1 +alternative_else + nop + nop + nop + nop +alternative_endif +#endif alternative_if_not ARM64_ALT_PAN_NOT_UAO nop alternative_else @@ -381,6 +450,21 @@ alternative_endif .endm .macro uaccess_enable_not_uao, tmp1, tmp2 +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +alternative_if_not ARM64_HAS_PAN + save_and_disable_irq \tmp2 // avoid preemption + uaccess_ttbr0_enable \tmp1 + restore_irq \tmp2 +alternative_else + nop + nop + nop + nop + nop + nop + nop +alternative_endif +#endif alternative_if_not ARM64_ALT_PAN_NOT_UAO nop alternative_else diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 087cf9a65359..d0ec987dba5b 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -36,6 +36,9 @@ int main(void) DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + DEFINE(TI_TTBR0, offsetof(struct thread_info, ttbr0)); +#endif DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); BLANK(); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 7566cad9fa1d..40ee3f2933e7 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -43,6 +43,7 @@ unsigned int compat_elf_hwcap2 __read_mostly; #endif DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); +EXPORT_SYMBOL(cpu_hwcaps); #define __ARM64_FTR_BITS(SIGNED, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \ { \ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 533e1c9fd5a6..8eb8eb085036 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -190,10 +190,6 @@ alternative_endif eret // return to kernel .endm - .macro get_thread_info, rd - mrs \rd, sp_el0 - .endm - .macro irq_stack_entry mov x19, sp // preserve the original sp diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 54fd1e2590a2..9bfa58fea8ce 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -321,14 +321,14 @@ __create_page_tables: * dirty cache lines being evicted. */ mov x0, x25 - add x1, x26, #SWAPPER_DIR_SIZE + add x1, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE bl __inval_cache_range /* * Clear the idmap and swapper page tables. */ mov x0, x25 - add x6, x26, #SWAPPER_DIR_SIZE + add x6, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE 1: stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 @@ -406,7 +406,7 @@ __create_page_tables: * tables again to remove any speculatively loaded cache lines. */ mov x0, x25 - add x1, x26, #SWAPPER_DIR_SIZE + add x1, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE dmb sy bl __inval_cache_range diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 14813a6ca13b..87fd0556a1bd 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -185,6 +185,11 @@ SECTIONS swapper_pg_dir = .; . += SWAPPER_DIR_SIZE; +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + reserved_ttbr0 = .; + . 
+= RESERVED_TTBR0_SIZE; +#endif + _end = .; STABS_DEBUG From 5775ca34829caf0664c8ccc02fd0e93cb6022e0f Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 2 Sep 2016 14:54:03 +0100 Subject: [PATCH 538/607] FROMLIST: arm64: Disable TTBR0_EL1 during normal kernel execution When the TTBR0 PAN feature is enabled, the kernel entry points need to disable access to TTBR0_EL1. The PAN status of the interrupted context is stored as part of the saved pstate, reusing the PSR_PAN_BIT (22). Restoring access to TTBR0_PAN is done on exception return if returning to user or returning to a context where PAN was disabled. Context switching via switch_mm() must defer the update of TTBR0_EL1 until a return to user or an explicit uaccess_enable() call. Special care needs to be taken for two cases where TTBR0_EL1 is set outside the normal kernel context switch operation: EFI run-time services (via efi_set_pgd) and CPU suspend (via cpu_(un)install_idmap). Code has been added to avoid deferred TTBR0_EL1 switching as in switch_mm() and restore the reserved TTBR0_EL1 when uninstalling the special TTBR0_EL1. This patch also removes a stale comment on the switch_mm() function. Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: Id1198cf1cde022fad10a94f95d698fae91d742aa (cherry picked from commit d26cfd64c973b31f73091c882e07350e14fdd6c9) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/efi.h | 26 ++++++++++- arch/arm64/include/asm/mmu_context.h | 51 +++++++++++++++------ arch/arm64/include/asm/ptrace.h | 2 + arch/arm64/kernel/entry.S | 67 ++++++++++++++++++++++++++++ arch/arm64/kernel/setup.c | 9 ++++ arch/arm64/mm/context.c | 7 ++- 6 files changed, 146 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 8e88a696c9cb..932f5a56d1a6 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -1,6 +1,7 @@ #ifndef _ASM_EFI_H #define _ASM_EFI_H +#include #include #include #include @@ -69,7 +70,30 @@ int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); static inline void efi_set_pgd(struct mm_struct *mm) { - switch_mm(NULL, mm, NULL); + __switch_mm(mm); + + if (system_uses_ttbr0_pan()) { + if (mm != current->active_mm) { + /* + * Update the current thread's saved ttbr0 since it is + * restored as part of a return from exception. Set + * the hardware TTBR0_EL1 using cpu_switch_mm() + * directly to enable potential errata workarounds. + */ + update_saved_ttbr0(current, mm); + cpu_switch_mm(mm->pgd, mm); + } else { + /* + * Defer the switch to the current thread's TTBR0_EL1 + * until uaccess_enable(). Restore the current + * thread's saved ttbr0 corresponding to its active_mm + * (if different from init_mm). 
+ */ + cpu_set_reserved_ttbr0(); + if (current->active_mm != &init_mm) + update_saved_ttbr0(current, current->active_mm); + } + } } void efi_virtmap_load(void); diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index a00f7cf35bbd..4a32fd5f101d 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -113,7 +114,7 @@ static inline void cpu_uninstall_idmap(void) local_flush_tlb_all(); cpu_set_default_tcr_t0sz(); - if (mm != &init_mm) + if (mm != &init_mm && !system_uses_ttbr0_pan()) cpu_switch_mm(mm->pgd, mm); } @@ -173,21 +174,27 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { } -/* - * This is the actual mm switch as far as the scheduler - * is concerned. No registers are touched. We avoid - * calling the CPU specific function when the mm hasn't - * actually changed. - */ -static inline void -switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +static inline void update_saved_ttbr0(struct task_struct *tsk, + struct mm_struct *mm) +{ + if (system_uses_ttbr0_pan()) { + BUG_ON(mm->pgd == swapper_pg_dir); + task_thread_info(tsk)->ttbr0 = + virt_to_phys(mm->pgd) | ASID(mm) << 48; + } +} +#else +static inline void update_saved_ttbr0(struct task_struct *tsk, + struct mm_struct *mm) +{ +} +#endif + +static inline void __switch_mm(struct mm_struct *next) { unsigned int cpu = smp_processor_id(); - if (prev == next) - return; - /* * init_mm.pgd does not contain any user mappings and it is always * active for kernel addresses in TTBR1. Just set the reserved TTBR0. @@ -200,7 +207,23 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, check_and_switch_context(next, cpu); } +static inline void +switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + if (prev != next) + __switch_mm(next); + + /* + * Update the saved TTBR0_EL1 of the scheduled-in task as the previous + * value may have not been initialised yet (activate_mm caller) or the + * ASID has changed since the last run (following the context switch + * of another thread of the same process). + */ + update_saved_ttbr0(tsk, next); +} + #define deactivate_mm(tsk,mm) do { } while (0) -#define activate_mm(prev,next) switch_mm(prev, next, NULL) +#define activate_mm(prev,next) switch_mm(prev, next, current) #endif diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 200d5c32b38f..dd4257e286b1 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -21,6 +21,8 @@ #include +#define _PSR_PAN_BIT 22 + /* Current Exception Level values, as contained in CurrentEL */ #define CurrentEL_EL1 (1 << 2) #define CurrentEL_EL2 (2 << 2) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 8eb8eb085036..eb185c93dfc6 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -29,7 +29,9 @@ #include #include #include +#include #include +#include #include /* @@ -109,6 +111,34 @@ mrs x22, elr_el1 mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Set the TTBR0 PAN bit in SPSR. When the exception is taken from + * EL0, there is no need to check the state of TTBR0_EL1 since + * accesses are always enabled. + * Note that the meaning of this bit differs from the ARMv8.1 PAN + * feature as all TTBR0_EL1 accesses are disabled, not just those to + * user mappings. 
+ */ +alternative_if_not ARM64_HAS_PAN + nop +alternative_else + b 1f // skip TTBR0 PAN +alternative_endif + + .if \el != 0 + mrs x21, ttbr0_el1 + tst x21, #0xffff << 48 // Check for the reserved ASID + orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR + b.eq 1f // TTBR0 access already disabled + and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR + .endif + + uaccess_ttbr0_disable x21 +1: +#endif + stp x22, x23, [sp, #S_PC] /* @@ -147,6 +177,42 @@ ldp x21, x22, [sp, #S_PC] // load ELR, SPSR .if \el == 0 ct_user_enter + .endif + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR + * PAN bit checking. + */ +alternative_if_not ARM64_HAS_PAN + nop +alternative_else + b 2f // skip TTBR0 PAN +alternative_endif + + .if \el != 0 + tbnz x22, #_PSR_PAN_BIT, 1f // Skip re-enabling TTBR0 access if previously disabled + .endif + + uaccess_ttbr0_enable x0 + + .if \el == 0 + /* + * Enable errata workarounds only if returning to user. The only + * workaround currently required for TTBR0_EL1 changes are for the + * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache + * corruption). + */ + post_ttbr0_update_workaround + .endif +1: + .if \el != 0 + and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit + .endif +2: +#endif + + .if \el == 0 ldr x23, [sp, #S_SP] // load return stack pointer msr sp_el0, x23 #ifdef CONFIG_ARM64_ERRATUM_845719 @@ -168,6 +234,7 @@ alternative_else alternative_endif #endif .endif + msr elr_el1, x21 // set up the return data msr spsr_el1, x22 ldp x0, x1, [sp, #16 * 0] diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 29b8c247d56f..6591bf23422b 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -347,6 +347,15 @@ void __init setup_arch(char **cmdline_p) smp_init_cpus(); smp_build_mpidr_hash(); +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Make sure init_thread_info.ttbr0 always generates translation + * faults in case uaccess_enable() is inadvertently called by the init + * thread. + */ + init_thread_info.ttbr0 = virt_to_phys(empty_zero_page); +#endif + #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) conswitchp = &vga_con; diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 7275628ba59f..25128089c386 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -182,7 +182,12 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); switch_mm_fastpath: - cpu_switch_mm(mm->pgd, mm); + /* + * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when + * emulating PAN. + */ + if (!system_uses_ttbr0_pan()) + cpu_switch_mm(mm->pgd, mm); } static int asids_init(void) From 5dc2b7c7bb33138270ff9494be6cf334bd3d20e1 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 18:22:39 +0100 Subject: [PATCH 539/607] FROMLIST: arm64: Handle faults caused by inadvertent user access with PAN enabled When TTBR0_EL1 is set to the reserved page, an erroneous kernel access to user space would generate a translation fault. This patch adds the checks for the software-set PSR_PAN_BIT to emulate a permission fault and report it accordingly. This patch also updates the description of the synchronous external aborts on translation table walks. 
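The resulting classification is easier to read in isolation (simplified restatement of the is_permission_fault() change below, shown under a hypothetical name):

	static bool fault_is_permission_like(unsigned int esr, struct pt_regs *regs)
	{
		unsigned int ec = ESR_ELx_EC(esr);
		unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE;

		if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR)
			return false;

		if (system_uses_ttbr0_pan())
			/* reserved TTBR0 makes the access appear as a translation fault */
			return fsc_type == ESR_ELx_FSC_FAULT &&
			       (regs->pstate & PSR_PAN_BIT);

		return fsc_type == ESR_ELx_FSC_PERM;	/* hardware PAN */
	}
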
Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: I623113fc8bf6d5f023aeec7a0640b62a25ef8420 (cherry picked from commit be3db9340c8011d22f06715339b66bcbbd4893bd) Signed-off-by: Sami Tolvanen --- arch/arm64/mm/fault.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index e328430dbcec..f0c75fd6a3fa 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -246,13 +246,19 @@ out: return fault; } -static inline bool is_permission_fault(unsigned int esr) +static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs) { unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; - return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM) || - (ec == ESR_ELx_EC_IABT_CUR && fsc_type == ESR_ELx_FSC_PERM); + if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR) + return false; + + if (system_uses_ttbr0_pan()) + return fsc_type == ESR_ELx_FSC_FAULT && + (regs->pstate & PSR_PAN_BIT); + else + return fsc_type == ESR_ELx_FSC_PERM; } static bool is_el0_instruction_abort(unsigned int esr) @@ -293,7 +299,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, mm_flags |= FAULT_FLAG_WRITE; } - if (is_permission_fault(esr) && (addr < USER_DS)) { + if (addr < USER_DS && is_permission_fault(esr, regs)) { /* regs->orig_addr_limit may be 0 if we entered from EL0 */ if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); @@ -478,10 +484,10 @@ static const struct fault_info { { do_bad, SIGBUS, 0, "unknown 17" }, { do_bad, SIGBUS, 0, "unknown 18" }, { do_bad, SIGBUS, 0, "unknown 19" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, { do_bad, SIGBUS, 0, "synchronous parity error" }, { do_bad, SIGBUS, 0, "unknown 25" }, { do_bad, SIGBUS, 0, "unknown 26" }, From 4dbc88bd2b6a74fd33483ee2593dcf2bd858eabe Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 5 Jul 2016 12:25:15 +0100 Subject: [PATCH 540/607] FROMLIST: arm64: xen: Enable user access before a privcmd hvc call Privcmd calls are issued by the userspace. The kernel needs to enable access to TTBR0_EL1 as the hypervisor would issue stage 1 translations to user memory via AT instructions. Since AT instructions are not affected by the PAN bit (ARMv8.1), we only need the explicit uaccess_enable/disable if the TTBR0 PAN option is enabled. 
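The change itself lives in hypercall.S, but its shape is easier to see in C (illustrative only; xen_hvc() is a made-up stand-in for the hvc #XEN_IMM instruction):

	static long privcmd_call_sketch(unsigned int call, u64 a1, u64 a2,
					u64 a3, u64 a4, u64 a5)
	{
		long ret;

		/* Open the user-access window only around the hypercall itself;
		 * only needed when the TTBR0 PAN emulation is in use. */
		uaccess_enable_not_uao();
		ret = xen_hvc(call, a1, a2, a3, a4, a5);	/* hypothetical wrapper */
		uaccess_disable_not_uao();

		return ret;
	}
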
Reviewed-by: Julien Grall Acked-by: Stefano Stabellini Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: I927f14076ba94c83e609b19f46dd373287e11fc4 (cherry picked from commit 8cc1f33d2c9f206b6505bedba41aed2b33c203c0) Signed-off-by: Sami Tolvanen --- arch/arm64/xen/hypercall.S | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index 8bbe9401f4f0..6d6e4af1a4bf 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -49,6 +49,7 @@ #include #include +#include #include @@ -89,6 +90,24 @@ ENTRY(privcmd_call) mov x2, x3 mov x3, x4 mov x4, x5 +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Privcmd calls are issued by the userspace. The kernel needs to + * enable access to TTBR0_EL1 as the hypervisor would issue stage 1 + * translations to user memory via AT instructions. Since AT + * instructions are not affected by the PAN bit (ARMv8.1), we only + * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation + * is enabled (it implies that hardware UAO and PAN disabled). + */ + uaccess_enable_not_uao x6, x7 +#endif hvc XEN_IMM + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Disable userspace access from kernel once the hyp call completed. + */ + uaccess_disable_not_uao x6 +#endif ret ENDPROC(privcmd_call); From 67cd3bda54dadba4f8892105adf9c2f3982bfa0a Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 18:25:31 +0100 Subject: [PATCH 541/607] FROMLIST: arm64: Enable CONFIG_ARM64_SW_TTBR0_PAN This patch adds the Kconfig option to enable support for TTBR0 PAN emulation. The option is default off because of a slight performance hit when enabled, caused by the additional TTBR0_EL1 switching during user access operations or exception entry/exit code. Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: Id00a8ad4169d6eb6176c468d953436eb4ae887ae (cherry picked from commit 6a2d7bad43474c48b68394d455b84a16b7d7dc3f) Signed-off-by: Sami Tolvanen --- arch/arm64/Kconfig | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index bf24d67a1ff1..488fed2d5e5e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -706,6 +706,14 @@ config SETEND_EMULATION If unsure, say Y endif +config ARM64_SW_TTBR0_PAN + bool "Emulate Priviledged Access Never using TTBR0_EL1 switching" + help + Enabling this option prevents the kernel from accessing + user-space memory directly by pointing TTBR0_EL1 to a reserved + zeroed area and reserved ASID. The user access routines + restore the valid TTBR0_EL1 temporarily. + menu "ARMv8.1 architectural features" config ARM64_HW_AFDBM From d625b6286ac7af9bc19811c1a625867a71c631bf Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 12 Jan 2016 14:22:45 +0100 Subject: [PATCH 542/607] UPSTREAM: ia64: split off early_ioremap() declarations into asm/early_ioremap.h Unlike x86, arm64 and ARM, ia64 does not declare its implementations of early_ioremap/early_iounmap/early_memremap/early_memunmap in a header file called This complicates the use of these functions in generic code, since the header cannot be included directly, and we have to rely on transitive includes, which is fragile. So create a for ia64, and move the existing definitions into it. 
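With the new asm/early_ioremap.h in place, a hypothetical consumer in generic code can include it directly rather than relying on transitive includes (illustrative usage, not part of this patch):

	#include <asm/early_ioremap.h>

	static void __init parse_fw_table(unsigned long pa, unsigned long len)
	{
		void *p = early_memremap(pa, len);

		if (p) {
			/* ... read the table before normal ioremap is available ... */
			early_memunmap(p, len);
		}
	}
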
Signed-off-by: Ard Biesheuvel Signed-off-by: Tony Luck Change-Id: I31eb55a9d57596faa40aec64bd26ce3ec21b0b4d (cherry picked from commit 809267708557ed5575831282f719ca644698084b) Signed-off-by: Sami Tolvanen --- arch/ia64/include/asm/early_ioremap.h | 10 ++++++++++ arch/ia64/include/asm/io.h | 5 +---- 2 files changed, 11 insertions(+), 4 deletions(-) create mode 100644 arch/ia64/include/asm/early_ioremap.h diff --git a/arch/ia64/include/asm/early_ioremap.h b/arch/ia64/include/asm/early_ioremap.h new file mode 100644 index 000000000000..eec9e1d1b833 --- /dev/null +++ b/arch/ia64/include/asm/early_ioremap.h @@ -0,0 +1,10 @@ +#ifndef _ASM_IA64_EARLY_IOREMAP_H +#define _ASM_IA64_EARLY_IOREMAP_H + +extern void __iomem * early_ioremap (unsigned long phys_addr, unsigned long size); +#define early_memremap(phys_addr, size) early_ioremap(phys_addr, size) + +extern void early_iounmap (volatile void __iomem *addr, unsigned long size); +#define early_memunmap(addr, size) early_iounmap(addr, size) + +#endif diff --git a/arch/ia64/include/asm/io.h b/arch/ia64/include/asm/io.h index 9041bbe2b7b4..a865d2a04f75 100644 --- a/arch/ia64/include/asm/io.h +++ b/arch/ia64/include/asm/io.h @@ -20,6 +20,7 @@ */ #include +#include /* We don't use IO slowdowns on the ia64, but.. */ #define __SLOW_DOWN_IO do { } while (0) @@ -427,10 +428,6 @@ __writeq (unsigned long val, volatile void __iomem *addr) extern void __iomem * ioremap(unsigned long offset, unsigned long size); extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size); extern void iounmap (volatile void __iomem *addr); -extern void __iomem * early_ioremap (unsigned long phys_addr, unsigned long size); -#define early_memremap(phys_addr, size) early_ioremap(phys_addr, size) -extern void early_iounmap (volatile void __iomem *addr, unsigned long size); -#define early_memunmap(addr, size) early_iounmap(addr, size) static inline void __iomem * ioremap_cache (unsigned long phys_addr, unsigned long size) { return ioremap(phys_addr, size); From a92cff09551d9ef0605af1cf6a7d9ee087d036c3 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 12 Jan 2016 14:22:46 +0100 Subject: [PATCH 543/607] UPSTREAM: efi: include asm/early_ioremap.h not asm/efi.h to get early_memremap The code in efi.c uses early_memremap(), but relies on a transitive include rather than including asm/early_ioremap.h directly, since this header did not exist on ia64. Commit f7d924894265 ("arm64/efi: refactor EFI init and runtime code for reuse by 32-bit ARM") attempted to work around this by including asm/efi.h, which transitively includes asm/early_ioremap.h on most architectures. However, since asm/efi.h does not exist on ia64 either, this is not much of an improvement. Now that we have created an asm/early_ioremap.h for ia64, we can just include it directly. 
Reported-by: Guenter Roeck Signed-off-by: Ard Biesheuvel Signed-off-by: Tony Luck Change-Id: Ifa3e69e0b4078bac1e1d29bfe56861eb394e865b (cherry picked from commit 0f7f2f0c0fcbe5e2bcba707a628ebaedfe2be4b4) Signed-off-by: Sami Tolvanen --- drivers/firmware/efi/efi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index cffa89b3317b..2cd37dad67a6 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -25,7 +25,7 @@ #include #include -#include +#include struct efi __read_mostly efi = { .mps = EFI_INVALID_TABLE_ADDR, From 8fd31d5a2c06726a6060ea355bb5c90d44b132c4 Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Mon, 3 Oct 2016 16:17:34 -0700 Subject: [PATCH 544/607] Fix a build breakage in IO latency hist code. Fix a build breakage where MMC is enabled, but BLOCK is not. Change-Id: I0eb422d12264f0371f3368ae7c37342ef9efabaa Signed-off-by: Mohan Srinivasan --- drivers/mmc/core/core.c | 6 ++++++ drivers/mmc/core/host.c | 4 ++++ include/linux/mmc/core.h | 2 ++ 3 files changed, 12 insertions(+) diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index b503249950df..f9d928cbc214 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -183,6 +183,7 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) pr_debug("%s: %d bytes transferred: %d\n", mmc_hostname(host), mrq->data->bytes_xfered, mrq->data->error); +#ifdef CONFIG_BLOCK if (mrq->lat_hist_enabled) { ktime_t completion; u_int64_t delta_us; @@ -194,6 +195,7 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) (mrq->data->flags & MMC_DATA_READ), delta_us); } +#endif trace_mmc_blk_rw_end(cmd->opcode, cmd->arg, mrq->data); } @@ -638,11 +640,13 @@ struct mmc_async_req *mmc_start_req(struct mmc_host *host, } if (!err && areq) { +#ifdef CONFIG_BLOCK if (host->latency_hist_enabled) { areq->mrq->io_start = ktime_get(); areq->mrq->lat_hist_enabled = 1; } else areq->mrq->lat_hist_enabled = 0; +#endif trace_mmc_blk_rw_start(areq->mrq->cmd->opcode, areq->mrq->cmd->arg, areq->mrq->data); @@ -2923,6 +2927,7 @@ static void __exit mmc_exit(void) destroy_workqueue(workqueue); } +#ifdef CONFIG_BLOCK static ssize_t latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -2970,6 +2975,7 @@ mmc_latency_hist_sysfs_exit(struct mmc_host *host) { device_remove_file(&host->class_dev, &dev_attr_latency_hist); } +#endif subsys_initcall(mmc_init); module_exit(mmc_exit); diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 17068839c74b..443fdfc22d8a 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -392,7 +392,9 @@ int mmc_add_host(struct mmc_host *host) mmc_add_host_debugfs(host); #endif +#ifdef CONFIG_BLOCK mmc_latency_hist_sysfs_init(host); +#endif mmc_start_host(host); if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY)) @@ -422,7 +424,9 @@ void mmc_remove_host(struct mmc_host *host) mmc_remove_host_debugfs(host); #endif +#ifdef CONFIG_BLOCK mmc_latency_hist_sysfs_exit(host); +#endif device_del(&host->class_dev); diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index 3349f0676acb..0860efd6e1be 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -137,7 +137,9 @@ struct mmc_request { void (*done)(struct mmc_request *);/* completion function */ struct mmc_host *host; ktime_t io_start; +#ifdef CONFIG_BLOCK int lat_hist_enabled; +#endif }; struct mmc_card; From 1a6c07cf6f2671e1a8a561745bdcc295a7cfeb17 Mon Sep 17 00:00:00 
2001 From: Jeremy Linton Date: Fri, 19 Feb 2016 11:50:32 -0600 Subject: [PATCH 545/607] BACKPORT: arm64: mm: Mark .rodata as RO Currently the .rodata section is actually still executable when DEBUG_RODATA is enabled. This changes that so the .rodata is actually read only, no execute. It also adds the .rodata section to the mem_init banner. Signed-off-by: Jeremy Linton Reviewed-by: Kees Cook Acked-by: Mark Rutland [catalin.marinas@arm.com: added vm_struct vmlinux_rodata in map_kernel()] Signed-off-by: Catalin Marinas Bug: 31374226 Change-Id: I6fd95beaf814fc91805da12c5329a57ce9008fd7 (cherry picked from commit 2f39b5f91eb4bccd786d194e70db1dccad784755) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/vmlinux.lds.S | 6 +++--- arch/arm64/mm/init.c | 2 ++ arch/arm64/mm/mmu.c | 25 ++++++++++++++++--------- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 87fd0556a1bd..f1d6c49dcc5f 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -114,14 +114,14 @@ SECTIONS *(.got) /* Global offset table */ } + ALIGN_DEBUG_RO_MIN(PAGE_SIZE) _etext = .; /* End of text section */ - RO_DATA(PAGE_SIZE) - EXCEPTION_TABLE(8) + RO_DATA(PAGE_SIZE) /* everything from this point to */ + EXCEPTION_TABLE(8) /* __init_begin will be marked RO NX */ NOTES ALIGN_DEBUG_RO_MIN(PAGE_SIZE) - _etext = .; /* End of text and rodata section */ __init_begin = .; INIT_TEXT_SECTION(8) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 0db12e9e8765..678878077996 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -376,6 +376,7 @@ void __init mem_init(void) " vmalloc : 0x%16lx - 0x%16lx (%6ld GB)\n" " .init : 0x%p" " - 0x%p" " (%6ld KB)\n" " .text : 0x%p" " - 0x%p" " (%6ld KB)\n" + " .rodata : 0x%p" " - 0x%p" " (%6ld KB)\n" " .data : 0x%p" " - 0x%p" " (%6ld KB)\n" #ifdef CONFIG_SPARSEMEM_VMEMMAP " vmemmap : 0x%16lx - 0x%16lx (%6ld GB maximum)\n" @@ -391,6 +392,7 @@ void __init mem_init(void) MLG(VMALLOC_START, VMALLOC_END), MLK_ROUNDUP(__init_begin, __init_end), MLK_ROUNDUP(_text, _etext), + MLK_ROUNDUP(__start_rodata, __init_begin), MLK_ROUNDUP(_sdata, _edata), #ifdef CONFIG_SPARSEMEM_VMEMMAP MLG(VMEMMAP_START, diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9dafcfde7c26..4b38f6b8806e 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -386,14 +386,14 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) { unsigned long kernel_start = __pa(_stext); - unsigned long kernel_end = __pa(_etext); + unsigned long kernel_end = __pa(__init_begin); /* * Take care not to create a writable alias for the * read-only text and rodata sections of the kernel image. */ - /* No overlap with the kernel text */ + /* No overlap with the kernel text/rodata */ if (end < kernel_start || start >= kernel_end) { __create_pgd_mapping(pgd, start, __phys_to_virt(start), end - start, PAGE_KERNEL, @@ -402,7 +402,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end } /* - * This block overlaps the kernel text mapping. + * This block overlaps the kernel text/rodata mapping. * Map the portion(s) which don't overlap. 
*/ if (start < kernel_start) @@ -417,7 +417,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end early_pgtable_alloc); /* - * Map the linear alias of the [_stext, _etext) interval as + * Map the linear alias of the [_stext, __init_begin) interval as * read-only/non-executable. This makes the contents of the * region accessible to subsystems such as hibernate, but * protects it from inadvertent modification or execution. @@ -447,12 +447,18 @@ static void __init map_mem(pgd_t *pgd) void mark_rodata_ro(void) { - if (!IS_ENABLED(CONFIG_DEBUG_RODATA)) - return; + unsigned long section_size; + section_size = (unsigned long)_etext - (unsigned long)_stext; create_mapping_late(__pa(_stext), (unsigned long)_stext, - (unsigned long)__init_begin - (unsigned long)_stext, - PAGE_KERNEL_ROX); + section_size, PAGE_KERNEL_ROX); + /* + * mark .rodata as read only. Use __init_begin rather than __end_rodata + * to cover NOTES and EXCEPTION_TABLE. + */ + section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata; + create_mapping_late(__pa(__start_rodata), (unsigned long)__start_rodata, + section_size, PAGE_KERNEL_RO); } void fixup_init(void) @@ -491,9 +497,10 @@ static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, */ static void __init map_kernel(pgd_t *pgd) { - static struct vm_struct vmlinux_text, vmlinux_init, vmlinux_data; + static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_init, vmlinux_data; map_kernel_chunk(pgd, _stext, _etext, PAGE_KERNEL_EXEC, &vmlinux_text); + map_kernel_chunk(pgd, __start_rodata, __init_begin, PAGE_KERNEL, &vmlinux_rodata); map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC, &vmlinux_init); map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data); From 50bad62b51e802b5bd5e731915234905fbe4d4f6 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Wed, 21 Sep 2016 15:25:04 -0700 Subject: [PATCH 546/607] BACKPORT: arm64: Correctly bounds check virt_addr_valid virt_addr_valid is supposed to return true if and only if virt_to_page returns a valid page structure. The current macro does math on whatever address is given and passes that to pfn_valid to verify. vmalloc and module addresses can happen to generate a pfn that 'happens' to be valid. Fix this by only performing the pfn_valid check on addresses that have the potential to be valid. 
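A concrete way to see the problem (illustrative, not part of the patch): a vmalloc'ed pointer lies outside the linear map, so __pa() on it is meaningless, yet the derived pfn can still happen to satisfy pfn_valid():

	static void __init virt_addr_valid_demo(void)
	{
		void *v = vmalloc(PAGE_SIZE);		/* error handling omitted */
		void *k = kmalloc(64, GFP_KERNEL);

		WARN_ON(virt_addr_valid(v));		/* vmalloc: must now be false */
		WARN_ON(!virt_addr_valid(k));		/* linear map: stays true */

		kfree(k);
		vfree(v);
	}
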
Acked-by: Mark Rutland Signed-off-by: Laura Abbott Signed-off-by: Will Deacon Bug: 31374226 Change-Id: I75cbeb3edb059f19af992b7f5d0baa283f95991b (cherry picked from commit ca219452c6b8a6cd1369b6a78b1cf069d0386865) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/memory.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 12f8a00fb3f1..ba1b3409d7ed 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -193,7 +193,11 @@ static inline void *phys_to_virt(phys_addr_t x) #define ARCH_PFN_OFFSET ((unsigned long)PHYS_PFN_OFFSET) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define _virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) + +#define _virt_addr_is_linear(kaddr) (((u64)(kaddr)) >= PAGE_OFFSET) +#define virt_addr_valid(kaddr) (_virt_addr_is_linear(kaddr) && \ + _virt_addr_valid(kaddr)) #endif From 762a327de1b8dc48f813c7d397f7b627cff215c5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 8 Sep 2016 09:57:07 +0200 Subject: [PATCH 547/607] UPSTREAM: fs/proc/kcore.c: Make bounce buffer global for read Next patch adds bounce buffer for ktext area, so it's convenient to have single bounce buffer for both vmalloc/module and ktext cases. Suggested-by: Linus Torvalds Signed-off-by: Jiri Olsa Acked-by: Kees Cook Signed-off-by: Linus Torvalds Bug: 31374226 Change-Id: I8f517354e6d12aed75ed4ae6c0a6adef0a1e61da (cherry picked from commit f5beeb1851ea6f8cfcf2657f26cb24c0582b4945) Signed-off-by: Sami Tolvanen --- fs/proc/kcore.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 92e6726f6e37..2dc84040b0cb 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -430,6 +430,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) static ssize_t read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) { + char *buf = file->private_data; ssize_t acc = 0; size_t size, tsz; size_t elf_buflen; @@ -500,18 +501,10 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) if (clear_user(buffer, tsz)) return -EFAULT; } else if (is_vmalloc_or_module_addr((void *)start)) { - char * elf_buf; - - elf_buf = kzalloc(tsz, GFP_KERNEL); - if (!elf_buf) - return -ENOMEM; - vread(elf_buf, (char *)start, tsz); + vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ - if (copy_to_user(buffer, elf_buf, tsz)) { - kfree(elf_buf); + if (copy_to_user(buffer, buf, tsz)) return -EFAULT; - } - kfree(elf_buf); } else { if (kern_addr_valid(start)) { unsigned long n; @@ -549,6 +542,11 @@ static int open_kcore(struct inode *inode, struct file *filp) { if (!capable(CAP_SYS_RAWIO)) return -EPERM; + + filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!filp->private_data) + return -ENOMEM; + if (kcore_need_update) kcore_update_ram(); if (i_size_read(inode) != proc_root_kcore->size) { @@ -559,10 +557,16 @@ static int open_kcore(struct inode *inode, struct file *filp) return 0; } +static int release_kcore(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} static const struct file_operations proc_kcore_operations = { .read = read_kcore, .open = open_kcore, + .release = release_kcore, .llseek = default_llseek, }; From c7642969a507168ab9d4a5d4715030af5d2a11a7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa 
Date: Thu, 8 Sep 2016 09:57:08 +0200 Subject: [PATCH 548/607] UPSTREAM: fs/proc/kcore.c: Add bounce buffer for ktext data We hit hardened usercopy feature check for kernel text access by reading kcore file: usercopy: kernel memory exposure attempt detected from ffffffff8179a01f () (4065 bytes) kernel BUG at mm/usercopy.c:75! Bypassing this check for kcore by adding bounce buffer for ktext data. Reported-by: Steve Best Fixes: f5509cc18daa ("mm: Hardened usercopy") Suggested-by: Kees Cook Signed-off-by: Jiri Olsa Acked-by: Kees Cook Signed-off-by: Linus Torvalds Bug: 31374226 Change-Id: Ic93e6041b67d804a994518bf4950811f828b406e (cherry picked from commit df04abfd181acc276ba6762c8206891ae10ae00d) Signed-off-by: Sami Tolvanen --- fs/proc/kcore.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 2dc84040b0cb..21f198aa0961 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -509,7 +509,12 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) if (kern_addr_valid(start)) { unsigned long n; - n = copy_to_user(buffer, (char *)start, tsz); + /* + * Using bounce buffer to bypass the + * hardened user copy kernel text checks. + */ + memcpy(buf, (char *) start, tsz); + n = copy_to_user(buffer, buf, tsz); /* * We cannot distinguish between fault on source * and fault on destination. When this happens From d02eecfab2e3b76185f396a1e3dad004b6962696 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 5 Oct 2016 09:52:07 -0700 Subject: [PATCH 549/607] ANDROID: android-base: CONFIG_HARDENED_USERCOPY=y Bug: 31374226 Change-Id: I977e76395017d8d718ea634421b3635023934ef9 Signed-off-by: Sami Tolvanen --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 1ebef5c69fb6..8531a7a79e33 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -24,6 +24,7 @@ CONFIG_DM_VERITY=y CONFIG_DM_VERITY_FEC=y CONFIG_EMBEDDED=y CONFIG_FB=y +CONFIG_HARDENED_USERCOPY=y CONFIG_HIGH_RES_TIMERS=y CONFIG_INET6_AH=y CONFIG_INET6_ESP=y From 620ed669ae0bca79885abb5a3c242ac5bf000f3b Mon Sep 17 00:00:00 2001 From: EunTaik Lee Date: Wed, 24 Feb 2016 04:38:06 +0000 Subject: [PATCH 550/607] UPSTREAM: staging/android/ion : fix a race condition in the ion driver There is a use-after-free problem in the ion driver. This is caused by a race condition in the ion_ioctl() function. A handle has ref count of 1 and two tasks on different cpus calls ION_IOC_FREE simultaneously. cpu 0 cpu 1 ------------------------------------------------------- ion_handle_get_by_id() (ref == 2) ion_handle_get_by_id() (ref == 3) ion_free() (ref == 2) ion_handle_put() (ref == 1) ion_free() (ref == 0 so ion_handle_destroy() is called and the handle is freed.) ion_handle_put() is called and it decreases the slub's next free pointer The problem is detected as an unaligned access in the spin lock functions since it uses load exclusive instruction. In some cases it corrupts the slub's free pointer which causes a mis-aligned access to the next free pointer.(kmalloc returns a pointer like ffffc0745b4580aa). And it causes lots of other hard-to-debug problems. This symptom is caused since the first member in the ion_handle structure is the reference count and the ion driver decrements the reference after it has been freed. To fix this problem client->lock mutex is extended to protect all the codes that uses the handle. 
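The fix follows the usual rule for objects whose final reference drop frees them: the lookup, the use and the put must all happen under one lock, so no other task can free the handle in the window between them. With the _nolock helpers introduced below, the repaired ION_IOC_FREE path becomes, in essence:

    mutex_lock(&client->lock);
    handle = ion_handle_get_by_id_nolock(client, data.handle.handle);
    if (IS_ERR(handle)) {
            mutex_unlock(&client->lock);
            return PTR_ERR(handle);
    }
    ion_free_nolock(client, handle);    /* drop the userspace reference */
    ion_handle_put_nolock(handle);      /* drop the lookup reference    */
    mutex_unlock(&client->lock);        /* no window for a racing free  */
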
Signed-off-by: Eun Taik Lee Reviewed-by: Laura Abbott Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 9590232bb4f4cc824f3425a6e1349afbe6d6d2b7) bug: 31568617 Change-Id: I4ea2be0cad3305c4e196126a02e2ab7108ef0976 --- drivers/staging/android/ion/ion.c | 59 +++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index e237e9f3312d..1958d58fdca3 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -385,13 +385,22 @@ static void ion_handle_get(struct ion_handle *handle) kref_get(&handle->ref); } -static int ion_handle_put(struct ion_handle *handle) +static int ion_handle_put_nolock(struct ion_handle *handle) +{ + int ret; + + ret = kref_put(&handle->ref, ion_handle_destroy); + + return ret; +} + +int ion_handle_put(struct ion_handle *handle) { struct ion_client *client = handle->client; int ret; mutex_lock(&client->lock); - ret = kref_put(&handle->ref, ion_handle_destroy); + ret = ion_handle_put_nolock(handle); mutex_unlock(&client->lock); return ret; @@ -415,18 +424,28 @@ static struct ion_handle *ion_handle_lookup(struct ion_client *client, return ERR_PTR(-EINVAL); } -static struct ion_handle *ion_handle_get_by_id(struct ion_client *client, +static struct ion_handle *ion_handle_get_by_id_nolock(struct ion_client *client, + int id) +{ + struct ion_handle *handle; + + handle = idr_find(&client->idr, id); + if (handle) + ion_handle_get(handle); + + return handle ? handle : ERR_PTR(-EINVAL); +} + +struct ion_handle *ion_handle_get_by_id(struct ion_client *client, int id) { struct ion_handle *handle; mutex_lock(&client->lock); - handle = idr_find(&client->idr, id); - if (handle) - ion_handle_get(handle); + handle = ion_handle_get_by_id_nolock(client, id); mutex_unlock(&client->lock); - return handle ? 
handle : ERR_PTR(-EINVAL); + return handle; } static bool ion_handle_validate(struct ion_client *client, @@ -530,22 +549,28 @@ struct ion_handle *ion_alloc(struct ion_client *client, size_t len, } EXPORT_SYMBOL(ion_alloc); -void ion_free(struct ion_client *client, struct ion_handle *handle) +static void ion_free_nolock(struct ion_client *client, struct ion_handle *handle) { bool valid_handle; BUG_ON(client != handle->client); - mutex_lock(&client->lock); valid_handle = ion_handle_validate(client, handle); if (!valid_handle) { WARN(1, "%s: invalid handle passed to free.\n", __func__); - mutex_unlock(&client->lock); return; } + ion_handle_put_nolock(handle); +} + +void ion_free(struct ion_client *client, struct ion_handle *handle) +{ + BUG_ON(client != handle->client); + + mutex_lock(&client->lock); + ion_free_nolock(client, handle); mutex_unlock(&client->lock); - ion_handle_put(handle); } EXPORT_SYMBOL(ion_free); @@ -1281,11 +1306,15 @@ static long ion_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct ion_handle *handle; - handle = ion_handle_get_by_id(client, data.handle.handle); - if (IS_ERR(handle)) + mutex_lock(&client->lock); + handle = ion_handle_get_by_id_nolock(client, data.handle.handle); + if (IS_ERR(handle)) { + mutex_unlock(&client->lock); return PTR_ERR(handle); - ion_free(client, handle); - ion_handle_put(handle); + } + ion_free_nolock(client, handle); + ion_handle_put_nolock(handle); + mutex_unlock(&client->lock); break; } case ION_IOC_SHARE: From b81f4c5f3167142b5b0b91b474658a850e8c5450 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arve=20Hj=C3=B8nnev=C3=A5g?= Date: Tue, 2 Aug 2016 15:40:39 -0700 Subject: [PATCH 551/607] ANDROID: binder: Add strong ref checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prevent using a binder_ref with only weak references where a strong reference is required. 
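The check is made in the descriptor lookup itself: binder_get_ref() gains a need_strong_ref argument, and a reference that only holds weak counts now fails the lookup when a strong use is about to happen, instead of being promoted silently. The call sites updated in the diff below boil down to:

    binder_get_ref(proc, tr->target.handle, true);                          /* transaction target          */
    binder_get_ref(proc, fp->handle, fp->type == BINDER_TYPE_HANDLE);       /* handle object being passed  */
    binder_get_ref(proc, target, cmd == BC_ACQUIRE || cmd == BC_RELEASE);   /* BC_* refcount commands      */
    binder_get_ref(proc, target, false);                                    /* death notification requests */
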
BUG: 30445380 Change-Id: I66c15b066808f28bd27bfe50fd0e03ff45a09fca Signed-off-by: Arve Hjønnevåg --- drivers/android/binder.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index f0ce9959d14d..ce84ff14ce9b 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1003,7 +1003,7 @@ static int binder_dec_node(struct binder_node *node, int strong, int internal) static struct binder_ref *binder_get_ref(struct binder_proc *proc, - uint32_t desc) + uint32_t desc, bool need_strong_ref) { struct rb_node *n = proc->refs_by_desc.rb_node; struct binder_ref *ref; @@ -1011,12 +1011,16 @@ static struct binder_ref *binder_get_ref(struct binder_proc *proc, while (n) { ref = rb_entry(n, struct binder_ref, rb_node_desc); - if (desc < ref->desc) + if (desc < ref->desc) { n = n->rb_left; - else if (desc > ref->desc) + } else if (desc > ref->desc) { n = n->rb_right; - else + } else if (need_strong_ref && !ref->strong) { + binder_user_error("tried to use weak ref as strong ref\n"); + return NULL; + } else { return ref; + } } return NULL; } @@ -1286,7 +1290,8 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { - struct binder_ref *ref = binder_get_ref(proc, fp->handle); + struct binder_ref *ref = binder_get_ref(proc, fp->handle, + fp->type == BINDER_TYPE_HANDLE); if (ref == NULL) { pr_err("transaction release %d bad handle %d\n", @@ -1381,7 +1386,7 @@ static void binder_transaction(struct binder_proc *proc, if (tr->target.handle) { struct binder_ref *ref; - ref = binder_get_ref(proc, tr->target.handle); + ref = binder_get_ref(proc, tr->target.handle, true); if (ref == NULL) { binder_user_error("%d:%d got transaction to invalid handle\n", proc->pid, thread->pid); @@ -1590,7 +1595,8 @@ static void binder_transaction(struct binder_proc *proc, } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { - struct binder_ref *ref = binder_get_ref(proc, fp->handle); + struct binder_ref *ref = binder_get_ref(proc, fp->handle, + fp->type == BINDER_TYPE_HANDLE); if (ref == NULL) { binder_user_error("%d:%d got transaction with invalid handle, %d\n", @@ -1801,7 +1807,9 @@ static int binder_thread_write(struct binder_proc *proc, ref->desc); } } else - ref = binder_get_ref(proc, target); + ref = binder_get_ref(proc, target, + cmd == BC_ACQUIRE || + cmd == BC_RELEASE); if (ref == NULL) { binder_user_error("%d:%d refcount change on invalid ref %d\n", proc->pid, thread->pid, target); @@ -1997,7 +2005,7 @@ static int binder_thread_write(struct binder_proc *proc, if (get_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); - ref = binder_get_ref(proc, target); + ref = binder_get_ref(proc, target, false); if (ref == NULL) { binder_user_error("%d:%d %s invalid ref %d\n", proc->pid, thread->pid, From 78c26bebd182874a7383a5d07446673b61d1ea0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arve=20Hj=C3=B8nnev=C3=A5g?= Date: Fri, 12 Aug 2016 16:04:28 -0700 Subject: [PATCH 552/607] ANDROID: binder: Clear binder and cookie when setting handle in flat binder struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prevents leaking pointers between processes BUG: 30768347 Change-Id: Id898076926f658a1b8b27a3ccb848756b36de4ca Signed-off-by: Arve Hjønnevåg --- drivers/android/binder.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/android/binder.c 
b/drivers/android/binder.c index ce84ff14ce9b..b8ba5b7516b9 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1583,7 +1583,9 @@ static void binder_transaction(struct binder_proc *proc, fp->type = BINDER_TYPE_HANDLE; else fp->type = BINDER_TYPE_WEAK_HANDLE; + fp->binder = 0; fp->handle = ref->desc; + fp->cookie = 0; binder_inc_ref(ref, fp->type == BINDER_TYPE_HANDLE, &thread->todo); @@ -1631,7 +1633,9 @@ static void binder_transaction(struct binder_proc *proc, return_error = BR_FAILED_REPLY; goto err_binder_get_ref_for_node_failed; } + fp->binder = 0; fp->handle = new_ref->desc; + fp->cookie = 0; binder_inc_ref(new_ref, fp->type == BINDER_TYPE_HANDLE, NULL); trace_binder_transaction_ref_to_ref(t, ref, new_ref); @@ -1685,6 +1689,7 @@ static void binder_transaction(struct binder_proc *proc, binder_debug(BINDER_DEBUG_TRANSACTION, " fd %d -> %d\n", fp->handle, target_fd); /* TODO: fput? */ + fp->binder = 0; fp->handle = target_fd; } break; From 077d045666866063087a1e71e5e82ede3e139d96 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 May 2016 11:48:25 -0400 Subject: [PATCH 553/607] UPSTREAM: percpu: fix synchronization between chunk->map_extend_work and chunk destruction (cherry picked from commit 4f996e234dad488e5d9ba0858bc1bae12eff82c3) Atomic allocations can trigger async map extensions which is serviced by chunk->map_extend_work. pcpu_balance_work which is responsible for destroying idle chunks wasn't synchronizing properly against chunk->map_extend_work and may end up freeing the chunk while the work item is still in flight. This patch fixes the bug by rolling async map extension operations into pcpu_balance_work. Signed-off-by: Tejun Heo Reported-and-tested-by: Alexei Starovoitov Reported-by: Vlastimil Babka Reported-by: Sasha Levin Cc: stable@vger.kernel.org # v3.18+ Fixes: 9c824b6a172c ("percpu: make sure chunk->map array has available space") Change-Id: I8f4aaf7fe0bc0e9f353d41e0a7840c40d6a32117 Bug: 31596597 --- mm/percpu.c | 57 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 8a943b97a053..58b014900f0f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -110,7 +110,7 @@ struct pcpu_chunk { int map_used; /* # of map entries used before the sentry */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ - struct work_struct map_extend_work;/* async ->map[] extension */ + struct list_head map_extend_list;/* on pcpu_map_extend_chunks */ void *data; /* chunk data */ int first_free; /* no free below this */ @@ -164,6 +164,9 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ +/* chunks which need their map areas extended, protected by pcpu_lock */ +static LIST_HEAD(pcpu_map_extend_chunks); + /* * The number of empty populated pages, protected by pcpu_lock. The * reserved chunk doesn't contribute to the count. 
@@ -397,13 +400,19 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) { int margin, new_alloc; + lockdep_assert_held(&pcpu_lock); + if (is_atomic) { margin = 3; if (chunk->map_alloc < - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW && - pcpu_async_enabled) - schedule_work(&chunk->map_extend_work); + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) { + if (list_empty(&chunk->map_extend_list)) { + list_add_tail(&chunk->map_extend_list, + &pcpu_map_extend_chunks); + pcpu_schedule_balance_work(); + } + } } else { margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; } @@ -469,20 +478,6 @@ out_unlock: return 0; } -static void pcpu_map_extend_workfn(struct work_struct *work) -{ - struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk, - map_extend_work); - int new_alloc; - - spin_lock_irq(&pcpu_lock); - new_alloc = pcpu_need_to_extend(chunk, false); - spin_unlock_irq(&pcpu_lock); - - if (new_alloc) - pcpu_extend_area_map(chunk, new_alloc); -} - /** * pcpu_fit_in_area - try to fit the requested allocation in a candidate area * @chunk: chunk the candidate area belongs to @@ -742,7 +737,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map_used = 1; INIT_LIST_HEAD(&chunk->list); - INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn); + INIT_LIST_HEAD(&chunk->map_extend_list); chunk->free_size = pcpu_unit_size; chunk->contig_hint = pcpu_unit_size; @@ -1131,6 +1126,7 @@ static void pcpu_balance_workfn(struct work_struct *work) if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue; + list_del_init(&chunk->map_extend_list); list_move(&chunk->list, &to_free); } @@ -1148,6 +1144,25 @@ static void pcpu_balance_workfn(struct work_struct *work) pcpu_destroy_chunk(chunk); } + /* service chunks which requested async area map extension */ + do { + int new_alloc = 0; + + spin_lock_irq(&pcpu_lock); + + chunk = list_first_entry_or_null(&pcpu_map_extend_chunks, + struct pcpu_chunk, map_extend_list); + if (chunk) { + list_del_init(&chunk->map_extend_list); + new_alloc = pcpu_need_to_extend(chunk, false); + } + + spin_unlock_irq(&pcpu_lock); + + if (new_alloc) + pcpu_extend_area_map(chunk, new_alloc); + } while (chunk); + /* * Ensure there are certain number of free populated pages for * atomic allocs. 
Fill up from the most packed so that atomic @@ -1646,7 +1661,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, */ schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&schunk->list); - INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn); + INIT_LIST_HEAD(&schunk->map_extend_list); schunk->base_addr = base_addr; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); @@ -1675,7 +1690,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (dyn_size) { dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&dchunk->list); - INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn); + INIT_LIST_HEAD(&dchunk->map_extend_list); dchunk->base_addr = base_addr; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); From 3b41e21ffaa38ff8ed8fa8b71f4410f45f42011f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 May 2016 11:48:25 -0400 Subject: [PATCH 554/607] UPSTREAM: percpu: fix synchronization between synchronous map extension and chunk destruction (cherry picked from commit 6710e594f71ccaad8101bc64321152af7cd9ea28) For non-atomic allocations, pcpu_alloc() can try to extend the area map synchronously after dropping pcpu_lock; however, the extension wasn't synchronized against chunk destruction and the chunk might get freed while extension is in progress. This patch fixes the bug by putting most of non-atomic allocations under pcpu_alloc_mutex to synchronize against pcpu_balance_work which is responsible for async chunk management including destruction. Signed-off-by: Tejun Heo Reported-and-tested-by: Alexei Starovoitov Reported-by: Vlastimil Babka Reported-by: Sasha Levin Cc: stable@vger.kernel.org # v3.18+ Fixes: 1a4d76076cda ("percpu: implement asynchronous chunk population") Change-Id: I8800962e658e78eac866fff4a4e00294c58a3dec Bug: 31596597 --- mm/percpu.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 58b014900f0f..1f376bce413c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -160,7 +160,7 @@ static struct pcpu_chunk *pcpu_reserved_chunk; static int pcpu_reserved_chunk_limit; static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ -static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ +static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ @@ -446,6 +446,8 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) size_t old_size = 0, new_size = new_alloc * sizeof(new[0]); unsigned long flags; + lockdep_assert_held(&pcpu_alloc_mutex); + new = pcpu_mem_zalloc(new_size); if (!new) return -ENOMEM; @@ -892,6 +894,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, return NULL; } + if (!is_atomic) + mutex_lock(&pcpu_alloc_mutex); + spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ @@ -964,12 +969,9 @@ restart: if (is_atomic) goto fail; - mutex_lock(&pcpu_alloc_mutex); - if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { chunk = pcpu_create_chunk(); if (!chunk) { - mutex_unlock(&pcpu_alloc_mutex); err = "failed to allocate new chunk"; goto fail; } @@ -980,7 +982,6 @@ restart: spin_lock_irqsave(&pcpu_lock, flags); } - mutex_unlock(&pcpu_alloc_mutex); goto restart; area_found: @@ -990,8 +991,6 @@ area_found: if (!is_atomic) { int page_start, page_end, rs, re; - mutex_lock(&pcpu_alloc_mutex); - page_start = 
PFN_DOWN(off); page_end = PFN_UP(off + size); @@ -1002,7 +1001,6 @@ area_found: spin_lock_irqsave(&pcpu_lock, flags); if (ret) { - mutex_unlock(&pcpu_alloc_mutex); pcpu_free_area(chunk, off, &occ_pages); err = "failed to populate"; goto fail_unlock; @@ -1042,6 +1040,8 @@ fail: /* see the flag handling in pcpu_blance_workfn() */ pcpu_atomic_alloc_failed = true; pcpu_schedule_balance_work(); + } else { + mutex_unlock(&pcpu_alloc_mutex); } return NULL; } From d9e3c1d5f7bd8df275f8bce5ed78393c7a5fb22a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 8 Mar 2016 21:09:29 +0700 Subject: [PATCH 555/607] UPSTREAM: arm64: account for sparsemem section alignment when choosing vmemmap offset Commit dfd55ad85e4a ("arm64: vmemmap: use virtual projection of linear region") fixed an issue where the struct page array would overflow into the adjacent virtual memory region if system RAM was placed so high up in physical memory that its addresses were not representable in the build time configured virtual address size. However, the fix failed to take into account that the vmemmap region needs to be relatively aligned with respect to the sparsemem section size, so that a sequence of page structs corresponding with a sparsemem section in the linear region appears naturally aligned in the vmemmap region. So round up vmemmap to sparsemem section size. Since this essentially moves the projection of the linear region up in memory, also revert the reduction of the size of the vmemmap region. Cc: Fixes: dfd55ad85e4a ("arm64: vmemmap: use virtual projection of linear region") Tested-by: Mark Langsdorf Tested-by: David Daney Tested-by: Robert Richter Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit 36e5cd6b897e17d03008f81e075625d8e43e52d0) Signed-off-by: Jeff Vander Stoep Change-Id: I77bad8c6a7c1a7c3dda92a37ceef5ddfb196ec70 --- arch/arm64/include/asm/pgtable.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 0c82a1d9f31e..40017aa2fcbd 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -40,13 +40,14 @@ * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space, * fixed mappings and modules */ -#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT - 1)) * sizeof(struct page), PUD_SIZE) +#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE) #define VMALLOC_START (MODULES_END) #define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K) #define VMEMMAP_START (VMALLOC_END + SZ_64K) -#define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) +#define vmemmap ((struct page *)VMEMMAP_START - \ + SECTION_ALIGN_DOWN(memstart_addr >> PAGE_SHIFT)) #define FIRST_USER_ADDRESS 0UL From 663cc539523fbf35732ba0af6a15bcb6d6370c91 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 18 Apr 2016 17:09:47 +0200 Subject: [PATCH 556/607] UPSTREAM: arm64: relocatable: deal with physically misaligned kernel images When booting a relocatable kernel image, there is no practical reason to refuse an image whose load address is not exactly TEXT_OFFSET bytes above a 2 MB aligned base address, as long as the physical and virtual misalignment with respect to the swapper block size are equal, and are both aligned to THREAD_SIZE. 
Since the virtual misalignment is under our control when we first enter the kernel proper, we can simply choose its value to be equal to the physical misalignment. So treat the misalignment of the physical load address as the initial KASLR offset, and fix up the remaining code to deal with that. Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 Bug: 32122850 (cherry picked from commit 08cdac619c81b3fa8cd73aeed2330ffe0a0b73ca) Signed-off-by: Jeff Vander Stoep Change-Id: I658cb3467ba9a4f5b1f5a1cbb972fdc5a3562bf0 --- arch/arm64/kernel/head.S | 9 ++++++--- arch/arm64/kernel/kaslr.c | 6 +++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 9bfa58fea8ce..461d6cc258dd 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -210,8 +211,8 @@ section_table: ENTRY(stext) bl preserve_boot_args bl el2_setup // Drop to EL1, w20=cpu_boot_mode - mov x23, xzr // KASLR offset, defaults to 0 adrp x24, __PHYS_OFFSET + and x23, x24, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0 bl set_cpu_boot_mode_flag bl __create_page_tables // x25=TTBR0, x26=TTBR1 /* @@ -488,11 +489,13 @@ __mmap_switched: bl kasan_early_init #endif #ifdef CONFIG_RANDOMIZE_BASE - cbnz x23, 0f // already running randomized? + tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized? + b.ne 0f mov x0, x21 // pass FDT address in x0 + mov x1, x23 // pass modulo offset in x1 bl kaslr_early_init // parse FDT for KASLR options cbz x0, 0f // KASLR disabled? just proceed - mov x23, x0 // record KASLR offset + orr x23, x23, x0 // record KASLR offset ret x28 // we must enable KASLR, return // to __enable_mmu() 0: diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 582983920054..b05469173ba5 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -74,7 +74,7 @@ extern void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, * containing function pointers) to be reinitialized, and zero-initialized * .bss variables will be reset to 0. */ -u64 __init kaslr_early_init(u64 dt_phys) +u64 __init kaslr_early_init(u64 dt_phys, u64 modulo_offset) { void *fdt; u64 seed, offset, mask, module_range; @@ -132,8 +132,8 @@ u64 __init kaslr_early_init(u64 dt_phys) * boundary (for 4KB/16KB/64KB granule kernels, respectively). If this * happens, increase the KASLR offset by the size of the kernel image. */ - if ((((u64)_text + offset) >> SWAPPER_TABLE_SHIFT) != - (((u64)_end + offset) >> SWAPPER_TABLE_SHIFT)) + if ((((u64)_text + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT) != + (((u64)_end + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT)) offset = (offset + (u64)(_end - _text)) & mask; if (IS_ENABLED(CONFIG_KASAN)) From e78f134a78a0ae95b83ac0cac47ab0bb584ebaa7 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 6 Oct 2016 15:53:38 -0700 Subject: [PATCH 557/607] CHROMIUM: remove Android's cgroup generic permissions checks The implementation is utterly broken, resulting in all processes being allows to move tasks between sets (as long as they have access to the "tasks" attribute), and upstream is heading towards checking only capability anyway, so let's get rid of this code. 
BUG=b:31790445,chromium:647994 TEST=Boot android container, examine logcat Change-Id: I2f780a5992c34e52a8f2d0b3557fc9d490da2779 Signed-off-by: Dmitry Torokhov Reviewed-on: https://chromium-review.googlesource.com/394967 Reviewed-by: Ricky Zhou Reviewed-by: John Stultz --- Documentation/cgroups/cgroups.txt | 9 ----- include/linux/cgroup-defs.h | 1 - include/linux/cgroup.h | 14 -------- kernel/cgroup.c | 59 ++----------------------------- kernel/sched/core.c | 1 - mm/memcontrol.c | 10 ------ 6 files changed, 2 insertions(+), 92 deletions(-) diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 2d984e20783b..c6256ae9885b 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -578,15 +578,6 @@ is completely unused; @cgrp->parent is still valid. (Note - can also be called for a newly-created cgroup if an error occurs after this subsystem's create() method has been called for the new cgroup). -int allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) -(cgroup_mutex held by caller) - -Called prior to moving a task into a cgroup; if the subsystem -returns an error, this will abort the attach operation. Used -to extend the permission checks - if all subsystems in a cgroup -return 0, the attach will be allowed to proceed, even if the -default permission check (root or same user) fails. - int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) (cgroup_mutex held by caller) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4a4eea01956c..06b77f9dd3f2 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -422,7 +422,6 @@ struct cgroup_subsys { void (*css_reset)(struct cgroup_subsys_state *css); void (*css_e_css_changed)(struct cgroup_subsys_state *css); - int (*allow_attach)(struct cgroup_taskset *tset); int (*can_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 70358b9f5a7a..cb91b44f5f78 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -528,16 +528,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } -/* - * Default Android check for whether the current process is allowed to move a - * task across cgroups, either because CAP_SYS_NICE is set or because the uid - * of the calling process is the same as the moved task or because we are - * running as root. - * Returns 0 if this is allowed, or -EACCES otherwise. 
- */ -int subsys_cgroup_allow_attach(struct cgroup_taskset *tset); - - #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; @@ -562,10 +552,6 @@ static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } -static inline int subsys_cgroup_allow_attach(void *tset) -{ - return -EINVAL; -} #endif /* !CONFIG_CGROUPS */ #endif /* _LINUX_CGROUP_H */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bd541053efc2..371ee5a827e0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2663,45 +2663,6 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, return ret; } -int subsys_cgroup_allow_attach(struct cgroup_taskset *tset) -{ - const struct cred *cred = current_cred(), *tcred; - struct task_struct *task; - struct cgroup_subsys_state *css; - - if (capable(CAP_SYS_NICE)) - return 0; - - cgroup_taskset_for_each(task, css, tset) { - tcred = __task_cred(task); - - if (current != task && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) - return -EACCES; - } - - return 0; -} - -static int cgroup_allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) -{ - struct cgroup_subsys_state *css; - int i; - int ret; - - for_each_css(css, i, cgrp) { - if (css->ss->allow_attach) { - ret = css->ss->allow_attach(tset); - if (ret) - return ret; - } else { - return -EACCES; - } - } - - return 0; -} - static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *dst_cgrp, struct kernfs_open_file *of) @@ -2716,24 +2677,8 @@ static int cgroup_procs_write_permission(struct task_struct *task, */ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) { - /* - * if the default permission check fails, give each - * cgroup a chance to extend the permission check - */ - struct cgroup_taskset tset = { - .src_csets = LIST_HEAD_INIT(tset.src_csets), - .dst_csets = LIST_HEAD_INIT(tset.dst_csets), - .csets = &tset.src_csets, - }; - struct css_set *cset; - cset = task_css_set(task); - list_add(&cset->mg_node, &tset.src_csets); - ret = cgroup_allow_attach(dst_cgrp, &tset); - list_del(&tset.src_csets); - if (ret) - ret = -EACCES; - } + !uid_eq(cred->euid, tcred->suid)) + ret = -EACCES; if (!ret && cgroup_on_dfl(dst_cgrp)) { struct super_block *sb = of->file->f_path.dentry->d_sb; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c90bfb620981..471094bfbcc6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8935,7 +8935,6 @@ struct cgroup_subsys cpu_cgrp_subsys = { .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, - .allow_attach = subsys_cgroup_allow_attach, .legacy_cftypes = cpu_files, .early_init = 1, }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5c714f33f494..ee6acd279953 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4879,11 +4879,6 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) return ret; } -static int mem_cgroup_allow_attach(struct cgroup_taskset *tset) -{ - return subsys_cgroup_allow_attach(tset); -} - static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { if (mc.to) @@ -5045,10 +5040,6 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -static int mem_cgroup_allow_attach(struct cgroup_taskset *tset) -{ - return 0; -} static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { } @@ -5232,7 +5223,6 @@ struct cgroup_subsys memory_cgrp_subsys = { .can_attach = 
mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, - .allow_attach = mem_cgroup_allow_attach, .bind = mem_cgroup_bind, .dfl_cftypes = memory_files, .legacy_cftypes = mem_cgroup_legacy_files, From 140cab831036741cac88255f282cd68b72140356 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 6 Oct 2016 16:14:16 -0700 Subject: [PATCH 558/607] CHROMIUM: cgroups: relax permissions on moving tasks between cgroups Android expects system_server to be able to move tasks between different cgroups/cpusets, but does not want to be running as root. Let's relax permission check so that processes can move other tasks if they have CAP_SYS_NICE in the affected task's user namespace. BUG=b:31790445,chromium:647994 TEST=Boot android container, examine logcat Change-Id: Ia919c66ab6ed6a6daf7c4cf67feb38b13b1ad09b Signed-off-by: Dmitry Torokhov Reviewed-on: https://chromium-review.googlesource.com/394927 Reviewed-by: Ricky Zhou --- kernel/cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 371ee5a827e0..45c5e134d05b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2677,7 +2677,8 @@ static int cgroup_procs_write_permission(struct task_struct *task, */ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) + !uid_eq(cred->euid, tcred->suid) && + !ns_capable(tcred->user_ns, CAP_SYS_NICE)) ret = -EACCES; if (!ret && cgroup_on_dfl(dst_cgrp)) { From 7af6188c36382c1a11551cf8bea62bcb3dcaa585 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 11 Oct 2016 13:51:27 -0700 Subject: [PATCH 559/607] BACKPORT: lib: harden strncpy_from_user The strncpy_from_user() accessor is effectively a copy_from_user() specialised to copy strings, terminating early at a NUL byte if possible. In other respects it is identical, and can be used to copy an arbitrarily large buffer from userspace into the kernel. Conceptually, it exposes a similar attack surface. As with copy_from_user(), we check the destination range when the kernel is built with KASAN, but unlike copy_from_user() we do not check the destination buffer when using HARDENED_USERCOPY. As strncpy_from_user() calls get_user() in a loop, we must call check_object_size() explicitly. This patch adds this instrumentation to strncpy_from_user(), per the same rationale as with the regular copy_from_user(). In the absence of hardened usercopy this will have no impact as the instrumentation expands to an empty static inline function. 
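Concretely, the destination object is now validated against the full 'count' bound before the copy loop runs, just as copy_from_user() already does. A sketch of the kind of caller this protects (hypothetical driver code; 'ubuf' and 'len' are made-up names, and CONFIG_HARDENED_USERCOPY=y is assumed):

    char *name = kmalloc(16, GFP_KERNEL);
    if (!name)
            return -ENOMEM;
    /*
     * If 'len' can exceed the 16-byte slab object, the new
     * check_object_size(dst, count, false) call inside
     * strncpy_from_user() trips the hardened-usercopy check instead of
     * letting the copy loop overflow the allocation.
     */
    ret = strncpy_from_user(name, ubuf, len);
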
Link: http://lkml.kernel.org/r/1472221903-31181-1-git-send-email-mark.rutland@arm.com Signed-off-by: Mark Rutland Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Bug: 31374226 Change-Id: I898e4e9f19307e37a9be497cb1a0d7f1e3911661 (cherry picked from commit bf90e56e467ed5766722972d483e6711889ed1b0) Signed-off-by: Sami Tolvanen --- lib/strncpy_from_user.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 5a003a2ebd96..05efc1fa97f0 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -109,6 +110,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count) unsigned long max = max_addr - src_addr; long retval; + check_object_size(dst, count, false); user_access_begin(); retval = do_strncpy_from_user(dst, src, count, max); user_access_end(); From 7bb5218b77d0aef456634625fff7909f88b4f705 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 18 Oct 2016 12:35:03 -0700 Subject: [PATCH 560/607] cgroup: Remove leftover instances of allow_attach MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix: kernel/sched/tune.c:718:2: error: unknown field ‘allow_attach’ specified in initializer kernel/cpuset.c:2087:2: error: unknown field 'allow_attach' specified in initializer Change-Id: Ie524350ffc6158f3182d90095cca502e58b6f197 Fixes: e78f134a78a0 ("CHROMIUM: remove Android's cgroup generic permissions checks") Signed-off-by: Guenter Roeck --- kernel/cpuset.c | 18 ------------------ kernel/sched/tune.c | 7 ------- 2 files changed, 25 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3b4f27981778..3d0f77112eb3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2061,30 +2061,12 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&cpuset_mutex); } -static int cpuset_allow_attach(struct cgroup_taskset *tset) -{ - const struct cred *cred = current_cred(), *tcred; - struct task_struct *task; - struct cgroup_subsys_state *css; - - cgroup_taskset_for_each(task, css, tset) { - tcred = __task_cred(task); - - if ((current != task) && !capable(CAP_SYS_ADMIN) && - cred->euid.val != tcred->uid.val && cred->euid.val != tcred->suid.val) - return -EACCES; - } - - return 0; -} - struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, - .allow_attach = cpuset_allow_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, .bind = cpuset_bind, diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 505d7b35b0e1..68a24a044b0a 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -368,12 +368,6 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu) raw_spin_unlock_irqrestore(&bg->lock, irq_flags); } -int schedtune_allow_attach(struct cgroup_taskset *tset) -{ - /* We always allows tasks to be moved between existing CGroups */ - return 0; -} - int schedtune_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; @@ -715,7 +709,6 @@ schedtune_css_free(struct cgroup_subsys_state *css) struct cgroup_subsys schedtune_cgrp_subsys = { .css_alloc = schedtune_css_alloc, .css_free = schedtune_css_free, - .allow_attach = schedtune_allow_attach, .can_attach = schedtune_can_attach, .cancel_attach = schedtune_cancel_attach, .legacy_cftypes = files, From 
ffa907b7664062250d813d97fb9a9bdb5511897a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 17 Oct 2016 16:18:39 +0100 Subject: [PATCH 561/607] UPSTREAM: arm64: kaslr: keep modules close to the kernel when DYNAMIC_FTRACE=y The RANDOMIZE_MODULE_REGION_FULL Kconfig option allows KASLR to be configured in such a way that kernel modules and the core kernel are allocated completely independently, which implies that modules are likely to require branches via PLT entries to reach the core kernel. The dynamic ftrace code does not expect that, and assumes that it can patch module code to perform a relative branch to anywhere in the core kernel. This may result in errors such as branch_imm_common: offset out of range ------------[ cut here ]------------ WARNING: CPU: 3 PID: 196 at kernel/trace/ftrace.c:1995 ftrace_bug+0x220/0x2e8 Modules linked in: CPU: 3 PID: 196 Comm: systemd-udevd Not tainted 4.8.0-22-generic #24 Hardware name: AMD Seattle/Seattle, BIOS 10:34:40 Oct 6 2016 task: ffff8d1bef7dde80 task.stack: ffff8d1bef6b0000 PC is at ftrace_bug+0x220/0x2e8 LR is at ftrace_process_locs+0x330/0x430 So make RANDOMIZE_MODULE_REGION_FULL mutually exclusive with DYNAMIC_FTRACE at the Kconfig level. Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit 8fe88a4145cdeee486af60e61f5d5a14f804fa45) Signed-off-by: Jeff Vander Stoep Change-Id: Ifb2474dcbb7a3066fe5724ee53a2048d61e80ccc --- arch/arm64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 488fed2d5e5e..a4d3c3cd7b23 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -822,7 +822,7 @@ config RANDOMIZE_BASE config RANDOMIZE_MODULE_REGION_FULL bool "Randomize the module region independently from the core kernel" - depends on RANDOMIZE_BASE + depends on RANDOMIZE_BASE && !DYNAMIC_FTRACE default y help Randomizes the location of the module region without considering the From 4a8b645cef3caa87b85269078d9e1f02115107b6 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 13 Oct 2016 17:42:09 +0100 Subject: [PATCH 562/607] UPSTREAM: arm64: kaslr: fix breakage with CONFIG_MODVERSIONS=y As it turns out, the KASLR code breaks CONFIG_MODVERSIONS, since the kcrctab has an absolute address field that is relocated at runtime when the kernel offset is randomized. This has been fixed already for PowerPC in the past, so simply wire up the existing code dealing with this issue. 
Cc: Fixes: f80fb3a3d508 ("arm64: add support for kernel ASLR") Tested-by: Timur Tabi Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit 8fe88a4145cdeee486af60e61f5d5a14f804fa45) Signed-off-by: Jeff Vander Stoep Change-Id: Ia40bb68eb5ba7df14214243657948d469f1d5717 --- arch/arm64/include/asm/module.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h index e12af6754634..06ff7fd9e81f 100644 --- a/arch/arm64/include/asm/module.h +++ b/arch/arm64/include/asm/module.h @@ -17,6 +17,7 @@ #define __ASM_MODULE_H #include +#include #define MODULE_ARCH_VERMAGIC "aarch64" @@ -32,6 +33,10 @@ u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela, Elf64_Sym *sym); #ifdef CONFIG_RANDOMIZE_BASE +#ifdef CONFIG_MODVERSIONS +#define ARCH_RELOCATES_KCRCTAB +#define reloc_start (kimage_vaddr - KIMAGE_VADDR) +#endif extern u64 module_alloc_base; #else #define module_alloc_base ((u64)_etext - MODULES_VSIZE) From a3c8dc25c446485dcbf8a978f755eead9eac3671 Mon Sep 17 00:00:00 2001 From: Lianwei Wang Date: Thu, 9 Jun 2016 23:43:28 -0700 Subject: [PATCH 563/607] UPSTREAM: cpu/hotplug: Handle unbalanced hotplug enable/disable (cherry picked from commit 01b41159066531cc8d664362ff0cd89dd137bbfa) When cpu_hotplug_enable() is called unbalanced w/o a preceeding cpu_hotplug_disable() the code emits a warning, but happily decrements the disabled counter. This causes the next operations to malfunction. Prevent the decrement and just emit a warning. Signed-off-by: Lianwei Wang Cc: peterz@infradead.org Cc: linux-pm@vger.kernel.org Cc: oleg@redhat.com Link: http://lkml.kernel.org/r/1465541008-12476-1-git-send-email-lianwei.wang@gmail.com Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 9ced7c751648..c8a1751be224 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -185,10 +185,17 @@ void cpu_hotplug_disable(void) } EXPORT_SYMBOL_GPL(cpu_hotplug_disable); +static void __cpu_hotplug_enable(void) +{ + if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n")) + return; + cpu_hotplug_disabled--; +} + void cpu_hotplug_enable(void) { cpu_maps_update_begin(); - WARN_ON(--cpu_hotplug_disabled < 0); + __cpu_hotplug_enable(); cpu_maps_update_done(); } EXPORT_SYMBOL_GPL(cpu_hotplug_enable); @@ -631,7 +638,7 @@ void enable_nonboot_cpus(void) /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); - WARN_ON(--cpu_hotplug_disabled < 0); + __cpu_hotplug_enable(); if (cpumask_empty(frozen_cpus)) goto out; From 64f0245f312a849acc83b7c6cdcc8f7c1a14cca3 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 18 Oct 2016 16:20:23 -0700 Subject: [PATCH 564/607] [RFC]cgroup: Change from CAP_SYS_NICE to CAP_SYS_RESOURCE for cgroup migration permissions Try to better match what we're pushing upstream, use CAP_SYS_RESOURCE instead of CAP_SYS_NICE, which shoudln't affect Android as Zygote and system_server already use CAP_SYS_RESOURCE. 
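Taken together with the removal of allow_attach and the namespace-aware check introduced earlier in this series, the permission test in cgroup_procs_write_permission() ends up reading: a writer may migrate a task if it is global root, if its euid matches the task's uid or suid, or if it holds CAP_SYS_RESOURCE in the task's user namespace:

    if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
        !uid_eq(cred->euid, tcred->uid) &&
        !uid_eq(cred->euid, tcred->suid) &&
        !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
            ret = -EACCES;
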
Signed-off-by: John Stultz --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 45c5e134d05b..00af24ac0167 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2678,7 +2678,7 @@ static int cgroup_procs_write_permission(struct task_struct *task, if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && !uid_eq(cred->euid, tcred->suid) && - !ns_capable(tcred->user_ns, CAP_SYS_NICE)) + !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) ret = -EACCES; if (!ret && cgroup_on_dfl(dst_cgrp)) { From b6be9b7b199b05a13d72dc562110eebc725ed0ec Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Thu, 20 Oct 2016 15:45:01 -0400 Subject: [PATCH 565/607] disable aio support in recommended configuration The aio interface adds substantial attack surface for a feature that's not being exposed by Android at all. It's unlikely that anyone is using the kernel feature directly either. This feature is rarely used even on servers. The glibc POSIX aio calls really use thread pools. The lack of widespread usage also means this is relatively poorly audited/tested. The kernel's aio rarely provides performance benefits over using a thread pool and is quite incomplete in terms of system call coverage along with having edge cases where blocking can occur. Part of the performance issue is the fact that it only supports direct io, not buffered io. The existing API is considered fundamentally flawed and it's unlikely it will be expanded, but rather replaced: https://marc.info/?l=linux-aio&m=145255815216051&w=2 Since ext4 encryption means no direct io support, kernel aio isn't even going to work properly on Android devices using file-based encryption. Change-Id: Iccc7cab4437791240817e6275a23e1d3f4a47f2d Signed-off-by: Daniel Micay --- android/configs/android-recommended.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index 3465a848d74d..3fd0b13488a1 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -1,4 +1,5 @@ # KEEP ALPHABETICALLY SORTED +# CONFIG_AIO is not set # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_LEGACY_PTYS is not set From ce0c65986ceeaa05249c990e1744d4caeb734e60 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Wed, 13 Jul 2016 12:06:49 +0200 Subject: [PATCH 566/607] android: binder: split flat_binder_object. flat_binder_object is used for both handling binder objects and file descriptors, even though the two are mostly independent. Since we'll have more fixup objects in binder in the future, instead of extending flat_binder_object again, split out file descriptors to their own object while retaining backwards compatibility to existing user-space clients. All binder objects just share a header. 
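Backwards compatibility falls out of the layout: the shared header is a single __u32 type placed first, so a binder_fd_object lines up field-for-field with the flat_binder_object that existing clients still send for file descriptors. From the uapi changes further down, annotated:

    struct binder_object_header {
            __u32 type;                     /* the old flat_binder_object.type  */
    };

    struct binder_fd_object {
            struct binder_object_header hdr;
            __u32 pad_flags;                /* overlays .flags (unused for fds) */
            union {
                    binder_uintptr_t pad_binder;    /* overlays .binder         */
                    __u32 fd;                       /* overlays .handle         */
            };
            binder_uintptr_t cookie;        /* same offset as before            */
    };
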
Change-Id: If3c55f27a2aa8f21815383e0e807be47895e4786 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 162 +++++++++++++++++++--------- include/uapi/linux/android/binder.h | 31 +++++- 2 files changed, 141 insertions(+), 52 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index b8ba5b7516b9..50cc56f3d0cc 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -146,6 +146,11 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, binder_stop_on_user_error = 2; \ } while (0) +#define to_flat_binder_object(hdr) \ + container_of(hdr, struct flat_binder_object, hdr) + +#define to_binder_fd_object(hdr) container_of(hdr, struct binder_fd_object, hdr) + enum binder_stat_types { BINDER_STAT_PROC, BINDER_STAT_THREAD, @@ -1241,6 +1246,47 @@ static void binder_send_failed_reply(struct binder_transaction *t, } } +/** + * binder_validate_object() - checks for a valid metadata object in a buffer. + * @buffer: binder_buffer that we're parsing. + * @offset: offset in the buffer at which to validate an object. + * + * Return: If there's a valid metadata object at @offset in @buffer, the + * size of that object. Otherwise, it returns zero. + */ +static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset) +{ + /* Check if we can read a header first */ + struct binder_object_header *hdr; + size_t object_size = 0; + + if (offset > buffer->data_size - sizeof(*hdr) || + buffer->data_size < sizeof(*hdr) || + !IS_ALIGNED(offset, sizeof(u32))) + return 0; + + /* Ok, now see if we can read a complete object. */ + hdr = (struct binder_object_header *)(buffer->data + offset); + switch (hdr->type) { + case BINDER_TYPE_BINDER: + case BINDER_TYPE_WEAK_BINDER: + case BINDER_TYPE_HANDLE: + case BINDER_TYPE_WEAK_HANDLE: + object_size = sizeof(struct flat_binder_object); + break; + case BINDER_TYPE_FD: + object_size = sizeof(struct binder_fd_object); + break; + default: + return 0; + } + if (offset <= buffer->data_size - object_size && + buffer->data_size >= object_size) + return object_size; + else + return 0; +} + static void binder_transaction_buffer_release(struct binder_proc *proc, struct binder_buffer *buffer, binder_size_t *failed_at) @@ -1263,21 +1309,23 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, else off_end = (void *)offp + buffer->offsets_size; for (; offp < off_end; offp++) { - struct flat_binder_object *fp; + struct binder_object_header *hdr; + size_t object_size = binder_validate_object(buffer, *offp); - if (*offp > buffer->data_size - sizeof(*fp) || - buffer->data_size < sizeof(*fp) || - !IS_ALIGNED(*offp, sizeof(u32))) { - pr_err("transaction release %d bad offset %lld, size %zd\n", + if (object_size == 0) { + pr_err("transaction release %d bad object at offset %lld, size %zd\n", debug_id, (u64)*offp, buffer->data_size); continue; } - fp = (struct flat_binder_object *)(buffer->data + *offp); - switch (fp->type) { + hdr = (struct binder_object_header *)(buffer->data + *offp); + switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { - struct binder_node *node = binder_get_node(proc, fp->binder); + struct flat_binder_object *fp; + struct binder_node *node; + fp = to_flat_binder_object(hdr); + node = binder_get_node(proc, fp->binder); if (node == NULL) { pr_err("transaction release %d bad node %016llx\n", debug_id, (u64)fp->binder); @@ -1286,13 +1334,17 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, binder_debug(BINDER_DEBUG_TRANSACTION, " node 
%d u%016llx\n", node->debug_id, (u64)node->ptr); - binder_dec_node(node, fp->type == BINDER_TYPE_BINDER, 0); + binder_dec_node(node, hdr->type == BINDER_TYPE_BINDER, + 0); } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { - struct binder_ref *ref = binder_get_ref(proc, fp->handle, - fp->type == BINDER_TYPE_HANDLE); + struct flat_binder_object *fp; + struct binder_ref *ref; + fp = to_flat_binder_object(hdr); + ref = binder_get_ref(proc, fp->handle, + hdr->type == BINDER_TYPE_HANDLE); if (ref == NULL) { pr_err("transaction release %d bad handle %d\n", debug_id, fp->handle); @@ -1301,19 +1353,21 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d (node %d)\n", ref->debug_id, ref->desc, ref->node->debug_id); - binder_dec_ref(ref, fp->type == BINDER_TYPE_HANDLE); + binder_dec_ref(ref, hdr->type == BINDER_TYPE_HANDLE); } break; - case BINDER_TYPE_FD: + case BINDER_TYPE_FD: { + struct binder_fd_object *fp = to_binder_fd_object(hdr); + binder_debug(BINDER_DEBUG_TRANSACTION, - " fd %d\n", fp->handle); + " fd %d\n", fp->fd); if (failed_at) - task_close_fd(proc, fp->handle); - break; + task_close_fd(proc, fp->fd); + } break; default: pr_err("transaction release %d bad object type %x\n", - debug_id, fp->type); + debug_id, hdr->type); break; } } @@ -1530,28 +1584,29 @@ static void binder_transaction(struct binder_proc *proc, off_end = (void *)offp + tr->offsets_size; off_min = 0; for (; offp < off_end; offp++) { - struct flat_binder_object *fp; + struct binder_object_header *hdr; + size_t object_size = binder_validate_object(t->buffer, *offp); - if (*offp > t->buffer->data_size - sizeof(*fp) || - *offp < off_min || - t->buffer->data_size < sizeof(*fp) || - !IS_ALIGNED(*offp, sizeof(u32))) { - binder_user_error("%d:%d got transaction with invalid offset, %lld (min %lld, max %lld)\n", + if (object_size == 0 || *offp < off_min) { + binder_user_error("%d:%d got transaction with invalid offset (%lld, min %lld max %lld) or object.\n", proc->pid, thread->pid, (u64)*offp, (u64)off_min, - (u64)(t->buffer->data_size - - sizeof(*fp))); + (u64)t->buffer->data_size); return_error = BR_FAILED_REPLY; goto err_bad_offset; } - fp = (struct flat_binder_object *)(t->buffer->data + *offp); - off_min = *offp + sizeof(struct flat_binder_object); - switch (fp->type) { + + hdr = (struct binder_object_header *)(t->buffer->data + *offp); + off_min = *offp + object_size; + switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { + struct flat_binder_object *fp; + struct binder_node *node; struct binder_ref *ref; - struct binder_node *node = binder_get_node(proc, fp->binder); + fp = to_flat_binder_object(hdr); + node = binder_get_node(proc, fp->binder); if (node == NULL) { node = binder_new_node(proc, fp->binder, fp->cookie); if (node == NULL) { @@ -1579,14 +1634,14 @@ static void binder_transaction(struct binder_proc *proc, return_error = BR_FAILED_REPLY; goto err_binder_get_ref_for_node_failed; } - if (fp->type == BINDER_TYPE_BINDER) - fp->type = BINDER_TYPE_HANDLE; + if (hdr->type == BINDER_TYPE_BINDER) + hdr->type = BINDER_TYPE_HANDLE; else - fp->type = BINDER_TYPE_WEAK_HANDLE; + hdr->type = BINDER_TYPE_WEAK_HANDLE; fp->binder = 0; fp->handle = ref->desc; fp->cookie = 0; - binder_inc_ref(ref, fp->type == BINDER_TYPE_HANDLE, + binder_inc_ref(ref, hdr->type == BINDER_TYPE_HANDLE, &thread->todo); trace_binder_transaction_node_to_ref(t, node, ref); @@ -1597,9 +1652,12 @@ static void binder_transaction(struct 
binder_proc *proc, } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { - struct binder_ref *ref = binder_get_ref(proc, fp->handle, - fp->type == BINDER_TYPE_HANDLE); + struct flat_binder_object *fp; + struct binder_ref *ref; + fp = to_flat_binder_object(hdr); + ref = binder_get_ref(proc, fp->handle, + hdr->type == BINDER_TYPE_HANDLE); if (ref == NULL) { binder_user_error("%d:%d got transaction with invalid handle, %d\n", proc->pid, @@ -1613,13 +1671,15 @@ static void binder_transaction(struct binder_proc *proc, goto err_binder_get_ref_failed; } if (ref->node->proc == target_proc) { - if (fp->type == BINDER_TYPE_HANDLE) - fp->type = BINDER_TYPE_BINDER; + if (hdr->type == BINDER_TYPE_HANDLE) + hdr->type = BINDER_TYPE_BINDER; else - fp->type = BINDER_TYPE_WEAK_BINDER; + hdr->type = BINDER_TYPE_WEAK_BINDER; fp->binder = ref->node->ptr; fp->cookie = ref->node->cookie; - binder_inc_node(ref->node, fp->type == BINDER_TYPE_BINDER, 0, NULL); + binder_inc_node(ref->node, + hdr->type == BINDER_TYPE_BINDER, + 0, NULL); trace_binder_transaction_ref_to_node(t, ref); binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d -> node %d u%016llx\n", @@ -1636,7 +1696,9 @@ static void binder_transaction(struct binder_proc *proc, fp->binder = 0; fp->handle = new_ref->desc; fp->cookie = 0; - binder_inc_ref(new_ref, fp->type == BINDER_TYPE_HANDLE, NULL); + binder_inc_ref(new_ref, + hdr->type == BINDER_TYPE_HANDLE, + NULL); trace_binder_transaction_ref_to_ref(t, ref, new_ref); binder_debug(BINDER_DEBUG_TRANSACTION, @@ -1649,25 +1711,26 @@ static void binder_transaction(struct binder_proc *proc, case BINDER_TYPE_FD: { int target_fd; struct file *file; + struct binder_fd_object *fp = to_binder_fd_object(hdr); if (reply) { if (!(in_reply_to->flags & TF_ACCEPT_FDS)) { binder_user_error("%d:%d got reply with fd, %d, but target does not allow fds\n", - proc->pid, thread->pid, fp->handle); + proc->pid, thread->pid, fp->fd); return_error = BR_FAILED_REPLY; goto err_fd_not_allowed; } } else if (!target_node->accept_fds) { binder_user_error("%d:%d got transaction with fd, %d, but target does not allow fds\n", - proc->pid, thread->pid, fp->handle); + proc->pid, thread->pid, fp->fd); return_error = BR_FAILED_REPLY; goto err_fd_not_allowed; } - file = fget(fp->handle); + file = fget(fp->fd); if (file == NULL) { binder_user_error("%d:%d got transaction with invalid fd, %d\n", - proc->pid, thread->pid, fp->handle); + proc->pid, thread->pid, fp->fd); return_error = BR_FAILED_REPLY; goto err_fget_failed; } @@ -1685,17 +1748,18 @@ static void binder_transaction(struct binder_proc *proc, goto err_get_unused_fd_failed; } task_fd_install(target_proc, target_fd, file); - trace_binder_transaction_fd(t, fp->handle, target_fd); + trace_binder_transaction_fd(t, fp->fd, target_fd); binder_debug(BINDER_DEBUG_TRANSACTION, - " fd %d -> %d\n", fp->handle, target_fd); + " fd %d -> %d\n", fp->fd, + target_fd); /* TODO: fput? 
*/ - fp->binder = 0; - fp->handle = target_fd; + fp->pad_binder = 0; + fp->fd = target_fd; } break; default: binder_user_error("%d:%d got transaction with invalid object type, %x\n", - proc->pid, thread->pid, fp->type); + proc->pid, thread->pid, hdr->type); return_error = BR_FAILED_REPLY; goto err_bad_object_type; } diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 41420e341e75..f67c2b1c0713 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -48,6 +48,14 @@ typedef __u64 binder_size_t; typedef __u64 binder_uintptr_t; #endif +/** + * struct binder_object_header - header shared by all binder metadata objects. + * @type: type of the object + */ +struct binder_object_header { + __u32 type; +}; + /* * This is the flattened representation of a Binder object for transfer * between processes. The 'offsets' supplied as part of a binder transaction @@ -56,9 +64,8 @@ typedef __u64 binder_uintptr_t; * between processes. */ struct flat_binder_object { - /* 8 bytes for large_flat_header. */ - __u32 type; - __u32 flags; + struct binder_object_header hdr; + __u32 flags; /* 8 bytes of data. */ union { @@ -70,6 +77,24 @@ struct flat_binder_object { binder_uintptr_t cookie; }; +/** + * struct binder_fd_object - describes a filedescriptor to be fixed up. + * @hdr: common header structure + * @pad_flags: padding to remain compatible with old userspace code + * @pad_binder: padding to remain compatible with old userspace code + * @fd: file descriptor + * @cookie: opaque data, used by user-space + */ +struct binder_fd_object { + struct binder_object_header hdr; + __u32 pad_flags; + union { + binder_uintptr_t pad_binder; + __u32 fd; + }; + + binder_uintptr_t cookie; +}; /* * On 64-bit platforms where user code may run in 32-bits the driver must * translate the buffer (and local binder) addresses appropriately. From 803df5635e4ac377586e3664cf798c3152670bc0 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 30 Sep 2016 15:51:48 +0200 Subject: [PATCH 567/607] android: binder: support multiple context managers. Move the context manager state into a separate struct context, and allow for each process to have its own context associated with it. 
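For illustration only (a conceptual sketch; the hunks below are authoritative), call sites that used to read the file-scope global now reach the context manager through the owning process:

    /* before: a single, driver-wide context manager node */
    target_node = binder_context_mgr_node;

    /* after: every binder_proc points at its binder_context */
    struct binder_context *context = proc->context;

    target_node = context->binder_context_mgr_node;

With only the static global_context instance defined so far, behaviour is unchanged; the indirection is what later allows several independent context managers to coexist.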
Change-Id: Ifa934370241a2d447dd519eac3fd0682c6d00ab4 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 60 ++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 21 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 50cc56f3d0cc..93780d1cc199 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -56,8 +56,6 @@ static HLIST_HEAD(binder_dead_nodes); static struct dentry *binder_debugfs_dir_entry_root; static struct dentry *binder_debugfs_dir_entry_proc; -static struct binder_node *binder_context_mgr_node; -static kuid_t binder_context_mgr_uid = INVALID_UID; static int binder_last_id; static struct workqueue_struct *binder_deferred_workqueue; @@ -216,6 +214,15 @@ static struct binder_transaction_log_entry *binder_transaction_log_add( return e; } +struct binder_context { + struct binder_node *binder_context_mgr_node; + kuid_t binder_context_mgr_uid; +}; + +static struct binder_context global_context = { + .binder_context_mgr_uid = INVALID_UID, +}; + struct binder_work { struct list_head entry; enum { @@ -331,6 +338,7 @@ struct binder_proc { int ready_threads; long default_priority; struct dentry *debugfs_entry; + struct binder_context *context; }; enum { @@ -935,8 +943,10 @@ static int binder_inc_node(struct binder_node *node, int strong, int internal, if (internal) { if (target_list == NULL && node->internal_strong_refs == 0 && - !(node == binder_context_mgr_node && - node->has_strong_ref)) { + !(node->proc && + node == node->proc->context-> + binder_context_mgr_node && + node->has_strong_ref)) { pr_err("invalid inc strong node for %d\n", node->debug_id); return -EINVAL; @@ -1037,6 +1047,7 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, struct rb_node **p = &proc->refs_by_node.rb_node; struct rb_node *parent = NULL; struct binder_ref *ref, *new_ref; + struct binder_context *context = proc->context; while (*p) { parent = *p; @@ -1059,7 +1070,7 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, rb_link_node(&new_ref->rb_node_node, parent, p); rb_insert_color(&new_ref->rb_node_node, &proc->refs_by_node); - new_ref->desc = (node == binder_context_mgr_node) ? 0 : 1; + new_ref->desc = (node == context->binder_context_mgr_node) ? 0 : 1; for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) { ref = rb_entry(n, struct binder_ref, rb_node_desc); if (ref->desc > new_ref->desc) @@ -1389,6 +1400,7 @@ static void binder_transaction(struct binder_proc *proc, struct binder_transaction *in_reply_to = NULL; struct binder_transaction_log_entry *e; uint32_t return_error; + struct binder_context *context = proc->context; e = binder_transaction_log_add(&binder_transaction_log); e->call_type = reply ? 
2 : !!(tr->flags & TF_ONE_WAY); @@ -1449,7 +1461,7 @@ static void binder_transaction(struct binder_proc *proc, } target_node = ref->node; } else { - target_node = binder_context_mgr_node; + target_node = context->binder_context_mgr_node; if (target_node == NULL) { return_error = BR_DEAD_REPLY; goto err_no_context_mgr_node; @@ -1840,6 +1852,7 @@ static int binder_thread_write(struct binder_proc *proc, binder_size_t *consumed) { uint32_t cmd; + struct binder_context *context = proc->context; void __user *buffer = (void __user *)(uintptr_t)binder_buffer; void __user *ptr = buffer + *consumed; void __user *end = buffer + size; @@ -1866,10 +1879,10 @@ static int binder_thread_write(struct binder_proc *proc, if (get_user(target, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); - if (target == 0 && binder_context_mgr_node && + if (target == 0 && context->binder_context_mgr_node && (cmd == BC_INCREFS || cmd == BC_ACQUIRE)) { ref = binder_get_ref_for_node(proc, - binder_context_mgr_node); + context->binder_context_mgr_node); if (ref->desc != target) { binder_user_error("%d:%d tried to acquire reference to desc 0, got %d instead\n", proc->pid, thread->pid, @@ -2775,9 +2788,11 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp) { int ret = 0; struct binder_proc *proc = filp->private_data; + struct binder_context *context = proc->context; + kuid_t curr_euid = current_euid(); - if (binder_context_mgr_node != NULL) { + if (context->binder_context_mgr_node) { pr_err("BINDER_SET_CONTEXT_MGR already set\n"); ret = -EBUSY; goto out; @@ -2785,27 +2800,27 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp) ret = security_binder_set_context_mgr(proc->tsk); if (ret < 0) goto out; - if (uid_valid(binder_context_mgr_uid)) { - if (!uid_eq(binder_context_mgr_uid, curr_euid)) { + if (uid_valid(context->binder_context_mgr_uid)) { + if (!uid_eq(context->binder_context_mgr_uid, curr_euid)) { pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n", from_kuid(&init_user_ns, curr_euid), from_kuid(&init_user_ns, - binder_context_mgr_uid)); + context->binder_context_mgr_uid)); ret = -EPERM; goto out; } } else { - binder_context_mgr_uid = curr_euid; + context->binder_context_mgr_uid = curr_euid; } - binder_context_mgr_node = binder_new_node(proc, 0, 0); - if (binder_context_mgr_node == NULL) { + context->binder_context_mgr_node = binder_new_node(proc, 0, 0); + if (!context->binder_context_mgr_node) { ret = -ENOMEM; goto out; } - binder_context_mgr_node->local_weak_refs++; - binder_context_mgr_node->local_strong_refs++; - binder_context_mgr_node->has_strong_ref = 1; - binder_context_mgr_node->has_weak_ref = 1; + context->binder_context_mgr_node->local_weak_refs++; + context->binder_context_mgr_node->local_strong_refs++; + context->binder_context_mgr_node->has_strong_ref = 1; + context->binder_context_mgr_node->has_weak_ref = 1; out: return ret; } @@ -3035,6 +3050,7 @@ static int binder_open(struct inode *nodp, struct file *filp) return -ENOMEM; get_task_struct(current); proc->tsk = current; + proc->context = &global_context; INIT_LIST_HEAD(&proc->todo); init_waitqueue_head(&proc->wait); proc->default_priority = task_nice(current); @@ -3147,6 +3163,7 @@ static int binder_node_release(struct binder_node *node, int refs) static void binder_deferred_release(struct binder_proc *proc) { struct binder_transaction *t; + struct binder_context *context = proc->context; struct rb_node *n; int threads, nodes, incoming_refs, outgoing_refs, buffers, active_transactions, page_count; @@ -3156,11 +3173,12 @@ static 
void binder_deferred_release(struct binder_proc *proc) hlist_del(&proc->proc_node); - if (binder_context_mgr_node && binder_context_mgr_node->proc == proc) { + if (context->binder_context_mgr_node && + context->binder_context_mgr_node->proc == proc) { binder_debug(BINDER_DEBUG_DEAD_BINDER, "%s: %d context_mgr_node gone\n", __func__, proc->pid); - binder_context_mgr_node = NULL; + context->binder_context_mgr_node = NULL; } threads = 0; From 8b980bee79719478fddc7ff595386b0c32661a33 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Mon, 17 Oct 2016 15:17:31 +0200 Subject: [PATCH 568/607] android: binder: deal with contexts in debugfs. Properly print the context in debugfs entries. Change-Id: If10c2129536d9f39bae542afd7318ca79af60e3a Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 93780d1cc199..635be4d9b8f3 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -190,6 +190,7 @@ struct binder_transaction_log_entry { int to_node; int data_size; int offsets_size; + const char *context_name; }; struct binder_transaction_log { int next; @@ -217,10 +218,12 @@ static struct binder_transaction_log_entry *binder_transaction_log_add( struct binder_context { struct binder_node *binder_context_mgr_node; kuid_t binder_context_mgr_uid; + const char *name; }; static struct binder_context global_context = { .binder_context_mgr_uid = INVALID_UID, + .name = "binder", }; struct binder_work { @@ -1409,6 +1412,7 @@ static void binder_transaction(struct binder_proc *proc, e->target_handle = tr->target.handle; e->data_size = tr->data_size; e->offsets_size = tr->offsets_size; + e->context_name = proc->context->name; if (reply) { in_reply_to = thread->transaction_stack; @@ -3069,8 +3073,17 @@ static int binder_open(struct inode *nodp, struct file *filp) char strbuf[11]; snprintf(strbuf, sizeof(strbuf), "%u", proc->pid); + /* + * proc debug entries are shared between contexts, so + * this will fail if the process tries to open the driver + * again with a different context. The priting code will + * anyway print all contexts that a given PID has, so this + * is not a problem. 
+ */ proc->debugfs_entry = debugfs_create_file(strbuf, S_IRUGO, - binder_debugfs_dir_entry_proc, proc, &binder_proc_fops); + binder_debugfs_dir_entry_proc, + (void *)(unsigned long)proc->pid, + &binder_proc_fops); } return 0; @@ -3465,6 +3478,7 @@ static void print_binder_proc(struct seq_file *m, size_t header_pos; seq_printf(m, "proc %d\n", proc->pid); + seq_printf(m, "context %s\n", proc->context->name); header_pos = m->count; for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) @@ -3589,6 +3603,7 @@ static void print_binder_proc_stats(struct seq_file *m, int count, strong, weak; seq_printf(m, "proc %d\n", proc->pid); + seq_printf(m, "context %s\n", proc->context->name); count = 0; for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) count++; @@ -3696,23 +3711,18 @@ static int binder_transactions_show(struct seq_file *m, void *unused) static int binder_proc_show(struct seq_file *m, void *unused) { struct binder_proc *itr; - struct binder_proc *proc = m->private; + int pid = (unsigned long)m->private; int do_lock = !binder_debug_no_lock; - bool valid_proc = false; if (do_lock) binder_lock(__func__); hlist_for_each_entry(itr, &binder_procs, proc_node) { - if (itr == proc) { - valid_proc = true; - break; + if (itr->pid == pid) { + seq_puts(m, "binder proc state:\n"); + print_binder_proc(m, itr, 1); } } - if (valid_proc) { - seq_puts(m, "binder proc state:\n"); - print_binder_proc(m, proc, 1); - } if (do_lock) binder_unlock(__func__); return 0; @@ -3722,11 +3732,11 @@ static void print_binder_transaction_log_entry(struct seq_file *m, struct binder_transaction_log_entry *e) { seq_printf(m, - "%d: %s from %d:%d to %d:%d node %d handle %d size %d:%d\n", + "%d: %s from %d:%d to %d:%d context %s node %d handle %d size %d:%d\n", e->debug_id, (e->call_type == 2) ? "reply" : ((e->call_type == 1) ? "async" : "call "), e->from_proc, - e->from_thread, e->to_proc, e->to_thread, e->to_node, - e->target_handle, e->data_size, e->offsets_size); + e->from_thread, e->to_proc, e->to_thread, e->context_name, + e->to_node, e->target_handle, e->data_size, e->offsets_size); } static int binder_transaction_log_show(struct seq_file *m, void *unused) From 04e3812e712fce296da074fbfba6519a77678dfb Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 30 Sep 2016 16:08:09 +0200 Subject: [PATCH 569/607] android: binder: support multiple /dev instances. Add a new module parameter 'devices', that can be used to specify the names of the binder device nodes we want to populate in /dev. Each device node has its own context manager, and is therefore logically separated from all the other device nodes. The config option CONFIG_ANDROID_BINDER_DEVICES can be used to set the default value of the parameter. This approach was favored over using IPC namespaces, mostly because we require a single process to be a part of multiple binder contexts, which seemed harder to achieve with namespaces. Change-Id: I3df72b2a19b5ad5a0360e6322482db7b00a12b24 Signed-off-by: Martijn Coenen --- drivers/android/Kconfig | 12 ++++++ drivers/android/binder.c | 85 ++++++++++++++++++++++++++++++++++------ 2 files changed, 86 insertions(+), 11 deletions(-) diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index bdfc6c6f4f5a..a82fc022d34b 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -19,6 +19,18 @@ config ANDROID_BINDER_IPC Android process, using Binder to identify, invoke and pass arguments between said processes. 
+config ANDROID_BINDER_DEVICES + string "Android Binder devices" + depends on ANDROID_BINDER_IPC + default "binder" + ---help--- + Default value for the binder.devices parameter. + + The binder.devices parameter is a comma-separated list of strings + that specifies the names of the binder device nodes that will be + created. Each binder device has its own context manager, and is + therefore logically separated from the other devices. + config ANDROID_BINDER_IPC_32BIT bool depends on !64BIT && ANDROID_BINDER_IPC diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 635be4d9b8f3..29700728bbd1 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -50,6 +50,7 @@ static DEFINE_MUTEX(binder_main_lock); static DEFINE_MUTEX(binder_deferred_lock); static DEFINE_MUTEX(binder_mmap_lock); +static HLIST_HEAD(binder_devices); static HLIST_HEAD(binder_procs); static HLIST_HEAD(binder_deferred_list); static HLIST_HEAD(binder_dead_nodes); @@ -114,6 +115,9 @@ module_param_named(debug_mask, binder_debug_mask, uint, S_IWUSR | S_IRUGO); static bool binder_debug_no_lock; module_param_named(proc_no_lock, binder_debug_no_lock, bool, S_IWUSR | S_IRUGO); +static char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES; +module_param_named(devices, binder_devices_param, charp, S_IRUGO); + static DECLARE_WAIT_QUEUE_HEAD(binder_user_error_wait); static int binder_stop_on_user_error; @@ -221,9 +225,10 @@ struct binder_context { const char *name; }; -static struct binder_context global_context = { - .binder_context_mgr_uid = INVALID_UID, - .name = "binder", +struct binder_device { + struct hlist_node hlist; + struct miscdevice miscdev; + struct binder_context context; }; struct binder_work { @@ -3045,6 +3050,7 @@ err_bad_arg: static int binder_open(struct inode *nodp, struct file *filp) { struct binder_proc *proc; + struct binder_device *binder_dev; binder_debug(BINDER_DEBUG_OPEN_CLOSE, "binder_open: %d:%d\n", current->group_leader->pid, current->pid); @@ -3054,10 +3060,12 @@ static int binder_open(struct inode *nodp, struct file *filp) return -ENOMEM; get_task_struct(current); proc->tsk = current; - proc->context = &global_context; INIT_LIST_HEAD(&proc->todo); init_waitqueue_head(&proc->wait); proc->default_priority = task_nice(current); + binder_dev = container_of(filp->private_data, struct binder_device, + miscdev); + proc->context = &binder_dev->context; binder_lock(__func__); @@ -3764,20 +3772,44 @@ static const struct file_operations binder_fops = { .release = binder_release, }; -static struct miscdevice binder_miscdev = { - .minor = MISC_DYNAMIC_MINOR, - .name = "binder", - .fops = &binder_fops -}; - BINDER_DEBUG_ENTRY(state); BINDER_DEBUG_ENTRY(stats); BINDER_DEBUG_ENTRY(transactions); BINDER_DEBUG_ENTRY(transaction_log); +static int __init init_binder_device(const char *name) +{ + int ret; + struct binder_device *binder_device; + + binder_device = kzalloc(sizeof(*binder_device), GFP_KERNEL); + if (!binder_device) + return -ENOMEM; + + binder_device->miscdev.fops = &binder_fops; + binder_device->miscdev.minor = MISC_DYNAMIC_MINOR; + binder_device->miscdev.name = name; + + binder_device->context.binder_context_mgr_uid = INVALID_UID; + binder_device->context.name = name; + + ret = misc_register(&binder_device->miscdev); + if (ret < 0) { + kfree(binder_device); + return ret; + } + + hlist_add_head(&binder_device->hlist, &binder_devices); + + return ret; +} + static int __init binder_init(void) { int ret; + char *device_name, *device_names; + struct binder_device *device; + 
struct hlist_node *tmp; binder_deferred_workqueue = create_singlethread_workqueue("binder"); if (!binder_deferred_workqueue) @@ -3787,7 +3819,7 @@ static int __init binder_init(void) if (binder_debugfs_dir_entry_root) binder_debugfs_dir_entry_proc = debugfs_create_dir("proc", binder_debugfs_dir_entry_root); - ret = misc_register(&binder_miscdev); + if (binder_debugfs_dir_entry_root) { debugfs_create_file("state", S_IRUGO, @@ -3815,6 +3847,37 @@ static int __init binder_init(void) &binder_transaction_log_failed, &binder_transaction_log_fops); } + + /* + * Copy the module_parameter string, because we don't want to + * tokenize it in-place. + */ + device_names = kzalloc(strlen(binder_devices_param) + 1, GFP_KERNEL); + if (!device_names) { + ret = -ENOMEM; + goto err_alloc_device_names_failed; + } + strcpy(device_names, binder_devices_param); + + while ((device_name = strsep(&device_names, ","))) { + ret = init_binder_device(device_name); + if (ret) + goto err_init_binder_device_failed; + } + + return ret; + +err_init_binder_device_failed: + hlist_for_each_entry_safe(device, tmp, &binder_devices, hlist) { + misc_deregister(&device->miscdev); + hlist_del(&device->hlist); + kfree(device); + } +err_alloc_device_names_failed: + debugfs_remove_recursive(binder_debugfs_dir_entry_root); + + destroy_workqueue(binder_deferred_workqueue); + return ret; } From bfd49fea44974857219e7387a11534c12d2440fc Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Thu, 29 Sep 2016 15:38:14 +0200 Subject: [PATCH 570/607] android: binder: refactor binder_transact() Moved handling of fixup for binder objects, handles and file descriptors into separate functions. Change-Id: If6849f1caee3834aa87d0ab08950bb1e21ec6e38 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 309 ++++++++++++++++++++++----------------- 1 file changed, 172 insertions(+), 137 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 29700728bbd1..ef42b10c41ad 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1392,10 +1392,172 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, } } +static int binder_translate_binder(struct flat_binder_object *fp, + struct binder_transaction *t, + struct binder_thread *thread) +{ + struct binder_node *node; + struct binder_ref *ref; + struct binder_proc *proc = thread->proc; + struct binder_proc *target_proc = t->to_proc; + + node = binder_get_node(proc, fp->binder); + if (!node) { + node = binder_new_node(proc, fp->binder, fp->cookie); + if (!node) + return -ENOMEM; + + node->min_priority = fp->flags & FLAT_BINDER_FLAG_PRIORITY_MASK; + node->accept_fds = !!(fp->flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); + } + if (fp->cookie != node->cookie) { + binder_user_error("%d:%d sending u%016llx node %d, cookie mismatch %016llx != %016llx\n", + proc->pid, thread->pid, (u64)fp->binder, + node->debug_id, (u64)fp->cookie, + (u64)node->cookie); + return -EINVAL; + } + if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) + return -EPERM; + + ref = binder_get_ref_for_node(target_proc, node); + if (!ref) + return -EINVAL; + + if (fp->hdr.type == BINDER_TYPE_BINDER) + fp->hdr.type = BINDER_TYPE_HANDLE; + else + fp->hdr.type = BINDER_TYPE_WEAK_HANDLE; + fp->binder = 0; + fp->handle = ref->desc; + fp->cookie = 0; + binder_inc_ref(ref, fp->hdr.type == BINDER_TYPE_HANDLE, &thread->todo); + + trace_binder_transaction_node_to_ref(t, node, ref); + binder_debug(BINDER_DEBUG_TRANSACTION, + " node %d u%016llx -> ref %d desc %d\n", + node->debug_id, 
(u64)node->ptr, + ref->debug_id, ref->desc); + + return 0; +} + +static int binder_translate_handle(struct flat_binder_object *fp, + struct binder_transaction *t, + struct binder_thread *thread) +{ + struct binder_ref *ref; + struct binder_proc *proc = thread->proc; + struct binder_proc *target_proc = t->to_proc; + + ref = binder_get_ref(proc, fp->handle, + fp->hdr.type == BINDER_TYPE_HANDLE); + if (!ref) { + binder_user_error("%d:%d got transaction with invalid handle, %d\n", + proc->pid, thread->pid, fp->handle); + return -EINVAL; + } + if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) + return -EPERM; + + if (ref->node->proc == target_proc) { + if (fp->hdr.type == BINDER_TYPE_HANDLE) + fp->hdr.type = BINDER_TYPE_BINDER; + else + fp->hdr.type = BINDER_TYPE_WEAK_BINDER; + fp->binder = ref->node->ptr; + fp->cookie = ref->node->cookie; + binder_inc_node(ref->node, fp->hdr.type == BINDER_TYPE_BINDER, + 0, NULL); + trace_binder_transaction_ref_to_node(t, ref); + binder_debug(BINDER_DEBUG_TRANSACTION, + " ref %d desc %d -> node %d u%016llx\n", + ref->debug_id, ref->desc, ref->node->debug_id, + (u64)ref->node->ptr); + } else { + struct binder_ref *new_ref; + + new_ref = binder_get_ref_for_node(target_proc, ref->node); + if (!new_ref) + return -EINVAL; + + fp->binder = 0; + fp->handle = new_ref->desc; + fp->cookie = 0; + binder_inc_ref(new_ref, fp->hdr.type == BINDER_TYPE_HANDLE, + NULL); + trace_binder_transaction_ref_to_ref(t, ref, new_ref); + binder_debug(BINDER_DEBUG_TRANSACTION, + " ref %d desc %d -> ref %d desc %d (node %d)\n", + ref->debug_id, ref->desc, new_ref->debug_id, + new_ref->desc, ref->node->debug_id); + } + return 0; +} + +static int binder_translate_fd(int fd, + struct binder_transaction *t, + struct binder_thread *thread, + struct binder_transaction *in_reply_to) +{ + struct binder_proc *proc = thread->proc; + struct binder_proc *target_proc = t->to_proc; + int target_fd; + struct file *file; + int ret; + bool target_allows_fd; + + if (in_reply_to) + target_allows_fd = !!(in_reply_to->flags & TF_ACCEPT_FDS); + else + target_allows_fd = t->buffer->target_node->accept_fds; + if (!target_allows_fd) { + binder_user_error("%d:%d got %s with fd, %d, but target does not allow fds\n", + proc->pid, thread->pid, + in_reply_to ? 
"reply" : "transaction", + fd); + ret = -EPERM; + goto err_fd_not_accepted; + } + + file = fget(fd); + if (!file) { + binder_user_error("%d:%d got transaction with invalid fd, %d\n", + proc->pid, thread->pid, fd); + ret = -EBADF; + goto err_fget; + } + ret = security_binder_transfer_file(proc->tsk, target_proc->tsk, file); + if (ret < 0) { + ret = -EPERM; + goto err_security; + } + + target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC); + if (target_fd < 0) { + ret = -ENOMEM; + goto err_get_unused_fd; + } + task_fd_install(target_proc, target_fd, file); + trace_binder_transaction_fd(t, fd, target_fd); + binder_debug(BINDER_DEBUG_TRANSACTION, " fd %d -> %d\n", + fd, target_fd); + + return target_fd; + +err_get_unused_fd: +err_security: + fput(file); +err_fget: +err_fd_not_accepted: + return ret; +} + static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply) { + int ret; struct binder_transaction *t; struct binder_work *tcomplete; binder_size_t *offp, *off_end; @@ -1623,157 +1785,35 @@ static void binder_transaction(struct binder_proc *proc, case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { struct flat_binder_object *fp; - struct binder_node *node; - struct binder_ref *ref; fp = to_flat_binder_object(hdr); - node = binder_get_node(proc, fp->binder); - if (node == NULL) { - node = binder_new_node(proc, fp->binder, fp->cookie); - if (node == NULL) { - return_error = BR_FAILED_REPLY; - goto err_binder_new_node_failed; - } - node->min_priority = fp->flags & FLAT_BINDER_FLAG_PRIORITY_MASK; - node->accept_fds = !!(fp->flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); - } - if (fp->cookie != node->cookie) { - binder_user_error("%d:%d sending u%016llx node %d, cookie mismatch %016llx != %016llx\n", - proc->pid, thread->pid, - (u64)fp->binder, node->debug_id, - (u64)fp->cookie, (u64)node->cookie); + ret = binder_translate_binder(fp, t, thread); + if (ret < 0) { return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_for_node_failed; + goto err_translate_failed; } - if (security_binder_transfer_binder(proc->tsk, - target_proc->tsk)) { - return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_for_node_failed; - } - ref = binder_get_ref_for_node(target_proc, node); - if (ref == NULL) { - return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_for_node_failed; - } - if (hdr->type == BINDER_TYPE_BINDER) - hdr->type = BINDER_TYPE_HANDLE; - else - hdr->type = BINDER_TYPE_WEAK_HANDLE; - fp->binder = 0; - fp->handle = ref->desc; - fp->cookie = 0; - binder_inc_ref(ref, hdr->type == BINDER_TYPE_HANDLE, - &thread->todo); - - trace_binder_transaction_node_to_ref(t, node, ref); - binder_debug(BINDER_DEBUG_TRANSACTION, - " node %d u%016llx -> ref %d desc %d\n", - node->debug_id, (u64)node->ptr, - ref->debug_id, ref->desc); } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { struct flat_binder_object *fp; - struct binder_ref *ref; fp = to_flat_binder_object(hdr); - ref = binder_get_ref(proc, fp->handle, - hdr->type == BINDER_TYPE_HANDLE); - if (ref == NULL) { - binder_user_error("%d:%d got transaction with invalid handle, %d\n", - proc->pid, - thread->pid, fp->handle); + ret = binder_translate_handle(fp, t, thread); + if (ret < 0) { return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_failed; - } - if (security_binder_transfer_binder(proc->tsk, - target_proc->tsk)) { - return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_failed; - } - if (ref->node->proc == target_proc) { - if (hdr->type == 
BINDER_TYPE_HANDLE) - hdr->type = BINDER_TYPE_BINDER; - else - hdr->type = BINDER_TYPE_WEAK_BINDER; - fp->binder = ref->node->ptr; - fp->cookie = ref->node->cookie; - binder_inc_node(ref->node, - hdr->type == BINDER_TYPE_BINDER, - 0, NULL); - trace_binder_transaction_ref_to_node(t, ref); - binder_debug(BINDER_DEBUG_TRANSACTION, - " ref %d desc %d -> node %d u%016llx\n", - ref->debug_id, ref->desc, ref->node->debug_id, - (u64)ref->node->ptr); - } else { - struct binder_ref *new_ref; - - new_ref = binder_get_ref_for_node(target_proc, ref->node); - if (new_ref == NULL) { - return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_for_node_failed; - } - fp->binder = 0; - fp->handle = new_ref->desc; - fp->cookie = 0; - binder_inc_ref(new_ref, - hdr->type == BINDER_TYPE_HANDLE, - NULL); - trace_binder_transaction_ref_to_ref(t, ref, - new_ref); - binder_debug(BINDER_DEBUG_TRANSACTION, - " ref %d desc %d -> ref %d desc %d (node %d)\n", - ref->debug_id, ref->desc, new_ref->debug_id, - new_ref->desc, ref->node->debug_id); + goto err_translate_failed; } } break; case BINDER_TYPE_FD: { - int target_fd; - struct file *file; struct binder_fd_object *fp = to_binder_fd_object(hdr); + int target_fd = binder_translate_fd(fp->fd, t, thread, + in_reply_to); - if (reply) { - if (!(in_reply_to->flags & TF_ACCEPT_FDS)) { - binder_user_error("%d:%d got reply with fd, %d, but target does not allow fds\n", - proc->pid, thread->pid, fp->fd); - return_error = BR_FAILED_REPLY; - goto err_fd_not_allowed; - } - } else if (!target_node->accept_fds) { - binder_user_error("%d:%d got transaction with fd, %d, but target does not allow fds\n", - proc->pid, thread->pid, fp->fd); - return_error = BR_FAILED_REPLY; - goto err_fd_not_allowed; - } - - file = fget(fp->fd); - if (file == NULL) { - binder_user_error("%d:%d got transaction with invalid fd, %d\n", - proc->pid, thread->pid, fp->fd); - return_error = BR_FAILED_REPLY; - goto err_fget_failed; - } - if (security_binder_transfer_file(proc->tsk, - target_proc->tsk, - file) < 0) { - fput(file); - return_error = BR_FAILED_REPLY; - goto err_get_unused_fd_failed; - } - target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC); if (target_fd < 0) { - fput(file); return_error = BR_FAILED_REPLY; - goto err_get_unused_fd_failed; + goto err_translate_failed; } - task_fd_install(target_proc, target_fd, file); - trace_binder_transaction_fd(t, fp->fd, target_fd); - binder_debug(BINDER_DEBUG_TRANSACTION, - " fd %d -> %d\n", fp->fd, - target_fd); - /* TODO: fput? */ fp->pad_binder = 0; fp->fd = target_fd; } break; @@ -1810,12 +1850,7 @@ static void binder_transaction(struct binder_proc *proc, wake_up_interruptible(target_wait); return; -err_get_unused_fd_failed: -err_fget_failed: -err_fd_not_allowed: -err_binder_get_ref_for_node_failed: -err_binder_get_ref_failed: -err_binder_new_node_failed: +err_translate_failed: err_bad_object_type: err_bad_offset: err_copy_data_failed: From 843a25788dba57df35820bcc91796fb30b56335c Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 30 Sep 2016 14:05:40 +0200 Subject: [PATCH 571/607] android: binder: add extra size to allocator. The binder_buffer allocator currently only allocates space for the data and offsets buffers of a Parcel. This change allows for requesting an additional chunk of data in the buffer, which can for example be used to hold additional meta-data about the transaction (eg a security context). 
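As a rough worked example (hypothetical sizes, simply mirroring the size computation introduced below): on a 64-bit kernel, a transaction carrying 100 bytes of data, two 8-byte offsets and a 24-byte extra buffer area is sized as

    data_offsets_size = ALIGN(100, 8) + ALIGN(16, 8);   /* 104 + 16 = 120 */
    size = data_offsets_size + ALIGN(24, 8);            /* 120 + 24 = 144 */

so the extra chunk extends the same binder_buffer allocation rather than requiring a second one, and binder_free_buf() accounts for it the same way when the buffer is released.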
Change-Id: I58ab9c383a2e1a3057aae6adaa596ce867f1b157 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 41 +++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index ef42b10c41ad..83a6c1bbbad5 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -303,6 +303,7 @@ struct binder_buffer { struct binder_node *target_node; size_t data_size; size_t offsets_size; + size_t extra_buffers_size; uint8_t data[0]; }; @@ -670,7 +671,9 @@ err_no_vma: static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, size_t data_size, - size_t offsets_size, int is_async) + size_t offsets_size, + size_t extra_buffers_size, + int is_async) { struct rb_node *n = proc->free_buffers.rb_node; struct binder_buffer *buffer; @@ -678,7 +681,7 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, struct rb_node *best_fit = NULL; void *has_page_addr; void *end_page_addr; - size_t size; + size_t size, data_offsets_size; if (proc->vma == NULL) { pr_err("%d: binder_alloc_buf, no vma\n", @@ -686,15 +689,20 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, return NULL; } - size = ALIGN(data_size, sizeof(void *)) + + data_offsets_size = ALIGN(data_size, sizeof(void *)) + ALIGN(offsets_size, sizeof(void *)); - if (size < data_size || size < offsets_size) { + if (data_offsets_size < data_size || data_offsets_size < offsets_size) { binder_user_error("%d: got transaction with invalid size %zd-%zd\n", proc->pid, data_size, offsets_size); return NULL; } - + size = data_offsets_size + ALIGN(extra_buffers_size, sizeof(void *)); + if (size < data_offsets_size || size < extra_buffers_size) { + binder_user_error("%d: got transaction with invalid extra_buffers_size %zd\n", + proc->pid, extra_buffers_size); + return NULL; + } if (is_async && proc->free_async_space < size + sizeof(struct binder_buffer)) { binder_debug(BINDER_DEBUG_BUFFER_ALLOC, @@ -763,6 +771,7 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, proc->pid, size, buffer); buffer->data_size = data_size; buffer->offsets_size = offsets_size; + buffer->extra_buffers_size = extra_buffers_size; buffer->async_transaction = is_async; if (is_async) { proc->free_async_space -= size + sizeof(struct binder_buffer); @@ -837,7 +846,8 @@ static void binder_free_buf(struct binder_proc *proc, buffer_size = binder_buffer_size(proc, buffer); size = ALIGN(buffer->data_size, sizeof(void *)) + - ALIGN(buffer->offsets_size, sizeof(void *)); + ALIGN(buffer->offsets_size, sizeof(void *)) + + ALIGN(buffer->extra_buffers_size, sizeof(void *)); binder_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: binder_free_buf %p size %zd buffer_size %zd\n", @@ -1555,7 +1565,8 @@ err_fd_not_accepted: static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, - struct binder_transaction_data *tr, int reply) + struct binder_transaction_data *tr, int reply, + binder_size_t extra_buffers_size) { int ret; struct binder_transaction *t; @@ -1699,20 +1710,22 @@ static void binder_transaction(struct binder_proc *proc, if (reply) binder_debug(BINDER_DEBUG_TRANSACTION, - "%d:%d BC_REPLY %d -> %d:%d, data %016llx-%016llx size %lld-%lld\n", + "%d:%d BC_REPLY %d -> %d:%d, data %016llx-%016llx size %lld-%lld-%lld\n", proc->pid, thread->pid, t->debug_id, target_proc->pid, target_thread->pid, (u64)tr->data.ptr.buffer, (u64)tr->data.ptr.offsets, - (u64)tr->data_size, (u64)tr->offsets_size); + 
(u64)tr->data_size, (u64)tr->offsets_size, + (u64)extra_buffers_size); else binder_debug(BINDER_DEBUG_TRANSACTION, - "%d:%d BC_TRANSACTION %d -> %d - node %d, data %016llx-%016llx size %lld-%lld\n", + "%d:%d BC_TRANSACTION %d -> %d - node %d, data %016llx-%016llx size %lld-%lld-%lld\n", proc->pid, thread->pid, t->debug_id, target_proc->pid, target_node->debug_id, (u64)tr->data.ptr.buffer, (u64)tr->data.ptr.offsets, - (u64)tr->data_size, (u64)tr->offsets_size); + (u64)tr->data_size, (u64)tr->offsets_size, + (u64)extra_buffers_size); if (!reply && !(tr->flags & TF_ONE_WAY)) t->from = thread; @@ -1728,7 +1741,8 @@ static void binder_transaction(struct binder_proc *proc, trace_binder_transaction(reply, t, target_node); t->buffer = binder_alloc_buf(target_proc, tr->data_size, - tr->offsets_size, !reply && (t->flags & TF_ONE_WAY)); + tr->offsets_size, extra_buffers_size, + !reply && (t->flags & TF_ONE_WAY)); if (t->buffer == NULL) { return_error = BR_FAILED_REPLY; goto err_binder_alloc_buf_failed; @@ -2078,7 +2092,8 @@ static int binder_thread_write(struct binder_proc *proc, if (copy_from_user(&tr, ptr, sizeof(tr))) return -EFAULT; ptr += sizeof(tr); - binder_transaction(proc, thread, &tr, cmd == BC_REPLY); + binder_transaction(proc, thread, &tr, + cmd == BC_REPLY, 0); break; } From dd9bc4f9f144a89d52a197fcf0ea033a287b7095 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 30 Sep 2016 14:10:07 +0200 Subject: [PATCH 572/607] android: binder: support for scatter-gather. Previously all data passed over binder needed to be serialized, with the exception of Binder objects and file descriptors. This patch adds support for scatter-gathering raw memory buffers into a binder transaction, avoiding the need to first serialize them into a Parcel. To remain backwards compatible with existing binder clients, it introduces two new command ioctls for this purpose - BC_TRANSACTION_SG and BC_REPLY_SG. These commands may only be used with the new binder_transaction_data_sg structure, which adds a field for the total size of the buffers we are scatter-gathering. Because memory buffers may contain pointers to other buffers, we allow callers to specify a parent buffer and an offset into it, to indicate this is a location pointing to the buffer that we are fixing up. The kernel will then take care of fixing up the pointer to that buffer as well. 
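As a purely illustrative userspace sketch (hypothetical structure and field values, using only the UAPI types added below): a caller whose payload struct contains a pointer to a second buffer could describe both with BINDER_TYPE_PTR objects and let the driver fix up the embedded pointer:

    struct binder_buffer_object parent = {
        .hdr.type = BINDER_TYPE_PTR,
        .buffer   = (binder_uintptr_t)&obj,            /* hypothetical payload */
        .length   = sizeof(obj),
    };
    struct binder_buffer_object child = {
        .hdr.type      = BINDER_TYPE_PTR,
        .flags         = BINDER_BUFFER_FLAG_HAS_PARENT,
        .buffer        = (binder_uintptr_t)obj.data,   /* memory obj.data points to */
        .length        = obj.data_len,
        .parent        = 0,    /* index of 'parent' in the offsets array */
        .parent_offset = offsetof(struct my_obj, data),
    };

Both objects go into the offsets array as usual; the total (8-byte aligned) size of all such buffers is passed in binder_transaction_data_sg.buffers_size together with BC_TRANSACTION_SG or BC_REPLY_SG.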
Change-Id: I02417f28cff14688f2e1d6fcb959438fd96566cc Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 242 ++++++++++++++++++++++++++-- include/uapi/linux/android/binder.h | 45 ++++++ 2 files changed, 275 insertions(+), 12 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 83a6c1bbbad5..a0f4d29e5148 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -153,6 +153,9 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, #define to_binder_fd_object(hdr) container_of(hdr, struct binder_fd_object, hdr) +#define to_binder_buffer_object(hdr) \ + container_of(hdr, struct binder_buffer_object, hdr) + enum binder_stat_types { BINDER_STAT_PROC, BINDER_STAT_THREAD, @@ -166,7 +169,7 @@ enum binder_stat_types { struct binder_stats { int br[_IOC_NR(BR_FAILED_REPLY) + 1]; - int bc[_IOC_NR(BC_DEAD_BINDER_DONE) + 1]; + int bc[_IOC_NR(BC_REPLY_SG) + 1]; int obj_created[BINDER_STAT_COUNT]; int obj_deleted[BINDER_STAT_COUNT]; }; @@ -1306,6 +1309,9 @@ static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset) case BINDER_TYPE_FD: object_size = sizeof(struct binder_fd_object); break; + case BINDER_TYPE_PTR: + object_size = sizeof(struct binder_buffer_object); + break; default: return 0; } @@ -1316,11 +1322,111 @@ static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset) return 0; } +/** + * binder_validate_ptr() - validates binder_buffer_object in a binder_buffer. + * @b: binder_buffer containing the object + * @index: index in offset array at which the binder_buffer_object is + * located + * @start: points to the start of the offset array + * @num_valid: the number of valid offsets in the offset array + * + * Return: If @index is within the valid range of the offset array + * described by @start and @num_valid, and if there's a valid + * binder_buffer_object at the offset found in index @index + * of the offset array, that object is returned. Otherwise, + * %NULL is returned. + * Note that the offset found in index @index itself is not + * verified; this function assumes that @num_valid elements + * from @start were previously verified to have valid offsets. + */ +static struct binder_buffer_object *binder_validate_ptr(struct binder_buffer *b, + binder_size_t index, + binder_size_t *start, + binder_size_t num_valid) +{ + struct binder_buffer_object *buffer_obj; + binder_size_t *offp; + + if (index >= num_valid) + return NULL; + + offp = start + index; + buffer_obj = (struct binder_buffer_object *)(b->data + *offp); + if (buffer_obj->hdr.type != BINDER_TYPE_PTR) + return NULL; + + return buffer_obj; +} + +/** + * binder_validate_fixup() - validates pointer/fd fixups happen in order. + * @b: transaction buffer + * @objects_start start of objects buffer + * @buffer: binder_buffer_object in which to fix up + * @offset: start offset in @buffer to fix up + * @last_obj: last binder_buffer_object that we fixed up in + * @last_min_offset: minimum fixup offset in @last_obj + * + * Return: %true if a fixup in buffer @buffer at offset @offset is + * allowed. + * + * For safety reasons, we only allow fixups inside a buffer to happen + * at increasing offsets; additionally, we only allow fixup on the last + * buffer object that was verified, or one of its parents. 
+ * + * Example of what is allowed: + * + * A + * B (parent = A, offset = 0) + * C (parent = A, offset = 16) + * D (parent = C, offset = 0) + * E (parent = A, offset = 32) // min_offset is 16 (C.parent_offset) + * + * Examples of what is not allowed: + * + * Decreasing offsets within the same parent: + * A + * C (parent = A, offset = 16) + * B (parent = A, offset = 0) // decreasing offset within A + * + * Referring to a parent that wasn't the last object or any of its parents: + * A + * B (parent = A, offset = 0) + * C (parent = A, offset = 0) + * C (parent = A, offset = 16) + * D (parent = B, offset = 0) // B is not A or any of A's parents + */ +static bool binder_validate_fixup(struct binder_buffer *b, + binder_size_t *objects_start, + struct binder_buffer_object *buffer, + binder_size_t fixup_offset, + struct binder_buffer_object *last_obj, + binder_size_t last_min_offset) +{ + if (!last_obj) { + /* Nothing to fix up in */ + return false; + } + + while (last_obj != buffer) { + /* + * Safe to retrieve the parent of last_obj, since it + * was already previously verified by the driver. + */ + if ((last_obj->flags & BINDER_BUFFER_FLAG_HAS_PARENT) == 0) + return false; + last_min_offset = last_obj->parent_offset + sizeof(uintptr_t); + last_obj = (struct binder_buffer_object *) + (b->data + *(objects_start + last_obj->parent)); + } + return (fixup_offset >= last_min_offset); +} + static void binder_transaction_buffer_release(struct binder_proc *proc, struct binder_buffer *buffer, binder_size_t *failed_at) { - binder_size_t *offp, *off_end; + binder_size_t *offp, *off_start, *off_end; int debug_id = buffer->debug_id; binder_debug(BINDER_DEBUG_TRANSACTION, @@ -1331,13 +1437,13 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, if (buffer->target_node) binder_dec_node(buffer->target_node, 1, 0); - offp = (binder_size_t *)(buffer->data + - ALIGN(buffer->data_size, sizeof(void *))); + off_start = (binder_size_t *)(buffer->data + + ALIGN(buffer->data_size, sizeof(void *))); if (failed_at) off_end = failed_at; else - off_end = (void *)offp + buffer->offsets_size; - for (; offp < off_end; offp++) { + off_end = (void *)off_start + buffer->offsets_size; + for (offp = off_start; offp < off_end; offp++) { struct binder_object_header *hdr; size_t object_size = binder_validate_object(buffer, *offp); @@ -1393,7 +1499,12 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, if (failed_at) task_close_fd(proc, fp->fd); } break; - + case BINDER_TYPE_PTR: + /* + * Nothing to do here, this will get cleaned up when the + * transaction buffer gets freed + */ + break; default: pr_err("transaction release %d bad object type %x\n", debug_id, hdr->type); @@ -1563,6 +1674,53 @@ err_fd_not_accepted: return ret; } +static int binder_fixup_parent(struct binder_transaction *t, + struct binder_thread *thread, + struct binder_buffer_object *bp, + binder_size_t *off_start, + binder_size_t num_valid, + struct binder_buffer_object *last_fixup_obj, + binder_size_t last_fixup_min_off) +{ + struct binder_buffer_object *parent; + u8 *parent_buffer; + struct binder_buffer *b = t->buffer; + struct binder_proc *proc = thread->proc; + struct binder_proc *target_proc = t->to_proc; + + if (!(bp->flags & BINDER_BUFFER_FLAG_HAS_PARENT)) + return 0; + + parent = binder_validate_ptr(b, bp->parent, off_start, num_valid); + if (!parent) { + binder_user_error("%d:%d got transaction with invalid parent offset or type\n", + proc->pid, thread->pid); + return -EINVAL; + } + + if (!binder_validate_fixup(b, 
off_start, + parent, bp->parent_offset, + last_fixup_obj, + last_fixup_min_off)) { + binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n", + proc->pid, thread->pid); + return -EINVAL; + } + + if (parent->length < sizeof(binder_uintptr_t) || + bp->parent_offset > parent->length - sizeof(binder_uintptr_t)) { + /* No space for a pointer here! */ + binder_user_error("%d:%d got transaction with invalid parent offset\n", + proc->pid, thread->pid); + return -EINVAL; + } + parent_buffer = (u8 *)(parent->buffer - + target_proc->user_buffer_offset); + *(binder_uintptr_t *)(parent_buffer + bp->parent_offset) = bp->buffer; + + return 0; +} + static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply, @@ -1571,8 +1729,9 @@ static void binder_transaction(struct binder_proc *proc, int ret; struct binder_transaction *t; struct binder_work *tcomplete; - binder_size_t *offp, *off_end; + binder_size_t *offp, *off_end, *off_start; binder_size_t off_min; + u8 *sg_bufp, *sg_buf_end; struct binder_proc *target_proc; struct binder_thread *target_thread = NULL; struct binder_node *target_node = NULL; @@ -1581,6 +1740,8 @@ static void binder_transaction(struct binder_proc *proc, struct binder_transaction *in_reply_to = NULL; struct binder_transaction_log_entry *e; uint32_t return_error; + struct binder_buffer_object *last_fixup_obj = NULL; + binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; e = binder_transaction_log_add(&binder_transaction_log); @@ -1755,8 +1916,9 @@ static void binder_transaction(struct binder_proc *proc, if (target_node) binder_inc_node(target_node, 1, 0, NULL); - offp = (binder_size_t *)(t->buffer->data + - ALIGN(tr->data_size, sizeof(void *))); + off_start = (binder_size_t *)(t->buffer->data + + ALIGN(tr->data_size, sizeof(void *))); + offp = off_start; if (copy_from_user(t->buffer->data, (const void __user *)(uintptr_t) tr->data.ptr.buffer, tr->data_size)) { @@ -1778,7 +1940,16 @@ static void binder_transaction(struct binder_proc *proc, return_error = BR_FAILED_REPLY; goto err_bad_offset; } - off_end = (void *)offp + tr->offsets_size; + if (!IS_ALIGNED(extra_buffers_size, sizeof(u64))) { + binder_user_error("%d:%d got transaction with unaligned buffers size, %lld\n", + proc->pid, thread->pid, + extra_buffers_size); + return_error = BR_FAILED_REPLY; + goto err_bad_offset; + } + off_end = (void *)off_start + tr->offsets_size; + sg_bufp = (u8 *)(PTR_ALIGN(off_end, sizeof(void *))); + sg_buf_end = sg_bufp + extra_buffers_size; off_min = 0; for (; offp < off_end; offp++) { struct binder_object_header *hdr; @@ -1831,7 +2002,41 @@ static void binder_transaction(struct binder_proc *proc, fp->pad_binder = 0; fp->fd = target_fd; } break; + case BINDER_TYPE_PTR: { + struct binder_buffer_object *bp = + to_binder_buffer_object(hdr); + size_t buf_left = sg_buf_end - sg_bufp; + if (bp->length > buf_left) { + binder_user_error("%d:%d got transaction with too large buffer\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + goto err_bad_offset; + } + if (copy_from_user(sg_bufp, + (const void __user *)(uintptr_t) + bp->buffer, bp->length)) { + binder_user_error("%d:%d got transaction with invalid offsets ptr\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + goto err_copy_data_failed; + } + /* Fixup buffer pointer to target proc address space */ + bp->buffer = (uintptr_t)sg_bufp + + target_proc->user_buffer_offset; + sg_bufp += ALIGN(bp->length, 
sizeof(u64)); + + ret = binder_fixup_parent(t, thread, bp, off_start, + offp - off_start, + last_fixup_obj, + last_fixup_min_off); + if (ret < 0) { + return_error = BR_FAILED_REPLY; + goto err_translate_failed; + } + last_fixup_obj = bp; + last_fixup_min_off = 0; + } break; default: binder_user_error("%d:%d got transaction with invalid object type, %x\n", proc->pid, thread->pid, hdr->type); @@ -2085,6 +2290,17 @@ static int binder_thread_write(struct binder_proc *proc, break; } + case BC_TRANSACTION_SG: + case BC_REPLY_SG: { + struct binder_transaction_data_sg tr; + + if (copy_from_user(&tr, ptr, sizeof(tr))) + return -EFAULT; + ptr += sizeof(tr); + binder_transaction(proc, thread, &tr.transaction_data, + cmd == BC_REPLY_SG, tr.buffers_size); + break; + } case BC_TRANSACTION: case BC_REPLY: { struct binder_transaction_data tr; @@ -3606,7 +3822,9 @@ static const char * const binder_command_strings[] = { "BC_EXIT_LOOPER", "BC_REQUEST_DEATH_NOTIFICATION", "BC_CLEAR_DEATH_NOTIFICATION", - "BC_DEAD_BINDER_DONE" + "BC_DEAD_BINDER_DONE", + "BC_TRANSACTION_SG", + "BC_REPLY_SG", }; static const char * const binder_objstat_strings[] = { diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index f67c2b1c0713..f3ef6e2634ba 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -33,6 +33,7 @@ enum { BINDER_TYPE_HANDLE = B_PACK_CHARS('s', 'h', '*', B_TYPE_LARGE), BINDER_TYPE_WEAK_HANDLE = B_PACK_CHARS('w', 'h', '*', B_TYPE_LARGE), BINDER_TYPE_FD = B_PACK_CHARS('f', 'd', '*', B_TYPE_LARGE), + BINDER_TYPE_PTR = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE), }; enum { @@ -95,6 +96,39 @@ struct binder_fd_object { binder_uintptr_t cookie; }; + +/* struct binder_buffer_object - object describing a userspace buffer + * @hdr: common header structure + * @flags: one or more BINDER_BUFFER_* flags + * @buffer: address of the buffer + * @length: length of the buffer + * @parent: index in offset array pointing to parent buffer + * @parent_offset: offset in @parent pointing to this buffer + * + * A binder_buffer object represents an object that the + * binder kernel driver can copy verbatim to the target + * address space. A buffer itself may be pointed to from + * within another buffer, meaning that the pointer inside + * that other buffer needs to be fixed up as well. This + * can be done by setting the BINDER_BUFFER_FLAG_HAS_PARENT + * flag in @flags, by setting @parent buffer to the index + * in the offset array pointing to the parent binder_buffer_object, + * and by setting @parent_offset to the offset in the parent buffer + * at which the pointer to this buffer is located. + */ +struct binder_buffer_object { + struct binder_object_header hdr; + __u32 flags; + binder_uintptr_t buffer; + binder_size_t length; + binder_size_t parent; + binder_size_t parent_offset; +}; + +enum { + BINDER_BUFFER_FLAG_HAS_PARENT = 0x01, +}; + /* * On 64-bit platforms where user code may run in 32-bits the driver must * translate the buffer (and local binder) addresses appropriately. 
@@ -187,6 +221,11 @@ struct binder_transaction_data { } data; }; +struct binder_transaction_data_sg { + struct binder_transaction_data transaction_data; + binder_size_t buffers_size; +}; + struct binder_ptr_cookie { binder_uintptr_t ptr; binder_uintptr_t cookie; @@ -371,6 +410,12 @@ enum binder_driver_command_protocol { /* * void *: cookie */ + + BC_TRANSACTION_SG = _IOW('c', 17, struct binder_transaction_data_sg), + BC_REPLY_SG = _IOW('c', 18, struct binder_transaction_data_sg), + /* + * binder_transaction_data_sg: the sent command. + */ }; #endif /* _UAPI_LINUX_BINDER_H */ From e124de382771254e09bb2bbd2ac753cdae71a36d Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Tue, 18 Oct 2016 13:58:55 +0200 Subject: [PATCH 573/607] android: binder: support for file-descriptor arrays. This patch introduces a new binder_fd_array object, that allows us to support one or more file descriptors embedded in a buffer that is scatter-gathered. Change-Id: I647a53cf0d905c7be0dfd9333806982def68dd74 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 137 ++++++++++++++++++++++++++++ include/uapi/linux/android/binder.h | 28 ++++++ 2 files changed, 165 insertions(+) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index a0f4d29e5148..951393825261 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -156,6 +156,9 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, #define to_binder_buffer_object(hdr) \ container_of(hdr, struct binder_buffer_object, hdr) +#define to_binder_fd_array_object(hdr) \ + container_of(hdr, struct binder_fd_array_object, hdr) + enum binder_stat_types { BINDER_STAT_PROC, BINDER_STAT_THREAD, @@ -1312,6 +1315,9 @@ static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset) case BINDER_TYPE_PTR: object_size = sizeof(struct binder_buffer_object); break; + case BINDER_TYPE_FDA: + object_size = sizeof(struct binder_fd_array_object); + break; default: return 0; } @@ -1505,6 +1511,47 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, * transaction buffer gets freed */ break; + case BINDER_TYPE_FDA: { + struct binder_fd_array_object *fda; + struct binder_buffer_object *parent; + uintptr_t parent_buffer; + u32 *fd_array; + size_t fd_index; + binder_size_t fd_buf_size; + + fda = to_binder_fd_array_object(hdr); + parent = binder_validate_ptr(buffer, fda->parent, + off_start, + offp - off_start); + if (!parent) { + pr_err("transaction release %d bad parent offset", + debug_id); + continue; + } + /* + * Since the parent was already fixed up, convert it + * back to kernel address space to access it + */ + parent_buffer = parent->buffer - + proc->user_buffer_offset; + + fd_buf_size = sizeof(u32) * fda->num_fds; + if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { + pr_err("transaction release %d invalid number of fds (%lld)\n", + debug_id, (u64)fda->num_fds); + continue; + } + if (fd_buf_size > parent->length || + fda->parent_offset > parent->length - fd_buf_size) { + /* No space for all file descriptors here. 
*/ + pr_err("transaction release %d not enough space for %lld fds in buffer\n", + debug_id, (u64)fda->num_fds); + continue; + } + fd_array = (u32 *)(parent_buffer + fda->parent_offset); + for (fd_index = 0; fd_index < fda->num_fds; fd_index++) + task_close_fd(proc, fd_array[fd_index]); + } break; default: pr_err("transaction release %d bad object type %x\n", debug_id, hdr->type); @@ -1674,6 +1721,63 @@ err_fd_not_accepted: return ret; } +static int binder_translate_fd_array(struct binder_fd_array_object *fda, + struct binder_buffer_object *parent, + struct binder_transaction *t, + struct binder_thread *thread, + struct binder_transaction *in_reply_to) +{ + binder_size_t fdi, fd_buf_size, num_installed_fds; + int target_fd; + uintptr_t parent_buffer; + u32 *fd_array; + struct binder_proc *proc = thread->proc; + struct binder_proc *target_proc = t->to_proc; + + fd_buf_size = sizeof(u32) * fda->num_fds; + if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { + binder_user_error("%d:%d got transaction with invalid number of fds (%lld)\n", + proc->pid, thread->pid, (u64)fda->num_fds); + return -EINVAL; + } + if (fd_buf_size > parent->length || + fda->parent_offset > parent->length - fd_buf_size) { + /* No space for all file descriptors here. */ + binder_user_error("%d:%d not enough space to store %lld fds in buffer\n", + proc->pid, thread->pid, (u64)fda->num_fds); + return -EINVAL; + } + /* + * Since the parent was already fixed up, convert it + * back to the kernel address space to access it + */ + parent_buffer = parent->buffer - target_proc->user_buffer_offset; + fd_array = (u32 *)(parent_buffer + fda->parent_offset); + if (!IS_ALIGNED((unsigned long)fd_array, sizeof(u32))) { + binder_user_error("%d:%d parent offset not aligned correctly.\n", + proc->pid, thread->pid); + return -EINVAL; + } + for (fdi = 0; fdi < fda->num_fds; fdi++) { + target_fd = binder_translate_fd(fd_array[fdi], t, thread, + in_reply_to); + if (target_fd < 0) + goto err_translate_fd_failed; + fd_array[fdi] = target_fd; + } + return 0; + +err_translate_fd_failed: + /* + * Failed to allocate fd or security error, free fds + * installed so far. 
+ */ + num_installed_fds = fdi; + for (fdi = 0; fdi < num_installed_fds; fdi++) + task_close_fd(target_proc, fd_array[fdi]); + return target_fd; +} + static int binder_fixup_parent(struct binder_transaction *t, struct binder_thread *thread, struct binder_buffer_object *bp, @@ -2002,6 +2106,38 @@ static void binder_transaction(struct binder_proc *proc, fp->pad_binder = 0; fp->fd = target_fd; } break; + case BINDER_TYPE_FDA: { + struct binder_fd_array_object *fda = + to_binder_fd_array_object(hdr); + struct binder_buffer_object *parent = + binder_validate_ptr(t->buffer, fda->parent, + off_start, + offp - off_start); + if (!parent) { + binder_user_error("%d:%d got transaction with invalid parent offset or type\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + goto err_bad_parent; + } + if (!binder_validate_fixup(t->buffer, off_start, + parent, fda->parent_offset, + last_fixup_obj, + last_fixup_min_off)) { + binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + goto err_bad_parent; + } + ret = binder_translate_fd_array(fda, parent, t, thread, + in_reply_to); + if (ret < 0) { + return_error = BR_FAILED_REPLY; + goto err_translate_failed; + } + last_fixup_obj = parent; + last_fixup_min_off = + fda->parent_offset + sizeof(u32) * fda->num_fds; + } break; case BINDER_TYPE_PTR: { struct binder_buffer_object *bp = to_binder_buffer_object(hdr); @@ -2072,6 +2208,7 @@ static void binder_transaction(struct binder_proc *proc, err_translate_failed: err_bad_object_type: err_bad_offset: +err_bad_parent: err_copy_data_failed: trace_binder_transaction_failed_buffer_release(t->buffer); binder_transaction_buffer_release(target_proc, t->buffer, offp); diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index f3ef6e2634ba..51f891fb1b18 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -33,6 +33,7 @@ enum { BINDER_TYPE_HANDLE = B_PACK_CHARS('s', 'h', '*', B_TYPE_LARGE), BINDER_TYPE_WEAK_HANDLE = B_PACK_CHARS('w', 'h', '*', B_TYPE_LARGE), BINDER_TYPE_FD = B_PACK_CHARS('f', 'd', '*', B_TYPE_LARGE), + BINDER_TYPE_FDA = B_PACK_CHARS('f', 'd', 'a', B_TYPE_LARGE), BINDER_TYPE_PTR = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE), }; @@ -129,6 +130,33 @@ enum { BINDER_BUFFER_FLAG_HAS_PARENT = 0x01, }; +/* struct binder_fd_array_object - object describing an array of fds in a buffer + * @hdr: common header structure + * @num_fds: number of file descriptors in the buffer + * @parent: index in offset array to buffer holding the fd array + * @parent_offset: start offset of fd array in the buffer + * + * A binder_fd_array object represents an array of file + * descriptors embedded in a binder_buffer_object. It is + * different from a regular binder_buffer_object because it + * describes a list of file descriptors to fix up, not an opaque + * blob of memory, and hence the kernel needs to treat it differently. + * + * An example of how this would be used is with Android's + * native_handle_t object, which is a struct with a list of integers + * and a list of file descriptors. The native_handle_t struct itself + * will be represented by a struct binder_buffer_objct, whereas the + * embedded list of file descriptors is represented by a + * struct binder_fd_array_object with that binder_buffer_object as + * a parent. 
+ */ +struct binder_fd_array_object { + struct binder_object_header hdr; + binder_size_t num_fds; + binder_size_t parent; + binder_size_t parent_offset; +}; + /* * On 64-bit platforms where user code may run in 32-bits the driver must * translate the buffer (and local binder) addresses appropriately. From 96009ab737c4e6da44a0fff72d33534583454d03 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 15 Sep 2016 16:05:40 +0530 Subject: [PATCH 574/607] ANDROID: usb: gadget: audio_source: fix comparison of distinct pointer types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use div_s64() instead of do_div() to fix following "comparison of distinct pointer types lacks a cast" warning in do_div() call in audio_send() for ARCH=arm in Linux 4.8-rc6: CC drivers/usb/gadget/function/f_audio_source.o In file included from ./arch/arm/include/asm/div64.h:126:0, from ./include/linux/kernel.h:142, from ./include/linux/list.h:8, from ./include/linux/kobject.h:20, from ./include/linux/device.h:17, from drivers/usb/gadget/function/f_audio_source.c:17: drivers/usb/gadget/function/f_audio_source.c: In function ‘audio_send’: ./include/asm-generic/div64.h:207:28: warning: comparison of distinct pointer types lacks a cast (void)(((typeof((n)) *)0) == ((uint64_t *)0)); \ ^ drivers/usb/gadget/function/f_audio_source.c:381:2: note: in expansion of macro ‘do_div’ do_div(msecs, 1000000); ^ ./include/asm-generic/div64.h:207:28: warning: comparison of distinct pointer types lacks a cast (void)(((typeof((n)) *)0) == ((uint64_t *)0)); \ ^ drivers/usb/gadget/function/f_audio_source.c:383:2: note: in expansion of macro ‘do_div’ do_div(frames, 1000); ^ LD drivers/usb/gadget/function/usb_f_audio_source.o Change-Id: Ie1a920c8948f3fc3f1263add25a402ded132fd66 Signed-off-by: Amit Pundir --- drivers/usb/gadget/function/f_audio_source.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/usb/gadget/function/f_audio_source.c b/drivers/usb/gadget/function/f_audio_source.c index bcd817439dbf..2489a5fa2685 100644 --- a/drivers/usb/gadget/function/f_audio_source.c +++ b/drivers/usb/gadget/function/f_audio_source.c @@ -377,10 +377,9 @@ static void audio_send(struct audio_dev *audio) /* compute number of frames to send */ now = ktime_get(); - msecs = ktime_to_ns(now) - ktime_to_ns(audio->start_time); - do_div(msecs, 1000000); - frames = msecs * SAMPLE_RATE; - do_div(frames, 1000); + msecs = div_s64((ktime_to_ns(now) - ktime_to_ns(audio->start_time)), + 1000000); + frames = div_s64((msecs * SAMPLE_RATE), 1000); /* Readjust our frames_sent if we fall too far behind. * If we get too far behind it is better to drop some frames than From 2182c4daf185176ac99bf71a4ae1786ea9620714 Mon Sep 17 00:00:00 2001 From: Jonathan Hamilton Date: Wed, 21 Sep 2016 12:40:51 -0700 Subject: [PATCH 575/607] ANDROID: video: adf: Avoid directly referencing user pointers Enabling KASAN on a kernel using ADF causes a number of places where user-supplied pointers to ioctls pointers are directly dereferenced without copy_from_user or access_ok. 
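To illustrate the pattern being applied (not part of this patch; the struct and
function names below are made up for the example): instead of dereferencing the
__user argument field by field, each handler now snapshots the whole argument
struct with copy_from_user() and validates the kernel-side copy, treating any
embedded pointers as user pointers in their own right:

    struct example_arg {
            __u64 buf;              /* embedded user pointer, carried as a u64 */
            __u32 n_bufs;
    };

    static long example_ioctl(struct example_arg __user *arg)
    {
            struct example_arg data;

            if (copy_from_user(&data, arg, sizeof(data)))
                    return -EFAULT;         /* never read through 'arg' directly */

            if (data.n_bufs > 16)
                    return -EINVAL;         /* validate the snapshot, not user memory */

            /* data.buf is still a user pointer; copy it too before use, e.g.
             * copy_from_user(kbuf, (void __user *)(unsigned long)data.buf, len) */
            return 0;
    }

This keeps every access to user memory behind copy_from_user()/copy_to_user(),
which addresses the accesses KASAN flagged.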
Bug: 31806036 Signed-off-by: Jonathan Hamilton Change-Id: I6e86237aaa6cec0f6e1c385336aefcc5332080ae --- drivers/video/adf/adf_fops.c | 73 +++++++++++++++--------------------- 1 file changed, 31 insertions(+), 42 deletions(-) diff --git a/drivers/video/adf/adf_fops.c b/drivers/video/adf/adf_fops.c index 8726617f73ab..705411bfaebb 100644 --- a/drivers/video/adf/adf_fops.c +++ b/drivers/video/adf/adf_fops.c @@ -132,7 +132,7 @@ static int adf_eng_get_data(struct adf_overlay_engine *eng, eng->ops->n_supported_formats)); mutex_lock(&dev->client_lock); - ret = adf_obj_copy_custom_data_to_user(&eng->base, arg->custom_data, + ret = adf_obj_copy_custom_data_to_user(&eng->base, data.custom_data, &data.custom_data_size); mutex_unlock(&dev->client_lock); @@ -144,7 +144,7 @@ static int adf_eng_get_data(struct adf_overlay_engine *eng, goto done; } - if (supported_formats && copy_to_user(arg->supported_formats, + if (supported_formats && copy_to_user(data.supported_formats, supported_formats, n_supported_formats * sizeof(supported_formats[0]))) ret = -EFAULT; @@ -220,56 +220,45 @@ static int adf_device_post_config(struct adf_device *dev, int complete_fence_fd; struct adf_buffer *bufs = NULL; struct adf_interface **intfs = NULL; - size_t n_intfs, n_bufs, i; + struct adf_post_config data; + size_t i; void *custom_data = NULL; - size_t custom_data_size; int ret = 0; + if (copy_from_user(&data, arg, sizeof(data))) + return -EFAULT; + complete_fence_fd = get_unused_fd_flags(O_CLOEXEC); if (complete_fence_fd < 0) return complete_fence_fd; - if (get_user(n_intfs, &arg->n_interfaces)) { - ret = -EFAULT; - goto err_get_user; - } - - if (n_intfs > ADF_MAX_INTERFACES) { + if (data.n_interfaces > ADF_MAX_INTERFACES) { ret = -EINVAL; goto err_get_user; } - if (get_user(n_bufs, &arg->n_bufs)) { - ret = -EFAULT; - goto err_get_user; - } - - if (n_bufs > ADF_MAX_BUFFERS) { + if (data.n_bufs > ADF_MAX_BUFFERS) { ret = -EINVAL; goto err_get_user; } - if (get_user(custom_data_size, &arg->custom_data_size)) { - ret = -EFAULT; - goto err_get_user; - } - - if (custom_data_size > ADF_MAX_CUSTOM_DATA_SIZE) { + if (data.custom_data_size > ADF_MAX_CUSTOM_DATA_SIZE) { ret = -EINVAL; goto err_get_user; } - if (n_intfs) { - intfs = kmalloc(sizeof(intfs[0]) * n_intfs, GFP_KERNEL); + if (data.n_interfaces) { + intfs = kmalloc(sizeof(intfs[0]) * data.n_interfaces, + GFP_KERNEL); if (!intfs) { ret = -ENOMEM; goto err_get_user; } } - for (i = 0; i < n_intfs; i++) { + for (i = 0; i < data.n_interfaces; i++) { u32 intf_id; - if (get_user(intf_id, &arg->interfaces[i])) { + if (get_user(intf_id, &data.interfaces[i])) { ret = -EFAULT; goto err_get_user; } @@ -281,31 +270,31 @@ static int adf_device_post_config(struct adf_device *dev, } } - if (n_bufs) { - bufs = kzalloc(sizeof(bufs[0]) * n_bufs, GFP_KERNEL); + if (data.n_bufs) { + bufs = kzalloc(sizeof(bufs[0]) * data.n_bufs, GFP_KERNEL); if (!bufs) { ret = -ENOMEM; goto err_get_user; } } - for (i = 0; i < n_bufs; i++) { - ret = adf_buffer_import(dev, &arg->bufs[i], &bufs[i]); + for (i = 0; i < data.n_bufs; i++) { + ret = adf_buffer_import(dev, &data.bufs[i], &bufs[i]); if (ret < 0) { memset(&bufs[i], 0, sizeof(bufs[i])); goto err_import; } } - if (custom_data_size) { - custom_data = kzalloc(custom_data_size, GFP_KERNEL); + if (data.custom_data_size) { + custom_data = kzalloc(data.custom_data_size, GFP_KERNEL); if (!custom_data) { ret = -ENOMEM; goto err_import; } - if (copy_from_user(custom_data, arg->custom_data, - custom_data_size)) { + if (copy_from_user(custom_data, data.custom_data, + 
data.custom_data_size)) { ret = -EFAULT; goto err_import; } @@ -316,8 +305,8 @@ static int adf_device_post_config(struct adf_device *dev, goto err_import; } - complete_fence = adf_device_post_nocopy(dev, intfs, n_intfs, bufs, - n_bufs, custom_data, custom_data_size); + complete_fence = adf_device_post_nocopy(dev, intfs, data.n_interfaces, + bufs, data.n_bufs, custom_data, data.custom_data_size); if (IS_ERR(complete_fence)) { ret = PTR_ERR(complete_fence); goto err_import; @@ -327,7 +316,7 @@ static int adf_device_post_config(struct adf_device *dev, return 0; err_import: - for (i = 0; i < n_bufs; i++) + for (i = 0; i < data.n_bufs; i++) adf_buffer_cleanup(&bufs[i]); err_get_user: @@ -481,19 +470,19 @@ static int adf_device_get_data(struct adf_device *dev, data.n_allowed_attachments); mutex_lock(&dev->client_lock); - ret = adf_obj_copy_custom_data_to_user(&dev->base, arg->custom_data, + ret = adf_obj_copy_custom_data_to_user(&dev->base, data.custom_data, &data.custom_data_size); mutex_unlock(&dev->client_lock); if (ret < 0) goto done; - ret = adf_copy_attachment_list_to_user(arg->attachments, + ret = adf_copy_attachment_list_to_user(data.attachments, data.n_attachments, attach, n_attach); if (ret < 0) goto done; - ret = adf_copy_attachment_list_to_user(arg->allowed_attachments, + ret = adf_copy_attachment_list_to_user(data.allowed_attachments, data.n_allowed_attachments, allowed_attach, n_allowed_attach); if (ret < 0) @@ -592,7 +581,7 @@ static int adf_intf_get_data(struct adf_interface *intf, data.n_available_modes = intf->n_modes; read_unlock_irqrestore(&intf->hotplug_modelist_lock, flags); - if (copy_to_user(arg->available_modes, modelist, modelist_size)) { + if (copy_to_user(data.available_modes, modelist, modelist_size)) { ret = -EFAULT; goto done; } @@ -601,7 +590,7 @@ static int adf_intf_get_data(struct adf_interface *intf, memcpy(&data.current_mode, &intf->current_mode, sizeof(intf->current_mode)); - ret = adf_obj_copy_custom_data_to_user(&intf->base, arg->custom_data, + ret = adf_obj_copy_custom_data_to_user(&intf->base, data.custom_data, &data.custom_data_size); done: mutex_unlock(&dev->client_lock); From 0a60e50f0b910612c3f9560fc3a791e70d146b0b Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 11 Aug 2015 12:34:45 +0530 Subject: [PATCH 576/607] usb: gadget: f_mtp: simplify ptp NULL pointer check Simplify MTP/PTP dev NULL pointer check introduced in Change-Id: Ic44a699d96df2e13467fc081bff88b97dcc5afb2 and restrict it to MTP/PTP function level only. Return ERR_PTR() instead of NULL from mtp_ptp function to skip doing NULL pointer checks all the way up to configfs.c Fixes: Change-Id: Ic44a699d96df2e13467fc081bff88b97dcc5afb2 ("usb: gadget: fix NULL ptr derefer while symlinking PTP func") Change-Id: Iab7c55089c115550c3506f6cca960a07ae52713d Signed-off-by: Amit Pundir --- drivers/usb/gadget/configfs.c | 5 ----- drivers/usb/gadget/function/f_mtp.c | 2 +- drivers/usb/gadget/functions.c | 2 +- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index 8df96cb3bb58..54849fe9cb01 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -425,11 +425,6 @@ static int config_usb_cfg_link( } f = usb_get_function(fi); - if (f == NULL) { - /* Are we trying to symlink PTP without MTP function? 
*/ - ret = -EINVAL; /* Invalid Configuration */ - goto out; - } if (IS_ERR(f)) { ret = PTR_ERR(f); goto out; diff --git a/drivers/usb/gadget/function/f_mtp.c b/drivers/usb/gadget/function/f_mtp.c index 74195be9b054..b9e3e97c57c4 100644 --- a/drivers/usb/gadget/function/f_mtp.c +++ b/drivers/usb/gadget/function/f_mtp.c @@ -1498,7 +1498,7 @@ struct usb_function *function_alloc_mtp_ptp(struct usb_function_instance *fi, pr_err("\t2: Create MTP function\n"); pr_err("\t3: Create and symlink PTP function" " with a gadget configuration\n"); - return NULL; + return ERR_PTR(-EINVAL); /* Invalid Configuration */ } dev = fi_mtp->dev; diff --git a/drivers/usb/gadget/functions.c b/drivers/usb/gadget/functions.c index 389c1f3d0fee..b13f839e7368 100644 --- a/drivers/usb/gadget/functions.c +++ b/drivers/usb/gadget/functions.c @@ -58,7 +58,7 @@ struct usb_function *usb_get_function(struct usb_function_instance *fi) struct usb_function *f; f = fi->fd->alloc_func(fi); - if ((f == NULL) || IS_ERR(f)) + if (IS_ERR(f)) return f; f->fi = fi; return f; From ab2bff9a8f87be3710edef3f2f17dee78962a6a1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 13 May 2016 09:34:12 -0400 Subject: [PATCH 577/607] UPSTREAM: ring-buffer: Prevent overflow of size in ring_buffer_resize() (Cherry picked from commit 59643d1535eb220668692a5359de22545af579f6) If the size passed to ring_buffer_resize() is greater than MAX_LONG - BUF_PAGE_SIZE then the DIV_ROUND_UP() will return zero. Here's the details: # echo 18014398509481980 > /sys/kernel/debug/tracing/buffer_size_kb tracing_entries_write() processes this and converts kb to bytes. 18014398509481980 << 10 = 18446744073709547520 and this is passed to ring_buffer_resize() as unsigned long size. size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); Where DIV_ROUND_UP(a, b) is (a + b - 1)/b BUF_PAGE_SIZE is 4080 and here 18446744073709547520 + 4080 - 1 = 18446744073709551599 where 18446744073709551599 is still smaller than 2^64 2^64 - 18446744073709551599 = 17 But now 18446744073709551599 / 4080 = 4521260802379792 and size = size * 4080 = 18446744073709551360 This is checked to make sure its still greater than 2 * 4080, which it is. Then we convert to the number of buffer pages needed. nr_page = DIV_ROUND_UP(size, BUF_PAGE_SIZE) but this time size is 18446744073709551360 and 2^64 - (18446744073709551360 + 4080 - 1) = -3823 Thus it overflows and the resulting number is less than 4080, which makes 3823 / 4080 = 0 an nr_pages is set to this. As we already checked against the minimum that nr_pages may be, this causes the logic to fail as well, and we crash the kernel. There's no reason to have the two DIV_ROUND_UP() (that's just result of historical code changes), clean up the code and fix this bug. 
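The wrap-around is easy to reproduce outside the kernel. A minimal userspace
sketch of the same arithmetic (assuming a 64-bit unsigned long, with
BUF_PAGE_SIZE and DIV_ROUND_UP defined as in the ring buffer code):

    #include <stdio.h>

    #define BUF_PAGE_SIZE      4080UL
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
            unsigned long size = 18014398509481980UL << 10; /* buffer_size_kb in bytes */
            unsigned long nr_pages;

            /* old code: round size up to a page multiple first ... */
            size = DIV_ROUND_UP(size, BUF_PAGE_SIZE) * BUF_PAGE_SIZE;
            /* ... so size + BUF_PAGE_SIZE - 1 now wraps past 2^64 */
            nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
            printf("old nr_pages = %lu\n", nr_pages);       /* prints 0 */

            /* new code: a single DIV_ROUND_UP on the original size */
            size = 18014398509481980UL << 10;
            nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
            if (nr_pages < 2)
                    nr_pages = 2;
            printf("new nr_pages = %lu\n", nr_pages);       /* prints 4521260802379792 */
            return 0;
    }

With the fix, nr_pages stays huge but non-zero, so the request fails in the
allocator instead of feeding a zero page count into the resize logic.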
Cc: stable@vger.kernel.org # 3.5+ Fixes: 83f40318dab00 ("ring-buffer: Make removal of ring buffer pages atomic") Signed-off-by: Steven Rostedt Signed-off-by: Willy Tarreau Change-Id: I1147672317a3ad0fc995b1f32baaa050a7976ac4 Bug: 32659848 --- kernel/trace/ring_buffer.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9c6045a27ba3..7fa26a57a847 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1665,14 +1665,13 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, !cpumask_test_cpu(cpu_id, buffer->cpumask)) return size; - size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); - size *= BUF_PAGE_SIZE; + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); /* we need a minimum of two pages */ - if (size < BUF_PAGE_SIZE * 2) - size = BUF_PAGE_SIZE * 2; + if (nr_pages < 2) + nr_pages = 2; - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + size = nr_pages * BUF_PAGE_SIZE; /* * Don't succeed if resizing is disabled, as a reader might be From 6dc8c51a76e57ae01fa087bdb5450423ce76a373 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Nov 2016 11:58:52 +0530 Subject: [PATCH 578/607] cpufreq: sched: Fix kernel crash on accessing sysfs file If the cpufreq driver hasn't set the CPUFREQ_HAVE_GOVERNOR_PER_POLICY flag, then the kernel will crash on accessing sysfs files for the sched governor. CPUFreq governors we can have the governor specific sysfs files in two places: A. /sys/devices/system/cpu/cpuX/cpufreq/ B. /sys/devices/system/cpu/cpufreq/ The case A. is for governor per policy case, where we can control the governor tunables for each policy separately. The case B. is for system wide tunable values. The schedfreq governor only implements the case A. and not B. The sysfs files in case B will still be present in /sys/devices/system/cpu/cpufreq/, but accessing them will crash kernel as the governor doesn't support that. Moreover the sched governor is pretty new and will be used only for the ARM platforms and there is no need to support the case B at all. Hence use policy->kobj instead of get_governor_parent_kobj(), so that we always create the sysfs files in path A. 
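For context, a simplified sketch of what get_governor_parent_kobj() resolves to
in the cpufreq core (paraphrased from drivers/cpufreq/cpufreq.c; this is not
code added by this patch):

    struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy)
    {
            if (have_governor_per_policy())         /* CPUFREQ_HAVE_GOVERNOR_PER_POLICY */
                    return &policy->kobj;           /* case A: per-policy directory */
            return cpufreq_global_kobject;          /* case B: system-wide directory */
    }

Without the flag, the helper hands back the global kobject, so the schedfreq
attributes land in case B even though the governor only allocates per-policy
tunables; pointing sysfs_create_group()/sysfs_remove_group() at &policy->kobj
pins them to case A.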
Signed-off-by: Viresh Kumar --- kernel/sched/cpufreq_sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index f6f9b9b3a4a8..d751bc2d0d6e 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -289,7 +289,7 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) pr_debug("%s: throttle threshold = %u [ns]\n", __func__, gd->up_throttle_nsec); - rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + rc = sysfs_create_group(&policy->kobj, get_sysfs_attr()); if (rc) { pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc); goto err; @@ -332,7 +332,7 @@ static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) put_task_struct(gd->task); } - sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + sysfs_remove_group(&policy->kobj, get_sysfs_attr()); policy->governor_data = NULL; From ff458c4fca9797d314a024bad05b59831eff3f58 Mon Sep 17 00:00:00 2001 From: Anson Jacob Date: Fri, 11 Nov 2016 01:10:04 -0500 Subject: [PATCH 579/607] ANDROID: usb: gadget: function: cleanup: Add blank line after declaration Fix warning generated by checkpatch.pl: Missing a blank line after declarations Change-Id: Id129bb8cc8fa37c67a647e2e5996bb2817020e65 Signed-off-by: Anson Jacob --- drivers/usb/gadget/function/f_accessory.c | 2 ++ drivers/usb/gadget/function/f_audio_source.c | 1 + drivers/usb/gadget/function/f_mtp.c | 4 ++++ 3 files changed, 7 insertions(+) diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c index 2ca16a577542..9d3ec0e37475 100644 --- a/drivers/usb/gadget/function/f_accessory.c +++ b/drivers/usb/gadget/function/f_accessory.c @@ -211,6 +211,7 @@ static inline struct acc_dev *func_to_dev(struct usb_function *f) static struct usb_request *acc_request_new(struct usb_ep *ep, int buffer_size) { struct usb_request *req = usb_ep_alloc_request(ep, GFP_KERNEL); + if (!req) return NULL; @@ -1021,6 +1022,7 @@ acc_function_unbind(struct usb_configuration *c, struct usb_function *f) static void acc_start_work(struct work_struct *data) { char *envp[2] = { "ACCESSORY=START", NULL }; + kobject_uevent_env(&acc_device.this_device->kobj, KOBJ_CHANGE, envp); } diff --git a/drivers/usb/gadget/function/f_audio_source.c b/drivers/usb/gadget/function/f_audio_source.c index 2489a5fa2685..db7903d19c43 100644 --- a/drivers/usb/gadget/function/f_audio_source.c +++ b/drivers/usb/gadget/function/f_audio_source.c @@ -310,6 +310,7 @@ static struct device_attribute *audio_source_function_attributes[] = { static struct usb_request *audio_request_new(struct usb_ep *ep, int buffer_size) { struct usb_request *req = usb_ep_alloc_request(ep, GFP_KERNEL); + if (!req) return NULL; diff --git a/drivers/usb/gadget/function/f_mtp.c b/drivers/usb/gadget/function/f_mtp.c index b9e3e97c57c4..d8b69af6e335 100644 --- a/drivers/usb/gadget/function/f_mtp.c +++ b/drivers/usb/gadget/function/f_mtp.c @@ -361,6 +361,7 @@ static inline struct mtp_dev *func_to_mtp(struct usb_function *f) static struct usb_request *mtp_request_new(struct usb_ep *ep, int buffer_size) { struct usb_request *req = usb_ep_alloc_request(ep, GFP_KERNEL); + if (!req) return NULL; @@ -1121,6 +1122,7 @@ static int mtp_ctrlrequest(struct usb_composite_dev *cdev, } else if (ctrl->bRequest == MTP_REQ_GET_DEVICE_STATUS && w_index == 0 && w_value == 0) { struct mtp_device_status *status = cdev->req->buf; + status->wLength = 
__constant_cpu_to_le16(sizeof(*status)); @@ -1143,6 +1145,7 @@ static int mtp_ctrlrequest(struct usb_composite_dev *cdev, /* respond with data transfer or status phase? */ if (value >= 0) { int rc; + cdev->req->zero = value < w_length; cdev->req->length = value; rc = usb_ep_queue(cdev->gadget->ep0, cdev->req, GFP_ATOMIC); @@ -1378,6 +1381,7 @@ static struct mtp_instance *to_mtp_instance(struct config_item *item) static void mtp_attr_release(struct config_item *item) { struct mtp_instance *fi_mtp = to_mtp_instance(item); + usb_put_function_instance(&fi_mtp->func_inst); } From 1a8b993d92021b4f81da4b219a4214bdec06f541 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Fri, 26 Feb 2016 19:00:03 +0000 Subject: [PATCH 580/607] BACKPORT: staging: goldfish: audio: add devicetree bindings Introduce devicetree bindings to the Goldfish staging audio driver. Signed-off-by: Greg Hackmann Signed-off-by: Jin Qian Signed-off-by: Alan Cox Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 283ded10312a3b75e384313f6f529ec2c636cf2c) Change-Id: Ib75d3a4cac7353084a8da18a96fb298a759bacc0 --- .../devicetree/bindings/goldfish/audio.txt | 17 +++++++++++++++++ drivers/staging/goldfish/goldfish_audio.c | 9 ++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 Documentation/devicetree/bindings/goldfish/audio.txt diff --git a/Documentation/devicetree/bindings/goldfish/audio.txt b/Documentation/devicetree/bindings/goldfish/audio.txt new file mode 100644 index 000000000000..d043fda433ba --- /dev/null +++ b/Documentation/devicetree/bindings/goldfish/audio.txt @@ -0,0 +1,17 @@ +Android Goldfish Audio + +Android goldfish audio device generated by android emulator. + +Required properties: + +- compatible : should contain "google,goldfish-audio" to match emulator +- reg : +- interrupts : + +Example: + + goldfish_audio@9030000 { + compatible = "google,goldfish-audio"; + reg = <0x9030000 0x100>; + interrupts = <0x4>; + }; diff --git a/drivers/staging/goldfish/goldfish_audio.c b/drivers/staging/goldfish/goldfish_audio.c index b0927e49d0a8..8841680ced9f 100644 --- a/drivers/staging/goldfish/goldfish_audio.c +++ b/drivers/staging/goldfish/goldfish_audio.c @@ -344,11 +344,18 @@ static int goldfish_audio_remove(struct platform_device *pdev) return 0; } +static const struct of_device_id goldfish_audio_of_match[] = { + { .compatible = "google,goldfish-audio", }, + {}, +}; +MODULE_DEVICE_TABLE(of, goldfish_audio_of_match); + static struct platform_driver goldfish_audio_driver = { .probe = goldfish_audio_probe, .remove = goldfish_audio_remove, .driver = { - .name = "goldfish_audio" + .name = "goldfish_audio", + .of_match_table = goldfish_audio_of_match, } }; From ac4d96f09a06c127d9d7509efbe6403a6b19d208 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Fri, 26 Feb 2016 18:45:30 +0000 Subject: [PATCH 581/607] BACKPORT: power: goldfish_battery: add devicetree bindings Add device tree bindings to the Goldfish virtual platform battery drivers. 
Signed-off-by: Greg Hackmann Signed-off-by: Jin Qian Signed-off-by: Alan Cox Signed-off-by: Sebastian Reichel (cherry picked from commit 65d687a7b7d6f27e4306fe8cc8a1ca66a1a760f6) Change-Id: If947ea3341ff0cb713c56e14d18d51a3f5912b64 --- .../devicetree/bindings/goldfish/battery.txt | 17 +++++++++++++++++ drivers/power/goldfish_battery.c | 9 ++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 Documentation/devicetree/bindings/goldfish/battery.txt diff --git a/Documentation/devicetree/bindings/goldfish/battery.txt b/Documentation/devicetree/bindings/goldfish/battery.txt new file mode 100644 index 000000000000..4fb613933214 --- /dev/null +++ b/Documentation/devicetree/bindings/goldfish/battery.txt @@ -0,0 +1,17 @@ +Android Goldfish Battery + +Android goldfish battery device generated by android emulator. + +Required properties: + +- compatible : should contain "google,goldfish-battery" to match emulator +- reg : +- interrupts : + +Example: + + goldfish_battery@9020000 { + compatible = "google,goldfish-battery"; + reg = <0x9020000 0x1000>; + interrupts = <0x3>; + }; diff --git a/drivers/power/goldfish_battery.c b/drivers/power/goldfish_battery.c index a50bb988c69a..7510796e190c 100644 --- a/drivers/power/goldfish_battery.c +++ b/drivers/power/goldfish_battery.c @@ -227,11 +227,18 @@ static int goldfish_battery_remove(struct platform_device *pdev) return 0; } +static const struct of_device_id goldfish_battery_of_match[] = { + { .compatible = "google,goldfish-battery", }, + {}, +}; +MODULE_DEVICE_TABLE(of, goldfish_battery_of_match); + static struct platform_driver goldfish_battery_device = { .probe = goldfish_battery_probe, .remove = goldfish_battery_remove, .driver = { - .name = "goldfish-battery" + .name = "goldfish-battery", + .of_match_table = goldfish_battery_of_match, } }; module_platform_driver(goldfish_battery_device); From aec62280ca9d6091b08c979284ae41379549d506 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Fri, 26 Feb 2016 12:05:02 -0800 Subject: [PATCH 582/607] BACKPORT: Input: goldfish_events - add devicetree bindings Add device tree bindings to the Goldfish virtual platform event driver. Signed-off-by: Greg Hackmann Signed-off-by: Jin Qian Signed-off-by: Alan Signed-off-by: Dmitry Torokhov (cherry picked from commit 8c5dc5a1ada2b79259e55a4bd150135d23529c6a) Change-Id: I677d8e0d92294f53f7cc5a79300b6462b65e8aad --- .../devicetree/bindings/goldfish/events.txt | 17 +++++++++++++++++ drivers/input/keyboard/goldfish_events.c | 7 +++++++ 2 files changed, 24 insertions(+) create mode 100644 Documentation/devicetree/bindings/goldfish/events.txt diff --git a/Documentation/devicetree/bindings/goldfish/events.txt b/Documentation/devicetree/bindings/goldfish/events.txt new file mode 100644 index 000000000000..5babf46317a4 --- /dev/null +++ b/Documentation/devicetree/bindings/goldfish/events.txt @@ -0,0 +1,17 @@ +Android Goldfish Events Keypad + +Android goldfish events keypad device generated by android emulator. 
+ +Required properties: + +- compatible : should contain "google,goldfish-events-keypad" to match emulator +- reg : +- interrupts : + +Example: + + goldfish-events@9040000 { + compatible = "google,goldfish-events-keypad"; + reg = <0x9040000 0x1000>; + interrupts = <0x5>; + }; diff --git a/drivers/input/keyboard/goldfish_events.c b/drivers/input/keyboard/goldfish_events.c index 907e4e278fce..b11d218604a7 100644 --- a/drivers/input/keyboard/goldfish_events.c +++ b/drivers/input/keyboard/goldfish_events.c @@ -178,10 +178,17 @@ static int events_probe(struct platform_device *pdev) return 0; } +static const struct of_device_id goldfish_events_of_match[] = { + { .compatible = "google,goldfish-events-keypad", }, + {}, +}; +MODULE_DEVICE_TABLE(of, goldfish_events_of_match); + static struct platform_driver events_driver = { .probe = events_probe, .driver = { .name = "goldfish_events", + .of_match_table = goldfish_events_of_match, }, }; From 3f5a380007fec7d4f6a0f361cf9da78a206e53c1 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Fri, 26 Feb 2016 19:01:05 +0000 Subject: [PATCH 583/607] BACKPORT: tty: goldfish: support platform_device with id -1 When the platform bus sets the platform_device id to -1 (PLATFORM_DEVID_NONE), use an incrementing counter for the TTY index instead Signed-off-by: Greg Hackmann Signed-off-by: Jin Qian Signed-off-by: Alan Cox Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 465893e18878e119d8d0255439fad8debbd646fd) Change-Id: Ifec5ee9d71c7c076e59bb7af77c0184d1b1383cb --- drivers/tty/goldfish.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/tty/goldfish.c b/drivers/tty/goldfish.c index 0f82c0b146f6..4356a23c588f 100644 --- a/drivers/tty/goldfish.c +++ b/drivers/tty/goldfish.c @@ -68,8 +68,7 @@ static void goldfish_tty_do_write(int line, const char *buf, unsigned count) static irqreturn_t goldfish_tty_interrupt(int irq, void *dev_id) { - struct platform_device *pdev = dev_id; - struct goldfish_tty *qtty = &goldfish_ttys[pdev->id]; + struct goldfish_tty *qtty = dev_id; void __iomem *base = qtty->base; unsigned long irq_flags; unsigned char *buf; @@ -233,6 +232,7 @@ static int goldfish_tty_probe(struct platform_device *pdev) struct device *ttydev; void __iomem *base; u32 irq; + unsigned int line; r = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (r == NULL) @@ -248,10 +248,16 @@ static int goldfish_tty_probe(struct platform_device *pdev) irq = r->start; - if (pdev->id >= goldfish_tty_line_count) - goto err_unmap; - mutex_lock(&goldfish_tty_lock); + + if (pdev->id == PLATFORM_DEVID_NONE) + line = goldfish_tty_current_line_count; + else + line = pdev->id; + + if (line >= goldfish_tty_line_count) + goto err_create_driver_failed; + if (goldfish_tty_current_line_count == 0) { ret = goldfish_tty_create_driver(); if (ret) @@ -259,7 +265,7 @@ static int goldfish_tty_probe(struct platform_device *pdev) } goldfish_tty_current_line_count++; - qtty = &goldfish_ttys[pdev->id]; + qtty = &goldfish_ttys[line]; spin_lock_init(&qtty->lock); tty_port_init(&qtty->port); qtty->port.ops = &goldfish_port_ops; @@ -269,13 +275,13 @@ static int goldfish_tty_probe(struct platform_device *pdev) writel(GOLDFISH_TTY_CMD_INT_DISABLE, base + GOLDFISH_TTY_CMD); ret = request_irq(irq, goldfish_tty_interrupt, IRQF_SHARED, - "goldfish_tty", pdev); + "goldfish_tty", qtty); if (ret) goto err_request_irq_failed; ttydev = tty_port_register_device(&qtty->port, goldfish_tty_driver, - pdev->id, &pdev->dev); + line, &pdev->dev); if 
(IS_ERR(ttydev)) { ret = PTR_ERR(ttydev); goto err_tty_register_device_failed; @@ -286,8 +292,9 @@ static int goldfish_tty_probe(struct platform_device *pdev) qtty->console.device = goldfish_tty_console_device; qtty->console.setup = goldfish_tty_console_setup; qtty->console.flags = CON_PRINTBUFFER; - qtty->console.index = pdev->id; + qtty->console.index = line; register_console(&qtty->console); + platform_set_drvdata(pdev, qtty); mutex_unlock(&goldfish_tty_lock); return 0; @@ -307,13 +314,12 @@ err_unmap: static int goldfish_tty_remove(struct platform_device *pdev) { - struct goldfish_tty *qtty; + struct goldfish_tty *qtty = platform_get_drvdata(pdev); mutex_lock(&goldfish_tty_lock); - qtty = &goldfish_ttys[pdev->id]; unregister_console(&qtty->console); - tty_unregister_device(goldfish_tty_driver, pdev->id); + tty_unregister_device(goldfish_tty_driver, qtty->console.index); iounmap(qtty->base); qtty->base = NULL; free_irq(qtty->irq, pdev); From b58abfa8bd400855bf09c84ae27dd2a4446c60f9 Mon Sep 17 00:00:00 2001 From: Miodrag Dinic Date: Fri, 26 Feb 2016 19:00:44 +0000 Subject: [PATCH 584/607] BACKPORT: drivers: tty: goldfish: Add device tree bindings Enable support for registering this device using the device tree. Device tree node example for registering Goldfish TTY device : goldfish_tty@1f004000 { interrupts = <0xc>; reg = <0x1f004000 0x1000>; compatible = "google,goldfish-tty"; }; Signed-off-by: Miodrag Dinic Signed-off-by: Jin Qian Signed-off-by: Alan Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 9b883eea26ccf043b608e398cf6a26231d44f5fb) Change-Id: Idbe1bbac4f371e2feb6730712b08b66be1188ea7 --- .../devicetree/bindings/goldfish/tty.txt | 17 +++++++++++++++++ drivers/tty/goldfish.c | 10 +++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 Documentation/devicetree/bindings/goldfish/tty.txt diff --git a/Documentation/devicetree/bindings/goldfish/tty.txt b/Documentation/devicetree/bindings/goldfish/tty.txt new file mode 100644 index 000000000000..82648278da77 --- /dev/null +++ b/Documentation/devicetree/bindings/goldfish/tty.txt @@ -0,0 +1,17 @@ +Android Goldfish TTY + +Android goldfish tty device generated by android emulator. + +Required properties: + +- compatible : should contain "google,goldfish-tty" to match emulator +- reg : +- interrupts : + +Example: + + goldfish_tty@1f004000 { + compatible = "google,goldfish-tty"; + reg = <0x1f004000 0x1000>; + interrupts = <0xc>; + }; diff --git a/drivers/tty/goldfish.c b/drivers/tty/goldfish.c index 4356a23c588f..1e332855b933 100644 --- a/drivers/tty/goldfish.c +++ b/drivers/tty/goldfish.c @@ -330,11 +330,19 @@ static int goldfish_tty_remove(struct platform_device *pdev) return 0; } +static const struct of_device_id goldfish_tty_of_match[] = { + { .compatible = "google,goldfish-tty", }, + {}, +}; + +MODULE_DEVICE_TABLE(of, goldfish_tty_of_match); + static struct platform_driver goldfish_tty_platform_driver = { .probe = goldfish_tty_probe, .remove = goldfish_tty_remove, .driver = { - .name = "goldfish_tty" + .name = "goldfish_tty", + .of_match_table = goldfish_tty_of_match, } }; From 779a63ef878a4d8cd601e791fdf81df9fd5f953a Mon Sep 17 00:00:00 2001 From: Yu Ning Date: Tue, 1 Mar 2016 23:46:10 +0000 Subject: [PATCH 585/607] BACKPORT: goldfish: Enable ACPI-based enumeration for goldfish battery Add the ACPI bindings to the goldfish battery driver. 
Signed-off-by: Yu Ning Signed-off-by: Jin Qian Signed-off-by: Alan Cox Signed-off-by: Sebastian Reichel (cherry picked from commit fdb2f37a54470473c6b7c9d680c4c114dd9bc434) Change-Id: I3b53481b5868b0b26848397420c9ba16a747819f --- drivers/power/goldfish_battery.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/power/goldfish_battery.c b/drivers/power/goldfish_battery.c index 7510796e190c..f5c525e4482a 100644 --- a/drivers/power/goldfish_battery.c +++ b/drivers/power/goldfish_battery.c @@ -24,6 +24,7 @@ #include #include #include +#include struct goldfish_battery_data { void __iomem *reg_base; @@ -233,12 +234,19 @@ static const struct of_device_id goldfish_battery_of_match[] = { }; MODULE_DEVICE_TABLE(of, goldfish_battery_of_match); +static const struct acpi_device_id goldfish_battery_acpi_match[] = { + { "GFSH0001", 0 }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, goldfish_battery_acpi_match); + static struct platform_driver goldfish_battery_device = { .probe = goldfish_battery_probe, .remove = goldfish_battery_remove, .driver = { .name = "goldfish-battery", .of_match_table = goldfish_battery_of_match, + .acpi_match_table = ACPI_PTR(goldfish_battery_acpi_match), } }; module_platform_driver(goldfish_battery_device); From 212048596dbaa5e81386c9594782c4928ed6efa1 Mon Sep 17 00:00:00 2001 From: Jason Hu Date: Fri, 26 Feb 2016 12:06:47 -0800 Subject: [PATCH 586/607] BACKPORT: Input: goldfish_events - enable ACPI-based enumeration for goldfish events Add ACPI binding to the goldfish events driver. Signed-off-by: Jason Hu Signed-off-by: Jin Qian Signed-off-by: Alan Signed-off-by: Dmitry Torokhov (cherry picked from commit 0581ce09fd2c976125a20791268d7206db156d2f) Change-Id: Ic3e4f1cffb111ea6c69977e63dd598e3fcb55f19 --- drivers/input/keyboard/goldfish_events.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/input/keyboard/goldfish_events.c b/drivers/input/keyboard/goldfish_events.c index b11d218604a7..f6e643b589b6 100644 --- a/drivers/input/keyboard/goldfish_events.c +++ b/drivers/input/keyboard/goldfish_events.c @@ -22,6 +22,7 @@ #include #include #include +#include enum { REG_READ = 0x00, @@ -184,11 +185,20 @@ static const struct of_device_id goldfish_events_of_match[] = { }; MODULE_DEVICE_TABLE(of, goldfish_events_of_match); +#ifdef CONFIG_ACPI +static const struct acpi_device_id goldfish_events_acpi_match[] = { + { "GFSH0002", 0 }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, goldfish_events_acpi_match); +#endif + static struct platform_driver events_driver = { .probe = events_probe, .driver = { .name = "goldfish_events", .of_match_table = goldfish_events_of_match, + .acpi_match_table = ACPI_PTR(goldfish_events_acpi_match), }, }; From f3d949fd57efc8d8a868517e0f64b77662388417 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Fri, 26 Feb 2016 19:00:18 +0000 Subject: [PATCH 587/607] BACKPORT: staging: goldfish: audio: fix compiliation on arm We do actually need slab.h, by luck we get it on other platforms but not always on ARM. Include it properly. 
Signed-off-by: Greg Hackmann Signed-off-by: Jin Qian Signed-off-by: Alan Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 4532150762ceb0d6fd765ebcb3ba6966fbb8faab) Change-Id: I93a0c35da40f26aaa7c253e3c0cefaa883ea3391 --- drivers/staging/goldfish/goldfish_audio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/staging/goldfish/goldfish_audio.c b/drivers/staging/goldfish/goldfish_audio.c index 8841680ced9f..4191054b878e 100644 --- a/drivers/staging/goldfish/goldfish_audio.c +++ b/drivers/staging/goldfish/goldfish_audio.c @@ -26,6 +26,7 @@ #include #include #include +#include #include MODULE_AUTHOR("Google, Inc."); From 95f38dcf1d106e62e97f207344978d6230a9c292 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Mon, 28 Oct 2013 15:33:33 -0700 Subject: [PATCH 588/607] ANDROID: video: goldfishfb: add devicetree bindings Change-Id: I5f4ba861b981edf39af537001f8ac72202927031 Signed-off-by: Greg Hackmann --- drivers/video/fbdev/goldfishfb.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/video/fbdev/goldfishfb.c b/drivers/video/fbdev/goldfishfb.c index 7f6c9e6cfc6c..f0e651bbbd61 100644 --- a/drivers/video/fbdev/goldfishfb.c +++ b/drivers/video/fbdev/goldfishfb.c @@ -304,12 +304,19 @@ static int goldfish_fb_remove(struct platform_device *pdev) return 0; } +static const struct of_device_id goldfish_fb_of_match[] = { + { .compatible = "google,goldfish-fb", }, + {}, +}; +MODULE_DEVICE_TABLE(of, goldfish_fb_of_match); static struct platform_driver goldfish_fb_driver = { .probe = goldfish_fb_probe, .remove = goldfish_fb_remove, .driver = { - .name = "goldfish_fb" + .name = "goldfish_fb", + .owner = THIS_MODULE, + .of_match_table = goldfish_fb_of_match, } }; From 9cec6e315327318d99542f5ce695f62351cc2886 Mon Sep 17 00:00:00 2001 From: Yu Ning Date: Thu, 12 Feb 2015 11:44:40 +0800 Subject: [PATCH 589/607] ANDROID: goldfish: Enable ACPI-based enumeration for goldfish framebuffer Follow the same way in which ACPI was enabled for goldfish battery. See commit d3be10e for details. Note that this patch also depends on commit af33cac. Change-Id: Ic63b6e7e0a4b9896ef9a9d0ed135a7796a4c1fdb Signed-off-by: Yu Ning --- drivers/video/fbdev/goldfishfb.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/video/fbdev/goldfishfb.c b/drivers/video/fbdev/goldfishfb.c index f0e651bbbd61..58b33e4af16e 100644 --- a/drivers/video/fbdev/goldfishfb.c +++ b/drivers/video/fbdev/goldfishfb.c @@ -26,6 +26,7 @@ #include #include #include +#include enum { FB_GET_WIDTH = 0x00, @@ -310,6 +311,12 @@ static const struct of_device_id goldfish_fb_of_match[] = { }; MODULE_DEVICE_TABLE(of, goldfish_fb_of_match); +static const struct acpi_device_id goldfish_fb_acpi_match[] = { + { "GFSH0004", 0 }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, goldfish_fb_acpi_match); + static struct platform_driver goldfish_fb_driver = { .probe = goldfish_fb_probe, .remove = goldfish_fb_remove, @@ -317,6 +324,7 @@ static struct platform_driver goldfish_fb_driver = { .name = "goldfish_fb", .owner = THIS_MODULE, .of_match_table = goldfish_fb_of_match, + .acpi_match_table = ACPI_PTR(goldfish_fb_acpi_match), } }; From a666500dd7205043280ed38c2bca81854b4067be Mon Sep 17 00:00:00 2001 From: Yu Ning Date: Tue, 31 Mar 2015 14:41:48 +0800 Subject: [PATCH 590/607] ANDROID: goldfish: Enable ACPI-based enumeration for goldfish audio Follow the same way in which ACPI was enabled for goldfish battery. See commit d3be10e for details. 
Change-Id: I6ffe38ebc80fb8af8322152370b9d1fd227eaf50 Signed-off-by: Yu Ning --- drivers/staging/goldfish/goldfish_audio.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/staging/goldfish/goldfish_audio.c b/drivers/staging/goldfish/goldfish_audio.c index 4191054b878e..83bbe459b93a 100644 --- a/drivers/staging/goldfish/goldfish_audio.c +++ b/drivers/staging/goldfish/goldfish_audio.c @@ -28,6 +28,7 @@ #include #include #include +#include MODULE_AUTHOR("Google, Inc."); MODULE_DESCRIPTION("Android QEMU Audio Driver"); @@ -351,12 +352,19 @@ static const struct of_device_id goldfish_audio_of_match[] = { }; MODULE_DEVICE_TABLE(of, goldfish_audio_of_match); +static const struct acpi_device_id goldfish_audio_acpi_match[] = { + { "GFSH0005", 0 }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, goldfish_audio_acpi_match); + static struct platform_driver goldfish_audio_driver = { .probe = goldfish_audio_probe, .remove = goldfish_audio_remove, .driver = { .name = "goldfish_audio", .of_match_table = goldfish_audio_of_match, + .acpi_match_table = ACPI_PTR(goldfish_audio_acpi_match), } }; From d5a5ff91f852f26d7261cb31cb126f267567af24 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 19 Jun 2014 16:24:04 +0200 Subject: [PATCH 591/607] ANDROID: goldfish_fb: Set pixclock = 0 User space Android code identifies pixclock == 0 as a sign for emulation and will set the frame rate to 60 fps when reading this value, which is the desired outcome. Change-Id: I759bf518bf6683446bc786bf1be3cafa02dd8d42 Signed-off-by: Christoffer Dall Signed-off-by: Peter Maydell --- drivers/video/fbdev/goldfishfb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/goldfishfb.c b/drivers/video/fbdev/goldfishfb.c index 58b33e4af16e..131fee0341c3 100644 --- a/drivers/video/fbdev/goldfishfb.c +++ b/drivers/video/fbdev/goldfishfb.c @@ -235,7 +235,7 @@ static int goldfish_fb_probe(struct platform_device *pdev) fb->fb.var.activate = FB_ACTIVATE_NOW; fb->fb.var.height = readl(fb->reg_base + FB_GET_PHYS_HEIGHT); fb->fb.var.width = readl(fb->reg_base + FB_GET_PHYS_WIDTH); - fb->fb.var.pixclock = 10000; + fb->fb.var.pixclock = 0; fb->fb.var.red.offset = 11; fb->fb.var.red.length = 5; From 8bf12bc1b78dac6cb4fb7e4fbc0920978d17f5ea Mon Sep 17 00:00:00 2001 From: Lingfeng Yang Date: Fri, 18 Dec 2015 12:04:43 -0800 Subject: [PATCH 592/607] ANDROID: goldfish_events: no extra EV_SYN; register goldfish If we send SYN_REPORT on every single multitouch event, it breaks the multitouch. The multitouch becomes janky and having to click 2-3 times to do stuff (plus randomly activating notification bars when not clicking) If we suppress these SYN_REPORTS, multitouch will work fine, plus the events will have a protocol that looks nice. In addition, we need to register Goldfish Events as a multitouch device by issuing input_mt_init_slots, otherwise input_handle_abs_event in drivers/input/input.c will silently drop all ABS_MT_SLOT events, making it so that touches with more than 1 finger do not work properly. 
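For reference (illustrative only, not part of this change): with the type-B
multitouch protocol a driver groups all per-slot events belonging to one frame
and emits a single SYN_REPORT at the end, roughly:

    #include <linux/input/mt.h>

    /* hypothetical helper reporting one two-finger frame */
    static void report_two_finger_frame(struct input_dev *dev)
    {
            input_mt_slot(dev, 0);
            input_mt_report_slot_state(dev, MT_TOOL_FINGER, true);
            input_report_abs(dev, ABS_MT_POSITION_X, 120);
            input_report_abs(dev, ABS_MT_POSITION_Y, 340);

            input_mt_slot(dev, 1);
            input_mt_report_slot_state(dev, MT_TOOL_FINGER, true);
            input_report_abs(dev, ABS_MT_POSITION_X, 480);
            input_report_abs(dev, ABS_MT_POSITION_Y, 200);

            input_sync(dev);        /* one SYN_REPORT closes the whole frame */
    }

Forcing a SYN_REPORT after every single event chops each frame into fragments,
which is the jank described above; and without input_mt_init_slots() the input
core drops the ABS_MT_SLOT events entirely.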
Signed-off-by: "Lingfeng Yang" Change-Id: Ib2350f7d1732449d246f6f0d9b7b08f02cc7c2dd (cherry picked from commit 6cf40d0a16330e1ef42bdf07d9aba6c16ee11fbc) --- drivers/input/keyboard/goldfish_events.c | 28 +++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/input/keyboard/goldfish_events.c b/drivers/input/keyboard/goldfish_events.c index f6e643b589b6..c877e56a9bd5 100644 --- a/drivers/input/keyboard/goldfish_events.c +++ b/drivers/input/keyboard/goldfish_events.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +25,8 @@ #include #include +#define GOLDFISH_MAX_FINGERS 5 + enum { REG_READ = 0x00, REG_SET_PAGE = 0x00, @@ -52,7 +55,21 @@ static irqreturn_t events_interrupt(int irq, void *dev_id) value = __raw_readl(edev->addr + REG_READ); input_event(edev->input, type, code, value); - input_sync(edev->input); + // Send an extra (EV_SYN, SYN_REPORT, 0x0) event + // if a key was pressed. Some keyboard device + // drivers may only send the EV_KEY event and + // not EV_SYN. + // Note that sending an extra SYN_REPORT is not + // necessary nor correct protocol with other + // devices such as touchscreens, which will send + // their own SYN_REPORT's when sufficient event + // information has been collected (e.g., for + // touchscreens, when pressure and X/Y coordinates + // have been received). Hence, we will only send + // this extra SYN_REPORT if type == EV_KEY. + if (type == EV_KEY) { + input_sync(edev->input); + } return IRQ_HANDLED; } @@ -154,6 +171,15 @@ static int events_probe(struct platform_device *pdev) input_dev->name = edev->name; input_dev->id.bustype = BUS_HOST; + // Set the Goldfish Device to be multi-touch. + // In the Ranchu kernel, there is multi-touch-specific + // code for handling ABS_MT_SLOT events. + // See drivers/input/input.c:input_handle_abs_event. + // If we do not issue input_mt_init_slots, + // the kernel will filter out needed ABS_MT_SLOT + // events when we touch the screen in more than one place, + // preventing multi-touch with more than one finger from working. + input_mt_init_slots(input_dev, GOLDFISH_MAX_FINGERS, 0); events_import_bits(edev, input_dev->evbit, EV_SYN, EV_MAX); events_import_bits(edev, input_dev->keybit, EV_KEY, KEY_MAX); From 18e334986224af1dc9d1a61dd8957f07919c76d0 Mon Sep 17 00:00:00 2001 From: Joshua Lang Date: Fri, 17 Jun 2016 17:30:55 -0700 Subject: [PATCH 593/607] ANDROID: goldfish_audio: Clear audio read buffer status after each read MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The buffer_status field is interrupt updated. After every read request, the buffer_status read field should be reset so that on the next loop iteration we don't read a stale value and read data before the device is ready. 
Signed-off-by: “Joshua Lang” Change-Id: I4943d5aaada1cad9c7e59a94a87c387578dabe86 --- drivers/staging/goldfish/goldfish_audio.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/staging/goldfish/goldfish_audio.c b/drivers/staging/goldfish/goldfish_audio.c index 83bbe459b93a..63b79c09b41b 100644 --- a/drivers/staging/goldfish/goldfish_audio.c +++ b/drivers/staging/goldfish/goldfish_audio.c @@ -117,6 +117,7 @@ static ssize_t goldfish_audio_read(struct file *fp, char __user *buf, size_t count, loff_t *pos) { struct goldfish_audio *data = fp->private_data; + unsigned long irq_flags; int length; int result = 0; @@ -130,6 +131,10 @@ static ssize_t goldfish_audio_read(struct file *fp, char __user *buf, wait_event_interruptible(data->wait, data->buffer_status & AUDIO_INT_READ_BUFFER_FULL); + spin_lock_irqsave(&data->lock, irq_flags); + data->buffer_status &= ~AUDIO_INT_READ_BUFFER_FULL; + spin_unlock_irqrestore(&data->lock, irq_flags); + length = AUDIO_READ(data, AUDIO_READ_BUFFER_AVAILABLE); /* copy data to user space */ From 0c1529aecf8fb719cf49de4d5a452b855e437387 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Fri, 7 Oct 2016 16:20:47 -0700 Subject: [PATCH 594/607] ANDROID: goldfish: add ranchu defconfigs Change-Id: I73ef1b132b6203ae921a1e1d4850eaadf58f8926 --- arch/arm/configs/ranchu_defconfig | 315 +++++++++++++++++ arch/arm64/configs/ranchu_defconfig | 311 +++++++++++++++++ arch/x86/configs/i386_ranchu_defconfig | 422 +++++++++++++++++++++++ arch/x86/configs/x86_64_ranchu_defconfig | 417 ++++++++++++++++++++++ 4 files changed, 1465 insertions(+) create mode 100644 arch/arm/configs/ranchu_defconfig create mode 100644 arch/arm64/configs/ranchu_defconfig create mode 100644 arch/x86/configs/i386_ranchu_defconfig create mode 100644 arch/x86/configs/x86_64_ranchu_defconfig diff --git a/arch/arm/configs/ranchu_defconfig b/arch/arm/configs/ranchu_defconfig new file mode 100644 index 000000000000..35a90af941a4 --- /dev/null +++ b/arch/arm/configs/ranchu_defconfig @@ -0,0 +1,315 @@ +# CONFIG_LOCALVERSION_AUTO is not set +CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_LOG_BUF_SHIFT=14 +CONFIG_CGROUPS=y +CONFIG_CGROUP_DEBUG=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CPUSETS=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_SCHED=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_KALLSYMS_ALL=y +CONFIG_EMBEDDED=y +CONFIG_PROFILING=y +CONFIG_OPROFILE=y +CONFIG_ARCH_MMAP_RND_BITS=16 +# CONFIG_BLK_DEV_BSG is not set +# CONFIG_IOSCHED_DEADLINE is not set +# CONFIG_IOSCHED_CFQ is not set +CONFIG_ARCH_VIRT=y +CONFIG_ARM_KERNMEM_PERMS=y +CONFIG_SMP=y +CONFIG_PREEMPT=y +CONFIG_AEABI=y +CONFIG_HIGHMEM=y +CONFIG_KSM=y +CONFIG_SECCOMP=y +CONFIG_CMDLINE="console=ttyAMA0" +CONFIG_VFP=y +CONFIG_NEON=y +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=0 +# CONFIG_PM_WAKELOCKS_GC is not set +CONFIG_PM_DEBUG=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_XFRM_USER=y +CONFIG_NET_KEY=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_INET_ESP=y +# CONFIG_INET_LRO is not set +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=y +CONFIG_INET6_ESP=y +CONFIG_INET6_IPCOMP=y +CONFIG_IPV6_MIP6=y +CONFIG_IPV6_MULTIPLE_TABLES=y 
+CONFIG_NETFILTER=y +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=y +CONFIG_NF_CONNTRACK_FTP=y +CONFIG_NF_CONNTRACK_H323=y +CONFIG_NF_CONNTRACK_IRC=y +CONFIG_NF_CONNTRACK_NETBIOS_NS=y +CONFIG_NF_CONNTRACK_PPTP=y +CONFIG_NF_CONNTRACK_SANE=y +CONFIG_NF_CONNTRACK_TFTP=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y +CONFIG_NETFILTER_XT_TARGET_CONNMARK=y +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y +CONFIG_NETFILTER_XT_TARGET_MARK=y +CONFIG_NETFILTER_XT_TARGET_NFLOG=y +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y +CONFIG_NETFILTER_XT_TARGET_TPROXY=y +CONFIG_NETFILTER_XT_TARGET_TRACE=y +CONFIG_NETFILTER_XT_TARGET_SECMARK=y +CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y +CONFIG_NETFILTER_XT_MATCH_CONNMARK=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y +CONFIG_NETFILTER_XT_MATCH_HELPER=y +CONFIG_NETFILTER_XT_MATCH_IPRANGE=y +CONFIG_NETFILTER_XT_MATCH_LENGTH=y +CONFIG_NETFILTER_XT_MATCH_LIMIT=y +CONFIG_NETFILTER_XT_MATCH_MAC=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_POLICY=y +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y +CONFIG_NETFILTER_XT_MATCH_QTAGUID=y +CONFIG_NETFILTER_XT_MATCH_QUOTA=y +CONFIG_NETFILTER_XT_MATCH_QUOTA2=y +CONFIG_NETFILTER_XT_MATCH_SOCKET=y +CONFIG_NETFILTER_XT_MATCH_STATE=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y +CONFIG_NETFILTER_XT_MATCH_STRING=y +CONFIG_NETFILTER_XT_MATCH_TIME=y +CONFIG_NETFILTER_XT_MATCH_U32=y +CONFIG_NF_CONNTRACK_IPV4=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_MATCH_AH=y +CONFIG_IP_NF_MATCH_ECN=y +CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_RAW=y +CONFIG_IP_NF_SECURITY=y +CONFIG_IP_NF_ARPTABLES=y +CONFIG_IP_NF_ARPFILTER=y +CONFIG_IP_NF_ARP_MANGLE=y +CONFIG_NF_CONNTRACK_IPV6=y +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IP6_NF_MANGLE=y +CONFIG_IP6_NF_RAW=y +CONFIG_BRIDGE=y +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_HTB=y +CONFIG_NET_CLS_U32=y +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_U32=y +CONFIG_NET_CLS_ACT=y +# CONFIG_WIRELESS is not set +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_MTD=y +CONFIG_MTD_CMDLINE_PARTS=y +CONFIG_MTD_BLOCK=y +CONFIG_MTD_CFI=y +CONFIG_MTD_CFI_INTELEXT=y +CONFIG_MTD_CFI_AMDSTD=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_VIRTIO_BLK=y +CONFIG_MD=y +CONFIG_BLK_DEV_DM=y +CONFIG_DM_CRYPT=y +CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y +CONFIG_NETDEVICES=y +CONFIG_TUN=y +CONFIG_VIRTIO_NET=y +CONFIG_SMSC911X=y +CONFIG_PPP=y +CONFIG_PPP_BSDCOMP=y +CONFIG_PPP_DEFLATE=y +CONFIG_PPP_MPPE=y +CONFIG_PPPOLAC=y +CONFIG_PPPOPNS=y +CONFIG_USB_USBNET=y +# CONFIG_WLAN is not set +CONFIG_INPUT_EVDEV=y +CONFIG_INPUT_KEYRESET=y +CONFIG_KEYBOARD_GOLDFISH_EVENTS=y +# CONFIG_INPUT_MOUSE is not set +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_XPAD=y +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_INPUT_TABLET=y +CONFIG_TABLET_USB_ACECAD=y +CONFIG_TABLET_USB_AIPTEK=y +CONFIG_TABLET_USB_GTCO=y +CONFIG_TABLET_USB_HANWANG=y +CONFIG_TABLET_USB_KBTAB=y +CONFIG_INPUT_MISC=y +CONFIG_INPUT_KEYCHORD=y +CONFIG_INPUT_UINPUT=y +CONFIG_INPUT_GPIO=y +# CONFIG_SERIO_SERPORT is not set +CONFIG_SERIO_AMBAKMI=y +# CONFIG_VT is not set +# CONFIG_LEGACY_PTYS is not set +# CONFIG_DEVMEM is not 
set +# CONFIG_DEVKMEM is not set +CONFIG_SERIAL_AMBA_PL011=y +CONFIG_SERIAL_AMBA_PL011_CONSOLE=y +CONFIG_VIRTIO_CONSOLE=y +# CONFIG_HW_RANDOM is not set +# CONFIG_HWMON is not set +CONFIG_MEDIA_SUPPORT=y +CONFIG_FB=y +CONFIG_FB_GOLDFISH=y +CONFIG_FB_SIMPLE=y +CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_LOGO=y +# CONFIG_LOGO_LINUX_MONO is not set +# CONFIG_LOGO_LINUX_VGA16 is not set +CONFIG_SOUND=y +CONFIG_SND=y +CONFIG_HIDRAW=y +CONFIG_UHID=y +CONFIG_HID_A4TECH=y +CONFIG_HID_ACRUX=y +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=y +CONFIG_HID_BELKIN=y +CONFIG_HID_CHERRY=y +CONFIG_HID_CHICONY=y +CONFIG_HID_PRODIKEYS=y +CONFIG_HID_CYPRESS=y +CONFIG_HID_DRAGONRISE=y +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=y +CONFIG_HID_ELECOM=y +CONFIG_HID_EZKEY=y +CONFIG_HID_HOLTEK=y +CONFIG_HID_KEYTOUCH=y +CONFIG_HID_KYE=y +CONFIG_HID_UCLOGIC=y +CONFIG_HID_WALTOP=y +CONFIG_HID_GYRATION=y +CONFIG_HID_TWINHAN=y +CONFIG_HID_KENSINGTON=y +CONFIG_HID_LCPOWER=y +CONFIG_HID_LOGITECH=y +CONFIG_HID_LOGITECH_DJ=y +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_HID_MAGICMOUSE=y +CONFIG_HID_MICROSOFT=y +CONFIG_HID_MONTEREY=y +CONFIG_HID_MULTITOUCH=y +CONFIG_HID_NTRIG=y +CONFIG_HID_ORTEK=y +CONFIG_HID_PANTHERLORD=y +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PETALYNX=y +CONFIG_HID_PICOLCD=y +CONFIG_HID_PRIMAX=y +CONFIG_HID_ROCCAT=y +CONFIG_HID_SAITEK=y +CONFIG_HID_SAMSUNG=y +CONFIG_HID_SONY=y +CONFIG_HID_SPEEDLINK=y +CONFIG_HID_SUNPLUS=y +CONFIG_HID_GREENASIA=y +CONFIG_GREENASIA_FF=y +CONFIG_HID_SMARTJOYPLUS=y +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=y +CONFIG_HID_TOPSEED=y +CONFIG_HID_THRUSTMASTER=y +CONFIG_HID_WACOM=y +CONFIG_HID_WIIMOTE=y +CONFIG_HID_ZEROPLUS=y +CONFIG_HID_ZYDACRON=y +CONFIG_USB_HIDDEV=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_OTG_WAKELOCK=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_DRV_PL031=y +CONFIG_VIRTIO_MMIO=y +CONFIG_STAGING=y +CONFIG_ASHMEM=y +CONFIG_ANDROID_LOW_MEMORY_KILLER=y +CONFIG_SYNC=y +CONFIG_SW_SYNC=y +CONFIG_SW_SYNC_USER=y +CONFIG_ION=y +CONFIG_GOLDFISH_AUDIO=y +CONFIG_GOLDFISH=y +CONFIG_GOLDFISH_PIPE=y +CONFIG_ANDROID=y +CONFIG_ANDROID_BINDER_IPC=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_QUOTA=y +CONFIG_FUSE_FS=y +CONFIG_CUSE=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_PSTORE=y +CONFIG_PSTORE_CONSOLE=y +CONFIG_PSTORE_RAM=y +CONFIG_NFS_FS=y +CONFIG_ROOT_NFS=y +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ISO8859_1=y +CONFIG_DEBUG_INFO=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_PANIC_TIMEOUT=5 +# CONFIG_SCHED_DEBUG is not set +CONFIG_SCHEDSTATS=y +CONFIG_TIMER_STATS=y +CONFIG_ENABLE_DEFAULT_TRACERS=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y +CONFIG_VIRTUALIZATION=y diff --git a/arch/arm64/configs/ranchu_defconfig b/arch/arm64/configs/ranchu_defconfig new file mode 100644 index 000000000000..00eb346e0928 --- /dev/null +++ b/arch/arm64/configs/ranchu_defconfig @@ -0,0 +1,311 @@ +# CONFIG_LOCALVERSION_AUTO is not set +# CONFIG_SWAP is not set +CONFIG_POSIX_MQUEUE=y +CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_BSD_PROCESS_ACCT_V3=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_LOG_BUF_SHIFT=14 +CONFIG_CGROUP_DEBUG=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_SCHED_AUTOGROUP=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_KALLSYMS_ALL=y +CONFIG_EMBEDDED=y +# CONFIG_COMPAT_BRK is 
not set +CONFIG_PROFILING=y +CONFIG_ARCH_MMAP_RND_BITS=24 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16 +# CONFIG_BLK_DEV_BSG is not set +# CONFIG_IOSCHED_DEADLINE is not set +CONFIG_ARCH_VEXPRESS=y +CONFIG_NR_CPUS=4 +CONFIG_PREEMPT=y +CONFIG_KSM=y +CONFIG_SECCOMP=y +CONFIG_ARMV8_DEPRECATED=y +CONFIG_SWP_EMULATION=y +CONFIG_CP15_BARRIER_EMULATION=y +CONFIG_SETEND_EMULATION=y +CONFIG_CMDLINE="console=ttyAMA0" +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +CONFIG_COMPAT=y +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=0 +# CONFIG_PM_WAKELOCKS_GC is not set +CONFIG_PM_DEBUG=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_XFRM_USER=y +CONFIG_NET_KEY=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_INET_ESP=y +# CONFIG_INET_LRO is not set +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=y +CONFIG_INET6_ESP=y +CONFIG_INET6_IPCOMP=y +CONFIG_IPV6_MIP6=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_NETFILTER=y +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=y +CONFIG_NF_CONNTRACK_FTP=y +CONFIG_NF_CONNTRACK_H323=y +CONFIG_NF_CONNTRACK_IRC=y +CONFIG_NF_CONNTRACK_NETBIOS_NS=y +CONFIG_NF_CONNTRACK_PPTP=y +CONFIG_NF_CONNTRACK_SANE=y +CONFIG_NF_CONNTRACK_TFTP=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y +CONFIG_NETFILTER_XT_TARGET_CONNMARK=y +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y +CONFIG_NETFILTER_XT_TARGET_MARK=y +CONFIG_NETFILTER_XT_TARGET_NFLOG=y +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y +CONFIG_NETFILTER_XT_TARGET_TPROXY=y +CONFIG_NETFILTER_XT_TARGET_TRACE=y +CONFIG_NETFILTER_XT_TARGET_SECMARK=y +CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y +CONFIG_NETFILTER_XT_MATCH_CONNMARK=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y +CONFIG_NETFILTER_XT_MATCH_HELPER=y +CONFIG_NETFILTER_XT_MATCH_IPRANGE=y +CONFIG_NETFILTER_XT_MATCH_LENGTH=y +CONFIG_NETFILTER_XT_MATCH_LIMIT=y +CONFIG_NETFILTER_XT_MATCH_MAC=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_POLICY=y +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y +CONFIG_NETFILTER_XT_MATCH_QTAGUID=y +CONFIG_NETFILTER_XT_MATCH_QUOTA=y +CONFIG_NETFILTER_XT_MATCH_QUOTA2=y +CONFIG_NETFILTER_XT_MATCH_SOCKET=y +CONFIG_NETFILTER_XT_MATCH_STATE=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y +CONFIG_NETFILTER_XT_MATCH_STRING=y +CONFIG_NETFILTER_XT_MATCH_TIME=y +CONFIG_NETFILTER_XT_MATCH_U32=y +CONFIG_NF_CONNTRACK_IPV4=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_MATCH_AH=y +CONFIG_IP_NF_MATCH_ECN=y +CONFIG_IP_NF_MATCH_RPFILTER=y +CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_TARGET_ECN=y +CONFIG_IP_NF_TARGET_TTL=y +CONFIG_IP_NF_RAW=y +CONFIG_IP_NF_SECURITY=y +CONFIG_IP_NF_ARPTABLES=y +CONFIG_IP_NF_ARPFILTER=y +CONFIG_IP_NF_ARP_MANGLE=y +CONFIG_NF_CONNTRACK_IPV6=y +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_MATCH_AH=y +CONFIG_IP6_NF_MATCH_EUI64=y +CONFIG_IP6_NF_MATCH_FRAG=y +CONFIG_IP6_NF_MATCH_OPTS=y +CONFIG_IP6_NF_MATCH_HL=y +CONFIG_IP6_NF_MATCH_IPV6HEADER=y +CONFIG_IP6_NF_MATCH_MH=y +CONFIG_IP6_NF_MATCH_RT=y +CONFIG_IP6_NF_TARGET_HL=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IP6_NF_MANGLE=y 
+CONFIG_IP6_NF_RAW=y +CONFIG_BRIDGE=y +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_HTB=y +CONFIG_NET_CLS_U32=y +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_U32=y +CONFIG_NET_CLS_ACT=y +# CONFIG_WIRELESS is not set +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_VIRTIO_BLK=y +CONFIG_SCSI=y +# CONFIG_SCSI_PROC_FS is not set +CONFIG_BLK_DEV_SD=y +# CONFIG_SCSI_LOWLEVEL is not set +CONFIG_MD=y +CONFIG_BLK_DEV_DM=y +CONFIG_DM_CRYPT=y +CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y +CONFIG_NETDEVICES=y +CONFIG_TUN=y +CONFIG_VIRTIO_NET=y +CONFIG_SMC91X=y +CONFIG_PPP=y +CONFIG_PPP_BSDCOMP=y +CONFIG_PPP_DEFLATE=y +CONFIG_PPP_MPPE=y +CONFIG_PPPOLAC=y +CONFIG_PPPOPNS=y +# CONFIG_WLAN is not set +CONFIG_INPUT_EVDEV=y +CONFIG_INPUT_KEYRESET=y +CONFIG_KEYBOARD_GOLDFISH_EVENTS=y +# CONFIG_INPUT_MOUSE is not set +CONFIG_INPUT_JOYSTICK=y +CONFIG_INPUT_TABLET=y +CONFIG_INPUT_MISC=y +CONFIG_INPUT_KEYCHORD=y +CONFIG_INPUT_UINPUT=y +CONFIG_INPUT_GPIO=y +# CONFIG_SERIO_SERPORT is not set +# CONFIG_VT is not set +# CONFIG_LEGACY_PTYS is not set +# CONFIG_DEVMEM is not set +# CONFIG_DEVKMEM is not set +CONFIG_SERIAL_AMBA_PL011=y +CONFIG_SERIAL_AMBA_PL011_CONSOLE=y +CONFIG_VIRTIO_CONSOLE=y +# CONFIG_HW_RANDOM is not set +CONFIG_BATTERY_GOLDFISH=y +# CONFIG_HWMON is not set +CONFIG_MEDIA_SUPPORT=y +CONFIG_FB=y +CONFIG_FB_GOLDFISH=y +CONFIG_FB_SIMPLE=y +CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_LOGO=y +# CONFIG_LOGO_LINUX_MONO is not set +# CONFIG_LOGO_LINUX_VGA16 is not set +CONFIG_SOUND=y +CONFIG_SND=y +CONFIG_HIDRAW=y +CONFIG_UHID=y +CONFIG_HID_A4TECH=y +CONFIG_HID_ACRUX=y +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=y +CONFIG_HID_BELKIN=y +CONFIG_HID_CHERRY=y +CONFIG_HID_CHICONY=y +CONFIG_HID_PRODIKEYS=y +CONFIG_HID_CYPRESS=y +CONFIG_HID_DRAGONRISE=y +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=y +CONFIG_HID_ELECOM=y +CONFIG_HID_EZKEY=y +CONFIG_HID_KEYTOUCH=y +CONFIG_HID_KYE=y +CONFIG_HID_WALTOP=y +CONFIG_HID_GYRATION=y +CONFIG_HID_TWINHAN=y +CONFIG_HID_KENSINGTON=y +CONFIG_HID_LCPOWER=y +CONFIG_HID_LOGITECH=y +CONFIG_HID_LOGITECH_DJ=y +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_HID_MAGICMOUSE=y +CONFIG_HID_MICROSOFT=y +CONFIG_HID_MONTEREY=y +CONFIG_HID_MULTITOUCH=y +CONFIG_HID_ORTEK=y +CONFIG_HID_PANTHERLORD=y +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PETALYNX=y +CONFIG_HID_PICOLCD=y +CONFIG_HID_PRIMAX=y +CONFIG_HID_SAITEK=y +CONFIG_HID_SAMSUNG=y +CONFIG_HID_SPEEDLINK=y +CONFIG_HID_SUNPLUS=y +CONFIG_HID_GREENASIA=y +CONFIG_GREENASIA_FF=y +CONFIG_HID_SMARTJOYPLUS=y +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=y +CONFIG_HID_TOPSEED=y +CONFIG_HID_THRUSTMASTER=y +CONFIG_HID_WACOM=y +CONFIG_HID_WIIMOTE=y +CONFIG_HID_ZEROPLUS=y +CONFIG_HID_ZYDACRON=y +# CONFIG_USB_SUPPORT is not set +CONFIG_RTC_CLASS=y +CONFIG_VIRTIO_MMIO=y +CONFIG_STAGING=y +CONFIG_ASHMEM=y +CONFIG_ANDROID_TIMED_GPIO=y +CONFIG_ANDROID_LOW_MEMORY_KILLER=y +CONFIG_SYNC=y +CONFIG_SW_SYNC=y +CONFIG_SW_SYNC_USER=y +CONFIG_ION=y +CONFIG_GOLDFISH_AUDIO=y +CONFIG_GOLDFISH=y +CONFIG_GOLDFISH_PIPE=y +# CONFIG_IOMMU_SUPPORT is not set +CONFIG_ANDROID=y +CONFIG_ANDROID_BINDER_IPC=y +CONFIG_EXT2_FS=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_QUOTA=y +CONFIG_FUSE_FS=y +CONFIG_CUSE=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +# CONFIG_MISC_FILESYSTEMS is not set +CONFIG_NFS_FS=y +CONFIG_ROOT_NFS=y +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ISO8859_1=y +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_FS=y +CONFIG_MAGIC_SYSRQ=y 
+CONFIG_PANIC_TIMEOUT=5 +# CONFIG_SCHED_DEBUG is not set +CONFIG_SCHEDSTATS=y +CONFIG_TIMER_STATS=y +# CONFIG_FTRACE is not set +CONFIG_ATOMIC64_SELFTEST=y +CONFIG_DEBUG_RODATA=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y diff --git a/arch/x86/configs/i386_ranchu_defconfig b/arch/x86/configs/i386_ranchu_defconfig new file mode 100644 index 000000000000..b0e4e0ed4b11 --- /dev/null +++ b/arch/x86/configs/i386_ranchu_defconfig @@ -0,0 +1,422 @@ +# CONFIG_64BIT is not set +# CONFIG_LOCALVERSION_AUTO is not set +CONFIG_POSIX_MQUEUE=y +CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_DEBUG=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_SCHED=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_SYSCTL_SYSCALL=y +CONFIG_KALLSYMS_ALL=y +CONFIG_EMBEDDED=y +# CONFIG_COMPAT_BRK is not set +CONFIG_ARCH_MMAP_RND_BITS=16 +CONFIG_PARTITION_ADVANCED=y +CONFIG_OSF_PARTITION=y +CONFIG_AMIGA_PARTITION=y +CONFIG_MAC_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_SGI_PARTITION=y +CONFIG_SUN_PARTITION=y +CONFIG_KARMA_PARTITION=y +CONFIG_SMP=y +CONFIG_X86_BIGSMP=y +CONFIG_MCORE2=y +CONFIG_X86_GENERIC=y +CONFIG_HPET_TIMER=y +CONFIG_NR_CPUS=512 +CONFIG_PREEMPT=y +# CONFIG_X86_MCE is not set +CONFIG_X86_REBOOTFIXUPS=y +CONFIG_X86_MSR=y +CONFIG_X86_CPUID=y +CONFIG_KSM=y +CONFIG_CMA=y +# CONFIG_MTRR_SANITIZER is not set +CONFIG_EFI=y +CONFIG_EFI_STUB=y +CONFIG_HZ_100=y +CONFIG_PHYSICAL_START=0x100000 +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=0 +# CONFIG_PM_WAKELOCKS_GC is not set +CONFIG_PM_DEBUG=y +CONFIG_CPU_FREQ=y +# CONFIG_CPU_FREQ_STAT is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y +CONFIG_CPU_FREQ_GOV_USERSPACE=y +CONFIG_PCIEPORTBUS=y +# CONFIG_PCIEASPM is not set +CONFIG_PCCARD=y +CONFIG_YENTA=y +CONFIG_HOTPLUG_PCI=y +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +CONFIG_BINFMT_MISC=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_XFRM_USER=y +CONFIG_NET_KEY=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y +CONFIG_IP_MROUTE=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_INET_ESP=y +# CONFIG_INET_XFRM_MODE_BEET is not set +# CONFIG_INET_LRO is not set +# CONFIG_INET_DIAG is not set +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=y +CONFIG_INET6_ESP=y +CONFIG_INET6_IPCOMP=y +CONFIG_IPV6_MIP6=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_NETLABEL=y +CONFIG_NETFILTER=y +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=y +CONFIG_NF_CONNTRACK_FTP=y +CONFIG_NF_CONNTRACK_H323=y +CONFIG_NF_CONNTRACK_IRC=y +CONFIG_NF_CONNTRACK_NETBIOS_NS=y +CONFIG_NF_CONNTRACK_PPTP=y +CONFIG_NF_CONNTRACK_SANE=y +CONFIG_NF_CONNTRACK_TFTP=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y +CONFIG_NETFILTER_XT_TARGET_CONNMARK=y +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y +CONFIG_NETFILTER_XT_TARGET_MARK=y 
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y +CONFIG_NETFILTER_XT_TARGET_TPROXY=y +CONFIG_NETFILTER_XT_TARGET_TRACE=y +CONFIG_NETFILTER_XT_TARGET_SECMARK=y +CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y +CONFIG_NETFILTER_XT_MATCH_CONNMARK=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y +CONFIG_NETFILTER_XT_MATCH_HELPER=y +CONFIG_NETFILTER_XT_MATCH_IPRANGE=y +CONFIG_NETFILTER_XT_MATCH_LENGTH=y +CONFIG_NETFILTER_XT_MATCH_LIMIT=y +CONFIG_NETFILTER_XT_MATCH_MAC=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_POLICY=y +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y +CONFIG_NETFILTER_XT_MATCH_QTAGUID=y +CONFIG_NETFILTER_XT_MATCH_QUOTA=y +CONFIG_NETFILTER_XT_MATCH_QUOTA2=y +CONFIG_NETFILTER_XT_MATCH_SOCKET=y +CONFIG_NETFILTER_XT_MATCH_STATE=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y +CONFIG_NETFILTER_XT_MATCH_STRING=y +CONFIG_NETFILTER_XT_MATCH_TIME=y +CONFIG_NETFILTER_XT_MATCH_U32=y +CONFIG_NF_CONNTRACK_IPV4=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_MATCH_AH=y +CONFIG_IP_NF_MATCH_ECN=y +CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_RAW=y +CONFIG_IP_NF_SECURITY=y +CONFIG_IP_NF_ARPTABLES=y +CONFIG_IP_NF_ARPFILTER=y +CONFIG_IP_NF_ARP_MANGLE=y +CONFIG_NF_CONNTRACK_IPV6=y +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IP6_NF_MANGLE=y +CONFIG_IP6_NF_RAW=y +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_HTB=y +CONFIG_NET_CLS_U32=y +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_U32=y +CONFIG_NET_CLS_ACT=y +CONFIG_CFG80211=y +CONFIG_MAC80211=y +CONFIG_MAC80211_LEDS=y +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_DMA_CMA=y +CONFIG_CMA_SIZE_MBYTES=16 +CONFIG_CONNECTOR=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_VIRTIO_BLK=y +CONFIG_BLK_DEV_SD=y +CONFIG_BLK_DEV_SR=y +CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_CHR_DEV_SG=y +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_SPI_ATTRS=y +CONFIG_SCSI_ISCSI_ATTRS=y +# CONFIG_SCSI_LOWLEVEL is not set +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +CONFIG_ATA_PIIX=y +CONFIG_PATA_AMD=y +CONFIG_PATA_OLDPIIX=y +CONFIG_PATA_SCH=y +CONFIG_PATA_MPIIX=y +CONFIG_ATA_GENERIC=y +CONFIG_MD=y +CONFIG_BLK_DEV_MD=y +CONFIG_BLK_DEV_DM=y +CONFIG_DM_DEBUG=y +CONFIG_DM_CRYPT=y +CONFIG_DM_MIRROR=y +CONFIG_DM_ZERO=y +CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y +CONFIG_NETDEVICES=y +CONFIG_NETCONSOLE=y +CONFIG_TUN=y +CONFIG_VIRTIO_NET=y +CONFIG_BNX2=y +CONFIG_TIGON3=y +CONFIG_NET_TULIP=y +CONFIG_E100=y +CONFIG_E1000=y +CONFIG_E1000E=y +CONFIG_SKY2=y +CONFIG_NE2K_PCI=y +CONFIG_FORCEDETH=y +CONFIG_8139TOO=y +# CONFIG_8139TOO_PIO is not set +CONFIG_R8169=y +CONFIG_FDDI=y +CONFIG_PPP=y +CONFIG_PPP_BSDCOMP=y +CONFIG_PPP_DEFLATE=y +CONFIG_PPP_MPPE=y +CONFIG_PPPOLAC=y +CONFIG_PPPOPNS=y +CONFIG_USB_USBNET=y +CONFIG_INPUT_POLLDEV=y +# CONFIG_INPUT_MOUSEDEV_PSAUX is not set +CONFIG_INPUT_EVDEV=y +CONFIG_INPUT_KEYRESET=y +# CONFIG_KEYBOARD_ATKBD is not set +CONFIG_KEYBOARD_GOLDFISH_EVENTS=y +# CONFIG_INPUT_MOUSE is not set +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_XPAD=y +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_INPUT_TABLET=y +CONFIG_TABLET_USB_ACECAD=y +CONFIG_TABLET_USB_AIPTEK=y +CONFIG_TABLET_USB_GTCO=y +CONFIG_TABLET_USB_HANWANG=y +CONFIG_TABLET_USB_KBTAB=y +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_INPUT_MISC=y +CONFIG_INPUT_KEYCHORD=y +CONFIG_INPUT_UINPUT=y +CONFIG_INPUT_GPIO=y +# CONFIG_SERIO is not set +# CONFIG_VT is not set 
+# CONFIG_LEGACY_PTYS is not set +CONFIG_SERIAL_NONSTANDARD=y +# CONFIG_DEVMEM is not set +# CONFIG_DEVKMEM is not set +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_VIRTIO_CONSOLE=y +CONFIG_NVRAM=y +CONFIG_I2C_I801=y +CONFIG_BATTERY_GOLDFISH=y +CONFIG_WATCHDOG=y +CONFIG_MEDIA_SUPPORT=y +CONFIG_AGP=y +CONFIG_AGP_AMD64=y +CONFIG_AGP_INTEL=y +CONFIG_DRM=y +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_TILEBLITTING=y +CONFIG_FB_EFI=y +CONFIG_FB_GOLDFISH=y +CONFIG_BACKLIGHT_LCD_SUPPORT=y +# CONFIG_LCD_CLASS_DEVICE is not set +CONFIG_SOUND=y +CONFIG_SND=y +CONFIG_HIDRAW=y +CONFIG_UHID=y +CONFIG_HID_A4TECH=y +CONFIG_HID_ACRUX=y +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=y +CONFIG_HID_BELKIN=y +CONFIG_HID_CHERRY=y +CONFIG_HID_CHICONY=y +CONFIG_HID_PRODIKEYS=y +CONFIG_HID_CYPRESS=y +CONFIG_HID_DRAGONRISE=y +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=y +CONFIG_HID_ELECOM=y +CONFIG_HID_EZKEY=y +CONFIG_HID_HOLTEK=y +CONFIG_HID_KEYTOUCH=y +CONFIG_HID_KYE=y +CONFIG_HID_UCLOGIC=y +CONFIG_HID_WALTOP=y +CONFIG_HID_GYRATION=y +CONFIG_HID_TWINHAN=y +CONFIG_HID_KENSINGTON=y +CONFIG_HID_LCPOWER=y +CONFIG_HID_LOGITECH=y +CONFIG_HID_LOGITECH_DJ=y +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_HID_MAGICMOUSE=y +CONFIG_HID_MICROSOFT=y +CONFIG_HID_MONTEREY=y +CONFIG_HID_MULTITOUCH=y +CONFIG_HID_NTRIG=y +CONFIG_HID_ORTEK=y +CONFIG_HID_PANTHERLORD=y +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PETALYNX=y +CONFIG_HID_PICOLCD=y +CONFIG_HID_PRIMAX=y +CONFIG_HID_ROCCAT=y +CONFIG_HID_SAITEK=y +CONFIG_HID_SAMSUNG=y +CONFIG_HID_SONY=y +CONFIG_HID_SPEEDLINK=y +CONFIG_HID_SUNPLUS=y +CONFIG_HID_GREENASIA=y +CONFIG_GREENASIA_FF=y +CONFIG_HID_SMARTJOYPLUS=y +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=y +CONFIG_HID_TOPSEED=y +CONFIG_HID_THRUSTMASTER=y +CONFIG_HID_WACOM=y +CONFIG_HID_WIIMOTE=y +CONFIG_HID_ZEROPLUS=y +CONFIG_HID_ZYDACRON=y +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y +CONFIG_USB_MON=y +CONFIG_USB_EHCI_HCD=y +# CONFIG_USB_EHCI_TT_NEWSCHED is not set +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_UHCI_HCD=y +CONFIG_USB_PRINTER=y +CONFIG_USB_STORAGE=y +CONFIG_USB_OTG_WAKELOCK=y +CONFIG_EDAC=y +CONFIG_RTC_CLASS=y +# CONFIG_RTC_HCTOSYS is not set +CONFIG_DMADEVICES=y +CONFIG_VIRTIO_PCI=y +CONFIG_STAGING=y +CONFIG_ASHMEM=y +CONFIG_ANDROID_LOW_MEMORY_KILLER=y +CONFIG_SYNC=y +CONFIG_SW_SYNC=y +CONFIG_ION=y +CONFIG_GOLDFISH_AUDIO=y +CONFIG_SND_HDA_INTEL=y +CONFIG_GOLDFISH=y +CONFIG_GOLDFISH_PIPE=y +CONFIG_ANDROID=y +CONFIG_ANDROID_BINDER_IPC=y +CONFIG_ISCSI_IBFT_FIND=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +# CONFIG_PRINT_QUOTA_WARNING is not set +CONFIG_FUSE_FS=y +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_PROC_KCORE=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_HUGETLBFS=y +CONFIG_PSTORE=y +CONFIG_PSTORE_CONSOLE=y +CONFIG_PSTORE_RAM=y +# CONFIG_NETWORK_FILESYSTEMS is not set +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ASCII=y +CONFIG_NLS_ISO8859_1=y +CONFIG_NLS_UTF8=y +CONFIG_PRINTK_TIME=y +CONFIG_DEBUG_INFO=y +# CONFIG_ENABLE_WARN_DEPRECATED is not set +# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_FRAME_WARN=2048 +# CONFIG_UNUSED_SYMBOLS is not set +CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_MEMORY_INIT=y +CONFIG_PANIC_TIMEOUT=5 +CONFIG_SCHEDSTATS=y +CONFIG_TIMER_STATS=y +CONFIG_SCHED_TRACER=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_PROVIDE_OHCI1394_DMA_INIT=y +CONFIG_KEYS=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y 
+CONFIG_CRYPTO_AES_586=y +CONFIG_CRYPTO_TWOFISH=y +CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS7_MESSAGE_PARSER=y +CONFIG_PKCS7_TEST_KEY=y +# CONFIG_VIRTUALIZATION is not set +CONFIG_CRC_T10DIF=y diff --git a/arch/x86/configs/x86_64_ranchu_defconfig b/arch/x86/configs/x86_64_ranchu_defconfig new file mode 100644 index 000000000000..8dae21ed3ede --- /dev/null +++ b/arch/x86/configs/x86_64_ranchu_defconfig @@ -0,0 +1,417 @@ +# CONFIG_LOCALVERSION_AUTO is not set +CONFIG_POSIX_MQUEUE=y +CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_DEBUG=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_SCHED=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_SYSCTL_SYSCALL=y +CONFIG_KALLSYMS_ALL=y +CONFIG_EMBEDDED=y +# CONFIG_COMPAT_BRK is not set +CONFIG_ARCH_MMAP_RND_BITS=32 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16 +CONFIG_PARTITION_ADVANCED=y +CONFIG_OSF_PARTITION=y +CONFIG_AMIGA_PARTITION=y +CONFIG_MAC_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_SGI_PARTITION=y +CONFIG_SUN_PARTITION=y +CONFIG_KARMA_PARTITION=y +CONFIG_SMP=y +CONFIG_MCORE2=y +CONFIG_MAXSMP=y +CONFIG_PREEMPT=y +# CONFIG_X86_MCE is not set +CONFIG_X86_MSR=y +CONFIG_X86_CPUID=y +CONFIG_KSM=y +CONFIG_CMA=y +# CONFIG_MTRR_SANITIZER is not set +CONFIG_EFI=y +CONFIG_EFI_STUB=y +CONFIG_HZ_100=y +CONFIG_PHYSICAL_START=0x100000 +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=0 +# CONFIG_PM_WAKELOCKS_GC is not set +CONFIG_PM_DEBUG=y +CONFIG_CPU_FREQ=y +# CONFIG_CPU_FREQ_STAT is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y +CONFIG_CPU_FREQ_GOV_USERSPACE=y +CONFIG_PCI_MMCONFIG=y +CONFIG_PCIEPORTBUS=y +# CONFIG_PCIEASPM is not set +CONFIG_PCCARD=y +CONFIG_YENTA=y +CONFIG_HOTPLUG_PCI=y +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +CONFIG_BINFMT_MISC=y +CONFIG_IA32_EMULATION=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_XFRM_USER=y +CONFIG_NET_KEY=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y +CONFIG_IP_MROUTE=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_INET_ESP=y +# CONFIG_INET_XFRM_MODE_BEET is not set +# CONFIG_INET_LRO is not set +# CONFIG_INET_DIAG is not set +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=y +CONFIG_INET6_ESP=y +CONFIG_INET6_IPCOMP=y +CONFIG_IPV6_MIP6=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_NETLABEL=y +CONFIG_NETFILTER=y +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=y +CONFIG_NF_CONNTRACK_FTP=y +CONFIG_NF_CONNTRACK_H323=y +CONFIG_NF_CONNTRACK_IRC=y +CONFIG_NF_CONNTRACK_NETBIOS_NS=y +CONFIG_NF_CONNTRACK_PPTP=y +CONFIG_NF_CONNTRACK_SANE=y +CONFIG_NF_CONNTRACK_TFTP=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y +CONFIG_NETFILTER_XT_TARGET_CONNMARK=y +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y +CONFIG_NETFILTER_XT_TARGET_MARK=y 
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y +CONFIG_NETFILTER_XT_TARGET_TPROXY=y +CONFIG_NETFILTER_XT_TARGET_TRACE=y +CONFIG_NETFILTER_XT_TARGET_SECMARK=y +CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y +CONFIG_NETFILTER_XT_MATCH_CONNMARK=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y +CONFIG_NETFILTER_XT_MATCH_HELPER=y +CONFIG_NETFILTER_XT_MATCH_IPRANGE=y +CONFIG_NETFILTER_XT_MATCH_LENGTH=y +CONFIG_NETFILTER_XT_MATCH_LIMIT=y +CONFIG_NETFILTER_XT_MATCH_MAC=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_POLICY=y +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y +CONFIG_NETFILTER_XT_MATCH_QTAGUID=y +CONFIG_NETFILTER_XT_MATCH_QUOTA=y +CONFIG_NETFILTER_XT_MATCH_QUOTA2=y +CONFIG_NETFILTER_XT_MATCH_SOCKET=y +CONFIG_NETFILTER_XT_MATCH_STATE=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y +CONFIG_NETFILTER_XT_MATCH_STRING=y +CONFIG_NETFILTER_XT_MATCH_TIME=y +CONFIG_NETFILTER_XT_MATCH_U32=y +CONFIG_NF_CONNTRACK_IPV4=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_MATCH_AH=y +CONFIG_IP_NF_MATCH_ECN=y +CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_RAW=y +CONFIG_IP_NF_SECURITY=y +CONFIG_IP_NF_ARPTABLES=y +CONFIG_IP_NF_ARPFILTER=y +CONFIG_IP_NF_ARP_MANGLE=y +CONFIG_NF_CONNTRACK_IPV6=y +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IP6_NF_MANGLE=y +CONFIG_IP6_NF_RAW=y +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_HTB=y +CONFIG_NET_CLS_U32=y +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_U32=y +CONFIG_NET_CLS_ACT=y +CONFIG_CFG80211=y +CONFIG_MAC80211=y +CONFIG_MAC80211_LEDS=y +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_DMA_CMA=y +CONFIG_CONNECTOR=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_VIRTIO_BLK=y +CONFIG_BLK_DEV_SD=y +CONFIG_BLK_DEV_SR=y +CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_CHR_DEV_SG=y +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_SPI_ATTRS=y +CONFIG_SCSI_ISCSI_ATTRS=y +# CONFIG_SCSI_LOWLEVEL is not set +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +CONFIG_ATA_PIIX=y +CONFIG_PATA_AMD=y +CONFIG_PATA_OLDPIIX=y +CONFIG_PATA_SCH=y +CONFIG_PATA_MPIIX=y +CONFIG_ATA_GENERIC=y +CONFIG_MD=y +CONFIG_BLK_DEV_MD=y +CONFIG_BLK_DEV_DM=y +CONFIG_DM_DEBUG=y +CONFIG_DM_CRYPT=y +CONFIG_DM_MIRROR=y +CONFIG_DM_ZERO=y +CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y +CONFIG_NETDEVICES=y +CONFIG_NETCONSOLE=y +CONFIG_TUN=y +CONFIG_VIRTIO_NET=y +CONFIG_BNX2=y +CONFIG_TIGON3=y +CONFIG_NET_TULIP=y +CONFIG_E100=y +CONFIG_E1000=y +CONFIG_E1000E=y +CONFIG_SKY2=y +CONFIG_NE2K_PCI=y +CONFIG_FORCEDETH=y +CONFIG_8139TOO=y +# CONFIG_8139TOO_PIO is not set +CONFIG_R8169=y +CONFIG_FDDI=y +CONFIG_PPP=y +CONFIG_PPP_BSDCOMP=y +CONFIG_PPP_DEFLATE=y +CONFIG_PPP_MPPE=y +CONFIG_PPPOLAC=y +CONFIG_PPPOPNS=y +CONFIG_USB_USBNET=y +CONFIG_INPUT_POLLDEV=y +# CONFIG_INPUT_MOUSEDEV_PSAUX is not set +CONFIG_INPUT_EVDEV=y +CONFIG_INPUT_KEYRESET=y +# CONFIG_KEYBOARD_ATKBD is not set +CONFIG_KEYBOARD_GOLDFISH_EVENTS=y +# CONFIG_INPUT_MOUSE is not set +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_XPAD=y +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_INPUT_TABLET=y +CONFIG_TABLET_USB_ACECAD=y +CONFIG_TABLET_USB_AIPTEK=y +CONFIG_TABLET_USB_GTCO=y +CONFIG_TABLET_USB_HANWANG=y +CONFIG_TABLET_USB_KBTAB=y +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_INPUT_MISC=y +CONFIG_INPUT_KEYCHORD=y +CONFIG_INPUT_UINPUT=y +CONFIG_INPUT_GPIO=y +# CONFIG_SERIO is not set +# CONFIG_VT is not set +# CONFIG_LEGACY_PTYS is 
not set +CONFIG_SERIAL_NONSTANDARD=y +# CONFIG_DEVMEM is not set +# CONFIG_DEVKMEM is not set +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_VIRTIO_CONSOLE=y +CONFIG_NVRAM=y +CONFIG_I2C_I801=y +CONFIG_BATTERY_GOLDFISH=y +CONFIG_WATCHDOG=y +CONFIG_MEDIA_SUPPORT=y +CONFIG_AGP=y +CONFIG_AGP_AMD64=y +CONFIG_AGP_INTEL=y +CONFIG_DRM=y +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_TILEBLITTING=y +CONFIG_FB_EFI=y +CONFIG_FB_GOLDFISH=y +CONFIG_BACKLIGHT_LCD_SUPPORT=y +# CONFIG_LCD_CLASS_DEVICE is not set +CONFIG_SOUND=y +CONFIG_SND=y +CONFIG_HIDRAW=y +CONFIG_UHID=y +CONFIG_HID_A4TECH=y +CONFIG_HID_ACRUX=y +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=y +CONFIG_HID_BELKIN=y +CONFIG_HID_CHERRY=y +CONFIG_HID_CHICONY=y +CONFIG_HID_PRODIKEYS=y +CONFIG_HID_CYPRESS=y +CONFIG_HID_DRAGONRISE=y +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=y +CONFIG_HID_ELECOM=y +CONFIG_HID_EZKEY=y +CONFIG_HID_HOLTEK=y +CONFIG_HID_KEYTOUCH=y +CONFIG_HID_KYE=y +CONFIG_HID_UCLOGIC=y +CONFIG_HID_WALTOP=y +CONFIG_HID_GYRATION=y +CONFIG_HID_TWINHAN=y +CONFIG_HID_KENSINGTON=y +CONFIG_HID_LCPOWER=y +CONFIG_HID_LOGITECH=y +CONFIG_HID_LOGITECH_DJ=y +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_HID_MAGICMOUSE=y +CONFIG_HID_MICROSOFT=y +CONFIG_HID_MONTEREY=y +CONFIG_HID_MULTITOUCH=y +CONFIG_HID_NTRIG=y +CONFIG_HID_ORTEK=y +CONFIG_HID_PANTHERLORD=y +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PETALYNX=y +CONFIG_HID_PICOLCD=y +CONFIG_HID_PRIMAX=y +CONFIG_HID_ROCCAT=y +CONFIG_HID_SAITEK=y +CONFIG_HID_SAMSUNG=y +CONFIG_HID_SONY=y +CONFIG_HID_SPEEDLINK=y +CONFIG_HID_SUNPLUS=y +CONFIG_HID_GREENASIA=y +CONFIG_GREENASIA_FF=y +CONFIG_HID_SMARTJOYPLUS=y +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=y +CONFIG_HID_TOPSEED=y +CONFIG_HID_THRUSTMASTER=y +CONFIG_HID_WACOM=y +CONFIG_HID_WIIMOTE=y +CONFIG_HID_ZEROPLUS=y +CONFIG_HID_ZYDACRON=y +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y +CONFIG_USB_MON=y +CONFIG_USB_EHCI_HCD=y +# CONFIG_USB_EHCI_TT_NEWSCHED is not set +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_UHCI_HCD=y +CONFIG_USB_PRINTER=y +CONFIG_USB_STORAGE=y +CONFIG_USB_OTG_WAKELOCK=y +CONFIG_EDAC=y +CONFIG_RTC_CLASS=y +# CONFIG_RTC_HCTOSYS is not set +CONFIG_DMADEVICES=y +CONFIG_VIRTIO_PCI=y +CONFIG_STAGING=y +CONFIG_ASHMEM=y +CONFIG_ANDROID_LOW_MEMORY_KILLER=y +CONFIG_SYNC=y +CONFIG_SW_SYNC=y +CONFIG_ION=y +CONFIG_GOLDFISH_AUDIO=y +CONFIG_SND_HDA_INTEL=y +CONFIG_GOLDFISH=y +CONFIG_GOLDFISH_PIPE=y +CONFIG_ANDROID=y +CONFIG_ANDROID_BINDER_IPC=y +CONFIG_ISCSI_IBFT_FIND=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +# CONFIG_PRINT_QUOTA_WARNING is not set +CONFIG_FUSE_FS=y +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_PROC_KCORE=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_HUGETLBFS=y +CONFIG_PSTORE=y +CONFIG_PSTORE_CONSOLE=y +CONFIG_PSTORE_RAM=y +# CONFIG_NETWORK_FILESYSTEMS is not set +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ASCII=y +CONFIG_NLS_ISO8859_1=y +CONFIG_NLS_UTF8=y +CONFIG_PRINTK_TIME=y +CONFIG_DEBUG_INFO=y +# CONFIG_ENABLE_WARN_DEPRECATED is not set +# CONFIG_ENABLE_MUST_CHECK is not set +# CONFIG_UNUSED_SYMBOLS is not set +CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_MEMORY_INIT=y +CONFIG_PANIC_TIMEOUT=5 +CONFIG_SCHEDSTATS=y +CONFIG_TIMER_STATS=y +CONFIG_SCHED_TRACER=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_PROVIDE_OHCI1394_DMA_INIT=y +CONFIG_KEYS=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y +CONFIG_CRYPTO_TWOFISH=y +CONFIG_ASYMMETRIC_KEY_TYPE=y 
+CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y
+CONFIG_X509_CERTIFICATE_PARSER=y
+CONFIG_PKCS7_MESSAGE_PARSER=y
+CONFIG_PKCS7_TEST_KEY=y
+# CONFIG_VIRTUALIZATION is not set
+CONFIG_CRC_T10DIF=y

From 2c8ebe3687301b2a94bb551d6d1a6570e2b407d2 Mon Sep 17 00:00:00 2001
From: Lingfeng Yang
Date: Mon, 13 Jun 2016 09:24:07 -0700
Subject: [PATCH 595/607] ANDROID: goldfish: Add goldfish sync driver

This is a kernel driver for controlling the Goldfish sync device on the
host. It is used to maintain ordering in critical OpenGL state changes
while using GPU emulation.

The guest open()'s the Goldfish sync device to create a context for
possibly maintaining a sync timeline and fences. There is a 1:1
correspondence between such sync contexts and OpenGL contexts in the
guest that need synchronization (which, in turn, is anything involving
swapping buffers, SurfaceFlinger, or Hardware Composer).

The ioctl QUEUE_WORK takes a handle to a sync object and attempts to
tell the host GPU to wait on the sync object and deal with signaling it.
It possibly outputs a fence FD that the Android systems which use fence
FDs (GLConsumer, SurfaceFlinger, anything employing
EGL_ANDROID_native_fence_sync) can wait on.

Design decisions and work log:
- New approach is to have the guest issue ioctls that trigger a host
  wait, after which the host increments the timeline.
- We need the host's sync object handle and sync thread handle as the
  necessary information for that.
- ioctl()s from the guest can work simultaneously with the interrupt
  handling for commands from the host.
- optimization: don't write back on timeline inc
- Change spin lock design to be much more lightweight; do not call
  sw_sync functions or loop too long anywhere.
- Send read/write commands in batches to minimize guest/host
  transitions.
- robustness: BUG if we would overrun the cmd buffer.
- robustness: return fd -1 if we cannot get an unused fd.
- correctness: remove global mutex
- cleanup pass done, incl. but not limited to:
  - removal of clear_upto and
  - switching to devm_***

This is part of a sequential, multi-CL change:

external/qemu:
https://android-review.googlesource.com/239442 <- host-side device's host interface
https://android-review.googlesource.com/221593
https://android-review.googlesource.com/248563
https://android-review.googlesource.com/248564
https://android-review.googlesource.com/223032

external/qemu-android:
https://android-review.googlesource.com/238790 <- host-side device implementation

kernel/goldfish:
https://android-review.googlesource.com/232631 <- needed
https://android-review.googlesource.com/238399 <- this CL

Also squash the following bug fixes from the android-goldfish-3.18 branch.
b44d486 goldfish_sync: provide a signal to detect reboot ad1f597 goldfish_sync: fix stalls by avoiding early kfree() de208e8 [goldfish-sync] Fix possible race between kernel and user space Change-Id: I22f8a0e824717a7e751b1b0e1b461455501502b6 --- arch/x86/configs/i386_ranchu_defconfig | 1 + arch/x86/configs/x86_64_ranchu_defconfig | 1 + drivers/staging/goldfish/Kconfig | 6 + drivers/staging/goldfish/Makefile | 5 + drivers/staging/goldfish/goldfish_sync.c | 987 +++++++++++++++++++++++ 5 files changed, 1000 insertions(+) create mode 100644 drivers/staging/goldfish/goldfish_sync.c diff --git a/arch/x86/configs/i386_ranchu_defconfig b/arch/x86/configs/i386_ranchu_defconfig index b0e4e0ed4b11..0206eb8cfb61 100644 --- a/arch/x86/configs/i386_ranchu_defconfig +++ b/arch/x86/configs/i386_ranchu_defconfig @@ -363,6 +363,7 @@ CONFIG_SYNC=y CONFIG_SW_SYNC=y CONFIG_ION=y CONFIG_GOLDFISH_AUDIO=y +CONFIG_GOLDFISH_SYNC=y CONFIG_SND_HDA_INTEL=y CONFIG_GOLDFISH=y CONFIG_GOLDFISH_PIPE=y diff --git a/arch/x86/configs/x86_64_ranchu_defconfig b/arch/x86/configs/x86_64_ranchu_defconfig index 8dae21ed3ede..dd389774bacb 100644 --- a/arch/x86/configs/x86_64_ranchu_defconfig +++ b/arch/x86/configs/x86_64_ranchu_defconfig @@ -360,6 +360,7 @@ CONFIG_SYNC=y CONFIG_SW_SYNC=y CONFIG_ION=y CONFIG_GOLDFISH_AUDIO=y +CONFIG_GOLDFISH_SYNC=y CONFIG_SND_HDA_INTEL=y CONFIG_GOLDFISH=y CONFIG_GOLDFISH_PIPE=y diff --git a/drivers/staging/goldfish/Kconfig b/drivers/staging/goldfish/Kconfig index 4e094602437c..c579141a7bed 100644 --- a/drivers/staging/goldfish/Kconfig +++ b/drivers/staging/goldfish/Kconfig @@ -4,6 +4,12 @@ config GOLDFISH_AUDIO ---help--- Emulated audio channel for the Goldfish Android Virtual Device +config GOLDFISH_SYNC + tristate "Goldfish AVD Sync Driver" + depends on GOLDFISH + ---help--- + Emulated sync fences for the Goldfish Android Virtual Device + config MTD_GOLDFISH_NAND tristate "Goldfish NAND device" depends on GOLDFISH diff --git a/drivers/staging/goldfish/Makefile b/drivers/staging/goldfish/Makefile index dec34ad58162..0cf525588210 100644 --- a/drivers/staging/goldfish/Makefile +++ b/drivers/staging/goldfish/Makefile @@ -4,3 +4,8 @@ obj-$(CONFIG_GOLDFISH_AUDIO) += goldfish_audio.o obj-$(CONFIG_MTD_GOLDFISH_NAND) += goldfish_nand.o + +# and sync + +ccflags-y := -Idrivers/staging/android +obj-$(CONFIG_GOLDFISH_SYNC) += goldfish_sync.o diff --git a/drivers/staging/goldfish/goldfish_sync.c b/drivers/staging/goldfish/goldfish_sync.c new file mode 100644 index 000000000000..ba8def29901e --- /dev/null +++ b/drivers/staging/goldfish/goldfish_sync.c @@ -0,0 +1,987 @@ +/* + * Copyright (C) 2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "sw_sync.h" +#include "sync.h" + +#define ERR(...) printk(KERN_ERR __VA_ARGS__); + +#define INFO(...) printk(KERN_INFO __VA_ARGS__); + +#define DPRINT(...) 
pr_debug(__VA_ARGS__);
+
+#define DTRACE() DPRINT("%s: enter", __func__)
+
+/* The Goldfish sync driver is designed to provide an interface
+ * between the underlying host's sync device and the kernel's
+ * sw_sync.
+ * The purpose of the device/driver is to enable lightweight
+ * creation and signaling of timelines and fences
+ * in order to synchronize the guest with host-side graphics events.
+ *
+ * Each time the interrupt trips, the driver
+ * may perform a sw_sync operation.
+ */
+
+/* The operations are: */
+
+/* Ready signal - used to mark when irq should lower */
+#define CMD_SYNC_READY 0
+
+/* Create a new timeline. writes timeline handle */
+#define CMD_CREATE_SYNC_TIMELINE 1
+
+/* Create a fence object. reads timeline handle and time argument.
+ * Writes fence fd to the SYNC_REG_HANDLE register. */
+#define CMD_CREATE_SYNC_FENCE 2
+
+/* Increments timeline. reads timeline handle and time argument */
+#define CMD_SYNC_TIMELINE_INC 3
+
+/* Destroys a timeline. reads timeline handle */
+#define CMD_DESTROY_SYNC_TIMELINE 4
+
+/* Starts a wait on the host with
+ * the given glsync object and sync thread handle. */
+#define CMD_TRIGGER_HOST_WAIT 5
+
+/* The register layout is: */
+
+#define SYNC_REG_BATCH_COMMAND 0x00 /* host->guest batch commands */
+#define SYNC_REG_BATCH_GUESTCOMMAND 0x04 /* guest->host batch commands */
+#define SYNC_REG_BATCH_COMMAND_ADDR 0x08 /* communicate physical address of host->guest batch commands */
+#define SYNC_REG_BATCH_COMMAND_ADDR_HIGH 0x0c /* 64-bit part */
+#define SYNC_REG_BATCH_GUESTCOMMAND_ADDR 0x10 /* communicate physical address of guest->host commands */
+#define SYNC_REG_BATCH_GUESTCOMMAND_ADDR_HIGH 0x14 /* 64-bit part */
+#define SYNC_REG_INIT 0x18 /* signals that the device has been probed */
+
+/* There is an ioctl associated with goldfish sync driver.
+ * Make it conflict with ioctls that are not likely to be used
+ * in the emulator.
+ *
+ * '@' 00-0F linux/radeonfb.h conflict!
+ * '@' 00-0F drivers/video/aty/aty128fb.c conflict!
+ */
+#define GOLDFISH_SYNC_IOC_MAGIC '@'
+
+#define GOLDFISH_SYNC_IOC_QUEUE_WORK _IOWR(GOLDFISH_SYNC_IOC_MAGIC, 0, struct goldfish_sync_ioctl_info)
+
+/* The above definitions (command codes, register layout, ioctl definitions)
+ * need to be in sync with the following files:
+ *
+ * Host-side (emulator):
+ * external/qemu/android/emulation/goldfish_sync.h
+ * external/qemu-android/hw/misc/goldfish_sync.c
+ *
+ * Guest-side (system image):
+ * device/generic/goldfish-opengl/system/egl/goldfish_sync.h
+ * device/generic/goldfish/ueventd.ranchu.rc
+ * platform/build/target/board/generic/sepolicy/file_contexts
+ */
+struct goldfish_sync_hostcmd {
+        /* sorted for alignment */
+        uint64_t handle;
+        uint64_t hostcmd_handle;
+        uint32_t cmd;
+        uint32_t time_arg;
+};
+
+struct goldfish_sync_guestcmd {
+        uint64_t host_command; /* uint64_t for alignment */
+        uint64_t glsync_handle;
+        uint64_t thread_handle;
+        uint64_t guest_timeline_handle;
+};
+
+#define GOLDFISH_SYNC_MAX_CMDS 64
+
+struct goldfish_sync_state {
+        char __iomem *reg_base;
+        int irq;
+
+        /* Spinlock protects |to_do| / |to_do_end|. */
+        spinlock_t lock;
+        /* |mutex_lock| protects all concurrent access
+         * to timelines for both kernel and user space. */
+        struct mutex mutex_lock;
+
+        /* Buffer holding commands issued from host. */
+        struct goldfish_sync_hostcmd to_do[GOLDFISH_SYNC_MAX_CMDS];
+        uint32_t to_do_end;
+
+        /* Addresses for the reading or writing
+         * of individual commands. The host can directly write
+         * to |batch_hostcmd| (and then this driver immediately
+         * copies contents to |to_do|). This driver either replies
+         * through |batch_hostcmd| or simply issues a
+         * guest->host command through |batch_guestcmd|.
+         */
+        struct goldfish_sync_hostcmd *batch_hostcmd;
+        struct goldfish_sync_guestcmd *batch_guestcmd;
+
+        /* Used to give this struct itself to a work queue
+         * function for executing actual sync commands. */
+        struct work_struct work_item;
+};
+
+static struct goldfish_sync_state global_sync_state[1];
+
+struct goldfish_sync_timeline_obj {
+        struct sw_sync_timeline *sw_sync_tl;
+        uint32_t current_time;
+        /* We need to be careful about when we deallocate
+         * this |goldfish_sync_timeline_obj| struct.
+         * In order to ensure proper cleanup, we need to
+         * consider the triggered host-side wait that may
+         * still be in flight when the guest close()'s a
+         * goldfish_sync device's sync context fd (and
+         * destroys the |sw_sync_tl| field above).
+         * The host-side wait may raise an IRQ
+         * and tell the kernel to increment the timeline _after_
+         * the |sw_sync_tl| has already been set to null.
+         *
+         * From observations on OpenGL apps and CTS tests, this
+         * happens at some very low probability upon context
+         * destruction or process close, but it does happen
+         * and it needs to be handled properly. Otherwise,
+         * if we clean up the surrounding |goldfish_sync_timeline_obj|
+         * too early, any |handle| field of any host->guest command
+         * might not even point to a null |sw_sync_tl| field,
+         * but to garbage memory or even a reclaimed |sw_sync_tl|.
+         * If we do not count such "pending waits" and kfree the object
+         * immediately upon |goldfish_sync_timeline_destroy|,
+         * we might get mysterious RCU stalls after running a long
+         * time because the garbage memory that is being read
+         * happens to be interpretable as a |spinlock_t| struct
+         * that is currently in the locked state.
+         *
+         * To track when to free the |goldfish_sync_timeline_obj|
+         * itself, we maintain a kref.
+         * The kref essentially counts the timeline itself plus
+         * the number of waits in flight. kref_init/kref_put
+         * are issued on
+         * |goldfish_sync_timeline_create|/|goldfish_sync_timeline_destroy|
+         * and kref_get/kref_put are issued on
+         * |goldfish_sync_fence_create|/|goldfish_sync_timeline_inc|.
+         *
+         * The timeline is destroyed after the reference count
+         * reaches zero, which would happen after
+         * |goldfish_sync_timeline_destroy| and all pending
+         * |goldfish_sync_timeline_inc|'s are fulfilled.
+         *
+         * NOTE (1): We assume that |fence_create| and
+         * |timeline_inc| calls are 1:1, otherwise the kref scheme
+         * will not work. This is a valid assumption as long
+         * as the host-side virtual device implementation
+         * does not insert any timeline increments
+         * that we did not trigger from here.
+         *
+         * NOTE (2): The use of kref by itself requires no locks,
+         * but this does not mean everything works without locks.
+         * Related timeline operations do require a lock of some sort,
+         * or at least are not proven to work without it.
+         * In particular, we assume that all the operations
+         * done on the |kref| field above are done in contexts where
+         * |global_sync_state->mutex_lock| is held. Do not
+         * remove that lock until everything is proven to work
+         * without it!!! */
+        struct kref kref;
+};
+
+/* We will call |delete_timeline_obj| when the last reference count
+ * of the kref is decremented. This deletes the sw_sync
+ * timeline object along with the wrapper itself.
*/ +static void delete_timeline_obj(struct kref* kref) { + struct goldfish_sync_timeline_obj* obj = + container_of(kref, struct goldfish_sync_timeline_obj, kref); + + sync_timeline_destroy(&obj->sw_sync_tl->obj); + obj->sw_sync_tl = NULL; + kfree(obj); +} + +static uint64_t gensym_ctr; +static void gensym(char *dst) +{ + sprintf(dst, "goldfish_sync:gensym:%llu", gensym_ctr); + gensym_ctr++; +} + +/* |goldfish_sync_timeline_create| assumes that |global_sync_state->mutex_lock| + * is held. */ +static struct goldfish_sync_timeline_obj* +goldfish_sync_timeline_create(void) +{ + + char timeline_name[256]; + struct sw_sync_timeline *res_sync_tl = NULL; + struct goldfish_sync_timeline_obj *res; + + DTRACE(); + + gensym(timeline_name); + + res_sync_tl = sw_sync_timeline_create(timeline_name); + if (!res_sync_tl) { + ERR("Failed to create sw_sync timeline."); + return NULL; + } + + res = kzalloc(sizeof(struct goldfish_sync_timeline_obj), GFP_KERNEL); + res->sw_sync_tl = res_sync_tl; + res->current_time = 0; + kref_init(&res->kref); + + DPRINT("new timeline_obj=0x%p", res); + return res; +} + +/* |goldfish_sync_fence_create| assumes that |global_sync_state->mutex_lock| + * is held. */ +static int +goldfish_sync_fence_create(struct goldfish_sync_timeline_obj *obj, + uint32_t val) +{ + + int fd; + char fence_name[256]; + struct sync_pt *syncpt = NULL; + struct sync_fence *sync_obj = NULL; + struct sw_sync_timeline *tl; + + DTRACE(); + + if (!obj) return -1; + + tl = obj->sw_sync_tl; + + syncpt = sw_sync_pt_create(tl, val); + if (!syncpt) { + ERR("could not create sync point! " + "sync_timeline=0x%p val=%d", + tl, val); + return -1; + } + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) { + ERR("could not get unused fd for sync fence. " + "errno=%d", fd); + goto err_cleanup_pt; + } + + gensym(fence_name); + + sync_obj = sync_fence_create(fence_name, syncpt); + if (!sync_obj) { + ERR("could not create sync fence! " + "sync_timeline=0x%p val=%d sync_pt=0x%p", + tl, val, syncpt); + goto err_cleanup_fd_pt; + } + + DPRINT("installing sync fence into fd %d sync_obj=0x%p", fd, sync_obj); + sync_fence_install(sync_obj, fd); + kref_get(&obj->kref); + + return fd; + +err_cleanup_fd_pt: + put_unused_fd(fd); +err_cleanup_pt: + sync_pt_free(syncpt); + return -1; +} + +/* |goldfish_sync_timeline_inc| assumes that |global_sync_state->mutex_lock| + * is held. */ +static void +goldfish_sync_timeline_inc(struct goldfish_sync_timeline_obj *obj, uint32_t inc) +{ + DTRACE(); + /* Just give up if someone else nuked the timeline. + * Whoever it was won't care that it doesn't get signaled. */ + if (!obj) return; + + DPRINT("timeline_obj=0x%p", obj); + sw_sync_timeline_inc(obj->sw_sync_tl, inc); + DPRINT("incremented timeline. increment max_time"); + obj->current_time += inc; + + /* Here, we will end up deleting the timeline object if it + * turns out that this call was a pending increment after + * |goldfish_sync_timeline_destroy| was called. */ + kref_put(&obj->kref, delete_timeline_obj); + DPRINT("done"); +} + +/* |goldfish_sync_timeline_destroy| assumes + * that |global_sync_state->mutex_lock| is held. 
*/ +static void +goldfish_sync_timeline_destroy(struct goldfish_sync_timeline_obj *obj) +{ + DTRACE(); + /* See description of |goldfish_sync_timeline_obj| for why we + * should not immediately destroy |obj| */ + kref_put(&obj->kref, delete_timeline_obj); +} + +static inline void +goldfish_sync_cmd_queue(struct goldfish_sync_state *sync_state, + uint32_t cmd, + uint64_t handle, + uint32_t time_arg, + uint64_t hostcmd_handle) +{ + struct goldfish_sync_hostcmd *to_add; + + DTRACE(); + + BUG_ON(sync_state->to_do_end == GOLDFISH_SYNC_MAX_CMDS); + + to_add = &sync_state->to_do[sync_state->to_do_end]; + + to_add->cmd = cmd; + to_add->handle = handle; + to_add->time_arg = time_arg; + to_add->hostcmd_handle = hostcmd_handle; + + sync_state->to_do_end += 1; +} + +static inline void +goldfish_sync_hostcmd_reply(struct goldfish_sync_state *sync_state, + uint32_t cmd, + uint64_t handle, + uint32_t time_arg, + uint64_t hostcmd_handle) +{ + unsigned long irq_flags; + struct goldfish_sync_hostcmd *batch_hostcmd = + sync_state->batch_hostcmd; + + DTRACE(); + + spin_lock_irqsave(&sync_state->lock, irq_flags); + + batch_hostcmd->cmd = cmd; + batch_hostcmd->handle = handle; + batch_hostcmd->time_arg = time_arg; + batch_hostcmd->hostcmd_handle = hostcmd_handle; + writel(0, sync_state->reg_base + SYNC_REG_BATCH_COMMAND); + + spin_unlock_irqrestore(&sync_state->lock, irq_flags); +} + +static inline void +goldfish_sync_send_guestcmd(struct goldfish_sync_state *sync_state, + uint32_t cmd, + uint64_t glsync_handle, + uint64_t thread_handle, + uint64_t timeline_handle) +{ + unsigned long irq_flags; + struct goldfish_sync_guestcmd *batch_guestcmd = + sync_state->batch_guestcmd; + + DTRACE(); + + spin_lock_irqsave(&sync_state->lock, irq_flags); + + batch_guestcmd->host_command = (uint64_t)cmd; + batch_guestcmd->glsync_handle = (uint64_t)glsync_handle; + batch_guestcmd->thread_handle = (uint64_t)thread_handle; + batch_guestcmd->guest_timeline_handle = (uint64_t)timeline_handle; + writel(0, sync_state->reg_base + SYNC_REG_BATCH_GUESTCOMMAND); + + spin_unlock_irqrestore(&sync_state->lock, irq_flags); +} + +/* |goldfish_sync_interrupt| handles IRQ raises from the virtual device. + * In the context of OpenGL, this interrupt will fire whenever we need + * to signal a fence fd in the guest, with the command + * |CMD_SYNC_TIMELINE_INC|. + * However, because this function will be called in an interrupt context, + * it is necessary to do the actual work of signaling off of interrupt context. + * The shared work queue is used for this purpose. At the end when + * all pending commands are intercepted by the interrupt handler, + * we call |schedule_work|, which will later run the actual + * desired sync command in |goldfish_sync_work_item_fn|. 
+ */ +static irqreturn_t goldfish_sync_interrupt(int irq, void *dev_id) +{ + + struct goldfish_sync_state *sync_state = dev_id; + + uint32_t nextcmd; + uint32_t command_r; + uint64_t handle_rw; + uint32_t time_r; + uint64_t hostcmd_handle_rw; + + int count = 0; + + DTRACE(); + + sync_state = dev_id; + + spin_lock(&sync_state->lock); + + for (;;) { + + readl(sync_state->reg_base + SYNC_REG_BATCH_COMMAND); + nextcmd = sync_state->batch_hostcmd->cmd; + + if (nextcmd == 0) + break; + + command_r = nextcmd; + handle_rw = sync_state->batch_hostcmd->handle; + time_r = sync_state->batch_hostcmd->time_arg; + hostcmd_handle_rw = sync_state->batch_hostcmd->hostcmd_handle; + + goldfish_sync_cmd_queue( + sync_state, + command_r, + handle_rw, + time_r, + hostcmd_handle_rw); + + count++; + } + + spin_unlock(&sync_state->lock); + + schedule_work(&sync_state->work_item); + + return (count == 0) ? IRQ_NONE : IRQ_HANDLED; +} + +/* |goldfish_sync_work_item_fn| does the actual work of servicing + * host->guest sync commands. This function is triggered whenever + * the IRQ for the goldfish sync device is raised. Once it starts + * running, it grabs the contents of the buffer containing the + * commands it needs to execute (there may be multiple, because + * our IRQ is active high and not edge triggered), and then + * runs all of them one after the other. + */ +static void goldfish_sync_work_item_fn(struct work_struct *input) +{ + + struct goldfish_sync_state *sync_state; + int sync_fence_fd; + + struct goldfish_sync_timeline_obj *timeline; + uint64_t timeline_ptr; + + uint64_t hostcmd_handle; + + uint32_t cmd; + uint64_t handle; + uint32_t time_arg; + + struct goldfish_sync_hostcmd *todo; + uint32_t todo_end; + + unsigned long irq_flags; + + struct goldfish_sync_hostcmd to_run[GOLDFISH_SYNC_MAX_CMDS]; + uint32_t i = 0; + + sync_state = container_of(input, struct goldfish_sync_state, work_item); + + mutex_lock(&sync_state->mutex_lock); + + spin_lock_irqsave(&sync_state->lock, irq_flags); { + + todo_end = sync_state->to_do_end; + + DPRINT("num sync todos: %u", sync_state->to_do_end); + + for (i = 0; i < todo_end; i++) + to_run[i] = sync_state->to_do[i]; + + /* We expect that commands will come in at a slow enough rate + * so that incoming items will not be more than + * GOLDFISH_SYNC_MAX_CMDS. + * + * This is because the way the sync device is used, + * it's only for managing buffer data transfers per frame, + * with a sequential dependency between putting things in + * to_do and taking them out. Once a set of commands is + * queued up in to_do, the user of the device waits for + * them to be processed before queuing additional commands, + * which limits the rate at which commands come in + * to the rate at which we take them out here. + * + * We also don't expect more than MAX_CMDS to be issued + * at once; there is a correspondence between + * which buffers need swapping to the (display / buffer queue) + * to particular commands, and we don't expect there to be + * enough display or buffer queues in operation at once + * to overrun GOLDFISH_SYNC_MAX_CMDS. 
+ */ + sync_state->to_do_end = 0; + + } spin_unlock_irqrestore(&sync_state->lock, irq_flags); + + for (i = 0; i < todo_end; i++) { + DPRINT("todo index: %u", i); + + todo = &to_run[i]; + + cmd = todo->cmd; + + handle = (uint64_t)todo->handle; + time_arg = todo->time_arg; + hostcmd_handle = (uint64_t)todo->hostcmd_handle; + + DTRACE(); + + timeline = (struct goldfish_sync_timeline_obj *)(uintptr_t)handle; + + switch (cmd) { + case CMD_SYNC_READY: + break; + case CMD_CREATE_SYNC_TIMELINE: + DPRINT("exec CMD_CREATE_SYNC_TIMELINE: " + "handle=0x%llx time_arg=%d", + handle, time_arg); + timeline = goldfish_sync_timeline_create(); + timeline_ptr = (uintptr_t)timeline; + goldfish_sync_hostcmd_reply(sync_state, CMD_CREATE_SYNC_TIMELINE, + timeline_ptr, + 0, + hostcmd_handle); + DPRINT("sync timeline created: %p", timeline); + break; + case CMD_CREATE_SYNC_FENCE: + DPRINT("exec CMD_CREATE_SYNC_FENCE: " + "handle=0x%llx time_arg=%d", + handle, time_arg); + sync_fence_fd = goldfish_sync_fence_create(timeline, time_arg); + goldfish_sync_hostcmd_reply(sync_state, CMD_CREATE_SYNC_FENCE, + sync_fence_fd, + 0, + hostcmd_handle); + break; + case CMD_SYNC_TIMELINE_INC: + DPRINT("exec CMD_SYNC_TIMELINE_INC: " + "handle=0x%llx time_arg=%d", + handle, time_arg); + goldfish_sync_timeline_inc(timeline, time_arg); + break; + case CMD_DESTROY_SYNC_TIMELINE: + DPRINT("exec CMD_DESTROY_SYNC_TIMELINE: " + "handle=0x%llx time_arg=%d", + handle, time_arg); + goldfish_sync_timeline_destroy(timeline); + break; + } + DPRINT("Done executing sync command"); + } + mutex_unlock(&sync_state->mutex_lock); +} + +/* Guest-side interface: file operations */ + +/* Goldfish sync context and ioctl info. + * + * When a sync context is created by open()-ing the goldfish sync device, we + * create a sync context (|goldfish_sync_context|). + * + * Currently, the only data required to track is the sync timeline itself + * along with the current time, which are all packed up in the + * |goldfish_sync_timeline_obj| field. We use a |goldfish_sync_context| + * as the filp->private_data. + * + * Next, when a sync context user requests that work be queued and a fence + * fd provided, we use the |goldfish_sync_ioctl_info| struct, which holds + * information about which host handles to touch for this particular + * queue-work operation. We need to know about the host-side sync thread + * and the particular host-side GLsync object. We also possibly write out + * a file descriptor. 
+ */ +struct goldfish_sync_context { + struct goldfish_sync_timeline_obj *timeline; +}; + +struct goldfish_sync_ioctl_info { + uint64_t host_glsync_handle_in; + uint64_t host_syncthread_handle_in; + int fence_fd_out; +}; + +static int goldfish_sync_open(struct inode *inode, struct file *file) +{ + + struct goldfish_sync_context *sync_context; + + DTRACE(); + + mutex_lock(&global_sync_state->mutex_lock); + + sync_context = kzalloc(sizeof(struct goldfish_sync_context), GFP_KERNEL); + + if (sync_context == NULL) { + ERR("Creation of goldfish sync context failed!"); + mutex_unlock(&global_sync_state->mutex_lock); + return -ENOMEM; + } + + sync_context->timeline = NULL; + + file->private_data = sync_context; + + DPRINT("successfully create a sync context @0x%p", sync_context); + + mutex_unlock(&global_sync_state->mutex_lock); + + return 0; +} + +static int goldfish_sync_release(struct inode *inode, struct file *file) +{ + + struct goldfish_sync_context *sync_context; + + DTRACE(); + + mutex_lock(&global_sync_state->mutex_lock); + + sync_context = file->private_data; + + if (sync_context->timeline) + goldfish_sync_timeline_destroy(sync_context->timeline); + + sync_context->timeline = NULL; + + kfree(sync_context); + + mutex_unlock(&global_sync_state->mutex_lock); + + return 0; +} + +/* |goldfish_sync_ioctl| is the guest-facing interface of goldfish sync + * and is used in conjunction with eglCreateSyncKHR to queue up the + * actual work of waiting for the EGL sync command to complete, + * possibly returning a fence fd to the guest. + */ +static long goldfish_sync_ioctl(struct file *file, + unsigned int cmd, + unsigned long arg) +{ + struct goldfish_sync_context *sync_context_data; + struct goldfish_sync_timeline_obj *timeline; + int fd_out; + struct goldfish_sync_ioctl_info ioctl_data; + + DTRACE(); + + sync_context_data = file->private_data; + fd_out = -1; + + switch (cmd) { + case GOLDFISH_SYNC_IOC_QUEUE_WORK: + + DPRINT("exec GOLDFISH_SYNC_IOC_QUEUE_WORK"); + + mutex_lock(&global_sync_state->mutex_lock); + + if (copy_from_user(&ioctl_data, + (void __user *)arg, + sizeof(ioctl_data))) { + ERR("Failed to copy memory for ioctl_data from user."); + mutex_unlock(&global_sync_state->mutex_lock); + return -EFAULT; + } + + if (ioctl_data.host_syncthread_handle_in == 0) { + DPRINT("Error: zero host syncthread handle!!!"); + mutex_unlock(&global_sync_state->mutex_lock); + return -EFAULT; + } + + if (!sync_context_data->timeline) { + DPRINT("no timeline yet, create one."); + sync_context_data->timeline = goldfish_sync_timeline_create(); + DPRINT("timeline: 0x%p", &sync_context_data->timeline); + } + + timeline = sync_context_data->timeline; + fd_out = goldfish_sync_fence_create(timeline, + timeline->current_time + 1); + DPRINT("Created fence with fd %d and current time %u (timeline: 0x%p)", + fd_out, + sync_context_data->timeline->current_time + 1, + sync_context_data->timeline); + + ioctl_data.fence_fd_out = fd_out; + + if (copy_to_user((void __user *)arg, + &ioctl_data, + sizeof(ioctl_data))) { + DPRINT("Error, could not copy to user!!!"); + + sys_close(fd_out); + /* We won't be doing an increment, kref_put immediately. */ + kref_put(&timeline->kref, delete_timeline_obj); + mutex_unlock(&global_sync_state->mutex_lock); + return -EFAULT; + } + + /* We are now about to trigger a host-side wait; + * accumulate on |pending_waits|. 
*/ + goldfish_sync_send_guestcmd(global_sync_state, + CMD_TRIGGER_HOST_WAIT, + ioctl_data.host_glsync_handle_in, + ioctl_data.host_syncthread_handle_in, + (uint64_t)(uintptr_t)(sync_context_data->timeline)); + + mutex_unlock(&global_sync_state->mutex_lock); + return 0; + default: + return -ENOTTY; + } +} + +static const struct file_operations goldfish_sync_fops = { + .owner = THIS_MODULE, + .open = goldfish_sync_open, + .release = goldfish_sync_release, + .unlocked_ioctl = goldfish_sync_ioctl, + .compat_ioctl = goldfish_sync_ioctl, +}; + +static struct miscdevice goldfish_sync_device = { + .name = "goldfish_sync", + .fops = &goldfish_sync_fops, +}; + + +static bool setup_verify_batch_cmd_addr(struct goldfish_sync_state *sync_state, + void *batch_addr, + uint32_t addr_offset, + uint32_t addr_offset_high) +{ + uint64_t batch_addr_phys; + uint32_t batch_addr_phys_test_lo; + uint32_t batch_addr_phys_test_hi; + + if (!batch_addr) { + ERR("Could not use batch command address!"); + return false; + } + + batch_addr_phys = virt_to_phys(batch_addr); + writel((uint32_t)(batch_addr_phys), + sync_state->reg_base + addr_offset); + writel((uint32_t)(batch_addr_phys >> 32), + sync_state->reg_base + addr_offset_high); + + batch_addr_phys_test_lo = + readl(sync_state->reg_base + addr_offset); + batch_addr_phys_test_hi = + readl(sync_state->reg_base + addr_offset_high); + + if (virt_to_phys(batch_addr) != + (((uint64_t)batch_addr_phys_test_hi << 32) | + batch_addr_phys_test_lo)) { + ERR("Invalid batch command address!"); + return false; + } + + return true; +} + +int goldfish_sync_probe(struct platform_device *pdev) +{ + struct resource *ioresource; + struct goldfish_sync_state *sync_state = global_sync_state; + int status; + + DTRACE(); + + sync_state->to_do_end = 0; + + spin_lock_init(&sync_state->lock); + mutex_init(&sync_state->mutex_lock); + + platform_set_drvdata(pdev, sync_state); + + ioresource = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (ioresource == NULL) { + ERR("platform_get_resource failed"); + return -ENODEV; + } + + sync_state->reg_base = devm_ioremap(&pdev->dev, ioresource->start, PAGE_SIZE); + if (sync_state->reg_base == NULL) { + ERR("Could not ioremap"); + return -ENOMEM; + } + + sync_state->irq = platform_get_irq(pdev, 0); + if (sync_state->irq < 0) { + ERR("Could not platform_get_irq"); + return -ENODEV; + } + + status = devm_request_irq(&pdev->dev, + sync_state->irq, + goldfish_sync_interrupt, + IRQF_SHARED, + pdev->name, + sync_state); + if (status) { + ERR("request_irq failed"); + return -ENODEV; + } + + INIT_WORK(&sync_state->work_item, + goldfish_sync_work_item_fn); + + misc_register(&goldfish_sync_device); + + /* Obtain addresses for batch send/recv of commands. 
*/ + { + struct goldfish_sync_hostcmd *batch_addr_hostcmd; + struct goldfish_sync_guestcmd *batch_addr_guestcmd; + + batch_addr_hostcmd = devm_kzalloc(&pdev->dev, sizeof(struct goldfish_sync_hostcmd), + GFP_KERNEL); + batch_addr_guestcmd = devm_kzalloc(&pdev->dev, sizeof(struct goldfish_sync_guestcmd), + GFP_KERNEL); + + if (!setup_verify_batch_cmd_addr(sync_state, + batch_addr_hostcmd, + SYNC_REG_BATCH_COMMAND_ADDR, + SYNC_REG_BATCH_COMMAND_ADDR_HIGH)) { + ERR("goldfish_sync: Could not setup batch command address"); + return -ENODEV; + } + + if (!setup_verify_batch_cmd_addr(sync_state, + batch_addr_guestcmd, + SYNC_REG_BATCH_GUESTCOMMAND_ADDR, + SYNC_REG_BATCH_GUESTCOMMAND_ADDR_HIGH)) { + ERR("goldfish_sync: Could not setup batch guest command address"); + return -ENODEV; + } + + sync_state->batch_hostcmd = batch_addr_hostcmd; + sync_state->batch_guestcmd = batch_addr_guestcmd; + } + + INFO("goldfish_sync: Initialized goldfish sync device"); + + writel(0, sync_state->reg_base + SYNC_REG_INIT); + + return 0; +} + +static int goldfish_sync_remove(struct platform_device *pdev) +{ + struct goldfish_sync_state *sync_state = global_sync_state; + + DTRACE(); + + misc_deregister(&goldfish_sync_device); + memset(sync_state, 0, sizeof(struct goldfish_sync_state)); + return 0; +} + +static const struct of_device_id goldfish_sync_of_match[] = { + { .compatible = "google,goldfish-sync", }, + {}, +}; +MODULE_DEVICE_TABLE(of, goldfish_sync_of_match); + +static const struct acpi_device_id goldfish_sync_acpi_match[] = { + { "GFSH0006", 0 }, + { }, +}; + +MODULE_DEVICE_TABLE(acpi, goldfish_sync_acpi_match); + +static struct platform_driver goldfish_sync = { + .probe = goldfish_sync_probe, + .remove = goldfish_sync_remove, + .driver = { + .name = "goldfish_sync", + .of_match_table = goldfish_sync_of_match, + .acpi_match_table = ACPI_PTR(goldfish_sync_acpi_match), + } +}; + +module_platform_driver(goldfish_sync); + +MODULE_AUTHOR("Google, Inc."); +MODULE_DESCRIPTION("Android QEMU Sync Driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0"); + +/* This function is only to run a basic test of sync framework. + * It creates a timeline and fence object whose signal point is at 1. + * The timeline is incremented, and we use the sync framework's + * sync_fence_wait on that fence object. If everything works out, + * we should not hang in the wait and return immediately. + * There is no way to explicitly run this test yet, but it + * can be used by inserting it at the end of goldfish_sync_probe. + */ +void test_kernel_sync(void) +{ + struct goldfish_sync_timeline_obj *test_timeline; + int test_fence_fd; + + DTRACE(); + + DPRINT("test sw_sync"); + + test_timeline = goldfish_sync_timeline_create(); + DPRINT("sw_sync_timeline_create -> 0x%p", test_timeline); + + test_fence_fd = goldfish_sync_fence_create(test_timeline, 1); + DPRINT("sync_fence_create -> %d", test_fence_fd); + + DPRINT("incrementing test timeline"); + goldfish_sync_timeline_inc(test_timeline, 1); + + DPRINT("test waiting (should NOT hang)"); + sync_fence_wait( + sync_fence_fdget(test_fence_fd), -1); + + DPRINT("test waiting (afterward)"); +} From e96f35d6073717452d9bc484dad0c3cfb61f5c46 Mon Sep 17 00:00:00 2001 From: Yurii Zubrytskyi Date: Wed, 4 May 2016 13:05:38 -0700 Subject: [PATCH 596/607] ANDROID: goldfish_pipe: bugfixes and performance improvements. 
Combine following patches from android-goldfish-3.18 branch: c0f015a [pipe] Fix the pipe driver for x64 platform + correct pages count 48e6bf5 [pipe] Use get_use_pages_fast() which is possibly faster fb20f13 [goldfish] More pages in goldfish pipe f180e6d goldfish_pipe: Return from read_write on signal and EIO 3dec3b7 [pipe] Fix a minor leak in setup_access_params_addr() Change-Id: I1041fd65d7faaec123e6cedd3dbbc5a2fbb86c4d --- drivers/platform/goldfish/goldfish_pipe.c | 123 ++++++++++++++-------- 1 file changed, 77 insertions(+), 46 deletions(-) diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c index 3215a33cf4fe..55929ed58ea3 100644 --- a/drivers/platform/goldfish/goldfish_pipe.c +++ b/drivers/platform/goldfish/goldfish_pipe.c @@ -109,6 +109,16 @@ #define PIPE_WAKE_READ (1 << 1) /* pipe can now be read from */ #define PIPE_WAKE_WRITE (1 << 2) /* pipe can now be written to */ +#define MAX_PAGES_TO_GRAB 32 + +#define DEBUG 0 + +#if DEBUG +#define DPRINT(...) { printk(KERN_ERR __VA_ARGS__); } +#else +#define DPRINT(...) +#endif + struct access_params { unsigned long channel; u32 size; @@ -231,8 +241,10 @@ static int setup_access_params_addr(struct platform_device *pdev, if (valid_batchbuffer_addr(dev, aps)) { dev->aps = aps; return 0; - } else + } else { + devm_kfree(&pdev->dev, aps); return -1; + } } /* A value that will not be set by qemu emulator */ @@ -269,6 +281,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, struct goldfish_pipe *pipe = filp->private_data; struct goldfish_pipe_dev *dev = pipe->dev; unsigned long address, address_end; + struct page* pages[MAX_PAGES_TO_GRAB] = {}; int count = 0, ret = -EINVAL; /* If the emulator already closed the pipe, no need to go further */ @@ -292,45 +305,59 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, address_end = address + bufflen; while (address < address_end) { - unsigned long page_end = (address & PAGE_MASK) + PAGE_SIZE; - unsigned long next = page_end < address_end ? page_end - : address_end; - unsigned long avail = next - address; - int status, wakeBit; - - struct page *page; - - /* Either vaddr or paddr depending on the device version */ - unsigned long xaddr; + unsigned long page_end = (address & PAGE_MASK) + PAGE_SIZE; + unsigned long next, avail; + int status, wakeBit, page_i, num_contiguous_pages; + long first_page, last_page, requested_pages; + unsigned long xaddr, xaddr_prev, xaddr_i; /* - * We grab the pages on a page-by-page basis in case user - * space gives us a potentially huge buffer but the read only - * returns a small amount, then there's no need to pin that - * much memory to the process. + * Attempt to grab multiple physically contiguous pages. */ - down_read(¤t->mm->mmap_sem); - ret = get_user_pages(current, current->mm, address, 1, - !is_write, 0, &page, NULL); - up_read(¤t->mm->mmap_sem); - if (ret < 0) - return ret; - - if (dev->version) { - /* Device version 1 or newer (qemu-android) expects the - * physical address. */ - xaddr = page_to_phys(page) | (address & ~PAGE_MASK); - } else { - /* Device version 0 (classic emulator) expects the - * virtual address. 
*/ - xaddr = address; + first_page = address & PAGE_MASK; + last_page = (address_end - 1) & PAGE_MASK; + requested_pages = ((last_page - first_page) >> PAGE_SHIFT) + 1; + if (requested_pages > MAX_PAGES_TO_GRAB) { + requested_pages = MAX_PAGES_TO_GRAB; } + ret = get_user_pages_fast(first_page, requested_pages, + !is_write, pages); + + DPRINT("%s: requested pages: %d %d\n", __FUNCTION__, ret, requested_pages); + if (ret == 0) { + DPRINT("%s: error: (requested pages == 0) (wanted %d)\n", + __FUNCTION__, requested_pages); + return ret; + } + if (ret < 0) { + DPRINT("%s: (requested pages < 0) %d \n", + __FUNCTION__, requested_pages); + return ret; + } + + xaddr = page_to_phys(pages[0]) | (address & ~PAGE_MASK); + xaddr_prev = xaddr; + num_contiguous_pages = ret == 0 ? 0 : 1; + for (page_i = 1; page_i < ret; page_i++) { + xaddr_i = page_to_phys(pages[page_i]) | (address & ~PAGE_MASK); + if (xaddr_i == xaddr_prev + PAGE_SIZE) { + page_end += PAGE_SIZE; + xaddr_prev = xaddr_i; + num_contiguous_pages++; + } else { + DPRINT("%s: discontinuous page boundary: %d pages instead\n", + __FUNCTION__, page_i); + break; + } + } + next = page_end < address_end ? page_end : address_end; + avail = next - address; /* Now, try to transfer the bytes in the current page */ spin_lock_irqsave(&dev->lock, irq_flags); if (access_with_param(dev, - is_write ? CMD_WRITE_BUFFER : CMD_READ_BUFFER, - xaddr, avail, pipe, &status)) { + is_write ? CMD_WRITE_BUFFER : CMD_READ_BUFFER, + xaddr, avail, pipe, &status)) { gf_write_ptr(pipe, dev->base + PIPE_REG_CHANNEL, dev->base + PIPE_REG_CHANNEL_HIGH); writel(avail, dev->base + PIPE_REG_SIZE); @@ -343,9 +370,13 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, } spin_unlock_irqrestore(&dev->lock, irq_flags); - if (status > 0 && !is_write) - set_page_dirty(page); - put_page(page); + for (page_i = 0; page_i < ret; page_i++) { + if (status > 0 && !is_write && + page_i < num_contiguous_pages) { + set_page_dirty(pages[page_i]); + } + put_page(pages[page_i]); + } if (status > 0) { /* Correct transfer */ count += status; @@ -367,7 +398,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, */ if (status != PIPE_ERROR_AGAIN) pr_info_ratelimited("goldfish_pipe: backend returned error %d on %s\n", - status, is_write ? "write" : "read"); + status, is_write ? "write" : "read"); ret = 0; break; } @@ -377,7 +408,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, * non-blocking mode, just return the error code. */ if (status != PIPE_ERROR_AGAIN || - (filp->f_flags & O_NONBLOCK) != 0) { + (filp->f_flags & O_NONBLOCK) != 0) { ret = goldfish_pipe_error_convert(status); break; } @@ -391,7 +422,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, /* Tell the emulator we're going to wait for a wake event */ goldfish_cmd(pipe, - is_write ? CMD_WAKE_ON_WRITE : CMD_WAKE_ON_READ); + is_write ? 
CMD_WAKE_ON_WRITE : CMD_WAKE_ON_READ); /* Unlock the pipe, then wait for the wake signal */ mutex_unlock(&pipe->lock); @@ -399,15 +430,11 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, while (test_bit(wakeBit, &pipe->flags)) { if (wait_event_interruptible( pipe->wake_queue, - !test_bit(wakeBit, &pipe->flags))) { - ret = -ERESTARTSYS; - break; - } + !test_bit(wakeBit, &pipe->flags))) + return -ERESTARTSYS; - if (test_bit(BIT_CLOSED_ON_HOST, &pipe->flags)) { - ret = -EIO; - break; - } + if (test_bit(BIT_CLOSED_ON_HOST, &pipe->flags)) + return -EIO; } /* Try to re-acquire the lock */ @@ -543,6 +570,8 @@ static int goldfish_pipe_open(struct inode *inode, struct file *file) pipe->dev = dev; mutex_init(&pipe->lock); + DPRINT("%s: call. pipe_dev pipe_dev=0x%lx new_pipe_addr=0x%lx file=0x%lx\n", __FUNCTION__, pipe_dev, pipe, file); + // spin lock init, write head of list, i guess init_waitqueue_head(&pipe->wake_queue); /* @@ -565,6 +594,7 @@ static int goldfish_pipe_release(struct inode *inode, struct file *filp) { struct goldfish_pipe *pipe = filp->private_data; + DPRINT("%s: call. pipe=0x%lx file=0x%lx\n", __FUNCTION__, pipe, filp); /* The guest is closing the channel, so tell the emulator right now */ goldfish_cmd(pipe, CMD_CLOSE); kfree(pipe); @@ -589,6 +619,7 @@ static struct miscdevice goldfish_pipe_device = { static int goldfish_pipe_probe(struct platform_device *pdev) { + DPRINT("%s: call. platform_device=0x%lx\n", __FUNCTION__, pdev); int err; struct resource *r; struct goldfish_pipe_dev *dev = pipe_dev; From bc43565e1ac5ba3f204886a2275726bb4c3d44e6 Mon Sep 17 00:00:00 2001 From: Yurii Zubrytskyi Date: Fri, 29 Jul 2016 10:51:46 -0700 Subject: [PATCH 597/607] ANDROID: goldfish_pipe: An implementation of more parallel pipe This is a driver code for a redesigned android pipe. Currently it works for x86 and x64 emulators with the following performance results: ADB push to /dev/null, Ubuntu, 400 MB file, times are for 1/10/100 parallel adb commands x86 adb push: (4.4s / 11.5s / 2m10s) -> (2.8s / 6s / 51s) x64 adb push: (7s / 15s / (too long, 6m+) -> (2.7s / 6.2s / 52s) ADB pull and push to /data/ have the same %% of speedup More importantly, I don't see any signs of slowdowns when run in parallel with Antutu benchmark, so it is definitely making much better job at multithreading. The code features dynamic host detection: old emulator gets the previous version of the pipe driver code. 
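The dynamic host detection mentioned above comes down to a version handshake over the PIPE_REG_VERSION register. Condensed from the probe code in the v2 diff further below (a sketch only, without the surrounding error handling), the dispatch looks like:

	/* The driver writes its own version first, then reads the device
	 * version back. A legacy emulator ignores the write and reports a
	 * version below PIPE_CURRENT_DEVICE_VERSION, so the old code path
	 * is kept for it.
	 */
	writel((u32)PIPE_DRIVER_VERSION, dev->base + PIPE_REG_VERSION);
	dev->version = readl(dev->base + PIPE_REG_VERSION);
	if (dev->version < PIPE_CURRENT_DEVICE_VERSION)
		err = goldfish_pipe_device_init_v1(pdev);	/* classic emulator */
	else
		err = goldfish_pipe_device_init_v2(pdev);	/* parallel pipe */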
Combine follow patch from android-goldfish-3.10 b543285 [pipe] Increase the default pipe buffers size, make it configurable Signed-off-by: "Yurii Zubrytskyi" Change-Id: I140d506204cab6e78dd503e5a43abc8886e4ffff --- drivers/platform/goldfish/Makefile | 2 +- drivers/platform/goldfish/goldfish_pipe.c | 168 +--- drivers/platform/goldfish/goldfish_pipe.h | 91 ++ drivers/platform/goldfish/goldfish_pipe_v2.c | 888 +++++++++++++++++++ 4 files changed, 1005 insertions(+), 144 deletions(-) create mode 100644 drivers/platform/goldfish/goldfish_pipe.h create mode 100644 drivers/platform/goldfish/goldfish_pipe_v2.c diff --git a/drivers/platform/goldfish/Makefile b/drivers/platform/goldfish/Makefile index d3487125838c..e53ae2fc717b 100644 --- a/drivers/platform/goldfish/Makefile +++ b/drivers/platform/goldfish/Makefile @@ -2,4 +2,4 @@ # Makefile for Goldfish platform specific drivers # obj-$(CONFIG_GOLDFISH_BUS) += pdev_bus.o -obj-$(CONFIG_GOLDFISH_PIPE) += goldfish_pipe.o +obj-$(CONFIG_GOLDFISH_PIPE) += goldfish_pipe.o goldfish_pipe_v2.o diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c index 55929ed58ea3..cf7ce97e7346 100644 --- a/drivers/platform/goldfish/goldfish_pipe.c +++ b/drivers/platform/goldfish/goldfish_pipe.c @@ -15,51 +15,11 @@ * */ -/* This source file contains the implementation of a special device driver - * that intends to provide a *very* fast communication channel between the - * guest system and the QEMU emulator. - * - * Usage from the guest is simply the following (error handling simplified): - * - * int fd = open("/dev/qemu_pipe",O_RDWR); - * .... write() or read() through the pipe. - * - * This driver doesn't deal with the exact protocol used during the session. - * It is intended to be as simple as something like: - * - * // do this _just_ after opening the fd to connect to a specific - * // emulator service. - * const char* msg = ""; - * if (write(fd, msg, strlen(msg)+1) < 0) { - * ... could not connect to service - * close(fd); - * } - * - * // after this, simply read() and write() to communicate with the - * // service. Exact protocol details left as an exercise to the reader. - * - * This driver is very fast because it doesn't copy any data through - * intermediate buffers, since the emulator is capable of translating - * guest user addresses into host ones. - * - * Note that we must however ensure that each user page involved in the - * exchange is properly mapped during a transfer. +/* This source file contains the implementation of the legacy version of + * a goldfish pipe device driver. See goldfish_pipe_v2.c for the current + * version. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "goldfish_pipe.h" /* * IMPORTANT: The following constants must match the ones used and defined @@ -119,6 +79,14 @@ #define DPRINT(...) #endif +/* This data type models a given pipe instance */ +struct goldfish_pipe { + struct goldfish_pipe_dev *dev; + struct mutex lock; + unsigned long flags; + wait_queue_head_t wake_queue; +}; + struct access_params { unsigned long channel; u32 size; @@ -129,29 +97,6 @@ struct access_params { u32 flags; }; -/* The global driver data. Holds a reference to the i/o page used to - * communicate with the emulator, and a wake queue for blocked tasks - * waiting to be awoken. 
- */ -struct goldfish_pipe_dev { - spinlock_t lock; - unsigned char __iomem *base; - struct access_params *aps; - int irq; - u32 version; -}; - -static struct goldfish_pipe_dev pipe_dev[1]; - -/* This data type models a given pipe instance */ -struct goldfish_pipe { - struct goldfish_pipe_dev *dev; - struct mutex lock; - unsigned long flags; - wait_queue_head_t wake_queue; -}; - - /* Bit flags for the 'flags' field */ enum { BIT_CLOSED_ON_HOST = 0, /* pipe closed by host */ @@ -323,7 +268,8 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, ret = get_user_pages_fast(first_page, requested_pages, !is_write, pages); - DPRINT("%s: requested pages: %d %d\n", __FUNCTION__, ret, requested_pages); + DPRINT("%s: requested pages: %d %d %p\n", __FUNCTION__, + ret, requested_pages, first_page); if (ret == 0) { DPRINT("%s: error: (requested pages == 0) (wanted %d)\n", __FUNCTION__, requested_pages); @@ -611,97 +557,33 @@ static const struct file_operations goldfish_pipe_fops = { .release = goldfish_pipe_release, }; -static struct miscdevice goldfish_pipe_device = { +static struct miscdevice goldfish_pipe_dev = { .minor = MISC_DYNAMIC_MINOR, .name = "goldfish_pipe", .fops = &goldfish_pipe_fops, }; -static int goldfish_pipe_probe(struct platform_device *pdev) +int goldfish_pipe_device_init_v1(struct platform_device *pdev) { - DPRINT("%s: call. platform_device=0x%lx\n", __FUNCTION__, pdev); - int err; - struct resource *r; struct goldfish_pipe_dev *dev = pipe_dev; - - /* not thread safe, but this should not happen */ - WARN_ON(dev->base != NULL); - - spin_lock_init(&dev->lock); - - r = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (r == NULL || resource_size(r) < PAGE_SIZE) { - dev_err(&pdev->dev, "can't allocate i/o page\n"); - return -EINVAL; - } - dev->base = devm_ioremap(&pdev->dev, r->start, PAGE_SIZE); - if (dev->base == NULL) { - dev_err(&pdev->dev, "ioremap failed\n"); - return -EINVAL; - } - - r = platform_get_resource(pdev, IORESOURCE_IRQ, 0); - if (r == NULL) { - err = -EINVAL; - goto error; - } - dev->irq = r->start; - - err = devm_request_irq(&pdev->dev, dev->irq, goldfish_pipe_interrupt, + int err = devm_request_irq(&pdev->dev, dev->irq, goldfish_pipe_interrupt, IRQF_SHARED, "goldfish_pipe", dev); if (err) { - dev_err(&pdev->dev, "unable to allocate IRQ\n"); - goto error; + dev_err(&pdev->dev, "unable to allocate IRQ for v1\n"); + return err; } - err = misc_register(&goldfish_pipe_device); + err = misc_register(&goldfish_pipe_dev); if (err) { - dev_err(&pdev->dev, "unable to register device\n"); - goto error; + dev_err(&pdev->dev, "unable to register v1 device\n"); + return err; } + setup_access_params_addr(pdev, dev); - - /* Although the pipe device in the classic Android emulator does not - * recognize the 'version' register, it won't treat this as an error - * either and will simply return 0, which is fine. 
*/ - dev->version = readl(dev->base + PIPE_REG_VERSION); return 0; - -error: - dev->base = NULL; - return err; } -static int goldfish_pipe_remove(struct platform_device *pdev) +void goldfish_pipe_device_deinit_v1(struct platform_device *pdev) { - struct goldfish_pipe_dev *dev = pipe_dev; - misc_deregister(&goldfish_pipe_device); - dev->base = NULL; - return 0; + misc_deregister(&goldfish_pipe_dev); } - -static const struct acpi_device_id goldfish_pipe_acpi_match[] = { - { "GFSH0003", 0 }, - { }, -}; -MODULE_DEVICE_TABLE(acpi, goldfish_pipe_acpi_match); - -static const struct of_device_id goldfish_pipe_of_match[] = { - { .compatible = "generic,android-pipe", }, - {}, -}; -MODULE_DEVICE_TABLE(of, goldfish_pipe_of_match); - -static struct platform_driver goldfish_pipe = { - .probe = goldfish_pipe_probe, - .remove = goldfish_pipe_remove, - .driver = { - .name = "goldfish_pipe", - .of_match_table = goldfish_pipe_of_match, - .acpi_match_table = ACPI_PTR(goldfish_pipe_acpi_match), - } -}; - -module_platform_driver(goldfish_pipe); -MODULE_AUTHOR("David Turner "); -MODULE_LICENSE("GPL"); diff --git a/drivers/platform/goldfish/goldfish_pipe.h b/drivers/platform/goldfish/goldfish_pipe.h new file mode 100644 index 000000000000..9b75a51dba24 --- /dev/null +++ b/drivers/platform/goldfish/goldfish_pipe.h @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#ifndef GOLDFISH_PIPE_H +#define GOLDFISH_PIPE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Initialize the legacy version of the pipe device driver */ +int goldfish_pipe_device_init_v1(struct platform_device *pdev); + +/* Deinitialize the legacy version of the pipe device driver */ +void goldfish_pipe_device_deinit_v1(struct platform_device *pdev); + +/* Forward declarations for the device struct */ +struct goldfish_pipe; +struct goldfish_pipe_device_buffers; + +/* The global driver data. Holds a reference to the i/o page used to + * communicate with the emulator, and a wake queue for blocked tasks + * waiting to be awoken. + */ +struct goldfish_pipe_dev { + /* + * Global device spinlock. Protects the following members: + * - pipes, pipes_capacity + * - [*pipes, *pipes + pipes_capacity) - array data + * - first_signalled_pipe, + * goldfish_pipe::prev_signalled, + * goldfish_pipe::next_signalled, + * goldfish_pipe::signalled_flags - all singnalled-related fields, + * in all allocated pipes + * - open_command_params - PIPE_CMD_OPEN-related buffers + * + * It looks like a lot of different fields, but the trick is that the only + * operation that happens often is the signalled pipes array manipulation. + * That's why it's OK for now to keep the rest of the fields under the same + * lock. If we notice too much contention because of PIPE_CMD_OPEN, + * then we should add a separate lock there. 
+ */ + spinlock_t lock; + + /* + * Array of the pipes of |pipes_capacity| elements, + * indexed by goldfish_pipe::id + */ + struct goldfish_pipe **pipes; + u32 pipes_capacity; + + /* Pointers to the buffers host uses for interaction with this driver */ + struct goldfish_pipe_dev_buffers *buffers; + + /* Head of a doubly linked list of signalled pipes */ + struct goldfish_pipe *first_signalled_pipe; + + /* Some device-specific data */ + int irq; + int version; + unsigned char __iomem *base; + + /* v1-specific access parameters */ + struct access_params *aps; +}; + +extern struct goldfish_pipe_dev pipe_dev[1]; + +#endif /* GOLDFISH_PIPE_H */ diff --git a/drivers/platform/goldfish/goldfish_pipe_v2.c b/drivers/platform/goldfish/goldfish_pipe_v2.c new file mode 100644 index 000000000000..bdb039bc7dde --- /dev/null +++ b/drivers/platform/goldfish/goldfish_pipe_v2.c @@ -0,0 +1,888 @@ +/* + * Copyright (C) 2012 Intel, Inc. + * Copyright (C) 2013 Intel, Inc. + * Copyright (C) 2014 Linaro Limited + * Copyright (C) 2011-2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +/* This source file contains the implementation of a special device driver + * that intends to provide a *very* fast communication channel between the + * guest system and the QEMU emulator. + * + * Usage from the guest is simply the following (error handling simplified): + * + * int fd = open("/dev/qemu_pipe",O_RDWR); + * .... write() or read() through the pipe. + * + * This driver doesn't deal with the exact protocol used during the session. + * It is intended to be as simple as something like: + * + * // do this _just_ after opening the fd to connect to a specific + * // emulator service. + * const char* msg = ""; + * if (write(fd, msg, strlen(msg)+1) < 0) { + * ... could not connect to service + * close(fd); + * } + * + * // after this, simply read() and write() to communicate with the + * // service. Exact protocol details left as an exercise to the reader. + * + * This driver is very fast because it doesn't copy any data through + * intermediate buffers, since the emulator is capable of translating + * guest user addresses into host ones. + * + * Note that we must however ensure that each user page involved in the + * exchange is properly mapped during a transfer. + */ + +#include "goldfish_pipe.h" + + +/* + * Update this when something changes in the driver's behavior so the host + * can benefit from knowing it + */ +enum { + PIPE_DRIVER_VERSION = 2, + PIPE_CURRENT_DEVICE_VERSION = 2 +}; + +/* + * IMPORTANT: The following constants must match the ones used and defined + * in external/qemu/hw/goldfish_pipe.c in the Android source tree. 
+ */ + +/* List of bitflags returned in status of CMD_POLL command */ +enum PipePollFlags { + PIPE_POLL_IN = 1 << 0, + PIPE_POLL_OUT = 1 << 1, + PIPE_POLL_HUP = 1 << 2 +}; + +/* Possible status values used to signal errors - see goldfish_pipe_error_convert */ +enum PipeErrors { + PIPE_ERROR_INVAL = -1, + PIPE_ERROR_AGAIN = -2, + PIPE_ERROR_NOMEM = -3, + PIPE_ERROR_IO = -4 +}; + +/* Bit-flags used to signal events from the emulator */ +enum PipeWakeFlags { + PIPE_WAKE_CLOSED = 1 << 0, /* emulator closed pipe */ + PIPE_WAKE_READ = 1 << 1, /* pipe can now be read from */ + PIPE_WAKE_WRITE = 1 << 2 /* pipe can now be written to */ +}; + +/* Bit flags for the 'flags' field */ +enum PipeFlagsBits { + BIT_CLOSED_ON_HOST = 0, /* pipe closed by host */ + BIT_WAKE_ON_WRITE = 1, /* want to be woken on writes */ + BIT_WAKE_ON_READ = 2, /* want to be woken on reads */ +}; + +enum PipeRegs { + PIPE_REG_CMD = 0, + + PIPE_REG_SIGNAL_BUFFER_HIGH = 4, + PIPE_REG_SIGNAL_BUFFER = 8, + PIPE_REG_SIGNAL_BUFFER_COUNT = 12, + + PIPE_REG_OPEN_BUFFER_HIGH = 20, + PIPE_REG_OPEN_BUFFER = 24, + + PIPE_REG_VERSION = 36, + + PIPE_REG_GET_SIGNALLED = 48, +}; + +enum PipeCmdCode { + PIPE_CMD_OPEN = 1, /* to be used by the pipe device itself */ + PIPE_CMD_CLOSE, + PIPE_CMD_POLL, + PIPE_CMD_WRITE, + PIPE_CMD_WAKE_ON_WRITE, + PIPE_CMD_READ, + PIPE_CMD_WAKE_ON_READ, + + /* + * TODO(zyy): implement a deferred read/write execution to allow parallel + * processing of pipe operations on the host. + */ + PIPE_CMD_WAKE_ON_DONE_IO, +}; + +enum { + MAX_BUFFERS_PER_COMMAND = 336, + MAX_SIGNALLED_PIPES = 64, + INITIAL_PIPES_CAPACITY = 64 +}; + +struct goldfish_pipe_dev; +struct goldfish_pipe; +struct goldfish_pipe_command; + +/* A per-pipe command structure, shared with the host */ +struct goldfish_pipe_command { + s32 cmd; /* PipeCmdCode, guest -> host */ + s32 id; /* pipe id, guest -> host */ + s32 status; /* command execution status, host -> guest */ + s32 reserved; /* to pad to 64-bit boundary */ + union { + /* Parameters for PIPE_CMD_{READ,WRITE} */ + struct { + u32 buffers_count; /* number of buffers, guest -> host */ + s32 consumed_size; /* number of consumed bytes, host -> guest */ + u64 ptrs[MAX_BUFFERS_PER_COMMAND]; /* buffer pointers, guest -> host */ + u32 sizes[MAX_BUFFERS_PER_COMMAND]; /* buffer sizes, guest -> host */ + } rw_params; + }; +}; + +/* A single signalled pipe information */ +struct signalled_pipe_buffer { + u32 id; + u32 flags; +}; + +/* Parameters for the PIPE_CMD_OPEN command */ +struct open_command_param { + u64 command_buffer_ptr; + u32 rw_params_max_count; +}; + +/* Device-level set of buffers shared with the host */ +struct goldfish_pipe_dev_buffers { + struct open_command_param open_command_params; + struct signalled_pipe_buffer signalled_pipe_buffers[MAX_SIGNALLED_PIPES]; +}; + +/* This data type models a given pipe instance */ +struct goldfish_pipe { + u32 id; /* pipe ID - index into goldfish_pipe_dev::pipes array */ + unsigned long flags; /* The wake flags pipe is waiting for + * Note: not protected with any lock, uses atomic operations + * and barriers to make it thread-safe. + */ + unsigned long signalled_flags; /* wake flags host have signalled, + * - protected by goldfish_pipe_dev::lock */ + + struct goldfish_pipe_command *command_buffer; /* A pointer to command buffer */ + + /* doubly linked list of signalled pipes, protected by goldfish_pipe_dev::lock */ + struct goldfish_pipe *prev_signalled; + struct goldfish_pipe *next_signalled; + + /* + * A pipe's own lock. 
Protects the following: + * - *command_buffer - makes sure a command can safely write its parameters + * to the host and read the results back. + */ + struct mutex lock; + + wait_queue_head_t wake_queue; /* A wake queue for sleeping until host signals an event */ + struct goldfish_pipe_dev *dev; /* Pointer to the parent goldfish_pipe_dev instance */ +}; + +struct goldfish_pipe_dev pipe_dev[1] = {}; + +static int goldfish_cmd_locked(struct goldfish_pipe *pipe, enum PipeCmdCode cmd) +{ + pipe->command_buffer->cmd = cmd; + pipe->command_buffer->status = PIPE_ERROR_INVAL; /* failure by default */ + writel(pipe->id, pipe->dev->base + PIPE_REG_CMD); + return pipe->command_buffer->status; +} + +static int goldfish_cmd(struct goldfish_pipe *pipe, enum PipeCmdCode cmd) +{ + int status; + if (mutex_lock_interruptible(&pipe->lock)) + return PIPE_ERROR_IO; + status = goldfish_cmd_locked(pipe, cmd); + mutex_unlock(&pipe->lock); + return status; +} + +/* + * This function converts an error code returned by the emulator through + * the PIPE_REG_STATUS i/o register into a valid negative errno value. + */ +static int goldfish_pipe_error_convert(int status) +{ + switch (status) { + case PIPE_ERROR_AGAIN: + return -EAGAIN; + case PIPE_ERROR_NOMEM: + return -ENOMEM; + case PIPE_ERROR_IO: + return -EIO; + default: + return -EINVAL; + } +} + +static int pin_user_pages(unsigned long first_page, unsigned long last_page, + unsigned last_page_size, int is_write, + struct page *pages[MAX_BUFFERS_PER_COMMAND], unsigned *iter_last_page_size) +{ + int ret; + int requested_pages = ((last_page - first_page) >> PAGE_SHIFT) + 1; + if (requested_pages > MAX_BUFFERS_PER_COMMAND) { + requested_pages = MAX_BUFFERS_PER_COMMAND; + *iter_last_page_size = PAGE_SIZE; + } else { + *iter_last_page_size = last_page_size; + } + + ret = get_user_pages_fast( + first_page, requested_pages, !is_write, pages); + if (ret <= 0) + return -EFAULT; + if (ret < requested_pages) + *iter_last_page_size = PAGE_SIZE; + return ret; + +} + +static void release_user_pages(struct page **pages, int pages_count, + int is_write, s32 consumed_size) +{ + int i; + for (i = 0; i < pages_count; i++) { + if (!is_write && consumed_size > 0) { + set_page_dirty(pages[i]); + } + put_page(pages[i]); + } +} + +/* Populate the call parameters, merging adjacent pages together */ +static void populate_rw_params( + struct page **pages, int pages_count, + unsigned long address, unsigned long address_end, + unsigned long first_page, unsigned long last_page, + unsigned iter_last_page_size, int is_write, + struct goldfish_pipe_command *command) +{ + /* + * Process the first page separately - it's the only page that + * needs special handling for its start address. + */ + unsigned long xaddr = page_to_phys(pages[0]); + unsigned long xaddr_prev = xaddr; + int buffer_idx = 0; + int i = 1; + int size_on_page = first_page == last_page + ? (int)(address_end - address) + : (PAGE_SIZE - (address & ~PAGE_MASK)); + command->rw_params.ptrs[0] = (u64)(xaddr | (address & ~PAGE_MASK)); + command->rw_params.sizes[0] = size_on_page; + for (; i < pages_count; ++i) { + xaddr = page_to_phys(pages[i]); + size_on_page = (i == pages_count - 1) ? 
iter_last_page_size : PAGE_SIZE; + if (xaddr == xaddr_prev + PAGE_SIZE) { + command->rw_params.sizes[buffer_idx] += size_on_page; + } else { + ++buffer_idx; + command->rw_params.ptrs[buffer_idx] = (u64)xaddr; + command->rw_params.sizes[buffer_idx] = size_on_page; + } + xaddr_prev = xaddr; + } + command->rw_params.buffers_count = buffer_idx + 1; +} + +static int transfer_max_buffers(struct goldfish_pipe* pipe, + unsigned long address, unsigned long address_end, int is_write, + unsigned long last_page, unsigned int last_page_size, + s32* consumed_size, int* status) +{ + struct page *pages[MAX_BUFFERS_PER_COMMAND]; + unsigned long first_page = address & PAGE_MASK; + unsigned int iter_last_page_size; + int pages_count = pin_user_pages(first_page, last_page, + last_page_size, is_write, + pages, &iter_last_page_size); + if (pages_count < 0) + return pages_count; + + /* Serialize access to the pipe command buffers */ + if (mutex_lock_interruptible(&pipe->lock)) + return -ERESTARTSYS; + + populate_rw_params(pages, pages_count, address, address_end, + first_page, last_page, iter_last_page_size, is_write, + pipe->command_buffer); + + /* Transfer the data */ + *status = goldfish_cmd_locked(pipe, + is_write ? PIPE_CMD_WRITE : PIPE_CMD_READ); + + *consumed_size = pipe->command_buffer->rw_params.consumed_size; + + mutex_unlock(&pipe->lock); + + release_user_pages(pages, pages_count, is_write, *consumed_size); + + return 0; +} + +static int wait_for_host_signal(struct goldfish_pipe *pipe, int is_write) +{ + u32 wakeBit = is_write ? BIT_WAKE_ON_WRITE : BIT_WAKE_ON_READ; + set_bit(wakeBit, &pipe->flags); + + /* Tell the emulator we're going to wait for a wake event */ + (void)goldfish_cmd(pipe, + is_write ? PIPE_CMD_WAKE_ON_WRITE : PIPE_CMD_WAKE_ON_READ); + + while (test_bit(wakeBit, &pipe->flags)) { + if (wait_event_interruptible( + pipe->wake_queue, + !test_bit(wakeBit, &pipe->flags))) + return -ERESTARTSYS; + + if (test_bit(BIT_CLOSED_ON_HOST, &pipe->flags)) + return -EIO; + } + + return 0; +} + +static ssize_t goldfish_pipe_read_write(struct file *filp, + char __user *buffer, size_t bufflen, int is_write) +{ + struct goldfish_pipe *pipe = filp->private_data; + int count = 0, ret = -EINVAL; + unsigned long address, address_end, last_page; + unsigned int last_page_size; + + /* If the emulator already closed the pipe, no need to go further */ + if (unlikely(test_bit(BIT_CLOSED_ON_HOST, &pipe->flags))) + return -EIO; + /* Null reads or writes succeeds */ + if (unlikely(bufflen == 0)) + return 0; + /* Check the buffer range for access */ + if (unlikely(!access_ok(is_write ? VERIFY_WRITE : VERIFY_READ, + buffer, bufflen))) + return -EFAULT; + + address = (unsigned long)buffer; + address_end = address + bufflen; + last_page = (address_end - 1) & PAGE_MASK; + last_page_size = ((address_end - 1) & ~PAGE_MASK) + 1; + + while (address < address_end) { + s32 consumed_size; + int status; + ret = transfer_max_buffers(pipe, address, address_end, is_write, + last_page, last_page_size, &consumed_size, &status); + if (ret < 0) + break; + + if (consumed_size > 0) { + /* No matter what's the status, we've transfered something */ + count += consumed_size; + address += consumed_size; + } + if (status > 0) + continue; + if (status == 0) { + /* EOF */ + ret = 0; + break; + } + if (count > 0) { + /* + * An error occured, but we already transfered + * something on one of the previous iterations. + * Just return what we already copied and log this + * err. 
+ */ + if (status != PIPE_ERROR_AGAIN) + pr_info_ratelimited("goldfish_pipe: backend error %d on %s\n", + status, is_write ? "write" : "read"); + break; + } + + /* + * If the error is not PIPE_ERROR_AGAIN, or if we are in + * non-blocking mode, just return the error code. + */ + if (status != PIPE_ERROR_AGAIN || (filp->f_flags & O_NONBLOCK) != 0) { + ret = goldfish_pipe_error_convert(status); + break; + } + + status = wait_for_host_signal(pipe, is_write); + if (status < 0) + return status; + } + + if (count > 0) + return count; + return ret; +} + +static ssize_t goldfish_pipe_read(struct file *filp, char __user *buffer, + size_t bufflen, loff_t *ppos) +{ + return goldfish_pipe_read_write(filp, buffer, bufflen, /* is_write */ 0); +} + +static ssize_t goldfish_pipe_write(struct file *filp, + const char __user *buffer, size_t bufflen, + loff_t *ppos) +{ + return goldfish_pipe_read_write(filp, + /* cast away the const */(char __user *)buffer, bufflen, + /* is_write */ 1); +} + +static unsigned int goldfish_pipe_poll(struct file *filp, poll_table *wait) +{ + struct goldfish_pipe *pipe = filp->private_data; + unsigned int mask = 0; + int status; + + poll_wait(filp, &pipe->wake_queue, wait); + + status = goldfish_cmd(pipe, PIPE_CMD_POLL); + if (status < 0) { + return -ERESTARTSYS; + } + + if (status & PIPE_POLL_IN) + mask |= POLLIN | POLLRDNORM; + if (status & PIPE_POLL_OUT) + mask |= POLLOUT | POLLWRNORM; + if (status & PIPE_POLL_HUP) + mask |= POLLHUP; + if (test_bit(BIT_CLOSED_ON_HOST, &pipe->flags)) + mask |= POLLERR; + + return mask; +} + +static void signalled_pipes_add_locked(struct goldfish_pipe_dev *dev, + u32 id, u32 flags) +{ + struct goldfish_pipe *pipe; + + BUG_ON(id >= dev->pipes_capacity); + + pipe = dev->pipes[id]; + if (!pipe) + return; + pipe->signalled_flags |= flags; + + if (pipe->prev_signalled || pipe->next_signalled + || dev->first_signalled_pipe == pipe) + return; /* already in the list */ + pipe->next_signalled = dev->first_signalled_pipe; + if (dev->first_signalled_pipe) { + dev->first_signalled_pipe->prev_signalled = pipe; + } + dev->first_signalled_pipe = pipe; +} + +static void signalled_pipes_remove_locked(struct goldfish_pipe_dev *dev, + struct goldfish_pipe *pipe) { + if (pipe->prev_signalled) + pipe->prev_signalled->next_signalled = pipe->next_signalled; + if (pipe->next_signalled) + pipe->next_signalled->prev_signalled = pipe->prev_signalled; + if (pipe == dev->first_signalled_pipe) + dev->first_signalled_pipe = pipe->next_signalled; + pipe->prev_signalled = NULL; + pipe->next_signalled = NULL; +} + +static struct goldfish_pipe *signalled_pipes_pop_front(struct goldfish_pipe_dev *dev, + int *wakes) +{ + struct goldfish_pipe *pipe; + unsigned long flags; + spin_lock_irqsave(&dev->lock, flags); + + pipe = dev->first_signalled_pipe; + if (pipe) { + *wakes = pipe->signalled_flags; + pipe->signalled_flags = 0; + /* + * This is an optimized version of signalled_pipes_remove_locked() - + * we want to make it as fast as possible to wake the sleeping pipe + * operations faster + */ + dev->first_signalled_pipe = pipe->next_signalled; + if (dev->first_signalled_pipe) + dev->first_signalled_pipe->prev_signalled = NULL; + pipe->next_signalled = NULL; + } + + spin_unlock_irqrestore(&dev->lock, flags); + return pipe; +} + +static void goldfish_interrupt_task(unsigned long unused) +{ + struct goldfish_pipe_dev *dev = pipe_dev; + /* Iterate over the signalled pipes and wake them one by one */ + struct goldfish_pipe *pipe; + int wakes; + while ((pipe = 
signalled_pipes_pop_front(dev, &wakes)) != NULL) { + if (wakes & PIPE_WAKE_CLOSED) { + pipe->flags = 1 << BIT_CLOSED_ON_HOST; + } else { + if (wakes & PIPE_WAKE_READ) + clear_bit(BIT_WAKE_ON_READ, &pipe->flags); + if (wakes & PIPE_WAKE_WRITE) + clear_bit(BIT_WAKE_ON_WRITE, &pipe->flags); + } + /* + * wake_up_interruptible() implies a write barrier, so don't explicitly + * add another one here. + */ + wake_up_interruptible(&pipe->wake_queue); + } +} +DECLARE_TASKLET(goldfish_interrupt_tasklet, goldfish_interrupt_task, 0); + +/* + * The general idea of the interrupt handling: + * + * 1. device raises an interrupt if there's at least one signalled pipe + * 2. IRQ handler reads the signalled pipes and their count from the device + * 3. device writes them into a shared buffer and returns the count + * it only resets the IRQ if it has returned all signalled pipes, + * otherwise it leaves it raised, so IRQ handler will be called + * again for the next chunk + * 4. IRQ handler adds all returned pipes to the device's signalled pipes list + * 5. IRQ handler launches a tasklet to process the signalled pipes from the + * list in a separate context + */ +static irqreturn_t goldfish_pipe_interrupt(int irq, void *dev_id) +{ + u32 count; + u32 i; + unsigned long flags; + struct goldfish_pipe_dev *dev = dev_id; + if (dev != pipe_dev) + return IRQ_NONE; + + /* Request the signalled pipes from the device */ + spin_lock_irqsave(&dev->lock, flags); + + count = readl(dev->base + PIPE_REG_GET_SIGNALLED); + if (count == 0) { + spin_unlock_irqrestore(&dev->lock, flags); + return IRQ_NONE; + } + if (count > MAX_SIGNALLED_PIPES) + count = MAX_SIGNALLED_PIPES; + + for (i = 0; i < count; ++i) + signalled_pipes_add_locked(dev, + dev->buffers->signalled_pipe_buffers[i].id, + dev->buffers->signalled_pipe_buffers[i].flags); + + spin_unlock_irqrestore(&dev->lock, flags); + + tasklet_schedule(&goldfish_interrupt_tasklet); + return IRQ_HANDLED; +} + +static int get_free_pipe_id_locked(struct goldfish_pipe_dev *dev) +{ + int id; + for (id = 0; id < dev->pipes_capacity; ++id) + if (!dev->pipes[id]) + return id; + + { + /* Reallocate the array */ + u32 new_capacity = 2 * dev->pipes_capacity; + struct goldfish_pipe **pipes = + kcalloc(new_capacity, sizeof(*pipes), GFP_KERNEL); + if (!pipes) + return -ENOMEM; + memcpy(pipes, dev->pipes, sizeof(*pipes) * dev->pipes_capacity); + kfree(dev->pipes); + dev->pipes = pipes; + id = dev->pipes_capacity; + dev->pipes_capacity = new_capacity; + } + return id; +} + +/** + * goldfish_pipe_open - open a channel to the AVD + * @inode: inode of device + * @file: file struct of opener + * + * Create a new pipe link between the emulator and the use application. + * Each new request produces a new pipe. + * + * Note: we use the pipe ID as a mux. All goldfish emulations are 32bit + * right now so this is fine. A move to 64bit will need this addressing + */ +static int goldfish_pipe_open(struct inode *inode, struct file *file) +{ + struct goldfish_pipe_dev *dev = pipe_dev; + unsigned long flags; + int id; + int status; + + /* Allocate new pipe kernel object */ + struct goldfish_pipe *pipe = kzalloc(sizeof(*pipe), GFP_KERNEL); + if (pipe == NULL) + return -ENOMEM; + + pipe->dev = dev; + mutex_init(&pipe->lock); + init_waitqueue_head(&pipe->wake_queue); + + /* + * Command buffer needs to be allocated on its own page to make sure it is + * physically contiguous in host's address space. 
+ */ + pipe->command_buffer = + (struct goldfish_pipe_command*)__get_free_page(GFP_KERNEL); + if (!pipe->command_buffer) { + status = -ENOMEM; + goto err_pipe; + } + + spin_lock_irqsave(&dev->lock, flags); + + id = get_free_pipe_id_locked(dev); + if (id < 0) { + status = id; + goto err_id_locked; + } + + dev->pipes[id] = pipe; + pipe->id = id; + pipe->command_buffer->id = id; + + /* Now tell the emulator we're opening a new pipe. */ + dev->buffers->open_command_params.rw_params_max_count = + MAX_BUFFERS_PER_COMMAND; + dev->buffers->open_command_params.command_buffer_ptr = + (u64)(unsigned long)__pa(pipe->command_buffer); + status = goldfish_cmd_locked(pipe, PIPE_CMD_OPEN); + spin_unlock_irqrestore(&dev->lock, flags); + if (status < 0) + goto err_cmd; + /* All is done, save the pipe into the file's private data field */ + file->private_data = pipe; + return 0; + +err_cmd: + spin_lock_irqsave(&dev->lock, flags); + dev->pipes[id] = NULL; +err_id_locked: + spin_unlock_irqrestore(&dev->lock, flags); + free_page((unsigned long)pipe->command_buffer); +err_pipe: + kfree(pipe); + return status; +} + +static int goldfish_pipe_release(struct inode *inode, struct file *filp) +{ + unsigned long flags; + struct goldfish_pipe *pipe = filp->private_data; + struct goldfish_pipe_dev *dev = pipe->dev; + + /* The guest is closing the channel, so tell the emulator right now */ + (void)goldfish_cmd(pipe, PIPE_CMD_CLOSE); + + spin_lock_irqsave(&dev->lock, flags); + dev->pipes[pipe->id] = NULL; + signalled_pipes_remove_locked(dev, pipe); + spin_unlock_irqrestore(&dev->lock, flags); + + filp->private_data = NULL; + free_page((unsigned long)pipe->command_buffer); + kfree(pipe); + return 0; +} + +static const struct file_operations goldfish_pipe_fops = { + .owner = THIS_MODULE, + .read = goldfish_pipe_read, + .write = goldfish_pipe_write, + .poll = goldfish_pipe_poll, + .open = goldfish_pipe_open, + .release = goldfish_pipe_release, +}; + +static struct miscdevice goldfish_pipe_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "goldfish_pipe", + .fops = &goldfish_pipe_fops, +}; + +static int goldfish_pipe_device_init_v2(struct platform_device *pdev) +{ + char *page; + struct goldfish_pipe_dev *dev = pipe_dev; + int err = devm_request_irq(&pdev->dev, dev->irq, goldfish_pipe_interrupt, + IRQF_SHARED, "goldfish_pipe", dev); + if (err) { + dev_err(&pdev->dev, "unable to allocate IRQ for v2\n"); + return err; + } + + err = misc_register(&goldfish_pipe_dev); + if (err) { + dev_err(&pdev->dev, "unable to register v2 device\n"); + return err; + } + + dev->first_signalled_pipe = NULL; + dev->pipes_capacity = INITIAL_PIPES_CAPACITY; + dev->pipes = kcalloc(dev->pipes_capacity, sizeof(*dev->pipes), GFP_KERNEL); + if (!dev->pipes) + return -ENOMEM; + + /* + * We're going to pass two buffers, open_command_params and + * signalled_pipe_buffers, to the host. This means each of those buffers + * needs to be contained in a single physical page. The easiest choice is + * to just allocate a page and place the buffers in it. 
+ */ + BUG_ON(sizeof(*dev->buffers) > PAGE_SIZE); + page = (char*)__get_free_page(GFP_KERNEL); + if (!page) { + kfree(dev->pipes); + return -ENOMEM; + } + dev->buffers = (struct goldfish_pipe_dev_buffers*)page; + + /* Send the buffer addresses to the host */ + { + u64 paddr = __pa(&dev->buffers->signalled_pipe_buffers); + writel((u32)(unsigned long)(paddr >> 32), dev->base + PIPE_REG_SIGNAL_BUFFER_HIGH); + writel((u32)(unsigned long)paddr, dev->base + PIPE_REG_SIGNAL_BUFFER); + writel((u32)MAX_SIGNALLED_PIPES, dev->base + PIPE_REG_SIGNAL_BUFFER_COUNT); + + paddr = __pa(&dev->buffers->open_command_params); + writel((u32)(unsigned long)(paddr >> 32), dev->base + PIPE_REG_OPEN_BUFFER_HIGH); + writel((u32)(unsigned long)paddr, dev->base + PIPE_REG_OPEN_BUFFER); + } + return 0; +} + +static void goldfish_pipe_device_deinit_v2(struct platform_device *pdev) { + struct goldfish_pipe_dev *dev = pipe_dev; + misc_deregister(&goldfish_pipe_dev); + kfree(dev->pipes); + free_page((unsigned long)dev->buffers); +} + +static int goldfish_pipe_probe(struct platform_device *pdev) +{ + int err; + struct resource *r; + struct goldfish_pipe_dev *dev = pipe_dev; + + BUG_ON(sizeof(struct goldfish_pipe_command) > PAGE_SIZE); + + /* not thread safe, but this should not happen */ + WARN_ON(dev->base != NULL); + + spin_lock_init(&dev->lock); + + r = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (r == NULL || resource_size(r) < PAGE_SIZE) { + dev_err(&pdev->dev, "can't allocate i/o page\n"); + return -EINVAL; + } + dev->base = devm_ioremap(&pdev->dev, r->start, PAGE_SIZE); + if (dev->base == NULL) { + dev_err(&pdev->dev, "ioremap failed\n"); + return -EINVAL; + } + + r = platform_get_resource(pdev, IORESOURCE_IRQ, 0); + if (r == NULL) { + err = -EINVAL; + goto error; + } + dev->irq = r->start; + + /* + * Exchange the versions with the host device + * + * Note: v1 driver used to not report its version, so we write it before + * reading device version back: this allows the host implementation to + * detect the old driver (if there was no version write before read). 
+ */ + writel((u32)PIPE_DRIVER_VERSION, dev->base + PIPE_REG_VERSION); + dev->version = readl(dev->base + PIPE_REG_VERSION); + if (dev->version < PIPE_CURRENT_DEVICE_VERSION) { + /* initialize the old device version */ + err = goldfish_pipe_device_init_v1(pdev); + } else { + /* Host device supports the new interface */ + err = goldfish_pipe_device_init_v2(pdev); + } + if (!err) + return 0; + +error: + dev->base = NULL; + return err; +} + +static int goldfish_pipe_remove(struct platform_device *pdev) +{ + struct goldfish_pipe_dev *dev = pipe_dev; + if (dev->version < PIPE_CURRENT_DEVICE_VERSION) + goldfish_pipe_device_deinit_v1(pdev); + else + goldfish_pipe_device_deinit_v2(pdev); + dev->base = NULL; + return 0; +} + +static const struct acpi_device_id goldfish_pipe_acpi_match[] = { + { "GFSH0003", 0 }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, goldfish_pipe_acpi_match); + +static const struct of_device_id goldfish_pipe_of_match[] = { + { .compatible = "google,android-pipe", }, + {}, +}; +MODULE_DEVICE_TABLE(of, goldfish_pipe_of_match); + +static struct platform_driver goldfish_pipe_driver = { + .probe = goldfish_pipe_probe, + .remove = goldfish_pipe_remove, + .driver = { + .name = "goldfish_pipe", + .of_match_table = goldfish_pipe_of_match, + .acpi_match_table = ACPI_PTR(goldfish_pipe_acpi_match), + } +}; + +module_platform_driver(goldfish_pipe_driver); +MODULE_AUTHOR("David Turner "); +MODULE_LICENSE("GPL"); From f5555c059ec1ee4a7298baa373aed789d35f0133 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Thu, 23 Jul 2015 10:40:57 -0700 Subject: [PATCH 598/607] ANDROID: arch: x86: disable pic for Android toolchain Android toolchains enable PIC, so explicitly disable it with -fno-pic (this is the upstream gcc default) Signed-off-by: Greg Hackmann (cherry picked from commit 892606ece2bebfa5a1ed62e9552cc973707ae9d3) Change-Id: I1e600363e5d18e459479fe4eb23d76855e16868d --- arch/x86/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 4086abca0b32..53949c886341 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -97,6 +97,8 @@ else KBUILD_CFLAGS += $(call cc-option,-mno-80387) KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387) + KBUILD_CFLAGS += -fno-pic + # Use -mpreferred-stack-boundary=3 if supported. KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3) From 6297c6ba0d217d5b0998738fbfaff2f04cad77e6 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Thu, 17 Nov 2016 17:01:43 -0800 Subject: [PATCH 599/607] arm64: rename ranchu defconfig to ranchu64 Change-Id: Ib7cd1ef722167905957623f65c3cc064e9d5c357 --- arch/arm64/configs/{ranchu_defconfig => ranchu64_defconfig} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename arch/arm64/configs/{ranchu_defconfig => ranchu64_defconfig} (100%) diff --git a/arch/arm64/configs/ranchu_defconfig b/arch/arm64/configs/ranchu64_defconfig similarity index 100% rename from arch/arm64/configs/ranchu_defconfig rename to arch/arm64/configs/ranchu64_defconfig From 6502f5fcefadabb4ebd44583a11610751c3ff194 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 18 Nov 2016 07:26:19 +0100 Subject: [PATCH 600/607] ANDROID: goldfish_pipe: fix call_kern.cocci warnings Function get_free_pipe_id_locked called on line 671 inside lock on line 669 but uses GFP_KERNEL. Replace with GFP_ATOMIC. 
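The rule behind this warning: GFP_KERNEL allocations are allowed to sleep, and sleeping is forbidden while a spinlock is held, so any allocation made under dev->lock must use GFP_ATOMIC. A minimal sketch of the pattern (variable names are illustrative, not the exact call site):

	spin_lock_irqsave(&dev->lock, flags);
	/* ...cannot sleep from here until the unlock... */
	pipes = kcalloc(new_capacity, sizeof(*pipes), GFP_ATOMIC);
	if (!pipes) {
		spin_unlock_irqrestore(&dev->lock, flags);
		return -ENOMEM;
	}
	/* ...use the allocation... */
	spin_unlock_irqrestore(&dev->lock, flags);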
Generated by: scripts/coccinelle/locks/call_kern.cocci CC: Yurii Zubrytskyi Signed-off-by: Julia Lawall Signed-off-by: Fengguang Wu Signed-off-by: Guenter Roeck --- drivers/platform/goldfish/goldfish_pipe_v2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/platform/goldfish/goldfish_pipe_v2.c b/drivers/platform/goldfish/goldfish_pipe_v2.c index bdb039bc7dde..ad373ed36555 100644 --- a/drivers/platform/goldfish/goldfish_pipe_v2.c +++ b/drivers/platform/goldfish/goldfish_pipe_v2.c @@ -616,7 +616,8 @@ static int get_free_pipe_id_locked(struct goldfish_pipe_dev *dev) /* Reallocate the array */ u32 new_capacity = 2 * dev->pipes_capacity; struct goldfish_pipe **pipes = - kcalloc(new_capacity, sizeof(*pipes), GFP_KERNEL); + kcalloc(new_capacity, sizeof(*pipes), + GFP_ATOMIC); if (!pipes) return -ENOMEM; memcpy(pipes, dev->pipes, sizeof(*pipes) * dev->pipes_capacity); From 5f93f3cb3ab26abd6791bc1f01e6a9db73e65536 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Fri, 18 Nov 2016 13:16:07 +0800 Subject: [PATCH 601/607] ANDROID: video: goldfishfb: fix platform_no_drv_owner.cocci warnings drivers/video/fbdev/goldfishfb.c:318:3-8: No need to set .owner here. The core will do it. Remove .owner field if calls are used which set it automatically Generated by: scripts/coccinelle/api/platform_no_drv_owner.cocci CC: Greg Hackmann Signed-off-by: Fengguang Wu Signed-off-by: Guenter Roeck --- drivers/video/fbdev/goldfishfb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/video/fbdev/goldfishfb.c b/drivers/video/fbdev/goldfishfb.c index 131fee0341c3..1e56b50e4082 100644 --- a/drivers/video/fbdev/goldfishfb.c +++ b/drivers/video/fbdev/goldfishfb.c @@ -322,7 +322,6 @@ static struct platform_driver goldfish_fb_driver = { .remove = goldfish_fb_remove, .driver = { .name = "goldfish_fb", - .owner = THIS_MODULE, .of_match_table = goldfish_fb_of_match, .acpi_match_table = ACPI_PTR(goldfish_fb_acpi_match), } From 37e461fc4453488c394ccdbdcb8e286c122b047a Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Fri, 18 Nov 2016 11:09:02 -0800 Subject: [PATCH 602/607] ANDROID: goldfish: goldfish_pipe: fix locking errors If the get_user_pages_fast() call in goldfish_pipe_read_write() failed, it would return while still holding pipe->lock. goldfish_pipe_read_write() later releases and tries to re-acquire pipe->lock. If the re-acquire call failed, goldfish_pipe_read_write() would try unlock pipe->lock on exit anyway. This fixes the smatch messages: drivers/platform/goldfish/goldfish_pipe.c:392 goldfish_pipe_read_write() error: double unlock 'mutex:&pipe->lock' drivers/platform/goldfish/goldfish_pipe.c:397 goldfish_pipe_read_write() warn: inconsistent returns 'mutex:&pipe->lock'. 
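The fix below follows the usual lock-balance rule: every early-return path must leave pipe->lock in a known state. Schematically (simplified from the diff that follows):

	ret = get_user_pages_fast(first_page, requested_pages, !is_write, pages);
	if (ret <= 0) {
		mutex_unlock(&pipe->lock);	/* lock is still held here, so drop it */
		return ret;
	}
	/* ...transfer data... */
	mutex_unlock(&pipe->lock);		/* release before sleeping on the wake queue */
	/* ...wait for the wake event... */
	if (mutex_lock_interruptible(&pipe->lock))
		return -ERESTARTSYS;		/* re-acquire failed: lock not held, just return */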
Change-Id: Ifd06a76b32027ca451a001704ade0c5440ed69c4 Signed-off-by: Greg Hackmann --- drivers/platform/goldfish/goldfish_pipe.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c index cf7ce97e7346..fd1452e28352 100644 --- a/drivers/platform/goldfish/goldfish_pipe.c +++ b/drivers/platform/goldfish/goldfish_pipe.c @@ -273,11 +273,13 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, if (ret == 0) { DPRINT("%s: error: (requested pages == 0) (wanted %d)\n", __FUNCTION__, requested_pages); + mutex_unlock(&pipe->lock); return ret; } if (ret < 0) { DPRINT("%s: (requested pages < 0) %d \n", __FUNCTION__, requested_pages); + mutex_unlock(&pipe->lock); return ret; } @@ -384,10 +386,8 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, } /* Try to re-acquire the lock */ - if (mutex_lock_interruptible(&pipe->lock)) { - ret = -ERESTARTSYS; - break; - } + if (mutex_lock_interruptible(&pipe->lock)) + return -ERESTARTSYS; } mutex_unlock(&pipe->lock); From dc1767cd0f28e0d1ba607bb4b5e397366ae87336 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Fri, 18 Nov 2016 11:40:40 -0800 Subject: [PATCH 603/607] ANDROID: goldfish_pipe: fix allmodconfig build tree: https://android.googlesource.com/kernel/common android-4.4 head: 6297c6ba0d217d5b0998738fbfaff2f04cad77e6 commit: bc43565e1ac5ba3f204886a2275726bb4c3d44e6 [18/20] ANDROID: goldfish_pipe: An implementation of more parallel pipe config: i386-randconfig-s1-201646 (attached as .config) compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901 reproduce: git checkout bc43565e1ac5ba3f204886a2275726bb4c3d44e6 # save the attached .config to linux build tree make ARCH=i386 All errors (new ones prefixed by >>): >> ERROR: "goldfish_pipe_device_deinit_v1" [drivers/platform/goldfish/goldfish_pipe_v2.ko] undefined! >> ERROR: "goldfish_pipe_device_init_v1" [drivers/platform/goldfish/goldfish_pipe_v2.ko] undefined! >> ERROR: "pipe_dev" [drivers/platform/goldfish/goldfish_pipe.ko] undefined! Change-Id: Ibd51441edf82e6bb6824acc05ea795570cc374e8 --- drivers/platform/goldfish/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/platform/goldfish/Makefile b/drivers/platform/goldfish/Makefile index e53ae2fc717b..277a820ee4e1 100644 --- a/drivers/platform/goldfish/Makefile +++ b/drivers/platform/goldfish/Makefile @@ -2,4 +2,5 @@ # Makefile for Goldfish platform specific drivers # obj-$(CONFIG_GOLDFISH_BUS) += pdev_bus.o -obj-$(CONFIG_GOLDFISH_PIPE) += goldfish_pipe.o goldfish_pipe_v2.o +obj-$(CONFIG_GOLDFISH_PIPE) += goldfish_pipe_all.o +goldfish_pipe_all-objs := goldfish_pipe.o goldfish_pipe_v2.o From af80d7c3e91450fbcad01c497b394bf8ab01d37c Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 28 Nov 2016 14:35:22 -0800 Subject: [PATCH 604/607] UPSTREAM: timekeeping: Add a fast and NMI safe boot clock This boot clock can be used as a tracing clock and will account for suspend time. To keep it NMI safe since we're accessing from tracing, we're not using a separate timekeeper with updates to monotonic clock and boot offset protected with seqlocks. This has the following minor side effects: (1) Its possible that a timestamp be taken after the boot offset is updated but before the timekeeper is updated. 
If this happens, the new boot offset is added to the old timekeeping making the clock appear to update slightly earlier: CPU 0 CPU 1 timekeeping_inject_sleeptime64() __timekeeping_inject_sleeptime(tk, delta); timestamp(); timekeeping_update(tk, TK_CLEAR_NTP...); (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be partially updated. Since the tk->offs_boot update is a rare event, this should be a rare occurrence which postprocessing should be able to handle. Bug: b/33184060 Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Reviewed-by: Thomas Gleixner Signed-off-by: Joel Fernandes Signed-off-by: John Stultz --- include/linux/timekeeping.h | 1 + kernel/time/timekeeping.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index ec89d846324c..b7246d2ed7c9 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -233,6 +233,7 @@ static inline u64 ktime_get_raw_ns(void) extern u64 ktime_get_mono_fast_ns(void); extern u64 ktime_get_raw_fast_ns(void); +extern u64 ktime_get_boot_fast_ns(void); /* * Timespec interfaces utilizing the ktime based ones diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d563c1960302..7c92bad46761 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -402,6 +402,35 @@ u64 ktime_get_raw_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); +/** + * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. + * + * To keep it NMI safe since we're accessing from tracing, we're not using a + * separate timekeeper with updates to monotonic clock and boot offset + * protected with seqlocks. This has the following minor side effects: + * + * (1) Its possible that a timestamp be taken after the boot offset is updated + * but before the timekeeper is updated. If this happens, the new boot offset + * is added to the old timekeeping making the clock appear to update slightly + * earlier: + * CPU 0 CPU 1 + * timekeeping_inject_sleeptime64() + * __timekeeping_inject_sleeptime(tk, delta); + * timestamp(); + * timekeeping_update(tk, TK_CLEAR_NTP...); + * + * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be + * partially updated. Since the tk->offs_boot update is a rare event, this + * should be a rare occurrence which postprocessing should be able to handle. + */ +u64 notrace ktime_get_boot_fast_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); +} +EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); + /* Suspend-time cycles value for halted fast timekeeper. */ static cycle_t cycles_at_suspend; From 5fe7d45f445841c6261e8695428a532db0e0377c Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 28 Nov 2016 14:35:23 -0800 Subject: [PATCH 605/607] UPSTREAM: trace: Add an option for boot clock as trace clock Unlike monotonic clock, boot clock as a trace clock will account for time spent in suspend useful for tracing suspend/resume. This uses earlier introduced infrastructure for using the fast boot clock. 
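Once this entry is in place, the boot clock is selected the same way as the existing trace clocks, by writing its name into the trace_clock file (the documentation patch later in this series describes the same interface). Assuming debugfs is mounted at the usual /sys/kernel/debug path:

	echo boot > /sys/kernel/debug/tracing/trace_clock
	cat /sys/kernel/debug/tracing/trace_clock	# the active clock is shown in brackets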
Bug: b/33184060 Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Reviewed-by: Thomas Gleixner Signed-off-by: Joel Fernandes Signed-off-by: John Stultz Acked-by: Steven Rostedt --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5f375d4c05fb..312557c04c10 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -890,6 +890,7 @@ static struct { { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, + { ktime_get_boot_fast_ns, "boot", 1 }, ARCH_TRACE_CLOCKS }; From 7c04e2fcb076dd28760f2eca2eaf48dd37e52a40 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 28 Nov 2016 14:35:24 -0800 Subject: [PATCH 606/607] UPSTREAM: trace: Update documentation for mono, mono_raw and boot clock Documentation was missing for mono and mono_raw, add them and also for the boot clock introduced in this series. Bug: b/33184060 Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Reviewed-by: Thomas Gleixner Signed-off-by: Joel Fernandes Signed-off-by: John Stultz Acked-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index cf2f9e9dfe4d..fa16fb2302a5 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -357,6 +357,26 @@ of ftrace. Here is a list of some of the key files: to correlate events across hypervisor/guest if tb_offset is known. + mono: This uses the fast monotonic clock (CLOCK_MONOTONIC) + which is monotonic and is subject to NTP rate adjustments. + + mono_raw: + This is the raw monotonic clock (CLOCK_MONOTONIC_RAW) + which is montonic but is not subject to any rate adjustments + and ticks at the same rate as the hardware clocksource. + + boot: This is the boot clock (CLOCK_BOOTTIME) and is based on the + fast monotonic clock, but also accounts for time spent in + suspend. Since the clock access is designed for use in + tracing in the suspend path, some side effects are possible + if clock is accessed after the suspend time is accounted before + the fast mono clock is updated. In this case, the clock update + appears to happen slightly sooner than it normally would have. + Also on 32-bit systems, its possible that the 64-bit boot offset + sees a partial update. These effects are rare and post + processing should be able to handle them. See comments on + ktime_get_boot_fast_ns function for more information. + To set a clock, simply echo the clock name into this file. echo global > trace_clock From 8e53f7c02371eeef39805cc998eb73ac19a1320d Mon Sep 17 00:00:00 2001 From: Ke Wang Date: Fri, 25 Nov 2016 13:38:45 +0800 Subject: [PATCH 607/607] sched: tune: Fix lacking spinlock initialization The spinlock used by boost_groups in sched tune must be initialized. 
This commit fixes this lack and the following errors: [ 0.384739] c2 BUG: spinlock bad magic on CPU#2, swapper/2/0 [ 0.390313] c2 lock: 0xffffffc15fe1fc80, .magic:00000000, .owner: /-1, .owner_cpu: 0 [ 0.398739] c2 CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.4.6+ #4 [ 0.404816] c2 Hardware name: Spreadtrum SP9860gBoard (DT) [ 0.410462] c2 Call trace: [ 0.413159] c2 [] dump_backtrace+0x0/0x210 [ 0.418803] c2 [] show_stack+0x20/0x28 [ 0.424100] c2 [] dump_stack+0xa8/0xe0 [ 0.429398] c2 [] spin_dump+0x78/0x9c [ 0.434608] c2 [] spin_bug+0x30/0x3c [ 0.439644] c2 [] do_raw_spin_lock+0xac/0x1b4 [ 0.445639] c2 [] _raw_spin_lock_irqsave+0x58/0x68 [ 0.451977] c2 [] schedtune_enqueue_task+0x84/0x3bc [ 0.458320] c2 [] enqueue_task_fair+0x438/0x208c [ 0.464487] c2 [] activate_task+0x70/0xd0 [ 0.470130] c2 [] ttwu_do_activate.constprop.131+0x4c/0x98 [ 0.477079] c2 [] try_to_wake_up+0x254/0x54c [ 0.482899] c2 [] default_wake_function+0x30/0x3c [ 0.489154] c2 [] autoremove_wake_function+0x3c/0x6c [ 0.495754] c2 [] __wake_up_common+0x64/0xa4 [ 0.501574] c2 [] __wake_up+0x48/0x60 [ 0.506788] c2 [] rcu_gp_kthread_wake+0x50/0x5c [ 0.512866] c2 [] note_gp_changes+0xac/0xd4 [ 0.518597] c2 [] rcu_process_callbacks+0xe8/0x93c [ 0.524940] c2 [] __do_softirq+0x24c/0x5b8 [ 0.530584] c2 [] irq_exit+0xc0/0xec [ 0.535623] c2 [] __handle_domain_irq+0x94/0xf8 [ 0.541789] c2 [] gic_handle_irq+0x64/0xc0 Signed-off-by: Ke Wang --- kernel/sched/tune.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 68a24a044b0a..079b18802f17 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -725,6 +725,7 @@ schedtune_init_cgroups(void) for_each_possible_cpu(cpu) { bg = &per_cpu(cpu_boost_groups, cpu); memset(bg, 0, sizeof(struct boost_groups)); + raw_spin_lock_init(&bg->lock); } pr_info("schedtune: configured to support %d boost groups\n",