From 17c8a0d7e13bae92cc819c9060a0d39b3ce860c5 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Mon, 19 Nov 2018 10:52:31 -0800 Subject: net/mlx5: EQ, Use the right place to store/read IRQ affinity hint [ Upstream commit 1e86ace4c140fd5a693e266c9b23409358f25381 ] Currently the cpu affinity hint mask for completion EQs is stored and read from the wrong place, since reading and storing is done from the same index, there is no actual issue with that, but internal irq_info for completion EQs stars at MLX5_EQ_VEC_COMP_BASE offset in irq_info array, this patch changes the code to use the correct offset to store and read the IRQ affinity hint. Signed-off-by: Saeed Mahameed Reviewed-by: Leon Romanovsky Reviewed-by: Tariq Toukan Signed-off-by: Leon Romanovsky Signed-off-by: Sasha Levin --- include/linux/mlx5/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fb677e4f902d..88f0c530fe9c 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1195,7 +1195,7 @@ enum { static inline const struct cpumask * mlx5_get_vector_affinity_hint(struct mlx5_core_dev *dev, int vector) { - return dev->priv.irq_info[vector].mask; + return dev->priv.irq_info[vector + MLX5_EQ_VEC_COMP_BASE].mask; } #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From 4634125557a2354a675f0bd0bb89a8a994b5b39e Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 20 Dec 2018 17:23:43 +0100 Subject: drbd: Avoid Clang warning about pointless switch statment [ Upstream commit a52c5a16cf19d8a85831bb1b915a221dd4ffae3c ] There are several warnings from Clang about no case statement matching the constant 0: In file included from drivers/block/drbd/drbd_receiver.c:48: In file included from drivers/block/drbd/drbd_int.h:48: In file included from ./include/linux/drbd_genl_api.h:54: In file included from ./include/linux/genl_magic_struct.h:236: ./include/linux/drbd_genl.h:321:1: warning: no case matching constant switch condition '0' GENL_struct(DRBD_NLA_HELPER, 24, drbd_helper_info, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/genl_magic_struct.h:220:10: note: expanded from macro 'GENL_struct' switch (0) { ^ Silence this warning by adding a 'case 0:' statement. Additionally, adjust the alignment of the statements in the ct_assert_unique macro to avoid a checkpatch warning. This solution was originally sent by Arnd Bergmann with a default case statement: https://lore.kernel.org/patchwork/patch/756723/ Link: https://github.com/ClangBuiltLinux/linux/issues/43 Suggested-by: Lars Ellenberg Signed-off-by: Nathan Chancellor Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- include/linux/genl_magic_struct.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/genl_magic_struct.h b/include/linux/genl_magic_struct.h index 5972e4969197..eeae59d3ceb7 100644 --- a/include/linux/genl_magic_struct.h +++ b/include/linux/genl_magic_struct.h @@ -191,6 +191,7 @@ static inline void ct_assert_unique_operations(void) { switch (0) { #include GENL_MAGIC_INCLUDE_FILE + case 0: ; } } @@ -209,6 +210,7 @@ static inline void ct_assert_unique_top_level_attributes(void) { switch (0) { #include GENL_MAGIC_INCLUDE_FILE + case 0: ; } } @@ -218,7 +220,8 @@ static inline void ct_assert_unique_top_level_attributes(void) static inline void ct_assert_unique_ ## s_name ## _attributes(void) \ { \ switch (0) { \ - s_fields \ + s_fields \ + case 0: \ ; \ } \ } -- cgit v1.2.3 From f70123c6d3accc024445eca6b9ee01c1fe2b80d8 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 14 Dec 2018 14:34:43 -0800 Subject: kvm: Change offset in kvm_write_guest_offset_cached to unsigned MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 7a86dab8cf2f0fdf508f3555dddfc236623bff60 ] Since the offset is added directly to the hva from the gfn_to_hva_cache, a negative offset could result in an out of bounds write. The existing BUG_ON only checks for addresses beyond the end of the gfn_to_hva_cache, not for addresses before the start of the gfn_to_hva_cache. Note that all current call sites have non-negative offsets. Fixes: 4ec6e8636256 ("kvm: Introduce kvm_write_guest_offset_cached()") Reported-by: Cfir Cohen Signed-off-by: Jim Mattson Reviewed-by: Cfir Cohen Reviewed-by: Peter Shier Reviewed-by: Krish Sadhukhan Reviewed-by: Sean Christopherson Signed-off-by: Radim Krčmář Signed-off-by: Sasha Levin --- include/linux/kvm_host.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b6962ae6237e..4f7f19c1dc0a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -685,7 +685,8 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len); int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - void *data, int offset, unsigned long len); + void *data, unsigned int offset, + unsigned long len); int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); -- cgit v1.2.3 From 5e1f1c1f5d00ffba3600ba1ffb3a1fc7dae0a375 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 30 Jan 2019 07:13:58 -0600 Subject: cpu/hotplug: Fix "SMT disabled by BIOS" detection for KVM commit b284909abad48b07d3071a9fc9b5692b3e64914b upstream. With the following commit: 73d5e2b47264 ("cpu/hotplug: detect SMT disabled by BIOS") ... the hotplug code attempted to detect when SMT was disabled by BIOS, in which case it reported SMT as permanently disabled. However, that code broke a virt hotplug scenario, where the guest is booted with only primary CPU threads, and a sibling is brought online later. The problem is that there doesn't seem to be a way to reliably distinguish between the HW "SMT disabled by BIOS" case and the virt "sibling not yet brought online" case. So the above-mentioned commit was a bit misguided, as it permanently disabled SMT for both cases, preventing future virt sibling hotplugs. Going back and reviewing the original problems which were attempted to be solved by that commit, when SMT was disabled in BIOS: 1) /sys/devices/system/cpu/smt/control showed "on" instead of "notsupported"; and 2) vmx_vm_init() was incorrectly showing the L1TF_MSG_SMT warning. I'd propose that we instead consider #1 above to not actually be a problem. Because, at least in the virt case, it's possible that SMT wasn't disabled by BIOS and a sibling thread could be brought online later. So it makes sense to just always default the smt control to "on" to allow for that possibility (assuming cpuid indicates that the CPU supports SMT). The real problem is #2, which has a simple fix: change vmx_vm_init() to query the actual current SMT state -- i.e., whether any siblings are currently online -- instead of looking at the SMT "control" sysfs value. So fix it by: a) reverting the original "fix" and its followup fix: 73d5e2b47264 ("cpu/hotplug: detect SMT disabled by BIOS") bc2d8d262cba ("cpu/hotplug: Fix SMT supported evaluation") and b) changing vmx_vm_init() to query the actual current SMT state -- instead of the sysfs control value -- to determine whether the L1TF warning is needed. This also requires the 'sched_smt_present' variable to exported, instead of 'cpu_smt_control'. Fixes: 73d5e2b47264 ("cpu/hotplug: detect SMT disabled by BIOS") Reported-by: Igor Mammedov Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Cc: Joe Mario Cc: Jiri Kosina Cc: Peter Zijlstra Cc: kvm@vger.kernel.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/e3a85d585da28cc333ecbc1e78ee9216e6da9396.1548794349.git.jpoimboe@redhat.com Signed-off-by: Greg Kroah-Hartman --- include/linux/cpu.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 2a378d261914..c7712e042aba 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -188,12 +188,10 @@ enum cpuhp_smt_control { #if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT) extern enum cpuhp_smt_control cpu_smt_control; extern void cpu_smt_disable(bool force); -extern void cpu_smt_check_topology_early(void); extern void cpu_smt_check_topology(void); #else # define cpu_smt_control (CPU_SMT_ENABLED) static inline void cpu_smt_disable(bool force) { } -static inline void cpu_smt_check_topology_early(void) { } static inline void cpu_smt_check_topology(void) { } #endif -- cgit v1.2.3 From e0f784bf571528011a7421021f72dbe4bfe10a7c Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Tue, 29 Jan 2019 11:58:35 +0100 Subject: HID: debug: fix the ring buffer implementation commit 13054abbaa4f1fd4e6f3b4b63439ec033b4c8035 upstream. Ring buffer implementation in hid_debug_event() and hid_debug_events_read() is strange allowing lost or corrupted data. After commit 717adfdaf147 ("HID: debug: check length before copy_to_user()") it is possible to enter an infinite loop in hid_debug_events_read() by providing 0 as count, this locks up a system. Fix this by rewriting the ring buffer implementation with kfifo and simplify the code. This fixes CVE-2019-3819. v2: fix an execution logic and add a comment v3: use __set_current_state() instead of set_current_state() Backport to v4.14: 2 tree-wide patches 6396bb22151 ("treewide: kzalloc() -> kcalloc()") and a9a08845e9ac ("vfs: do bulk POLL* -> EPOLL* replacement") are missing in v4.14 so cherry-pick relevant pieces. Link: https://bugzilla.redhat.com/show_bug.cgi?id=1669187 Cc: stable@vger.kernel.org # v4.18+ Fixes: cd667ce24796 ("HID: use debugfs for events/reports dumping") Fixes: 717adfdaf147 ("HID: debug: check length before copy_to_user()") Signed-off-by: Vladis Dronov Reviewed-by: Oleg Nesterov Signed-off-by: Benjamin Tissoires Signed-off-by: Greg Kroah-Hartman --- include/linux/hid-debug.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid-debug.h b/include/linux/hid-debug.h index 8663f216c563..2d6100edf204 100644 --- a/include/linux/hid-debug.h +++ b/include/linux/hid-debug.h @@ -24,7 +24,10 @@ #ifdef CONFIG_DEBUG_FS +#include + #define HID_DEBUG_BUFSIZE 512 +#define HID_DEBUG_FIFOSIZE 512 void hid_dump_input(struct hid_device *, struct hid_usage *, __s32); void hid_dump_report(struct hid_device *, int , u8 *, int); @@ -37,11 +40,8 @@ void hid_debug_init(void); void hid_debug_exit(void); void hid_debug_event(struct hid_device *, char *); - struct hid_debug_list { - char *hid_debug_buf; - int head; - int tail; + DECLARE_KFIFO_PTR(hid_debug_fifo, char); struct fasync_struct *fasync; struct hid_device *hdev; struct list_head node; @@ -64,4 +64,3 @@ struct hid_debug_list { #endif #endif - -- cgit v1.2.3 From 085d735c858934e5d5bfaedb1fc98bd9135e6ff1 Mon Sep 17 00:00:00 2001 From: Hamish Martin Date: Wed, 13 Feb 2019 16:29:29 +0000 Subject: uio: Prevent device destruction while fds are open commit a93e7b331568227500186a465fee3c2cb5dffd1f upstream. Prevent destruction of a uio_device while user space apps hold open file descriptors to that device. Further, access to the 'info' member of the struct uio_device is protected by spinlock. This is to ensure stale pointers to data not under control of the UIO subsystem are not dereferenced. Signed-off-by: Hamish Martin Reviewed-by: Chris Packham Signed-off-by: Greg Kroah-Hartman [4.14 change __poll_t to unsigned int] Signed-off-by: Tommi Rantala Signed-off-by: Greg Kroah-Hartman --- include/linux/uio_driver.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index 3c85c81b0027..6c5f2074e14f 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -14,6 +14,7 @@ #ifndef _UIO_DRIVER_H_ #define _UIO_DRIVER_H_ +#include #include #include @@ -68,12 +69,13 @@ struct uio_port { struct uio_device { struct module *owner; - struct device *dev; + struct device dev; int minor; atomic_t event; struct fasync_struct *async_queue; wait_queue_head_t wait; struct uio_info *info; + spinlock_t info_lock; struct kobject *map_dir; struct kobject *portio_dir; }; -- cgit v1.2.3 From 3f400c2c2e7043b64c65d9b8709a0b353e1f8384 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 13 Feb 2019 16:29:33 +0000 Subject: uio: change to use the mutex lock instead of the spin lock commit 543af5861f41af0a5d2432f6fb5976af50f9cee5 upstream. We are hitting a regression with the following commit: commit a93e7b331568227500186a465fee3c2cb5dffd1f Author: Hamish Martin Date: Mon May 14 13:32:23 2018 +1200 uio: Prevent device destruction while fds are open The problem is the addition of spin_lock_irqsave in uio_write. This leads to hitting uio_write -> copy_from_user -> _copy_from_user -> might_fault and the logs filling up with sleeping warnings. I also noticed some uio drivers allocate memory, sleep, grab mutexes from callouts like open() and release and uio is now doing spin_lock_irqsave while calling them. Reported-by: Mike Christie CC: Hamish Martin Reviewed-by: Hamish Martin Signed-off-by: Xiubo Li Signed-off-by: Greg Kroah-Hartman Signed-off-by: Tommi Rantala Signed-off-by: Greg Kroah-Hartman --- include/linux/uio_driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index 6c5f2074e14f..6f8b68cd460f 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -75,7 +75,7 @@ struct uio_device { struct fasync_struct *async_queue; wait_queue_head_t wait; struct uio_info *info; - spinlock_t info_lock; + struct mutex info_lock; struct kobject *map_dir; struct kobject *portio_dir; }; -- cgit v1.2.3 From 785644d6731914407b87e70db00aca351a44a935 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Thu, 14 Feb 2019 11:31:17 +0100 Subject: net: create skb_gso_validate_mac_len() commit 2b16f048729bf35e6c28a40cbfad07239f9dcd90 upstream If you take a GSO skb, and split it into packets, will the MAC length (L2 + L3 + L4 headers + payload) of those packets be small enough to fit within a given length? Move skb_gso_mac_seglen() to skbuff.h with other related functions like skb_gso_network_seglen() so we can use it, and then create skb_gso_validate_mac_len to do the full calculation. Signed-off-by: Daniel Axtens Signed-off-by: David S. Miller [jwang: cherry pick for CVE-2018-1000026] Signed-off-by: Jack Wang Signed-off-by: Sasha Levin --- include/linux/skbuff.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 39c2570ddcf6..50a4a5968f3a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3317,6 +3317,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); void skb_scrub_packet(struct sk_buff *skb, bool xnet); unsigned int skb_gso_transport_seglen(const struct sk_buff *skb); bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu); +bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); int skb_ensure_writable(struct sk_buff *skb, int write_len); @@ -4087,6 +4088,21 @@ static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb) return hdr_len + skb_gso_transport_seglen(skb); } +/** + * skb_gso_mac_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_mac_seglen is used to determine the real size of the + * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 + * headers (TCP/UDP). + */ +static inline unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + return hdr_len + skb_gso_transport_seglen(skb); +} + /* Local Checksum Offload. * Compute outer checksum based on the assumption that the * inner checksum will be offloaded later. -- cgit v1.2.3 From b1b0ca701bd69e834492bf3e74e40f4c28432f03 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 4 Feb 2019 13:35:32 +0100 Subject: perf/x86: Add check_period PMU callback commit 81ec3f3c4c4d78f2d3b6689c9816bfbdf7417dbb upstream. Vince (and later on Ravi) reported crashes in the BTS code during fuzzing with the following backtrace: general protection fault: 0000 [#1] SMP PTI ... RIP: 0010:perf_prepare_sample+0x8f/0x510 ... Call Trace: ? intel_pmu_drain_bts_buffer+0x194/0x230 intel_pmu_drain_bts_buffer+0x160/0x230 ? tick_nohz_irq_exit+0x31/0x40 ? smp_call_function_single_interrupt+0x48/0xe0 ? call_function_single_interrupt+0xf/0x20 ? call_function_single_interrupt+0xa/0x20 ? x86_schedule_events+0x1a0/0x2f0 ? x86_pmu_commit_txn+0xb4/0x100 ? find_busiest_group+0x47/0x5d0 ? perf_event_set_state.part.42+0x12/0x50 ? perf_mux_hrtimer_restart+0x40/0xb0 intel_pmu_disable_event+0xae/0x100 ? intel_pmu_disable_event+0xae/0x100 x86_pmu_stop+0x7a/0xb0 x86_pmu_del+0x57/0x120 event_sched_out.isra.101+0x83/0x180 group_sched_out.part.103+0x57/0xe0 ctx_sched_out+0x188/0x240 ctx_resched+0xa8/0xd0 __perf_event_enable+0x193/0x1e0 event_function+0x8e/0xc0 remote_function+0x41/0x50 flush_smp_call_function_queue+0x68/0x100 generic_smp_call_function_single_interrupt+0x13/0x30 smp_call_function_single_interrupt+0x3e/0xe0 call_function_single_interrupt+0xf/0x20 The reason is that while event init code does several checks for BTS events and prevents several unwanted config bits for BTS event (like precise_ip), the PERF_EVENT_IOC_PERIOD allows to create BTS event without those checks being done. Following sequence will cause the crash: If we create an 'almost' BTS event with precise_ip and callchains, and it into a BTS event it will crash the perf_prepare_sample() function because precise_ip events are expected to come in with callchain data initialized, but that's not the case for intel_pmu_drain_bts_buffer() caller. Adding a check_period callback to be called before the period is changed via PERF_EVENT_IOC_PERIOD. It will deny the change if the event would become BTS. Plus adding also the limit_period check as well. Reported-by: Vince Weaver Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Naveen N. Rao Cc: Ravi Bangoria Cc: Stephane Eranian Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20190204123532.GA4794@krava Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- include/linux/perf_event.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8e22f24ded6a..956d76744c91 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -446,6 +446,11 @@ struct pmu { * Filter events for PMU-specific reasons. */ int (*filter_match) (struct perf_event *event); /* optional */ + + /* + * Check period value for PERF_EVENT_IOC_PERIOD ioctl. + */ + int (*check_period) (struct perf_event *event, u64 value); /* optional */ }; /** -- cgit v1.2.3 From 5d3d720df4a922e42d0590ddb3d685f1e0365c76 Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Fri, 15 Feb 2019 17:58:54 +0100 Subject: net: Fix for_each_netdev_feature on Big endian [ Upstream commit 3b89ea9c5902acccdbbdec307c85edd1bf52515e ] The features attribute is of type u64 and stored in the native endianes on the system. The for_each_set_bit() macro takes a pointer to a 32 bit array and goes over the bits in this area. On little Endian systems this also works with an u64 as the most significant bit is on the highest address, but on big endian the words are swapped. When we expect bit 15 here we get bit 47 (15 + 32). This patch converts it more or less to its own for_each_set_bit() implementation which works on 64 bit integers directly. This is then completely in host endianness and should work like expected. Fixes: fd867d51f ("net/core: generic support for disabling netdev features down stack") Signed-off-by: Hauke Mehrtens Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/netdev_features.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index b1b0ca7ccb2b..79f1c2f649c6 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -11,6 +11,7 @@ #define _LINUX_NETDEV_FEATURES_H #include +#include typedef u64 netdev_features_t; @@ -143,8 +144,26 @@ enum { #define NETIF_F_HW_ESP_TX_CSUM __NETIF_F(HW_ESP_TX_CSUM) #define NETIF_F_RX_UDP_TUNNEL_PORT __NETIF_F(RX_UDP_TUNNEL_PORT) -#define for_each_netdev_feature(mask_addr, bit) \ - for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT) +/* Finds the next feature with the highest number of the range of start till 0. + */ +static inline int find_next_netdev_feature(u64 feature, unsigned long start) +{ + /* like BITMAP_LAST_WORD_MASK() for u64 + * this sets the most significant 64 - start to 0. + */ + feature &= ~0ULL >> (-start & ((sizeof(feature) * 8) - 1)); + + return fls64(feature) - 1; +} + +/* This goes for the MSB to the LSB through the set feature bits, + * mask_addr should be a u64 and bit an int + */ +#define for_each_netdev_feature(mask_addr, bit) \ + for ((bit) = find_next_netdev_feature((mask_addr), \ + NETDEV_FEATURE_COUNT); \ + (bit) >= 0; \ + (bit) = find_next_netdev_feature((mask_addr), (bit) - 1)) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ -- cgit v1.2.3 From 8565b7fb8dfa0bc97818c2417e813de6147f5d76 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 16 Feb 2019 13:44:39 -0800 Subject: net: Add header for usage of fls64() [ Upstream commit 8681ef1f3d295bd3600315325f3b3396d76d02f6 ] Fixes: 3b89ea9c5902 ("net: Fix for_each_netdev_feature on Big endian") Suggested-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/netdev_features.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 79f1c2f649c6..de123f436f1a 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -11,6 +11,7 @@ #define _LINUX_NETDEV_FEATURES_H #include +#include #include typedef u64 netdev_features_t; -- cgit v1.2.3 From a096429cac56f3d4571ae7914f98bd2eb201b5fe Mon Sep 17 00:00:00 2001 From: Denis Bolotin Date: Thu, 3 Jan 2019 12:02:39 +0200 Subject: qed: Fix qed_chain_set_prod() for PBL chains with non power of 2 page count [ Upstream commit 2d533a9287f2011632977e87ce2783f4c689c984 ] In PBL chains with non power of 2 page count, the producer is not at the beginning of the chain when index is 0 after a wrap. Therefore, after the producer index wrap around, page index should be calculated more carefully. Signed-off-by: Denis Bolotin Signed-off-by: Ariel Elior Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/linux/qed/qed_chain.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h index 59ddf9af909e..2dd0a9ed5b36 100644 --- a/include/linux/qed/qed_chain.h +++ b/include/linux/qed/qed_chain.h @@ -663,6 +663,37 @@ out: static inline void qed_chain_set_prod(struct qed_chain *p_chain, u32 prod_idx, void *p_prod_elem) { + if (p_chain->mode == QED_CHAIN_MODE_PBL) { + u32 cur_prod, page_mask, page_cnt, page_diff; + + cur_prod = is_chain_u16(p_chain) ? p_chain->u.chain16.prod_idx : + p_chain->u.chain32.prod_idx; + + /* Assume that number of elements in a page is power of 2 */ + page_mask = ~p_chain->elem_per_page_mask; + + /* Use "cur_prod - 1" and "prod_idx - 1" since producer index + * reaches the first element of next page before the page index + * is incremented. See qed_chain_produce(). + * Index wrap around is not a problem because the difference + * between current and given producer indices is always + * positive and lower than the chain's capacity. + */ + page_diff = (((cur_prod - 1) & page_mask) - + ((prod_idx - 1) & page_mask)) / + p_chain->elem_per_page; + + page_cnt = qed_chain_get_page_cnt(p_chain); + if (is_chain_u16(p_chain)) + p_chain->pbl.c.u16.prod_page_idx = + (p_chain->pbl.c.u16.prod_page_idx - + page_diff + page_cnt) % page_cnt; + else + p_chain->pbl.c.u32.prod_page_idx = + (p_chain->pbl.c.u32.prod_page_idx - + page_diff + page_cnt) % page_cnt; + } + if (is_chain_u16(p_chain)) p_chain->u.chain16.prod_idx = (u16) prod_idx; else -- cgit v1.2.3 From dac7d4432b13d6b58923c1178b786bcda37c9e9e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 15 Feb 2019 12:15:47 -0500 Subject: net: validate untrusted gso packets without csum offload commit d5be7f632bad0f489879eed0ff4b99bd7fe0b74c upstream. Syzkaller again found a path to a kernel crash through bad gso input. By building an excessively large packet to cause an skb field to wrap. If VIRTIO_NET_HDR_F_NEEDS_CSUM was set this would have been dropped in skb_partial_csum_set. GSO packets that do not set checksum offload are suspicious and rare. Most callers of virtio_net_hdr_to_skb already pass them to skb_probe_transport_header. Move that test forward, change it to detect parse failure and drop packets on failure as those cleary are not one of the legitimate VIRTIO_NET_HDR_GSO types. Fixes: bfd5f4a3d605 ("packet: Add GSO/csum offload support.") Fixes: f43798c27684 ("tun: Allow GSO using virtio_net_hdr") Reported-by: syzbot Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/skbuff.h | 2 +- include/linux/virtio_net.h | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 50a4a5968f3a..3172e14d9398 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2377,7 +2377,7 @@ static inline void skb_probe_transport_header(struct sk_buff *skb, return; else if (skb_flow_dissect_flow_keys(skb, &keys, 0)) skb_set_transport_header(skb, keys.control.thoff); - else + else if (offset_hint >= 0) skb_set_transport_header(skb, offset_hint); } diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index cb462f9ab7dd..71f2394abbf7 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -57,6 +57,15 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, if (!skb_partial_csum_set(skb, start, off)) return -EINVAL; + } else { + /* gso packets without NEEDS_CSUM do not set transport_offset. + * probe and drop if does not match one of the above types. + */ + if (gso_type) { + skb_probe_transport_header(skb, -1); + if (!skb_transport_header_was_set(skb)) + return -EINVAL; + } } if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { -- cgit v1.2.3 From d996573ebd5c528c613e90b7e7daa961d58fa16e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 18 Feb 2019 23:37:12 -0500 Subject: net: avoid false positives in untrusted gso validation commit 9e8db5913264d3967b93c765a6a9e464d9c473db upstream. GSO packets with vnet_hdr must conform to a small set of gso_types. The below commit uses flow dissection to drop packets that do not. But it has false positives when the skb is not fully initialized. Dissection needs skb->protocol and skb->network_header. Infer skb->protocol from gso_type as the two must agree. SKB_GSO_UDP can use both ipv4 and ipv6, so try both. Exclude callers for which network header offset is not known. Fixes: d5be7f632bad ("net: validate untrusted gso packets without csum offload") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/virtio_net.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 71f2394abbf7..e0348cb0a1dd 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -61,10 +61,20 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, /* gso packets without NEEDS_CSUM do not set transport_offset. * probe and drop if does not match one of the above types. */ - if (gso_type) { + if (gso_type && skb->network_header) { + if (!skb->protocol) + virtio_net_hdr_set_proto(skb, hdr); +retry: skb_probe_transport_header(skb, -1); - if (!skb_transport_header_was_set(skb)) + if (!skb_transport_header_was_set(skb)) { + /* UFO does not specify ipv4 or 6: try both */ + if (gso_type & SKB_GSO_UDP && + skb->protocol == htons(ETH_P_IP)) { + skb->protocol = htons(ETH_P_IPV6); + goto retry; + } return -EINVAL; + } } } -- cgit v1.2.3 From 81bafd09bb8d98370d8d5d21a1b3bbc48f3dcf28 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Mon, 30 Oct 2017 11:08:16 -0700 Subject: sched/sysctl: Fix attributes of some extern declarations commit a9903f04e0a4ea522d959c2f287cdf0ab029e324 upstream. The definition of sysctl_sched_migration_cost, sysctl_sched_nr_migrate and sysctl_sched_time_avg includes the attribute const_debug. This attribute is not part of the extern declaration of these variables in include/linux/sched/sysctl.h, while it is in kernel/sched/sched.h, and as a result Clang generates warnings like this: kernel/sched/sched.h:1618:33: warning: section attribute is specified on redeclared variable [-Wsection] extern const_debug unsigned int sysctl_sched_time_avg; ^ ./include/linux/sched/sysctl.h:42:21: note: previous declaration is here extern unsigned int sysctl_sched_time_avg; The header only declares the variables when CONFIG_SCHED_DEBUG is defined, therefore it is not necessary to duplicate the definition of const_debug. Instead we can use the attribute __read_mostly, which is the expansion of const_debug when CONFIG_SCHED_DEBUG=y is set. Signed-off-by: Matthias Kaehlcke Reviewed-by: Nick Desaulniers Cc: Douglas Anderson Cc: Guenter Roeck Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shile Zhang Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171030180816.170850-1-mka@chromium.org Signed-off-by: Ingo Molnar Cc: Nathan Chancellor Signed-off-by: Greg Kroah-Hartman --- include/linux/sched/sysctl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index d6a18a3839cc..1c1a1512ec55 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -38,9 +38,9 @@ extern unsigned int sysctl_numa_balancing_scan_period_max; extern unsigned int sysctl_numa_balancing_scan_size; #ifdef CONFIG_SCHED_DEBUG -extern unsigned int sysctl_sched_migration_cost; -extern unsigned int sysctl_sched_nr_migrate; -extern unsigned int sysctl_sched_time_avg; +extern __read_mostly unsigned int sysctl_sched_migration_cost; +extern __read_mostly unsigned int sysctl_sched_nr_migrate; +extern __read_mostly unsigned int sysctl_sched_time_avg; int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, -- cgit v1.2.3 From 494c4399ef3bbc1efa4bd7f2f36454a5f4ef9e64 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Dec 2017 08:38:30 -0800 Subject: writeback: synchronize sync(2) against cgroup writeback membership switches [ Upstream commit 7fc5854f8c6efae9e7624970ab49a1eac2faefb1 ] sync_inodes_sb() can race against cgwb (cgroup writeback) membership switches and fail to writeback some inodes. For example, if an inode switches to another wb while sync_inodes_sb() is in progress, the new wb might not be visible to bdi_split_work_to_wbs() at all or the inode might jump from a wb which hasn't issued writebacks yet to one which already has. This patch adds backing_dev_info->wb_switch_rwsem to synchronize cgwb switch path against sync_inodes_sb() so that sync_inodes_sb() is guaranteed to see all the target wbs and inodes can't jump wbs to escape syncing. v2: Fixed misplaced rwsem init. Spotted by Jiufei. Signed-off-by: Tejun Heo Reported-by: Jiufei Xue Link: http://lkml.kernel.org/r/dc694ae2-f07f-61e1-7097-7c8411cee12d@gmail.com Acked-by: Jan Kara Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- include/linux/backing-dev-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 19240379637f..b186c4b464e0 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -165,6 +165,7 @@ struct backing_dev_info { struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ struct rb_root cgwb_congested_tree; /* their congested states */ struct mutex cgwb_release_mutex; /* protect shutdown of wb structs */ + struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */ #else struct bdi_writeback_congested *wb_congested; #endif -- cgit v1.2.3 From 8a1e11f6ab05cf04f625bf48212dd00ee1f5ed58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Thu, 24 Jan 2019 03:07:02 -0800 Subject: net: dev_is_mac_header_xmit() true for ARPHRD_RAWIP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 3b707c3008cad04604c1f50e39f456621821c414 ] __bpf_redirect() and act_mirred checks this boolean to determine whether to prefix an ethernet header. Signed-off-by: Maciej Żenczykowski Acked-by: Daniel Borkmann Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/linux/if_arp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/if_arp.h b/include/linux/if_arp.h index 3355efc89781..4125f60ee53b 100644 --- a/include/linux/if_arp.h +++ b/include/linux/if_arp.h @@ -54,6 +54,7 @@ static inline bool dev_is_mac_header_xmit(const struct net_device *dev) case ARPHRD_IPGRE: case ARPHRD_VOID: case ARPHRD_NONE: + case ARPHRD_RAWIP: return false; default: return true; -- cgit v1.2.3 From 85cf5519050d151a77273171f2b2faf0ce5af8eb Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 25 Jan 2019 12:53:07 +0530 Subject: cpufreq: Use struct kobj_attribute instead of struct global_attr commit 625c85a62cb7d3c79f6e16de3cfa972033658250 upstream. The cpufreq_global_kobject is created using kobject_create_and_add() helper, which assigns the kobj_type as dynamic_kobj_ktype and show/store routines are set to kobj_attr_show() and kobj_attr_store(). These routines pass struct kobj_attribute as an argument to the show/store callbacks. But all the cpufreq files created using the cpufreq_global_kobject expect the argument to be of type struct attribute. Things work fine currently as no one accesses the "attr" argument. We may not see issues even if the argument is used, as struct kobj_attribute has struct attribute as its first element and so they will both get same address. But this is logically incorrect and we should rather use struct kobj_attribute instead of struct global_attr in the cpufreq core and drivers and the show/store callbacks should take struct kobj_attribute as argument instead. This bug is caught using CFI CLANG builds in android kernel which catches mismatch in function prototypes for such callbacks. Reported-by: Donghee Han Reported-by: Sangkyu Kim Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- include/linux/cpufreq.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index cbf85c4c745f..cad1eb50d668 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -254,20 +254,12 @@ __ATTR(_name, 0644, show_##_name, store_##_name) static struct freq_attr _name = \ __ATTR(_name, 0200, NULL, store_##_name) -struct global_attr { - struct attribute attr; - ssize_t (*show)(struct kobject *kobj, - struct attribute *attr, char *buf); - ssize_t (*store)(struct kobject *a, struct attribute *b, - const char *c, size_t count); -}; - #define define_one_global_ro(_name) \ -static struct global_attr _name = \ +static struct kobj_attribute _name = \ __ATTR(_name, 0444, show_##_name, NULL) #define define_one_global_rw(_name) \ -static struct global_attr _name = \ +static struct kobj_attribute _name = \ __ATTR(_name, 0644, show_##_name, store_##_name) -- cgit v1.2.3 From 13e429213f160a92780d9144bfdfe286e9ff6a84 Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Wed, 30 Jan 2019 15:54:19 +0100 Subject: net: stmmac: Fallback to Platform Data clock in Watchdog conversion [ Upstream commit 4ec5302fa906ec9d86597b236f62315bacdb9622 ] If we don't have DT then stmmac_clk will not be available. Let's add a new Platform Data field so that we can specify the refclk by this mean. This way we can still use the coalesce command in PCI based setups. Signed-off-by: Jose Abreu Cc: Joao Pinto Cc: David S. Miller Cc: Giuseppe Cavallaro Cc: Alexandre Torgue Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 32feac5bbd75..5844105a482b 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -183,6 +183,7 @@ struct plat_stmmacenet_data { struct clk *pclk; struct clk *clk_ptp_ref; unsigned int clk_ptp_rate; + unsigned int clk_ref_rate; struct reset_control *stmmac_rst; struct stmmac_axi *axi; int has_gmac4; -- cgit v1.2.3 From 8939e8cdd004ba5d7dd3bbf9c105990f9ebe96f8 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 31 Jan 2019 11:19:43 +0000 Subject: irqchip/gic-v3-its: Fix ITT_entry_size accessor [ Upstream commit 56841070ccc87b463ac037d2d1f2beb8e5e35f0c ] According to ARM IHI 0069C (ID070116), we should use GITS_TYPER's bits [7:4] as ITT_entry_size instead of [8:4]. Although this is pretty annoying, it only results in a potential over-allocation of memory, and nothing bad happens. Fixes: 3dfa576bfb45 ("irqchip/gic-v3-its: Add probing for VLPI properties") Signed-off-by: Zenghui Yu [maz: massaged subject and commit message] Signed-off-by: Marc Zyngier Signed-off-by: Sasha Levin --- include/linux/irqchip/arm-gic-v3.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index bacb499c512c..845ff8c51564 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -306,7 +306,7 @@ #define GITS_TYPER_PLPIS (1UL << 0) #define GITS_TYPER_VLPIS (1UL << 1) #define GITS_TYPER_ITT_ENTRY_SIZE_SHIFT 4 -#define GITS_TYPER_ITT_ENTRY_SIZE(r) ((((r) >> GITS_TYPER_ITT_ENTRY_SIZE_SHIFT) & 0x1f) + 1) +#define GITS_TYPER_ITT_ENTRY_SIZE(r) ((((r) >> GITS_TYPER_ITT_ENTRY_SIZE_SHIFT) & 0xf) + 1) #define GITS_TYPER_IDBITS_SHIFT 8 #define GITS_TYPER_DEVBITS_SHIFT 13 #define GITS_TYPER_DEVBITS(r) ((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1) -- cgit v1.2.3 From 7b1386a3eb472f4f7cda800a751c6e54def6cee6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 14 Feb 2019 16:20:25 +0000 Subject: keys: Fix dependency loop between construction record and auth key [ Upstream commit 822ad64d7e46a8e2c8b8a796738d7b657cbb146d ] In the request_key() upcall mechanism there's a dependency loop by which if a key type driver overrides the ->request_key hook and the userspace side manages to lose the authorisation key, the auth key and the internal construction record (struct key_construction) can keep each other pinned. Fix this by the following changes: (1) Killing off the construction record and using the auth key instead. (2) Including the operation name in the auth key payload and making the payload available outside of security/keys/. (3) The ->request_key hook is given the authkey instead of the cons record and operation name. Changes (2) and (3) allow the auth key to naturally be cleaned up if the keyring it is in is destroyed or cleared or the auth key is unlinked. Fixes: 7ee02a316600 ("keys: Fix dependency loop between construction record and auth key") Signed-off-by: David Howells Signed-off-by: James Morris Signed-off-by: Sasha Levin --- include/linux/key-type.h | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/key-type.h b/include/linux/key-type.h index 9520fc3c3b9a..dfb3ba782d2c 100644 --- a/include/linux/key-type.h +++ b/include/linux/key-type.h @@ -17,15 +17,6 @@ #ifdef CONFIG_KEYS -/* - * key under-construction record - * - passed to the request_key actor if supplied - */ -struct key_construction { - struct key *key; /* key being constructed */ - struct key *authkey;/* authorisation for key being constructed */ -}; - /* * Pre-parsed payload, used by key add, update and instantiate. * @@ -47,8 +38,7 @@ struct key_preparsed_payload { time_t expiry; /* Expiry time of key */ } __randomize_layout; -typedef int (*request_key_actor_t)(struct key_construction *key, - const char *op, void *aux); +typedef int (*request_key_actor_t)(struct key *auth_key, void *aux); /* * Preparsed matching criterion. @@ -170,20 +160,20 @@ extern int key_instantiate_and_link(struct key *key, const void *data, size_t datalen, struct key *keyring, - struct key *instkey); + struct key *authkey); extern int key_reject_and_link(struct key *key, unsigned timeout, unsigned error, struct key *keyring, - struct key *instkey); -extern void complete_request_key(struct key_construction *cons, int error); + struct key *authkey); +extern void complete_request_key(struct key *authkey, int error); static inline int key_negate_and_link(struct key *key, unsigned timeout, struct key *keyring, - struct key *instkey) + struct key *authkey) { - return key_reject_and_link(key, timeout, ENOKEY, keyring, instkey); + return key_reject_and_link(key, timeout, ENOKEY, keyring, authkey); } extern int generic_key_instantiate(struct key *key, struct key_preparsed_payload *prep); -- cgit v1.2.3 From 1bdc347e64e359e4383a08fdc8ed9f9a3ccb4a8d Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 23 Jan 2019 15:19:17 +0100 Subject: splice: don't merge into linked buffers commit a0ce2f0aa6ad97c3d4927bf2ca54bcebdf062d55 upstream. Before this patch, it was possible for two pipes to affect each other after data had been transferred between them with tee(): ============ $ cat tee_test.c int main(void) { int pipe_a[2]; if (pipe(pipe_a)) err(1, "pipe"); int pipe_b[2]; if (pipe(pipe_b)) err(1, "pipe"); if (write(pipe_a[1], "abcd", 4) != 4) err(1, "write"); if (tee(pipe_a[0], pipe_b[1], 2, 0) != 2) err(1, "tee"); if (write(pipe_b[1], "xx", 2) != 2) err(1, "write"); char buf[5]; if (read(pipe_a[0], buf, 4) != 4) err(1, "read"); buf[4] = 0; printf("got back: '%s'\n", buf); } $ gcc -o tee_test tee_test.c $ ./tee_test got back: 'abxx' $ ============ As suggested by Al Viro, fix it by creating a separate type for non-mergeable pipe buffers, then changing the types of buffers in splice_pipe_to_pipe() and link_pipe(). Cc: Fixes: 7c77f0b3f920 ("splice: implement pipe to pipe splicing") Fixes: 70524490ee2e ("[PATCH] splice: add support for sys_tee()") Suggested-by: Al Viro Signed-off-by: Jann Horn Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- include/linux/pipe_fs_i.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 6a80cfc63e0c..befdcd304b3d 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -183,6 +183,7 @@ void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); +void pipe_buf_mark_unmergeable(struct pipe_buffer *buf); extern const struct pipe_buf_operations nosteal_pipe_buf_ops; -- cgit v1.2.3 From dfdac22666d229538f499c5d56d38c05b4859f1e Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Wed, 23 Jan 2019 17:44:16 +0300 Subject: device property: Fix the length used in PROPERTY_ENTRY_STRING() commit 2b6e492467c78183bb629bb0a100ea3509b615a5 upstream. With string type property entries we need to use sizeof(const char *) instead of the number of characters as the length of the entry. If the string was shorter then sizeof(const char *), attempts to read it would have failed with -EOVERFLOW. The problem has been hidden because all build-in string properties have had a string longer then 8 characters until now. Fixes: a85f42047533 ("device property: helper macros for property entry creation") Cc: 4.5+ # 4.5+ Signed-off-by: Heikki Krogerus Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- include/linux/property.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 89d94b349912..45777ec1c524 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -252,7 +252,7 @@ struct property_entry { #define PROPERTY_ENTRY_STRING(_name_, _val_) \ (struct property_entry) { \ .name = _name_, \ - .length = sizeof(_val_), \ + .length = sizeof(const char *), \ .is_string = true, \ { .value = { .str = _val_ } }, \ } -- cgit v1.2.3 From a8aac659b9652430ccf898dd61bc6f996e3aef9d Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Thu, 31 Jan 2019 14:58:39 +0000 Subject: arm64: Fix HCR.TGE status for NMI contexts commit 5870970b9a828d8693aa6d15742573289d7dbcd0 upstream. When using VHE, the host needs to clear HCR_EL2.TGE bit in order to interact with guest TLBs, switching from EL2&0 translation regime to EL1&0. However, some non-maskable asynchronous event could happen while TGE is cleared like SDEI. Because of this address translation operations relying on EL2&0 translation regime could fail (tlb invalidation, userspace access, ...). Fix this by properly setting HCR_EL2.TGE when entering NMI context and clear it if necessary when returning to the interrupted context. Signed-off-by: Julien Thierry Suggested-by: Marc Zyngier Reviewed-by: Marc Zyngier Reviewed-by: James Morse Cc: Arnd Bergmann Cc: Will Deacon Cc: Marc Zyngier Cc: James Morse Cc: linux-arch@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- include/linux/hardirq.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 0fbbcdf0c178..da0af631ded5 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -60,8 +60,14 @@ extern void irq_enter(void); */ extern void irq_exit(void); +#ifndef arch_nmi_enter +#define arch_nmi_enter() do { } while (0) +#define arch_nmi_exit() do { } while (0) +#endif + #define nmi_enter() \ do { \ + arch_nmi_enter(); \ printk_nmi_enter(); \ lockdep_off(); \ ftrace_nmi_enter(); \ @@ -80,6 +86,7 @@ extern void irq_exit(void); ftrace_nmi_exit(); \ lockdep_on(); \ printk_nmi_exit(); \ + arch_nmi_exit(); \ } while (0) #endif /* LINUX_HARDIRQ_H */ -- cgit v1.2.3 From 8259d5615683288dd06e46ee6ece2fb225ae9ff7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sun, 6 Jan 2019 21:06:25 +1100 Subject: dm: fix to_sector() for 32bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 0bdb50c531f7377a9da80d3ce2d61f389c84cb30 upstream. A dm-raid array with devices larger than 4GB won't assemble on a 32 bit host since _check_data_dev_sectors() was added in 4.16. This is because to_sector() treats its argument as an "unsigned long" which is 32bits (4GB) on a 32bit host. Using "unsigned long long" is more correct. Kernels as early as 4.2 can have other problems due to to_sector() being used on the size of a device. Fixes: 0cf4503174c1 ("dm raid: add support for the MD RAID0 personality") cc: stable@vger.kernel.org (v4.2+) Reported-and-tested-by: Guillaume Perréal Signed-off-by: NeilBrown Signed-off-by: Mike Snitzer Signed-off-by: Greg Kroah-Hartman --- include/linux/device-mapper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index a5538433c927..91a063a1f3b3 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -630,7 +630,7 @@ do { \ */ #define dm_target_offset(ti, sector) ((sector) - (ti)->begin) -static inline sector_t to_sector(unsigned long n) +static inline sector_t to_sector(unsigned long long n) { return (n >> SECTOR_SHIFT); } -- cgit v1.2.3 From 89dce6e457a14aa53fc0a83ec8f4206748a5c87a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 12:54:17 -0800 Subject: KVM: Call kvm_arch_memslots_updated() before updating memslots commit 152482580a1b0accb60676063a1ac57b2d12daf6 upstream. kvm_arch_memslots_updated() is at this point in time an x86-specific hook for handling MMIO generation wraparound. x86 stashes 19 bits of the memslots generation number in its MMIO sptes in order to avoid full page fault walks for repeat faults on emulated MMIO addresses. Because only 19 bits are used, wrapping the MMIO generation number is possible, if unlikely. kvm_arch_memslots_updated() alerts x86 that the generation has changed so that it can invalidate all MMIO sptes in case the effective MMIO generation has wrapped so as to avoid using a stale spte, e.g. a (very) old spte that was created with generation==0. Given that the purpose of kvm_arch_memslots_updated() is to prevent consuming stale entries, it needs to be called before the new generation is propagated to memslots. Invalidating the MMIO sptes after updating memslots means that there is a window where a vCPU could dereference the new memslots generation, e.g. 0, and incorrectly reuse an old MMIO spte that was created with (pre-wrap) generation==0. Fixes: e59dbe09f8e6 ("KVM: Introduce kvm_arch_memslots_updated()") Cc: Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4f7f19c1dc0a..753c16633bac 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -625,7 +625,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont); int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long npages); -void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots); +void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, const struct kvm_userspace_memory_region *mem, -- cgit v1.2.3 From 2e5522ad5c1cca8848525721643f824cc651ca16 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 20 Mar 2019 09:46:58 +0100 Subject: libceph: wait for latest osdmap in ceph_monc_blacklist_add() commit bb229bbb3bf63d23128e851a1f3b85c083178fa1 upstream. Because map updates are distributed lazily, an OSD may not know about the new blacklist for quite some time after "osd blacklist add" command is completed. This makes it possible for a blacklisted but still alive client to overwrite a post-blacklist update, resulting in data corruption. Waiting for latest osdmap in ceph_monc_blacklist_add() and thus using the post-blacklist epoch for all post-blacklist requests ensures that all such requests "wait" for the blacklist to come into force on their respective OSDs. Cc: stable@vger.kernel.org Fixes: 6305a3b41515 ("libceph: support for blacklisting clients") Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman Signed-off-by: Greg Kroah-Hartman --- include/linux/ceph/libceph.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index d3b04f9589a9..c311cd13ea7d 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -291,6 +291,8 @@ extern void ceph_destroy_client(struct ceph_client *client); extern int __ceph_open_session(struct ceph_client *client, unsigned long started); extern int ceph_open_session(struct ceph_client *client); +int ceph_wait_for_latest_osdmap(struct ceph_client *client, + unsigned long timeout); /* pagevec.c */ extern void ceph_release_page_vector(struct page **pages, int num_pages); -- cgit v1.2.3 From 2d412eb3b823f846c31a68a4aced3a5527a27f8a Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 8 Mar 2019 11:32:04 -0800 Subject: tracing: kdb: Fix ftdump to not sleep [ Upstream commit 31b265b3baaf55f209229888b7ffea523ddab366 ] As reported back in 2016-11 [1], the "ftdump" kdb command triggers a BUG for "sleeping function called from invalid context". kdb's "ftdump" command wants to call ring_buffer_read_prepare() in atomic context. A very simple solution for this is to add allocation flags to ring_buffer_read_prepare() so kdb can call it without triggering the allocation error. This patch does that. Note that in the original email thread about this, it was suggested that perhaps the solution for kdb was to either preallocate the buffer ahead of time or create our own iterator. I'm hoping that this alternative of adding allocation flags to ring_buffer_read_prepare() can be considered since it means I don't need to duplicate more of the core trace code into "trace_kdb.c" (for either creating my own iterator or re-preparing a ring allocator whose memory was already allocated). NOTE: another option for kdb is to actually figure out how to make it reuse the existing ftrace_dump() function and totally eliminate the duplication. This sounds very appealing and actually works (the "sr z" command can be seen to properly dump the ftrace buffer). The downside here is that ftrace_dump() fully consumes the trace buffer. Unless that is changed I'd rather not use it because it means "ftdump | grep xyz" won't be very useful to search the ftrace buffer since it will throw away the whole trace on the first grep. A future patch to dump only the last few lines of the buffer will also be hard to implement. [1] https://lkml.kernel.org/r/20161117191605.GA21459@google.com Link: http://lkml.kernel.org/r/20190308193205.213659-1-dianders@chromium.org Reported-by: Brian Norris Signed-off-by: Douglas Anderson Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Sasha Levin --- include/linux/ring_buffer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 5caa062a02b2..ca52b82128df 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -123,7 +123,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, unsigned long *lost_events); struct ring_buffer_iter * -ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu); +ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu, gfp_t flags); void ring_buffer_read_prepare_sync(void); void ring_buffer_read_start(struct ring_buffer_iter *iter); void ring_buffer_read_finish(struct ring_buffer_iter *iter); -- cgit v1.2.3 From 7d1be2d6a7ddc2f7547248ba16177e19bb6c0c4c Mon Sep 17 00:00:00 2001 From: Luc Van Oostenryck Date: Thu, 7 Mar 2019 16:31:28 -0800 Subject: include/linux/relay.h: fix percpu annotation in struct rchan [ Upstream commit 62461ac2e5b6520b6d65fc6d7d7b4b8df4b848d8 ] The percpu member of this structure is declared as: struct ... ** __percpu member; So its type is: __percpu pointer to pointer to struct ... But looking at how it's used, its type should be: pointer to __percpu pointer to struct ... and it should thus be declared as: struct ... * __percpu *member; So fix the placement of '__percpu' in the definition of this structures. This silents a few Sparse's warnings like: warning: incorrect type in initializer (different address spaces) expected void const [noderef] *__vpp_verify got struct sched_domain ** Link: http://lkml.kernel.org/r/20190118144902.79065-1-luc.vanoostenryck@gmail.com Fixes: 017c59c042d01 ("relay: Use per CPU constructs for the relay channel buffer pointers") Signed-off-by: Luc Van Oostenryck Cc: Jens Axboe Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- include/linux/relay.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/relay.h b/include/linux/relay.h index e1bdf01a86e2..c759f96e39c1 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -66,7 +66,7 @@ struct rchan struct kref kref; /* channel refcount */ void *private_data; /* for user-defined data */ size_t last_toobig; /* tried to log event > subbuf size */ - struct rchan_buf ** __percpu buf; /* per-cpu channel buffers */ + struct rchan_buf * __percpu *buf; /* per-cpu channel buffers */ int is_global; /* One global buffer ? */ struct list_head list; /* for channel list */ struct dentry *parent; /* parent dentry passed to open */ -- cgit v1.2.3 From 2c8340cad6434d108d89256c57ed306d786358f4 Mon Sep 17 00:00:00 2001 From: Katsuhiro Suzuki Date: Mon, 11 Feb 2019 00:38:06 +0900 Subject: clk: fractional-divider: check parent rate only if flag is set [ Upstream commit d13501a2bedfbea0983cc868d3f1dc692627f60d ] Custom approximation of fractional-divider may not need parent clock rate checking. For example Rockchip SoCs work fine using grand parent clock rate even if target rate is greater than parent. This patch checks parent clock rate only if CLK_SET_RATE_PARENT flag is set. For detailed example, clock tree of Rockchip I2S audio hardware. - Clock rate of CPLL is 1.2GHz, GPLL is 491.52MHz. - i2s1_div is integer divider can divide N (N is 1~128). Input clock is CPLL or GPLL. Initial divider value is N = 1. Ex) PLL = CPLL, N = 10, i2s1_div output rate is CPLL / 10 = 1.2GHz / 10 = 120MHz - i2s1_frac is fractional divider can divide input to x/y, x and y are 16bit integer. CPLL --> | selector | ---> i2s1_div -+--> | selector | --> I2S1 MCLK GPLL --> | | ,--------------' | | `--> i2s1_frac ---> | | Clock mux system try to choose suitable one from i2s1_div and i2s1_frac for master clock (MCLK) of I2S1. Bad scenario as follows: - Try to set MCLK to 8.192MHz (32kHz audio replay) Candidate setting is - i2s1_div: GPLL / 60 = 8.192MHz i2s1_div candidate is exactly same as target clock rate, so mux choose this clock source. i2s1_div output rate is changed 491.52MHz -> 8.192MHz - After that try to set to 11.2896MHz (44.1kHz audio replay) Candidate settings are - i2s1_div : CPLL / 107 = 11.214945MHz - i2s1_frac: i2s1_div = 8.192MHz This is because clk_fd_round_rate() thinks target rate (11.2896MHz) is higher than parent rate (i2s1_div = 8.192MHz) and returns parent clock rate. Above is current upstreamed behavior. Clock mux system choose i2s1_div, but this clock rate is not acceptable for I2S driver, so users cannot replay audio. Expected behavior is: - Try to set master clock to 11.2896MHz (44.1kHz audio replay) Candidate settings are - i2s1_div : CPLL / 107 = 11.214945MHz - i2s1_frac: i2s1_div * 147/6400 = 11.2896MHz Change i2s1_div to GPLL / 1 = 491.52MHz at same time. If apply this commit, clk_fd_round_rate() calls custom approximate function of Rockchip even if target rate is higher than parent. Custom function changes both grand parent (i2s1_div) and parent (i2s_frac) settings at same time. Clock mux system can choose i2s1_frac and audio works fine. Signed-off-by: Katsuhiro Suzuki Reviewed-by: Heiko Stuebner [sboyd@kernel.org: Make function into a macro instead] Signed-off-by: Stephen Boyd Signed-off-by: Sasha Levin --- include/linux/clk-provider.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 2f4e79fe7b86..3eb3376f1cc8 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -743,6 +743,9 @@ unsigned int __clk_get_enable_count(struct clk *clk); unsigned long clk_hw_get_rate(const struct clk_hw *hw); unsigned long __clk_get_flags(struct clk *clk); unsigned long clk_hw_get_flags(const struct clk_hw *hw); +#define clk_hw_can_set_rate_parent(hw) \ + (clk_hw_get_flags((hw)) & CLK_SET_RATE_PARENT) + bool clk_hw_is_prepared(const struct clk_hw *hw); bool clk_hw_is_enabled(const struct clk_hw *hw); bool __clk_is_enabled(struct clk *clk); -- cgit v1.2.3 From 43a81992523b5bb00d69e840fe8f2f935f55474d Mon Sep 17 00:00:00 2001 From: Luc Van Oostenryck Date: Fri, 18 Jan 2019 15:49:36 +0100 Subject: sched/topology: Fix percpu data types in struct sd_data & struct s_data [ Upstream commit 99687cdbb3f6c8e32bcc7f37496e811f30460e48 ] The percpu members of struct sd_data and s_data are declared as: struct ... ** __percpu member; So their type is: __percpu pointer to pointer to struct ... But looking at how they're used, their type should be: pointer to __percpu pointer to struct ... and they should thus be declared as: struct ... * __percpu *member; So fix the placement of '__percpu' in the definition of these structures. This addresses a bunch of Sparse's warnings like: warning: incorrect type in initializer (different address spaces) expected void const [noderef] *__vpp_verify got struct sched_domain ** Signed-off-by: Luc Van Oostenryck Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190118144936.79158-1-luc.vanoostenryck@gmail.com Signed-off-by: Ingo Molnar Signed-off-by: Sasha Levin --- include/linux/sched/topology.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index cf257c2e728d..5a92baa91e0c 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -177,10 +177,10 @@ typedef int (*sched_domain_flags_f)(void); #define SDTL_OVERLAP 0x01 struct sd_data { - struct sched_domain **__percpu sd; - struct sched_domain_shared **__percpu sds; - struct sched_group **__percpu sg; - struct sched_group_capacity **__percpu sgc; + struct sched_domain *__percpu *sd; + struct sched_domain_shared *__percpu *sds; + struct sched_group *__percpu *sg; + struct sched_group_capacity *__percpu *sgc; }; struct sched_domain_topology_level { -- cgit v1.2.3 From e4688147c06de31732a67f08e4b296b97d03d6bb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Feb 2019 14:48:03 +0100 Subject: genirq: Avoid summation loops for /proc/stat [ Upstream commit 1136b0728969901a091f0471968b2b76ed14d9ad ] Waiman reported that on large systems with a large amount of interrupts the readout of /proc/stat takes a long time to sum up the interrupt statistics. In principle this is not a problem. but for unknown reasons some enterprise quality software reads /proc/stat with a high frequency. The reason for this is that interrupt statistics are accounted per cpu. So the /proc/stat logic has to sum up the interrupt stats for each interrupt. This can be largely avoided for interrupts which are not marked as 'PER_CPU' interrupts by simply adding a per interrupt summation counter which is incremented along with the per interrupt per cpu counter. The PER_CPU interrupts need to avoid that and use only per cpu accounting because they share the interrupt number and the interrupt descriptor and concurrent updates would conflict or require unwanted synchronization. Reported-by: Waiman Long Signed-off-by: Thomas Gleixner Reviewed-by: Waiman Long Reviewed-by: Marc Zyngier Reviewed-by: Davidlohr Bueso Cc: Matthew Wilcox Cc: Andrew Morton Cc: Alexey Dobriyan Cc: Kees Cook Cc: linux-fsdevel@vger.kernel.org Cc: Davidlohr Bueso Cc: Miklos Szeredi Cc: Daniel Colascione Cc: Dave Chinner Cc: Randy Dunlap Link: https://lkml.kernel.org/r/20190208135020.925487496@linutronix.de 8<------------- v2: Undo the unintentional layout change of struct irq_desc. include/linux/irqdesc.h | 1 + kernel/irq/chip.c | 12 ++++++++++-- kernel/irq/internals.h | 8 +++++++- kernel/irq/irqdesc.c | 7 ++++++- 4 files changed, 24 insertions(+), 4 deletions(-) Signed-off-by: Sasha Levin --- include/linux/irqdesc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index b6084898d330..234f0d1f8dca 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -65,6 +65,7 @@ struct irq_desc { unsigned int core_internal_state__do_not_mess_with_it; unsigned int depth; /* nested irq disables */ unsigned int wake_depth; /* nested wake enables */ + unsigned int tot_count; unsigned int irq_count; /* For detecting broken IRQs */ unsigned long last_unhandled; /* Aging timer for unhandled count */ unsigned int irqs_unhandled; -- cgit v1.2.3 From 46b2c037b245f819841d03e323092105f5dd59f2 Mon Sep 17 00:00:00 2001 From: Valdis Kletnieks Date: Tue, 29 Jan 2019 01:04:25 -0500 Subject: bpf: fix missing prototype warnings [ Upstream commit 116bfa96a255123ed209da6544f74a4f2eaca5da ] Compiling with W=1 generates warnings: CC kernel/bpf/core.o kernel/bpf/core.c:721:12: warning: no previous prototype for ?bpf_jit_alloc_exec_limit? [-Wmissing-prototypes] 721 | u64 __weak bpf_jit_alloc_exec_limit(void) | ^~~~~~~~~~~~~~~~~~~~~~~~ kernel/bpf/core.c:757:14: warning: no previous prototype for ?bpf_jit_alloc_exec? [-Wmissing-prototypes] 757 | void *__weak bpf_jit_alloc_exec(unsigned long size) | ^~~~~~~~~~~~~~~~~~ kernel/bpf/core.c:762:13: warning: no previous prototype for ?bpf_jit_free_exec? [-Wmissing-prototypes] 762 | void __weak bpf_jit_free_exec(void *addr) | ^~~~~~~~~~~~~~~~~ All three are weak functions that archs can override, provide proper prototypes for when a new arch provides their own. Signed-off-by: Valdis Kletnieks Acked-by: Song Liu Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- include/linux/filter.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 42197b16dd78..56d2cda9931b 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -741,7 +741,9 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, bpf_jit_fill_hole_t bpf_fill_ill_insns); void bpf_jit_binary_free(struct bpf_binary_header *hdr); - +u64 bpf_jit_alloc_exec_limit(void); +void *bpf_jit_alloc_exec(unsigned long size); +void bpf_jit_free_exec(void *addr); void bpf_jit_free(struct bpf_prog *fp); struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp); -- cgit v1.2.3 From f3b3b5434752a86b5dd848081a648f7412e0560b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 28 Jan 2019 17:00:13 +0100 Subject: cgroup/pids: turn cgroup_subsys->free() into cgroup_subsys->release() to fix the accounting [ Upstream commit 51bee5abeab2058ea5813c5615d6197a23dbf041 ] The only user of cgroup_subsys->free() callback is pids_cgrp_subsys which needs pids_free() to uncharge the pid. However, ->free() is called from __put_task_struct()->cgroup_free() and this is too late. Even the trivial program which does for (;;) { int pid = fork(); assert(pid >= 0); if (pid) wait(NULL); else exit(0); } can run out of limits because release_task()->call_rcu(delayed_put_task_struct) implies an RCU gp after the task/pid goes away and before the final put(). Test-case: mkdir -p /tmp/CG mount -t cgroup2 none /tmp/CG echo '+pids' > /tmp/CG/cgroup.subtree_control mkdir /tmp/CG/PID echo 2 > /tmp/CG/PID/pids.max perl -e 'while ($p = fork) { wait; } $p // die "fork failed: $!\n"' & echo $! > /tmp/CG/PID/cgroup.procs Without this patch the forking process fails soon after migration. Rename cgroup_subsys->free() to cgroup_subsys->release() and move the callsite into the new helper, cgroup_release(), called by release_task() which actually frees the pid(s). Reported-by: Herton R. Krzesinski Reported-by: Jan Stancek Signed-off-by: Oleg Nesterov Signed-off-by: Tejun Heo Signed-off-by: Sasha Levin --- include/linux/cgroup-defs.h | 2 +- include/linux/cgroup.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index e7905d9353e8..93a2469a9130 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -523,7 +523,7 @@ struct cgroup_subsys { void (*cancel_fork)(struct task_struct *task); void (*fork)(struct task_struct *task); void (*exit)(struct task_struct *task); - void (*free)(struct task_struct *task); + void (*release)(struct task_struct *task); void (*bind)(struct cgroup_subsys_state *root_css); bool early_init:1; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index dddbc29e2009..8e83c9055ccb 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -118,6 +118,7 @@ extern int cgroup_can_fork(struct task_struct *p); extern void cgroup_cancel_fork(struct task_struct *p); extern void cgroup_post_fork(struct task_struct *p); void cgroup_exit(struct task_struct *p); +void cgroup_release(struct task_struct *p); void cgroup_free(struct task_struct *p); int cgroup_init_early(void); @@ -668,6 +669,7 @@ static inline int cgroup_can_fork(struct task_struct *p) { return 0; } static inline void cgroup_cancel_fork(struct task_struct *p) {} static inline void cgroup_post_fork(struct task_struct *p) {} static inline void cgroup_exit(struct task_struct *p) {} +static inline void cgroup_release(struct task_struct *p) {} static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } -- cgit v1.2.3 From 56dbdae0c48827f8fa8bb23f6a26c8785bc455c8 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 5 Apr 2019 18:38:45 -0700 Subject: lib/string.c: implement a basic bcmp [ Upstream commit 5f074f3e192f10c9fade898b9b3b8812e3d83342 ] A recent optimization in Clang (r355672) lowers comparisons of the return value of memcmp against zero to comparisons of the return value of bcmp against zero. This helps some platforms that implement bcmp more efficiently than memcmp. glibc simply aliases bcmp to memcmp, but an optimized implementation is in the works. This results in linkage failures for all targets with Clang due to the undefined symbol. For now, just implement bcmp as a tailcail to memcmp to unbreak the build. This routine can be further optimized in the future. Other ideas discussed: * A weak alias was discussed, but breaks for architectures that define their own implementations of memcmp since aliases to declarations are not permitted (only definitions). Arch-specific memcmp implementations typically declare memcmp in C headers, but implement them in assembly. * -ffreestanding also is used sporadically throughout the kernel. * -fno-builtin-bcmp doesn't work when doing LTO. Link: https://bugs.llvm.org/show_bug.cgi?id=41035 Link: https://code.woboq.org/userspace/glibc/string/memcmp.c.html#bcmp Link: https://github.com/llvm/llvm-project/commit/8e16d73346f8091461319a7dfc4ddd18eedcff13 Link: https://github.com/ClangBuiltLinux/linux/issues/416 Link: http://lkml.kernel.org/r/20190313211335.165605-1-ndesaulniers@google.com Signed-off-by: Nick Desaulniers Reported-by: Nathan Chancellor Reported-by: Adhemerval Zanella Suggested-by: Arnd Bergmann Suggested-by: James Y Knight Suggested-by: Masahiro Yamada Suggested-by: Nathan Chancellor Suggested-by: Rasmus Villemoes Acked-by: Steven Rostedt (VMware) Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Reviewed-by: Masahiro Yamada Reviewed-by: Andy Shevchenko Cc: David Laight Cc: Rasmus Villemoes Cc: Namhyung Kim Cc: Greg Kroah-Hartman Cc: Alexander Shishkin Cc: Dan Williams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- include/linux/string.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 96115bf561b4..3d43329c20be 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -142,6 +142,9 @@ extern void * memscan(void *,int,__kernel_size_t); #ifndef __HAVE_ARCH_MEMCMP extern int memcmp(const void *,const void *,__kernel_size_t); #endif +#ifndef __HAVE_ARCH_BCMP +extern int bcmp(const void *,const void *,__kernel_size_t); +#endif #ifndef __HAVE_ARCH_MEMCHR extern void * memchr(const void *,int,__kernel_size_t); #endif -- cgit v1.2.3 From b5ba76a58b09c09ed7efc97de18f93d28c04ee4e Mon Sep 17 00:00:00 2001 From: Yuval Avnery Date: Mon, 11 Mar 2019 06:18:24 +0200 Subject: net/mlx5e: Add a lock on tir list [ Upstream commit 80a2a9026b24c6bd34b8d58256973e22270bedec ] Refresh tirs is looping over a global list of tirs while netdevs are adding and removing tirs from that list. That is why a lock is required. Fixes: 724b2aa15126 ("net/mlx5e: TIRs management refactoring") Signed-off-by: Yuval Avnery Signed-off-by: Saeed Mahameed Signed-off-by: Greg Kroah-Hartman --- include/linux/mlx5/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 88f0c530fe9c..32d445315128 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -743,6 +743,8 @@ struct mlx5_pagefault { }; struct mlx5_td { + /* protects tirs list changes while tirs refresh */ + struct mutex list_lock; struct list_head tirs_list; u32 tdn; }; -- cgit v1.2.3 From ed031128c2f8267f13d592961acecfae5136968b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 5 Apr 2019 18:38:53 -0700 Subject: include/linux/bitrev.h: fix constant bitrev commit 6147e136ff5071609b54f18982dea87706288e21 upstream. clang points out with hundreds of warnings that the bitrev macros have a problem with constant input: drivers/hwmon/sht15.c:187:11: error: variable '__x' is uninitialized when used within its own initialization [-Werror,-Wuninitialized] u8 crc = bitrev8(data->val_status & 0x0F); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ include/linux/bitrev.h:102:21: note: expanded from macro 'bitrev8' __constant_bitrev8(__x) : \ ~~~~~~~~~~~~~~~~~~~^~~~ include/linux/bitrev.h:67:11: note: expanded from macro '__constant_bitrev8' u8 __x = x; \ ~~~ ^ Both the bitrev and the __constant_bitrev macros use an internal variable named __x, which goes horribly wrong when passing one to the other. The obvious fix is to rename one of the variables, so this adds an extra '_'. It seems we got away with this because - there are only a few drivers using bitrev macros - usually there are no constant arguments to those - when they are constant, they tend to be either 0 or (unsigned)-1 (drivers/isdn/i4l/isdnhdlc.o, drivers/iio/amplifiers/ad8366.c) and give the correct result by pure chance. In fact, the only driver that I could find that gets different results with this is drivers/net/wan/slic_ds26522.c, which in turn is a driver for fairly rare hardware (adding the maintainer to Cc for testing). Link: http://lkml.kernel.org/r/20190322140503.123580-1-arnd@arndb.de Fixes: 556d2f055bf6 ("ARM: 8187/1: add CONFIG_HAVE_ARCH_BITREVERSE to support rbit instruction") Signed-off-by: Arnd Bergmann Reviewed-by: Nick Desaulniers Cc: Zhao Qiang Cc: Yalin Wang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/bitrev.h | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitrev.h b/include/linux/bitrev.h index 50fb0dee23e8..d35b8ec1c485 100644 --- a/include/linux/bitrev.h +++ b/include/linux/bitrev.h @@ -34,41 +34,41 @@ static inline u32 __bitrev32(u32 x) #define __constant_bitrev32(x) \ ({ \ - u32 __x = x; \ - __x = (__x >> 16) | (__x << 16); \ - __x = ((__x & (u32)0xFF00FF00UL) >> 8) | ((__x & (u32)0x00FF00FFUL) << 8); \ - __x = ((__x & (u32)0xF0F0F0F0UL) >> 4) | ((__x & (u32)0x0F0F0F0FUL) << 4); \ - __x = ((__x & (u32)0xCCCCCCCCUL) >> 2) | ((__x & (u32)0x33333333UL) << 2); \ - __x = ((__x & (u32)0xAAAAAAAAUL) >> 1) | ((__x & (u32)0x55555555UL) << 1); \ - __x; \ + u32 ___x = x; \ + ___x = (___x >> 16) | (___x << 16); \ + ___x = ((___x & (u32)0xFF00FF00UL) >> 8) | ((___x & (u32)0x00FF00FFUL) << 8); \ + ___x = ((___x & (u32)0xF0F0F0F0UL) >> 4) | ((___x & (u32)0x0F0F0F0FUL) << 4); \ + ___x = ((___x & (u32)0xCCCCCCCCUL) >> 2) | ((___x & (u32)0x33333333UL) << 2); \ + ___x = ((___x & (u32)0xAAAAAAAAUL) >> 1) | ((___x & (u32)0x55555555UL) << 1); \ + ___x; \ }) #define __constant_bitrev16(x) \ ({ \ - u16 __x = x; \ - __x = (__x >> 8) | (__x << 8); \ - __x = ((__x & (u16)0xF0F0U) >> 4) | ((__x & (u16)0x0F0FU) << 4); \ - __x = ((__x & (u16)0xCCCCU) >> 2) | ((__x & (u16)0x3333U) << 2); \ - __x = ((__x & (u16)0xAAAAU) >> 1) | ((__x & (u16)0x5555U) << 1); \ - __x; \ + u16 ___x = x; \ + ___x = (___x >> 8) | (___x << 8); \ + ___x = ((___x & (u16)0xF0F0U) >> 4) | ((___x & (u16)0x0F0FU) << 4); \ + ___x = ((___x & (u16)0xCCCCU) >> 2) | ((___x & (u16)0x3333U) << 2); \ + ___x = ((___x & (u16)0xAAAAU) >> 1) | ((___x & (u16)0x5555U) << 1); \ + ___x; \ }) #define __constant_bitrev8x4(x) \ ({ \ - u32 __x = x; \ - __x = ((__x & (u32)0xF0F0F0F0UL) >> 4) | ((__x & (u32)0x0F0F0F0FUL) << 4); \ - __x = ((__x & (u32)0xCCCCCCCCUL) >> 2) | ((__x & (u32)0x33333333UL) << 2); \ - __x = ((__x & (u32)0xAAAAAAAAUL) >> 1) | ((__x & (u32)0x55555555UL) << 1); \ - __x; \ + u32 ___x = x; \ + ___x = ((___x & (u32)0xF0F0F0F0UL) >> 4) | ((___x & (u32)0x0F0F0F0FUL) << 4); \ + ___x = ((___x & (u32)0xCCCCCCCCUL) >> 2) | ((___x & (u32)0x33333333UL) << 2); \ + ___x = ((___x & (u32)0xAAAAAAAAUL) >> 1) | ((___x & (u32)0x55555555UL) << 1); \ + ___x; \ }) #define __constant_bitrev8(x) \ ({ \ - u8 __x = x; \ - __x = (__x >> 4) | (__x << 4); \ - __x = ((__x & (u8)0xCCU) >> 2) | ((__x & (u8)0x33U) << 2); \ - __x = ((__x & (u8)0xAAU) >> 1) | ((__x & (u8)0x55U) << 1); \ - __x; \ + u8 ___x = x; \ + ___x = (___x >> 4) | (___x << 4); \ + ___x = ((___x & (u8)0xCCU) >> 2) | ((___x & (u8)0x33U) << 2); \ + ___x = ((___x & (u8)0xAAU) >> 1) | ((___x & (u8)0x55U) << 1); \ + ___x; \ }) #define bitrev32(x) \ -- cgit v1.2.3 From 1b69a78ac089b07fb919b464c35c0fcfca242c14 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Mon, 8 Apr 2019 14:33:22 +0200 Subject: virtio: Honour 'may_reduce_num' in vring_create_virtqueue commit cf94db21905333e610e479688add629397a4b384 upstream. vring_create_virtqueue() allows the caller to specify via the may_reduce_num parameter whether the vring code is allowed to allocate a smaller ring than specified. However, the split ring allocation code tries to allocate a smaller ring on allocation failure regardless of what the caller specified. This may cause trouble for e.g. virtio-pci in legacy mode, which does not support ring resizing. (The packed ring code does not resize in any case.) Let's fix this by bailing out immediately in the split ring code if the requested size cannot be allocated and may_reduce_num has not been specified. While at it, fix a typo in the usage instructions. Fixes: 2a2d1382fe9d ("virtio: Add improved queue allocation API") Cc: stable@vger.kernel.org # v4.6+ Signed-off-by: Cornelia Huck Signed-off-by: Michael S. Tsirkin Reviewed-by: Halil Pasic Reviewed-by: Jens Freimann Signed-off-by: Greg Kroah-Hartman --- include/linux/virtio_ring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index bbf32524ab27..75007e648dfa 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -63,7 +63,7 @@ struct virtqueue; /* * Creates a virtqueue and allocates the descriptor ring. If * may_reduce_num is set, then this may allocate a smaller ring than - * expected. The caller should query virtqueue_get_ring_size to learn + * expected. The caller should query virtqueue_get_vring_size to learn * the actual size of the ring. */ struct virtqueue *vring_create_virtqueue(unsigned int index, -- cgit v1.2.3 From 82017e26e51596ee577171a33f357377ec6513b5 Mon Sep 17 00:00:00 2001 From: "ndesaulniers@google.com" Date: Mon, 15 Oct 2018 10:22:21 -0700 Subject: compiler.h: update definition of unreachable() [ Upstream commit fe0640eb30b7da261ae84d252ed9ed3c7e68dfd8 ] Fixes the objtool warning seen with Clang: arch/x86/mm/fault.o: warning: objtool: no_context()+0x220: unreachable instruction Fixes commit 815f0ddb346c ("include/linux/compiler*.h: make compiler-*.h mutually exclusive") Josh noted that the fallback definition was meant to work around a pre-gcc-4.6 bug. GCC still needs to work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82365, so compiler-gcc.h defines its own version of unreachable(). Clang and ICC can use this shared definition. Link: https://github.com/ClangBuiltLinux/linux/issues/204 Suggested-by: Andy Lutomirski Suggested-by: Josh Poimboeuf Tested-by: Nathan Chancellor Signed-off-by: Nick Desaulniers Signed-off-by: Miguel Ojeda Signed-off-by: Sasha Levin --- include/linux/compiler.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index a704d032713b..67c3934fb9ed 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -119,7 +119,10 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, # define ASM_UNREACHABLE #endif #ifndef unreachable -# define unreachable() do { annotate_reachable(); do { } while (1); } while (0) +# define unreachable() do { \ + annotate_unreachable(); \ + __builtin_unreachable(); \ +} while (0) #endif /* -- cgit v1.2.3 From 0ba1fa56351e6e9c2f8db4ffc823cb7057e4ea82 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 1 Mar 2019 10:57:57 +0800 Subject: appletalk: Fix use-after-free in atalk_proc_exit [ Upstream commit 6377f787aeb945cae7abbb6474798de129e1f3ac ] KASAN report this: BUG: KASAN: use-after-free in pde_subdir_find+0x12d/0x150 fs/proc/generic.c:71 Read of size 8 at addr ffff8881f41fe5b0 by task syz-executor.0/2806 CPU: 0 PID: 2806 Comm: syz-executor.0 Not tainted 5.0.0-rc7+ #45 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xfa/0x1ce lib/dump_stack.c:113 print_address_description+0x65/0x270 mm/kasan/report.c:187 kasan_report+0x149/0x18d mm/kasan/report.c:317 pde_subdir_find+0x12d/0x150 fs/proc/generic.c:71 remove_proc_entry+0xe8/0x420 fs/proc/generic.c:667 atalk_proc_exit+0x18/0x820 [appletalk] atalk_exit+0xf/0x5a [appletalk] __do_sys_delete_module kernel/module.c:1018 [inline] __se_sys_delete_module kernel/module.c:961 [inline] __x64_sys_delete_module+0x3dc/0x5e0 kernel/module.c:961 do_syscall_64+0x147/0x600 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x462e99 Code: f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fb2de6b9c58 EFLAGS: 00000246 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 000000000073bf00 RCX: 0000000000462e99 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000200001c0 RBP: 0000000000000002 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007fb2de6ba6bc R13: 00000000004bccaa R14: 00000000006f6bc8 R15: 00000000ffffffff Allocated by task 2806: set_track mm/kasan/common.c:85 [inline] __kasan_kmalloc.constprop.3+0xa0/0xd0 mm/kasan/common.c:496 slab_post_alloc_hook mm/slab.h:444 [inline] slab_alloc_node mm/slub.c:2739 [inline] slab_alloc mm/slub.c:2747 [inline] kmem_cache_alloc+0xcf/0x250 mm/slub.c:2752 kmem_cache_zalloc include/linux/slab.h:730 [inline] __proc_create+0x30f/0xa20 fs/proc/generic.c:408 proc_mkdir_data+0x47/0x190 fs/proc/generic.c:469 0xffffffffc10c01bb 0xffffffffc10c0166 do_one_initcall+0xfa/0x5ca init/main.c:887 do_init_module+0x204/0x5f6 kernel/module.c:3460 load_module+0x66b2/0x8570 kernel/module.c:3808 __do_sys_finit_module+0x238/0x2a0 kernel/module.c:3902 do_syscall_64+0x147/0x600 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 2806: set_track mm/kasan/common.c:85 [inline] __kasan_slab_free+0x130/0x180 mm/kasan/common.c:458 slab_free_hook mm/slub.c:1409 [inline] slab_free_freelist_hook mm/slub.c:1436 [inline] slab_free mm/slub.c:2986 [inline] kmem_cache_free+0xa6/0x2a0 mm/slub.c:3002 pde_put+0x6e/0x80 fs/proc/generic.c:647 remove_proc_entry+0x1d3/0x420 fs/proc/generic.c:684 0xffffffffc10c031c 0xffffffffc10c0166 do_one_initcall+0xfa/0x5ca init/main.c:887 do_init_module+0x204/0x5f6 kernel/module.c:3460 load_module+0x66b2/0x8570 kernel/module.c:3808 __do_sys_finit_module+0x238/0x2a0 kernel/module.c:3902 do_syscall_64+0x147/0x600 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe The buggy address belongs to the object at ffff8881f41fe500 which belongs to the cache proc_dir_entry of size 256 The buggy address is located 176 bytes inside of 256-byte region [ffff8881f41fe500, ffff8881f41fe600) The buggy address belongs to the page: page:ffffea0007d07f80 count:1 mapcount:0 mapping:ffff8881f6e69a00 index:0x0 flags: 0x2fffc0000000200(slab) raw: 02fffc0000000200 dead000000000100 dead000000000200 ffff8881f6e69a00 raw: 0000000000000000 00000000800c000c 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8881f41fe480: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff8881f41fe500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff8881f41fe580: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8881f41fe600: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb ffff8881f41fe680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb It should check the return value of atalk_proc_init fails, otherwise atalk_exit will trgger use-after-free in pde_subdir_find while unload the module.This patch fix error cleanup path of atalk_init Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/linux/atalk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/atalk.h b/include/linux/atalk.h index 4d356e168692..212eb8c7fed6 100644 --- a/include/linux/atalk.h +++ b/include/linux/atalk.h @@ -151,7 +151,7 @@ extern int sysctl_aarp_retransmit_limit; extern int sysctl_aarp_resolve_time; #ifdef CONFIG_SYSCTL -extern void atalk_register_sysctl(void); +extern int atalk_register_sysctl(void); extern void atalk_unregister_sysctl(void); #else #define atalk_register_sysctl() do { } while(0) -- cgit v1.2.3 From c43e2bdd14941e969dac515efadc3d379f333fdf Mon Sep 17 00:00:00 2001 From: Pi-Hsun Shih Date: Wed, 13 Mar 2019 11:44:33 -0700 Subject: include/linux/swap.h: use offsetof() instead of custom __swapoffset macro [ Upstream commit a4046c06be50a4f01d435aa7fe57514818e6cc82 ] Use offsetof() to calculate offset of a field to take advantage of compiler built-in version when possible, and avoid UBSAN warning when compiling with Clang: UBSAN: Undefined behaviour in mm/swapfile.c:3010:38 member access within null pointer of type 'union swap_header' CPU: 6 PID: 1833 Comm: swapon Tainted: G S 4.19.23 #43 Call trace: dump_backtrace+0x0/0x194 show_stack+0x20/0x2c __dump_stack+0x20/0x28 dump_stack+0x70/0x94 ubsan_epilogue+0x14/0x44 ubsan_type_mismatch_common+0xf4/0xfc __ubsan_handle_type_mismatch_v1+0x34/0x54 __se_sys_swapon+0x654/0x1084 __arm64_sys_swapon+0x1c/0x24 el0_svc_common+0xa8/0x150 el0_svc_compat_handler+0x2c/0x38 el0_svc_compat+0x8/0x18 Link: http://lkml.kernel.org/r/20190312081902.223764-1-pihsun@chromium.org Signed-off-by: Pi-Hsun Shih Acked-by: Michal Hocko Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- include/linux/swap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 4fd1ab9565ba..e643866912b7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -155,9 +155,9 @@ struct swap_extent { /* * Max bad pages in the new format.. */ -#define __swapoffset(x) ((unsigned long)&((union swap_header *)0)->x) #define MAX_SWAP_BADPAGES \ - ((__swapoffset(magic.magic) - __swapoffset(info.badpages)) / sizeof(int)) + ((offsetof(union swap_header, magic.magic) - \ + offsetof(union swap_header, info.badpages)) / sizeof(int)) enum { SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ -- cgit v1.2.3 From 28356c21ac32d49d15a3ea7383b0a96052d15394 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 3 Apr 2019 18:39:01 +0000 Subject: bpf: reduce verifier memory consumption commit 638f5b90d46016372a8e3e0a434f199cc5e12b8c upstream. the verifier got progressively smarter over time and size of its internal state grew as well. Time to reduce the memory consumption. Before: sizeof(struct bpf_verifier_state) = 6520 After: sizeof(struct bpf_verifier_state) = 896 It's done by observing that majority of BPF programs use little to no stack whereas verifier kept all of 512 stack slots ready always. Instead dynamically reallocate struct verifier state when stack access is detected. Runtime difference before vs after is within a noise. The number of processed instructions stays the same. Cc: jakub.kicinski@netronome.com Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller [Backported to 4.14 by sblbir] Signed-off-by: Balbir Singh Signed-off-by: Greg Kroah-Hartman --- include/linux/bpf_verifier.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8458cc5fbce5..0fc3d9afb8bf 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -91,14 +91,19 @@ enum bpf_stack_slot_type { #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ +struct bpf_stack_state { + struct bpf_reg_state spilled_ptr; + u8 slot_type[BPF_REG_SIZE]; +}; + /* state of the program: * type of all registers and stack info */ struct bpf_verifier_state { struct bpf_reg_state regs[MAX_BPF_REG]; - u8 stack_slot_type[MAX_BPF_STACK]; - struct bpf_reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; struct bpf_verifier_state *parent; + int allocated_stack; + struct bpf_stack_state *stack; }; /* linked list of verifier states used to prune search */ @@ -133,7 +138,7 @@ struct bpf_verifier_env { struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ - struct bpf_verifier_state cur_state; /* current verifier state */ + struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */ void *analyzer_priv; /* pointer to external analyzer's private data */ @@ -145,6 +150,11 @@ struct bpf_verifier_env { struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ }; +static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) +{ + return env->cur_state->regs; +} + int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, void *priv); -- cgit v1.2.3 From 6a42c49482db5ab565a9dabb1642ff64ae0f45b7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 3 Apr 2019 18:39:05 +0000 Subject: bpf: move {prev_,}insn_idx into verifier env commit c08435ec7f2bc8f4109401f696fd55159b4b40cb upstream. Move prev_insn_idx and insn_idx from the do_check() function into the verifier environment, so they can be read inside the various helper functions for handling the instructions. It's easier to put this into the environment rather than changing all call-sites only to pass it along. insn_idx is useful in particular since this later on allows to hold state in env->insn_aux_data[env->insn_idx]. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov Signed-off-by: Vallish Vaidyeshwara [Backported to 4.14 by sblbir] Signed-off-by: Balbir Singh Signed-off-by: Greg Kroah-Hartman --- include/linux/bpf_verifier.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 0fc3d9afb8bf..3149a01d0f58 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -134,6 +134,8 @@ struct bpf_ext_analyzer_ops { * one verifier_env per bpf_check() call */ struct bpf_verifier_env { + u32 insn_idx; + u32 prev_insn_idx; struct bpf_prog *prog; /* eBPF program being verified */ struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ -- cgit v1.2.3 From ee282977c63d1c195ee0de6df173b0f216d14c28 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 3 Apr 2019 18:39:06 +0000 Subject: bpf: move tmp variable into ax register in interpreter commit 144cd91c4c2bced6eb8a7e25e590f6618a11e854 upstream. This change moves the on-stack 64 bit tmp variable in ___bpf_prog_run() into the hidden ax register. The latter is currently only used in JITs for constant blinding as a temporary scratch register, meaning the BPF interpreter will never see the use of ax. Therefore it is safe to use it for the cases where tmp has been used earlier. This is needed to later on allow restricted hidden use of ax in both interpreter and JITs. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov [backported to 4.14 sblbir] Signed-off-by: Balbir Singh Signed-off-by: Greg Kroah-Hartman --- include/linux/filter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 56d2cda9931b..f7880c716308 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -53,7 +53,8 @@ struct bpf_prog_aux; * constants. See JIT pre-step in bpf_jit_blind_constants(). */ #define BPF_REG_AX MAX_BPF_REG -#define MAX_BPF_JIT_REG (MAX_BPF_REG + 1) +#define MAX_BPF_EXT_REG (MAX_BPF_REG + 1) +#define MAX_BPF_JIT_REG MAX_BPF_EXT_REG /* unused opcode to mark special call to bpf_tail_call() helper */ #define BPF_TAIL_CALL 0xf0 -- cgit v1.2.3 From b101cf55c6da31b86ee8144ac0113b7ecfb7c06b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 3 Apr 2019 18:39:07 +0000 Subject: bpf: enable access to ax register also from verifier rewrite commit 9b73bfdd08e73231d6a90ae6db4b46b3fbf56c30 upstream. Right now we are using BPF ax register in JIT for constant blinding as well as in interpreter as temporary variable. Verifier will not be able to use it simply because its use will get overridden from the former in bpf_jit_blind_insn(). However, it can be made to work in that blinding will be skipped if there is prior use in either source or destination register on the instruction. Taking constraints of ax into account, the verifier is then open to use it in rewrites under some constraints. Note, ax register already has mappings in every eBPF JIT. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov [backported to 4.14 sblbir] Signed-off-by: Balbir Singh Signed-off-by: Greg Kroah-Hartman --- include/linux/filter.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index f7880c716308..ac2272778f2e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -46,12 +46,7 @@ struct bpf_prog_aux; #define BPF_REG_X BPF_REG_7 #define BPF_REG_TMP BPF_REG_8 -/* Kernel hidden auxiliary/helper register for hardening step. - * Only used by eBPF JITs. It's nothing more than a temporary - * register that JITs use internally, only that here it's part - * of eBPF instructions that have been rewritten for blinding - * constants. See JIT pre-step in bpf_jit_blind_constants(). - */ +/* Kernel hidden auxiliary/helper register. */ #define BPF_REG_AX MAX_BPF_REG #define MAX_BPF_EXT_REG (MAX_BPF_REG + 1) #define MAX_BPF_JIT_REG MAX_BPF_EXT_REG -- cgit v1.2.3 From ae03b6b1c880a03d4771257336dc3bca156dd51b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 3 Apr 2019 18:39:12 +0000 Subject: bpf: prevent out of bounds speculation on pointer arithmetic commit 979d63d50c0c0f7bc537bf821e056cc9fe5abd38 upstream. Jann reported that the original commit back in b2157399cc98 ("bpf: prevent out-of-bounds speculation") was not sufficient to stop CPU from speculating out of bounds memory access: While b2157399cc98 only focussed on masking array map access for unprivileged users for tail calls and data access such that the user provided index gets sanitized from BPF program and syscall side, there is still a more generic form affected from BPF programs that applies to most maps that hold user data in relation to dynamic map access when dealing with unknown scalars or "slow" known scalars as access offset, for example: - Load a map value pointer into R6 - Load an index into R7 - Do a slow computation (e.g. with a memory dependency) that loads a limit into R8 (e.g. load the limit from a map for high latency, then mask it to make the verifier happy) - Exit if R7 >= R8 (mispredicted branch) - Load R0 = R6[R7] - Load R0 = R6[R0] For unknown scalars there are two options in the BPF verifier where we could derive knowledge from in order to guarantee safe access to the memory: i) While /<=/>= variants won't allow to derive any lower or upper bounds from the unknown scalar where it would be safe to add it to the map value pointer, it is possible through ==/!= test however. ii) another option is to transform the unknown scalar into a known scalar, for example, through ALU ops combination such as R &= followed by R |= or any similar combination where the original information from the unknown scalar would be destroyed entirely leaving R with a constant. The initial slow load still precedes the latter ALU ops on that register, so the CPU executes speculatively from that point. Once we have the known scalar, any compare operation would work then. A third option only involving registers with known scalars could be crafted as described in [0] where a CPU port (e.g. Slow Int unit) would be filled with many dependent computations such that the subsequent condition depending on its outcome has to wait for evaluation on its execution port and thereby executing speculatively if the speculated code can be scheduled on a different execution port, or any other form of mistraining as described in [1], for example. Given this is not limited to only unknown scalars, not only map but also stack access is affected since both is accessible for unprivileged users and could potentially be used for out of bounds access under speculation. In order to prevent any of these cases, the verifier is now sanitizing pointer arithmetic on the offset such that any out of bounds speculation would be masked in a way where the pointer arithmetic result in the destination register will stay unchanged, meaning offset masked into zero similar as in array_index_nospec() case. With regards to implementation, there are three options that were considered: i) new insn for sanitation, ii) push/pop insn and sanitation as inlined BPF, iii) reuse of ax register and sanitation as inlined BPF. Option i) has the downside that we end up using from reserved bits in the opcode space, but also that we would require each JIT to emit masking as native arch opcodes meaning mitigation would have slow adoption till everyone implements it eventually which is counter-productive. Option ii) and iii) have both in common that a temporary register is needed in order to implement the sanitation as inlined BPF since we are not allowed to modify the source register. While a push / pop insn in ii) would be useful to have in any case, it requires once again that every JIT needs to implement it first. While possible, amount of changes needed would also be unsuitable for a -stable patch. Therefore, the path which has fewer changes, less BPF instructions for the mitigation and does not require anything to be changed in the JITs is option iii) which this work is pursuing. The ax register is already mapped to a register in all JITs (modulo arm32 where it's mapped to stack as various other BPF registers there) and used in constant blinding for JITs-only so far. It can be reused for verifier rewrites under certain constraints. The interpreter's tmp "register" has therefore been remapped into extending the register set with hidden ax register and reusing that for a number of instructions that needed the prior temporary variable internally (e.g. div, mod). This allows for zero increase in stack space usage in the interpreter, and enables (restricted) generic use in rewrites otherwise as long as such a patchlet does not make use of these instructions. The sanitation mask is dynamic and relative to the offset the map value or stack pointer currently holds. There are various cases that need to be taken under consideration for the masking, e.g. such operation could look as follows: ptr += val or val += ptr or ptr -= val. Thus, the value to be sanitized could reside either in source or in destination register, and the limit is different depending on whether the ALU op is addition or subtraction and depending on the current known and bounded offset. The limit is derived as follows: limit := max_value_size - (smin_value + off). For subtraction: limit := umax_value + off. This holds because we do not allow any pointer arithmetic that would temporarily go out of bounds or would have an unknown value with mixed signed bounds where it is unclear at verification time whether the actual runtime value would be either negative or positive. For example, we have a derived map pointer value with constant offset and bounded one, so limit based on smin_value works because the verifier requires that statically analyzed arithmetic on the pointer must be in bounds, and thus it checks if resulting smin_value + off and umax_value + off is still within map value bounds at time of arithmetic in addition to time of access. Similarly, for the case of stack access we derive the limit as follows: MAX_BPF_STACK + off for subtraction and -off for the case of addition where off := ptr_reg->off + ptr_reg->var_off.value. Subtraction is a special case for the masking which can be in form of ptr += -val, ptr -= -val, or ptr -= val. In the first two cases where we know that the value is negative, we need to temporarily negate the value in order to do the sanitation on a positive value where we later swap the ALU op, and restore original source register if the value was in source. The sanitation of pointer arithmetic alone is still not fully sufficient as is, since a scenario like the following could happen ... PTR += 0x1000 (e.g. K-based imm) PTR -= BIG_NUMBER_WITH_SLOW_COMPARISON PTR += 0x1000 PTR -= BIG_NUMBER_WITH_SLOW_COMPARISON [...] ... which under speculation could end up as ... PTR += 0x1000 PTR -= 0 [ truncated by mitigation ] PTR += 0x1000 PTR -= 0 [ truncated by mitigation ] [...] ... and therefore still access out of bounds. To prevent such case, the verifier is also analyzing safety for potential out of bounds access under speculative execution. Meaning, it is also simulating pointer access under truncation. We therefore "branch off" and push the current verification state after the ALU operation with known 0 to the verification stack for later analysis. Given the current path analysis succeeded it is likely that the one under speculation can be pruned. In any case, it is also subject to existing complexity limits and therefore anything beyond this point will be rejected. In terms of pruning, it needs to be ensured that the verification state from speculative execution simulation must never prune a non-speculative execution path, therefore, we mark verifier state accordingly at the time of push_stack(). If verifier detects out of bounds access under speculative execution from one of the possible paths that includes a truncation, it will reject such program. Given we mask every reg-based pointer arithmetic for unprivileged programs, we've been looking into how it could affect real-world programs in terms of size increase. As the majority of programs are targeted for privileged-only use case, we've unconditionally enabled masking (with its alu restrictions on top of it) for privileged programs for the sake of testing in order to check i) whether they get rejected in its current form, and ii) by how much the number of instructions and size will increase. We've tested this by using Katran, Cilium and test_l4lb from the kernel selftests. For Katran we've evaluated balancer_kern.o, Cilium bpf_lxc.o and an older test object bpf_lxc_opt_-DUNKNOWN.o and l4lb we've used test_l4lb.o as well as test_l4lb_noinline.o. We found that none of the programs got rejected by the verifier with this change, and that impact is rather minimal to none. balancer_kern.o had 13,904 bytes (1,738 insns) xlated and 7,797 bytes JITed before and after the change. Most complex program in bpf_lxc.o had 30,544 bytes (3,817 insns) xlated and 18,538 bytes JITed before and after and none of the other tail call programs in bpf_lxc.o had any changes either. For the older bpf_lxc_opt_-DUNKNOWN.o object we found a small increase from 20,616 bytes (2,576 insns) and 12,536 bytes JITed before to 20,664 bytes (2,582 insns) and 12,558 bytes JITed after the change. Other programs from that object file had similar small increase. Both test_l4lb.o had no change and remained at 6,544 bytes (817 insns) xlated and 3,401 bytes JITed and for test_l4lb_noinline.o constant at 5,080 bytes (634 insns) xlated and 3,313 bytes JITed. This can be explained in that LLVM typically optimizes stack based pointer arithmetic by using K-based operations and that use of dynamic map access is not overly frequent. However, in future we may decide to optimize the algorithm further under known guarantees from branch and value speculation. Latter seems also unclear in terms of prediction heuristics that today's CPUs apply as well as whether there could be collisions in e.g. the predictor's Value History/Pattern Table for triggering out of bounds access, thus masking is performed unconditionally at this point but could be subject to relaxation later on. We were generally also brainstorming various other approaches for mitigation, but the blocker was always lack of available registers at runtime and/or overhead for runtime tracking of limits belonging to a specific pointer. Thus, we found this to be minimally intrusive under given constraints. With that in place, a simple example with sanitized access on unprivileged load at post-verification time looks as follows: # bpftool prog dump xlated id 282 [...] 28: (79) r1 = *(u64 *)(r7 +0) 29: (79) r2 = *(u64 *)(r7 +8) 30: (57) r1 &= 15 31: (79) r3 = *(u64 *)(r0 +4608) 32: (57) r3 &= 1 33: (47) r3 |= 1 34: (2d) if r2 > r3 goto pc+19 35: (b4) (u32) r11 = (u32) 20479 | 36: (1f) r11 -= r2 | Dynamic sanitation for pointer 37: (4f) r11 |= r2 | arithmetic with registers 38: (87) r11 = -r11 | containing bounded or known 39: (c7) r11 s>>= 63 | scalars in order to prevent 40: (5f) r11 &= r2 | out of bounds speculation. 41: (0f) r4 += r11 | 42: (71) r4 = *(u8 *)(r4 +0) 43: (6f) r4 <<= r1 [...] For the case where the scalar sits in the destination register as opposed to the source register, the following code is emitted for the above example: [...] 16: (b4) (u32) r11 = (u32) 20479 17: (1f) r11 -= r2 18: (4f) r11 |= r2 19: (87) r11 = -r11 20: (c7) r11 s>>= 63 21: (5f) r2 &= r11 22: (0f) r2 += r0 23: (61) r0 = *(u32 *)(r2 +0) [...] JIT blinding example with non-conflicting use of r10: [...] d5: je 0x0000000000000106 _ d7: mov 0x0(%rax),%edi | da: mov $0xf153246,%r10d | Index load from map value and e0: xor $0xf153259,%r10 | (const blinded) mask with 0x1f. e7: and %r10,%rdi |_ ea: mov $0x2f,%r10d | f0: sub %rdi,%r10 | Sanitized addition. Both use r10 f3: or %rdi,%r10 | but do not interfere with each f6: neg %r10 | other. (Neither do these instructions f9: sar $0x3f,%r10 | interfere with the use of ax as temp fd: and %r10,%rdi | in interpreter.) 100: add %rax,%rdi |_ 103: mov 0x0(%rdi),%eax [...] Tested that it fixes Jann's reproducer, and also checked that test_verifier and test_progs suite with interpreter, JIT and JIT with hardening enabled on x86-64 and arm64 runs successfully. [0] Speculose: Analyzing the Security Implications of Speculative Execution in CPUs, Giorgi Maisuradze and Christian Rossow, https://arxiv.org/pdf/1801.04084.pdf [1] A Systematic Evaluation of Transient Execution Attacks and Defenses, Claudio Canella, Jo Van Bulck, Michael Schwarz, Moritz Lipp, Benjamin von Berg, Philipp Ortner, Frank Piessens, Dmitry Evtyushkin, Daniel Gruss, https://arxiv.org/pdf/1811.05441.pdf Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation") Reported-by: Jann Horn Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov Signed-off-by: Vallish Vaidyeshwara [some checkpatch cleanups and backported to 4.14 by sblbir] Signed-off-by: Balbir Singh Signed-off-by: Greg Kroah-Hartman --- include/linux/bpf_verifier.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3149a01d0f58..10d85378e690 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -104,6 +104,7 @@ struct bpf_verifier_state { struct bpf_verifier_state *parent; int allocated_stack; struct bpf_stack_state *stack; + bool speculative; }; /* linked list of verifier states used to prune search */ @@ -112,14 +113,23 @@ struct bpf_verifier_state_list { struct bpf_verifier_state_list *next; }; +/* Possible states for alu_state member. */ +#define BPF_ALU_SANITIZE_SRC 1U +#define BPF_ALU_SANITIZE_DST 2U +#define BPF_ALU_NEG_VALUE (1U << 2) +#define BPF_ALU_SANITIZE (BPF_ALU_SANITIZE_SRC | \ + BPF_ALU_SANITIZE_DST) + struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ + u32 alu_limit; /* limit for add/sub register with pointer */ }; int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ int sanitize_stack_off; /* stack slot to be cleared */ bool seen; /* this insn was processed by the verifier */ + u8 alu_state; /* used in combination with alu_limit */ }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ -- cgit v1.2.3 From 6588a490bfe1b879f11b5e74724ef53a33b68641 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 3 Apr 2019 18:39:13 +0000 Subject: bpf: fix sanitation of alu op with pointer / scalar type from different paths commit d3bd7413e0ca40b60cf60d4003246d067cafdeda upstream. While 979d63d50c0c ("bpf: prevent out of bounds speculation on pointer arithmetic") took care of rejecting alu op on pointer when e.g. pointer came from two different map values with different map properties such as value size, Jann reported that a case was not covered yet when a given alu op is used in both "ptr_reg += reg" and "numeric_reg += reg" from different branches where we would incorrectly try to sanitize based on the pointer's limit. Catch this corner case and reject the program instead. Fixes: 979d63d50c0c ("bpf: prevent out of bounds speculation on pointer arithmetic") Reported-by: Jann Horn Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov Signed-off-by: Vallish Vaidyeshwara Signed-off-by: Balbir Singh Signed-off-by: Greg Kroah-Hartman --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 10d85378e690..d8b3240cfe6e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -117,6 +117,7 @@ struct bpf_verifier_state_list { #define BPF_ALU_SANITIZE_SRC 1U #define BPF_ALU_SANITIZE_DST 2U #define BPF_ALU_NEG_VALUE (1U << 2) +#define BPF_ALU_NON_POINTER (1U << 3) #define BPF_ALU_SANITIZE (BPF_ALU_SANITIZE_SRC | \ BPF_ALU_SANITIZE_DST) -- cgit v1.2.3 From 76da4272779ab127acdaa9f4aa799cd7e90a8b3f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 6 Mar 2019 11:52:36 +0100 Subject: appletalk: Fix compile regression [ Upstream commit 27da0d2ef998e222a876c0cec72aa7829a626266 ] A bugfix just broke compilation of appletalk when CONFIG_SYSCTL is disabled: In file included from net/appletalk/ddp.c:65: net/appletalk/ddp.c: In function 'atalk_init': include/linux/atalk.h:164:34: error: expected expression before 'do' #define atalk_register_sysctl() do { } while(0) ^~ net/appletalk/ddp.c:1934:7: note: in expansion of macro 'atalk_register_sysctl' rc = atalk_register_sysctl(); This is easier to avoid by using conventional inline functions as stubs rather than macros. The header already has inline functions for other purposes, so I'm changing over all the macros for consistency. Fixes: 6377f787aeb9 ("appletalk: Fix use-after-free in atalk_proc_exit") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/linux/atalk.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/atalk.h b/include/linux/atalk.h index 212eb8c7fed6..03885e63f92b 100644 --- a/include/linux/atalk.h +++ b/include/linux/atalk.h @@ -154,16 +154,26 @@ extern int sysctl_aarp_resolve_time; extern int atalk_register_sysctl(void); extern void atalk_unregister_sysctl(void); #else -#define atalk_register_sysctl() do { } while(0) -#define atalk_unregister_sysctl() do { } while(0) +static inline int atalk_register_sysctl(void) +{ + return 0; +} +static inline void atalk_unregister_sysctl(void) +{ +} #endif #ifdef CONFIG_PROC_FS extern int atalk_proc_init(void); extern void atalk_proc_exit(void); #else -#define atalk_proc_init() ({ 0; }) -#define atalk_proc_exit() do { } while(0) +static inline int atalk_proc_init(void) +{ + return 0; +} +static inline void atalk_proc_exit(void) +{ +} #endif /* CONFIG_PROC_FS */ #endif /* __LINUX_ATALK_H__ */ -- cgit v1.2.3 From bb461ad8e6e0653fc6bd0f26d9173bab0aec235b Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 18 Apr 2019 17:50:52 -0700 Subject: coredump: fix race condition between mmget_not_zero()/get_task_mm() and core dumping commit 04f5866e41fb70690e28397487d8bd8eea7d712a upstream. The core dumping code has always run without holding the mmap_sem for writing, despite that is the only way to ensure that the entire vma layout will not change from under it. Only using some signal serialization on the processes belonging to the mm is not nearly enough. This was pointed out earlier. For example in Hugh's post from Jul 2017: https://lkml.kernel.org/r/alpine.LSU.2.11.1707191716030.2055@eggly.anvils "Not strictly relevant here, but a related note: I was very surprised to discover, only quite recently, how handle_mm_fault() may be called without down_read(mmap_sem) - when core dumping. That seems a misguided optimization to me, which would also be nice to correct" In particular because the growsdown and growsup can move the vm_start/vm_end the various loops the core dump does around the vma will not be consistent if page faults can happen concurrently. Pretty much all users calling mmget_not_zero()/get_task_mm() and then taking the mmap_sem had the potential to introduce unexpected side effects in the core dumping code. Adding mmap_sem for writing around the ->core_dump invocation is a viable long term fix, but it requires removing all copy user and page faults and to replace them with get_dump_page() for all binary formats which is not suitable as a short term fix. For the time being this solution manually covers the places that can confuse the core dump either by altering the vma layout or the vma flags while it runs. Once ->core_dump runs under mmap_sem for writing the function mmget_still_valid() can be dropped. Allowing mmap_sem protected sections to run in parallel with the coredump provides some minor parallelism advantage to the swapoff code (which seems to be safe enough by never mangling any vma field and can keep doing swapins in parallel to the core dumping) and to some other corner case. In order to facilitate the backporting I added "Fixes: 86039bd3b4e6" however the side effect of this same race condition in /proc/pid/mem should be reproducible since before 2.6.12-rc2 so I couldn't add any other "Fixes:" because there's no hash beyond the git genesis commit. Because find_extend_vma() is the only location outside of the process context that could modify the "mm" structures under mmap_sem for reading, by adding the mmget_still_valid() check to it, all other cases that take the mmap_sem for reading don't need the new check after mmget_not_zero()/get_task_mm(). The expand_stack() in page fault context also doesn't need the new check, because all tasks under core dumping are frozen. Link: http://lkml.kernel.org/r/20190325224949.11068-1-aarcange@redhat.com Fixes: 86039bd3b4e6 ("userfaultfd: add new syscall to provide memory externalization") Signed-off-by: Andrea Arcangeli Reported-by: Jann Horn Suggested-by: Oleg Nesterov Acked-by: Peter Xu Reviewed-by: Mike Rapoport Reviewed-by: Oleg Nesterov Reviewed-by: Jann Horn Acked-by: Jason Gunthorpe Acked-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/sched/mm.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 3d49b91b674d..ef4ae0a545fe 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -57,6 +57,27 @@ static inline void mmdrop_async(struct mm_struct *mm) } } +/* + * This has to be called after a get_task_mm()/mmget_not_zero() + * followed by taking the mmap_sem for writing before modifying the + * vmas or anything the coredump pretends not to change from under it. + * + * NOTE: find_extend_vma() called from GUP context is the only place + * that can modify the "mm" (notably the vm_start/end) under mmap_sem + * for reading and outside the context of the process, so it is also + * the only case that holds the mmap_sem for reading that must call + * this function. Generally if the mmap_sem is hold for reading + * there's no need of this check after get_task_mm()/mmget_not_zero(). + * + * This function can be obsoleted and the check can be removed, after + * the coredump code will hold the mmap_sem for writing before + * invoking the ->core_dump methods. + */ +static inline bool mmget_still_valid(struct mm_struct *mm) +{ + return likely(!mm->core_state); +} + /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. -- cgit v1.2.3 From 877e9c51c96cd5ad92b45e36447e275374efc7af Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sun, 24 Feb 2019 01:49:52 +0900 Subject: x86/kprobes: Verify stack frame on kretprobe commit 3ff9c075cc767b3060bdac12da72fc94dd7da1b8 upstream. Verify the stack frame pointer on kretprobe trampoline handler, If the stack frame pointer does not match, it skips the wrong entry and tries to find correct one. This can happen if user puts the kretprobe on the function which can be used in the path of ftrace user-function call. Such functions should not be probed, so this adds a warning message that reports which function should be blacklisted. Tested-by: Andrea Righi Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/155094059185.6137.15527904013362842072.stgit@devbox Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- include/linux/kprobes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index bd2684700b74..520702b82134 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -198,6 +198,7 @@ struct kretprobe_instance { struct kretprobe *rp; kprobe_opcode_t *ret_addr; struct task_struct *task; + void *fp; char data[0]; }; -- cgit v1.2.3 From 4fa17fc730ba3b7f0cb69937df919a7a226067be Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 8 Jan 2018 10:41:39 -0800 Subject: iomap: report collisions between directio and buffered writes to userspace commit 5a9d929d6e13278df62bd9e3d3ceae8c87ad1eea upstream. If two programs simultaneously try to write to the same part of a file via direct IO and buffered IO, there's a chance that the post-diowrite pagecache invalidation will fail on the dirty page. When this happens, the dio write succeeded, which means that the page cache is no longer coherent with the disk! Programs are not supposed to mix IO types and this is a clear case of data corruption, so store an EIO which will be reflected to userspace during the next fsync. Replace the WARN_ON with a ratelimited pr_crit so that the developers have /some/ kind of breadcrumb to track down the offending program(s) and file(s) involved. Signed-off-by: Darrick J. Wong Reviewed-by: Liu Bo Signed-off-by: Greg Kroah-Hartman Cc: Zubin Mithra --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index f6a577edec67..dafac283b0ff 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2965,6 +2965,7 @@ enum { }; void dio_end_io(struct bio *bio); +void dio_warn_stale_pagecache(struct file *filp); ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, -- cgit v1.2.3 From aec0d4aad4613ea834b65070aba62ced4ec4e540 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 4 Apr 2019 23:59:25 +0200 Subject: tracing: Fix buffer_ref pipe ops commit b987222654f84f7b4ca95b3a55eca784cb30235b upstream. This fixes multiple issues in buffer_pipe_buf_ops: - The ->steal() handler must not return zero unless the pipe buffer has the only reference to the page. But generic_pipe_buf_steal() assumes that every reference to the pipe is tracked by the page's refcount, which isn't true for these buffers - buffer_pipe_buf_get(), which duplicates a buffer, doesn't touch the page's refcount. Fix it by using generic_pipe_buf_nosteal(), which refuses every attempted theft. It should be easy to actually support ->steal, but the only current users of pipe_buf_steal() are the virtio console and FUSE, and they also only use it as an optimization. So it's probably not worth the effort. - The ->get() and ->release() handlers can be invoked concurrently on pipe buffers backed by the same struct buffer_ref. Make them safe against concurrency by using refcount_t. - The pointers stored in ->private were only zeroed out when the last reference to the buffer_ref was dropped. As far as I know, this shouldn't be necessary anyway, but if we do it, let's always do it. Link: http://lkml.kernel.org/r/20190404215925.253531-1-jannh@google.com Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Al Viro Cc: stable@vger.kernel.org Fixes: 73a757e63114d ("ring-buffer: Return reader page back into existing ring buffer") Signed-off-by: Jann Horn Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- include/linux/pipe_fs_i.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index befdcd304b3d..2dcf6e81b2e2 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -182,6 +182,7 @@ void free_pipe_info(struct pipe_inode_info *); void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); +int generic_pipe_buf_nosteal(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); void pipe_buf_mark_unmergeable(struct pipe_buffer *buf); -- cgit v1.2.3 From 94d99a2355e32ba8eae7bdfdf861f90286634cd9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 11 Apr 2019 10:06:20 -0700 Subject: mm: make page ref count overflow check tighter and more explicit commit f958d7b528b1b40c44cfda5eabe2d82760d868c3 upstream. We have a VM_BUG_ON() to check that the page reference count doesn't underflow (or get close to overflow) by checking the sign of the count. That's all fine, but we actually want to allow people to use a "get page ref unless it's already very high" helper function, and we want that one to use the sign of the page ref (without triggering this VM_BUG_ON). Change the VM_BUG_ON to only check for small underflows (or _very_ close to overflowing), and ignore overflows which have strayed into negative territory. Acked-by: Matthew Wilcox Cc: Jann Horn Cc: stable@kernel.org Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/mm.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 58f2263de4de..4023819837a6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -824,6 +824,10 @@ static inline bool is_device_public_page(const struct page *page) #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ +/* 127: arbitrary random number, small enough to assemble well */ +#define page_ref_zero_or_close_to_overflow(page) \ + ((unsigned int) page_ref_count(page) + 127u <= 127u) + static inline void get_page(struct page *page) { page = compound_head(page); @@ -831,7 +835,7 @@ static inline void get_page(struct page *page) * Getting a normal page or the head of a compound page * requires to already have an elevated page->_refcount. */ - VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); + VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page); page_ref_inc(page); } -- cgit v1.2.3 From 2a280d881726795ae490dc45bcaa9f6d4bf43741 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 11 Apr 2019 10:14:59 -0700 Subject: mm: add 'try_get_page()' helper function commit 88b1a17dfc3ed7728316478fae0f5ad508f50397 upstream. This is the same as the traditional 'get_page()' function, but instead of unconditionally incrementing the reference count of the page, it only does so if the count was "safe". It returns whether the reference count was incremented (and is marked __must_check, since the caller obviously has to be aware of it). Also like 'get_page()', you can't use this function unless you already had a reference to the page. The intent is that you can use this exactly like get_page(), but in situations where you want to limit the maximum reference count. The code currently does an unconditional WARN_ON_ONCE() if we ever hit the reference count issues (either zero or negative), as a notification that the conditional non-increment actually happened. NOTE! The count access for the "safety" check is inherently racy, but that doesn't matter since the buffer we use is basically half the range of the reference count (ie we look at the sign of the count). Acked-by: Matthew Wilcox Cc: Jann Horn Cc: stable@kernel.org Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/mm.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4023819837a6..ee0eae215210 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -839,6 +839,15 @@ static inline void get_page(struct page *page) page_ref_inc(page); } +static inline __must_check bool try_get_page(struct page *page) +{ + page = compound_head(page); + if (WARN_ON_ONCE(page_ref_count(page) <= 0)) + return false; + page_ref_inc(page); + return true; +} + static inline void put_page(struct page *page) { page = compound_head(page); -- cgit v1.2.3 From c88a0aa7ace7eb10dca42be59f21e2cbd263575e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 5 Apr 2019 14:02:10 -0700 Subject: fs: prevent page refcount overflow in pipe_buf_get commit 15fab63e1e57be9fdb5eec1bbc5916e9825e9acb upstream. Change pipe_buf_get() to return a bool indicating whether it succeeded in raising the refcount of the page (if the thing in the pipe is a page). This removes another mechanism for overflowing the page refcount. All callers converted to handle a failure. Reported-by: Jann Horn Signed-off-by: Matthew Wilcox Cc: stable@kernel.org Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/pipe_fs_i.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 2dcf6e81b2e2..c251ad717345 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -108,18 +108,20 @@ struct pipe_buf_operations { /* * Get a reference to the pipe buffer. */ - void (*get)(struct pipe_inode_info *, struct pipe_buffer *); + bool (*get)(struct pipe_inode_info *, struct pipe_buffer *); }; /** * pipe_buf_get - get a reference to a pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to get a reference to + * + * Return: %true if the reference was successfully obtained. */ -static inline void pipe_buf_get(struct pipe_inode_info *pipe, +static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - buf->ops->get(pipe, buf); + return buf->ops->get(pipe, buf); } /** @@ -179,7 +181,7 @@ struct pipe_inode_info *alloc_pipe_info(void); void free_pipe_info(struct pipe_inode_info *); /* Generic pipe buffer ops functions */ -void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); +bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_nosteal(struct pipe_inode_info *, struct pipe_buffer *); -- cgit v1.2.3 From fd76b2d5045f0a23e4fa2199500af326e5a1081c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 28 Mar 2019 20:44:13 -0700 Subject: ptrace: take into account saved_sigmask in PTRACE{GET,SET}SIGMASK [ Upstream commit fcfc2aa0185f4a731d05a21e9f359968fdfd02e7 ] There are a few system calls (pselect, ppoll, etc) which replace a task sigmask while they are running in a kernel-space When a task calls one of these syscalls, the kernel saves a current sigmask in task->saved_sigmask and sets a syscall sigmask. On syscall-exit-stop, ptrace traps a task before restoring the saved_sigmask, so PTRACE_GETSIGMASK returns the syscall sigmask and PTRACE_SETSIGMASK does nothing, because its sigmask is replaced by saved_sigmask, when the task returns to user-space. This patch fixes this problem. PTRACE_GETSIGMASK returns saved_sigmask if it's set. PTRACE_SETSIGMASK drops the TIF_RESTORE_SIGMASK flag. Link: http://lkml.kernel.org/r/20181120060616.6043-1-avagin@gmail.com Fixes: 29000caecbe8 ("ptrace: add ability to get/set signal-blocked mask") Signed-off-by: Andrei Vagin Acked-by: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin (Microsoft) --- include/linux/sched/signal.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index fbf86ecd149d..bcaba7e8ca6e 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -377,10 +377,20 @@ static inline void set_restore_sigmask(void) set_thread_flag(TIF_RESTORE_SIGMASK); WARN_ON(!test_thread_flag(TIF_SIGPENDING)); } + +static inline void clear_tsk_restore_sigmask(struct task_struct *tsk) +{ + clear_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK); +} + static inline void clear_restore_sigmask(void) { clear_thread_flag(TIF_RESTORE_SIGMASK); } +static inline bool test_tsk_restore_sigmask(struct task_struct *tsk) +{ + return test_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK); +} static inline bool test_restore_sigmask(void) { return test_thread_flag(TIF_RESTORE_SIGMASK); @@ -398,6 +408,10 @@ static inline void set_restore_sigmask(void) current->restore_sigmask = true; WARN_ON(!test_thread_flag(TIF_SIGPENDING)); } +static inline void clear_tsk_restore_sigmask(struct task_struct *tsk) +{ + tsk->restore_sigmask = false; +} static inline void clear_restore_sigmask(void) { current->restore_sigmask = false; @@ -406,6 +420,10 @@ static inline bool test_restore_sigmask(void) { return current->restore_sigmask; } +static inline bool test_tsk_restore_sigmask(struct task_struct *tsk) +{ + return tsk->restore_sigmask; +} static inline bool test_and_clear_restore_sigmask(void) { if (!current->restore_sigmask) -- cgit v1.2.3 From 20ea0648cc1623778ff286b829613b9bd2523272 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 19 Apr 2019 13:52:38 -0400 Subject: USB: core: Fix bug caused by duplicate interface PM usage counter commit c2b71462d294cf517a0bc6e4fd6424d7cee5596f upstream. The syzkaller fuzzer reported a bug in the USB hub driver which turned out to be caused by a negative runtime-PM usage counter. This allowed a hub to be runtime suspended at a time when the driver did not expect it. The symptom is a WARNING issued because the hub's status URB is submitted while it is already active: URB 0000000031fb463e submitted while active WARNING: CPU: 0 PID: 2917 at drivers/usb/core/urb.c:363 The negative runtime-PM usage count was caused by an unfortunate design decision made when runtime PM was first implemented for USB. At that time, USB class drivers were allowed to unbind from their interfaces without balancing the usage counter (i.e., leaving it with a positive count). The core code would take care of setting the counter back to 0 before allowing another driver to bind to the interface. Later on when runtime PM was implemented for the entire kernel, the opposite decision was made: Drivers were required to balance their runtime-PM get and put calls. In order to maintain backward compatibility, however, the USB subsystem adapted to the new implementation by keeping an independent usage counter for each interface and using it to automatically adjust the normal usage counter back to 0 whenever a driver was unbound. This approach involves duplicating information, but what is worse, it doesn't work properly in cases where a USB class driver delays decrementing the usage counter until after the driver's disconnect() routine has returned and the counter has been adjusted back to 0. Doing so would cause the usage counter to become negative. There's even a warning about this in the USB power management documentation! As it happens, this is exactly what the hub driver does. The kick_hub_wq() routine increments the runtime-PM usage counter, and the corresponding decrement is carried out by hub_event() in the context of the hub_wq work-queue thread. This work routine may sometimes run after the driver has been unbound from its interface, and when it does it causes the usage counter to go negative. It is not possible for hub_disconnect() to wait for a pending hub_event() call to finish, because hub_disconnect() is called with the device lock held and hub_event() acquires that lock. The only feasible fix is to reverse the original design decision: remove the duplicate interface-specific usage counter and require USB drivers to balance their runtime PM gets and puts. As far as I know, all existing drivers currently do this. Signed-off-by: Alan Stern Reported-and-tested-by: syzbot+7634edaea4d0b341c625@syzkaller.appspotmail.com CC: Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 8c7ba40cf021..a22a3e139e96 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -200,7 +200,6 @@ usb_find_last_int_out_endpoint(struct usb_host_interface *alt, * @dev: driver model's view of this device * @usb_dev: if an interface is bound to the USB major, this will point * to the sysfs representation for that device. - * @pm_usage_cnt: PM usage counter for this interface * @reset_ws: Used for scheduling resets from atomic context. * @resetting_device: USB core reset the device, so use alt setting 0 as * current; needs bandwidth alloc after reset. @@ -257,7 +256,6 @@ struct usb_interface { struct device dev; /* interface specific device info */ struct device *usb_dev; - atomic_t pm_usage_cnt; /* usage counter for autosuspend */ struct work_struct reset_ws; /* for resets in atomic context */ }; #define to_usb_interface(d) container_of(d, struct usb_interface, dev) -- cgit v1.2.3 From 502a97abc4ee6adf5884d61006201860fc7c2ace Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=BCller?= Date: Mon, 8 Apr 2019 15:33:54 +0200 Subject: clk: x86: Add system specific quirk to mark clocks as critical MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7c2e07130090ae001a97a6b65597830d6815e93e upstream. Since commit 648e921888ad ("clk: x86: Stop marking clocks as CLK_IS_CRITICAL"), the pmc_plt_clocks of the Bay Trail SoC are unconditionally gated off. Unfortunately this will break systems where these clocks are used for external purposes beyond the kernel's knowledge. Fix it by implementing a system specific quirk to mark the necessary pmc_plt_clks as critical. Fixes: 648e921888ad ("clk: x86: Stop marking clocks as CLK_IS_CRITICAL") Signed-off-by: David Müller Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Signed-off-by: Stephen Boyd Signed-off-by: Greg Kroah-Hartman --- include/linux/platform_data/x86/clk-pmc-atom.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/clk-pmc-atom.h b/include/linux/platform_data/x86/clk-pmc-atom.h index 3ab892208343..7a37ac27d0fb 100644 --- a/include/linux/platform_data/x86/clk-pmc-atom.h +++ b/include/linux/platform_data/x86/clk-pmc-atom.h @@ -35,10 +35,13 @@ struct pmc_clk { * * @base: PMC clock register base offset * @clks: pointer to set of registered clocks, typically 0..5 + * @critical: flag to indicate if firmware enabled pmc_plt_clks + * should be marked as critial or not */ struct pmc_clk_data { void __iomem *base; const struct pmc_clk *clks; + bool critical; }; #endif /* __PLATFORM_DATA_X86_CLK_PMC_ATOM_H */ -- cgit v1.2.3