From e675447bda51c1ea72d1ac9132ce3bed974f1da3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:15 +0000 Subject: timers: Make 'pinned' a timer property We want to move the timer migration logic from a 'push' to a 'pull' model. Under the current 'push' model pinned timers are handled via a runtime API variant: mod_timer_pinned(). The 'pull' model requires us to store the pinned attribute of a timer in the timer_list structure itself, as a new TIMER_PINNED bit in timer->flags. This flag must be set at initialization time and the timer APIs recognize the flag. This patch: - Implements the new flag and associated new-style initialization methods - makes mod_timer() recognize new-style pinned timers, - and adds some migration helper facility to allow step by step conversion of old-style to new-style pinned timers. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094341.049338558@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/timer.h | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'include/linux/timer.h') diff --git a/include/linux/timer.h b/include/linux/timer.h index 20ac746f3eb3..046d6cf26498 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -62,7 +62,8 @@ struct timer_list { #define TIMER_MIGRATING 0x00080000 #define TIMER_BASEMASK (TIMER_CPUMASK | TIMER_MIGRATING) #define TIMER_DEFERRABLE 0x00100000 -#define TIMER_IRQSAFE 0x00200000 +#define TIMER_PINNED 0x00200000 +#define TIMER_IRQSAFE 0x00400000 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ @@ -78,9 +79,15 @@ struct timer_list { #define TIMER_INITIALIZER(_function, _expires, _data) \ __TIMER_INITIALIZER((_function), (_expires), (_data), 0) +#define TIMER_PINNED_INITIALIZER(_function, _expires, _data) \ + __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_PINNED) + #define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data) \ __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE) +#define TIMER_PINNED_DEFERRED_INITIALIZER(_function, _expires, _data) \ + __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE | TIMER_PINNED) + #define DEFINE_TIMER(_name, _function, _expires, _data) \ struct timer_list _name = \ TIMER_INITIALIZER(_function, _expires, _data) @@ -124,8 +131,12 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define init_timer(timer) \ __init_timer((timer), 0) +#define init_timer_pinned(timer) \ + __init_timer((timer), TIMER_PINNED) #define init_timer_deferrable(timer) \ __init_timer((timer), TIMER_DEFERRABLE) +#define init_timer_pinned_deferrable(timer) \ + __init_timer((timer), TIMER_DEFERRABLE | TIMER_PINNED) #define init_timer_on_stack(timer) \ __init_timer_on_stack((timer), 0) @@ -145,12 +156,20 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define setup_timer(timer, fn, data) \ __setup_timer((timer), (fn), (data), 0) +#define setup_pinned_timer(timer, fn, data) \ + __setup_timer((timer), (fn), (data), TIMER_PINNED) #define setup_deferrable_timer(timer, fn, data) \ __setup_timer((timer), (fn), (data), TIMER_DEFERRABLE) +#define setup_pinned_deferrable_timer(timer, fn, data) \ + __setup_timer((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED) #define setup_timer_on_stack(timer, fn, data) \ __setup_timer_on_stack((timer), (fn), (data), 0) +#define setup_pinned_timer_on_stack(timer, fn, data) \ + __setup_timer_on_stack((timer), (fn), (data), TIMER_PINNED) #define setup_deferrable_timer_on_stack(timer, fn, data) \ __setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE) +#define setup_pinned_deferrable_timer_on_stack(timer, fn, data) \ + __setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED) /** * timer_pending - is a timer pending? @@ -175,8 +194,8 @@ extern int mod_timer_pinned(struct timer_list *timer, unsigned long expires); extern void set_timer_slack(struct timer_list *time, int slack_hz); -#define TIMER_NOT_PINNED 0 -#define TIMER_PINNED 1 +#define MOD_TIMER_NOT_PINNED 0 +#define MOD_TIMER_PINNED 1 /* * The jiffies value which is added to now, when there is no timer * in the timer wheel: -- cgit v1.2.3 From 177ec0a0a531695210b277d734b2f92ee5796303 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:24 +0000 Subject: timers: Remove the deprecated mod_timer_pinned() API We switched all users to initialize the timers as pinned and call mod_timer(). Remove the now unused timer API function. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094341.706205231@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/timer.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux/timer.h') diff --git a/include/linux/timer.h b/include/linux/timer.h index 046d6cf26498..a8f6c70eb414 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -190,12 +190,9 @@ extern void add_timer_on(struct timer_list *timer, int cpu); extern int del_timer(struct timer_list * timer); extern int mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); -extern int mod_timer_pinned(struct timer_list *timer, unsigned long expires); extern void set_timer_slack(struct timer_list *time, int slack_hz); -#define MOD_TIMER_NOT_PINNED 0 -#define MOD_TIMER_PINNED 1 /* * The jiffies value which is added to now, when there is no timer * in the timer wheel: -- cgit v1.2.3 From b0d6e2dcb284f1f4dcb4b92760f49eeaf5fc0bc7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:29 +0000 Subject: timers: Reduce the CPU index space to 256k We want to store the array index in the flags space. 256k CPUs should be enough for a while. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: Arjan van de Ven Cc: Chris Mason Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.030144293@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/timer.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux/timer.h') diff --git a/include/linux/timer.h b/include/linux/timer.h index a8f6c70eb414..989f33d16ebf 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -58,12 +58,12 @@ struct timer_list { * workqueue locking issues. It's not meant for executing random crap * with interrupts disabled. Abuse is monitored! */ -#define TIMER_CPUMASK 0x0007FFFF -#define TIMER_MIGRATING 0x00080000 +#define TIMER_CPUMASK 0x0003FFFF +#define TIMER_MIGRATING 0x00040000 #define TIMER_BASEMASK (TIMER_CPUMASK | TIMER_MIGRATING) -#define TIMER_DEFERRABLE 0x00100000 -#define TIMER_PINNED 0x00200000 -#define TIMER_IRQSAFE 0x00400000 +#define TIMER_DEFERRABLE 0x00080000 +#define TIMER_PINNED 0x00100000 +#define TIMER_IRQSAFE 0x00200000 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ -- cgit v1.2.3 From 500462a9de657f86edaa102f8ab6bff7f7e43fc2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:30 +0000 Subject: timers: Switch to a non-cascading wheel The current timer wheel has some drawbacks: 1) Cascading: Cascading can be an unbound operation and is completely pointless in most cases because the vast majority of the timer wheel timers are canceled or rearmed before expiration. (They are used as timeout safeguards, not as real timers to measure time.) 2) No fast lookup of the next expiring timer: In NOHZ scenarios the first timer soft interrupt after a long NOHZ period must fast forward the base time to the current value of jiffies. As we have no way to find the next expiring timer fast, the code loops linearly and increments the base time one by one and checks for expired timers in each step. This causes unbound overhead spikes exactly in the moment when we should wake up as fast as possible. After a thorough analysis of real world data gathered on laptops, workstations, webservers and other machines (thanks Chris!) I came to the conclusion that the current 'classic' timer wheel implementation can be modified to address the above issues. The vast majority of timer wheel timers is canceled or rearmed before expiry. Most of them are timeouts for networking and other I/O tasks. The nature of timeouts is to catch the exception from normal operation (TCP ack timed out, disk does not respond, etc.). For these kinds of timeouts the accuracy of the timeout is not really a concern. Timeouts are very often approximate worst-case values and in case the timeout fires, we already waited for a long time and performance is down the drain already. The few timers which actually expire can be split into two categories: 1) Short expiry times which expect halfways accurate expiry 2) Long term expiry times are inaccurate today already due to the batching which is done for NOHZ automatically and also via the set_timer_slack() API. So for long term expiry timers we can avoid the cascading property and just leave them in the less granular outer wheels until expiry or cancelation. Timers which are armed with a timeout larger than the wheel capacity are no longer cascaded. We expire them with the longest possible timeout (6+ days). We have not observed such timeouts in our data collection, but at least we handle them, applying the rule of the least surprise. To avoid extending the wheel levels for HZ=1000 so we can accomodate the longest observed timeouts (5 days in the network conntrack code) we reduce the first level granularity on HZ=1000 to 4ms, which effectively is the same as the HZ=250 behaviour. From our data analysis there is nothing which relies on that 1ms granularity and as a side effect we get better batching and timer locality for the networking code as well. Contrary to the classic wheel the granularity of the next wheel is not the capacity of the first wheel. The granularities of the wheels are in the currently chosen setting 8 times the granularity of the previous wheel. So for HZ=250 we end up with the following granularity levels: Level Offset Granularity Range 0 0 4 ms 0 ms - 252 ms 1 64 32 ms 256 ms - 2044 ms (256ms - ~2s) 2 128 256 ms 2048 ms - 16380 ms (~2s - ~16s) 3 192 2048 ms (~2s) 16384 ms - 131068 ms (~16s - ~2m) 4 256 16384 ms (~16s) 131072 ms - 1048572 ms (~2m - ~17m) 5 320 131072 ms (~2m) 1048576 ms - 8388604 ms (~17m - ~2h) 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h) 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d) That's a worst case inaccuracy of 12.5% for the timers which are queued at the beginning of a level. So the new wheel concept addresses the old issues: 1) Cascading is avoided completely 2) By keeping the timers in the bucket until expiry/cancelation we can track the buckets which have timers enqueued in a bucket bitmap and therefore can look up the next expiring timer very fast and O(1). A further benefit of the concept is that the slack calculation which is done on every timer start is no longer necessary because the granularity levels provide natural batching already. Our extensive testing with various loads did not show any performance degradation vs. the current wheel implementation. This patch does not address the 'fast lookup' issue as we wanted to make sure that there is no regression introduced by the wheel redesign. The optimizations are in follow up patches. This patch contains fixes from Anna-Maria Gleixner and Richard Cochran. Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.108621834@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/timer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/timer.h') diff --git a/include/linux/timer.h b/include/linux/timer.h index 989f33d16ebf..5869ab9848fe 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -64,6 +64,8 @@ struct timer_list { #define TIMER_DEFERRABLE 0x00080000 #define TIMER_PINNED 0x00100000 #define TIMER_IRQSAFE 0x00200000 +#define TIMER_ARRAYSHIFT 22 +#define TIMER_ARRAYMASK 0xFFC00000 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ -- cgit v1.2.3 From 53bf837b78d155b8e1110b3c25b4d0d6391b8ff3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:31 +0000 Subject: timers: Remove set_timer_slack() leftovers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We now have implicit batching in the timer wheel. The slack API is no longer used, so remove it. Signed-off-by: Thomas Gleixner Cc: Alan Stern Cc: Andrew F. Davis Cc: Arjan van de Ven Cc: Chris Mason Cc: David S. Miller Cc: David Woodhouse Cc: Dmitry Eremin-Solenikov Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Greg Kroah-Hartman Cc: Jaehoon Chung Cc: Jens Axboe Cc: John Stultz Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Mathias Nyman Cc: Pali Rohár Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sebastian Reichel Cc: Ulf Hansson Cc: linux-block@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: linux-usb@vger.kernel.org Cc: netdev@vger.kernel.org Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.189813118@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/timer.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux/timer.h') diff --git a/include/linux/timer.h b/include/linux/timer.h index 5869ab9848fe..4419506b564e 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -19,7 +19,6 @@ struct timer_list { void (*function)(unsigned long); unsigned long data; u32 flags; - int slack; #ifdef CONFIG_TIMER_STATS int start_pid; @@ -73,7 +72,6 @@ struct timer_list { .expires = (_expires), \ .data = (_data), \ .flags = (_flags), \ - .slack = -1, \ __TIMER_LOCKDEP_MAP_INITIALIZER( \ __FILE__ ":" __stringify(__LINE__)) \ } @@ -193,8 +191,6 @@ extern int del_timer(struct timer_list * timer); extern int mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); -extern void set_timer_slack(struct timer_list *time, int slack_hz); - /* * The jiffies value which is added to now, when there is no timer * in the timer wheel: -- cgit v1.2.3