timers: Implement the hierarchical pull model
Placing timers at enqueue time on a target CPU based on dubious heuristics
does not make any sense:

 1) Most timer wheel timers are canceled or rearmed before they expire.

 2) The heuristics to predict which CPU will be busy when the timer expires
    are wrong by definition.

So placing the timers at enqueue time wastes precious cycles.

The proper solution to this problem is to always queue the timers on the
local CPU and allow the non-pinned timers to be pulled onto a busy CPU at
expiry time.

Therefore split the timer storage into local pinned and global timers:
Local pinned timers are always expired on the CPU on which they have been
queued. Global timers can be expired on any CPU.
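
The split can be pictured with a small standalone model (illustrative C
only; pick_base() and struct cpu_buckets are made-up names, not kernel
interfaces): every timer is queued on the CPU that arms it, and the pinned
flag merely selects which of the two per-CPU buckets it lands in.

/*
 * Model of the enqueue rule above, not kernel code: timers always go to
 * the queueing CPU; "pinned" only picks the bucket.
 */
#include <stdbool.h>
#include <stdio.h>

enum base_type { BASE_LOCAL, BASE_GLOBAL };     /* names follow timer.c */

struct cpu_buckets {
        unsigned long first_local;      /* earliest pinned expiry (jiffies) */
        unsigned long first_global;     /* earliest movable expiry (jiffies) */
};

static enum base_type pick_base(bool pinned)
{
        /* Pinned timers must fire here; all others may be pulled away. */
        return pinned ? BASE_LOCAL : BASE_GLOBAL;
}

int main(void)
{
        struct cpu_buckets cpu0 = { .first_local = ~0UL, .first_global = ~0UL };

        if (pick_base(false) == BASE_GLOBAL)    /* movable timer at 1000 */
                cpu0.first_global = 1000;
        if (pick_base(true) == BASE_LOCAL)      /* pinned timer at 1200 */
                cpu0.first_local = 1200;

        printf("first local %lu, first global %lu\n",
               cpu0.first_local, cpu0.first_global);
        return 0;
}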

As long as a CPU is busy it expires both local and global timers. When a
CPU goes idle it arms its clock event device for the first expiring local
timer. If the first expiring pinned (local) timer is before the first
expiring movable timer, then no action is required because the CPU will
wake up before the first movable timer expires. If the first expiring
movable timer is before the first expiring pinned (local) timer, then this
timer is queued into an idle timerqueue and eventually expired by another
active CPU.
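
A rough sketch of that decision, with a made-up helper and plain integers
instead of the real timer bases (not the kernel implementation): the idle
CPU programs its hardware for the first pinned expiry and hands an earlier
movable expiry off to the hierarchy.

#include <stdint.h>
#include <stdio.h>

#define NO_HANDOFF UINT64_MAX

static uint64_t idle_next_event(uint64_t first_pinned, uint64_t first_movable,
                                uint64_t *handoff)
{
        if (first_movable < first_pinned) {
                /* Queued into the idle timerqueue; an active CPU pulls it. */
                *handoff = first_movable;
        } else {
                /* The CPU wakes up for the pinned timer early enough. */
                *handoff = NO_HANDOFF;
        }
        /* The last-CPU-in-the-system case is covered further below. */
        return first_pinned;
}

int main(void)
{
        uint64_t handoff;
        uint64_t arm = idle_next_event(2000, 1500, &handoff);

        printf("arm for %llu, hand off %llu\n",
               (unsigned long long)arm, (unsigned long long)handoff);
        return 0;
}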

To avoid global locking the timerqueues are implemented as a hierarchy. The
lowest level of the hierarchy holds the CPUs. The CPUs are assigned to
groups of up to 8, which are separated per NUMA node. If more than one CPU
group exists, then a second level in the hierarchy collects the groups.
Depending on the size of the system more than two levels may be required.
Each group has a "migrator" which checks the timerqueue during the tick for
expirable timers of remote (idle) CPUs.
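
One node of such a hierarchy can be pictured roughly like this; the field
names below are illustrative placeholders, the actual data structures are
part of the new timer_migration.c/.h added by this commit:

#include <stdint.h>

struct hierarchy_group {
        struct hierarchy_group  *parent;        /* NULL at the top level */
        unsigned int            level;          /* 0: groups of CPUs */
        unsigned int            numa_node;      /* groups do not span nodes */
        unsigned int            num_children;   /* at most 8, see above */
        int                     migrator;       /* active child expiring remote
                                                   timers, -1 if none */
        uint64_t                first_expiry;   /* earliest event queued by
                                                   idle children */
};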

If the last CPU in a group goes idle it reports the first expiring event of
the group up to the next group(s) in the hierarchy. If the last CPU in the
whole hierarchy goes idle it arms its timer for the first system-wide
expiring timer to ensure that no timer event is missed.
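
The upward walk can be modelled as follows (illustrative C with made-up
names, ignoring locking and state encoding): the CPU going idle pushes its
first event into its group; if it was the group's last active child the
group's event moves one level up, and only when no active CPU is left
anywhere does the caller get an expiry back that it has to program itself.

#include <stdint.h>

struct grp {
        struct grp      *parent;
        unsigned int    active_children;
        uint64_t        first_event;    /* earliest expiry queued in subtree */
};

/* Returns the system wide first event if the whole hierarchy went idle,
 * UINT64_MAX otherwise. */
static uint64_t report_idle(struct grp *g, uint64_t cpu_first_event)
{
        while (g) {
                if (cpu_first_event < g->first_event)
                        g->first_event = cpu_first_event;
                if (--g->active_children > 0)
                        return UINT64_MAX;      /* an active sibling remains migrator */
                cpu_first_event = g->first_event;
                g = g->parent;
        }
        return cpu_first_event; /* last CPU system wide: wake up for this */
}

int main(void)
{
        struct grp top  = { .parent = 0,    .active_children = 1, .first_event = UINT64_MAX };
        struct grp leaf = { .parent = &top, .active_children = 1, .first_event = 4000 };

        /* The last CPU of the leaf group (and of the system) goes idle: */
        return report_idle(&leaf, 3000) == 3000 ? 0 : 1;
}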

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20240222103710.32582-1-anna-maria@linutronix.de
anna-marialx authored and KAGA-KOKO committed Feb 22, 2024
1 parent 57e95a5 commit 7ee9887
Showing 6 changed files with 2,011 additions and 8 deletions.
1 change: 1 addition & 0 deletions include/linux/cpuhotplug.h
@@ -231,6 +231,7 @@ enum cpuhp_state {
CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE,
CPUHP_AP_PERF_POWERPC_HV_GPCI_ONLINE,
CPUHP_AP_PERF_CSKY_ONLINE,
CPUHP_AP_TMIGR_ONLINE,
CPUHP_AP_WATCHDOG_ONLINE,
CPUHP_AP_WORKQUEUE_ONLINE,
CPUHP_AP_RANDOM_ONLINE,
3 changes: 3 additions & 0 deletions kernel/time/Makefile
@@ -17,6 +17,9 @@ endif
obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
obj-$(CONFIG_LEGACY_TIMER_TICK) += tick-legacy.o
ifeq ($(CONFIG_SMP),y)
obj-$(CONFIG_NO_HZ_COMMON) += timer_migration.o
endif
obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
1 change: 1 addition & 0 deletions kernel/time/tick-internal.h
@@ -166,6 +166,7 @@ extern void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem,
extern void timer_lock_remote_bases(unsigned int cpu);
extern void timer_unlock_remote_bases(unsigned int cpu);
extern bool timer_base_is_idle(void);
extern void timer_expire_remote(unsigned int cpu);
# endif
#else /* CONFIG_NO_HZ_COMMON */
static inline void timers_update_nohz(void) { }
113 changes: 105 additions & 8 deletions kernel/time/timer.c
@@ -53,6 +53,7 @@
#include <asm/io.h>

#include "tick-internal.h"
#include "timer_migration.h"

#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>
@@ -2169,6 +2170,64 @@ bool timer_base_is_idle(void)
{
return __this_cpu_read(timer_bases[BASE_LOCAL].is_idle);
}

static void __run_timer_base(struct timer_base *base);

/**
* timer_expire_remote() - expire global timers of cpu
* @cpu: Remote CPU
*
* Expire timers of global base of remote CPU.
*/
void timer_expire_remote(unsigned int cpu)
{
struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

__run_timer_base(base);
}

static void timer_use_tmigr(unsigned long basej, u64 basem,
unsigned long *nextevt, bool *tick_stop_path,
bool timer_base_idle, struct timer_events *tevt)
{
u64 next_tmigr;

if (timer_base_idle)
next_tmigr = tmigr_cpu_new_timer(tevt->global);
else if (tick_stop_path)
next_tmigr = tmigr_cpu_deactivate(tevt->global);
else
next_tmigr = tmigr_quick_check(tevt->global);

/*
* If the CPU is the last one going idle in the timer migration hierarchy,
* make sure it will wake up in time to handle remote timers.
* next_tmigr == KTIME_MAX if other CPUs are still active.
*/
if (next_tmigr < tevt->local) {
u64 tmp;

/* If we missed a tick already, force 0 delta */
if (next_tmigr < basem)
next_tmigr = basem;

tmp = div_u64(next_tmigr - basem, TICK_NSEC);

*nextevt = basej + (unsigned long)tmp;
tevt->local = next_tmigr;
}
}
# else
static void timer_use_tmigr(unsigned long basej, u64 basem,
unsigned long *nextevt, bool *tick_stop_path,
bool timer_base_idle, struct timer_events *tevt)
{
/*
* Make sure first event is written into tevt->local to not miss a
* timer on !SMP systems.
*/
tevt->local = min_t(u64, tevt->local, tevt->global);
}
# endif /* CONFIG_SMP */

static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
@@ -2177,7 +2236,7 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
struct timer_events tevt = { .local = KTIME_MAX, .global = KTIME_MAX };
struct timer_base *base_local, *base_global;
unsigned long nextevt;
u64 expires;
bool idle_is_possible;

/*
* Pretend that there is no timer pending if the cpu is offline.
@@ -2198,6 +2257,22 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
nextevt = fetch_next_timer_interrupt(basej, basem, base_local,
base_global, &tevt);

/*
* If the next event is only one jiffy ahead there is no need to call
* timer migration hierarchy related functions. The value for the next
* global timer in the @tevt struct then equals KTIME_MAX. This is also
* true when the timer base is idle.
*
* The proper timer migration hierarchy function depends on the callsite
* and whether timer base is idle or not. @nextevt will be updated when
* this CPU needs to handle the first timer migration hierarchy
* event. See timer_use_tmigr() for detailed information.
*/
idle_is_possible = time_after(nextevt, basej + 1);
if (idle_is_possible)
timer_use_tmigr(basej, basem, &nextevt, idle,
base_local->is_idle, &tevt);

/*
* We have a fresh next event. Check whether we can forward the
* base.
Expand All @@ -2210,7 +2285,10 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
*/
if (idle) {
/*
* Bases are idle if the next event is more than a tick away.
* Bases are idle if the next event is more than a tick
* away. Caution: @nextevt could have changed by enqueueing a
* global timer into timer migration hierarchy. Therefore a new
* check is required here.
*
* If the base is marked idle then any timer add operation must
* forward the base clk itself to keep granularity small. This
@@ -2223,23 +2301,35 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
trace_timer_base_idle(true, base_local->cpu);
}
*idle = base_local->is_idle;

/*
* When timer base is not set idle, undo the effect of
* tmigr_cpu_deactivate() to prevent inconsistent states - active
* timer base but inactive timer migration hierarchy.
*
* When timer base was already marked idle, nothing will be
* changed here.
*/
if (!base_local->is_idle && idle_is_possible)
tmigr_cpu_activate();
}

raw_spin_unlock(&base_global->lock);
raw_spin_unlock(&base_local->lock);

expires = min_t(u64, tevt.local, tevt.global);

return cmp_next_hrtimer_event(basem, expires);
return cmp_next_hrtimer_event(basem, tevt.local);
}

/**
* get_next_timer_interrupt() - return the time (clock mono) of the next timer
* @basej: base time jiffies
* @basem: base time clock monotonic
*
* Returns the tick aligned clock monotonic time of the next pending
* timer or KTIME_MAX if no timer is pending.
* Returns the tick aligned clock monotonic time of the next pending timer or
* KTIME_MAX if no timer is pending. If a timer of the global base was queued
* into the timer migration hierarchy, the first global timer is not taken
* into account. If this CPU was the last one of the timer migration hierarchy
* to go idle, the first global event is taken into account.
*/
u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
@@ -2281,6 +2371,9 @@ void timer_clear_idle(void)
__this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
__this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
trace_timer_base_idle(false, smp_processor_id());

/* Activate without holding the timer_base->lock */
tmigr_cpu_activate();
}
#endif

@@ -2350,6 +2443,9 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) {
run_timer_base(BASE_GLOBAL);
run_timer_base(BASE_DEF);

if (is_timers_nohz_active())
tmigr_handle_remote();
}
}

@@ -2364,7 +2460,8 @@ static void run_local_timers(void)

for (int i = 0; i < NR_BASES; i++, base++) {
/* Raise the softirq only if required. */
if (time_after_eq(jiffies, base->next_expiry)) {
if (time_after_eq(jiffies, base->next_expiry) ||
(i == BASE_DEF && tmigr_requires_handle_remote())) {
raise_softirq(TIMER_SOFTIRQ);
return;
}
