Skip to content

Commit

Permalink
x86/mm/tlb: Flush remote and local TLBs concurrently
Browse files Browse the repository at this point in the history
To improve TLB shootdown performance, flush the remote and local TLBs
concurrently. Introduce flush_tlb_multi() that does so. Introduce
paravirtual versions of flush_tlb_multi() for KVM, Xen and hyper-v (Xen
and hyper-v are only compile-tested).

While the updated smp infrastructure is capable of running a function on
a single local core, it is not optimized for this case. The multiple
function calls and the indirect branch introduce some overhead, and
might make local TLB flushes slower than they were before the recent
changes.

Before calling the SMP infrastructure, check if only a local TLB flush
is needed to restore the lost performance in this common case. This
requires checking mm_cpumask() one more time, but unless this mask is
updated very frequently, this should not impact performance negatively.

Signed-off-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Michael Kelley <mikelley@microsoft.com> # Hyper-v parts
Reviewed-by: Juergen Gross <jgross@suse.com> # Xen and paravirt parts
Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lore.kernel.org/r/20210220231712.2475218-5-namit@vmware.com
  • Loading branch information
anadav authored and Ingo Molnar committed Mar 6, 2021
1 parent 6035152 commit 4ce94ea
Show file tree
Hide file tree
Showing 10 changed files with 57 additions and 41 deletions.
10 changes: 5 additions & 5 deletions arch/x86/hyperv/mmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,16 +52,16 @@ static inline int fill_gva_list(u64 gva_list[], int offset,
return gva_n - offset;
}

static void hyperv_flush_tlb_others(const struct cpumask *cpus,
const struct flush_tlb_info *info)
static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
const struct flush_tlb_info *info)
{
int cpu, vcpu, gva_n, max_gvas;
struct hv_tlb_flush **flush_pcpu;
struct hv_tlb_flush *flush;
u64 status = U64_MAX;
unsigned long flags;

trace_hyperv_mmu_flush_tlb_others(cpus, info);
trace_hyperv_mmu_flush_tlb_multi(cpus, info);

if (!hv_hypercall_pg)
goto do_native;
Expand Down Expand Up @@ -164,7 +164,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
if (!(status & HV_HYPERCALL_RESULT_MASK))
return;
do_native:
native_flush_tlb_others(cpus, info);
native_flush_tlb_multi(cpus, info);
}

static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
Expand Down Expand Up @@ -239,6 +239,6 @@ void hyperv_setup_mmu_ops(void)
return;

pr_info("Using hypercall for remote TLB flush\n");
pv_ops.mmu.flush_tlb_others = hyperv_flush_tlb_others;
pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi;
pv_ops.mmu.tlb_remove_table = tlb_remove_table;
}
6 changes: 3 additions & 3 deletions arch/x86/include/asm/paravirt.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ static inline void slow_down_io(void)
void native_flush_tlb_local(void);
void native_flush_tlb_global(void);
void native_flush_tlb_one_user(unsigned long addr);
void native_flush_tlb_others(const struct cpumask *cpumask,
void native_flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info);

static inline void __flush_tlb_local(void)
Expand All @@ -68,10 +68,10 @@ static inline void __flush_tlb_one_user(unsigned long addr)
PVOP_VCALL1(mmu.flush_tlb_one_user, addr);
}

static inline void __flush_tlb_others(const struct cpumask *cpumask,
static inline void __flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
PVOP_VCALL2(mmu.flush_tlb_others, cpumask, info);
PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
}

static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
Expand Down
4 changes: 2 additions & 2 deletions arch/x86/include/asm/paravirt_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,8 @@ struct pv_mmu_ops {
void (*flush_tlb_user)(void);
void (*flush_tlb_kernel)(void);
void (*flush_tlb_one_user)(unsigned long addr);
void (*flush_tlb_others)(const struct cpumask *cpus,
const struct flush_tlb_info *info);
void (*flush_tlb_multi)(const struct cpumask *cpus,
const struct flush_tlb_info *info);

void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);

Expand Down
4 changes: 2 additions & 2 deletions arch/x86/include/asm/tlbflush.h
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ extern void initialize_tlbstate_and_flush(void);
* - flush_tlb_page(vma, vmaddr) flushes one page
* - flush_tlb_range(vma, start, end) flushes a range of pages
* - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
* - flush_tlb_others(cpumask, info) flushes TLBs on other cpus
* - flush_tlb_multi(cpumask, info) flushes TLBs on multiple cpus
*
* ..but the i386 has somewhat limited tlb flushing capabilities,
* and page-granular flushes are available only on i486 and up.
Expand Down Expand Up @@ -209,7 +209,7 @@ struct flush_tlb_info {
void flush_tlb_local(void);
void flush_tlb_one_user(unsigned long addr);
void flush_tlb_one_kernel(unsigned long addr);
void flush_tlb_others(const struct cpumask *cpumask,
void flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info);

#ifdef CONFIG_PARAVIRT
Expand Down
2 changes: 1 addition & 1 deletion arch/x86/include/asm/trace/hyperv.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

#if IS_ENABLED(CONFIG_HYPERV)

TRACE_EVENT(hyperv_mmu_flush_tlb_others,
TRACE_EVENT(hyperv_mmu_flush_tlb_multi,
TP_PROTO(const struct cpumask *cpus,
const struct flush_tlb_info *info),
TP_ARGS(cpus, info),
Expand Down
11 changes: 8 additions & 3 deletions arch/x86/kernel/kvm.c
Original file line number Diff line number Diff line change
Expand Up @@ -613,7 +613,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
}
#endif

static void kvm_flush_tlb_others(const struct cpumask *cpumask,
static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
u8 state;
Expand All @@ -627,6 +627,11 @@ static void kvm_flush_tlb_others(const struct cpumask *cpumask,
* queue flush_on_enter for pre-empted vCPUs
*/
for_each_cpu(cpu, flushmask) {
/*
* The local vCPU is never preempted, so there is no need to
* explicitly skip the check for it - it will never be cleared from
* flushmask.
*/
src = &per_cpu(steal_time, cpu);
state = READ_ONCE(src->preempted);
if ((state & KVM_VCPU_PREEMPTED)) {
Expand All @@ -636,7 +641,7 @@ static void kvm_flush_tlb_others(const struct cpumask *cpumask,
}
}

native_flush_tlb_others(flushmask, info);
native_flush_tlb_multi(flushmask, info);
}

static void __init kvm_guest_init(void)
Expand All @@ -654,7 +659,7 @@ static void __init kvm_guest_init(void)
}

if (pv_tlb_flush_supported()) {
pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
pv_ops.mmu.tlb_remove_table = tlb_remove_table;
pr_info("KVM setup pv remote TLB flush\n");
}
Expand Down
2 changes: 1 addition & 1 deletion arch/x86/kernel/paravirt.c
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ struct paravirt_patch_template pv_ops = {
.mmu.flush_tlb_user = native_flush_tlb_local,
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
.mmu.flush_tlb_others = native_flush_tlb_others,
.mmu.flush_tlb_multi = native_flush_tlb_multi,
.mmu.tlb_remove_table =
(void (*)(struct mmu_gather *, void *))tlb_remove_page,

Expand Down
46 changes: 29 additions & 17 deletions arch/x86/mm/tlb.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# define __flush_tlb_local native_flush_tlb_local
# define __flush_tlb_global native_flush_tlb_global
# define __flush_tlb_one_user(addr) native_flush_tlb_one_user(addr)
# define __flush_tlb_others(msk, info) native_flush_tlb_others(msk, info)
# define __flush_tlb_multi(msk, info) native_flush_tlb_multi(msk, info)
#endif

/*
Expand Down Expand Up @@ -490,7 +490,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
/*
* Even in lazy TLB mode, the CPU should stay set in the
* mm_cpumask. The TLB shootdown code can figure out from
* from cpu_tlbstate.is_lazy whether or not to send an IPI.
* cpu_tlbstate.is_lazy whether or not to send an IPI.
*/
if (WARN_ON_ONCE(real_prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
Expand Down Expand Up @@ -697,7 +697,7 @@ static void flush_tlb_func(void *info)
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
*
* This should be rare, with native_flush_tlb_others skipping
* This should be rare, with native_flush_tlb_multi() skipping
* IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
Expand Down Expand Up @@ -795,9 +795,14 @@ static bool tlb_is_not_lazy(int cpu)

static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);

STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
/*
* Do accounting and tracing. Note that there are (and have always been)
* cases in which a remote TLB flush is traced but does not actually
* happen.
*/
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (info->end == TLB_FLUSH_ALL)
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
Expand All @@ -816,8 +821,7 @@ STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
* doing a speculative memory access.
*/
if (info->freed_tables) {
smp_call_function_many(cpumask, flush_tlb_func,
(void *)info, 1);
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
} else {
/*
* Although we could have used on_each_cpu_cond_mask(),
Expand All @@ -844,14 +848,14 @@ STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
if (tlb_is_not_lazy(cpu))
__cpumask_set_cpu(cpu, cond_cpumask);
}
smp_call_function_many(cond_cpumask, flush_tlb_func, (void *)info, 1);
on_each_cpu_mask(cond_cpumask, flush_tlb_func, (void *)info, true);
}
}

void flush_tlb_others(const struct cpumask *cpumask,
void flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
__flush_tlb_others(cpumask, info);
__flush_tlb_multi(cpumask, info);
}

/*
Expand Down Expand Up @@ -931,16 +935,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
new_tlb_gen);

if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
/*
* flush_tlb_multi() is not optimized for the common case in which only
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func() directly.
*/
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
flush_tlb_multi(mm_cpumask(mm), info);
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
local_irq_disable();
flush_tlb_func(info);
local_irq_enable();
}

if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), info);

put_flush_tlb_info();
put_cpu();
}
Expand Down Expand Up @@ -1152,16 +1160,20 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
int cpu = get_cpu();

info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, 0);
if (cpumask_test_cpu(cpu, &batch->cpumask)) {
/*
* flush_tlb_multi() is not optimized for the common case in which only
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func() directly.
*/
if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
flush_tlb_multi(&batch->cpumask, info);
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
lockdep_assert_irqs_enabled();
local_irq_disable();
flush_tlb_func(info);
local_irq_enable();
}

if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
flush_tlb_others(&batch->cpumask, info);

cpumask_clear(&batch->cpumask);

put_flush_tlb_info();
Expand Down
11 changes: 5 additions & 6 deletions arch/x86/xen/mmu_pv.c
Original file line number Diff line number Diff line change
Expand Up @@ -1247,8 +1247,8 @@ static void xen_flush_tlb_one_user(unsigned long addr)
preempt_enable();
}

static void xen_flush_tlb_others(const struct cpumask *cpus,
const struct flush_tlb_info *info)
static void xen_flush_tlb_multi(const struct cpumask *cpus,
const struct flush_tlb_info *info)
{
struct {
struct mmuext_op op;
Expand All @@ -1258,7 +1258,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
const size_t mc_entry_size = sizeof(args->op) +
sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus());

trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end);
trace_xen_mmu_flush_tlb_multi(cpus, info->mm, info->start, info->end);

if (cpumask_empty(cpus))
return; /* nothing to do */
Expand All @@ -1267,9 +1267,8 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
args = mcs.args;
args->op.arg2.vcpumask = to_cpumask(args->mask);

/* Remove us, and any offline CPUS. */
/* Remove any offline CPUs */
cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));

args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
if (info->end != TLB_FLUSH_ALL &&
Expand Down Expand Up @@ -2086,7 +2085,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.flush_tlb_user = xen_flush_tlb,
.flush_tlb_kernel = xen_flush_tlb,
.flush_tlb_one_user = xen_flush_tlb_one_user,
.flush_tlb_others = xen_flush_tlb_others,
.flush_tlb_multi = xen_flush_tlb_multi,
.tlb_remove_table = tlb_remove_table,

.pgd_alloc = xen_pgd_alloc,
Expand Down
2 changes: 1 addition & 1 deletion include/trace/events/xen.h
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ TRACE_EVENT(xen_mmu_flush_tlb_one_user,
TP_printk("addr %lx", __entry->addr)
);

TRACE_EVENT(xen_mmu_flush_tlb_others,
TRACE_EVENT(xen_mmu_flush_tlb_multi,
TP_PROTO(const struct cpumask *cpus, struct mm_struct *mm,
unsigned long addr, unsigned long end),
TP_ARGS(cpus, mm, addr, end),
Expand Down

0 comments on commit 4ce94ea

Please sign in to comment.