
Commit 9a77daa

Ben Gardon authored and Paolo Bonzini committed
KVM: x86/mmu: Use atomic ops to set SPTEs in TDP MMU map
To prepare for handling page faults in parallel, change the TDP MMU page
fault handler to use atomic operations to set SPTEs so that changes are
not lost if multiple threads attempt to modify the same SPTE.

Reviewed-by: Peter Feiner <pfeiner@google.com>
Signed-off-by: Ben Gardon <bgardon@google.com>
Message-Id: <20210202185734.1680553-21-bgardon@google.com>
[Document new locking rules. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
1 parent a9442f5 commit 9a77daa
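The core of the change is replacing plain READ_ONCE/WRITE_ONCE updates of SPTEs with compare-and-exchange, so a thread that loses a race notices the collision instead of silently overwriting another thread's update. The following standalone userspace sketch is illustrative only (it is not the kernel code; the names slot, set_slot_atomic and observed are invented for the example) and shows the lost-update problem and the cmpxchg-style fix in miniature:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t slot;	/* stands in for a single SPTE */

/* Install new_val only if slot still holds old_val; false means we raced. */
static bool set_slot_atomic(uint64_t old_val, uint64_t new_val)
{
	return atomic_compare_exchange_strong(&slot, &old_val, new_val);
}

int main(void)
{
	uint64_t observed = atomic_load(&slot);	/* like iter->old_spte */

	/* Pretend another thread modified the slot after we read it. */
	atomic_store(&slot, 0x1234);

	if (!set_slot_atomic(observed, 0xabcd))
		printf("lost the race: back out and let the fault be retried\n");

	return 0;
}

The TDP MMU applies the same idea with cmpxchg64() against iter->old_spte, as the tdp_mmu.c hunks below show.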

3 files changed: +130 −34 lines changed

Documentation/virt/kvm/locking.rst

Lines changed: 8 additions & 1 deletion
@@ -16,7 +16,14 @@ The acquisition orders for mutexes are as follows:
 - kvm->slots_lock is taken outside kvm->irq_lock, though acquiring
   them together is quite rare.
 
-On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock.
+On x86:
+
+- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock
+
+- kvm->arch.mmu_lock is an rwlock.  kvm->arch.tdp_mmu_pages_lock is
+  taken inside kvm->arch.mmu_lock, and cannot be taken without already
+  holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise
+  there's no need to take kvm->arch.tdp_mmu_pages_lock at all).
 
 Everything else is a leaf: no other lock is taken inside the critical
 sections.
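The ordering rule documented above can be summarized in code. A minimal sketch, assuming a hypothetical helper name (the helper does not exist in the kernel); tdp_mmu_pages_lock is only needed when mmu_lock is held for read, because a writer already excludes every other MMU-lock user:

/* Hypothetical helper, purely to illustrate the documented lock order. */
static void tdp_mmu_touch_page_list(struct kvm *kvm)
{
	read_lock(&kvm->mmu_lock);			/* outer: MMU lock, shared */

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);	/* inner: page-list lock */
	/* ... add to or remove from kvm->arch.tdp_mmu_pages ... */
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

	read_unlock(&kvm->mmu_lock);
}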

arch/x86/include/asm/kvm_host.h

Lines changed: 13 additions & 0 deletions
@@ -1039,6 +1039,19 @@ struct kvm_arch {
 	 * tdp_mmu_page set and a root_count of 0.
 	 */
 	struct list_head tdp_mmu_pages;
+
+	/*
+	 * Protects accesses to the following fields when the MMU lock
+	 * is held in read mode:
+	 *  - tdp_mmu_pages (above)
+	 *  - the link field of struct kvm_mmu_pages used by the TDP MMU
+	 *  - lpage_disallowed_mmu_pages
+	 *  - the lpage_disallowed_link field of struct kvm_mmu_pages used
+	 *    by the TDP MMU
+	 * It is acceptable, but not necessary, to acquire this lock when
+	 * the thread holds the MMU lock in write mode.
+	 */
+	spinlock_t tdp_mmu_pages_lock;
 };
 
 struct kvm_vm_stat {
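The "acceptable, but not necessary" wording in the new comment translates into a simple pattern, applied by tdp_mmu_link_page() and tdp_mmu_unlink_page() in the tdp_mmu.c hunks below: take the spinlock only when the MMU lock is held for read, and merely assert exclusivity otherwise. Condensed from those hunks (not a verbatim copy of either function):

	if (shared)					/* MMU lock held for read */
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else						/* MMU lock held for write */
		lockdep_assert_held_write(&kvm->mmu_lock);

	/* ... modify kvm->arch.tdp_mmu_pages or the NX-reclaim list ... */

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);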

arch/x86/kvm/mmu/tdp_mmu.c

Lines changed: 109 additions & 33 deletions
@@ -7,6 +7,7 @@
 #include "tdp_mmu.h"
 #include "spte.h"
 
+#include <asm/cmpxchg.h>
 #include <trace/events/kvm.h>
 
 #ifdef CONFIG_X86_64
@@ -33,6 +34,7 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
 	kvm->arch.tdp_mmu_enabled = true;
 
 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
+	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
 }
 
@@ -225,7 +227,8 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
 }
 
 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
-				u64 old_spte, u64 new_spte, int level);
+				u64 old_spte, u64 new_spte, int level,
+				bool shared);
 
 static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
 {
@@ -267,61 +270,91 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
  *
  * @kvm: kvm instance
  * @sp: the new page
+ * @shared: This operation may not be running under the exclusive use of
+ *	    the MMU lock and the operation must synchronize with other
+ *	    threads that might be adding or removing pages.
  * @account_nx: This page replaces a NX large page and should be marked for
  *		eventual reclaim.
  */
 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
-			      bool account_nx)
+			      bool shared, bool account_nx)
 {
-	lockdep_assert_held_write(&kvm->mmu_lock);
+	if (shared)
+		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+	else
+		lockdep_assert_held_write(&kvm->mmu_lock);
 
 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
 	if (account_nx)
 		account_huge_nx_page(kvm, sp);
+
+	if (shared)
+		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 }
 
 /**
  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
  *
  * @kvm: kvm instance
  * @sp: the page to be removed
+ * @shared: This operation may not be running under the exclusive use of
+ *	    the MMU lock and the operation must synchronize with other
+ *	    threads that might be adding or removing pages.
  */
-static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				bool shared)
 {
-	lockdep_assert_held_write(&kvm->mmu_lock);
+	if (shared)
+		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+	else
+		lockdep_assert_held_write(&kvm->mmu_lock);
 
 	list_del(&sp->link);
 	if (sp->lpage_disallowed)
 		unaccount_huge_nx_page(kvm, sp);
+
+	if (shared)
+		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 }
 
 /**
  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
  *
  * @kvm: kvm instance
  * @pt: the page removed from the paging structure
+ * @shared: This operation may not be running under the exclusive use
+ *	    of the MMU lock and the operation must synchronize with other
+ *	    threads that might be modifying SPTEs.
  *
  * Given a page table that has been removed from the TDP paging structure,
  * iterates through the page table to clear SPTEs and free child page tables.
  */
-static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt)
+static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
+					bool shared)
 {
 	struct kvm_mmu_page *sp = sptep_to_sp(pt);
 	int level = sp->role.level;
 	gfn_t gfn = sp->gfn;
 	u64 old_child_spte;
+	u64 *sptep;
 	int i;
 
 	trace_kvm_mmu_prepare_zap_page(sp);
 
-	tdp_mmu_unlink_page(kvm, sp);
+	tdp_mmu_unlink_page(kvm, sp, shared);
 
 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
-		old_child_spte = READ_ONCE(*(pt + i));
-		WRITE_ONCE(*(pt + i), 0);
+		sptep = pt + i;
+
+		if (shared) {
+			old_child_spte = xchg(sptep, 0);
+		} else {
+			old_child_spte = READ_ONCE(*sptep);
+			WRITE_ONCE(*sptep, 0);
+		}
 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp),
 			gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
-			old_child_spte, 0, level - 1);
+			old_child_spte, 0, level - 1, shared);
 	}
 
 	kvm_flush_remote_tlbs_with_address(kvm, gfn,
@@ -338,12 +371,16 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt)
  * @old_spte: The value of the SPTE before the change
  * @new_spte: The value of the SPTE after the change
  * @level: the level of the PT the SPTE is part of in the paging structure
+ * @shared: This operation may not be running under the exclusive use of
+ *	    the MMU lock and the operation must synchronize with other
+ *	    threads that might be modifying SPTEs.
  *
  * Handle bookkeeping that might result from the modification of a SPTE.
  * This function must be called for all TDP SPTE modifications.
  */
 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
-				  u64 old_spte, u64 new_spte, int level)
+				  u64 old_spte, u64 new_spte, int level,
+				  bool shared)
 {
 	bool was_present = is_shadow_present_pte(old_spte);
 	bool is_present = is_shadow_present_pte(new_spte);
@@ -415,18 +452,51 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 	 */
 	if (was_present && !was_leaf && (pfn_changed || !is_present))
 		handle_removed_tdp_mmu_page(kvm,
-				spte_to_child_pt(old_spte, level));
+				spte_to_child_pt(old_spte, level), shared);
 }
 
 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
-				u64 old_spte, u64 new_spte, int level)
+				u64 old_spte, u64 new_spte, int level,
+				bool shared)
 {
-	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
+	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
+			      shared);
 	handle_changed_spte_acc_track(old_spte, new_spte, level);
 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
 				      new_spte, level);
 }
 
+/*
+ * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
+ * associated bookkeeping
+ *
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @new_spte: The value the SPTE should be set to
+ * Returns: true if the SPTE was set, false if it was not. If false is returned,
+ *	    this function will have no side-effects.
+ */
+static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
+					   struct tdp_iter *iter,
+					   u64 new_spte)
+{
+	u64 *root_pt = tdp_iter_root_pt(iter);
+	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
+	int as_id = kvm_mmu_page_as_id(root);
+
+	lockdep_assert_held_read(&kvm->mmu_lock);
+
+	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
+		      new_spte) != iter->old_spte)
+		return false;
+
+	handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
+			    iter->level, true);
+
+	return true;
+}
+
+
 /*
  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
  * @kvm: kvm instance
@@ -456,7 +526,7 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
 	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
 
 	__handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
-			      iter->level);
+			      iter->level, false);
 	if (record_acc_track)
 		handle_changed_spte_acc_track(iter->old_spte, new_spte,
 					      iter->level);
@@ -630,23 +700,18 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
 	int ret = 0;
 	int make_spte_ret = 0;
 
-	if (unlikely(is_noslot_pfn(pfn))) {
+	if (unlikely(is_noslot_pfn(pfn)))
 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
-		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
-				     new_spte);
-	} else {
+	else
 		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
 					 pfn, iter->old_spte, prefault, true,
 					 map_writable, !shadow_accessed_mask,
 					 &new_spte);
-		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
-				       rcu_dereference(iter->sptep));
-	}
 
 	if (new_spte == iter->old_spte)
 		ret = RET_PF_SPURIOUS;
-	else
-		tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);
+	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
+		return RET_PF_RETRY;
 
 	/*
 	 * If the page fault was caused by a write but the page is write
@@ -660,8 +725,13 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
 	}
 
 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
-	if (unlikely(is_mmio_spte(new_spte)))
+	if (unlikely(is_mmio_spte(new_spte))) {
+		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
+				     new_spte);
 		ret = RET_PF_EMULATE;
+	} else
+		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
+				       rcu_dereference(iter->sptep));
 
 	trace_kvm_mmu_set_spte(iter->level, iter->gfn,
 			       rcu_dereference(iter->sptep));
@@ -720,7 +790,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 		 */
 		if (is_shadow_present_pte(iter.old_spte) &&
 		    is_large_pte(iter.old_spte)) {
-			tdp_mmu_set_spte(vcpu->kvm, &iter, 0);
+			if (!tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, 0))
+				break;
 
 			kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
 					KVM_PAGES_PER_HPAGE(iter.level));
@@ -737,19 +808,24 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
 			child_pt = sp->spt;
 
-			tdp_mmu_link_page(vcpu->kvm, sp,
-					  huge_page_disallowed &&
-					  req_level >= iter.level);
-
 			new_spte = make_nonleaf_spte(child_pt,
 						     !shadow_accessed_mask);
 
-			trace_kvm_mmu_get_page(sp, true);
-			tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
+			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
+						    new_spte)) {
+				tdp_mmu_link_page(vcpu->kvm, sp, true,
+						  huge_page_disallowed &&
+						  req_level >= iter.level);
+
+				trace_kvm_mmu_get_page(sp, true);
+			} else {
+				tdp_mmu_free_sp(sp);
+				break;
+			}
 		}
 	}
 
-	if (WARN_ON(iter.level != level)) {
+	if (iter.level != level) {
 		rcu_read_unlock();
 		return RET_PF_RETRY;
 	}
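Taken together, the hunks above establish a single rule for the parallel page-fault path: every SPTE write goes through tdp_mmu_set_spte_atomic(), and the caller must be able to back out cleanly when the cmpxchg loses. Condensed from the kvm_tdp_mmu_map() hunk above (not a verbatim copy), the pattern when installing a new page table is:

	if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) {
		/* The SPTE is visible; only now publish the page on the list. */
		tdp_mmu_link_page(vcpu->kvm, sp, true,
				  huge_page_disallowed && req_level >= iter.level);
		trace_kvm_mmu_get_page(sp, true);
	} else {
		/* Another thread changed the SPTE first: undo and retry the fault. */
		tdp_mmu_free_sp(sp);
		break;
	}

Losing the race simply results in RET_PF_RETRY: the vCPU re-enters the guest and re-executes the faulting instruction, by which point the winning thread's SPTE is usually already in place.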
