 #include "tdp_mmu.h"
 #include "spte.h"
 
+#include <asm/cmpxchg.h>
 #include <trace/events/kvm.h>
 
 #ifdef CONFIG_X86_64
@@ -33,6 +34,7 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
 	kvm->arch.tdp_mmu_enabled = true;
 
 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
+	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
 }
 
@@ -225,7 +227,8 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
 }
 
 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
-				u64 old_spte, u64 new_spte, int level);
+				u64 old_spte, u64 new_spte, int level,
+				bool shared);
 
 static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
 {
@@ -267,61 +270,91 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
  *
  * @kvm: kvm instance
  * @sp: the new page
+ * @shared: This operation may not be running under the exclusive use of
+ *	    the MMU lock and the operation must synchronize with other
+ *	    threads that might be adding or removing pages.
  * @account_nx: This page replaces a NX large page and should be marked for
  *		eventual reclaim.
  */
 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
-			      bool account_nx)
+			      bool shared, bool account_nx)
 {
-	lockdep_assert_held_write(&kvm->mmu_lock);
+	if (shared)
+		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+	else
+		lockdep_assert_held_write(&kvm->mmu_lock);
 
 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
 	if (account_nx)
 		account_huge_nx_page(kvm, sp);
+
+	if (shared)
+		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 }
 
 /**
  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
  *
  * @kvm: kvm instance
  * @sp: the page to be removed
+ * @shared: This operation may not be running under the exclusive use of
+ *	    the MMU lock and the operation must synchronize with other
+ *	    threads that might be adding or removing pages.
  */
-static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				bool shared)
 {
-	lockdep_assert_held_write(&kvm->mmu_lock);
+	if (shared)
+		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+	else
+		lockdep_assert_held_write(&kvm->mmu_lock);
 
 	list_del(&sp->link);
 	if (sp->lpage_disallowed)
 		unaccount_huge_nx_page(kvm, sp);
+
+	if (shared)
+		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 }
 
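Aside: the shared/exclusive split in tdp_mmu_link_page() and tdp_mmu_unlink_page() above is the core locking pattern of this patch. A thread holding mmu_lock for write already excludes every other MMU writer, while a thread holding it only for read must take the new tdp_mmu_pages_lock around the one structure it still mutates. Below is a minimal, self-contained userspace sketch of the same idea; the pthread locks and the link_page() helper are illustrative stand-ins, not kernel API.

#include <pthread.h>
#include <stddef.h>

/* Illustrative stand-ins for mmu_lock and tdp_mmu_pages_lock. */
static pthread_rwlock_t mmu_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t pages_lock = PTHREAD_MUTEX_INITIALIZER;

struct page { struct page *next; };
static struct page *pages_head;

/*
 * With shared == 1 the caller holds mmu_lock only for read, so other
 * list users may run concurrently and the small pages_lock serializes
 * just the list update. With shared == 0 the caller holds mmu_lock for
 * write and needs no extra lock (the kernel asserts that case with
 * lockdep_assert_held_write(); pthreads has no portable equivalent).
 */
static void link_page(struct page *p, int shared)
{
	if (shared)
		pthread_mutex_lock(&pages_lock);

	p->next = pages_head;
	pages_head = p;

	if (shared)
		pthread_mutex_unlock(&pages_lock);
}

int main(void)
{
	static struct page p;

	pthread_rwlock_rdlock(&mmu_lock);	/* "shared" MMU lock */
	link_page(&p, 1);
	pthread_rwlock_unlock(&mmu_lock);
	return 0;
}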
 /**
  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
  *
  * @kvm: kvm instance
  * @pt: the page removed from the paging structure
+ * @shared: This operation may not be running under the exclusive use
+ *	    of the MMU lock and the operation must synchronize with other
+ *	    threads that might be modifying SPTEs.
  *
  * Given a page table that has been removed from the TDP paging structure,
  * iterates through the page table to clear SPTEs and free child page tables.
  */
-static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt)
+static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
+					bool shared)
 {
 	struct kvm_mmu_page *sp = sptep_to_sp(pt);
 	int level = sp->role.level;
 	gfn_t gfn = sp->gfn;
 	u64 old_child_spte;
+	u64 *sptep;
 	int i;
 
 	trace_kvm_mmu_prepare_zap_page(sp);
 
-	tdp_mmu_unlink_page(kvm, sp);
+	tdp_mmu_unlink_page(kvm, sp, shared);
 
 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
-		old_child_spte = READ_ONCE(*(pt + i));
-		WRITE_ONCE(*(pt + i), 0);
+		sptep = pt + i;
+
+		if (shared) {
+			old_child_spte = xchg(sptep, 0);
+		} else {
+			old_child_spte = READ_ONCE(*sptep);
+			WRITE_ONCE(*sptep, 0);
+		}
 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp),
 			gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
-			old_child_spte, 0, level - 1);
+			old_child_spte, 0, level - 1, shared);
 	}
 
 	kvm_flush_remote_tlbs_with_address(kvm, gfn,
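The READ_ONCE()/WRITE_ONCE() pair is only safe when mmu_lock is held for write; with the lock held for read, another thread may legally update a child SPTE between the separate load and store, and that update would be silently lost. The xchg() makes the fetch-and-clear a single indivisible step. A hedged userspace illustration of the difference, with C11 atomic_exchange() standing in for the kernel's xchg() and the SPTE value invented for the demo:

#include <inttypes.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for one child SPTE. */
static _Atomic uint64_t spte = UINT64_C(0x8000000000000ee7);

int main(void)
{
	/*
	 * Racy two-step clear, fine only under an exclusive lock:
	 *
	 *	old = atomic_load(&spte);
	 *	...another CPU may store here; its SPTE change is lost...
	 *	atomic_store(&spte, 0);
	 *
	 * Atomic fetch-and-clear: the load and store are one indivisible
	 * operation, so "old" is exactly the value removed from the page
	 * table and no concurrent update can slip in between.
	 */
	uint64_t old = atomic_exchange(&spte, 0);

	printf("cleared SPTE, previous value %#" PRIx64 "\n", old);
	return 0;
}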
@@ -338,12 +371,16 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt)
  * @old_spte: The value of the SPTE before the change
  * @new_spte: The value of the SPTE after the change
  * @level: the level of the PT the SPTE is part of in the paging structure
+ * @shared: This operation may not be running under the exclusive use of
+ *	    the MMU lock and the operation must synchronize with other
+ *	    threads that might be modifying SPTEs.
  *
  * Handle bookkeeping that might result from the modification of a SPTE.
  * This function must be called for all TDP SPTE modifications.
  */
 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
-				  u64 old_spte, u64 new_spte, int level)
+				  u64 old_spte, u64 new_spte, int level,
+				  bool shared)
 {
 	bool was_present = is_shadow_present_pte(old_spte);
 	bool is_present = is_shadow_present_pte(new_spte);
@@ -415,18 +452,51 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 	 */
 	if (was_present && !was_leaf && (pfn_changed || !is_present))
 		handle_removed_tdp_mmu_page(kvm,
-				spte_to_child_pt(old_spte, level));
+				spte_to_child_pt(old_spte, level), shared);
 }
 
 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
-				u64 old_spte, u64 new_spte, int level)
+				u64 old_spte, u64 new_spte, int level,
+				bool shared)
 {
-	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
+	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
+			      shared);
 	handle_changed_spte_acc_track(old_spte, new_spte, level);
 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
 				      new_spte, level);
 }
 
+/*
+ * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
+ * associated bookkeeping
+ *
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @new_spte: The value the SPTE should be set to
+ * Returns: true if the SPTE was set, false if it was not. If false is returned,
+ *	    this function will have no side-effects.
+ */
+static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
+					   struct tdp_iter *iter,
+					   u64 new_spte)
+{
+	u64 *root_pt = tdp_iter_root_pt(iter);
+	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
+	int as_id = kvm_mmu_page_as_id(root);
+
+	lockdep_assert_held_read(&kvm->mmu_lock);
+
+	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
+		      new_spte) != iter->old_spte)
+		return false;
+
+	handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
+			    iter->level, true);
+
+	return true;
+}
+
+
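tdp_mmu_set_spte_atomic() is the optimistic counterpart of __tdp_mmu_set_spte(): under a read-held mmu_lock it may race with other fault handlers, so it only installs new_spte if the SPTE still holds the value the iterator last read (the cmpxchg64() test above), and returns false with no side effects otherwise. A minimal userspace analogue of that contract, using C11 atomics; set_spte_atomic() and the values below are illustrative:

#include <inttypes.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t spte;

/*
 * Install "new_spte" only if the entry still holds "old_spte", the value
 * this thread last observed. On failure another thread won the race and
 * nothing is changed, so the caller can simply retry from the top.
 */
static bool set_spte_atomic(uint64_t old_spte, uint64_t new_spte)
{
	return atomic_compare_exchange_strong(&spte, &old_spte, new_spte);
}

int main(void)
{
	atomic_store(&spte, 1);

	if (set_spte_atomic(1, 2))	/* succeeds: the value was still 1 */
		puts("installed");
	if (!set_spte_atomic(1, 3))	/* fails: the value is now 2 */
		puts("lost the race, retry");

	printf("final value %" PRIu64 "\n", atomic_load(&spte));
	return 0;
}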
 /*
  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
  * @kvm: kvm instance
@@ -456,7 +526,7 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
 	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
 
 	__handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
-			      iter->level);
+			      iter->level, false);
 	if (record_acc_track)
 		handle_changed_spte_acc_track(iter->old_spte, new_spte,
 					      iter->level);
@@ -630,23 +700,18 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
 	int ret = 0;
 	int make_spte_ret = 0;
 
-	if (unlikely(is_noslot_pfn(pfn))) {
+	if (unlikely(is_noslot_pfn(pfn)))
 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
-		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
-				     new_spte);
-	} else {
+	else
 		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
 					 pfn, iter->old_spte, prefault, true,
 					 map_writable, !shadow_accessed_mask,
 					 &new_spte);
-		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
-				       rcu_dereference(iter->sptep));
-	}
 
 	if (new_spte == iter->old_spte)
 		ret = RET_PF_SPURIOUS;
-	else
-		tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);
+	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
+		return RET_PF_RETRY;
 
 	/*
 	 * If the page fault was caused by a write but the page is write
@@ -660,8 +725,13 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
 	}
 
 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
-	if (unlikely(is_mmio_spte(new_spte)))
+	if (unlikely(is_mmio_spte(new_spte))) {
+		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
+				     new_spte);
 		ret = RET_PF_EMULATE;
+	} else
+		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
+				       rcu_dereference(iter->sptep));
 
 	trace_kvm_mmu_set_spte(iter->level, iter->gfn,
 			       rcu_dereference(iter->sptep));
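Review note: after this hunk the non-MMIO path fires trace_kvm_mmu_set_spte() twice, once from the new else branch and once from the pre-existing unconditional call kept as context below it. One of the two looks redundant; the unconditional copy is presumably the one that was meant to be dropped.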
@@ -720,7 +790,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 		 */
 		if (is_shadow_present_pte(iter.old_spte) &&
 		    is_large_pte(iter.old_spte)) {
-			tdp_mmu_set_spte(vcpu->kvm, &iter, 0);
+			if (!tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, 0))
+				break;
 
 			kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
 					KVM_PAGES_PER_HPAGE(iter.level));
@@ -737,19 +808,24 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
 			child_pt = sp->spt;
 
-			tdp_mmu_link_page(vcpu->kvm, sp,
-					  huge_page_disallowed &&
-					  req_level >= iter.level);
-
 			new_spte = make_nonleaf_spte(child_pt,
 						     !shadow_accessed_mask);
 
-			trace_kvm_mmu_get_page(sp, true);
-			tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
+			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
+						    new_spte)) {
+				tdp_mmu_link_page(vcpu->kvm, sp, true,
+						  huge_page_disallowed &&
+						  req_level >= iter.level);
+
+				trace_kvm_mmu_get_page(sp, true);
+			} else {
+				tdp_mmu_free_sp(sp);
+				break;
+			}
 		}
 	}
 
-	if (WARN_ON(iter.level != level)) {
+	if (iter.level != level) {
 		rcu_read_unlock();
 		return RET_PF_RETRY;
 	}
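The rewritten map path above is a textbook optimistic-insertion scheme: build the new page table off to the side, publish it with a single atomic compare-and-exchange, and on failure free it and retry the fault rather than block. Note that the diff also moves tdp_mmu_link_page() to after the SPTE install, so a page only joins the tdp_mmu_pages list once it is actually reachable from the paging structure. A self-contained sketch of that allocate/CAS/undo shape, with all names illustrative:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct node { int level; };

/* Hypothetical slot a new page table would be published into. */
static struct node *_Atomic slot;

static bool install_node(int level)
{
	struct node *expected = NULL;
	struct node *sp = malloc(sizeof(*sp));	/* alloc_tdp_mmu_page() analogue */

	if (!sp)
		abort();
	sp->level = level;

	/* Publish with one CAS; only a successful CAS makes sp visible,
	 * and only then is it safe to do the list bookkeeping. */
	if (atomic_compare_exchange_strong(&slot, &expected, sp))
		return true;

	free(sp);		/* lost the race: undo and let the caller retry */
	return false;
}

int main(void)
{
	while (!install_node(1))
		;	/* retry, as the fault handler does via RET_PF_RETRY */
	return 0;
}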