[Intel-gfx] [PATCH 19/27] KVM: x86/mmu: Use page-track notifiers iff there are external users

Thu Aug 10 03:02:19 UTC 2023

On Thu, Aug 10, 2023 at 07:21:03AM +0800, Yan Zhao wrote:
> On Wed, Aug 09, 2023 at 07:33:45AM -0700, Sean Christopherson wrote:
> > On Wed, Aug 09, 2023, Yan Zhao wrote:
> > > On Mon, Aug 07, 2023 at 10:19:07AM -0700, Sean Christopherson wrote:
> > > > On Mon, Aug 07, 2023, Like Xu wrote:
> > > > > On 23/12/2022 8:57 am, Sean Christopherson wrote:
> > > > > > +static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> > > > > > +					const u8 *new, int bytes)
> > > > > > +{
> > > > > > +	__kvm_page_track_write(vcpu, gpa, new, bytes);
> > > > > > +
> > > > > > +	kvm_mmu_track_write(vcpu, gpa, new, bytes);
> > > > > > +}
> > > > > 
> > > > > The kvm_mmu_track_write() is only used for x86, where the incoming parameter
> > > > > "u8 *new" has not been required since 0e0fee5c539b ("kvm: mmu: Fix race in
> > > > > > emulated page table writes"), please help confirm if it's still needed? Thanks.
> > > > > A minor clean up is proposed.
> > > > 
> > > > Hmm, unless I'm misreading things, KVMGT ultimately doesn't consume @new either.
> > > > So I think we can remove @new from kvm_page_track_write() entirely.
> > > Sorry for the late reply.
> > > Yes, KVMGT does not consume @new and it reads the guest PTE again in the
> > > page track write handler.
> > > 
> > > But I have a couple of questions related to the mentioned commit as
> > > below:
> > > 
> > > (1) If "re-reading the current value of the guest PTE after the MMU lock has
> > > been acquired", then should KVMGT also acquire the MMU lock too?
> > 
> > No.  If applicable, KVMGT should read the new/current value after acquiring
> > whatever lock protects the generation (or update) of the shadow entries.  I
> > suspect KVMGT already does this, but I don't have time to confirm that at this
> I think the mutex lock and unlock of info->vgpu_lock you added in
> kvmgt_page_track_write() is the counterpart :)
> 
> > exact moment.
> > 
> > The race that was fixed in KVM was:
> > 
> >   vCPU0         vCPU1   
> >   write X
> >                  write Y
> >                  sync SPTE w/ Y
> >   sync SPTE w/ X
> > 
> > Reading the value after acquiring mmu_lock ensures that both vCPUs will see whatever
> > value "loses" the race, i.e. whatever written value is processed second ('Y' in the
> > above sequence).
> I suspect that vCPU0 may still generate a wrong SPTE if vCPU1 wrote 4
> bytes while vCPU0 wrote 8 bytes, though the chances are very low.
> 
This could happen in the sequence below:
vCPU0 updates a PTE to AABBCCDD;
vCPU1 updates a PTE to EEFFGGHH in two writes.
(each character stands for a byte)

vCPU0                  vCPU1   
write AABBCCDD
                       write GGHH
                       detect 4 bytes write and hold on sync
sync SPTE w/ AABBGGHH
                       write EEFF
                       sync SPTE w/ EEFFGGHH


Do you think the serialization work below is worthwhile?

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a915e23d61fa..51cd0ab73529 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1445,6 +1445,8 @@ struct kvm_arch {
         */
 #define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1)
        struct kvm_mmu_memory_cache split_desc_cache;
+
+       struct xarray track_writing_range;
 };

 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index fd04e618ad2d..4b271701dcf6 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -142,12 +142,14 @@ void kvm_page_track_cleanup(struct kvm *kvm)

        head = &kvm->arch.track_notifier_head;
        cleanup_srcu_struct(&head->track_srcu);
+       xa_destroy(&kvm->arch.track_writing_range);
 }

 int kvm_page_track_init(struct kvm *kvm)
 {
        struct kvm_page_track_notifier_head *head;

+       xa_init(&kvm->arch.track_writing_range);
        head = &kvm->arch.track_notifier_head;
        INIT_HLIST_HEAD(&head->track_notifier_list);
        return init_srcu_struct(&head->track_srcu);
diff --git a/arch/x86/kvm/mmu/page_track.h b/arch/x86/kvm/mmu/page_track.h
index 62f98c6c5af3..1829792b9892 100644
--- a/arch/x86/kvm/mmu/page_track.h
+++ b/arch/x86/kvm/mmu/page_track.h
@@ -47,12 +47,46 @@ static inline bool kvm_page_track_has_external_user(struct kvm *kvm) { return fa

 #endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */

-static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-                                       const u8 *new, int bytes)
+static inline void kvm_page_track_write_begin(struct kvm_vcpu *vcpu, gpa_t gpa,
+                                             int bytes)
 {
+       struct kvm *kvm = vcpu->kvm;
+       gfn_t gfn = gpa_to_gfn(gpa);
+
+       WARN_ON(gfn != gpa_to_gfn(gpa + bytes - 1));
+
+       if (!kvm_page_track_write_tracking_enabled(kvm))
+               return;
+
+retry:
+       if (xa_insert(&kvm->arch.track_writing_range, gfn, xa_mk_value(gfn),
+                     GFP_KERNEL_ACCOUNT)) {
+               cpu_relax();
+               goto retry;
+       }
+       return;
+}
+
+static inline void kvm_page_track_write_abort(struct kvm_vcpu *vcpu, gpa_t gpa,
+                                             int bytes)
+{
+       if (!kvm_page_track_write_tracking_enabled(vcpu->kvm))
+               return;
+
+       xa_erase(&vcpu->kvm->arch.track_writing_range, gpa_to_gfn(gpa));
+}
+
+static inline void kvm_page_track_write_end(struct kvm_vcpu *vcpu, gpa_t gpa,
+                                           const u8 *new, int bytes)
+{
+       if (!kvm_page_track_write_tracking_enabled(vcpu->kvm))
+               return;
+
        __kvm_page_track_write(vcpu->kvm, gpa, new, bytes);

        kvm_mmu_track_write(vcpu, gpa, new, bytes);
+
+       xa_erase(&vcpu->kvm->arch.track_writing_range, gpa_to_gfn(gpa));
 }

 #endif /* __KVM_X86_PAGE_TRACK_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 05a68d7d99fe..9b75829d5d7a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7544,10 +7544,13 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 {
        int ret;

+       kvm_page_track_write_begin(vcpu, gpa, bytes);
        ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
-       if (ret < 0)
+       if (ret < 0) {
+               kvm_page_track_write_abort(vcpu, gpa, bytes);
                return 0;
-       kvm_page_track_write(vcpu, gpa, val, bytes);
+       }
+       kvm_page_track_write_end(vcpu, gpa, val, bytes);
        return 1;
 }

@@ -7792,6 +7795,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,

        hva += offset_in_page(gpa);

+       kvm_page_track_write_begin(vcpu, gpa, bytes);
        switch (bytes) {
        case 1:
                r = emulator_try_cmpxchg_user(u8, hva, old, new);
@@ -7809,12 +7813,16 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
                BUG();
        }

-       if (r < 0)
+       if (r < 0) {
+               kvm_page_track_write_abort(vcpu, gpa, bytes);
                return X86EMUL_UNHANDLEABLE;
-       if (r)
+       }
+       if (r) {
+               kvm_page_track_write_abort(vcpu, gpa, bytes);
                return X86EMUL_CMPXCHG_FAILED;
+       }

-       kvm_page_track_write(vcpu, gpa, new, bytes);
+       kvm_page_track_write_end(vcpu, gpa, new, bytes);

        return X86EMUL_CONTINUE;