[Intel-gfx] [RFC PATCH v3 11/19] KVM: x86: nSVM: implement shadowing of AVIC's physical id table

Maxim Levitsky mlevitsk at redhat.com
Wed Apr 27 20:03:06 UTC 2022


Implement the shadow physical ID table and its write-tracking code,
which will soon be used for nested AVIC.

Signed-off-by: Maxim Levitsky <mlevitsk at redhat.com>
---
 arch/x86/kvm/svm/avic.c | 461 +++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/svm/svm.h  |  71 +++++++
 2 files changed, 524 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index e5cbbb97fbab6..f462b7e48e3ca 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -51,6 +51,433 @@ static u32 next_vm_id = 0;
 static bool next_vm_id_wrapped = 0;
 static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
 
+
+/* Return the vCPU kvm_get_vcpu_by_id() finds for @l1_apicid; -1 is not a valid id. */
+static inline struct kvm_vcpu *avic_vcpu_by_l1_apicid(struct kvm *kvm, int l1_apicid)
+{
+	WARN_ON(l1_apicid == -1);
+	return kvm_get_vcpu_by_id(kvm, l1_apicid);
+}
+
+/*
+ * Rewrite the APIC id field of shadow entry @n: host APIC id of the CPU stored
+ * in the matching vCPU's ->cpu, or -1 if @new_l1_apicid is -1 or has no vCPU.
+ */
+static void avic_physid_shadow_entry_set_vcpu(struct kvm *kvm, struct avic_physid_table *t,
+					      int n, int new_l1_apicid)
+{
+	struct avic_physid_entry_descr *descr = &t->entries[n];
+	const u64 cur_sentry = READ_ONCE(*descr->sentry);
+	u64 new_sentry = cur_sentry;
+	struct kvm_vcpu *target = NULL;
+	int host_apicid = -1;
+
+	WARN_ON(!test_bit(n, t->valid_entires));
+
+	if (!list_empty(&descr->link))
+		list_del_init(&descr->link);
+
+	if (new_l1_apicid != -1)
+		target = avic_vcpu_by_l1_apicid(kvm, new_l1_apicid);
+	if (target)
+		host_apicid = kvm_cpu_get_apicid(target->cpu);
+
+	physid_entry_set_apicid(&new_sentry, host_apicid);
+	trace_kvm_avic_physid_update_vcpu_guest(new_l1_apicid, host_apicid);
+
+	if (new_sentry != cur_sentry)
+		WRITE_ONCE(*descr->sentry, new_sentry);
+}
+
+/*
+ * Build shadow entry @n from guest entry @gentry: pin the guest's APIC backing
+ * page (or use the dummy page if the GPA is bad) and, for a real page, fill in
+ * the host APIC id. Entries that read as INVALID_BACKING_PAGE are skipped.
+ */
+static void avic_physid_shadow_entry_create(struct kvm *kvm, struct avic_physid_table *t,
+					    int n, u64 gentry)
+{
+	struct avic_physid_entry_descr *descr = &t->entries[n];
+	u64 guest_page_gpa = physid_entry_get_backing_table(gentry);
+	int l1_apic_id = physid_entry_get_apicid(gentry);
+	struct page *guest_page;
+	hpa_t shadow_page_hpa;
+	u64 sentry = 0;
+
+	if (guest_page_gpa == INVALID_BACKING_PAGE)
+		return;
+
+	/* Pin the APIC backing page */
+	guest_page = gfn_to_page(kvm, gpa_to_gfn(guest_page_gpa));
+	if (!is_error_page(guest_page))
+		shadow_page_hpa = page_to_phys(guest_page);
+	else
+		/* Invalid GPA in the guest entry - point to the dummy page */
+		shadow_page_hpa = t->dummy_page_hpa;
+
+	physid_entry_set_backing_table(&sentry, shadow_page_hpa);
+
+	descr->gentry = gentry;
+	*descr->sentry = sentry;
+
+	WARN_ON(test_and_set_bit(n, t->valid_entires));
+
+	if (shadow_page_hpa != t->dummy_page_hpa)
+		avic_physid_shadow_entry_set_vcpu(kvm, t, n, l1_apic_id);
+}
+
+/*
+ * Drop shadow entry @n: unpin its backing page unless it is the dummy page.
+ */
+static void avic_physid_shadow_entry_remove(struct kvm *kvm, struct avic_physid_table *t,
+					    int n)
+{
+	struct avic_physid_entry_descr *descr = &t->entries[n];
+	hpa_t pinned_hpa;
+
+	WARN_ON(!test_and_clear_bit(n, t->valid_entires));
+
+	/* Release the APIC backing page */
+	pinned_hpa = physid_entry_get_backing_table(*descr->sentry);
+	if (pinned_hpa != t->dummy_page_hpa)
+		kvm_release_pfn_dirty(pinned_hpa >> PAGE_SHIFT);
+
+	if (!list_empty(&descr->link))
+		list_del_init(&descr->link);
+
+	descr->gentry = 0;
+	*descr->sentry = 0;
+}
+
+
+/*
+ * Add (@enable) or remove write tracking for the guest page that holds @t.
+ * Returns false if no memslot contains t->gfn.
+ */
+static bool
+avic_physid_shadow_table_setup_write_tracking(struct kvm *kvm,
+					      struct avic_physid_table *t,
+					      bool enable)
+{
+	struct kvm_memory_slot *slot;
+
+	write_lock(&kvm->mmu_lock);
+	slot = gfn_to_memslot(kvm, t->gfn);
+	if (slot && enable)
+		kvm_slot_page_track_add_page(kvm, slot, t->gfn, KVM_PAGE_TRACK_WRITE);
+	else if (slot)
+		kvm_slot_page_track_remove_page(kvm, slot, t->gfn, KVM_PAGE_TRACK_WRITE);
+	write_unlock(&kvm->mmu_lock);
+
+	return slot != NULL;
+}
+
+/* Untrack @t's guest page and drop all of its shadow entries; no-op if nentries is 0. */
+static void avic_physid_shadow_table_erase(struct kvm *kvm, struct avic_physid_table *t)
+{
+	int idx;
+
+	if (t->nentries == 0)
+		return;
+
+	avic_physid_shadow_table_setup_write_tracking(kvm, t, false);
+
+	for_each_set_bit(idx, t->valid_entires, AVIC_MAX_PHYSICAL_ID_COUNT)
+		avic_physid_shadow_entry_remove(kvm, t, idx);
+
+	t->nentries = 0;
+	t->flood_count = 0;
+}
+
+/*
+ * Allocate an empty shadow table for the guest table at @gfn and link it into
+ * the per-VM table list (hashing is left to the caller). NULL on failure.
+ */
+static struct avic_physid_table *
+avic_physid_shadow_table_alloc(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+	struct avic_physid_table *t;
+	u64 *shadow_entries;
+	int i;
+
+	if (kvm_page_track_write_tracking_enable(kvm))
+		return NULL;
+
+	lockdep_assert_held(&kvm_svm->avic.tables_lock);
+
+	t = kzalloc(sizeof(*t), GFP_KERNEL_ACCOUNT);
+	if (!t)
+		return NULL;
+
+	t->shadow_table = alloc_page(GFP_KERNEL_ACCOUNT|__GFP_ZERO);
+	if (!t->shadow_table) {
+		kfree(t);
+		return NULL;
+	}
+
+	shadow_entries = page_address(t->shadow_table);
+	t->shadow_table_hpa = __sme_set(page_to_phys(t->shadow_table));
+
+	for (i = 0; i < ARRAY_SIZE(t->entries); i++) {
+		struct avic_physid_entry_descr *descr = &t->entries[i];
+
+		descr->sentry = &shadow_entries[i];
+		descr->gentry = 0;
+		INIT_LIST_HEAD(&descr->link);
+	}
+
+	t->gfn = gfn;
+	t->refcount = 1;
+	t->dummy_page_hpa = page_to_phys(kvm_svm->avic.invalid_physid_page);
+
+	list_add_tail(&t->link, &kvm_svm->avic.physid_tables);
+	trace_kvm_avic_physid_table_alloc(gfn_to_gpa(gfn));
+	return t;
+}
+
+/*
+ * Destroy @t once its last reference is gone; avic.tables_lock must be held.
+ */
+static void avic_physid_shadow_table_free(struct kvm *kvm, struct avic_physid_table *t)
+{
+	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+
+	lockdep_assert_held(&kvm_svm->avic.tables_lock);
+	WARN_ON(t->refcount);
+
+	avic_physid_shadow_table_erase(kvm, t);
+	trace_kvm_avic_physid_table_free(gfn_to_gpa(t->gfn));
+
+	hlist_del(&t->hash_link);
+	list_del(&t->link);
+	__free_page(t->shadow_table);
+	kfree(t);
+}
+
+static struct avic_physid_table *
+__avic_physid_shadow_table_get(struct hlist_head *head, gfn_t gfn)
+{
+	struct avic_physid_table *t;
+
+	hlist_for_each_entry(t, head, hash_link)	/* leaves t NULL on a miss */
+		if (t->gfn == gfn)
+			break;
+	if (t)
+		t->refcount++;	/* this reference is handed to the caller */
+	return t;
+}
+
+/*
+ * Return the shadow table for the guest table at @gfn with a reference held,
+ * creating and hashing a new one on first use. Returns NULL on failure.
+ */
+struct avic_physid_table *
+avic_physid_shadow_table_get(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+	struct hlist_head *bucket = &kvm_svm->avic.physid_gpa_hash[avic_physid_hash(gfn)];
+	struct avic_physid_table *t;
+
+	mutex_lock(&kvm_svm->avic.tables_lock);
+	t = __avic_physid_shadow_table_get(bucket, gfn);
+	if (!t) {
+		t = avic_physid_shadow_table_alloc(vcpu->kvm, gfn);
+		if (t)
+			hlist_add_head(&t->hash_link, bucket);
+	}
+	mutex_unlock(&kvm_svm->avic.tables_lock);
+	return t;
+}
+
+/* Drop one reference on @t and free the table when it was the last one. */
+static void __avic_physid_shadow_table_put(struct kvm *kvm, struct avic_physid_table *t)
+{
+	WARN_ON(t->refcount <= 0);
+	if (!--t->refcount)
+		avic_physid_shadow_table_free(kvm, t);
+}
+
+/* Drop one reference on @t. Takes avic.tables_lock itself, so the caller
+ * must not already hold it. */
+void avic_physid_shadow_table_put(struct kvm *kvm, struct avic_physid_table *t)
+{
+	mutex_lock(&to_kvm_svm(kvm)->avic.tables_lock);
+	__avic_physid_shadow_table_put(kvm, t);
+	mutex_unlock(&to_kvm_svm(kvm)->avic.tables_lock);
+}
+
+/* Forget everything shadowed for @t; avic.tables_lock must be held. */
+static void avic_physid_shadow_table_invalidate(struct kvm *kvm,
+						struct avic_physid_table *t)
+{
+	lockdep_assert_held(&to_kvm_svm(kvm)->avic.tables_lock);
+
+	avic_physid_shadow_table_erase(kvm, t);
+}
+
+/*
+ * Shadow guest entries [t->nentries, @nentries) of @t; returns 0 or -EFAULT.
+ */
+int avic_physid_shadow_table_sync(struct kvm_vcpu *vcpu,
+				  struct avic_physid_table *t, int nentries)
+{
+	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+	struct kvm_host_map map;
+	u64 *gentries;
+	int i, ret = 0;
+
+	mutex_lock(&kvm_svm->avic.tables_lock);
+
+	if (t->nentries >= nentries)
+		goto out_unlock;
+
+	trace_kvm_avic_physid_table_reload(gfn_to_gpa(t->gfn), t->nentries, nentries);
+
+	if (!t->nentries &&
+	    !avic_physid_shadow_table_setup_write_tracking(vcpu->kvm, t, true)) {
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+
+	if (kvm_vcpu_map(vcpu, t->gfn, &map)) {
+		/* Table stays empty: undo the tracking, erase() won't */
+		if (!t->nentries)
+			avic_physid_shadow_table_setup_write_tracking(vcpu->kvm, t, false);
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+	gentries = (u64 *)map.hva;
+	for (i = t->nentries ; i < nentries ; i++)
+		avic_physid_shadow_entry_create(vcpu->kvm, t, i, gentries[i]);
+
+	/* publish the table before setting nentries */
+	wmb();
+	WRITE_ONCE(t->nentries, nentries);
+	kvm_vcpu_unmap(vcpu, &map, false);
+out_unlock:
+	mutex_unlock(&kvm_svm->avic.tables_lock);
+	return ret;
+}
+
+/*
+ * Page-track write notifier: apply a guest write to the page's shadow table.
+ */
+static void avic_physid_shadow_table_track_write(struct kvm_vcpu *vcpu,
+						 gpa_t gpa,
+						 const u8 *new,
+						 int bytes,
+						 struct kvm_page_track_notifier_node *node)
+{
+	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+	struct hlist_head *hlist;
+	struct avic_physid_table *t;
+	gfn_t gfn = gpa_to_gfn(gpa);
+	unsigned int page_offset = offset_in_page(gpa);
+	unsigned int entry_offset = page_offset & 0x7;
+	int first = page_offset / sizeof(u64);
+	int last = (page_offset + bytes - 1) / sizeof(u64);
+	u64 new_entry, old_entry;
+	int l1_apic_id;
+
+	if (WARN_ON_ONCE(bytes == 0))
+		return;
+
+	mutex_lock(&kvm_svm->avic.tables_lock);
+
+	hlist = &kvm_svm->avic.physid_gpa_hash[avic_physid_hash(gfn)];
+	t = __avic_physid_shadow_table_get(hlist, gfn);
+
+	if (!t)
+		goto out_unlock;
+
+	trace_kvm_avic_physid_table_write(gpa, bytes);
+
+	/*
+	 * Update policy:
+	 *
+	 * A write is applied in place only if it touches a single valid entry
+	 * and leaves that entry's backing page unchanged, i.e. it changes only
+	 * the is_running and/or apic_id fields.
+	 *
+	 * Writes past the shadowed entries are ignored, so that the guest can
+	 * append entries to the table during CPU hotplug.
+	 *
+	 * Any other write is unexpected while the table is in use: the table
+	 * is invalidated and re-read on the next VM entry that uses it.
+	 */
+
+	if (first >= t->nentries)
+		goto out_table_put;
+
+	if (first != last || !test_bit(first, t->valid_entires))
+		goto invalidate;
+
+	/* Overlay the written bytes on the cached guest entry */
+	old_entry = t->entries[first].gentry;
+	new_entry = old_entry;
+	memcpy(((u8 *)&new_entry) + entry_offset, new, bytes);
+
+	/* If the backing page changed, invalidate the whole table */
+	if (physid_entry_get_backing_table(old_entry) !=
+				physid_entry_get_backing_table(new_entry))
+		goto invalidate;
+
+	/*
+	 * Detect write flooding of a page that may no longer be in use as a
+	 * physid table: count the writes made while no vCPU uses the table.
+	 */
+	if (!atomic_read(&t->usecount)) {
+		if (++t->flood_count > t->nentries * AVIC_PHYSID_FLOOD_COUNT)
+			goto invalidate;
+	} else {
+		t->flood_count = 0;
+	}
+
+	/* Update the backing cpu */
+	l1_apic_id = physid_entry_get_apicid(new_entry);
+	avic_physid_shadow_entry_set_vcpu(vcpu->kvm, t, first, l1_apic_id);
+	t->entries[first].gentry = new_entry;
+	goto out_table_put;
+invalidate:
+	avic_physid_shadow_table_invalidate(vcpu->kvm, t);
+out_table_put:
+	__avic_physid_shadow_table_put(vcpu->kvm, t);
+out_unlock:
+	mutex_unlock(&kvm_svm->avic.tables_lock);
+}
+
+/*
+ * Memslot flush notifier: invalidate tables that reference pages in @slot.
+ */
+static void avic_physid_shadow_table_flush_memslot(struct kvm *kvm,
+						   struct kvm_memory_slot *slot,
+						   struct kvm_page_track_notifier_node *node)
+{
+	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+	struct avic_physid_table *t, *next;
+	int i;
+
+	mutex_lock(&kvm_svm->avic.tables_lock);
+	list_for_each_entry_safe(t, next, &kvm_svm->avic.physid_tables, link) {
+		bool stale = gfn_in_memslot(slot, t->gfn);
+
+		/* Otherwise look for a shadowed backing page inside @slot */
+		for_each_set_bit(i, t->valid_entires, AVIC_MAX_PHYSICAL_ID_COUNT) {
+			gpa_t gpa;
+
+			if (stale)
+				break;
+			gpa = physid_entry_get_backing_table(t->entries[i].gentry);
+			stale = gfn_in_memslot(slot, gpa_to_gfn(gpa));
+		}
+		if (stale)
+			avic_physid_shadow_table_invalidate(kvm, t);
+	}
+	mutex_unlock(&kvm_svm->avic.tables_lock);
+}
+
+
 /*
  * This is a wrapper of struct amd_iommu_ir_data.
  */
@@ -113,18 +540,22 @@ void avic_vm_destroy(struct kvm *kvm)
 		__free_page(avic->logical_id_table_page);
 	if (avic->physical_id_table_page)
 		__free_page(avic->physical_id_table_page);
+	if (avic->invalid_physid_page)
+		__free_page(avic->invalid_physid_page);
 
 	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
 	hash_del(&avic->hnode);
 	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+
+	kvm_page_track_unregister_notifier(kvm, &avic->write_tracker);
 }
 
 int avic_vm_init(struct kvm *kvm)
 {
 	unsigned long flags;
 	int err = -ENOMEM;
-	struct page *p_page;
-	struct page *l_page;
+	struct page *page;
 	struct kvm_svm_avic *avic = &to_kvm_svm(kvm)->avic;
 	u32 vm_id;
 
@@ -132,18 +563,25 @@ int avic_vm_init(struct kvm *kvm)
 		return 0;
 
 	/* Allocating physical APIC ID table (4KB) */
-	p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-	if (!p_page)
+	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (!page)
 		goto free_avic;
 
-	avic->physical_id_table_page = p_page;
+	avic->physical_id_table_page = page;
 
 	/* Allocating logical APIC ID table (4KB) */
-	l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-	if (!l_page)
+	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (!page)
 		goto free_avic;
 
-	avic->logical_id_table_page = l_page;
+	avic->logical_id_table_page = page;
+
+	/* Allocating a dummy page for invalid nested avic physid entries */
+	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (!page)
+		goto free_avic;
+
+	avic->invalid_physid_page = page;
 
 	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
  again:
@@ -165,6 +603,13 @@ int avic_vm_init(struct kvm *kvm)
 	hash_add(svm_vm_data_hash, &avic->hnode, avic->vm_id);
 	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
 
+	mutex_init(&avic->tables_lock);
+	INIT_LIST_HEAD(&avic->physid_tables);
+
+	avic->write_tracker.track_write = avic_physid_shadow_table_track_write;
+	avic->write_tracker.track_flush_slot = avic_physid_shadow_table_flush_memslot;
+
+	kvm_page_track_register_notifier(kvm, &avic->write_tracker);
 	return 0;
 
 free_avic:
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index dfca4c06e2071..fc15e1f938793 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -18,6 +18,7 @@
 #include <linux/kvm_types.h>
 #include <linux/kvm_host.h>
 #include <linux/bits.h>
+#include <linux/hash.h>
 
 #include <asm/svm.h>
 #include <asm/sev-common.h>
@@ -89,13 +90,33 @@ struct kvm_sev_info {
 };
 
 
+#define AVIC_PHYSID_HASH_SHIFT 8
+#define AVIC_PHYSID_HASH_SIZE (1 << AVIC_PHYSID_HASH_SHIFT)
+
 struct kvm_svm_avic {
 	u32 vm_id;
 	struct page *logical_id_table_page;
 	struct page *physical_id_table_page;
 	struct hlist_node hnode;
+
+	/* Held while shadow tables are looked up, created, modified or freed */
+	struct mutex tables_lock;
+	/* List of all shadow tables */
+	struct list_head physid_tables;
+	/* Hash of the shadow tables, keyed by the gfn of the guest table */
+	struct hlist_head physid_gpa_hash[AVIC_PHYSID_HASH_SIZE];
+
+	/* Notifier for guest writes to tracked pages and for memslot flushes */
+	struct kvm_page_track_notifier_node write_tracker;
+	/* Dummy backing page for guest entries whose backing page GPA is bad */
+	struct page *invalid_physid_page;
 };
 
+static __always_inline unsigned int avic_physid_hash(gfn_t gfn)
+{
+	return hash_64(gfn, AVIC_PHYSID_HASH_SHIFT);	/* physid_gpa_hash[] index */
+}
+
 struct kvm_svm {
 	struct kvm kvm;
 	struct kvm_svm_avic avic;
@@ -147,6 +168,49 @@ struct vmcb_ctrl_area_cached {
 	u8 reserved_sw[32];
 };
 
+struct avic_physid_entry_descr {
+	struct list_head link;
+
+	/* Cached copy of the guest's table entry */
+	u64  gentry;
+
+	/* Pointer to the matching entry in the shadow table page */
+	u64 *sentry;
+};
+
+#define AVIC_PHYSID_FLOOD_COUNT 1000
+
+struct avic_physid_table {
+	/* Link in kvm_svm_avic.physid_tables, the list of all shadow tables */
+	struct list_head link;
+
+	/* Link in a kvm_svm_avic.physid_gpa_hash bucket */
+	struct hlist_node hash_link;
+
+	/* Guest frame number of the table in guest memory */
+	gfn_t gfn;
+
+	/* Number of guest entries synced so far; bitmap of those with a shadow entry */
+	int nentries;
+	DECLARE_BITMAP(valid_entires, AVIC_MAX_PHYSICAL_ID_COUNT);
+
+	struct avic_physid_entry_descr entries[AVIC_MAX_PHYSICAL_ID_COUNT];
+
+	/* Shadow table page, its host physical address, and the dummy page's HPA */
+	struct page *shadow_table;
+	hpa_t shadow_table_hpa;
+	hpa_t dummy_page_hpa;
+
+	/* Reference count; the table is freed when it drops to zero */
+	int refcount;
+
+	/* Number of vCPUs that are in guest mode and use this table */
+	atomic_t usecount;
+
+	/* Number of writes to this page seen while usecount was zero */
+	int flood_count;
+};
+
 struct svm_nested_state {
 	struct kvm_vmcb_info vmcb02;
 	u64 hsave_msr;
@@ -628,6 +692,13 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
 void avic_ring_doorbell(struct kvm_vcpu *vcpu);
 unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu);
 
+struct avic_physid_table *
+avic_physid_shadow_table_get(struct kvm_vcpu *vcpu, gfn_t gfn);
+void avic_physid_shadow_table_put(struct kvm *kvm, struct avic_physid_table *t);
+int avic_physid_shadow_table_sync(struct kvm_vcpu *vcpu,
+				  struct avic_physid_table *t, int nentries);
+
+
 #define INVALID_BACKING_PAGE	(~(u64)0)
 
 static inline u64 physid_entry_get_backing_table(u64 entry)
-- 
2.26.3



More information about the Intel-gfx mailing list