about | summary | refs | log | tree | commit | diff
path: root/virt/kvm/kvm_main.c
diff options
context:
space:
mode:
Diffstat (limited to 'virt/kvm/kvm_main.c')
-rw-r--r-- virt/kvm/kvm_main.c | 192
1 files changed, 119 insertions, 73 deletions
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 515dfe9d3bcf..fab4d3790578 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -702,30 +702,31 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
/*
* .change_pte() must be surrounded by .invalidate_range_{start,end}().
- * If mmu_notifier_count is zero, then no in-progress invalidations,
- * including this one, found a relevant memslot at start(); rechecking
- * memslots here is unnecessary. Note, a false positive (count elevated
- * by a different invalidation) is sub-optimal but functionally ok.
+ * If mmu_invalidate_in_progress is zero, then no in-progress
+ * invalidations, including this one, found a relevant memslot at
+ * start(); rechecking memslots here is unnecessary. Note, a false
+ * positive (count elevated by a different invalidation) is sub-optimal
+ * but functionally ok.
*/
WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
- if (!READ_ONCE(kvm->mmu_notifier_count))
+ if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
return;
kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
}
-void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
- unsigned long end)
+void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
+ unsigned long end)
{
/*
* The count increase must become visible at unlock time as no
* spte can be established without taking the mmu_lock and
* count is also read inside the mmu_lock critical section.
*/
- kvm->mmu_notifier_count++;
- if (likely(kvm->mmu_notifier_count == 1)) {
- kvm->mmu_notifier_range_start = start;
- kvm->mmu_notifier_range_end = end;
+ kvm->mmu_invalidate_in_progress++;
+ if (likely(kvm->mmu_invalidate_in_progress == 1)) {
+ kvm->mmu_invalidate_range_start = start;
+ kvm->mmu_invalidate_range_end = end;
} else {
/*
* Fully tracking multiple concurrent ranges has diminishing
@@ -736,10 +737,10 @@ void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
* accumulate and persist until all outstanding invalidates
* complete.
*/
- kvm->mmu_notifier_range_start =
- min(kvm->mmu_notifier_range_start, start);
- kvm->mmu_notifier_range_end =
- max(kvm->mmu_notifier_range_end, end);
+ kvm->mmu_invalidate_range_start =
+ min(kvm->mmu_invalidate_range_start, start);
+ kvm->mmu_invalidate_range_end =
+ max(kvm->mmu_invalidate_range_end, end);
}
}
@@ -752,7 +753,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
.end = range->end,
.pte = __pte(0),
.handler = kvm_unmap_gfn_range,
- .on_lock = kvm_inc_notifier_count,
+ .on_lock = kvm_mmu_invalidate_begin,
.on_unlock = kvm_arch_guest_memory_reclaimed,
.flush_on_ret = true,
.may_block = mmu_notifier_range_blockable(range),
@@ -763,7 +764,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
/*
* Prevent memslot modification between range_start() and range_end()
* so that conditionally locking provides the same result in both
- * functions. Without that guarantee, the mmu_notifier_count
+ * functions. Without that guarantee, the mmu_invalidate_in_progress
* adjustments will be imbalanced.
*
* Pairs with the decrement in range_end().
@@ -779,7 +780,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
* any given time, and the caches themselves can check for hva overlap,
* i.e. don't need to rely on memslot overlap checks for performance.
* Because this runs without holding mmu_lock, the pfn caches must use
- * mn_active_invalidate_count (see above) instead of mmu_notifier_count.
+ * mn_active_invalidate_count (see above) instead of
+ * mmu_invalidate_in_progress.
*/
gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
hva_range.may_block);
@@ -789,22 +791,22 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
return 0;
}
-void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
- unsigned long end)
+void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
+ unsigned long end)
{
/*
* This sequence increase will notify the kvm page fault that
* the page that is going to be mapped in the spte could have
* been freed.
*/
- kvm->mmu_notifier_seq++;
+ kvm->mmu_invalidate_seq++;
smp_wmb();
/*
* The above sequence increase must be visible before the
* below count decrease, which is ensured by the smp_wmb above
- * in conjunction with the smp_rmb in mmu_notifier_retry().
+ * in conjunction with the smp_rmb in mmu_invalidate_retry().
*/
- kvm->mmu_notifier_count--;
+ kvm->mmu_invalidate_in_progress--;
}
static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
@@ -816,7 +818,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
.end = range->end,
.pte = __pte(0),
.handler = (void *)kvm_null_fn,
- .on_lock = kvm_dec_notifier_count,
+ .on_lock = kvm_mmu_invalidate_end,
.on_unlock = (void *)kvm_null_fn,
.flush_on_ret = false,
.may_block = mmu_notifier_range_blockable(range),
@@ -837,7 +839,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
if (wake)
rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
- BUG_ON(kvm->mmu_notifier_count < 0);
+ BUG_ON(kvm->mmu_invalidate_in_progress < 0);
}
static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
@@ -1134,6 +1136,9 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
if (!kvm)
return ERR_PTR(-ENOMEM);
+ /* KVM is pinned via open("/dev/kvm"), the fd passed to this ioctl(). */
+ __module_get(kvm_chardev_ops.owner);
+
KVM_MMU_LOCK_INIT(kvm);
mmgrab(current->mm);
kvm->mm = current->mm;
@@ -1193,8 +1198,6 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
goto out_err_no_arch_destroy_vm;
}
- kvm->max_halt_poll_ns = halt_poll_ns;
-
r = kvm_arch_init_vm(kvm, type);
if (r)
goto out_err_no_arch_destroy_vm;
@@ -1211,9 +1214,17 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
if (r)
goto out_err_no_mmu_notifier;
+ r = kvm_coalesced_mmio_init(kvm);
+ if (r < 0)
+ goto out_no_coalesced_mmio;
+
+ r = kvm_create_vm_debugfs(kvm, fdname);
+ if (r)
+ goto out_err_no_debugfs;
+
r = kvm_arch_post_init_vm(kvm);
if (r)
- goto out_err_mmu_notifier;
+ goto out_err;
mutex_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list);
@@ -1222,25 +1233,13 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
preempt_notifier_inc();
kvm_init_pm_notifier(kvm);
- /*
- * When the fd passed to this ioctl() is opened it pins the module,
- * but try_module_get() also prevents getting a reference if the module
- * is in MODULE_STATE_GOING (e.g. if someone ran "rmmod --wait").
- */
- if (!try_module_get(kvm_chardev_ops.owner)) {
- r = -ENODEV;
- goto out_err_mmu_notifier;
- }
-
- r = kvm_create_vm_debugfs(kvm, fdname);
- if (r)
- goto out_err;
-
return kvm;
out_err:
- module_put(kvm_chardev_ops.owner);
-out_err_mmu_notifier:
+ kvm_destroy_vm_debugfs(kvm);
+out_err_no_debugfs:
+ kvm_coalesced_mmio_free(kvm);
+out_no_coalesced_mmio:
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
if (kvm->mmu_notifier.ops)
mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
@@ -1259,6 +1258,7 @@ out_err_no_irq_srcu:
out_err_no_srcu:
kvm_arch_free_vm(kvm);
mmdrop(current->mm);
+ module_put(kvm_chardev_ops.owner);
return ERR_PTR(r);
}
@@ -2516,7 +2516,7 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
{
unsigned int flags = FOLL_HWPOISON;
struct page *page;
- int npages = 0;
+ int npages;
might_sleep();
@@ -3375,9 +3375,6 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
if (val < grow_start)
val = grow_start;
- if (val > vcpu->kvm->max_halt_poll_ns)
- val = vcpu->kvm->max_halt_poll_ns;
-
vcpu->halt_poll_ns = val;
out:
trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
@@ -3407,10 +3404,8 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
int ret = -EINTR;
int idx = srcu_read_lock(&vcpu->kvm->srcu);
- if (kvm_arch_vcpu_runnable(vcpu)) {
- kvm_make_request(KVM_REQ_UNHALT, vcpu);
+ if (kvm_arch_vcpu_runnable(vcpu))
goto out;
- }
if (kvm_cpu_has_pending_timer(vcpu))
goto out;
if (signal_pending(current))
@@ -3483,6 +3478,24 @@ static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
}
}
+static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ if (kvm->override_halt_poll_ns) {
+ /*
+ * Ensure kvm->max_halt_poll_ns is not read before
+ * kvm->override_halt_poll_ns.
+ *
+ * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
+ */
+ smp_rmb();
+ return READ_ONCE(kvm->max_halt_poll_ns);
+ }
+
+ return READ_ONCE(halt_poll_ns);
+}
+
/*
* Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc... If halt
* polling is enabled, busy wait for a short time before blocking to avoid the
@@ -3491,12 +3504,18 @@ static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
*/
void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
{
+ unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
- bool do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
ktime_t start, cur, poll_end;
bool waited = false;
+ bool do_halt_poll;
u64 halt_ns;
+ if (vcpu->halt_poll_ns > max_halt_poll_ns)
+ vcpu->halt_poll_ns = max_halt_poll_ns;
+
+ do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
+
start = cur = poll_end = ktime_get();
if (do_halt_poll) {
ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
@@ -3535,18 +3554,21 @@ out:
update_halt_poll_stats(vcpu, start, poll_end, !waited);
if (halt_poll_allowed) {
+ /* Recompute the max halt poll time in case it changed. */
+ max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
+
if (!vcpu_valid_wakeup(vcpu)) {
shrink_halt_poll_ns(vcpu);
- } else if (vcpu->kvm->max_halt_poll_ns) {
+ } else if (max_halt_poll_ns) {
if (halt_ns <= vcpu->halt_poll_ns)
;
/* we had a long block, shrink polling */
else if (vcpu->halt_poll_ns &&
- halt_ns > vcpu->kvm->max_halt_poll_ns)
+ halt_ns > max_halt_poll_ns)
shrink_halt_poll_ns(vcpu);
/* we had a short halt and our poll time is too small */
- else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
- halt_ns < vcpu->kvm->max_halt_poll_ns)
+ else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
+ halt_ns < max_halt_poll_ns)
grow_halt_poll_ns(vcpu);
} else {
vcpu->halt_poll_ns = 0;
@@ -4378,7 +4400,7 @@ void kvm_unregister_device_ops(u32 type)
static int kvm_ioctl_create_device(struct kvm *kvm,
struct kvm_create_device *cd)
{
- const struct kvm_device_ops *ops = NULL;
+ const struct kvm_device_ops *ops;
struct kvm_device *dev;
bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
int type;
@@ -4473,7 +4495,13 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
case KVM_CAP_NR_MEMSLOTS:
return KVM_USER_MEM_SLOTS;
case KVM_CAP_DIRTY_LOG_RING:
-#ifdef CONFIG_HAVE_KVM_DIRTY_RING
+#ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
+ return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
+#else
+ return 0;
+#endif
+ case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
+#ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
#else
return 0;
@@ -4575,9 +4603,23 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
return -EINVAL;
kvm->max_halt_poll_ns = cap->args[0];
+
+ /*
+ * Ensure kvm->override_halt_poll_ns does not become visible
+ * before kvm->max_halt_poll_ns.
+ *
+ * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
+ */
+ smp_wmb();
+ kvm->override_halt_poll_ns = true;
+
return 0;
}
case KVM_CAP_DIRTY_LOG_RING:
+ case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
+ if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap))
+ return -EINVAL;
+
return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
default:
return kvm_vm_ioctl_enable_cap(kvm, cap);
@@ -4832,6 +4874,12 @@ struct compat_kvm_clear_dirty_log {
};
};
+long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ return -ENOTTY;
+}
+
static long kvm_vm_compat_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -4840,6 +4888,11 @@ static long kvm_vm_compat_ioctl(struct file *filp,
if (kvm->mm != current->mm || kvm->vm_dead)
return -EIO;
+
+ r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
+ if (r != -ENOTTY)
+ return r;
+
switch (ioctl) {
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
case KVM_CLEAR_DIRTY_LOG: {
@@ -4913,11 +4966,6 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
goto put_fd;
}
-#ifdef CONFIG_KVM_MMIO
- r = kvm_coalesced_mmio_init(kvm);
- if (r < 0)
- goto put_kvm;
-#endif
file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
if (IS_ERR(file)) {
r = PTR_ERR(file);
@@ -5396,6 +5444,7 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file,
int (*get)(void *, u64 *), int (*set)(void *, u64),
const char *fmt)
{
+ int ret;
struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
inode->i_private;
@@ -5407,15 +5456,13 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file,
if (!kvm_get_kvm_safe(stat_data->kvm))
return -ENOENT;
- if (simple_attr_open(inode, file, get,
- kvm_stats_debugfs_mode(stat_data->desc) & 0222
- ? set : NULL,
- fmt)) {
+ ret = simple_attr_open(inode, file, get,
+ kvm_stats_debugfs_mode(stat_data->desc) & 0222
+ ? set : NULL, fmt);
+ if (ret)
kvm_put_kvm(stat_data->kvm);
- return -ENOMEM;
- }
- return 0;
+ return ret;
}
static int kvm_debugfs_release(struct inode *inode, struct file *file)
@@ -5884,7 +5931,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
r = kvm_async_pf_init();
if (r)
- goto out_free_5;
+ goto out_free_4;
kvm_chardev_ops.owner = module;
@@ -5908,10 +5955,9 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
out_unreg:
kvm_async_pf_deinit();
-out_free_5:
+out_free_4:
for_each_possible_cpu(cpu)
free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
-out_free_4:
kmem_cache_destroy(kvm_vcpu_cache);
out_free_3:
unregister_reboot_notifier(&kvm_reboot_notifier);