diff options
-rw-r--r-- | Documentation/virtual/kvm/api.txt | 63 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_page_track.h | 1 | ||||
-rw-r--r-- | arch/x86/kvm/i8259.c | 3 | ||||
-rw-r--r-- | arch/x86/kvm/ioapic.c | 3 | ||||
-rw-r--r-- | arch/x86/kvm/page_track.c | 8 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 3 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 44 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 7 | ||||
-rw-r--r-- | drivers/ptp/ptp_kvm.c | 5 | ||||
-rw-r--r-- | include/linux/kvm_host.h | 4 | ||||
-rwxr-xr-x | tools/kvm/kvm_stat/kvm_stat | 381 | ||||
-rw-r--r-- | tools/kvm/kvm_stat/kvm_stat.txt | 26 | ||||
-rw-r--r-- | virt/kvm/eventfd.c | 3 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 44 |
14 files changed, 496 insertions, 99 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index e601c8f01fd9..753e88e5eb2a 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3400,6 +3400,69 @@ struct kvm_ppc_resize_hpt { __u32 pad; }; +4.104 KVM_X86_GET_MCE_CAP_SUPPORTED + +Capability: KVM_CAP_MCE +Architectures: x86 +Type: system ioctl +Parameters: u64 mce_cap (out) +Returns: 0 on success, -1 on error + +Returns supported MCE capabilities. The u64 mce_cap parameter +has the same format as the MSR_IA32_MCG_CAP register. Supported +capabilities will have the corresponding bits set. + +4.105 KVM_X86_SETUP_MCE + +Capability: KVM_CAP_MCE +Architectures: x86 +Type: vcpu ioctl +Parameters: u64 mcg_cap (in) +Returns: 0 on success, + -EFAULT if u64 mcg_cap cannot be read, + -EINVAL if the requested number of banks is invalid, + -EINVAL if requested MCE capability is not supported. + +Initializes MCE support for use. The u64 mcg_cap parameter +has the same format as the MSR_IA32_MCG_CAP register and +specifies which capabilities should be enabled. The maximum +supported number of error-reporting banks can be retrieved when +checking for KVM_CAP_MCE. The supported capabilities can be +retrieved with KVM_X86_GET_MCE_CAP_SUPPORTED. + +4.106 KVM_X86_SET_MCE + +Capability: KVM_CAP_MCE +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_x86_mce (in) +Returns: 0 on success, + -EFAULT if struct kvm_x86_mce cannot be read, + -EINVAL if the bank number is invalid, + -EINVAL if VAL bit is not set in status field. + +Inject a machine check error (MCE) into the guest. The input +parameter is: + +struct kvm_x86_mce { + __u64 status; + __u64 addr; + __u64 misc; + __u64 mcg_status; + __u8 bank; + __u8 pad1[7]; + __u64 pad2[3]; +}; + +If the MCE being reported is an uncorrected error, KVM will +inject it as an MCE exception into the guest. If the guest +MCG_STATUS register reports that an MCE is in progress, KVM +causes an KVM_EXIT_SHUTDOWN vmexit. + +Otherwise, if the MCE is a corrected error, KVM will just +store it in the corresponding bank (provided this bank is +not holding a previously reported uncorrected error). + 5. The kvm_run structure ------------------------ diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index d74747b031ec..c4eda791f877 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -46,6 +46,7 @@ struct kvm_page_track_notifier_node { }; void kvm_page_track_init(struct kvm *kvm); +void kvm_page_track_cleanup(struct kvm *kvm); void kvm_page_track_free_memslot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont); diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 73ea24d4f119..047b17a26269 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -657,6 +657,9 @@ void kvm_pic_destroy(struct kvm *kvm) { struct kvm_pic *vpic = kvm->arch.vpic; + if (!vpic) + return; + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 6e219e5c07d2..289270a6aecb 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -635,6 +635,9 @@ void kvm_ioapic_destroy(struct kvm *kvm) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; + if (!ioapic) + return; + cancel_delayed_work_sync(&ioapic->eoi_inject); kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); kvm->arch.vioapic = NULL; diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index 37942e419c32..60168cdd0546 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -160,6 +160,14 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]); } +void kvm_page_track_cleanup(struct kvm *kvm) +{ + struct kvm_page_track_notifier_head *head; + + head = &kvm->arch.track_notifier_head; + cleanup_srcu_struct(&head->track_srcu); +} + void kvm_page_track_init(struct kvm *kvm) { struct kvm_page_track_notifier_head *head; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d1efe2c62b3f..5fba70646c32 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1379,6 +1379,9 @@ static void avic_vm_destroy(struct kvm *kvm) unsigned long flags; struct kvm_arch *vm_data = &kvm->arch; + if (!avic) + return; + avic_free_vm_id(vm_data->avic_vm_id); if (vm_data->avic_logical_id_table_page) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 98e82ee1e699..2ee00dbbbd51 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1239,6 +1239,11 @@ static inline bool cpu_has_vmx_invvpid_global(void) return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; } +static inline bool cpu_has_vmx_invvpid(void) +{ + return vmx_capability.vpid & VMX_VPID_INVVPID_BIT; +} + static inline bool cpu_has_vmx_ept(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -2753,7 +2758,6 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_DESC | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | - SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | @@ -2781,10 +2785,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) * though it is treated as global context. The alternative is * not failing the single-context invvpid, and it is worse. */ - if (enable_vpid) + if (enable_vpid) { + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_VPID; vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | VMX_VPID_EXTENT_SUPPORTED_MASK; - else + } else vmx->nested.nested_vmx_vpid_caps = 0; if (enable_unrestricted_guest) @@ -4024,6 +4030,12 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid); } +static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu) +{ + if (enable_ept) + vmx_flush_tlb(vcpu); +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -6517,8 +6529,10 @@ static __init int hardware_setup(void) if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); - if (!cpu_has_vmx_vpid()) + if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || + !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; + if (!cpu_has_vmx_shadow_vmcs()) enable_shadow_vmcs = 0; if (enable_shadow_vmcs) @@ -8501,7 +8515,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu); else { - WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason); + vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", + exit_reason); kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -8547,6 +8562,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) } else { sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + vmx_flush_tlb_ept_only(vcpu); } vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); @@ -8572,8 +8588,10 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) */ if (!is_guest_mode(vcpu) || !nested_cpu_has2(get_vmcs12(&vmx->vcpu), - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { vmcs_write64(APIC_ACCESS_ADDR, hpa); + vmx_flush_tlb_ept_only(vcpu); + } } static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) @@ -9974,7 +9992,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control; - bool nested_ept_enabled = false; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); @@ -10121,8 +10138,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_intr_status); } - nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0; - /* * Write an illegal value to APIC_ACCESS_ADDR. Later, * nested_get_vmcs12_pages will either fix it up or @@ -10255,6 +10270,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (nested_cpu_has_ept(vmcs12)) { kvm_mmu_unload(vcpu); nested_ept_init_mmu_context(vcpu); + } else if (nested_cpu_has2(vmcs12, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + vmx_flush_tlb_ept_only(vcpu); } /* @@ -10282,12 +10300,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmx_set_efer(vcpu, vcpu->arch.efer); /* Shadow page tables on either EPT or shadow page tables. */ - if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_ept_enabled, + if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), entry_failure_code)) return 1; - kvm_mmu_reset_context(vcpu); - if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; @@ -11056,6 +11072,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmx->nested.change_vmcs01_virtual_x2apic_mode = false; vmx_set_virtual_x2apic_mode(vcpu, vcpu->arch.apic_base & X2APIC_ENABLE); + } else if (!nested_cpu_has_ept(vmcs12) && + nested_cpu_has2(vmcs12, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + vmx_flush_tlb_ept_only(vcpu); } /* This is needed for same reason as it was needed in prepare_vmcs02 */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1faf620a6fdc..ccbd45ecd41a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8153,11 +8153,12 @@ void kvm_arch_destroy_vm(struct kvm *kvm) if (kvm_x86_ops->vm_destroy) kvm_x86_ops->vm_destroy(kvm); kvm_iommu_unmap_guest(kvm); - kfree(kvm->arch.vpic); - kfree(kvm->arch.vioapic); + kvm_pic_destroy(kvm); + kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kvm_mmu_uninit_vm(kvm); + kvm_page_track_cleanup(kvm); } void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, @@ -8566,11 +8567,11 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, { struct x86_exception fault; - trace_kvm_async_pf_ready(work->arch.token, work->gva); if (work->wakeup_all) work->arch.token = ~0; /* broadcast wakeup */ else kvm_del_async_pf_gfn(vcpu, work->arch.gfn); + trace_kvm_async_pf_ready(work->arch.token, work->gva); if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/ptp_kvm.c index 09b4df74291e..bb865695d7a6 100644 --- a/drivers/ptp/ptp_kvm.c +++ b/drivers/ptp/ptp_kvm.c @@ -193,10 +193,7 @@ static int __init ptp_kvm_init(void) kvm_ptp_clock.ptp_clock = ptp_clock_register(&kvm_ptp_clock.caps, NULL); - if (IS_ERR(kvm_ptp_clock.ptp_clock)) - return PTR_ERR(kvm_ptp_clock.ptp_clock); - - return 0; + return PTR_ERR_OR_ZERO(kvm_ptp_clock.ptp_clock); } module_init(ptp_kvm_init); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c14ad9809da..d0250744507a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -162,8 +162,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, void *val); int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev); -int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, - struct kvm_io_device *dev); +void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev); struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr); diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 581278c58488..8f74ed8e7237 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -30,8 +30,8 @@ import fcntl import resource import struct import re +import subprocess from collections import defaultdict -from time import sleep VMX_EXIT_REASONS = { 'EXCEPTION_NMI': 0, @@ -225,6 +225,7 @@ IOCTL_NUMBERS = { 'RESET': 0x00002403, } + class Arch(object): """Encapsulates global architecture specific data. @@ -255,12 +256,14 @@ class Arch(object): return ArchX86(SVM_EXIT_REASONS) return + class ArchX86(Arch): def __init__(self, exit_reasons): self.sc_perf_evt_open = 298 self.ioctl_numbers = IOCTL_NUMBERS self.exit_reasons = exit_reasons + class ArchPPC(Arch): def __init__(self): self.sc_perf_evt_open = 319 @@ -275,12 +278,14 @@ class ArchPPC(Arch): self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16 self.exit_reasons = {} + class ArchA64(Arch): def __init__(self): self.sc_perf_evt_open = 241 self.ioctl_numbers = IOCTL_NUMBERS self.exit_reasons = AARCH64_EXIT_REASONS + class ArchS390(Arch): def __init__(self): self.sc_perf_evt_open = 331 @@ -316,6 +321,61 @@ def parse_int_list(list_string): return integers +def get_pid_from_gname(gname): + """Fuzzy function to convert guest name to QEMU process pid. + + Returns a list of potential pids, can be empty if no match found. + Throws an exception on processing errors. + + """ + pids = [] + try: + child = subprocess.Popen(['ps', '-A', '--format', 'pid,args'], + stdout=subprocess.PIPE) + except: + raise Exception + for line in child.stdout: + line = line.lstrip().split(' ', 1) + # perform a sanity check before calling the more expensive + # function to possibly extract the guest name + if ' -name ' in line[1] and gname == get_gname_from_pid(line[0]): + pids.append(int(line[0])) + child.stdout.close() + + return pids + + +def get_gname_from_pid(pid): + """Returns the guest name for a QEMU process pid. + + Extracts the guest name from the QEMU comma line by processing the '-name' + option. Will also handle names specified out of sequence. + + """ + name = '' + try: + line = open('/proc/{}/cmdline'.format(pid), 'rb').read().split('\0') + parms = line[line.index('-name') + 1].split(',') + while '' in parms: + # commas are escaped (i.e. ',,'), hence e.g. 'foo,bar' results in + # ['foo', '', 'bar'], which we revert here + idx = parms.index('') + parms[idx - 1] += ',' + parms[idx + 1] + del parms[idx:idx+2] + # the '-name' switch allows for two ways to specify the guest name, + # where the plain name overrides the name specified via 'guest=' + for arg in parms: + if '=' not in arg: + name = arg + break + if arg[:6] == 'guest=': + name = arg[6:] + except (ValueError, IOError, IndexError): + pass + + return name + + def get_online_cpus(): """Returns a list of cpu id integers.""" with open('/sys/devices/system/cpu/online') as cpu_list: @@ -342,6 +402,7 @@ def get_filters(): libc = ctypes.CDLL('libc.so.6', use_errno=True) syscall = libc.syscall + class perf_event_attr(ctypes.Structure): """Struct that holds the necessary data to set up a trace event. @@ -370,6 +431,7 @@ class perf_event_attr(ctypes.Structure): self.size = ctypes.sizeof(self) self.read_format = PERF_FORMAT_GROUP + def perf_event_open(attr, pid, cpu, group_fd, flags): """Wrapper for the sys_perf_evt_open() syscall. @@ -395,6 +457,7 @@ PERF_FORMAT_GROUP = 1 << 3 PATH_DEBUGFS_TRACING = '/sys/kernel/debug/tracing' PATH_DEBUGFS_KVM = '/sys/kernel/debug/kvm' + class Group(object): """Represents a perf event group.""" @@ -427,6 +490,7 @@ class Group(object): struct.unpack(read_format, os.read(self.events[0].fd, length)))) + class Event(object): """Represents a performance event and manages its life cycle.""" def __init__(self, name, group, trace_cpu, trace_pid, trace_point, @@ -510,6 +574,7 @@ class Event(object): """Resets the count of the trace event in the kernel.""" fcntl.ioctl(self.fd, ARCH.ioctl_numbers['RESET'], 0) + class TracepointProvider(object): """Data provider for the stats class. @@ -551,6 +616,7 @@ class TracepointProvider(object): def setup_traces(self): """Creates all event and group objects needed to be able to retrieve data.""" + fields = self.get_available_fields() if self._pid > 0: # Fetch list of all threads of the monitored pid, as qemu # starts a thread for each vcpu. @@ -561,7 +627,7 @@ class TracepointProvider(object): # The constant is needed as a buffer for python libs, std # streams and other files that the script opens. - newlim = len(groupids) * len(self._fields) + 50 + newlim = len(groupids) * len(fields) + 50 try: softlim_, hardlim = resource.getrlimit(resource.RLIMIT_NOFILE) @@ -577,7 +643,7 @@ class TracepointProvider(object): for groupid in groupids: group = Group() - for name in self._fields: + for name in fields: tracepoint = name tracefilter = None match = re.match(r'(.*)\((.*)\)', name) @@ -650,13 +716,23 @@ class TracepointProvider(object): ret[name] += val return ret + def reset(self): + """Reset all field counters""" + for group in self.group_leaders: + for event in group.events: + event.reset() + + class DebugfsProvider(object): """Provides data from the files that KVM creates in the kvm debugfs folder.""" def __init__(self): self._fields = self.get_available_fields() + self._baseline = {} self._pid = 0 self.do_read = True + self.paths = [] + self.reset() def get_available_fields(self): """"Returns a list of available fields. @@ -673,6 +749,7 @@ class DebugfsProvider(object): @fields.setter def fields(self, fields): self._fields = fields + self.reset() @property def pid(self): @@ -690,10 +767,11 @@ class DebugfsProvider(object): self.paths = filter(lambda x: "{}-".format(pid) in x, vms) else: - self.paths = [''] + self.paths = [] self.do_read = True + self.reset() - def read(self): + def read(self, reset=0): """Returns a dict with format:'file name / field -> current value'.""" results = {} @@ -701,10 +779,22 @@ class DebugfsProvider(object): if not self.do_read: return results - for path in self.paths: + paths = self.paths + if self._pid == 0: + paths = [] + for entry in os.walk(PATH_DEBUGFS_KVM): + for dir in entry[1]: + paths.append(dir) + for path in paths: for field in self._fields: - results[field] = results.get(field, 0) \ - + self.read_field(field, path) + value = self.read_field(field, path) + key = path + field + if reset: + self._baseline[key] = value + if self._baseline.get(key, -1) == -1: + self._baseline[key] = value + results[field] = (results.get(field, 0) + value - + self._baseline.get(key, 0)) return results @@ -718,6 +808,12 @@ class DebugfsProvider(object): except IOError: return 0 + def reset(self): + """Reset field counters""" + self._baseline = {} + self.read(1) + + class Stats(object): """Manages the data providers and the data they provide. @@ -753,14 +849,20 @@ class Stats(object): for provider in self.providers: provider.pid = self._pid_filter + def reset(self): + self.values = {} + for provider in self.providers: + provider.reset() + @property def fields_filter(self): return self._fields_filter @fields_filter.setter def fields_filter(self, fields_filter): - self._fields_filter = fields_filter - self.update_provider_filters() + if fields_filter != self._fields_filter: + self._fields_filter = fields_filter + self.update_provider_filters() @property def pid_filter(self): @@ -768,9 +870,10 @@ class Stats(object): @pid_filter.setter def pid_filter(self, pid): - self._pid_filter = pid - self.values = {} - self.update_provider_pid() + if pid != self._pid_filter: + self._pid_filter = pid + self.values = {} + self.update_provider_pid() def get(self): """Returns a dict with field -> (value, delta to last value) of all @@ -778,23 +881,26 @@ class Stats(object): for provider in self.providers: new = provider.read() for key in provider.fields: - oldval = self.values.get(key, (0, 0)) + oldval = self.values.get(key, (0, 0))[0] newval = new.get(key, 0) - newdelta = None - if oldval is not None: - newdelta = newval - oldval[0] + newdelta = newval - oldval self.values[key] = (newval, newdelta) return self.values LABEL_WIDTH = 40 NUMBER_WIDTH = 10 +DELAY_INITIAL = 0.25 +DELAY_REGULAR = 3.0 +MAX_GUEST_NAME_LEN = 48 +MAX_REGEX_LEN = 44 +DEFAULT_REGEX = r'^[^\(]*$' + class Tui(object): """Instruments curses to draw a nice text ui.""" def __init__(self, stats): self.stats = stats self.screen = None - self.drilldown = False self.update_drilldown() def __enter__(self): @@ -809,7 +915,14 @@ class Tui(object): # return from C start_color() is ignorable. try: curses.start_color() - except: + except curses.error: + pass + + # Hide cursor in extra statement as some monochrome terminals + # might support hiding but not colors. + try: + curses.curs_set(0) + except curses.error: pass curses.use_default_colors() @@ -827,36 +940,60 @@ class Tui(object): def update_drilldown(self): """Sets or removes a filter that only allows fields without braces.""" if not self.stats.fields_filter: - self.stats.fields_filter = r'^[^\(]*$' + self.stats.fields_filter = DEFAULT_REGEX - elif self.stats.fields_filter == r'^[^\(]*$': + elif self.stats.fields_filter == DEFAULT_REGEX: self.stats.fields_filter = None def update_pid(self, pid): """Propagates pid selection to stats object.""" self.stats.pid_filter = pid - def refresh(self, sleeptime): - """Refreshes on-screen data.""" + def refresh_header(self, pid=None): + """Refreshes the header.""" + if pid is None: + pid = self.stats.pid_filter self.screen.erase() - if self.stats.pid_filter > 0: - self.screen.addstr(0, 0, 'kvm statistics - pid {0}' - .format(self.stats.pid_filter), - curses.A_BOLD) + gname = get_gname_from_pid(pid) + if gname: + gname = ('({})'.format(gname[:MAX_GUEST_NAME_LEN] + '...' + if len(gname) > MAX_GUEST_NAME_LEN + else gname)) + if pid > 0: + self.screen.addstr(0, 0, 'kvm statistics - pid {0} {1}' + .format(pid, gname), curses.A_BOLD) else: self.screen.addstr(0, 0, 'kvm statistics - summary', curses.A_BOLD) + if self.stats.fields_filter and self.stats.fields_filter \ + != DEFAULT_REGEX: + regex = self.stats.fields_filter + if len(regex) > MAX_REGEX_LEN: + regex = regex[:MAX_REGEX_LEN] + '...' + self.screen.addstr(1, 17, 'regex filter: {0}'.format(regex)) self.screen.addstr(2, 1, 'Event') self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH - len('Total'), 'Total') - self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 8 - + self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 7 - + len('%Total'), '%Total') + self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 7 + 8 - len('Current'), 'Current') + self.screen.addstr(4, 1, 'Collecting data...') + self.screen.refresh() + + def refresh_body(self, sleeptime): row = 3 + self.screen.move(row, 0) + self.screen.clrtobot() stats = self.stats.get() + def sortkey(x): if stats[x][1]: return (-stats[x][1], -stats[x][0]) else: return (0, -stats[x][0]) + total = 0. + for val in stats.values(): + total += val[0] for key in sorted(stats.keys(), key=sortkey): if row >= self.screen.getmaxyx()[0]: @@ -869,6 +1006,8 @@ class Tui(object): col += LABEL_WIDTH self.screen.addstr(row, col, '%10d' % (values[0],)) col += NUMBER_WIDTH + self.screen.addstr(row, col, '%7.1f' % (values[0] * 100 / total,)) + col += 7 if values[1] is not None: self.screen.addstr(row, col, '%8d' % (values[1] / sleeptime,)) row += 1 @@ -893,20 +1032,24 @@ class Tui(object): regex = self.screen.getstr() curses.noecho() if len(regex) == 0: + self.stats.fields_filter = DEFAULT_REGEX + self.refresh_header() return try: re.compile(regex) self.stats.fields_filter = regex + self.refresh_header() return except re.error: continue - def show_vm_selection(self): + def show_vm_selection_by_pid(self): """Draws PID selection mask. Asks for a pid until a valid pid or 0 has been entered. """ + msg = '' while True: self.screen.erase() self.screen.addstr(0, 0, @@ -915,6 +1058,7 @@ class Tui(object): self.screen.addstr(1, 0, 'This might limit the shown data to the trace ' 'statistics.') + self.screen.addstr(5, 0, msg) curses.echo() self.screen.addstr(3, 0, "Pid [0 or pid]: ") @@ -922,60 +1066,128 @@ class Tui(object): curses.noecho() try: - pid = int(pid) - - if pid == 0: - self.update_pid(pid) - break - else: - if not os.path.isdir(os.path.join('/proc/', str(pid))): + if len(pid) > 0: + pid = int(pid) + if pid != 0 and not os.path.isdir(os.path.join('/proc/', + str(pid))): + msg = '"' + str(pid) + '": Not a running process' continue - else: - self.update_pid(pid) - break + else: + pid = 0 + self.refresh_header(pid) + self.update_pid(pid) + break except ValueError: + msg = '"' + str(pid) + '": Not a valid pid' continue + def show_vm_selection_by_guest_name(self): + """Draws guest selection mask. + + Asks for a guest name until a valid guest name or '' is entered. + + """ + msg = '' + while True: + self.screen.erase() + self.screen.addstr(0, 0, + 'Show statistics for specific guest.', + curses.A_BOLD) + self.screen.addstr(1, 0, + 'This might limit the shown data to the trace ' + 'statistics.') + self.screen.addstr(5, 0, msg) + curses.echo() + self.screen.addstr(3, 0, "Guest [ENTER or guest]: ") + gname = self.screen.getstr() + curses.noecho() + + if not gname: + self.refresh_header(0) + self.update_pid(0) + break + else: + pids = [] + try: + pids = get_pid_from_gname(gname) + except: + msg = '"' + gname + '": Internal error while searching, ' \ + 'use pid filter instead' + continue + if len(pids) == 0: + msg = '"' + gname + '": Not an active guest' + continue + if len(pids) > 1: + msg = '"' + gname + '": Multiple matches found, use pid ' \ + 'filter instead' + continue + self.refresh_header(pids[0]) + self.update_pid(pids[0]) + break + def show_stats(self): """Refreshes the screen and processes user input.""" - sleeptime = 0.25 + sleeptime = DELAY_INITIAL + self.refresh_header() while True: - self.refresh(sleeptime) + self.refresh_body(sleeptime) curses.halfdelay(int(sleeptime * 10)) - sleeptime = 3 + sleeptime = DELAY_REGULAR try: char = self.screen.getkey() if char == 'x': - self.drilldown = not self.drilldown + self.refresh_header() self.update_drilldown() + sleeptime = DELAY_INITIAL if char == 'q': break + if char == 'c': + self.stats.fields_filter = DEFAULT_REGEX + self.refresh_header(0) + self.update_pid(0) + sleeptime = DELAY_INITIAL if char == 'f': self.show_filter_selection() + sleeptime = DELAY_INITIAL + if char == 'g': + self.show_vm_selection_by_guest_name() + sleeptime = DELAY_INITIAL if char == 'p': - self.show_vm_selection() + self.show_vm_selection_by_pid() + sleeptime = DELAY_INITIAL + if char == 'r': + self.refresh_header() + self.stats.reset() + sleeptime = DELAY_INITIAL except KeyboardInterrupt: break except curses.error: continue + def batch(stats): """Prints statistics in a key, value format.""" - s = stats.get() - time.sleep(1) - s = stats.get() - for key in sorted(s.keys()): - values = s[key] - print '%-42s%10d%10d' % (key, values[0], values[1]) + try: + s = stats.get() + time.sleep(1) + s = stats.get() + for key in sorted(s.keys()): + values = s[key] + print '%-42s%10d%10d' % (key, values[0], values[1]) + except KeyboardInterrupt: + pass + def log(stats): """Prints statistics as reiterating key block, multiple value blocks.""" keys = sorted(stats.get().iterkeys()) + def banner(): for k in keys: print '%s' % k, print + def statline(): s = stats.get() for k in keys: @@ -984,11 +1196,15 @@ def log(stats): line = 0 banner_repeat = 20 while True: - time.sleep(1) - if line % banner_repeat == 0: - banner() - statline() - line += 1 + try: + time.sleep(1) + if line % banner_repeat == 0: + banner() + statline() + line += 1 + except KeyboardInterrupt: + break + def get_options(): """Returns processed program arguments.""" @@ -1009,6 +1225,16 @@ Requirements: CAP_SYS_ADMIN and perf events are used. - CAP_SYS_RESOURCE if the hard limit is not high enough to allow the large number of files that are possibly opened. + +Interactive Commands: + c clear filter + f filter by regular expression + g filter by guest name + p filter by PID + q quit + x toggle reporting of stats for individual child trace events + r reset stats +Press any other key to refresh statistics immediately. """ class PlainHelpFormatter(optparse.IndentedHelpFormatter): @@ -1018,6 +1244,22 @@ Requirements: else: return "" + def cb_guest_to_pid(option, opt, val, parser): + try: + pids = get_pid_from_gname(val) + except: + raise optparse.OptionValueError('Error while searching for guest ' + '"{}", use "-p" to specify a pid ' + 'instead'.format(val)) + if len(pids) == 0: + raise optparse.OptionValueError('No guest by the name "{}" ' + 'found'.format(val)) + if len(pids) > 1: + raise optparse.OptionValueError('Multiple processes found (pids: ' + '{}) - use "-p" to specify a pid ' + 'instead'.format(" ".join(pids))) + parser.values.pid = pids[0] + optparser = optparse.OptionParser(description=description_text, formatter=PlainHelpFormatter()) optparser.add_option('-1', '--once', '--batch', @@ -1051,15 +1293,24 @@ Requirements: help='fields to display (regex)', ) optparser.add_option('-p', '--pid', - action='store', - default=0, - type=int, - dest='pid', - help='restrict statistics to pid', - ) + action='store', + default=0, + type='int', + dest='pid', + help='restrict statistics to pid', + ) + optparser.add_option('-g', '--guest', + action='callback', + type='string', + dest='pid', + metavar='GUEST', + help='restrict statistics to guest by name', + callback=cb_guest_to_pid, + ) (options, _) = optparser.parse_args(sys.argv) return options + def get_providers(options): """Returns a list of data providers depending on the passed options.""" providers = [] @@ -1073,6 +1324,7 @@ def get_providers(options): return providers + def check_access(options): """Exits if the current user can't access all needed directories.""" if not os.path.exists('/sys/kernel/debug'): @@ -1086,8 +1338,8 @@ def check_access(options): "Also ensure, that the kvm modules are loaded.\n") sys.exit(1) - if not os.path.exists(PATH_DEBUGFS_TRACING) and (options.tracepoints - or not options.debugfs): + if not os.path.exists(PATH_DEBUGFS_TRACING) and (options.tracepoints or + not options.debugfs): sys.stderr.write("Please enable CONFIG_TRACING in your kernel " "when using the option -t (default).\n" "If it is enabled, make {0} readable by the " @@ -1098,10 +1350,11 @@ def check_access(options): sys.stderr.write("Falling back to debugfs statistics!\n") options.debugfs = True - sleep(5) + time.sleep(5) return options + def main(): options = get_options() options = check_access(options) diff --git a/tools/kvm/kvm_stat/kvm_stat.txt b/tools/kvm/kvm_stat/kvm_stat.txt index b92a153d7115..109431bdc63c 100644 --- a/tools/kvm/kvm_stat/kvm_stat.txt +++ b/tools/kvm/kvm_stat/kvm_stat.txt @@ -18,11 +18,33 @@ state transitions such as guest mode entry and exit. This tool is useful for observing guest behavior from the host perspective. Often conclusions about performance or buggy behavior can be drawn from the output. +While running in regular mode, use any of the keys listed in section +'Interactive Commands' below. +Use batch and logging modes for scripting purposes. The set of KVM kernel module trace events may be specific to the kernel version or architecture. It is best to check the KVM kernel module source code for the meaning of events. +INTERACTIVE COMMANDS +-------------------- +[horizontal] +*c*:: clear filter + +*f*:: filter by regular expression + +*g*:: filter by guest name + +*p*:: filter by PID + +*q*:: quit + +*r*:: reset stats + +*x*:: toggle reporting of stats for child trace events + +Press any other key to refresh statistics immediately. + OPTIONS ------- -1:: @@ -46,6 +68,10 @@ OPTIONS --pid=<pid>:: limit statistics to one virtual machine (pid) +-g<guest>:: +--guest=<guest_name>:: + limit statistics to one virtual machine (guest name) + -f<fields>:: --fields=<fields>:: fields to display (regex) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index a29786dd9522..4d28a9ddbee0 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -870,7 +870,8 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx, continue; kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); - kvm->buses[bus_idx]->ioeventfd_count--; + if (kvm->buses[bus_idx]) + kvm->buses[bus_idx]->ioeventfd_count--; ioeventfd_release(p); ret = 0; break; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a17d78759727..88257b311cb5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -727,8 +727,11 @@ static void kvm_destroy_vm(struct kvm *kvm) list_del(&kvm->vm_list); spin_unlock(&kvm_lock); kvm_free_irq_routing(kvm); - for (i = 0; i < KVM_NR_BUSES; i++) - kvm_io_bus_destroy(kvm->buses[i]); + for (i = 0; i < KVM_NR_BUSES; i++) { + if (kvm->buses[i]) + kvm_io_bus_destroy(kvm->buses[i]); + kvm->buses[i] = NULL; + } kvm_coalesced_mmio_free(kvm); #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); @@ -1062,7 +1065,7 @@ int __kvm_set_memory_region(struct kvm *kvm, * changes) is disallowed above, so any other attribute changes getting * here can be skipped. */ - if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { + if (as_id == 0 && (change == KVM_MR_CREATE || change == KVM_MR_MOVE)) { r = kvm_iommu_map_pages(kvm, &new); return r; } @@ -3474,6 +3477,8 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + if (!bus) + return -ENOMEM; r = __kvm_io_bus_write(vcpu, bus, &range, val); return r < 0 ? r : 0; } @@ -3491,6 +3496,8 @@ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + if (!bus) + return -ENOMEM; /* First try the device referenced by cookie. */ if ((cookie >= 0) && (cookie < bus->dev_count) && @@ -3541,6 +3548,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + if (!bus) + return -ENOMEM; r = __kvm_io_bus_read(vcpu, bus, &range, val); return r < 0 ? r : 0; } @@ -3553,6 +3562,9 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, struct kvm_io_bus *new_bus, *bus; bus = kvm->buses[bus_idx]; + if (!bus) + return -ENOMEM; + /* exclude ioeventfd which is limited by maximum fd */ if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) return -ENOSPC; @@ -3572,37 +3584,41 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, } /* Caller must hold slots_lock. */ -int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, - struct kvm_io_device *dev) +void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev) { - int i, r; + int i; struct kvm_io_bus *new_bus, *bus; bus = kvm->buses[bus_idx]; - r = -ENOENT; + if (!bus) + return; + for (i = 0; i < bus->dev_count; i++) if (bus->range[i].dev == dev) { - r = 0; break; } - if (r) - return r; + if (i == bus->dev_count) + return; new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) * sizeof(struct kvm_io_range)), GFP_KERNEL); - if (!new_bus) - return -ENOMEM; + if (!new_bus) { + pr_err("kvm: failed to shrink bus, removing it completely\n"); + goto broken; + } memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); new_bus->dev_count--; memcpy(new_bus->range + i, bus->range + i + 1, (new_bus->dev_count - i) * sizeof(struct kvm_io_range)); +broken: rcu_assign_pointer(kvm->buses[bus_idx], new_bus); synchronize_srcu_expedited(&kvm->srcu); kfree(bus); - return r; + return; } struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, @@ -3615,6 +3631,8 @@ struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, srcu_idx = srcu_read_lock(&kvm->srcu); bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); + if (!bus) + goto out_unlock; dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1); if (dev_idx < 0) |