diff --git a/drivers/Kconfig b/drivers/Kconfig index 4929e923b5c6..e7da9fa724ec 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -80,4 +80,6 @@ source "drivers/rtc/Kconfig" source "drivers/dma/Kconfig" +source "drivers/kvm/Kconfig" + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 50f76da598c9..0dd96d1afd39 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_SPI) += spi/ obj-$(CONFIG_PCCARD) += pcmcia/ obj-$(CONFIG_DIO) += dio/ obj-$(CONFIG_SBUS) += sbus/ +obj-$(CONFIG_KVM) += kvm/ obj-$(CONFIG_ZORRO) += zorro/ obj-$(CONFIG_MAC) += macintosh/ obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/ diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig new file mode 100644 index 000000000000..36412e90f09b --- /dev/null +++ b/drivers/kvm/Kconfig @@ -0,0 +1,33 @@ +# +# KVM configuration +# +config KVM + tristate "Kernel-based Virtual Machine (KVM) support" + depends on X86 && EXPERIMENTAL + ---help--- + Support hosting fully virtualized guest machines using hardware + virtualization extensions. You will need a fairly recent + processor equipped with virtualization extensions. You will also + need to select one or more of the processor modules below. + + This module provides access to the hardware capabilities through + a character device node named /dev/kvm. + + To compile this as a module, choose M here: the module + will be called kvm. + + If unsure, say N. + +config KVM_INTEL + tristate "KVM for Intel processors support" + depends on KVM + ---help--- + Provides support for KVM on Intel processors equipped with the VT + extensions. + +config KVM_AMD + tristate "KVM for AMD processors support" + depends on KVM + ---help--- + Provides support for KVM on AMD processors equipped with the AMD-V + (SVM) extensions. diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile new file mode 100644 index 000000000000..c0a789fa9d65 --- /dev/null +++ b/drivers/kvm/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for Kernel-based Virtual Machine module +# + +kvm-objs := kvm_main.o mmu.o x86_emulate.o +obj-$(CONFIG_KVM) += kvm.o +kvm-intel-objs = vmx.o +obj-$(CONFIG_KVM_INTEL) += kvm-intel.o +kvm-amd-objs = svm.o +obj-$(CONFIG_KVM_AMD) += kvm-amd.o diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h new file mode 100644 index 000000000000..5785d0870ab6 --- /dev/null +++ b/drivers/kvm/kvm.h @@ -0,0 +1,551 @@ +#ifndef __KVM_H +#define __KVM_H + +/* + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include +#include +#include +#include +#include + +#include "vmx.h" +#include + +#define CR0_PE_MASK (1ULL << 0) +#define CR0_TS_MASK (1ULL << 3) +#define CR0_NE_MASK (1ULL << 5) +#define CR0_WP_MASK (1ULL << 16) +#define CR0_NW_MASK (1ULL << 29) +#define CR0_CD_MASK (1ULL << 30) +#define CR0_PG_MASK (1ULL << 31) + +#define CR3_WPT_MASK (1ULL << 3) +#define CR3_PCD_MASK (1ULL << 4) + +#define CR3_RESEVED_BITS 0x07ULL +#define CR3_L_MODE_RESEVED_BITS (~((1ULL << 40) - 1) | 0x0fe7ULL) +#define CR3_FLAGS_MASK ((1ULL << 5) - 1) + +#define CR4_VME_MASK (1ULL << 0) +#define CR4_PSE_MASK (1ULL << 4) +#define CR4_PAE_MASK (1ULL << 5) +#define CR4_PGE_MASK (1ULL << 7) +#define CR4_VMXE_MASK (1ULL << 13) + +#define KVM_GUEST_CR0_MASK \ + (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK \ + | CR0_NW_MASK | CR0_CD_MASK) +#define KVM_VM_CR0_ALWAYS_ON \ + (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK) +#define KVM_GUEST_CR4_MASK \ + (CR4_PSE_MASK | CR4_PAE_MASK | CR4_PGE_MASK | CR4_VMXE_MASK | CR4_VME_MASK) +#define KVM_PMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK) +#define KVM_RMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK | CR4_VME_MASK) + +#define INVALID_PAGE (~(hpa_t)0) +#define UNMAPPED_GVA (~(gpa_t)0) + +#define KVM_MAX_VCPUS 1 +#define KVM_MEMORY_SLOTS 4 +#define KVM_NUM_MMU_PAGES 256 + +#define FX_IMAGE_SIZE 512 +#define FX_IMAGE_ALIGN 16 +#define FX_BUF_SIZE (2 * FX_IMAGE_SIZE + FX_IMAGE_ALIGN) + +#define DE_VECTOR 0 +#define DF_VECTOR 8 +#define TS_VECTOR 10 +#define NP_VECTOR 11 +#define SS_VECTOR 12 +#define GP_VECTOR 13 +#define PF_VECTOR 14 + +#define SELECTOR_TI_MASK (1 << 2) +#define SELECTOR_RPL_MASK 0x03 + +#define IOPL_SHIFT 12 + +/* + * Address types: + * + * gva - guest virtual address + * gpa - guest physical address + * gfn - guest frame number + * hva - host virtual address + * hpa - host physical address + * hfn - host frame number + */ + +typedef unsigned long gva_t; +typedef u64 gpa_t; +typedef unsigned long gfn_t; + +typedef unsigned long hva_t; +typedef u64 hpa_t; +typedef unsigned long hfn_t; + +struct kvm_mmu_page { + struct list_head link; + hpa_t page_hpa; + unsigned long slot_bitmap; /* One bit set per slot which has memory + * in this shadow page. + */ + int global; /* Set if all ptes in this page are global */ + u64 *parent_pte; +}; + +struct vmcs { + u32 revision_id; + u32 abort; + char data[0]; +}; + +#define vmx_msr_entry kvm_msr_entry + +struct kvm_vcpu; + +/* + * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level + * 32-bit). The kvm_mmu structure abstracts the details of the current mmu + * mode. + */ +struct kvm_mmu { + void (*new_cr3)(struct kvm_vcpu *vcpu); + int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); + void (*inval_page)(struct kvm_vcpu *vcpu, gva_t gva); + void (*free)(struct kvm_vcpu *vcpu); + gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); + hpa_t root_hpa; + int root_level; + int shadow_root_level; +}; + +struct kvm_guest_debug { + int enabled; + unsigned long bp[4]; + int singlestep; +}; + +enum { + VCPU_REGS_RAX = 0, + VCPU_REGS_RCX = 1, + VCPU_REGS_RDX = 2, + VCPU_REGS_RBX = 3, + VCPU_REGS_RSP = 4, + VCPU_REGS_RBP = 5, + VCPU_REGS_RSI = 6, + VCPU_REGS_RDI = 7, +#ifdef __x86_64__ + VCPU_REGS_R8 = 8, + VCPU_REGS_R9 = 9, + VCPU_REGS_R10 = 10, + VCPU_REGS_R11 = 11, + VCPU_REGS_R12 = 12, + VCPU_REGS_R13 = 13, + VCPU_REGS_R14 = 14, + VCPU_REGS_R15 = 15, +#endif + NR_VCPU_REGS +}; + +enum { + VCPU_SREG_CS, + VCPU_SREG_DS, + VCPU_SREG_ES, + VCPU_SREG_FS, + VCPU_SREG_GS, + VCPU_SREG_SS, + VCPU_SREG_TR, + VCPU_SREG_LDTR, +}; + +struct kvm_vcpu { + struct kvm *kvm; + union { + struct vmcs *vmcs; + struct vcpu_svm *svm; + }; + struct mutex mutex; + int cpu; + int launched; + unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ +#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) + unsigned long irq_pending[NR_IRQ_WORDS]; + unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ + unsigned long rip; /* needs vcpu_load_rsp_rip() */ + + unsigned long cr0; + unsigned long cr2; + unsigned long cr3; + unsigned long cr4; + unsigned long cr8; + u64 shadow_efer; + u64 apic_base; + int nmsrs; + struct vmx_msr_entry *guest_msrs; + struct vmx_msr_entry *host_msrs; + + struct list_head free_pages; + struct kvm_mmu_page page_header_buf[KVM_NUM_MMU_PAGES]; + struct kvm_mmu mmu; + + struct kvm_guest_debug guest_debug; + + char fx_buf[FX_BUF_SIZE]; + char *host_fx_image; + char *guest_fx_image; + + int mmio_needed; + int mmio_read_completed; + int mmio_is_write; + int mmio_size; + unsigned char mmio_data[8]; + gpa_t mmio_phys_addr; + + struct { + int active; + u8 save_iopl; + struct kvm_save_segment { + u16 selector; + unsigned long base; + u32 limit; + u32 ar; + } tr, es, ds, fs, gs; + } rmode; +}; + +struct kvm_memory_slot { + gfn_t base_gfn; + unsigned long npages; + unsigned long flags; + struct page **phys_mem; + unsigned long *dirty_bitmap; +}; + +struct kvm { + spinlock_t lock; /* protects everything except vcpus */ + int nmemslots; + struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; + struct list_head active_mmu_pages; + struct kvm_vcpu vcpus[KVM_MAX_VCPUS]; + int memory_config_version; + int busy; +}; + +struct kvm_stat { + u32 pf_fixed; + u32 pf_guest; + u32 tlb_flush; + u32 invlpg; + + u32 exits; + u32 io_exits; + u32 mmio_exits; + u32 signal_exits; + u32 irq_exits; +}; + +struct descriptor_table { + u16 limit; + unsigned long base; +} __attribute__((packed)); + +struct kvm_arch_ops { + int (*cpu_has_kvm_support)(void); /* __init */ + int (*disabled_by_bios)(void); /* __init */ + void (*hardware_enable)(void *dummy); /* __init */ + void (*hardware_disable)(void *dummy); + int (*hardware_setup)(void); /* __init */ + void (*hardware_unsetup)(void); /* __exit */ + + int (*vcpu_create)(struct kvm_vcpu *vcpu); + void (*vcpu_free)(struct kvm_vcpu *vcpu); + + struct kvm_vcpu *(*vcpu_load)(struct kvm_vcpu *vcpu); + void (*vcpu_put)(struct kvm_vcpu *vcpu); + + int (*set_guest_debug)(struct kvm_vcpu *vcpu, + struct kvm_debug_guest *dbg); + int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); + int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); + u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); + void (*get_segment)(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); + void (*set_segment)(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); + int (*is_long_mode)(struct kvm_vcpu *vcpu); + void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); + void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); + void (*set_cr0_no_modeswitch)(struct kvm_vcpu *vcpu, + unsigned long cr0); + void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); + void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); + void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); + void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); + void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); + void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); + void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); + unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); + void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, + int *exception); + void (*cache_regs)(struct kvm_vcpu *vcpu); + void (*decache_regs)(struct kvm_vcpu *vcpu); + unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); + void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + + void (*invlpg)(struct kvm_vcpu *vcpu, gva_t addr); + void (*tlb_flush)(struct kvm_vcpu *vcpu); + void (*inject_page_fault)(struct kvm_vcpu *vcpu, + unsigned long addr, u32 err_code); + + void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code); + + int (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); + int (*vcpu_setup)(struct kvm_vcpu *vcpu); + void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); +}; + +extern struct kvm_stat kvm_stat; +extern struct kvm_arch_ops *kvm_arch_ops; + +#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) +#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) + +int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module); +void kvm_exit_arch(void); + +void kvm_mmu_destroy(struct kvm_vcpu *vcpu); +int kvm_mmu_init(struct kvm_vcpu *vcpu); + +int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); + +hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa); +#define HPA_MSB ((sizeof(hpa_t) * 8) - 1) +#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) +static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } +hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva); + +void kvm_emulator_want_group7_invlpg(void); + +extern hpa_t bad_page_address; + +static inline struct page *gfn_to_page(struct kvm_memory_slot *slot, gfn_t gfn) +{ + return slot->phys_mem[gfn - slot->base_gfn]; +} + +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); +void mark_page_dirty(struct kvm *kvm, gfn_t gfn); + +enum emulation_result { + EMULATE_DONE, /* no further processing */ + EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ + EMULATE_FAIL, /* can't emulate this instruction */ +}; + +int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, + unsigned long cr2, u16 error_code); +void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); +void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); +void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, + unsigned long *rflags); + +unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr); +void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value, + unsigned long *rflags); + +struct x86_emulate_ctxt; + +int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); +int emulate_clts(struct kvm_vcpu *vcpu); +int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, + unsigned long *dest); +int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, + unsigned long value); + +void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr0); +void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr0); +void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr0); +void lmsw(struct kvm_vcpu *vcpu, unsigned long msw); + +#ifdef __x86_64__ +void set_efer(struct kvm_vcpu *vcpu, u64 efer); +#endif + +void fx_init(struct kvm_vcpu *vcpu); + +void load_msrs(struct vmx_msr_entry *e, int n); +void save_msrs(struct vmx_msr_entry *e, int n); +void kvm_resched(struct kvm_vcpu *vcpu); + +int kvm_read_guest(struct kvm_vcpu *vcpu, + gva_t addr, + unsigned long size, + void *dest); + +int kvm_write_guest(struct kvm_vcpu *vcpu, + gva_t addr, + unsigned long size, + void *data); + +unsigned long segment_base(u16 selector); + +static inline struct page *_gfn_to_page(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : NULL; +} + +static inline int is_pae(struct kvm_vcpu *vcpu) +{ + return vcpu->cr4 & CR4_PAE_MASK; +} + +static inline int is_pse(struct kvm_vcpu *vcpu) +{ + return vcpu->cr4 & CR4_PSE_MASK; +} + +static inline int is_paging(struct kvm_vcpu *vcpu) +{ + return vcpu->cr0 & CR0_PG_MASK; +} + +static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + return slot - kvm->memslots; +} + +static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) +{ + struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); + + return (struct kvm_mmu_page *)page->private; +} + +static inline u16 read_fs(void) +{ + u16 seg; + asm ("mov %%fs, %0" : "=g"(seg)); + return seg; +} + +static inline u16 read_gs(void) +{ + u16 seg; + asm ("mov %%gs, %0" : "=g"(seg)); + return seg; +} + +static inline u16 read_ldt(void) +{ + u16 ldt; + asm ("sldt %0" : "=g"(ldt)); + return ldt; +} + +static inline void load_fs(u16 sel) +{ + asm ("mov %0, %%fs" : : "rm"(sel)); +} + +static inline void load_gs(u16 sel) +{ + asm ("mov %0, %%gs" : : "rm"(sel)); +} + +#ifndef load_ldt +static inline void load_ldt(u16 sel) +{ + asm ("lldt %0" : : "g"(sel)); +} +#endif + +static inline void get_idt(struct descriptor_table *table) +{ + asm ("sidt %0" : "=m"(*table)); +} + +static inline void get_gdt(struct descriptor_table *table) +{ + asm ("sgdt %0" : "=m"(*table)); +} + +static inline unsigned long read_tr_base(void) +{ + u16 tr; + asm ("str %0" : "=g"(tr)); + return segment_base(tr); +} + +#ifdef __x86_64__ +static inline unsigned long read_msr(unsigned long msr) +{ + u64 value; + + rdmsrl(msr, value); + return value; +} +#endif + +static inline void fx_save(void *image) +{ + asm ("fxsave (%0)":: "r" (image)); +} + +static inline void fx_restore(void *image) +{ + asm ("fxrstor (%0)":: "r" (image)); +} + +static inline void fpu_init(void) +{ + asm ("finit"); +} + +static inline u32 get_rdx_init_val(void) +{ + return 0x600; /* P6 family */ +} + +#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" +#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" +#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" +#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30" +#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0" +#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0" +#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" +#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" +#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" + +#define MSR_IA32_TIME_STAMP_COUNTER 0x010 + +#define TSS_IOPB_BASE_OFFSET 0x66 +#define TSS_BASE_SIZE 0x68 +#define TSS_IOPB_SIZE (65536 / 8) +#define TSS_REDIRECTION_SIZE (256 / 8) +#define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1) + +#ifdef __x86_64__ + +/* + * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. Therefore + * we need to allocate shadow page tables in the first 4GB of memory, which + * happens to fit the DMA32 zone. + */ +#define GFP_KVM_MMU (GFP_KERNEL | __GFP_DMA32) + +#else + +#define GFP_KVM_MMU GFP_KERNEL + +#endif + +#endif diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c new file mode 100644 index 000000000000..b6b8a41b5ec8 --- /dev/null +++ b/drivers/kvm/kvm_main.c @@ -0,0 +1,1935 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Avi Kivity + * Yaniv Kamay + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "kvm.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "x86_emulate.h" +#include "segment_descriptor.h" + +MODULE_AUTHOR("Qumranet"); +MODULE_LICENSE("GPL"); + +struct kvm_arch_ops *kvm_arch_ops; +struct kvm_stat kvm_stat; +EXPORT_SYMBOL_GPL(kvm_stat); + +static struct kvm_stats_debugfs_item { + const char *name; + u32 *data; + struct dentry *dentry; +} debugfs_entries[] = { + { "pf_fixed", &kvm_stat.pf_fixed }, + { "pf_guest", &kvm_stat.pf_guest }, + { "tlb_flush", &kvm_stat.tlb_flush }, + { "invlpg", &kvm_stat.invlpg }, + { "exits", &kvm_stat.exits }, + { "io_exits", &kvm_stat.io_exits }, + { "mmio_exits", &kvm_stat.mmio_exits }, + { "signal_exits", &kvm_stat.signal_exits }, + { "irq_exits", &kvm_stat.irq_exits }, + { 0, 0 } +}; + +static struct dentry *debugfs_dir; + +#define MAX_IO_MSRS 256 + +#define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL +#define LMSW_GUEST_MASK 0x0eULL +#define CR4_RESEVED_BITS (~((1ULL << 11) - 1)) +#define CR8_RESEVED_BITS (~0x0fULL) +#define EFER_RESERVED_BITS 0xfffffffffffff2fe + +struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr) +{ + int i; + + for (i = 0; i < vcpu->nmsrs; ++i) + if (vcpu->guest_msrs[i].index == msr) + return &vcpu->guest_msrs[i]; + return 0; +} +EXPORT_SYMBOL_GPL(find_msr_entry); + +#ifdef __x86_64__ +// LDT or TSS descriptor in the GDT. 16 bytes. +struct segment_descriptor_64 { + struct segment_descriptor s; + u32 base_higher; + u32 pad_zero; +}; + +#endif + +unsigned long segment_base(u16 selector) +{ + struct descriptor_table gdt; + struct segment_descriptor *d; + unsigned long table_base; + typedef unsigned long ul; + unsigned long v; + + if (selector == 0) + return 0; + + asm ("sgdt %0" : "=m"(gdt)); + table_base = gdt.base; + + if (selector & 4) { /* from ldt */ + u16 ldt_selector; + + asm ("sldt %0" : "=g"(ldt_selector)); + table_base = segment_base(ldt_selector); + } + d = (struct segment_descriptor *)(table_base + (selector & ~7)); + v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24); +#ifdef __x86_64__ + if (d->system == 0 + && (d->type == 2 || d->type == 9 || d->type == 11)) + v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32; +#endif + return v; +} +EXPORT_SYMBOL_GPL(segment_base); + +int kvm_read_guest(struct kvm_vcpu *vcpu, + gva_t addr, + unsigned long size, + void *dest) +{ + unsigned char *host_buf = dest; + unsigned long req_size = size; + + while (size) { + hpa_t paddr; + unsigned now; + unsigned offset; + hva_t guest_buf; + + paddr = gva_to_hpa(vcpu, addr); + + if (is_error_hpa(paddr)) + break; + + guest_buf = (hva_t)kmap_atomic( + pfn_to_page(paddr >> PAGE_SHIFT), + KM_USER0); + offset = addr & ~PAGE_MASK; + guest_buf |= offset; + now = min(size, PAGE_SIZE - offset); + memcpy(host_buf, (void*)guest_buf, now); + host_buf += now; + addr += now; + size -= now; + kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0); + } + return req_size - size; +} +EXPORT_SYMBOL_GPL(kvm_read_guest); + +int kvm_write_guest(struct kvm_vcpu *vcpu, + gva_t addr, + unsigned long size, + void *data) +{ + unsigned char *host_buf = data; + unsigned long req_size = size; + + while (size) { + hpa_t paddr; + unsigned now; + unsigned offset; + hva_t guest_buf; + + paddr = gva_to_hpa(vcpu, addr); + + if (is_error_hpa(paddr)) + break; + + guest_buf = (hva_t)kmap_atomic( + pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0); + offset = addr & ~PAGE_MASK; + guest_buf |= offset; + now = min(size, PAGE_SIZE - offset); + memcpy((void*)guest_buf, host_buf, now); + host_buf += now; + addr += now; + size -= now; + kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0); + } + return req_size - size; +} +EXPORT_SYMBOL_GPL(kvm_write_guest); + +static int vcpu_slot(struct kvm_vcpu *vcpu) +{ + return vcpu - vcpu->kvm->vcpus; +} + +/* + * Switches to specified vcpu, until a matching vcpu_put() + */ +static struct kvm_vcpu *vcpu_load(struct kvm *kvm, int vcpu_slot) +{ + struct kvm_vcpu *vcpu = &kvm->vcpus[vcpu_slot]; + + mutex_lock(&vcpu->mutex); + if (unlikely(!vcpu->vmcs)) { + mutex_unlock(&vcpu->mutex); + return 0; + } + return kvm_arch_ops->vcpu_load(vcpu); +} + +static void vcpu_put(struct kvm_vcpu *vcpu) +{ + kvm_arch_ops->vcpu_put(vcpu); + put_cpu(); + mutex_unlock(&vcpu->mutex); +} + +static int kvm_dev_open(struct inode *inode, struct file *filp) +{ + struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); + int i; + + if (!kvm) + return -ENOMEM; + + spin_lock_init(&kvm->lock); + INIT_LIST_HEAD(&kvm->active_mmu_pages); + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + struct kvm_vcpu *vcpu = &kvm->vcpus[i]; + + mutex_init(&vcpu->mutex); + vcpu->mmu.root_hpa = INVALID_PAGE; + INIT_LIST_HEAD(&vcpu->free_pages); + } + filp->private_data = kvm; + return 0; +} + +/* + * Free any memory in @free but not in @dont. + */ +static void kvm_free_physmem_slot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + int i; + + if (!dont || free->phys_mem != dont->phys_mem) + if (free->phys_mem) { + for (i = 0; i < free->npages; ++i) + __free_page(free->phys_mem[i]); + vfree(free->phys_mem); + } + + if (!dont || free->dirty_bitmap != dont->dirty_bitmap) + vfree(free->dirty_bitmap); + + free->phys_mem = 0; + free->npages = 0; + free->dirty_bitmap = 0; +} + +static void kvm_free_physmem(struct kvm *kvm) +{ + int i; + + for (i = 0; i < kvm->nmemslots; ++i) + kvm_free_physmem_slot(&kvm->memslots[i], 0); +} + +static void kvm_free_vcpu(struct kvm_vcpu *vcpu) +{ + kvm_arch_ops->vcpu_free(vcpu); + kvm_mmu_destroy(vcpu); +} + +static void kvm_free_vcpus(struct kvm *kvm) +{ + unsigned int i; + + for (i = 0; i < KVM_MAX_VCPUS; ++i) + kvm_free_vcpu(&kvm->vcpus[i]); +} + +static int kvm_dev_release(struct inode *inode, struct file *filp) +{ + struct kvm *kvm = filp->private_data; + + kvm_free_vcpus(kvm); + kvm_free_physmem(kvm); + kfree(kvm); + return 0; +} + +static void inject_gp(struct kvm_vcpu *vcpu) +{ + kvm_arch_ops->inject_gp(vcpu, 0); +} + +static int pdptrs_have_reserved_bits_set(struct kvm_vcpu *vcpu, + unsigned long cr3) +{ + gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; + unsigned offset = (cr3 & (PAGE_SIZE-1)) >> 5; + int i; + u64 pdpte; + u64 *pdpt; + struct kvm_memory_slot *memslot; + + spin_lock(&vcpu->kvm->lock); + memslot = gfn_to_memslot(vcpu->kvm, pdpt_gfn); + /* FIXME: !memslot - emulate? 0xff? */ + pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0); + + for (i = 0; i < 4; ++i) { + pdpte = pdpt[offset + i]; + if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) + break; + } + + kunmap_atomic(pdpt, KM_USER0); + spin_unlock(&vcpu->kvm->lock); + + return i != 4; +} + +void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (cr0 & CR0_RESEVED_BITS) { + printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", + cr0, vcpu->cr0); + inject_gp(vcpu); + return; + } + + if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) { + printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); + inject_gp(vcpu); + return; + } + + if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) { + printk(KERN_DEBUG "set_cr0: #GP, set PG flag " + "and a clear PE flag\n"); + inject_gp(vcpu); + return; + } + + if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) { +#ifdef __x86_64__ + if ((vcpu->shadow_efer & EFER_LME)) { + int cs_db, cs_l; + + if (!is_pae(vcpu)) { + printk(KERN_DEBUG "set_cr0: #GP, start paging " + "in long mode while PAE is disabled\n"); + inject_gp(vcpu); + return; + } + kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + if (cs_l) { + printk(KERN_DEBUG "set_cr0: #GP, start paging " + "in long mode while CS.L == 1\n"); + inject_gp(vcpu); + return; + + } + } else +#endif + if (is_pae(vcpu) && + pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) { + printk(KERN_DEBUG "set_cr0: #GP, pdptrs " + "reserved bits\n"); + inject_gp(vcpu); + return; + } + + } + + kvm_arch_ops->set_cr0(vcpu, cr0); + vcpu->cr0 = cr0; + + spin_lock(&vcpu->kvm->lock); + kvm_mmu_reset_context(vcpu); + spin_unlock(&vcpu->kvm->lock); + return; +} +EXPORT_SYMBOL_GPL(set_cr0); + +void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) +{ + set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f)); +} +EXPORT_SYMBOL_GPL(lmsw); + +void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (cr4 & CR4_RESEVED_BITS) { + printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); + inject_gp(vcpu); + return; + } + + if (kvm_arch_ops->is_long_mode(vcpu)) { + if (!(cr4 & CR4_PAE_MASK)) { + printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " + "in long mode\n"); + inject_gp(vcpu); + return; + } + } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK) + && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) { + printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); + inject_gp(vcpu); + } + + if (cr4 & CR4_VMXE_MASK) { + printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); + inject_gp(vcpu); + return; + } + kvm_arch_ops->set_cr4(vcpu, cr4); + spin_lock(&vcpu->kvm->lock); + kvm_mmu_reset_context(vcpu); + spin_unlock(&vcpu->kvm->lock); +} +EXPORT_SYMBOL_GPL(set_cr4); + +void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + if (kvm_arch_ops->is_long_mode(vcpu)) { + if ( cr3 & CR3_L_MODE_RESEVED_BITS) { + printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); + inject_gp(vcpu); + return; + } + } else { + if (cr3 & CR3_RESEVED_BITS) { + printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); + inject_gp(vcpu); + return; + } + if (is_paging(vcpu) && is_pae(vcpu) && + pdptrs_have_reserved_bits_set(vcpu, cr3)) { + printk(KERN_DEBUG "set_cr3: #GP, pdptrs " + "reserved bits\n"); + inject_gp(vcpu); + return; + } + } + + vcpu->cr3 = cr3; + spin_lock(&vcpu->kvm->lock); + vcpu->mmu.new_cr3(vcpu); + spin_unlock(&vcpu->kvm->lock); +} +EXPORT_SYMBOL_GPL(set_cr3); + +void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +{ + if ( cr8 & CR8_RESEVED_BITS) { + printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); + inject_gp(vcpu); + return; + } + vcpu->cr8 = cr8; +} +EXPORT_SYMBOL_GPL(set_cr8); + +void fx_init(struct kvm_vcpu *vcpu) +{ + struct __attribute__ ((__packed__)) fx_image_s { + u16 control; //fcw + u16 status; //fsw + u16 tag; // ftw + u16 opcode; //fop + u64 ip; // fpu ip + u64 operand;// fpu dp + u32 mxcsr; + u32 mxcsr_mask; + + } *fx_image; + + fx_save(vcpu->host_fx_image); + fpu_init(); + fx_save(vcpu->guest_fx_image); + fx_restore(vcpu->host_fx_image); + + fx_image = (struct fx_image_s *)vcpu->guest_fx_image; + fx_image->mxcsr = 0x1f80; + memset(vcpu->guest_fx_image + sizeof(struct fx_image_s), + 0, FX_IMAGE_SIZE - sizeof(struct fx_image_s)); +} +EXPORT_SYMBOL_GPL(fx_init); + +/* + * Creates some virtual cpus. Good luck creating more than one. + */ +static int kvm_dev_ioctl_create_vcpu(struct kvm *kvm, int n) +{ + int r; + struct kvm_vcpu *vcpu; + + r = -EINVAL; + if (n < 0 || n >= KVM_MAX_VCPUS) + goto out; + + vcpu = &kvm->vcpus[n]; + + mutex_lock(&vcpu->mutex); + + if (vcpu->vmcs) { + mutex_unlock(&vcpu->mutex); + return -EEXIST; + } + + vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf, + FX_IMAGE_ALIGN); + vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE; + + vcpu->cpu = -1; /* First load will set up TR */ + vcpu->kvm = kvm; + r = kvm_arch_ops->vcpu_create(vcpu); + if (r < 0) + goto out_free_vcpus; + + kvm_arch_ops->vcpu_load(vcpu); + + r = kvm_arch_ops->vcpu_setup(vcpu); + if (r >= 0) + r = kvm_mmu_init(vcpu); + + vcpu_put(vcpu); + + if (r < 0) + goto out_free_vcpus; + + return 0; + +out_free_vcpus: + kvm_free_vcpu(vcpu); + mutex_unlock(&vcpu->mutex); +out: + return r; +} + +/* + * Allocate some memory and give it an address in the guest physical address + * space. + * + * Discontiguous memory is allowed, mostly for framebuffers. + */ +static int kvm_dev_ioctl_set_memory_region(struct kvm *kvm, + struct kvm_memory_region *mem) +{ + int r; + gfn_t base_gfn; + unsigned long npages; + unsigned long i; + struct kvm_memory_slot *memslot; + struct kvm_memory_slot old, new; + int memory_config_version; + + r = -EINVAL; + /* General sanity checks */ + if (mem->memory_size & (PAGE_SIZE - 1)) + goto out; + if (mem->guest_phys_addr & (PAGE_SIZE - 1)) + goto out; + if (mem->slot >= KVM_MEMORY_SLOTS) + goto out; + if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) + goto out; + + memslot = &kvm->memslots[mem->slot]; + base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; + npages = mem->memory_size >> PAGE_SHIFT; + + if (!npages) + mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; + +raced: + spin_lock(&kvm->lock); + + memory_config_version = kvm->memory_config_version; + new = old = *memslot; + + new.base_gfn = base_gfn; + new.npages = npages; + new.flags = mem->flags; + + /* Disallow changing a memory slot's size. */ + r = -EINVAL; + if (npages && old.npages && npages != old.npages) + goto out_unlock; + + /* Check for overlaps */ + r = -EEXIST; + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *s = &kvm->memslots[i]; + + if (s == memslot) + continue; + if (!((base_gfn + npages <= s->base_gfn) || + (base_gfn >= s->base_gfn + s->npages))) + goto out_unlock; + } + /* + * Do memory allocations outside lock. memory_config_version will + * detect any races. + */ + spin_unlock(&kvm->lock); + + /* Deallocate if slot is being removed */ + if (!npages) + new.phys_mem = 0; + + /* Free page dirty bitmap if unneeded */ + if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) + new.dirty_bitmap = 0; + + r = -ENOMEM; + + /* Allocate if a slot is being created */ + if (npages && !new.phys_mem) { + new.phys_mem = vmalloc(npages * sizeof(struct page *)); + + if (!new.phys_mem) + goto out_free; + + memset(new.phys_mem, 0, npages * sizeof(struct page *)); + for (i = 0; i < npages; ++i) { + new.phys_mem[i] = alloc_page(GFP_HIGHUSER + | __GFP_ZERO); + if (!new.phys_mem[i]) + goto out_free; + } + } + + /* Allocate page dirty bitmap if needed */ + if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { + unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; + + new.dirty_bitmap = vmalloc(dirty_bytes); + if (!new.dirty_bitmap) + goto out_free; + memset(new.dirty_bitmap, 0, dirty_bytes); + } + + spin_lock(&kvm->lock); + + if (memory_config_version != kvm->memory_config_version) { + spin_unlock(&kvm->lock); + kvm_free_physmem_slot(&new, &old); + goto raced; + } + + r = -EAGAIN; + if (kvm->busy) + goto out_unlock; + + if (mem->slot >= kvm->nmemslots) + kvm->nmemslots = mem->slot + 1; + + *memslot = new; + ++kvm->memory_config_version; + + spin_unlock(&kvm->lock); + + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + struct kvm_vcpu *vcpu; + + vcpu = vcpu_load(kvm, i); + if (!vcpu) + continue; + kvm_mmu_reset_context(vcpu); + vcpu_put(vcpu); + } + + kvm_free_physmem_slot(&old, &new); + return 0; + +out_unlock: + spin_unlock(&kvm->lock); +out_free: + kvm_free_physmem_slot(&new, &old); +out: + return r; +} + +/* + * Get (and clear) the dirty memory log for a memory slot. + */ +static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log) +{ + struct kvm_memory_slot *memslot; + int r, i; + int n; + unsigned long any = 0; + + spin_lock(&kvm->lock); + + /* + * Prevent changes to guest memory configuration even while the lock + * is not taken. + */ + ++kvm->busy; + spin_unlock(&kvm->lock); + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; + + memslot = &kvm->memslots[log->slot]; + r = -ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = ALIGN(memslot->npages, 8) / 8; + + for (i = 0; !any && i < n; ++i) + any = memslot->dirty_bitmap[i]; + + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) + goto out; + + + if (any) { + spin_lock(&kvm->lock); + kvm_mmu_slot_remove_write_access(kvm, log->slot); + spin_unlock(&kvm->lock); + memset(memslot->dirty_bitmap, 0, n); + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + struct kvm_vcpu *vcpu = vcpu_load(kvm, i); + + if (!vcpu) + continue; + kvm_arch_ops->tlb_flush(vcpu); + vcpu_put(vcpu); + } + } + + r = 0; + +out: + spin_lock(&kvm->lock); + --kvm->busy; + spin_unlock(&kvm->lock); + return r; +} + +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +{ + int i; + + for (i = 0; i < kvm->nmemslots; ++i) { + struct kvm_memory_slot *memslot = &kvm->memslots[i]; + + if (gfn >= memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) + return memslot; + } + return 0; +} +EXPORT_SYMBOL_GPL(gfn_to_memslot); + +void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_memory_slot *memslot = 0; + unsigned long rel_gfn; + + for (i = 0; i < kvm->nmemslots; ++i) { + memslot = &kvm->memslots[i]; + + if (gfn >= memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) { + + if (!memslot || !memslot->dirty_bitmap) + return; + + rel_gfn = gfn - memslot->base_gfn; + + /* avoid RMW */ + if (!test_bit(rel_gfn, memslot->dirty_bitmap)) + set_bit(rel_gfn, memslot->dirty_bitmap); + return; + } + } +} + +static int emulator_read_std(unsigned long addr, + unsigned long *val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt->vcpu; + void *data = val; + + while (bytes) { + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); + unsigned offset = addr & (PAGE_SIZE-1); + unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); + unsigned long pfn; + struct kvm_memory_slot *memslot; + void *page; + + if (gpa == UNMAPPED_GVA) + return X86EMUL_PROPAGATE_FAULT; + pfn = gpa >> PAGE_SHIFT; + memslot = gfn_to_memslot(vcpu->kvm, pfn); + if (!memslot) + return X86EMUL_UNHANDLEABLE; + page = kmap_atomic(gfn_to_page(memslot, pfn), KM_USER0); + + memcpy(data, page + offset, tocopy); + + kunmap_atomic(page, KM_USER0); + + bytes -= tocopy; + data += tocopy; + addr += tocopy; + } + + return X86EMUL_CONTINUE; +} + +static int emulator_write_std(unsigned long addr, + unsigned long val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + printk(KERN_ERR "emulator_write_std: addr %lx n %d\n", + addr, bytes); + return X86EMUL_UNHANDLEABLE; +} + +static int emulator_read_emulated(unsigned long addr, + unsigned long *val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt->vcpu; + + if (vcpu->mmio_read_completed) { + memcpy(val, vcpu->mmio_data, bytes); + vcpu->mmio_read_completed = 0; + return X86EMUL_CONTINUE; + } else if (emulator_read_std(addr, val, bytes, ctxt) + == X86EMUL_CONTINUE) + return X86EMUL_CONTINUE; + else { + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); + if (gpa == UNMAPPED_GVA) + return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT; + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 0; + + return X86EMUL_UNHANDLEABLE; + } +} + +static int emulator_write_emulated(unsigned long addr, + unsigned long val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt->vcpu; + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); + + if (gpa == UNMAPPED_GVA) + return X86EMUL_PROPAGATE_FAULT; + + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 1; + memcpy(vcpu->mmio_data, &val, bytes); + + return X86EMUL_CONTINUE; +} + +static int emulator_cmpxchg_emulated(unsigned long addr, + unsigned long old, + unsigned long new, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + static int reported; + + if (!reported) { + reported = 1; + printk(KERN_WARNING "kvm: emulating exchange as write\n"); + } + return emulator_write_emulated(addr, new, bytes, ctxt); +} + +static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + return kvm_arch_ops->get_segment_base(vcpu, seg); +} + +int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) +{ + spin_lock(&vcpu->kvm->lock); + vcpu->mmu.inval_page(vcpu, address); + spin_unlock(&vcpu->kvm->lock); + kvm_arch_ops->invlpg(vcpu, address); + return X86EMUL_CONTINUE; +} + +int emulate_clts(struct kvm_vcpu *vcpu) +{ + unsigned long cr0 = vcpu->cr0; + + cr0 &= ~CR0_TS_MASK; + kvm_arch_ops->set_cr0(vcpu, cr0); + return X86EMUL_CONTINUE; +} + +int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest) +{ + struct kvm_vcpu *vcpu = ctxt->vcpu; + + switch (dr) { + case 0 ... 3: + *dest = kvm_arch_ops->get_dr(vcpu, dr); + return X86EMUL_CONTINUE; + default: + printk(KERN_DEBUG "%s: unexpected dr %u\n", + __FUNCTION__, dr); + return X86EMUL_UNHANDLEABLE; + } +} + +int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) +{ + unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; + int exception; + + kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); + if (exception) { + /* FIXME: better handling */ + return X86EMUL_UNHANDLEABLE; + } + return X86EMUL_CONTINUE; +} + +static void report_emulation_failure(struct x86_emulate_ctxt *ctxt) +{ + static int reported; + u8 opcodes[4]; + unsigned long rip = ctxt->vcpu->rip; + unsigned long rip_linear; + + rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS); + + if (reported) + return; + + emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt); + + printk(KERN_ERR "emulation failed but !mmio_needed?" + " rip %lx %02x %02x %02x %02x\n", + rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); + reported = 1; +} + +struct x86_emulate_ops emulate_ops = { + .read_std = emulator_read_std, + .write_std = emulator_write_std, + .read_emulated = emulator_read_emulated, + .write_emulated = emulator_write_emulated, + .cmpxchg_emulated = emulator_cmpxchg_emulated, +}; + +int emulate_instruction(struct kvm_vcpu *vcpu, + struct kvm_run *run, + unsigned long cr2, + u16 error_code) +{ + struct x86_emulate_ctxt emulate_ctxt; + int r; + int cs_db, cs_l; + + kvm_arch_ops->cache_regs(vcpu); + + kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + + emulate_ctxt.vcpu = vcpu; + emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu); + emulate_ctxt.cr2 = cr2; + emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM) + ? X86EMUL_MODE_REAL : cs_l + ? X86EMUL_MODE_PROT64 : cs_db + ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + + if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) { + emulate_ctxt.cs_base = 0; + emulate_ctxt.ds_base = 0; + emulate_ctxt.es_base = 0; + emulate_ctxt.ss_base = 0; + } else { + emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS); + emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS); + emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES); + emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS); + } + + emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS); + emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS); + + vcpu->mmio_is_write = 0; + r = x86_emulate_memop(&emulate_ctxt, &emulate_ops); + + if ((r || vcpu->mmio_is_write) && run) { + run->mmio.phys_addr = vcpu->mmio_phys_addr; + memcpy(run->mmio.data, vcpu->mmio_data, 8); + run->mmio.len = vcpu->mmio_size; + run->mmio.is_write = vcpu->mmio_is_write; + } + + if (r) { + if (!vcpu->mmio_needed) { + report_emulation_failure(&emulate_ctxt); + return EMULATE_FAIL; + } + return EMULATE_DO_MMIO; + } + + kvm_arch_ops->decache_regs(vcpu); + kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags); + + if (vcpu->mmio_is_write) + return EMULATE_DO_MMIO; + + return EMULATE_DONE; +} +EXPORT_SYMBOL_GPL(emulate_instruction); + +static u64 mk_cr_64(u64 curr_cr, u32 new_val) +{ + return (curr_cr & ~((1ULL << 32) - 1)) | new_val; +} + +void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) +{ + struct descriptor_table dt = { limit, base }; + + kvm_arch_ops->set_gdt(vcpu, &dt); +} + +void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) +{ + struct descriptor_table dt = { limit, base }; + + kvm_arch_ops->set_idt(vcpu, &dt); +} + +void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, + unsigned long *rflags) +{ + lmsw(vcpu, msw); + *rflags = kvm_arch_ops->get_rflags(vcpu); +} + +unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) +{ + switch (cr) { + case 0: + return vcpu->cr0; + case 2: + return vcpu->cr2; + case 3: + return vcpu->cr3; + case 4: + return vcpu->cr4; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); + return 0; + } +} + +void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, + unsigned long *rflags) +{ + switch (cr) { + case 0: + set_cr0(vcpu, mk_cr_64(vcpu->cr0, val)); + *rflags = kvm_arch_ops->get_rflags(vcpu); + break; + case 2: + vcpu->cr2 = val; + break; + case 3: + set_cr3(vcpu, val); + break; + case 4: + set_cr4(vcpu, mk_cr_64(vcpu->cr4, val)); + break; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); + } +} + +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +{ + return kvm_arch_ops->get_msr(vcpu, msr_index, pdata); +} + +#ifdef __x86_64__ + +void set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + struct vmx_msr_entry *msr; + + if (efer & EFER_RESERVED_BITS) { + printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", + efer); + inject_gp(vcpu); + return; + } + + if (is_paging(vcpu) + && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) { + printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); + inject_gp(vcpu); + return; + } + + efer &= ~EFER_LMA; + efer |= vcpu->shadow_efer & EFER_LMA; + + vcpu->shadow_efer = efer; + + msr = find_msr_entry(vcpu, MSR_EFER); + + if (!(efer & EFER_LMA)) + efer &= ~EFER_LME; + msr->data = efer; +} +EXPORT_SYMBOL_GPL(set_efer); + +#endif + +/* + * Writes msr value into into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +{ + return kvm_arch_ops->set_msr(vcpu, msr_index, data); +} + +void kvm_resched(struct kvm_vcpu *vcpu) +{ + vcpu_put(vcpu); + cond_resched(); + /* Cannot fail - no vcpu unplug yet. */ + vcpu_load(vcpu->kvm, vcpu_slot(vcpu)); +} +EXPORT_SYMBOL_GPL(kvm_resched); + +void load_msrs(struct vmx_msr_entry *e, int n) +{ + int i; + + for (i = 0; i < n; ++i) + wrmsrl(e[i].index, e[i].data); +} +EXPORT_SYMBOL_GPL(load_msrs); + +void save_msrs(struct vmx_msr_entry *e, int n) +{ + int i; + + for (i = 0; i < n; ++i) + rdmsrl(e[i].index, e[i].data); +} +EXPORT_SYMBOL_GPL(save_msrs); + +static int kvm_dev_ioctl_run(struct kvm *kvm, struct kvm_run *kvm_run) +{ + struct kvm_vcpu *vcpu; + int r; + + if (kvm_run->vcpu < 0 || kvm_run->vcpu >= KVM_MAX_VCPUS) + return -EINVAL; + + vcpu = vcpu_load(kvm, kvm_run->vcpu); + if (!vcpu) + return -ENOENT; + + if (kvm_run->emulated) { + kvm_arch_ops->skip_emulated_instruction(vcpu); + kvm_run->emulated = 0; + } + + if (kvm_run->mmio_completed) { + memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); + vcpu->mmio_read_completed = 1; + } + + vcpu->mmio_needed = 0; + + r = kvm_arch_ops->run(vcpu, kvm_run); + + vcpu_put(vcpu); + return r; +} + +static int kvm_dev_ioctl_get_regs(struct kvm *kvm, struct kvm_regs *regs) +{ + struct kvm_vcpu *vcpu; + + if (regs->vcpu < 0 || regs->vcpu >= KVM_MAX_VCPUS) + return -EINVAL; + + vcpu = vcpu_load(kvm, regs->vcpu); + if (!vcpu) + return -ENOENT; + + kvm_arch_ops->cache_regs(vcpu); + + regs->rax = vcpu->regs[VCPU_REGS_RAX]; + regs->rbx = vcpu->regs[VCPU_REGS_RBX]; + regs->rcx = vcpu->regs[VCPU_REGS_RCX]; + regs->rdx = vcpu->regs[VCPU_REGS_RDX]; + regs->rsi = vcpu->regs[VCPU_REGS_RSI]; + regs->rdi = vcpu->regs[VCPU_REGS_RDI]; + regs->rsp = vcpu->regs[VCPU_REGS_RSP]; + regs->rbp = vcpu->regs[VCPU_REGS_RBP]; +#ifdef __x86_64__ + regs->r8 = vcpu->regs[VCPU_REGS_R8]; + regs->r9 = vcpu->regs[VCPU_REGS_R9]; + regs->r10 = vcpu->regs[VCPU_REGS_R10]; + regs->r11 = vcpu->regs[VCPU_REGS_R11]; + regs->r12 = vcpu->regs[VCPU_REGS_R12]; + regs->r13 = vcpu->regs[VCPU_REGS_R13]; + regs->r14 = vcpu->regs[VCPU_REGS_R14]; + regs->r15 = vcpu->regs[VCPU_REGS_R15]; +#endif + + regs->rip = vcpu->rip; + regs->rflags = kvm_arch_ops->get_rflags(vcpu); + + /* + * Don't leak debug flags in case they were set for guest debugging + */ + if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep) + regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + + vcpu_put(vcpu); + + return 0; +} + +static int kvm_dev_ioctl_set_regs(struct kvm *kvm, struct kvm_regs *regs) +{ + struct kvm_vcpu *vcpu; + + if (regs->vcpu < 0 || regs->vcpu >= KVM_MAX_VCPUS) + return -EINVAL; + + vcpu = vcpu_load(kvm, regs->vcpu); + if (!vcpu) + return -ENOENT; + + vcpu->regs[VCPU_REGS_RAX] = regs->rax; + vcpu->regs[VCPU_REGS_RBX] = regs->rbx; + vcpu->regs[VCPU_REGS_RCX] = regs->rcx; + vcpu->regs[VCPU_REGS_RDX] = regs->rdx; + vcpu->regs[VCPU_REGS_RSI] = regs->rsi; + vcpu->regs[VCPU_REGS_RDI] = regs->rdi; + vcpu->regs[VCPU_REGS_RSP] = regs->rsp; + vcpu->regs[VCPU_REGS_RBP] = regs->rbp; +#ifdef __x86_64__ + vcpu->regs[VCPU_REGS_R8] = regs->r8; + vcpu->regs[VCPU_REGS_R9] = regs->r9; + vcpu->regs[VCPU_REGS_R10] = regs->r10; + vcpu->regs[VCPU_REGS_R11] = regs->r11; + vcpu->regs[VCPU_REGS_R12] = regs->r12; + vcpu->regs[VCPU_REGS_R13] = regs->r13; + vcpu->regs[VCPU_REGS_R14] = regs->r14; + vcpu->regs[VCPU_REGS_R15] = regs->r15; +#endif + + vcpu->rip = regs->rip; + kvm_arch_ops->set_rflags(vcpu, regs->rflags); + + kvm_arch_ops->decache_regs(vcpu); + + vcpu_put(vcpu); + + return 0; +} + +static void get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + return kvm_arch_ops->get_segment(vcpu, var, seg); +} + +static int kvm_dev_ioctl_get_sregs(struct kvm *kvm, struct kvm_sregs *sregs) +{ + struct kvm_vcpu *vcpu; + struct descriptor_table dt; + + if (sregs->vcpu < 0 || sregs->vcpu >= KVM_MAX_VCPUS) + return -EINVAL; + vcpu = vcpu_load(kvm, sregs->vcpu); + if (!vcpu) + return -ENOENT; + + get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + get_segment(vcpu, &sregs->es, VCPU_SREG_ES); + get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + + kvm_arch_ops->get_idt(vcpu, &dt); + sregs->idt.limit = dt.limit; + sregs->idt.base = dt.base; + kvm_arch_ops->get_gdt(vcpu, &dt); + sregs->gdt.limit = dt.limit; + sregs->gdt.base = dt.base; + + sregs->cr0 = vcpu->cr0; + sregs->cr2 = vcpu->cr2; + sregs->cr3 = vcpu->cr3; + sregs->cr4 = vcpu->cr4; + sregs->cr8 = vcpu->cr8; + sregs->efer = vcpu->shadow_efer; + sregs->apic_base = vcpu->apic_base; + + memcpy(sregs->interrupt_bitmap, vcpu->irq_pending, + sizeof sregs->interrupt_bitmap); + + vcpu_put(vcpu); + + return 0; +} + +static void set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + return kvm_arch_ops->set_segment(vcpu, var, seg); +} + +static int kvm_dev_ioctl_set_sregs(struct kvm *kvm, struct kvm_sregs *sregs) +{ + struct kvm_vcpu *vcpu; + int mmu_reset_needed = 0; + int i; + struct descriptor_table dt; + + if (sregs->vcpu < 0 || sregs->vcpu >= KVM_MAX_VCPUS) + return -EINVAL; + vcpu = vcpu_load(kvm, sregs->vcpu); + if (!vcpu) + return -ENOENT; + + set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + set_segment(vcpu, &sregs->es, VCPU_SREG_ES); + set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + + dt.limit = sregs->idt.limit; + dt.base = sregs->idt.base; + kvm_arch_ops->set_idt(vcpu, &dt); + dt.limit = sregs->gdt.limit; + dt.base = sregs->gdt.base; + kvm_arch_ops->set_gdt(vcpu, &dt); + + vcpu->cr2 = sregs->cr2; + mmu_reset_needed |= vcpu->cr3 != sregs->cr3; + vcpu->cr3 = sregs->cr3; + + vcpu->cr8 = sregs->cr8; + + mmu_reset_needed |= vcpu->shadow_efer != sregs->efer; +#ifdef __x86_64__ + kvm_arch_ops->set_efer(vcpu, sregs->efer); +#endif + vcpu->apic_base = sregs->apic_base; + + mmu_reset_needed |= vcpu->cr0 != sregs->cr0; + kvm_arch_ops->set_cr0_no_modeswitch(vcpu, sregs->cr0); + + mmu_reset_needed |= vcpu->cr4 != sregs->cr4; + kvm_arch_ops->set_cr4(vcpu, sregs->cr4); + + if (mmu_reset_needed) + kvm_mmu_reset_context(vcpu); + + memcpy(vcpu->irq_pending, sregs->interrupt_bitmap, + sizeof vcpu->irq_pending); + vcpu->irq_summary = 0; + for (i = 0; i < NR_IRQ_WORDS; ++i) + if (vcpu->irq_pending[i]) + __set_bit(i, &vcpu->irq_summary); + + vcpu_put(vcpu); + + return 0; +} + +/* + * List of msr numbers which we expose to userspace through KVM_GET_MSRS + * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. + */ +static u32 msrs_to_save[] = { + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_K6_STAR, +#ifdef __x86_64__ + MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, +#endif + MSR_IA32_TIME_STAMP_COUNTER, +}; + + +/* + * Adapt set_msr() to msr_io()'s calling convention + */ +static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + return set_msr(vcpu, index, *data); +} + +/* + * Read or write a bunch of msrs. All parameters are kernel addresses. + * + * @return number of msrs set successfully. + */ +static int __msr_io(struct kvm *kvm, struct kvm_msrs *msrs, + struct kvm_msr_entry *entries, + int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, u64 *data)) +{ + struct kvm_vcpu *vcpu; + int i; + + if (msrs->vcpu < 0 || msrs->vcpu >= KVM_MAX_VCPUS) + return -EINVAL; + + vcpu = vcpu_load(kvm, msrs->vcpu); + if (!vcpu) + return -ENOENT; + + for (i = 0; i < msrs->nmsrs; ++i) + if (do_msr(vcpu, entries[i].index, &entries[i].data)) + break; + + vcpu_put(vcpu); + + return i; +} + +/* + * Read or write a bunch of msrs. Parameters are user addresses. + * + * @return number of msrs set successfully. + */ +static int msr_io(struct kvm *kvm, struct kvm_msrs __user *user_msrs, + int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, u64 *data), + int writeback) +{ + struct kvm_msrs msrs; + struct kvm_msr_entry *entries; + int r, n; + unsigned size; + + r = -EFAULT; + if (copy_from_user(&msrs, user_msrs, sizeof msrs)) + goto out; + + r = -E2BIG; + if (msrs.nmsrs >= MAX_IO_MSRS) + goto out; + + r = -ENOMEM; + size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; + entries = vmalloc(size); + if (!entries) + goto out; + + r = -EFAULT; + if (copy_from_user(entries, user_msrs->entries, size)) + goto out_free; + + r = n = __msr_io(kvm, &msrs, entries, do_msr); + if (r < 0) + goto out_free; + + r = -EFAULT; + if (writeback && copy_to_user(user_msrs->entries, entries, size)) + goto out_free; + + r = n; + +out_free: + vfree(entries); +out: + return r; +} + +/* + * Translate a guest virtual address to a guest physical address. + */ +static int kvm_dev_ioctl_translate(struct kvm *kvm, struct kvm_translation *tr) +{ + unsigned long vaddr = tr->linear_address; + struct kvm_vcpu *vcpu; + gpa_t gpa; + + vcpu = vcpu_load(kvm, tr->vcpu); + if (!vcpu) + return -ENOENT; + spin_lock(&kvm->lock); + gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr); + tr->physical_address = gpa; + tr->valid = gpa != UNMAPPED_GVA; + tr->writeable = 1; + tr->usermode = 0; + spin_unlock(&kvm->lock); + vcpu_put(vcpu); + + return 0; +} + +static int kvm_dev_ioctl_interrupt(struct kvm *kvm, struct kvm_interrupt *irq) +{ + struct kvm_vcpu *vcpu; + + if (irq->vcpu < 0 || irq->vcpu >= KVM_MAX_VCPUS) + return -EINVAL; + if (irq->irq < 0 || irq->irq >= 256) + return -EINVAL; + vcpu = vcpu_load(kvm, irq->vcpu); + if (!vcpu) + return -ENOENT; + + set_bit(irq->irq, vcpu->irq_pending); + set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary); + + vcpu_put(vcpu); + + return 0; +} + +static int kvm_dev_ioctl_debug_guest(struct kvm *kvm, + struct kvm_debug_guest *dbg) +{ + struct kvm_vcpu *vcpu; + int r; + + if (dbg->vcpu < 0 || dbg->vcpu >= KVM_MAX_VCPUS) + return -EINVAL; + vcpu = vcpu_load(kvm, dbg->vcpu); + if (!vcpu) + return -ENOENT; + + r = kvm_arch_ops->set_guest_debug(vcpu, dbg); + + vcpu_put(vcpu); + + return r; +} + +static long kvm_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm *kvm = filp->private_data; + int r = -EINVAL; + + switch (ioctl) { + case KVM_CREATE_VCPU: { + r = kvm_dev_ioctl_create_vcpu(kvm, arg); + if (r) + goto out; + break; + } + case KVM_RUN: { + struct kvm_run kvm_run; + + r = -EFAULT; + if (copy_from_user(&kvm_run, (void *)arg, sizeof kvm_run)) + goto out; + r = kvm_dev_ioctl_run(kvm, &kvm_run); + if (r < 0) + goto out; + r = -EFAULT; + if (copy_to_user((void *)arg, &kvm_run, sizeof kvm_run)) + goto out; + r = 0; + break; + } + case KVM_GET_REGS: { + struct kvm_regs kvm_regs; + + r = -EFAULT; + if (copy_from_user(&kvm_regs, (void *)arg, sizeof kvm_regs)) + goto out; + r = kvm_dev_ioctl_get_regs(kvm, &kvm_regs); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user((void *)arg, &kvm_regs, sizeof kvm_regs)) + goto out; + r = 0; + break; + } + case KVM_SET_REGS: { + struct kvm_regs kvm_regs; + + r = -EFAULT; + if (copy_from_user(&kvm_regs, (void *)arg, sizeof kvm_regs)) + goto out; + r = kvm_dev_ioctl_set_regs(kvm, &kvm_regs); + if (r) + goto out; + r = 0; + break; + } + case KVM_GET_SREGS: { + struct kvm_sregs kvm_sregs; + + r = -EFAULT; + if (copy_from_user(&kvm_sregs, (void *)arg, sizeof kvm_sregs)) + goto out; + r = kvm_dev_ioctl_get_sregs(kvm, &kvm_sregs); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user((void *)arg, &kvm_sregs, sizeof kvm_sregs)) + goto out; + r = 0; + break; + } + case KVM_SET_SREGS: { + struct kvm_sregs kvm_sregs; + + r = -EFAULT; + if (copy_from_user(&kvm_sregs, (void *)arg, sizeof kvm_sregs)) + goto out; + r = kvm_dev_ioctl_set_sregs(kvm, &kvm_sregs); + if (r) + goto out; + r = 0; + break; + } + case KVM_TRANSLATE: { + struct kvm_translation tr; + + r = -EFAULT; + if (copy_from_user(&tr, (void *)arg, sizeof tr)) + goto out; + r = kvm_dev_ioctl_translate(kvm, &tr); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user((void *)arg, &tr, sizeof tr)) + goto out; + r = 0; + break; + } + case KVM_INTERRUPT: { + struct kvm_interrupt irq; + + r = -EFAULT; + if (copy_from_user(&irq, (void *)arg, sizeof irq)) + goto out; + r = kvm_dev_ioctl_interrupt(kvm, &irq); + if (r) + goto out; + r = 0; + break; + } + case KVM_DEBUG_GUEST: { + struct kvm_debug_guest dbg; + + r = -EFAULT; + if (copy_from_user(&dbg, (void *)arg, sizeof dbg)) + goto out; + r = kvm_dev_ioctl_debug_guest(kvm, &dbg); + if (r) + goto out; + r = 0; + break; + } + case KVM_SET_MEMORY_REGION: { + struct kvm_memory_region kvm_mem; + + r = -EFAULT; + if (copy_from_user(&kvm_mem, (void *)arg, sizeof kvm_mem)) + goto out; + r = kvm_dev_ioctl_set_memory_region(kvm, &kvm_mem); + if (r) + goto out; + break; + } + case KVM_GET_DIRTY_LOG: { + struct kvm_dirty_log log; + + r = -EFAULT; + if (copy_from_user(&log, (void *)arg, sizeof log)) + goto out; + r = kvm_dev_ioctl_get_dirty_log(kvm, &log); + if (r) + goto out; + break; + } + case KVM_GET_MSRS: + r = msr_io(kvm, (void __user *)arg, get_msr, 1); + break; + case KVM_SET_MSRS: + r = msr_io(kvm, (void __user *)arg, do_set_msr, 0); + break; + case KVM_GET_MSR_INDEX_LIST: { + struct kvm_msr_list __user *user_msr_list = (void __user *)arg; + struct kvm_msr_list msr_list; + unsigned n; + + r = -EFAULT; + if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) + goto out; + n = msr_list.nmsrs; + msr_list.nmsrs = ARRAY_SIZE(msrs_to_save); + if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) + goto out; + r = -E2BIG; + if (n < ARRAY_SIZE(msrs_to_save)) + goto out; + r = -EFAULT; + if (copy_to_user(user_msr_list->indices, &msrs_to_save, + sizeof msrs_to_save)) + goto out; + r = 0; + } + default: + ; + } +out: + return r; +} + +static struct page *kvm_dev_nopage(struct vm_area_struct *vma, + unsigned long address, + int *type) +{ + struct kvm *kvm = vma->vm_file->private_data; + unsigned long pgoff; + struct kvm_memory_slot *slot; + struct page *page; + + *type = VM_FAULT_MINOR; + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + slot = gfn_to_memslot(kvm, pgoff); + if (!slot) + return NOPAGE_SIGBUS; + page = gfn_to_page(slot, pgoff); + if (!page) + return NOPAGE_SIGBUS; + get_page(page); + return page; +} + +static struct vm_operations_struct kvm_dev_vm_ops = { + .nopage = kvm_dev_nopage, +}; + +static int kvm_dev_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_dev_vm_ops; + return 0; +} + +static struct file_operations kvm_chardev_ops = { + .open = kvm_dev_open, + .release = kvm_dev_release, + .unlocked_ioctl = kvm_dev_ioctl, + .compat_ioctl = kvm_dev_ioctl, + .mmap = kvm_dev_mmap, +}; + +static struct miscdevice kvm_dev = { + MISC_DYNAMIC_MINOR, + "kvm", + &kvm_chardev_ops, +}; + +static int kvm_reboot(struct notifier_block *notifier, unsigned long val, + void *v) +{ + if (val == SYS_RESTART) { + /* + * Some (well, at least mine) BIOSes hang on reboot if + * in vmx root mode. + */ + printk(KERN_INFO "kvm: exiting hardware virtualization\n"); + on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1); + } + return NOTIFY_OK; +} + +static struct notifier_block kvm_reboot_notifier = { + .notifier_call = kvm_reboot, + .priority = 0, +}; + +static __init void kvm_init_debug(void) +{ + struct kvm_stats_debugfs_item *p; + + debugfs_dir = debugfs_create_dir("kvm", 0); + for (p = debugfs_entries; p->name; ++p) + p->dentry = debugfs_create_u32(p->name, 0444, debugfs_dir, + p->data); +} + +static void kvm_exit_debug(void) +{ + struct kvm_stats_debugfs_item *p; + + for (p = debugfs_entries; p->name; ++p) + debugfs_remove(p->dentry); + debugfs_remove(debugfs_dir); +} + +hpa_t bad_page_address; + +int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) +{ + int r; + + kvm_arch_ops = ops; + + if (!kvm_arch_ops->cpu_has_kvm_support()) { + printk(KERN_ERR "kvm: no hardware support\n"); + return -EOPNOTSUPP; + } + if (kvm_arch_ops->disabled_by_bios()) { + printk(KERN_ERR "kvm: disabled by bios\n"); + return -EOPNOTSUPP; + } + + r = kvm_arch_ops->hardware_setup(); + if (r < 0) + return r; + + on_each_cpu(kvm_arch_ops->hardware_enable, 0, 0, 1); + register_reboot_notifier(&kvm_reboot_notifier); + + kvm_chardev_ops.owner = module; + + r = misc_register(&kvm_dev); + if (r) { + printk (KERN_ERR "kvm: misc device register failed\n"); + goto out_free; + } + + return r; + +out_free: + unregister_reboot_notifier(&kvm_reboot_notifier); + on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1); + kvm_arch_ops->hardware_unsetup(); + return r; +} + +void kvm_exit_arch(void) +{ + misc_deregister(&kvm_dev); + + unregister_reboot_notifier(&kvm_reboot_notifier); + on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1); + kvm_arch_ops->hardware_unsetup(); +} + +static __init int kvm_init(void) +{ + static struct page *bad_page; + int r = 0; + + kvm_init_debug(); + + if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) { + r = -ENOMEM; + goto out; + } + + bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT; + memset(__va(bad_page_address), 0, PAGE_SIZE); + + return r; + +out: + kvm_exit_debug(); + return r; +} + +static __exit void kvm_exit(void) +{ + kvm_exit_debug(); + __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT)); +} + +module_init(kvm_init) +module_exit(kvm_exit) + +EXPORT_SYMBOL_GPL(kvm_init_arch); +EXPORT_SYMBOL_GPL(kvm_exit_arch); diff --git a/drivers/kvm/kvm_svm.h b/drivers/kvm/kvm_svm.h new file mode 100644 index 000000000000..7d7f2aa10960 --- /dev/null +++ b/drivers/kvm/kvm_svm.h @@ -0,0 +1,44 @@ +#ifndef __KVM_SVM_H +#define __KVM_SVM_H + +#include +#include +#include + +#include "svm.h" +#include "kvm.h" + +static const u32 host_save_msrs[] = { +#ifdef __x86_64__ + MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, + MSR_FS_BASE, MSR_GS_BASE, +#endif + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_IA32_DEBUGCTLMSR, /*MSR_IA32_LASTBRANCHFROMIP, + MSR_IA32_LASTBRANCHTOIP, MSR_IA32_LASTINTFROMIP,MSR_IA32_LASTINTTOIP,*/ +}; + +#define NR_HOST_SAVE_MSRS (sizeof(host_save_msrs) / sizeof(*host_save_msrs)) +#define NUM_DB_REGS 4 + +struct vcpu_svm { + struct vmcb *vmcb; + unsigned long vmcb_pa; + struct svm_cpu_data *svm_data; + uint64_t asid_generation; + + unsigned long cr0; + unsigned long cr4; + unsigned long db_regs[NUM_DB_REGS]; + + u64 next_rip; + + u64 host_msrs[NR_HOST_SAVE_MSRS]; + unsigned long host_cr2; + unsigned long host_db_regs[NUM_DB_REGS]; + unsigned long host_dr6; + unsigned long host_dr7; +}; + +#endif + diff --git a/drivers/kvm/kvm_vmx.h b/drivers/kvm/kvm_vmx.h new file mode 100644 index 000000000000..87e12d2bfa16 --- /dev/null +++ b/drivers/kvm/kvm_vmx.h @@ -0,0 +1,14 @@ +#ifndef __KVM_VMX_H +#define __KVM_VMX_H + +#ifdef __x86_64__ +/* + * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt + * mechanism (cpu bug AA24) + */ +#define NR_BAD_MSRS 2 +#else +#define NR_BAD_MSRS 0 +#endif + +#endif diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c new file mode 100644 index 000000000000..4e29d9b7211c --- /dev/null +++ b/drivers/kvm/mmu.c @@ -0,0 +1,699 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * MMU support + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Yaniv Kamay + * Avi Kivity + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ +#include +#include +#include +#include +#include +#include + +#include "vmx.h" +#include "kvm.h" + +#define pgprintk(x...) do { } while (0) + +#define ASSERT(x) \ + if (!(x)) { \ + printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ + __FILE__, __LINE__, #x); \ + } + +#define PT64_ENT_PER_PAGE 512 +#define PT32_ENT_PER_PAGE 1024 + +#define PT_WRITABLE_SHIFT 1 + +#define PT_PRESENT_MASK (1ULL << 0) +#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) +#define PT_USER_MASK (1ULL << 2) +#define PT_PWT_MASK (1ULL << 3) +#define PT_PCD_MASK (1ULL << 4) +#define PT_ACCESSED_MASK (1ULL << 5) +#define PT_DIRTY_MASK (1ULL << 6) +#define PT_PAGE_SIZE_MASK (1ULL << 7) +#define PT_PAT_MASK (1ULL << 7) +#define PT_GLOBAL_MASK (1ULL << 8) +#define PT64_NX_MASK (1ULL << 63) + +#define PT_PAT_SHIFT 7 +#define PT_DIR_PAT_SHIFT 12 +#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) + +#define PT32_DIR_PSE36_SIZE 4 +#define PT32_DIR_PSE36_SHIFT 13 +#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) + + +#define PT32_PTE_COPY_MASK \ + (PT_PRESENT_MASK | PT_PWT_MASK | PT_PCD_MASK | \ + PT_ACCESSED_MASK | PT_DIRTY_MASK | PT_PAT_MASK | \ + PT_GLOBAL_MASK ) + +#define PT32_NON_PTE_COPY_MASK \ + (PT_PRESENT_MASK | PT_PWT_MASK | PT_PCD_MASK | \ + PT_ACCESSED_MASK | PT_DIRTY_MASK) + + +#define PT64_PTE_COPY_MASK \ + (PT64_NX_MASK | PT32_PTE_COPY_MASK) + +#define PT64_NON_PTE_COPY_MASK \ + (PT64_NX_MASK | PT32_NON_PTE_COPY_MASK) + + + +#define PT_FIRST_AVAIL_BITS_SHIFT 9 +#define PT64_SECOND_AVAIL_BITS_SHIFT 52 + +#define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) +#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) + +#define PT_SHADOW_WRITABLE_SHIFT (PT_FIRST_AVAIL_BITS_SHIFT + 1) +#define PT_SHADOW_WRITABLE_MASK (1ULL << PT_SHADOW_WRITABLE_SHIFT) + +#define PT_SHADOW_USER_SHIFT (PT_SHADOW_WRITABLE_SHIFT + 1) +#define PT_SHADOW_USER_MASK (1ULL << (PT_SHADOW_USER_SHIFT)) + +#define PT_SHADOW_BITS_OFFSET (PT_SHADOW_WRITABLE_SHIFT - PT_WRITABLE_SHIFT) + +#define VALID_PAGE(x) ((x) != INVALID_PAGE) + +#define PT64_LEVEL_BITS 9 + +#define PT64_LEVEL_SHIFT(level) \ + ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS ) + +#define PT64_LEVEL_MASK(level) \ + (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) + +#define PT64_INDEX(address, level)\ + (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) + + +#define PT32_LEVEL_BITS 10 + +#define PT32_LEVEL_SHIFT(level) \ + ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS ) + +#define PT32_LEVEL_MASK(level) \ + (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) + +#define PT32_INDEX(address, level)\ + (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) + + +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & PAGE_MASK) +#define PT64_DIR_BASE_ADDR_MASK \ + (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) + +#define PT32_BASE_ADDR_MASK PAGE_MASK +#define PT32_DIR_BASE_ADDR_MASK \ + (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) + + +#define PFERR_PRESENT_MASK (1U << 0) +#define PFERR_WRITE_MASK (1U << 1) +#define PFERR_USER_MASK (1U << 2) + +#define PT64_ROOT_LEVEL 4 +#define PT32_ROOT_LEVEL 2 +#define PT32E_ROOT_LEVEL 3 + +#define PT_DIRECTORY_LEVEL 2 +#define PT_PAGE_TABLE_LEVEL 1 + +static int is_write_protection(struct kvm_vcpu *vcpu) +{ + return vcpu->cr0 & CR0_WP_MASK; +} + +static int is_cpuid_PSE36(void) +{ + return 1; +} + +static int is_present_pte(unsigned long pte) +{ + return pte & PT_PRESENT_MASK; +} + +static int is_writeble_pte(unsigned long pte) +{ + return pte & PT_WRITABLE_MASK; +} + +static int is_io_pte(unsigned long pte) +{ + return pte & PT_SHADOW_IO_MARK; +} + +static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa) +{ + struct kvm_mmu_page *page_head = page_header(page_hpa); + + list_del(&page_head->link); + page_head->page_hpa = page_hpa; + list_add(&page_head->link, &vcpu->free_pages); +} + +static int is_empty_shadow_page(hpa_t page_hpa) +{ + u32 *pos; + u32 *end; + for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u32); + pos != end; pos++) + if (*pos != 0) + return 0; + return 1; +} + +static hpa_t kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte) +{ + struct kvm_mmu_page *page; + + if (list_empty(&vcpu->free_pages)) + return INVALID_PAGE; + + page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); + list_del(&page->link); + list_add(&page->link, &vcpu->kvm->active_mmu_pages); + ASSERT(is_empty_shadow_page(page->page_hpa)); + page->slot_bitmap = 0; + page->global = 1; + page->parent_pte = parent_pte; + return page->page_hpa; +} + +static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa) +{ + int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT)); + struct kvm_mmu_page *page_head = page_header(__pa(pte)); + + __set_bit(slot, &page_head->slot_bitmap); +} + +hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + hpa_t hpa = gpa_to_hpa(vcpu, gpa); + + return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa; +} + +hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + struct kvm_memory_slot *slot; + struct page *page; + + ASSERT((gpa & HPA_ERR_MASK) == 0); + slot = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT); + if (!slot) + return gpa | HPA_ERR_MASK; + page = gfn_to_page(slot, gpa >> PAGE_SHIFT); + return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT) + | (gpa & (PAGE_SIZE-1)); +} + +hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); + + if (gpa == UNMAPPED_GVA) + return UNMAPPED_GVA; + return gpa_to_hpa(vcpu, gpa); +} + + +static void release_pt_page_64(struct kvm_vcpu *vcpu, hpa_t page_hpa, + int level) +{ + ASSERT(vcpu); + ASSERT(VALID_PAGE(page_hpa)); + ASSERT(level <= PT64_ROOT_LEVEL && level > 0); + + if (level == 1) + memset(__va(page_hpa), 0, PAGE_SIZE); + else { + u64 *pos; + u64 *end; + + for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE; + pos != end; pos++) { + u64 current_ent = *pos; + + *pos = 0; + if (is_present_pte(current_ent)) + release_pt_page_64(vcpu, + current_ent & + PT64_BASE_ADDR_MASK, + level - 1); + } + } + kvm_mmu_free_page(vcpu, page_hpa); +} + +static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) +{ +} + +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) +{ + int level = PT32E_ROOT_LEVEL; + hpa_t table_addr = vcpu->mmu.root_hpa; + + for (; ; level--) { + u32 index = PT64_INDEX(v, level); + u64 *table; + + ASSERT(VALID_PAGE(table_addr)); + table = __va(table_addr); + + if (level == 1) { + mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT); + page_header_update_slot(vcpu->kvm, table, v); + table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK | + PT_USER_MASK; + return 0; + } + + if (table[index] == 0) { + hpa_t new_table = kvm_mmu_alloc_page(vcpu, + &table[index]); + + if (!VALID_PAGE(new_table)) { + pgprintk("nonpaging_map: ENOMEM\n"); + return -ENOMEM; + } + + if (level == PT32E_ROOT_LEVEL) + table[index] = new_table | PT_PRESENT_MASK; + else + table[index] = new_table | PT_PRESENT_MASK | + PT_WRITABLE_MASK | PT_USER_MASK; + } + table_addr = table[index] & PT64_BASE_ADDR_MASK; + } +} + +static void nonpaging_flush(struct kvm_vcpu *vcpu) +{ + hpa_t root = vcpu->mmu.root_hpa; + + ++kvm_stat.tlb_flush; + pgprintk("nonpaging_flush\n"); + ASSERT(VALID_PAGE(root)); + release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level); + root = kvm_mmu_alloc_page(vcpu, NULL); + ASSERT(VALID_PAGE(root)); + vcpu->mmu.root_hpa = root; + if (is_paging(vcpu)) + root |= (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)); + kvm_arch_ops->set_cr3(vcpu, root); + kvm_arch_ops->tlb_flush(vcpu); +} + +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) +{ + return vaddr; +} + +static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, + u32 error_code) +{ + int ret; + gpa_t addr = gva; + + ASSERT(vcpu); + ASSERT(VALID_PAGE(vcpu->mmu.root_hpa)); + + for (;;) { + hpa_t paddr; + + paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK); + + if (is_error_hpa(paddr)) + return 1; + + ret = nonpaging_map(vcpu, addr & PAGE_MASK, paddr); + if (ret) { + nonpaging_flush(vcpu); + continue; + } + break; + } + return ret; +} + +static void nonpaging_inval_page(struct kvm_vcpu *vcpu, gva_t addr) +{ +} + +static void nonpaging_free(struct kvm_vcpu *vcpu) +{ + hpa_t root; + + ASSERT(vcpu); + root = vcpu->mmu.root_hpa; + if (VALID_PAGE(root)) + release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level); + vcpu->mmu.root_hpa = INVALID_PAGE; +} + +static int nonpaging_init_context(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->mmu; + + context->new_cr3 = nonpaging_new_cr3; + context->page_fault = nonpaging_page_fault; + context->inval_page = nonpaging_inval_page; + context->gva_to_gpa = nonpaging_gva_to_gpa; + context->free = nonpaging_free; + context->root_level = PT32E_ROOT_LEVEL; + context->shadow_root_level = PT32E_ROOT_LEVEL; + context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL); + ASSERT(VALID_PAGE(context->root_hpa)); + kvm_arch_ops->set_cr3(vcpu, context->root_hpa); + return 0; +} + + +static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *page, *npage; + + list_for_each_entry_safe(page, npage, &vcpu->kvm->active_mmu_pages, + link) { + if (page->global) + continue; + + if (!page->parent_pte) + continue; + + *page->parent_pte = 0; + release_pt_page_64(vcpu, page->page_hpa, 1); + } + ++kvm_stat.tlb_flush; + kvm_arch_ops->tlb_flush(vcpu); +} + +static void paging_new_cr3(struct kvm_vcpu *vcpu) +{ + kvm_mmu_flush_tlb(vcpu); +} + +static void mark_pagetable_nonglobal(void *shadow_pte) +{ + page_header(__pa(shadow_pte))->global = 0; +} + +static inline void set_pte_common(struct kvm_vcpu *vcpu, + u64 *shadow_pte, + gpa_t gaddr, + int dirty, + u64 access_bits) +{ + hpa_t paddr; + + *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET; + if (!dirty) + access_bits &= ~PT_WRITABLE_MASK; + + if (access_bits & PT_WRITABLE_MASK) + mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); + + *shadow_pte |= access_bits; + + paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); + + if (!(*shadow_pte & PT_GLOBAL_MASK)) + mark_pagetable_nonglobal(shadow_pte); + + if (is_error_hpa(paddr)) { + *shadow_pte |= gaddr; + *shadow_pte |= PT_SHADOW_IO_MARK; + *shadow_pte &= ~PT_PRESENT_MASK; + } else { + *shadow_pte |= paddr; + page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); + } +} + +static void inject_page_fault(struct kvm_vcpu *vcpu, + u64 addr, + u32 err_code) +{ + kvm_arch_ops->inject_page_fault(vcpu, addr, err_code); +} + +static inline int fix_read_pf(u64 *shadow_ent) +{ + if ((*shadow_ent & PT_SHADOW_USER_MASK) && + !(*shadow_ent & PT_USER_MASK)) { + /* + * If supervisor write protect is disabled, we shadow kernel + * pages as user pages so we can trap the write access. + */ + *shadow_ent |= PT_USER_MASK; + *shadow_ent &= ~PT_WRITABLE_MASK; + + return 1; + + } + return 0; +} + +static int may_access(u64 pte, int write, int user) +{ + + if (user && !(pte & PT_USER_MASK)) + return 0; + if (write && !(pte & PT_WRITABLE_MASK)) + return 0; + return 1; +} + +/* + * Remove a shadow pte. + */ +static void paging_inval_page(struct kvm_vcpu *vcpu, gva_t addr) +{ + hpa_t page_addr = vcpu->mmu.root_hpa; + int level = vcpu->mmu.shadow_root_level; + + ++kvm_stat.invlpg; + + for (; ; level--) { + u32 index = PT64_INDEX(addr, level); + u64 *table = __va(page_addr); + + if (level == PT_PAGE_TABLE_LEVEL ) { + table[index] = 0; + return; + } + + if (!is_present_pte(table[index])) + return; + + page_addr = table[index] & PT64_BASE_ADDR_MASK; + + if (level == PT_DIRECTORY_LEVEL && + (table[index] & PT_SHADOW_PS_MARK)) { + table[index] = 0; + release_pt_page_64(vcpu, page_addr, PT_PAGE_TABLE_LEVEL); + + kvm_arch_ops->tlb_flush(vcpu); + return; + } + } +} + +static void paging_free(struct kvm_vcpu *vcpu) +{ + nonpaging_free(vcpu); +} + +#define PTTYPE 64 +#include "paging_tmpl.h" +#undef PTTYPE + +#define PTTYPE 32 +#include "paging_tmpl.h" +#undef PTTYPE + +static int paging64_init_context(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->mmu; + + ASSERT(is_pae(vcpu)); + context->new_cr3 = paging_new_cr3; + context->page_fault = paging64_page_fault; + context->inval_page = paging_inval_page; + context->gva_to_gpa = paging64_gva_to_gpa; + context->free = paging_free; + context->root_level = PT64_ROOT_LEVEL; + context->shadow_root_level = PT64_ROOT_LEVEL; + context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL); + ASSERT(VALID_PAGE(context->root_hpa)); + kvm_arch_ops->set_cr3(vcpu, context->root_hpa | + (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); + return 0; +} + +static int paging32_init_context(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->mmu; + + context->new_cr3 = paging_new_cr3; + context->page_fault = paging32_page_fault; + context->inval_page = paging_inval_page; + context->gva_to_gpa = paging32_gva_to_gpa; + context->free = paging_free; + context->root_level = PT32_ROOT_LEVEL; + context->shadow_root_level = PT32E_ROOT_LEVEL; + context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL); + ASSERT(VALID_PAGE(context->root_hpa)); + kvm_arch_ops->set_cr3(vcpu, context->root_hpa | + (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); + return 0; +} + +static int paging32E_init_context(struct kvm_vcpu *vcpu) +{ + int ret; + + if ((ret = paging64_init_context(vcpu))) + return ret; + + vcpu->mmu.root_level = PT32E_ROOT_LEVEL; + vcpu->mmu.shadow_root_level = PT32E_ROOT_LEVEL; + return 0; +} + +static int init_kvm_mmu(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); + + if (!is_paging(vcpu)) + return nonpaging_init_context(vcpu); + else if (kvm_arch_ops->is_long_mode(vcpu)) + return paging64_init_context(vcpu); + else if (is_pae(vcpu)) + return paging32E_init_context(vcpu); + else + return paging32_init_context(vcpu); +} + +static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + if (VALID_PAGE(vcpu->mmu.root_hpa)) { + vcpu->mmu.free(vcpu); + vcpu->mmu.root_hpa = INVALID_PAGE; + } +} + +int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) +{ + destroy_kvm_mmu(vcpu); + return init_kvm_mmu(vcpu); +} + +static void free_mmu_pages(struct kvm_vcpu *vcpu) +{ + while (!list_empty(&vcpu->free_pages)) { + struct kvm_mmu_page *page; + + page = list_entry(vcpu->free_pages.next, + struct kvm_mmu_page, link); + list_del(&page->link); + __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT)); + page->page_hpa = INVALID_PAGE; + } +} + +static int alloc_mmu_pages(struct kvm_vcpu *vcpu) +{ + int i; + + ASSERT(vcpu); + + for (i = 0; i < KVM_NUM_MMU_PAGES; i++) { + struct page *page; + struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i]; + + INIT_LIST_HEAD(&page_header->link); + if ((page = alloc_page(GFP_KVM_MMU)) == NULL) + goto error_1; + page->private = (unsigned long)page_header; + page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT; + memset(__va(page_header->page_hpa), 0, PAGE_SIZE); + list_add(&page_header->link, &vcpu->free_pages); + } + return 0; + +error_1: + free_mmu_pages(vcpu); + return -ENOMEM; +} + +int kvm_mmu_init(struct kvm_vcpu *vcpu) +{ + int r; + + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); + ASSERT(list_empty(&vcpu->free_pages)); + + if ((r = alloc_mmu_pages(vcpu))) + return r; + + if ((r = init_kvm_mmu(vcpu))) { + free_mmu_pages(vcpu); + return r; + } + return 0; +} + +void kvm_mmu_destroy(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + + destroy_kvm_mmu(vcpu); + free_mmu_pages(vcpu); +} + +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +{ + struct kvm_mmu_page *page; + + list_for_each_entry(page, &kvm->active_mmu_pages, link) { + int i; + u64 *pt; + + if (!test_bit(slot, &page->slot_bitmap)) + continue; + + pt = __va(page->page_hpa); + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + /* avoid RMW */ + if (pt[i] & PT_WRITABLE_MASK) + pt[i] &= ~PT_WRITABLE_MASK; + + } +} diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h new file mode 100644 index 000000000000..765c2e1a048e --- /dev/null +++ b/drivers/kvm/paging_tmpl.h @@ -0,0 +1,397 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * MMU support + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Yaniv Kamay + * Avi Kivity + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +/* + * We need the mmu code to access both 32-bit and 64-bit guest ptes, + * so the code in this file is compiled twice, once per pte size. + */ + +#if PTTYPE == 64 + #define pt_element_t u64 + #define guest_walker guest_walker64 + #define FNAME(name) paging##64_##name + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK + #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) + #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK + #define PT_NON_PTE_COPY_MASK PT64_NON_PTE_COPY_MASK +#elif PTTYPE == 32 + #define pt_element_t u32 + #define guest_walker guest_walker32 + #define FNAME(name) paging##32_##name + #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK + #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK + #define PT_INDEX(addr, level) PT32_INDEX(addr, level) + #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) + #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK + #define PT_NON_PTE_COPY_MASK PT32_NON_PTE_COPY_MASK +#else + #error Invalid PTTYPE value +#endif + +/* + * The guest_walker structure emulates the behavior of the hardware page + * table walker. + */ +struct guest_walker { + int level; + pt_element_t *table; + pt_element_t inherited_ar; +}; + +static void FNAME(init_walker)(struct guest_walker *walker, + struct kvm_vcpu *vcpu) +{ + hpa_t hpa; + struct kvm_memory_slot *slot; + + walker->level = vcpu->mmu.root_level; + slot = gfn_to_memslot(vcpu->kvm, + (vcpu->cr3 & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); + hpa = safe_gpa_to_hpa(vcpu, vcpu->cr3 & PT64_BASE_ADDR_MASK); + walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0); + + ASSERT((!kvm_arch_ops->is_long_mode(vcpu) && is_pae(vcpu)) || + (vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0); + + walker->table = (pt_element_t *)( (unsigned long)walker->table | + (unsigned long)(vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) ); + walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK; +} + +static void FNAME(release_walker)(struct guest_walker *walker) +{ + kunmap_atomic(walker->table, KM_USER0); +} + +static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte, + u64 *shadow_pte, u64 access_bits) +{ + ASSERT(*shadow_pte == 0); + access_bits &= guest_pte; + *shadow_pte = (guest_pte & PT_PTE_COPY_MASK); + set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK, + guest_pte & PT_DIRTY_MASK, access_bits); +} + +static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde, + u64 *shadow_pte, u64 access_bits, + int index) +{ + gpa_t gaddr; + + ASSERT(*shadow_pte == 0); + access_bits &= guest_pde; + gaddr = (guest_pde & PT_DIR_BASE_ADDR_MASK) + PAGE_SIZE * index; + if (PTTYPE == 32 && is_cpuid_PSE36()) + gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) << + (32 - PT32_DIR_PSE36_SHIFT); + *shadow_pte = (guest_pde & (PT_NON_PTE_COPY_MASK | PT_GLOBAL_MASK)) | + ((guest_pde & PT_DIR_PAT_MASK) >> + (PT_DIR_PAT_SHIFT - PT_PAT_SHIFT)); + set_pte_common(vcpu, shadow_pte, gaddr, + guest_pde & PT_DIRTY_MASK, access_bits); +} + +/* + * Fetch a guest pte from a specific level in the paging hierarchy. + */ +static pt_element_t *FNAME(fetch_guest)(struct kvm_vcpu *vcpu, + struct guest_walker *walker, + int level, + gva_t addr) +{ + + ASSERT(level > 0 && level <= walker->level); + + for (;;) { + int index = PT_INDEX(addr, walker->level); + hpa_t paddr; + + ASSERT(((unsigned long)walker->table & PAGE_MASK) == + ((unsigned long)&walker->table[index] & PAGE_MASK)); + if (level == walker->level || + !is_present_pte(walker->table[index]) || + (walker->level == PT_DIRECTORY_LEVEL && + (walker->table[index] & PT_PAGE_SIZE_MASK) && + (PTTYPE == 64 || is_pse(vcpu)))) + return &walker->table[index]; + if (walker->level != 3 || kvm_arch_ops->is_long_mode(vcpu)) + walker->inherited_ar &= walker->table[index]; + paddr = safe_gpa_to_hpa(vcpu, walker->table[index] & PT_BASE_ADDR_MASK); + kunmap_atomic(walker->table, KM_USER0); + walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT), + KM_USER0); + --walker->level; + } +} + +/* + * Fetch a shadow pte for a specific level in the paging hierarchy. + */ +static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + struct guest_walker *walker) +{ + hpa_t shadow_addr; + int level; + u64 *prev_shadow_ent = NULL; + + shadow_addr = vcpu->mmu.root_hpa; + level = vcpu->mmu.shadow_root_level; + + for (; ; level--) { + u32 index = SHADOW_PT_INDEX(addr, level); + u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index; + pt_element_t *guest_ent; + + if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { + if (level == PT_PAGE_TABLE_LEVEL) + return shadow_ent; + shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; + prev_shadow_ent = shadow_ent; + continue; + } + + if (PTTYPE == 32 && level > PT32_ROOT_LEVEL) { + ASSERT(level == PT32E_ROOT_LEVEL); + guest_ent = FNAME(fetch_guest)(vcpu, walker, + PT32_ROOT_LEVEL, addr); + } else + guest_ent = FNAME(fetch_guest)(vcpu, walker, + level, addr); + + if (!is_present_pte(*guest_ent)) + return NULL; + + /* Don't set accessed bit on PAE PDPTRs */ + if (vcpu->mmu.root_level != 3 || walker->level != 3) + *guest_ent |= PT_ACCESSED_MASK; + + if (level == PT_PAGE_TABLE_LEVEL) { + + if (walker->level == PT_DIRECTORY_LEVEL) { + if (prev_shadow_ent) + *prev_shadow_ent |= PT_SHADOW_PS_MARK; + FNAME(set_pde)(vcpu, *guest_ent, shadow_ent, + walker->inherited_ar, + PT_INDEX(addr, PT_PAGE_TABLE_LEVEL)); + } else { + ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); + FNAME(set_pte)(vcpu, *guest_ent, shadow_ent, walker->inherited_ar); + } + return shadow_ent; + } + + shadow_addr = kvm_mmu_alloc_page(vcpu, shadow_ent); + if (!VALID_PAGE(shadow_addr)) + return ERR_PTR(-ENOMEM); + if (!kvm_arch_ops->is_long_mode(vcpu) && level == 3) + *shadow_ent = shadow_addr | + (*guest_ent & (PT_PRESENT_MASK | PT_PWT_MASK | PT_PCD_MASK)); + else { + *shadow_ent = shadow_addr | + (*guest_ent & PT_NON_PTE_COPY_MASK); + *shadow_ent |= (PT_WRITABLE_MASK | PT_USER_MASK); + } + prev_shadow_ent = shadow_ent; + } +} + +/* + * The guest faulted for write. We need to + * + * - check write permissions + * - update the guest pte dirty bit + * - update our own dirty page tracking structures + */ +static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, + u64 *shadow_ent, + struct guest_walker *walker, + gva_t addr, + int user) +{ + pt_element_t *guest_ent; + int writable_shadow; + gfn_t gfn; + + if (is_writeble_pte(*shadow_ent)) + return 0; + + writable_shadow = *shadow_ent & PT_SHADOW_WRITABLE_MASK; + if (user) { + /* + * User mode access. Fail if it's a kernel page or a read-only + * page. + */ + if (!(*shadow_ent & PT_SHADOW_USER_MASK) || !writable_shadow) + return 0; + ASSERT(*shadow_ent & PT_USER_MASK); + } else + /* + * Kernel mode access. Fail if it's a read-only page and + * supervisor write protection is enabled. + */ + if (!writable_shadow) { + if (is_write_protection(vcpu)) + return 0; + *shadow_ent &= ~PT_USER_MASK; + } + + guest_ent = FNAME(fetch_guest)(vcpu, walker, PT_PAGE_TABLE_LEVEL, addr); + + if (!is_present_pte(*guest_ent)) { + *shadow_ent = 0; + return 0; + } + + gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; + mark_page_dirty(vcpu->kvm, gfn); + *shadow_ent |= PT_WRITABLE_MASK; + *guest_ent |= PT_DIRTY_MASK; + + return 1; +} + +/* + * Page fault handler. There are several causes for a page fault: + * - there is no shadow pte for the guest pte + * - write access through a shadow pte marked read only so that we can set + * the dirty bit + * - write access to a shadow pte marked read only so we can update the page + * dirty bitmap, when userspace requests it + * - mmio access; in this case we will never install a present shadow pte + * - normal guest page fault due to the guest pte marked not present, not + * writable, or not executable + * + * Returns: 1 if we need to emulate the instruction, 0 otherwise + */ +static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, + u32 error_code) +{ + int write_fault = error_code & PFERR_WRITE_MASK; + int pte_present = error_code & PFERR_PRESENT_MASK; + int user_fault = error_code & PFERR_USER_MASK; + struct guest_walker walker; + u64 *shadow_pte; + int fixed; + + /* + * Look up the shadow pte for the faulting address. + */ + for (;;) { + FNAME(init_walker)(&walker, vcpu); + shadow_pte = FNAME(fetch)(vcpu, addr, &walker); + if (IS_ERR(shadow_pte)) { /* must be -ENOMEM */ + nonpaging_flush(vcpu); + FNAME(release_walker)(&walker); + continue; + } + break; + } + + /* + * The page is not mapped by the guest. Let the guest handle it. + */ + if (!shadow_pte) { + inject_page_fault(vcpu, addr, error_code); + FNAME(release_walker)(&walker); + return 0; + } + + /* + * Update the shadow pte. + */ + if (write_fault) + fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr, + user_fault); + else + fixed = fix_read_pf(shadow_pte); + + FNAME(release_walker)(&walker); + + /* + * mmio: emulate if accessible, otherwise its a guest fault. + */ + if (is_io_pte(*shadow_pte)) { + if (may_access(*shadow_pte, write_fault, user_fault)) + return 1; + pgprintk("%s: io work, no access\n", __FUNCTION__); + inject_page_fault(vcpu, addr, + error_code | PFERR_PRESENT_MASK); + return 0; + } + + /* + * pte not present, guest page fault. + */ + if (pte_present && !fixed) { + inject_page_fault(vcpu, addr, error_code); + return 0; + } + + ++kvm_stat.pf_fixed; + + return 0; +} + +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) +{ + struct guest_walker walker; + pt_element_t guest_pte; + gpa_t gpa; + + FNAME(init_walker)(&walker, vcpu); + guest_pte = *FNAME(fetch_guest)(vcpu, &walker, PT_PAGE_TABLE_LEVEL, + vaddr); + FNAME(release_walker)(&walker); + + if (!is_present_pte(guest_pte)) + return UNMAPPED_GVA; + + if (walker.level == PT_DIRECTORY_LEVEL) { + ASSERT((guest_pte & PT_PAGE_SIZE_MASK)); + ASSERT(PTTYPE == 64 || is_pse(vcpu)); + + gpa = (guest_pte & PT_DIR_BASE_ADDR_MASK) | (vaddr & + (PT_LEVEL_MASK(PT_PAGE_TABLE_LEVEL) | ~PAGE_MASK)); + + if (PTTYPE == 32 && is_cpuid_PSE36()) + gpa |= (guest_pte & PT32_DIR_PSE36_MASK) << + (32 - PT32_DIR_PSE36_SHIFT); + } else { + gpa = (guest_pte & PT_BASE_ADDR_MASK); + gpa |= (vaddr & ~PAGE_MASK); + } + + return gpa; +} + +#undef pt_element_t +#undef guest_walker +#undef FNAME +#undef PT_BASE_ADDR_MASK +#undef PT_INDEX +#undef SHADOW_PT_INDEX +#undef PT_LEVEL_MASK +#undef PT_PTE_COPY_MASK +#undef PT_NON_PTE_COPY_MASK +#undef PT_DIR_BASE_ADDR_MASK diff --git a/drivers/kvm/segment_descriptor.h b/drivers/kvm/segment_descriptor.h new file mode 100644 index 000000000000..71fdf458619a --- /dev/null +++ b/drivers/kvm/segment_descriptor.h @@ -0,0 +1,17 @@ +struct segment_descriptor { + u16 limit_low; + u16 base_low; + u8 base_mid; + u8 type : 4; + u8 system : 1; + u8 dpl : 2; + u8 present : 1; + u8 limit_high : 4; + u8 avl : 1; + u8 long_mode : 1; + u8 default_op : 1; + u8 granularity : 1; + u8 base_high; +} __attribute__((packed)); + + diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c new file mode 100644 index 000000000000..a33a89c68138 --- /dev/null +++ b/drivers/kvm/svm.c @@ -0,0 +1,1677 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * AMD SVM support + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Yaniv Kamay + * Avi Kivity + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include +#include +#include +#include + +#include "kvm_svm.h" +#include "x86_emulate.h" + +MODULE_AUTHOR("Qumranet"); +MODULE_LICENSE("GPL"); + +#define IOPM_ALLOC_ORDER 2 +#define MSRPM_ALLOC_ORDER 1 + +#define DB_VECTOR 1 +#define UD_VECTOR 6 +#define GP_VECTOR 13 + +#define DR7_GD_MASK (1 << 13) +#define DR6_BD_MASK (1 << 13) +#define CR4_DE_MASK (1UL << 3) + +#define SEG_TYPE_LDT 2 +#define SEG_TYPE_BUSY_TSS16 3 + +#define KVM_EFER_LMA (1 << 10) +#define KVM_EFER_LME (1 << 8) + +unsigned long iopm_base; +unsigned long msrpm_base; + +struct kvm_ldttss_desc { + u16 limit0; + u16 base0; + unsigned base1 : 8, type : 5, dpl : 2, p : 1; + unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; + u32 base3; + u32 zero1; +} __attribute__((packed)); + +struct svm_cpu_data { + int cpu; + + uint64_t asid_generation; + uint32_t max_asid; + uint32_t next_asid; + struct kvm_ldttss_desc *tss_desc; + + struct page *save_area; +}; + +static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); + +struct svm_init_data { + int cpu; + int r; +}; + +static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; + +#define NUM_MSR_MAPS (sizeof(msrpm_ranges) / sizeof(*msrpm_ranges)) +#define MSRS_RANGE_SIZE 2048 +#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) + +#define MAX_INST_SIZE 15 + +static unsigned get_addr_size(struct kvm_vcpu *vcpu) +{ + struct vmcb_save_area *sa = &vcpu->svm->vmcb->save; + u16 cs_attrib; + + if (!(sa->cr0 & CR0_PE_MASK) || (sa->rflags & X86_EFLAGS_VM)) + return 2; + + cs_attrib = sa->cs.attrib; + + return (cs_attrib & SVM_SELECTOR_L_MASK) ? 8 : + (cs_attrib & SVM_SELECTOR_DB_MASK) ? 4 : 2; +} + +static inline u8 pop_irq(struct kvm_vcpu *vcpu) +{ + int word_index = __ffs(vcpu->irq_summary); + int bit_index = __ffs(vcpu->irq_pending[word_index]); + int irq = word_index * BITS_PER_LONG + bit_index; + + clear_bit(bit_index, &vcpu->irq_pending[word_index]); + if (!vcpu->irq_pending[word_index]) + clear_bit(word_index, &vcpu->irq_summary); + return irq; +} + +static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) +{ + set_bit(irq, vcpu->irq_pending); + set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); +} + +static inline void clgi(void) +{ + asm volatile (SVM_CLGI); +} + +static inline void stgi(void) +{ + asm volatile (SVM_STGI); +} + +static inline void invlpga(unsigned long addr, u32 asid) +{ + asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid)); +} + +static inline unsigned long kvm_read_cr2(void) +{ + unsigned long cr2; + + asm volatile ("mov %%cr2, %0" : "=r" (cr2)); + return cr2; +} + +static inline void kvm_write_cr2(unsigned long val) +{ + asm volatile ("mov %0, %%cr2" :: "r" (val)); +} + +static inline unsigned long read_dr6(void) +{ + unsigned long dr6; + + asm volatile ("mov %%dr6, %0" : "=r" (dr6)); + return dr6; +} + +static inline void write_dr6(unsigned long val) +{ + asm volatile ("mov %0, %%dr6" :: "r" (val)); +} + +static inline unsigned long read_dr7(void) +{ + unsigned long dr7; + + asm volatile ("mov %%dr7, %0" : "=r" (dr7)); + return dr7; +} + +static inline void write_dr7(unsigned long val) +{ + asm volatile ("mov %0, %%dr7" :: "r" (val)); +} + +static inline int svm_is_long_mode(struct kvm_vcpu *vcpu) +{ + return vcpu->svm->vmcb->save.efer & KVM_EFER_LMA; +} + +static inline void force_new_asid(struct kvm_vcpu *vcpu) +{ + vcpu->svm->asid_generation--; +} + +static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) +{ + force_new_asid(vcpu); +} + +static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + if (!(efer & KVM_EFER_LMA)) + efer &= ~KVM_EFER_LME; + + vcpu->svm->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; + vcpu->shadow_efer = efer; +} + +static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) +{ + vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_VALID_ERR | + SVM_EVTINJ_TYPE_EXEPT | + GP_VECTOR; + vcpu->svm->vmcb->control.event_inj_err = error_code; +} + +static void inject_ud(struct kvm_vcpu *vcpu) +{ + vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_TYPE_EXEPT | + UD_VECTOR; +} + +static void inject_db(struct kvm_vcpu *vcpu) +{ + vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_TYPE_EXEPT | + DB_VECTOR; +} + +static int is_page_fault(uint32_t info) +{ + info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; + return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT); +} + +static int is_external_interrupt(u32 info) +{ + info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; + return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); +} + +static void skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + if (!vcpu->svm->next_rip) { + printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); + return; + } + if (vcpu->svm->next_rip - vcpu->svm->vmcb->save.rip > 15) { + printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", + __FUNCTION__, + vcpu->svm->vmcb->save.rip, + vcpu->svm->next_rip); + } + + vcpu->rip = vcpu->svm->vmcb->save.rip = vcpu->svm->next_rip; + vcpu->svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; +} + +static int has_svm(void) +{ + uint32_t eax, ebx, ecx, edx; + + if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) { + printk(KERN_INFO "has_svm: not amd\n"); + return 0; + } + + cpuid(0x80000000, &eax, &ebx, &ecx, &edx); + if (eax < SVM_CPUID_FUNC) { + printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n"); + return 0; + } + + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) { + printk(KERN_DEBUG "has_svm: svm not available\n"); + return 0; + } + return 1; +} + +static void svm_hardware_disable(void *garbage) +{ + struct svm_cpu_data *svm_data + = per_cpu(svm_data, raw_smp_processor_id()); + + if (svm_data) { + uint64_t efer; + + wrmsrl(MSR_VM_HSAVE_PA, 0); + rdmsrl(MSR_EFER, efer); + wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); + per_cpu(svm_data, raw_smp_processor_id()) = 0; + __free_page(svm_data->save_area); + kfree(svm_data); + } +} + +static void svm_hardware_enable(void *garbage) +{ + + struct svm_cpu_data *svm_data; + uint64_t efer; +#ifdef __x86_64__ + struct desc_ptr gdt_descr; +#else + struct Xgt_desc_struct gdt_descr; +#endif + struct desc_struct *gdt; + int me = raw_smp_processor_id(); + + if (!has_svm()) { + printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); + return; + } + svm_data = per_cpu(svm_data, me); + + if (!svm_data) { + printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", + me); + return; + } + + svm_data->asid_generation = 1; + svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; + svm_data->next_asid = svm_data->max_asid + 1; + + asm volatile ( "sgdt %0" : "=m"(gdt_descr) ); + gdt = (struct desc_struct *)gdt_descr.address; + svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); + + rdmsrl(MSR_EFER, efer); + wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK); + + wrmsrl(MSR_VM_HSAVE_PA, + page_to_pfn(svm_data->save_area) << PAGE_SHIFT); +} + +static int svm_cpu_init(int cpu) +{ + struct svm_cpu_data *svm_data; + int r; + + svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); + if (!svm_data) + return -ENOMEM; + svm_data->cpu = cpu; + svm_data->save_area = alloc_page(GFP_KERNEL); + r = -ENOMEM; + if (!svm_data->save_area) + goto err_1; + + per_cpu(svm_data, cpu) = svm_data; + + return 0; + +err_1: + kfree(svm_data); + return r; + +} + +static int set_msr_interception(u32 *msrpm, unsigned msr, + int read, int write) +{ + int i; + + for (i = 0; i < NUM_MSR_MAPS; i++) { + if (msr >= msrpm_ranges[i] && + msr < msrpm_ranges[i] + MSRS_IN_RANGE) { + u32 msr_offset = (i * MSRS_IN_RANGE + msr - + msrpm_ranges[i]) * 2; + + u32 *base = msrpm + (msr_offset / 32); + u32 msr_shift = msr_offset % 32; + u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1); + *base = (*base & ~(0x3 << msr_shift)) | + (mask << msr_shift); + return 1; + } + } + printk(KERN_DEBUG "%s: not found 0x%x\n", __FUNCTION__, msr); + return 0; +} + +static __init int svm_hardware_setup(void) +{ + int cpu; + struct page *iopm_pages; + struct page *msrpm_pages; + void *msrpm_va; + int r; + + + iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); + + if (!iopm_pages) + return -ENOMEM; + memset(page_address(iopm_pages), 0xff, + PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); + iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; + + + msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); + + r = -ENOMEM; + if (!msrpm_pages) + goto err_1; + + msrpm_va = page_address(msrpm_pages); + memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); + msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT; + +#ifdef __x86_64__ + set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1); + set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1); + set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1); + set_msr_interception(msrpm_va, MSR_STAR, 1, 1); + set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1); + set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1); + set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1); +#endif + set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1); + set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1); + set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1); + + for_each_online_cpu(cpu) { + r = svm_cpu_init(cpu); + if (r) + goto err_2; + } + return 0; + +err_2: + __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); + msrpm_base = 0; +err_1: + __free_pages(iopm_pages, IOPM_ALLOC_ORDER); + iopm_base = 0; + return r; +} + +static __exit void svm_hardware_unsetup(void) +{ + __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER); + __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); + iopm_base = msrpm_base = 0; +} + +static void init_seg(struct vmcb_seg *seg) +{ + seg->selector = 0; + seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | + SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ + seg->limit = 0xffff; + seg->base = 0; +} + +static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) +{ + seg->selector = 0; + seg->attrib = SVM_SELECTOR_P_MASK | type; + seg->limit = 0xffff; + seg->base = 0; +} + +static int svm_vcpu_setup(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static void init_vmcb(struct vmcb *vmcb) +{ + struct vmcb_control_area *control = &vmcb->control; + struct vmcb_save_area *save = &vmcb->save; + u64 tsc; + + control->intercept_cr_read = INTERCEPT_CR0_MASK | + INTERCEPT_CR3_MASK | + INTERCEPT_CR4_MASK; + + control->intercept_cr_write = INTERCEPT_CR0_MASK | + INTERCEPT_CR3_MASK | + INTERCEPT_CR4_MASK; + + control->intercept_dr_read = INTERCEPT_DR0_MASK | + INTERCEPT_DR1_MASK | + INTERCEPT_DR2_MASK | + INTERCEPT_DR3_MASK; + + control->intercept_dr_write = INTERCEPT_DR0_MASK | + INTERCEPT_DR1_MASK | + INTERCEPT_DR2_MASK | + INTERCEPT_DR3_MASK | + INTERCEPT_DR5_MASK | + INTERCEPT_DR7_MASK; + + control->intercept_exceptions = 1 << PF_VECTOR; + + + control->intercept = (1ULL << INTERCEPT_INTR) | + (1ULL << INTERCEPT_NMI) | + /* + * selective cr0 intercept bug? + * 0: 0f 22 d8 mov %eax,%cr3 + * 3: 0f 20 c0 mov %cr0,%eax + * 6: 0d 00 00 00 80 or $0x80000000,%eax + * b: 0f 22 c0 mov %eax,%cr0 + * set cr3 ->interception + * get cr0 ->interception + * set cr0 -> no interception + */ + /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */ + (1ULL << INTERCEPT_CPUID) | + (1ULL << INTERCEPT_HLT) | + (1ULL << INTERCEPT_INVLPG) | + (1ULL << INTERCEPT_INVLPGA) | + (1ULL << INTERCEPT_IOIO_PROT) | + (1ULL << INTERCEPT_MSR_PROT) | + (1ULL << INTERCEPT_TASK_SWITCH) | + (1ULL << INTERCEPT_VMRUN) | + (1ULL << INTERCEPT_VMMCALL) | + (1ULL << INTERCEPT_VMLOAD) | + (1ULL << INTERCEPT_VMSAVE) | + (1ULL << INTERCEPT_STGI) | + (1ULL << INTERCEPT_CLGI) | + (1ULL << INTERCEPT_SKINIT); + + control->iopm_base_pa = iopm_base; + control->msrpm_base_pa = msrpm_base; + rdtscll(tsc); + control->tsc_offset = -tsc; + control->int_ctl = V_INTR_MASKING_MASK; + + init_seg(&save->es); + init_seg(&save->ss); + init_seg(&save->ds); + init_seg(&save->fs); + init_seg(&save->gs); + + save->cs.selector = 0xf000; + /* Executable/Readable Code Segment */ + save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK | + SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; + save->cs.limit = 0xffff; + save->cs.base = 0xffff0000; + + save->gdtr.limit = 0xffff; + save->idtr.limit = 0xffff; + + init_sys_seg(&save->ldtr, SEG_TYPE_LDT); + init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); + + save->efer = MSR_EFER_SVME_MASK; + + save->dr6 = 0xffff0ff0; + save->dr7 = 0x400; + save->rflags = 2; + save->rip = 0x0000fff0; + + /* + * cr0 val on cpu init should be 0x60000010, we enable cpu + * cache by default. the orderly way is to enable cache in bios. + */ + save->cr0 = 0x00000010 | CR0_PG_MASK; + save->cr4 = CR4_PAE_MASK; + /* rdx = ?? */ +} + +static int svm_create_vcpu(struct kvm_vcpu *vcpu) +{ + struct page *page; + int r; + + r = -ENOMEM; + vcpu->svm = kzalloc(sizeof *vcpu->svm, GFP_KERNEL); + if (!vcpu->svm) + goto out1; + page = alloc_page(GFP_KERNEL); + if (!page) + goto out2; + + vcpu->svm->vmcb = page_address(page); + memset(vcpu->svm->vmcb, 0, PAGE_SIZE); + vcpu->svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; + vcpu->svm->cr0 = 0x00000010; + vcpu->svm->asid_generation = 0; + memset(vcpu->svm->db_regs, 0, sizeof(vcpu->svm->db_regs)); + init_vmcb(vcpu->svm->vmcb); + + return 0; + +out2: + kfree(vcpu->svm); +out1: + return r; +} + +static void svm_free_vcpu(struct kvm_vcpu *vcpu) +{ + if (!vcpu->svm) + return; + if (vcpu->svm->vmcb) + __free_page(pfn_to_page(vcpu->svm->vmcb_pa >> PAGE_SHIFT)); + kfree(vcpu->svm); +} + +static struct kvm_vcpu *svm_vcpu_load(struct kvm_vcpu *vcpu) +{ + get_cpu(); + return vcpu; +} + +static void svm_vcpu_put(struct kvm_vcpu *vcpu) +{ + put_cpu(); +} + +static void svm_cache_regs(struct kvm_vcpu *vcpu) +{ + vcpu->regs[VCPU_REGS_RAX] = vcpu->svm->vmcb->save.rax; + vcpu->regs[VCPU_REGS_RSP] = vcpu->svm->vmcb->save.rsp; + vcpu->rip = vcpu->svm->vmcb->save.rip; +} + +static void svm_decache_regs(struct kvm_vcpu *vcpu) +{ + vcpu->svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX]; + vcpu->svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP]; + vcpu->svm->vmcb->save.rip = vcpu->rip; +} + +static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) +{ + return vcpu->svm->vmcb->save.rflags; +} + +static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + vcpu->svm->vmcb->save.rflags = rflags; +} + +static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) +{ + struct vmcb_save_area *save = &vcpu->svm->vmcb->save; + + switch (seg) { + case VCPU_SREG_CS: return &save->cs; + case VCPU_SREG_DS: return &save->ds; + case VCPU_SREG_ES: return &save->es; + case VCPU_SREG_FS: return &save->fs; + case VCPU_SREG_GS: return &save->gs; + case VCPU_SREG_SS: return &save->ss; + case VCPU_SREG_TR: return &save->tr; + case VCPU_SREG_LDTR: return &save->ldtr; + } + BUG(); + return 0; +} + +static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + struct vmcb_seg *s = svm_seg(vcpu, seg); + + return s->base; +} + +static void svm_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct vmcb_seg *s = svm_seg(vcpu, seg); + + var->base = s->base; + var->limit = s->limit; + var->selector = s->selector; + var->type = s->attrib & SVM_SELECTOR_TYPE_MASK; + var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1; + var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; + var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1; + var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1; + var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; + var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; + var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; + var->unusable = !var->present; +} + +static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + struct vmcb_seg *s = svm_seg(vcpu, VCPU_SREG_CS); + + *db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; + *l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; +} + +static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + dt->limit = vcpu->svm->vmcb->save.ldtr.limit; + dt->base = vcpu->svm->vmcb->save.ldtr.base; +} + +static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + vcpu->svm->vmcb->save.ldtr.limit = dt->limit; + vcpu->svm->vmcb->save.ldtr.base = dt->base ; +} + +static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + dt->limit = vcpu->svm->vmcb->save.gdtr.limit; + dt->base = vcpu->svm->vmcb->save.gdtr.base; +} + +static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + vcpu->svm->vmcb->save.gdtr.limit = dt->limit; + vcpu->svm->vmcb->save.gdtr.base = dt->base ; +} + +static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ +#ifdef __x86_64__ + if (vcpu->shadow_efer & KVM_EFER_LME) { + if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) { + vcpu->shadow_efer |= KVM_EFER_LMA; + vcpu->svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME; + } + + if (is_paging(vcpu) && !(cr0 & CR0_PG_MASK) ) { + vcpu->shadow_efer &= ~KVM_EFER_LMA; + vcpu->svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME); + } + } +#endif + vcpu->svm->cr0 = cr0; + vcpu->svm->vmcb->save.cr0 = cr0 | CR0_PG_MASK; + vcpu->cr0 = cr0; +} + +static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + vcpu->cr4 = cr4; + vcpu->svm->vmcb->save.cr4 = cr4 | CR4_PAE_MASK; +} + +static void svm_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct vmcb_seg *s = svm_seg(vcpu, seg); + + s->base = var->base; + s->limit = var->limit; + s->selector = var->selector; + if (var->unusable) + s->attrib = 0; + else { + s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); + s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT; + s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; + s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT; + s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; + s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; + s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; + s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; + } + if (seg == VCPU_SREG_CS) + vcpu->svm->vmcb->save.cpl + = (vcpu->svm->vmcb->save.cs.attrib + >> SVM_SELECTOR_DPL_SHIFT) & 3; + +} + +/* FIXME: + + vcpu->svm->vmcb->control.int_ctl &= ~V_TPR_MASK; + vcpu->svm->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK); + +*/ + +static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) +{ + return -EOPNOTSUPP; +} + +static void load_host_msrs(struct kvm_vcpu *vcpu) +{ + int i; + + for ( i = 0; i < NR_HOST_SAVE_MSRS; i++) + wrmsrl(host_save_msrs[i], vcpu->svm->host_msrs[i]); +} + +static void save_host_msrs(struct kvm_vcpu *vcpu) +{ + int i; + + for ( i = 0; i < NR_HOST_SAVE_MSRS; i++) + rdmsrl(host_save_msrs[i], vcpu->svm->host_msrs[i]); +} + +static void new_asid(struct kvm_vcpu *vcpu, struct svm_cpu_data *svm_data) +{ + if (svm_data->next_asid > svm_data->max_asid) { + ++svm_data->asid_generation; + svm_data->next_asid = 1; + vcpu->svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; + } + + vcpu->cpu = svm_data->cpu; + vcpu->svm->asid_generation = svm_data->asid_generation; + vcpu->svm->vmcb->control.asid = svm_data->next_asid++; +} + +static void svm_invlpg(struct kvm_vcpu *vcpu, gva_t address) +{ + invlpga(address, vcpu->svm->vmcb->control.asid); // is needed? +} + +static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) +{ + return vcpu->svm->db_regs[dr]; +} + +static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, + int *exception) +{ + *exception = 0; + + if (vcpu->svm->vmcb->save.dr7 & DR7_GD_MASK) { + vcpu->svm->vmcb->save.dr7 &= ~DR7_GD_MASK; + vcpu->svm->vmcb->save.dr6 |= DR6_BD_MASK; + *exception = DB_VECTOR; + return; + } + + switch (dr) { + case 0 ... 3: + vcpu->svm->db_regs[dr] = value; + return; + case 4 ... 5: + if (vcpu->cr4 & CR4_DE_MASK) { + *exception = UD_VECTOR; + return; + } + case 7: { + if (value & ~((1ULL << 32) - 1)) { + *exception = GP_VECTOR; + return; + } + vcpu->svm->vmcb->save.dr7 = value; + return; + } + default: + printk(KERN_DEBUG "%s: unexpected dr %u\n", + __FUNCTION__, dr); + *exception = UD_VECTOR; + return; + } +} + +static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 exit_int_info = vcpu->svm->vmcb->control.exit_int_info; + u64 fault_address; + u32 error_code; + enum emulation_result er; + + if (is_external_interrupt(exit_int_info)) + push_irq(vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); + + spin_lock(&vcpu->kvm->lock); + + fault_address = vcpu->svm->vmcb->control.exit_info_2; + error_code = vcpu->svm->vmcb->control.exit_info_1; + if (!vcpu->mmu.page_fault(vcpu, fault_address, error_code)) { + spin_unlock(&vcpu->kvm->lock); + return 1; + } + er = emulate_instruction(vcpu, kvm_run, fault_address, error_code); + spin_unlock(&vcpu->kvm->lock); + + switch (er) { + case EMULATE_DONE: + return 1; + case EMULATE_DO_MMIO: + ++kvm_stat.mmio_exits; + kvm_run->exit_reason = KVM_EXIT_MMIO; + return 0; + case EMULATE_FAIL: + vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__); + break; + default: + BUG(); + } + + kvm_run->exit_reason = KVM_EXIT_UNKNOWN; + return 0; +} + +static int io_get_override(struct kvm_vcpu *vcpu, + struct vmcb_seg **seg, + int *addr_override) +{ + u8 inst[MAX_INST_SIZE]; + unsigned ins_length; + gva_t rip; + int i; + + rip = vcpu->svm->vmcb->save.rip; + ins_length = vcpu->svm->next_rip - rip; + rip += vcpu->svm->vmcb->save.cs.base; + + if (ins_length > MAX_INST_SIZE) + printk(KERN_DEBUG + "%s: inst length err, cs base 0x%llx rip 0x%llx " + "next rip 0x%llx ins_length %u\n", + __FUNCTION__, + vcpu->svm->vmcb->save.cs.base, + vcpu->svm->vmcb->save.rip, + vcpu->svm->vmcb->control.exit_info_2, + ins_length); + + if (kvm_read_guest(vcpu, rip, ins_length, inst) != ins_length) + /* #PF */ + return 0; + + *addr_override = 0; + *seg = 0; + for (i = 0; i < ins_length; i++) + switch (inst[i]) { + case 0xf0: + case 0xf2: + case 0xf3: + case 0x66: + continue; + case 0x67: + *addr_override = 1; + continue; + case 0x2e: + *seg = &vcpu->svm->vmcb->save.cs; + continue; + case 0x36: + *seg = &vcpu->svm->vmcb->save.ss; + continue; + case 0x3e: + *seg = &vcpu->svm->vmcb->save.ds; + continue; + case 0x26: + *seg = &vcpu->svm->vmcb->save.es; + continue; + case 0x64: + *seg = &vcpu->svm->vmcb->save.fs; + continue; + case 0x65: + *seg = &vcpu->svm->vmcb->save.gs; + continue; + default: + return 1; + } + printk(KERN_DEBUG "%s: unexpected\n", __FUNCTION__); + return 0; +} + +static unsigned long io_adress(struct kvm_vcpu *vcpu, int ins, u64 *address) +{ + unsigned long addr_mask; + unsigned long *reg; + struct vmcb_seg *seg; + int addr_override; + struct vmcb_save_area *save_area = &vcpu->svm->vmcb->save; + u16 cs_attrib = save_area->cs.attrib; + unsigned addr_size = get_addr_size(vcpu); + + if (!io_get_override(vcpu, &seg, &addr_override)) + return 0; + + if (addr_override) + addr_size = (addr_size == 2) ? 4: (addr_size >> 1); + + if (ins) { + reg = &vcpu->regs[VCPU_REGS_RDI]; + seg = &vcpu->svm->vmcb->save.es; + } else { + reg = &vcpu->regs[VCPU_REGS_RSI]; + seg = (seg) ? seg : &vcpu->svm->vmcb->save.ds; + } + + addr_mask = ~0ULL >> (64 - (addr_size * 8)); + + if ((cs_attrib & SVM_SELECTOR_L_MASK) && + !(vcpu->svm->vmcb->save.rflags & X86_EFLAGS_VM)) { + *address = (*reg & addr_mask); + return addr_mask; + } + + if (!(seg->attrib & SVM_SELECTOR_P_SHIFT)) { + svm_inject_gp(vcpu, 0); + return 0; + } + + *address = (*reg & addr_mask) + seg->base; + return addr_mask; +} + +static int io_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 io_info = vcpu->svm->vmcb->control.exit_info_1; //address size bug? + int _in = io_info & SVM_IOIO_TYPE_MASK; + + ++kvm_stat.io_exits; + + vcpu->svm->next_rip = vcpu->svm->vmcb->control.exit_info_2; + + kvm_run->exit_reason = KVM_EXIT_IO; + kvm_run->io.port = io_info >> 16; + kvm_run->io.direction = (_in) ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; + kvm_run->io.size = ((io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT); + kvm_run->io.string = (io_info & SVM_IOIO_STR_MASK) != 0; + kvm_run->io.rep = (io_info & SVM_IOIO_REP_MASK) != 0; + + if (kvm_run->io.string) { + unsigned addr_mask; + + addr_mask = io_adress(vcpu, _in, &kvm_run->io.address); + if (!addr_mask) { + printk(KERN_DEBUG "%s: get io address failed\n", __FUNCTION__); + return 1; + } + + if (kvm_run->io.rep) { + kvm_run->io.count = vcpu->regs[VCPU_REGS_RCX] & addr_mask; + kvm_run->io.string_down = (vcpu->svm->vmcb->save.rflags + & X86_EFLAGS_DF) != 0; + } + } else { + kvm_run->io.value = vcpu->svm->vmcb->save.rax; + } + return 0; +} + + +static int nop_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + return 1; +} + +static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1; + skip_emulated_instruction(vcpu); + if (vcpu->irq_summary && (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF)) + return 1; + + kvm_run->exit_reason = KVM_EXIT_HLT; + return 0; +} + +static int invalid_op_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + inject_ud(vcpu); + return 1; +} + +static int task_switch_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + printk(KERN_DEBUG "%s: task swiche is unsupported\n", __FUNCTION__); + kvm_run->exit_reason = KVM_EXIT_UNKNOWN; + return 0; +} + +static int cpuid_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; + kvm_run->exit_reason = KVM_EXIT_CPUID; + return 0; +} + +static int emulate_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + if (emulate_instruction(vcpu, 0, 0, 0) != EMULATE_DONE) + printk(KERN_ERR "%s: failed\n", __FUNCTION__); + return 1; +} + +static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) +{ + switch (ecx) { + case MSR_IA32_MC0_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MC0_MISC: + case MSR_IA32_MC0_MISC+4: + case MSR_IA32_MC0_MISC+8: + case MSR_IA32_MC0_MISC+12: + case MSR_IA32_MC0_MISC+16: + case MSR_IA32_UCODE_REV: + /* MTRR registers */ + case 0xfe: + case 0x200 ... 0x2ff: + *data = 0; + break; + case MSR_IA32_TIME_STAMP_COUNTER: { + u64 tsc; + + rdtscll(tsc); + *data = vcpu->svm->vmcb->control.tsc_offset + tsc; + break; + } + case MSR_EFER: + *data = vcpu->shadow_efer; + break; + case MSR_IA32_APICBASE: + *data = vcpu->apic_base; + break; +#ifdef __x86_64__ + case MSR_STAR: + *data = vcpu->svm->vmcb->save.star; + break; + case MSR_LSTAR: + *data = vcpu->svm->vmcb->save.lstar; + break; + case MSR_CSTAR: + *data = vcpu->svm->vmcb->save.cstar; + break; + case MSR_KERNEL_GS_BASE: + *data = vcpu->svm->vmcb->save.kernel_gs_base; + break; + case MSR_SYSCALL_MASK: + *data = vcpu->svm->vmcb->save.sfmask; + break; +#endif + case MSR_IA32_SYSENTER_CS: + *data = vcpu->svm->vmcb->save.sysenter_cs; + break; + case MSR_IA32_SYSENTER_EIP: + *data = vcpu->svm->vmcb->save.sysenter_eip; + break; + case MSR_IA32_SYSENTER_ESP: + *data = vcpu->svm->vmcb->save.sysenter_esp; + break; + default: + printk(KERN_ERR "kvm: unhandled rdmsr: 0x%x\n", ecx); + return 1; + } + return 0; +} + +static int rdmsr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 ecx = vcpu->regs[VCPU_REGS_RCX]; + u64 data; + + if (svm_get_msr(vcpu, ecx, &data)) + svm_inject_gp(vcpu, 0); + else { + vcpu->svm->vmcb->save.rax = data & 0xffffffff; + vcpu->regs[VCPU_REGS_RDX] = data >> 32; + vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; + skip_emulated_instruction(vcpu); + } + return 1; +} + +static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) +{ + switch (ecx) { +#ifdef __x86_64__ + case MSR_EFER: + set_efer(vcpu, data); + break; +#endif + case MSR_IA32_MC0_STATUS: + printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n" + , __FUNCTION__, data); + break; + case MSR_IA32_TIME_STAMP_COUNTER: { + u64 tsc; + + rdtscll(tsc); + vcpu->svm->vmcb->control.tsc_offset = data - tsc; + break; + } + case MSR_IA32_UCODE_REV: + case MSR_IA32_UCODE_WRITE: + case 0x200 ... 0x2ff: /* MTRRs */ + break; + case MSR_IA32_APICBASE: + vcpu->apic_base = data; + break; +#ifdef __x86_64___ + case MSR_STAR: + vcpu->svm->vmcb->save.star = data; + break; + case MSR_LSTAR: + vcpu->svm->vmcb->save.lstar = data; + break; + case MSR_CSTAR: + vcpu->svm->vmcb->save.cstar = data; + break; + case MSR_KERNEL_GS_BASE: + vcpu->svm->vmcb->save.kernel_gs_base = data; + break; + case MSR_SYSCALL_MASK: + vcpu->svm->vmcb->save.sfmask = data; + break; +#endif + case MSR_IA32_SYSENTER_CS: + vcpu->svm->vmcb->save.sysenter_cs = data; + break; + case MSR_IA32_SYSENTER_EIP: + vcpu->svm->vmcb->save.sysenter_eip = data; + break; + case MSR_IA32_SYSENTER_ESP: + vcpu->svm->vmcb->save.sysenter_esp = data; + break; + default: + printk(KERN_ERR "kvm: unhandled wrmsr: %x\n", ecx); + return 1; + } + return 0; +} + +static int wrmsr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 ecx = vcpu->regs[VCPU_REGS_RCX]; + u64 data = (vcpu->svm->vmcb->save.rax & -1u) + | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32); + vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; + if (svm_set_msr(vcpu, ecx, data)) + svm_inject_gp(vcpu, 0); + else + skip_emulated_instruction(vcpu); + return 1; +} + +static int msr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + if (vcpu->svm->vmcb->control.exit_info_1) + return wrmsr_interception(vcpu, kvm_run); + else + return rdmsr_interception(vcpu, kvm_run); +} + +static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) = { + [SVM_EXIT_READ_CR0] = emulate_on_interception, + [SVM_EXIT_READ_CR3] = emulate_on_interception, + [SVM_EXIT_READ_CR4] = emulate_on_interception, + /* for now: */ + [SVM_EXIT_WRITE_CR0] = emulate_on_interception, + [SVM_EXIT_WRITE_CR3] = emulate_on_interception, + [SVM_EXIT_WRITE_CR4] = emulate_on_interception, + [SVM_EXIT_READ_DR0] = emulate_on_interception, + [SVM_EXIT_READ_DR1] = emulate_on_interception, + [SVM_EXIT_READ_DR2] = emulate_on_interception, + [SVM_EXIT_READ_DR3] = emulate_on_interception, + [SVM_EXIT_WRITE_DR0] = emulate_on_interception, + [SVM_EXIT_WRITE_DR1] = emulate_on_interception, + [SVM_EXIT_WRITE_DR2] = emulate_on_interception, + [SVM_EXIT_WRITE_DR3] = emulate_on_interception, + [SVM_EXIT_WRITE_DR5] = emulate_on_interception, + [SVM_EXIT_WRITE_DR7] = emulate_on_interception, + [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, + [SVM_EXIT_INTR] = nop_on_interception, + [SVM_EXIT_NMI] = nop_on_interception, + [SVM_EXIT_SMI] = nop_on_interception, + [SVM_EXIT_INIT] = nop_on_interception, + /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ + [SVM_EXIT_CPUID] = cpuid_interception, + [SVM_EXIT_HLT] = halt_interception, + [SVM_EXIT_INVLPG] = emulate_on_interception, + [SVM_EXIT_INVLPGA] = invalid_op_interception, + [SVM_EXIT_IOIO] = io_interception, + [SVM_EXIT_MSR] = msr_interception, + [SVM_EXIT_TASK_SWITCH] = task_switch_interception, + [SVM_EXIT_VMRUN] = invalid_op_interception, + [SVM_EXIT_VMMCALL] = invalid_op_interception, + [SVM_EXIT_VMLOAD] = invalid_op_interception, + [SVM_EXIT_VMSAVE] = invalid_op_interception, + [SVM_EXIT_STGI] = invalid_op_interception, + [SVM_EXIT_CLGI] = invalid_op_interception, + [SVM_EXIT_SKINIT] = invalid_op_interception, +}; + + +static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 exit_code = vcpu->svm->vmcb->control.exit_code; + + kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT; + + if (is_external_interrupt(vcpu->svm->vmcb->control.exit_int_info) && + exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR) + printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " + "exit_code 0x%x\n", + __FUNCTION__, vcpu->svm->vmcb->control.exit_int_info, + exit_code); + + if (exit_code >= sizeof(svm_exit_handlers) / sizeof(*svm_exit_handlers) + || svm_exit_handlers[exit_code] == 0) { + kvm_run->exit_reason = KVM_EXIT_UNKNOWN; + printk(KERN_ERR "%s: 0x%x @ 0x%llx cr0 0x%lx rflags 0x%llx\n", + __FUNCTION__, + exit_code, + vcpu->svm->vmcb->save.rip, + vcpu->cr0, + vcpu->svm->vmcb->save.rflags); + return 0; + } + + return svm_exit_handlers[exit_code](vcpu, kvm_run); +} + +static void reload_tss(struct kvm_vcpu *vcpu) +{ + int cpu = raw_smp_processor_id(); + + struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); + svm_data->tss_desc->type = 9; //available 32/64-bit TSS + load_TR_desc(); +} + +static void pre_svm_run(struct kvm_vcpu *vcpu) +{ + int cpu = raw_smp_processor_id(); + + struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); + + vcpu->svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; + if (vcpu->cpu != cpu || + vcpu->svm->asid_generation != svm_data->asid_generation) + new_asid(vcpu, svm_data); +} + + +static inline void kvm_try_inject_irq(struct kvm_vcpu *vcpu) +{ + struct vmcb_control_area *control; + + if (!vcpu->irq_summary) + return; + + control = &vcpu->svm->vmcb->control; + + control->int_vector = pop_irq(vcpu); + control->int_ctl &= ~V_INTR_PRIO_MASK; + control->int_ctl |= V_IRQ_MASK | + ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); +} + +static void kvm_reput_irq(struct kvm_vcpu *vcpu) +{ + struct vmcb_control_area *control = &vcpu->svm->vmcb->control; + + if (control->int_ctl & V_IRQ_MASK) { + control->int_ctl &= ~V_IRQ_MASK; + push_irq(vcpu, control->int_vector); + } +} + +static void save_db_regs(unsigned long *db_regs) +{ +#ifdef __x86_64__ + asm ("mov %%dr0, %%rax \n\t" + "mov %%rax, %[dr0] \n\t" + "mov %%dr1, %%rax \n\t" + "mov %%rax, %[dr1] \n\t" + "mov %%dr2, %%rax \n\t" + "mov %%rax, %[dr2] \n\t" + "mov %%dr3, %%rax \n\t" + "mov %%rax, %[dr3] \n\t" + : [dr0] "=m"(db_regs[0]), + [dr1] "=m"(db_regs[1]), + [dr2] "=m"(db_regs[2]), + [dr3] "=m"(db_regs[3]) + : : "rax"); +#else + asm ("mov %%dr0, %%eax \n\t" + "mov %%eax, %[dr0] \n\t" + "mov %%dr1, %%eax \n\t" + "mov %%eax, %[dr1] \n\t" + "mov %%dr2, %%eax \n\t" + "mov %%eax, %[dr2] \n\t" + "mov %%dr3, %%eax \n\t" + "mov %%eax, %[dr3] \n\t" + : [dr0] "=m"(db_regs[0]), + [dr1] "=m"(db_regs[1]), + [dr2] "=m"(db_regs[2]), + [dr3] "=m"(db_regs[3]) + : : "eax"); +#endif +} + +static void load_db_regs(unsigned long *db_regs) +{ + asm volatile ("mov %[dr0], %%dr0 \n\t" + "mov %[dr1], %%dr1 \n\t" + "mov %[dr2], %%dr2 \n\t" + "mov %[dr3], %%dr3 \n\t" + : + : [dr0] "r"(db_regs[0]), + [dr1] "r"(db_regs[1]), + [dr2] "r"(db_regs[2]), + [dr3] "r"(db_regs[3]) +#ifdef __x86_64__ + : "rax"); +#else + : "eax"); +#endif +} + +static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u16 fs_selector; + u16 gs_selector; + u16 ldt_selector; + +again: + kvm_try_inject_irq(vcpu); + + clgi(); + + pre_svm_run(vcpu); + + save_host_msrs(vcpu); + fs_selector = read_fs(); + gs_selector = read_gs(); + ldt_selector = read_ldt(); + vcpu->svm->host_cr2 = kvm_read_cr2(); + vcpu->svm->host_dr6 = read_dr6(); + vcpu->svm->host_dr7 = read_dr7(); + vcpu->svm->vmcb->save.cr2 = vcpu->cr2; + + if (vcpu->svm->vmcb->save.dr7 & 0xff) { + write_dr7(0); + save_db_regs(vcpu->svm->host_db_regs); + load_db_regs(vcpu->svm->db_regs); + } + asm volatile ( +#ifdef __x86_64__ + "push %%rbx; push %%rcx; push %%rdx;" + "push %%rsi; push %%rdi; push %%rbp;" + "push %%r8; push %%r9; push %%r10; push %%r11;" + "push %%r12; push %%r13; push %%r14; push %%r15;" +#else + "push %%ebx; push %%ecx; push %%edx;" + "push %%esi; push %%edi; push %%ebp;" +#endif + +#ifdef __x86_64__ + "mov %c[rbx](%[vcpu]), %%rbx \n\t" + "mov %c[rcx](%[vcpu]), %%rcx \n\t" + "mov %c[rdx](%[vcpu]), %%rdx \n\t" + "mov %c[rsi](%[vcpu]), %%rsi \n\t" + "mov %c[rdi](%[vcpu]), %%rdi \n\t" + "mov %c[rbp](%[vcpu]), %%rbp \n\t" + "mov %c[r8](%[vcpu]), %%r8 \n\t" + "mov %c[r9](%[vcpu]), %%r9 \n\t" + "mov %c[r10](%[vcpu]), %%r10 \n\t" + "mov %c[r11](%[vcpu]), %%r11 \n\t" + "mov %c[r12](%[vcpu]), %%r12 \n\t" + "mov %c[r13](%[vcpu]), %%r13 \n\t" + "mov %c[r14](%[vcpu]), %%r14 \n\t" + "mov %c[r15](%[vcpu]), %%r15 \n\t" +#else + "mov %c[rbx](%[vcpu]), %%ebx \n\t" + "mov %c[rcx](%[vcpu]), %%ecx \n\t" + "mov %c[rdx](%[vcpu]), %%edx \n\t" + "mov %c[rsi](%[vcpu]), %%esi \n\t" + "mov %c[rdi](%[vcpu]), %%edi \n\t" + "mov %c[rbp](%[vcpu]), %%ebp \n\t" +#endif + +#ifdef __x86_64__ + /* Enter guest mode */ + "push %%rax \n\t" + "mov %c[svm](%[vcpu]), %%rax \n\t" + "mov %c[vmcb](%%rax), %%rax \n\t" + SVM_VMLOAD "\n\t" + SVM_VMRUN "\n\t" + SVM_VMSAVE "\n\t" + "pop %%rax \n\t" +#else + /* Enter guest mode */ + "push %%eax \n\t" + "mov %c[svm](%[vcpu]), %%eax \n\t" + "mov %c[vmcb](%%eax), %%eax \n\t" + SVM_VMLOAD "\n\t" + SVM_VMRUN "\n\t" + SVM_VMSAVE "\n\t" + "pop %%eax \n\t" +#endif + + /* Save guest registers, load host registers */ +#ifdef __x86_64__ + "mov %%rbx, %c[rbx](%[vcpu]) \n\t" + "mov %%rcx, %c[rcx](%[vcpu]) \n\t" + "mov %%rdx, %c[rdx](%[vcpu]) \n\t" + "mov %%rsi, %c[rsi](%[vcpu]) \n\t" + "mov %%rdi, %c[rdi](%[vcpu]) \n\t" + "mov %%rbp, %c[rbp](%[vcpu]) \n\t" + "mov %%r8, %c[r8](%[vcpu]) \n\t" + "mov %%r9, %c[r9](%[vcpu]) \n\t" + "mov %%r10, %c[r10](%[vcpu]) \n\t" + "mov %%r11, %c[r11](%[vcpu]) \n\t" + "mov %%r12, %c[r12](%[vcpu]) \n\t" + "mov %%r13, %c[r13](%[vcpu]) \n\t" + "mov %%r14, %c[r14](%[vcpu]) \n\t" + "mov %%r15, %c[r15](%[vcpu]) \n\t" + + "pop %%r15; pop %%r14; pop %%r13; pop %%r12;" + "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" + "pop %%rbp; pop %%rdi; pop %%rsi;" + "pop %%rdx; pop %%rcx; pop %%rbx; \n\t" +#else + "mov %%ebx, %c[rbx](%[vcpu]) \n\t" + "mov %%ecx, %c[rcx](%[vcpu]) \n\t" + "mov %%edx, %c[rdx](%[vcpu]) \n\t" + "mov %%esi, %c[rsi](%[vcpu]) \n\t" + "mov %%edi, %c[rdi](%[vcpu]) \n\t" + "mov %%ebp, %c[rbp](%[vcpu]) \n\t" + + "pop %%ebp; pop %%edi; pop %%esi;" + "pop %%edx; pop %%ecx; pop %%ebx; \n\t" +#endif + : + : [vcpu]"a"(vcpu), + [svm]"i"(offsetof(struct kvm_vcpu, svm)), + [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), + [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])), + [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])), + [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])), + [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])), + [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])), + [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])) +#ifdef __x86_64__ + ,[r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])), + [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])), + [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])), + [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])), + [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])), + [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])), + [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])), + [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])) +#endif + : "cc", "memory" ); + + if ((vcpu->svm->vmcb->save.dr7 & 0xff)) + load_db_regs(vcpu->svm->host_db_regs); + + vcpu->cr2 = vcpu->svm->vmcb->save.cr2; + + write_dr6(vcpu->svm->host_dr6); + write_dr7(vcpu->svm->host_dr7); + kvm_write_cr2(vcpu->svm->host_cr2); + + load_fs(fs_selector); + load_gs(gs_selector); + load_ldt(ldt_selector); + load_host_msrs(vcpu); + + reload_tss(vcpu); + + stgi(); + + kvm_reput_irq(vcpu); + + vcpu->svm->next_rip = 0; + + if (vcpu->svm->vmcb->control.exit_code == SVM_EXIT_ERR) { + kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; + kvm_run->exit_reason = vcpu->svm->vmcb->control.exit_code; + return 0; + } + + if (handle_exit(vcpu, kvm_run)) { + if (signal_pending(current)) { + ++kvm_stat.signal_exits; + return -EINTR; + } + kvm_resched(vcpu); + goto again; + } + return 0; +} + +static void svm_flush_tlb(struct kvm_vcpu *vcpu) +{ + force_new_asid(vcpu); +} + +static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) +{ + vcpu->svm->vmcb->save.cr3 = root; + force_new_asid(vcpu); +} + +static void svm_inject_page_fault(struct kvm_vcpu *vcpu, + unsigned long addr, + uint32_t err_code) +{ + uint32_t exit_int_info = vcpu->svm->vmcb->control.exit_int_info; + + ++kvm_stat.pf_guest; + + if (is_page_fault(exit_int_info)) { + + vcpu->svm->vmcb->control.event_inj_err = 0; + vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_VALID_ERR | + SVM_EVTINJ_TYPE_EXEPT | + DF_VECTOR; + return; + } + vcpu->cr2 = addr; + vcpu->svm->vmcb->save.cr2 = addr; + vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_VALID_ERR | + SVM_EVTINJ_TYPE_EXEPT | + PF_VECTOR; + vcpu->svm->vmcb->control.event_inj_err = err_code; +} + + +static int is_disabled(void) +{ + return 0; +} + +static struct kvm_arch_ops svm_arch_ops = { + .cpu_has_kvm_support = has_svm, + .disabled_by_bios = is_disabled, + .hardware_setup = svm_hardware_setup, + .hardware_unsetup = svm_hardware_unsetup, + .hardware_enable = svm_hardware_enable, + .hardware_disable = svm_hardware_disable, + + .vcpu_create = svm_create_vcpu, + .vcpu_free = svm_free_vcpu, + + .vcpu_load = svm_vcpu_load, + .vcpu_put = svm_vcpu_put, + + .set_guest_debug = svm_guest_debug, + .get_msr = svm_get_msr, + .set_msr = svm_set_msr, + .get_segment_base = svm_get_segment_base, + .get_segment = svm_get_segment, + .set_segment = svm_set_segment, + .is_long_mode = svm_is_long_mode, + .get_cs_db_l_bits = svm_get_cs_db_l_bits, + .set_cr0 = svm_set_cr0, + .set_cr0_no_modeswitch = svm_set_cr0, + .set_cr3 = svm_set_cr3, + .set_cr4 = svm_set_cr4, + .set_efer = svm_set_efer, + .get_idt = svm_get_idt, + .set_idt = svm_set_idt, + .get_gdt = svm_get_gdt, + .set_gdt = svm_set_gdt, + .get_dr = svm_get_dr, + .set_dr = svm_set_dr, + .cache_regs = svm_cache_regs, + .decache_regs = svm_decache_regs, + .get_rflags = svm_get_rflags, + .set_rflags = svm_set_rflags, + + .invlpg = svm_invlpg, + .tlb_flush = svm_flush_tlb, + .inject_page_fault = svm_inject_page_fault, + + .inject_gp = svm_inject_gp, + + .run = svm_vcpu_run, + .skip_emulated_instruction = skip_emulated_instruction, + .vcpu_setup = svm_vcpu_setup, +}; + +static int __init svm_init(void) +{ + kvm_emulator_want_group7_invlpg(); + kvm_init_arch(&svm_arch_ops, THIS_MODULE); + return 0; +} + +static void __exit svm_exit(void) +{ + kvm_exit_arch(); +} + +module_init(svm_init) +module_exit(svm_exit) diff --git a/drivers/kvm/svm.h b/drivers/kvm/svm.h new file mode 100644 index 000000000000..df731c3fb588 --- /dev/null +++ b/drivers/kvm/svm.h @@ -0,0 +1,315 @@ +#ifndef __SVM_H +#define __SVM_H + +enum { + INTERCEPT_INTR, + INTERCEPT_NMI, + INTERCEPT_SMI, + INTERCEPT_INIT, + INTERCEPT_VINTR, + INTERCEPT_SELECTIVE_CR0, + INTERCEPT_STORE_IDTR, + INTERCEPT_STORE_GDTR, + INTERCEPT_STORE_LDTR, + INTERCEPT_STORE_TR, + INTERCEPT_LOAD_IDTR, + INTERCEPT_LOAD_GDTR, + INTERCEPT_LOAD_LDTR, + INTERCEPT_LOAD_TR, + INTERCEPT_RDTSC, + INTERCEPT_RDPMC, + INTERCEPT_PUSHF, + INTERCEPT_POPF, + INTERCEPT_CPUID, + INTERCEPT_RSM, + INTERCEPT_IRET, + INTERCEPT_INTn, + INTERCEPT_INVD, + INTERCEPT_PAUSE, + INTERCEPT_HLT, + INTERCEPT_INVLPG, + INTERCEPT_INVLPGA, + INTERCEPT_IOIO_PROT, + INTERCEPT_MSR_PROT, + INTERCEPT_TASK_SWITCH, + INTERCEPT_FERR_FREEZE, + INTERCEPT_SHUTDOWN, + INTERCEPT_VMRUN, + INTERCEPT_VMMCALL, + INTERCEPT_VMLOAD, + INTERCEPT_VMSAVE, + INTERCEPT_STGI, + INTERCEPT_CLGI, + INTERCEPT_SKINIT, + INTERCEPT_RDTSCP, + INTERCEPT_ICEBP, + INTERCEPT_WBINVD, +}; + + +struct __attribute__ ((__packed__)) vmcb_control_area { + u16 intercept_cr_read; + u16 intercept_cr_write; + u16 intercept_dr_read; + u16 intercept_dr_write; + u32 intercept_exceptions; + u64 intercept; + u8 reserved_1[44]; + u64 iopm_base_pa; + u64 msrpm_base_pa; + u64 tsc_offset; + u32 asid; + u8 tlb_ctl; + u8 reserved_2[3]; + u32 int_ctl; + u32 int_vector; + u32 int_state; + u8 reserved_3[4]; + u32 exit_code; + u32 exit_code_hi; + u64 exit_info_1; + u64 exit_info_2; + u32 exit_int_info; + u32 exit_int_info_err; + u64 nested_ctl; + u8 reserved_4[16]; + u32 event_inj; + u32 event_inj_err; + u64 nested_cr3; + u64 lbr_ctl; + u8 reserved_5[832]; +}; + + +#define TLB_CONTROL_DO_NOTHING 0 +#define TLB_CONTROL_FLUSH_ALL_ASID 1 + +#define V_TPR_MASK 0x0f + +#define V_IRQ_SHIFT 8 +#define V_IRQ_MASK (1 << V_IRQ_SHIFT) + +#define V_INTR_PRIO_SHIFT 16 +#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT) + +#define V_IGN_TPR_SHIFT 20 +#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT) + +#define V_INTR_MASKING_SHIFT 24 +#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) + +#define SVM_INTERRUPT_SHADOW_MASK 1 + +#define SVM_IOIO_STR_SHIFT 2 +#define SVM_IOIO_REP_SHIFT 3 +#define SVM_IOIO_SIZE_SHIFT 4 +#define SVM_IOIO_ASIZE_SHIFT 7 + +#define SVM_IOIO_TYPE_MASK 1 +#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT) +#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT) +#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) +#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) + +struct __attribute__ ((__packed__)) vmcb_seg { + u16 selector; + u16 attrib; + u32 limit; + u64 base; +}; + +struct __attribute__ ((__packed__)) vmcb_save_area { + struct vmcb_seg es; + struct vmcb_seg cs; + struct vmcb_seg ss; + struct vmcb_seg ds; + struct vmcb_seg fs; + struct vmcb_seg gs; + struct vmcb_seg gdtr; + struct vmcb_seg ldtr; + struct vmcb_seg idtr; + struct vmcb_seg tr; + u8 reserved_1[43]; + u8 cpl; + u8 reserved_2[4]; + u64 efer; + u8 reserved_3[112]; + u64 cr4; + u64 cr3; + u64 cr0; + u64 dr7; + u64 dr6; + u64 rflags; + u64 rip; + u8 reserved_4[88]; + u64 rsp; + u8 reserved_5[24]; + u64 rax; + u64 star; + u64 lstar; + u64 cstar; + u64 sfmask; + u64 kernel_gs_base; + u64 sysenter_cs; + u64 sysenter_esp; + u64 sysenter_eip; + u64 cr2; + u8 reserved_6[32]; + u64 g_pat; + u64 dbgctl; + u64 br_from; + u64 br_to; + u64 last_excp_from; + u64 last_excp_to; +}; + +struct __attribute__ ((__packed__)) vmcb { + struct vmcb_control_area control; + struct vmcb_save_area save; +}; + +#define SVM_CPUID_FEATURE_SHIFT 2 +#define SVM_CPUID_FUNC 0x8000000a + +#define MSR_EFER_SVME_MASK (1ULL << 12) +#define MSR_VM_HSAVE_PA 0xc0010117ULL + +#define SVM_SELECTOR_S_SHIFT 4 +#define SVM_SELECTOR_DPL_SHIFT 5 +#define SVM_SELECTOR_P_SHIFT 7 +#define SVM_SELECTOR_AVL_SHIFT 8 +#define SVM_SELECTOR_L_SHIFT 9 +#define SVM_SELECTOR_DB_SHIFT 10 +#define SVM_SELECTOR_G_SHIFT 11 + +#define SVM_SELECTOR_TYPE_MASK (0xf) +#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT) +#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT) +#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT) +#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT) +#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT) +#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT) +#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT) + +#define SVM_SELECTOR_WRITE_MASK (1 << 1) +#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK +#define SVM_SELECTOR_CODE_MASK (1 << 3) + +#define INTERCEPT_CR0_MASK 1 +#define INTERCEPT_CR3_MASK (1 << 3) +#define INTERCEPT_CR4_MASK (1 << 4) + +#define INTERCEPT_DR0_MASK 1 +#define INTERCEPT_DR1_MASK (1 << 1) +#define INTERCEPT_DR2_MASK (1 << 2) +#define INTERCEPT_DR3_MASK (1 << 3) +#define INTERCEPT_DR4_MASK (1 << 4) +#define INTERCEPT_DR5_MASK (1 << 5) +#define INTERCEPT_DR6_MASK (1 << 6) +#define INTERCEPT_DR7_MASK (1 << 7) + +#define SVM_EVTINJ_VEC_MASK 0xff + +#define SVM_EVTINJ_TYPE_SHIFT 8 +#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT) + +#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT) + +#define SVM_EVTINJ_VALID (1 << 31) +#define SVM_EVTINJ_VALID_ERR (1 << 11) + +#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK + +#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR +#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI +#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT +#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT + +#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID +#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR + +#define SVM_EXIT_READ_CR0 0x000 +#define SVM_EXIT_READ_CR3 0x003 +#define SVM_EXIT_READ_CR4 0x004 +#define SVM_EXIT_READ_CR8 0x008 +#define SVM_EXIT_WRITE_CR0 0x010 +#define SVM_EXIT_WRITE_CR3 0x013 +#define SVM_EXIT_WRITE_CR4 0x014 +#define SVM_EXIT_WRITE_CR8 0x018 +#define SVM_EXIT_READ_DR0 0x020 +#define SVM_EXIT_READ_DR1 0x021 +#define SVM_EXIT_READ_DR2 0x022 +#define SVM_EXIT_READ_DR3 0x023 +#define SVM_EXIT_READ_DR4 0x024 +#define SVM_EXIT_READ_DR5 0x025 +#define SVM_EXIT_READ_DR6 0x026 +#define SVM_EXIT_READ_DR7 0x027 +#define SVM_EXIT_WRITE_DR0 0x030 +#define SVM_EXIT_WRITE_DR1 0x031 +#define SVM_EXIT_WRITE_DR2 0x032 +#define SVM_EXIT_WRITE_DR3 0x033 +#define SVM_EXIT_WRITE_DR4 0x034 +#define SVM_EXIT_WRITE_DR5 0x035 +#define SVM_EXIT_WRITE_DR6 0x036 +#define SVM_EXIT_WRITE_DR7 0x037 +#define SVM_EXIT_EXCP_BASE 0x040 +#define SVM_EXIT_INTR 0x060 +#define SVM_EXIT_NMI 0x061 +#define SVM_EXIT_SMI 0x062 +#define SVM_EXIT_INIT 0x063 +#define SVM_EXIT_VINTR 0x064 +#define SVM_EXIT_CR0_SEL_WRITE 0x065 +#define SVM_EXIT_IDTR_READ 0x066 +#define SVM_EXIT_GDTR_READ 0x067 +#define SVM_EXIT_LDTR_READ 0x068 +#define SVM_EXIT_TR_READ 0x069 +#define SVM_EXIT_IDTR_WRITE 0x06a +#define SVM_EXIT_GDTR_WRITE 0x06b +#define SVM_EXIT_LDTR_WRITE 0x06c +#define SVM_EXIT_TR_WRITE 0x06d +#define SVM_EXIT_RDTSC 0x06e +#define SVM_EXIT_RDPMC 0x06f +#define SVM_EXIT_PUSHF 0x070 +#define SVM_EXIT_POPF 0x071 +#define SVM_EXIT_CPUID 0x072 +#define SVM_EXIT_RSM 0x073 +#define SVM_EXIT_IRET 0x074 +#define SVM_EXIT_SWINT 0x075 +#define SVM_EXIT_INVD 0x076 +#define SVM_EXIT_PAUSE 0x077 +#define SVM_EXIT_HLT 0x078 +#define SVM_EXIT_INVLPG 0x079 +#define SVM_EXIT_INVLPGA 0x07a +#define SVM_EXIT_IOIO 0x07b +#define SVM_EXIT_MSR 0x07c +#define SVM_EXIT_TASK_SWITCH 0x07d +#define SVM_EXIT_FERR_FREEZE 0x07e +#define SVM_EXIT_SHUTDOWN 0x07f +#define SVM_EXIT_VMRUN 0x080 +#define SVM_EXIT_VMMCALL 0x081 +#define SVM_EXIT_VMLOAD 0x082 +#define SVM_EXIT_VMSAVE 0x083 +#define SVM_EXIT_STGI 0x084 +#define SVM_EXIT_CLGI 0x085 +#define SVM_EXIT_SKINIT 0x086 +#define SVM_EXIT_RDTSCP 0x087 +#define SVM_EXIT_ICEBP 0x088 +#define SVM_EXIT_WBINVD 0x089 +#define SVM_EXIT_NPF 0x400 + +#define SVM_EXIT_ERR -1 + +#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) // TS and MP + +#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" +#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" +#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb" +#define SVM_CLGI ".byte 0x0f, 0x01, 0xdd" +#define SVM_STGI ".byte 0x0f, 0x01, 0xdc" +#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf" + +#endif + diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c new file mode 100644 index 000000000000..bda7a7ae2167 --- /dev/null +++ b/drivers/kvm/vmx.c @@ -0,0 +1,2002 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Avi Kivity + * Yaniv Kamay + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "kvm.h" +#include "vmx.h" +#include "kvm_vmx.h" +#include +#include +#include +#include + +#include "segment_descriptor.h" + +#define MSR_IA32_FEATURE_CONTROL 0x03a + +MODULE_AUTHOR("Qumranet"); +MODULE_LICENSE("GPL"); + +static DEFINE_PER_CPU(struct vmcs *, vmxarea); +static DEFINE_PER_CPU(struct vmcs *, current_vmcs); + +#ifdef __x86_64__ +#define HOST_IS_64 1 +#else +#define HOST_IS_64 0 +#endif + +static struct vmcs_descriptor { + int size; + int order; + u32 revision_id; +} vmcs_descriptor; + +#define VMX_SEGMENT_FIELD(seg) \ + [VCPU_SREG_##seg] = { \ + .selector = GUEST_##seg##_SELECTOR, \ + .base = GUEST_##seg##_BASE, \ + .limit = GUEST_##seg##_LIMIT, \ + .ar_bytes = GUEST_##seg##_AR_BYTES, \ + } + +static struct kvm_vmx_segment_field { + unsigned selector; + unsigned base; + unsigned limit; + unsigned ar_bytes; +} kvm_vmx_segment_fields[] = { + VMX_SEGMENT_FIELD(CS), + VMX_SEGMENT_FIELD(DS), + VMX_SEGMENT_FIELD(ES), + VMX_SEGMENT_FIELD(FS), + VMX_SEGMENT_FIELD(GS), + VMX_SEGMENT_FIELD(SS), + VMX_SEGMENT_FIELD(TR), + VMX_SEGMENT_FIELD(LDTR), +}; + +static const u32 vmx_msr_index[] = { +#ifdef __x86_64__ + MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, +#endif + MSR_EFER, MSR_K6_STAR, +}; +#define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index)) + +struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr); + +static inline int is_page_fault(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); +} + +static inline int is_external_interrupt(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) + == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); +} + +static void vmcs_clear(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(vmcs); + u8 error; + + asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0" + : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) + : "cc", "memory"); + if (error) + printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", + vmcs, phys_addr); +} + +static void __vcpu_clear(void *arg) +{ + struct kvm_vcpu *vcpu = arg; + int cpu = smp_processor_id(); + + if (vcpu->cpu == cpu) + vmcs_clear(vcpu->vmcs); + if (per_cpu(current_vmcs, cpu) == vcpu->vmcs) + per_cpu(current_vmcs, cpu) = NULL; +} + +static unsigned long vmcs_readl(unsigned long field) +{ + unsigned long value; + + asm volatile (ASM_VMX_VMREAD_RDX_RAX + : "=a"(value) : "d"(field) : "cc"); + return value; +} + +static u16 vmcs_read16(unsigned long field) +{ + return vmcs_readl(field); +} + +static u32 vmcs_read32(unsigned long field) +{ + return vmcs_readl(field); +} + +static u64 vmcs_read64(unsigned long field) +{ +#ifdef __x86_64__ + return vmcs_readl(field); +#else + return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32); +#endif +} + +static void vmcs_writel(unsigned long field, unsigned long value) +{ + u8 error; + + asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" + : "=q"(error) : "a"(value), "d"(field) : "cc" ); + if (error) + printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", + field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); +} + +static void vmcs_write16(unsigned long field, u16 value) +{ + vmcs_writel(field, value); +} + +static void vmcs_write32(unsigned long field, u32 value) +{ + vmcs_writel(field, value); +} + +static void vmcs_write64(unsigned long field, u64 value) +{ +#ifdef __x86_64__ + vmcs_writel(field, value); +#else + vmcs_writel(field, value); + asm volatile (""); + vmcs_writel(field+1, value >> 32); +#endif +} + +/* + * Switches to specified vcpu, until a matching vcpu_put(), but assumes + * vcpu mutex is already taken. + */ +static struct kvm_vcpu *vmx_vcpu_load(struct kvm_vcpu *vcpu) +{ + u64 phys_addr = __pa(vcpu->vmcs); + int cpu; + + cpu = get_cpu(); + + if (vcpu->cpu != cpu) { + smp_call_function(__vcpu_clear, vcpu, 0, 1); + vcpu->launched = 0; + } + + if (per_cpu(current_vmcs, cpu) != vcpu->vmcs) { + u8 error; + + per_cpu(current_vmcs, cpu) = vcpu->vmcs; + asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0" + : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) + : "cc"); + if (error) + printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", + vcpu->vmcs, phys_addr); + } + + if (vcpu->cpu != cpu) { + struct descriptor_table dt; + unsigned long sysenter_esp; + + vcpu->cpu = cpu; + /* + * Linux uses per-cpu TSS and GDT, so set these when switching + * processors. + */ + vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */ + get_gdt(&dt); + vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ + + rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); + vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ + } + return vcpu; +} + +static void vmx_vcpu_put(struct kvm_vcpu *vcpu) +{ + put_cpu(); +} + +static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) +{ + return vmcs_readl(GUEST_RFLAGS); +} + +static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + vmcs_writel(GUEST_RFLAGS, rflags); +} + +static void skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + unsigned long rip; + u32 interruptibility; + + rip = vmcs_readl(GUEST_RIP); + rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs_writel(GUEST_RIP, rip); + + /* + * We emulated an instruction, so temporary interrupt blocking + * should be removed, if set. + */ + interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + if (interruptibility & 3) + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, + interruptibility & ~3); +} + +static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) +{ + printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n", + vmcs_readl(GUEST_RIP)); + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + GP_VECTOR | + INTR_TYPE_EXCEPTION | + INTR_INFO_DELIEVER_CODE_MASK | + INTR_INFO_VALID_MASK); +} + +/* + * reads and returns guest's timestamp counter "register" + * guest_tsc = host_tsc + tsc_offset -- 21.3 + */ +static u64 guest_read_tsc(void) +{ + u64 host_tsc, tsc_offset; + + rdtscll(host_tsc); + tsc_offset = vmcs_read64(TSC_OFFSET); + return host_tsc + tsc_offset; +} + +/* + * writes 'guest_tsc' into guest's timestamp counter "register" + * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc + */ +static void guest_write_tsc(u64 guest_tsc) +{ + u64 host_tsc; + + rdtscll(host_tsc); + vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); +} + +static void reload_tss(void) +{ +#ifndef __x86_64__ + + /* + * VT restores TR but not its size. Useless. + */ + struct descriptor_table gdt; + struct segment_descriptor *descs; + + get_gdt(&gdt); + descs = (void *)gdt.base; + descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ + load_TR_desc(); +#endif +} + +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +{ + u64 data; + struct vmx_msr_entry *msr; + + if (!pdata) { + printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); + return -EINVAL; + } + + switch (msr_index) { +#ifdef __x86_64__ + case MSR_FS_BASE: + data = vmcs_readl(GUEST_FS_BASE); + break; + case MSR_GS_BASE: + data = vmcs_readl(GUEST_GS_BASE); + break; + case MSR_EFER: + data = vcpu->shadow_efer; + break; +#endif + case MSR_IA32_TIME_STAMP_COUNTER: + data = guest_read_tsc(); + break; + case MSR_IA32_SYSENTER_CS: + data = vmcs_read32(GUEST_SYSENTER_CS); + break; + case MSR_IA32_SYSENTER_EIP: + data = vmcs_read32(GUEST_SYSENTER_EIP); + break; + case MSR_IA32_SYSENTER_ESP: + data = vmcs_read32(GUEST_SYSENTER_ESP); + break; + case MSR_IA32_MC0_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MC0_MISC: + case MSR_IA32_MC0_MISC+4: + case MSR_IA32_MC0_MISC+8: + case MSR_IA32_MC0_MISC+12: + case MSR_IA32_MC0_MISC+16: + case MSR_IA32_UCODE_REV: + /* MTRR registers */ + case 0xfe: + case 0x200 ... 0x2ff: + data = 0; + break; + case MSR_IA32_APICBASE: + data = vcpu->apic_base; + break; + default: + msr = find_msr_entry(vcpu, msr_index); + if (!msr) { + printk(KERN_ERR "kvm: unhandled rdmsr: %x\n", msr_index); + return 1; + } + data = msr->data; + break; + } + + *pdata = data; + return 0; +} + +/* + * Writes msr value into into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +{ + struct vmx_msr_entry *msr; + switch (msr_index) { +#ifdef __x86_64__ + case MSR_FS_BASE: + vmcs_writel(GUEST_FS_BASE, data); + break; + case MSR_GS_BASE: + vmcs_writel(GUEST_GS_BASE, data); + break; +#endif + case MSR_IA32_SYSENTER_CS: + vmcs_write32(GUEST_SYSENTER_CS, data); + break; + case MSR_IA32_SYSENTER_EIP: + vmcs_write32(GUEST_SYSENTER_EIP, data); + break; + case MSR_IA32_SYSENTER_ESP: + vmcs_write32(GUEST_SYSENTER_ESP, data); + break; +#ifdef __x86_64 + case MSR_EFER: + set_efer(vcpu, data); + break; + case MSR_IA32_MC0_STATUS: + printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n" + , __FUNCTION__, data); + break; +#endif + case MSR_IA32_TIME_STAMP_COUNTER: { + guest_write_tsc(data); + break; + } + case MSR_IA32_UCODE_REV: + case MSR_IA32_UCODE_WRITE: + case 0x200 ... 0x2ff: /* MTRRs */ + break; + case MSR_IA32_APICBASE: + vcpu->apic_base = data; + break; + default: + msr = find_msr_entry(vcpu, msr_index); + if (!msr) { + printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr_index); + return 1; + } + msr->data = data; + break; + } + + return 0; +} + +/* + * Sync the rsp and rip registers into the vcpu structure. This allows + * registers to be accessed by indexing vcpu->regs. + */ +static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) +{ + vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); + vcpu->rip = vmcs_readl(GUEST_RIP); +} + +/* + * Syncs rsp and rip back into the vmcs. Should be called after possible + * modification. + */ +static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) +{ + vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]); + vmcs_writel(GUEST_RIP, vcpu->rip); +} + +static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) +{ + unsigned long dr7 = 0x400; + u32 exception_bitmap; + int old_singlestep; + + exception_bitmap = vmcs_read32(EXCEPTION_BITMAP); + old_singlestep = vcpu->guest_debug.singlestep; + + vcpu->guest_debug.enabled = dbg->enabled; + if (vcpu->guest_debug.enabled) { + int i; + + dr7 |= 0x200; /* exact */ + for (i = 0; i < 4; ++i) { + if (!dbg->breakpoints[i].enabled) + continue; + vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address; + dr7 |= 2 << (i*2); /* global enable */ + dr7 |= 0 << (i*4+16); /* execution breakpoint */ + } + + exception_bitmap |= (1u << 1); /* Trap debug exceptions */ + + vcpu->guest_debug.singlestep = dbg->singlestep; + } else { + exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */ + vcpu->guest_debug.singlestep = 0; + } + + if (old_singlestep && !vcpu->guest_debug.singlestep) { + unsigned long flags; + + flags = vmcs_readl(GUEST_RFLAGS); + flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + vmcs_writel(GUEST_RFLAGS, flags); + } + + vmcs_write32(EXCEPTION_BITMAP, exception_bitmap); + vmcs_writel(GUEST_DR7, dr7); + + return 0; +} + +static __init int cpu_has_kvm_support(void) +{ + unsigned long ecx = cpuid_ecx(1); + return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ +} + +static __init int vmx_disabled_by_bios(void) +{ + u64 msr; + + rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); + return (msr & 5) == 1; /* locked but not enabled */ +} + +static __init void hardware_enable(void *garbage) +{ + int cpu = raw_smp_processor_id(); + u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + u64 old; + + rdmsrl(MSR_IA32_FEATURE_CONTROL, old); + if ((old & 5) == 0) + /* enable and lock */ + wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 5); + write_cr4(read_cr4() | CR4_VMXE); /* FIXME: not cpu hotplug safe */ + asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) + : "memory", "cc"); +} + +static void hardware_disable(void *garbage) +{ + asm volatile (ASM_VMX_VMXOFF : : : "cc"); +} + +static __init void setup_vmcs_descriptor(void) +{ + u32 vmx_msr_low, vmx_msr_high; + + rdmsr(MSR_IA32_VMX_BASIC_MSR, vmx_msr_low, vmx_msr_high); + vmcs_descriptor.size = vmx_msr_high & 0x1fff; + vmcs_descriptor.order = get_order(vmcs_descriptor.size); + vmcs_descriptor.revision_id = vmx_msr_low; +}; + +static struct vmcs *alloc_vmcs_cpu(int cpu) +{ + int node = cpu_to_node(cpu); + struct page *pages; + struct vmcs *vmcs; + + pages = alloc_pages_node(node, GFP_KERNEL, vmcs_descriptor.order); + if (!pages) + return NULL; + vmcs = page_address(pages); + memset(vmcs, 0, vmcs_descriptor.size); + vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */ + return vmcs; +} + +static struct vmcs *alloc_vmcs(void) +{ + return alloc_vmcs_cpu(smp_processor_id()); +} + +static void free_vmcs(struct vmcs *vmcs) +{ + free_pages((unsigned long)vmcs, vmcs_descriptor.order); +} + +static __exit void free_kvm_area(void) +{ + int cpu; + + for_each_online_cpu(cpu) + free_vmcs(per_cpu(vmxarea, cpu)); +} + +extern struct vmcs *alloc_vmcs_cpu(int cpu); + +static __init int alloc_kvm_area(void) +{ + int cpu; + + for_each_online_cpu(cpu) { + struct vmcs *vmcs; + + vmcs = alloc_vmcs_cpu(cpu); + if (!vmcs) { + free_kvm_area(); + return -ENOMEM; + } + + per_cpu(vmxarea, cpu) = vmcs; + } + return 0; +} + +static __init int hardware_setup(void) +{ + setup_vmcs_descriptor(); + return alloc_kvm_area(); +} + +static __exit void hardware_unsetup(void) +{ + free_kvm_area(); +} + +static void update_exception_bitmap(struct kvm_vcpu *vcpu) +{ + if (vcpu->rmode.active) + vmcs_write32(EXCEPTION_BITMAP, ~0); + else + vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR); +} + +static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + if (vmcs_readl(sf->base) == save->base) { + vmcs_write16(sf->selector, save->selector); + vmcs_writel(sf->base, save->base); + vmcs_write32(sf->limit, save->limit); + vmcs_write32(sf->ar_bytes, save->ar); + } else { + u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK) + << AR_DPL_SHIFT; + vmcs_write32(sf->ar_bytes, 0x93 | dpl); + } +} + +static void enter_pmode(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + + vcpu->rmode.active = 0; + + vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base); + vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit); + vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar); + + flags = vmcs_readl(GUEST_RFLAGS); + flags &= ~(IOPL_MASK | X86_EFLAGS_VM); + flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT); + vmcs_writel(GUEST_RFLAGS, flags); + + vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) | + (vmcs_readl(CR4_READ_SHADOW) & CR4_VME_MASK)); + + update_exception_bitmap(vcpu); + + fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es); + fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds); + fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs); + fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs); + + vmcs_write16(GUEST_SS_SELECTOR, 0); + vmcs_write32(GUEST_SS_AR_BYTES, 0x93); + + vmcs_write16(GUEST_CS_SELECTOR, + vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK); + vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); +} + +static int rmode_tss_base(struct kvm* kvm) +{ + gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3; + return base_gfn << PAGE_SHIFT; +} + +static void fix_rmode_seg(int seg, struct kvm_save_segment *save) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + save->selector = vmcs_read16(sf->selector); + save->base = vmcs_readl(sf->base); + save->limit = vmcs_read32(sf->limit); + save->ar = vmcs_read32(sf->ar_bytes); + vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4); + vmcs_write32(sf->limit, 0xffff); + vmcs_write32(sf->ar_bytes, 0xf3); +} + +static void enter_rmode(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + + vcpu->rmode.active = 1; + + vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); + vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); + + vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); + vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); + + vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); + vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); + + flags = vmcs_readl(GUEST_RFLAGS); + vcpu->rmode.save_iopl = (flags & IOPL_MASK) >> IOPL_SHIFT; + + flags |= IOPL_MASK | X86_EFLAGS_VM; + + vmcs_writel(GUEST_RFLAGS, flags); + vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK); + update_exception_bitmap(vcpu); + + vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); + vmcs_write32(GUEST_SS_LIMIT, 0xffff); + vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); + + vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); + vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); + + fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es); + fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds); + fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs); + fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs); +} + +#ifdef __x86_64__ + +static void enter_lmode(struct kvm_vcpu *vcpu) +{ + u32 guest_tr_ar; + + guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); + if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { + printk(KERN_DEBUG "%s: tss fixup for long mode. \n", + __FUNCTION__); + vmcs_write32(GUEST_TR_AR_BYTES, + (guest_tr_ar & ~AR_TYPE_MASK) + | AR_TYPE_BUSY_64_TSS); + } + + vcpu->shadow_efer |= EFER_LMA; + + find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME; + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) + | VM_ENTRY_CONTROLS_IA32E_MASK); +} + +static void exit_lmode(struct kvm_vcpu *vcpu) +{ + vcpu->shadow_efer &= ~EFER_LMA; + + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) + & ~VM_ENTRY_CONTROLS_IA32E_MASK); +} + +#endif + +static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (vcpu->rmode.active && (cr0 & CR0_PE_MASK)) + enter_pmode(vcpu); + + if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK)) + enter_rmode(vcpu); + +#ifdef __x86_64__ + if (vcpu->shadow_efer & EFER_LME) { + if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) + enter_lmode(vcpu); + if (is_paging(vcpu) && !(cr0 & CR0_PG_MASK)) + exit_lmode(vcpu); + } +#endif + + vmcs_writel(CR0_READ_SHADOW, cr0); + vmcs_writel(GUEST_CR0, + (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); + vcpu->cr0 = cr0; +} + +/* + * Used when restoring the VM to avoid corrupting segment registers + */ +static void vmx_set_cr0_no_modeswitch(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + vcpu->rmode.active = ((cr0 & CR0_PE_MASK) == 0); + update_exception_bitmap(vcpu); + vmcs_writel(CR0_READ_SHADOW, cr0); + vmcs_writel(GUEST_CR0, + (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); + vcpu->cr0 = cr0; +} + +static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + vmcs_writel(GUEST_CR3, cr3); +} + +static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + vmcs_writel(CR4_READ_SHADOW, cr4); + vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ? + KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); + vcpu->cr4 = cr4; +} + +#ifdef __x86_64__ + +static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER); + + vcpu->shadow_efer = efer; + if (efer & EFER_LMA) { + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) | + VM_ENTRY_CONTROLS_IA32E_MASK); + msr->data = efer; + + } else { + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) & + ~VM_ENTRY_CONTROLS_IA32E_MASK); + + msr->data = efer & ~EFER_LME; + } +} + +#endif + +static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + return vmcs_readl(sf->base); +} + +static void vmx_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + u32 ar; + + var->base = vmcs_readl(sf->base); + var->limit = vmcs_read32(sf->limit); + var->selector = vmcs_read16(sf->selector); + ar = vmcs_read32(sf->ar_bytes); + if (ar & AR_UNUSABLE_MASK) + ar = 0; + var->type = ar & 15; + var->s = (ar >> 4) & 1; + var->dpl = (ar >> 5) & 3; + var->present = (ar >> 7) & 1; + var->avl = (ar >> 12) & 1; + var->l = (ar >> 13) & 1; + var->db = (ar >> 14) & 1; + var->g = (ar >> 15) & 1; + var->unusable = (ar >> 16) & 1; +} + +static void vmx_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + u32 ar; + + vmcs_writel(sf->base, var->base); + vmcs_write32(sf->limit, var->limit); + vmcs_write16(sf->selector, var->selector); + if (var->unusable) + ar = 1 << 16; + else { + ar = var->type & 15; + ar |= (var->s & 1) << 4; + ar |= (var->dpl & 3) << 5; + ar |= (var->present & 1) << 7; + ar |= (var->avl & 1) << 12; + ar |= (var->l & 1) << 13; + ar |= (var->db & 1) << 14; + ar |= (var->g & 1) << 15; + } + vmcs_write32(sf->ar_bytes, ar); +} + +static int vmx_is_long_mode(struct kvm_vcpu *vcpu) +{ + return vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_CONTROLS_IA32E_MASK; +} + +static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + u32 ar = vmcs_read32(GUEST_CS_AR_BYTES); + + *db = (ar >> 14) & 1; + *l = (ar >> 13) & 1; +} + +static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + dt->limit = vmcs_read32(GUEST_IDTR_LIMIT); + dt->base = vmcs_readl(GUEST_IDTR_BASE); +} + +static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + vmcs_write32(GUEST_IDTR_LIMIT, dt->limit); + vmcs_writel(GUEST_IDTR_BASE, dt->base); +} + +static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + dt->limit = vmcs_read32(GUEST_GDTR_LIMIT); + dt->base = vmcs_readl(GUEST_GDTR_BASE); +} + +static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + vmcs_write32(GUEST_GDTR_LIMIT, dt->limit); + vmcs_writel(GUEST_GDTR_BASE, dt->base); +} + +static int init_rmode_tss(struct kvm* kvm) +{ + struct page *p1, *p2, *p3; + gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; + char *page; + + p1 = _gfn_to_page(kvm, fn++); + p2 = _gfn_to_page(kvm, fn++); + p3 = _gfn_to_page(kvm, fn); + + if (!p1 || !p2 || !p3) { + kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__); + return 0; + } + + page = kmap_atomic(p1, KM_USER0); + memset(page, 0, PAGE_SIZE); + *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; + kunmap_atomic(page, KM_USER0); + + page = kmap_atomic(p2, KM_USER0); + memset(page, 0, PAGE_SIZE); + kunmap_atomic(page, KM_USER0); + + page = kmap_atomic(p3, KM_USER0); + memset(page, 0, PAGE_SIZE); + *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0; + kunmap_atomic(page, KM_USER0); + + return 1; +} + +static void vmcs_write32_fixedbits(u32 msr, u32 vmcs_field, u32 val) +{ + u32 msr_high, msr_low; + + rdmsr(msr, msr_low, msr_high); + + val &= msr_high; + val |= msr_low; + vmcs_write32(vmcs_field, val); +} + +static void seg_setup(int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + vmcs_write16(sf->selector, 0); + vmcs_writel(sf->base, 0); + vmcs_write32(sf->limit, 0xffff); + vmcs_write32(sf->ar_bytes, 0x93); +} + +/* + * Sets up the vmcs for emulated real mode. + */ +static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) +{ + u32 host_sysenter_cs; + u32 junk; + unsigned long a; + struct descriptor_table dt; + int i; + int ret = 0; + int nr_good_msrs; + extern asmlinkage void kvm_vmx_return(void); + + if (!init_rmode_tss(vcpu->kvm)) { + ret = -ENOMEM; + goto out; + } + + memset(vcpu->regs, 0, sizeof(vcpu->regs)); + vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val(); + vcpu->cr8 = 0; + vcpu->apic_base = 0xfee00000 | + /*for vcpu 0*/ MSR_IA32_APICBASE_BSP | + MSR_IA32_APICBASE_ENABLE; + + fx_init(vcpu); + + /* + * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode + * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. + */ + vmcs_write16(GUEST_CS_SELECTOR, 0xf000); + vmcs_writel(GUEST_CS_BASE, 0x000f0000); + vmcs_write32(GUEST_CS_LIMIT, 0xffff); + vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); + + seg_setup(VCPU_SREG_DS); + seg_setup(VCPU_SREG_ES); + seg_setup(VCPU_SREG_FS); + seg_setup(VCPU_SREG_GS); + seg_setup(VCPU_SREG_SS); + + vmcs_write16(GUEST_TR_SELECTOR, 0); + vmcs_writel(GUEST_TR_BASE, 0); + vmcs_write32(GUEST_TR_LIMIT, 0xffff); + vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); + + vmcs_write16(GUEST_LDTR_SELECTOR, 0); + vmcs_writel(GUEST_LDTR_BASE, 0); + vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); + vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); + + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); + + vmcs_writel(GUEST_RFLAGS, 0x02); + vmcs_writel(GUEST_RIP, 0xfff0); + vmcs_writel(GUEST_RSP, 0); + + vmcs_writel(GUEST_CR3, 0); + + //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 + vmcs_writel(GUEST_DR7, 0x400); + + vmcs_writel(GUEST_GDTR_BASE, 0); + vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); + + vmcs_writel(GUEST_IDTR_BASE, 0); + vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); + + vmcs_write32(GUEST_ACTIVITY_STATE, 0); + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); + vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); + + /* I/O */ + vmcs_write64(IO_BITMAP_A, 0); + vmcs_write64(IO_BITMAP_B, 0); + + guest_write_tsc(0); + + vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ + + /* Special registers */ + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + /* Control */ + vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR, + PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_EXT_INTR_MASK /* 20.6.1 */ + | PIN_BASED_NMI_EXITING /* 20.6.1 */ + ); + vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR, + CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_HLT_EXITING /* 20.6.2 */ + | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */ + | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */ + | CPU_BASED_UNCOND_IO_EXITING /* 20.6.2 */ + | CPU_BASED_INVDPG_EXITING + | CPU_BASED_MOV_DR_EXITING + | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */ + ); + + vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); + vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ + + vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ + vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ + vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ + + vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ + vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */ + vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ +#ifdef __x86_64__ + rdmsrl(MSR_FS_BASE, a); + vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ + rdmsrl(MSR_GS_BASE, a); + vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ +#else + vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ + vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ +#endif + + vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ + + get_idt(&dt); + vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ + + + vmcs_writel(HOST_RIP, (unsigned long)kvm_vmx_return); /* 22.2.5 */ + + rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); + vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); + rdmsrl(MSR_IA32_SYSENTER_ESP, a); + vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */ + rdmsrl(MSR_IA32_SYSENTER_EIP, a); + vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ + + ret = -ENOMEM; + vcpu->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!vcpu->guest_msrs) + goto out; + vcpu->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!vcpu->host_msrs) + goto out_free_guest_msrs; + + for (i = 0; i < NR_VMX_MSR; ++i) { + u32 index = vmx_msr_index[i]; + u32 data_low, data_high; + u64 data; + int j = vcpu->nmsrs; + + if (rdmsr_safe(index, &data_low, &data_high) < 0) + continue; + data = data_low | ((u64)data_high << 32); + vcpu->host_msrs[j].index = index; + vcpu->host_msrs[j].reserved = 0; + vcpu->host_msrs[j].data = data; + vcpu->guest_msrs[j] = vcpu->host_msrs[j]; + ++vcpu->nmsrs; + } + printk(KERN_DEBUG "kvm: msrs: %d\n", vcpu->nmsrs); + + nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS; + vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, + virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS)); + vmcs_writel(VM_EXIT_MSR_STORE_ADDR, + virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS)); + vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, + virt_to_phys(vcpu->host_msrs + NR_BAD_MSRS)); + vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS, + (HOST_IS_64 << 9)); /* 22.2,1, 20.7.1 */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */ + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ + + + /* 22.2.1, 20.8.1 */ + vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR, + VM_ENTRY_CONTROLS, 0); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ + + vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0); + vmcs_writel(TPR_THRESHOLD, 0); + + vmcs_writel(CR0_GUEST_HOST_MASK, KVM_GUEST_CR0_MASK); + vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); + + vcpu->cr0 = 0x60000010; + vmx_set_cr0(vcpu, vcpu->cr0); // enter rmode + vmx_set_cr4(vcpu, 0); +#ifdef __x86_64__ + vmx_set_efer(vcpu, 0); +#endif + + return 0; + +out_free_guest_msrs: + kfree(vcpu->guest_msrs); +out: + return ret; +} + +static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq) +{ + u16 ent[2]; + u16 cs; + u16 ip; + unsigned long flags; + unsigned long ss_base = vmcs_readl(GUEST_SS_BASE); + u16 sp = vmcs_readl(GUEST_RSP); + u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT); + + if (sp > ss_limit || sp - 6 > sp) { + vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n", + __FUNCTION__, + vmcs_readl(GUEST_RSP), + vmcs_readl(GUEST_SS_BASE), + vmcs_read32(GUEST_SS_LIMIT)); + return; + } + + if (kvm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) != + sizeof(ent)) { + vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__); + return; + } + + flags = vmcs_readl(GUEST_RFLAGS); + cs = vmcs_readl(GUEST_CS_BASE) >> 4; + ip = vmcs_readl(GUEST_RIP); + + + if (kvm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 || + kvm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 || + kvm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) { + vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__); + return; + } + + vmcs_writel(GUEST_RFLAGS, flags & + ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF)); + vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ; + vmcs_writel(GUEST_CS_BASE, ent[1] << 4); + vmcs_writel(GUEST_RIP, ent[0]); + vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6)); +} + +static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) +{ + int word_index = __ffs(vcpu->irq_summary); + int bit_index = __ffs(vcpu->irq_pending[word_index]); + int irq = word_index * BITS_PER_LONG + bit_index; + + clear_bit(bit_index, &vcpu->irq_pending[word_index]); + if (!vcpu->irq_pending[word_index]) + clear_bit(word_index, &vcpu->irq_summary); + + if (vcpu->rmode.active) { + inject_rmode_irq(vcpu, irq); + return; + } + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); +} + +static void kvm_try_inject_irq(struct kvm_vcpu *vcpu) +{ + if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) + && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0) + /* + * Interrupts enabled, and not blocked by sti or mov ss. Good. + */ + kvm_do_inject_irq(vcpu); + else + /* + * Interrupts blocked. Wait for unblock. + */ + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, + vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) + | CPU_BASED_VIRTUAL_INTR_PENDING); +} + +static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) +{ + struct kvm_guest_debug *dbg = &vcpu->guest_debug; + + set_debugreg(dbg->bp[0], 0); + set_debugreg(dbg->bp[1], 1); + set_debugreg(dbg->bp[2], 2); + set_debugreg(dbg->bp[3], 3); + + if (dbg->singlestep) { + unsigned long flags; + + flags = vmcs_readl(GUEST_RFLAGS); + flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; + vmcs_writel(GUEST_RFLAGS, flags); + } +} + +static int handle_rmode_exception(struct kvm_vcpu *vcpu, + int vec, u32 err_code) +{ + if (!vcpu->rmode.active) + return 0; + + if (vec == GP_VECTOR && err_code == 0) + if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE) + return 1; + return 0; +} + +static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 intr_info, error_code; + unsigned long cr2, rip; + u32 vect_info; + enum emulation_result er; + + vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); + intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + if ((vect_info & VECTORING_INFO_VALID_MASK) && + !is_page_fault(intr_info)) { + printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " + "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); + } + + if (is_external_interrupt(vect_info)) { + int irq = vect_info & VECTORING_INFO_VECTOR_MASK; + set_bit(irq, vcpu->irq_pending); + set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); + } + + if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */ + asm ("int $2"); + return 1; + } + error_code = 0; + rip = vmcs_readl(GUEST_RIP); + if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) + error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + if (is_page_fault(intr_info)) { + cr2 = vmcs_readl(EXIT_QUALIFICATION); + + spin_lock(&vcpu->kvm->lock); + if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) { + spin_unlock(&vcpu->kvm->lock); + return 1; + } + + er = emulate_instruction(vcpu, kvm_run, cr2, error_code); + spin_unlock(&vcpu->kvm->lock); + + switch (er) { + case EMULATE_DONE: + return 1; + case EMULATE_DO_MMIO: + ++kvm_stat.mmio_exits; + kvm_run->exit_reason = KVM_EXIT_MMIO; + return 0; + case EMULATE_FAIL: + vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__); + break; + default: + BUG(); + } + } + + if (vcpu->rmode.active && + handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, + error_code)) + return 1; + + if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) { + kvm_run->exit_reason = KVM_EXIT_DEBUG; + return 0; + } + kvm_run->exit_reason = KVM_EXIT_EXCEPTION; + kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; + kvm_run->ex.error_code = error_code; + return 0; +} + +static int handle_external_interrupt(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + ++kvm_stat.irq_exits; + return 1; +} + + +static int get_io_count(struct kvm_vcpu *vcpu, u64 *count) +{ + u64 inst; + gva_t rip; + int countr_size; + int i, n; + + if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) { + countr_size = 2; + } else { + u32 cs_ar = vmcs_read32(GUEST_CS_AR_BYTES); + + countr_size = (cs_ar & AR_L_MASK) ? 8: + (cs_ar & AR_DB_MASK) ? 4: 2; + } + + rip = vmcs_readl(GUEST_RIP); + if (countr_size != 8) + rip += vmcs_readl(GUEST_CS_BASE); + + n = kvm_read_guest(vcpu, rip, sizeof(inst), &inst); + + for (i = 0; i < n; i++) { + switch (((u8*)&inst)[i]) { + case 0xf0: + case 0xf2: + case 0xf3: + case 0x2e: + case 0x36: + case 0x3e: + case 0x26: + case 0x64: + case 0x65: + case 0x66: + break; + case 0x67: + countr_size = (countr_size == 2) ? 4: (countr_size >> 1); + default: + goto done; + } + } + return 0; +done: + countr_size *= 8; + *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size)); + return 1; +} + +static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 exit_qualification; + + ++kvm_stat.io_exits; + exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + kvm_run->exit_reason = KVM_EXIT_IO; + if (exit_qualification & 8) + kvm_run->io.direction = KVM_EXIT_IO_IN; + else + kvm_run->io.direction = KVM_EXIT_IO_OUT; + kvm_run->io.size = (exit_qualification & 7) + 1; + kvm_run->io.string = (exit_qualification & 16) != 0; + kvm_run->io.string_down + = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; + kvm_run->io.rep = (exit_qualification & 32) != 0; + kvm_run->io.port = exit_qualification >> 16; + if (kvm_run->io.string) { + if (!get_io_count(vcpu, &kvm_run->io.count)) + return 1; + kvm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS); + } else + kvm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */ + return 0; +} + +static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 address = vmcs_read64(EXIT_QUALIFICATION); + int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + spin_lock(&vcpu->kvm->lock); + vcpu->mmu.inval_page(vcpu, address); + spin_unlock(&vcpu->kvm->lock); + vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length); + return 1; +} + +static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 exit_qualification; + int cr; + int reg; + + exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + cr = exit_qualification & 15; + reg = (exit_qualification >> 8) & 15; + switch ((exit_qualification >> 4) & 3) { + case 0: /* mov to cr */ + switch (cr) { + case 0: + vcpu_load_rsp_rip(vcpu); + set_cr0(vcpu, vcpu->regs[reg]); + skip_emulated_instruction(vcpu); + return 1; + case 3: + vcpu_load_rsp_rip(vcpu); + set_cr3(vcpu, vcpu->regs[reg]); + skip_emulated_instruction(vcpu); + return 1; + case 4: + vcpu_load_rsp_rip(vcpu); + set_cr4(vcpu, vcpu->regs[reg]); + skip_emulated_instruction(vcpu); + return 1; + case 8: + vcpu_load_rsp_rip(vcpu); + set_cr8(vcpu, vcpu->regs[reg]); + skip_emulated_instruction(vcpu); + return 1; + }; + break; + case 1: /*mov from cr*/ + switch (cr) { + case 3: + vcpu_load_rsp_rip(vcpu); + vcpu->regs[reg] = vcpu->cr3; + vcpu_put_rsp_rip(vcpu); + skip_emulated_instruction(vcpu); + return 1; + case 8: + printk(KERN_DEBUG "handle_cr: read CR8 " + "cpu erratum AA15\n"); + vcpu_load_rsp_rip(vcpu); + vcpu->regs[reg] = vcpu->cr8; + vcpu_put_rsp_rip(vcpu); + skip_emulated_instruction(vcpu); + return 1; + } + break; + case 3: /* lmsw */ + lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); + + skip_emulated_instruction(vcpu); + return 1; + default: + break; + } + kvm_run->exit_reason = 0; + printk(KERN_ERR "kvm: unhandled control register: op %d cr %d\n", + (int)(exit_qualification >> 4) & 3, cr); + return 0; +} + +static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 exit_qualification; + unsigned long val; + int dr, reg; + + /* + * FIXME: this code assumes the host is debugging the guest. + * need to deal with guest debugging itself too. + */ + exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + dr = exit_qualification & 7; + reg = (exit_qualification >> 8) & 15; + vcpu_load_rsp_rip(vcpu); + if (exit_qualification & 16) { + /* mov from dr */ + switch (dr) { + case 6: + val = 0xffff0ff0; + break; + case 7: + val = 0x400; + break; + default: + val = 0; + } + vcpu->regs[reg] = val; + } else { + /* mov to dr */ + } + vcpu_put_rsp_rip(vcpu); + skip_emulated_instruction(vcpu); + return 1; +} + +static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + kvm_run->exit_reason = KVM_EXIT_CPUID; + return 0; +} + +static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 ecx = vcpu->regs[VCPU_REGS_RCX]; + u64 data; + + if (vmx_get_msr(vcpu, ecx, &data)) { + vmx_inject_gp(vcpu, 0); + return 1; + } + + /* FIXME: handling of bits 32:63 of rax, rdx */ + vcpu->regs[VCPU_REGS_RAX] = data & -1u; + vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u; + skip_emulated_instruction(vcpu); + return 1; +} + +static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 ecx = vcpu->regs[VCPU_REGS_RCX]; + u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u) + | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32); + + if (vmx_set_msr(vcpu, ecx, data) != 0) { + vmx_inject_gp(vcpu, 0); + return 1; + } + + skip_emulated_instruction(vcpu); + return 1; +} + +static int handle_interrupt_window(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + /* Turn off interrupt window reporting. */ + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, + vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) + & ~CPU_BASED_VIRTUAL_INTR_PENDING); + return 1; +} + +static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + skip_emulated_instruction(vcpu); + if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)) + return 1; + + kvm_run->exit_reason = KVM_EXIT_HLT; + return 0; +} + +/* + * The exit handlers return 1 if the exit was handled fully and guest execution + * may resume. Otherwise they set the kvm_run parameter to indicate what needs + * to be done to userspace and return 0. + */ +static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) = { + [EXIT_REASON_EXCEPTION_NMI] = handle_exception, + [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, + [EXIT_REASON_IO_INSTRUCTION] = handle_io, + [EXIT_REASON_INVLPG] = handle_invlpg, + [EXIT_REASON_CR_ACCESS] = handle_cr, + [EXIT_REASON_DR_ACCESS] = handle_dr, + [EXIT_REASON_CPUID] = handle_cpuid, + [EXIT_REASON_MSR_READ] = handle_rdmsr, + [EXIT_REASON_MSR_WRITE] = handle_wrmsr, + [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, + [EXIT_REASON_HLT] = handle_halt, +}; + +static const int kvm_vmx_max_exit_handlers = + sizeof(kvm_vmx_exit_handlers) / sizeof(*kvm_vmx_exit_handlers); + +/* + * The guest has exited. See if we can fix it or if we need userspace + * assistance. + */ +static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ + u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); + u32 exit_reason = vmcs_read32(VM_EXIT_REASON); + + if ( (vectoring_info & VECTORING_INFO_VALID_MASK) && + exit_reason != EXIT_REASON_EXCEPTION_NMI ) + printk(KERN_WARNING "%s: unexpected, valid vectoring info and " + "exit reason is 0x%x\n", __FUNCTION__, exit_reason); + kvm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + if (exit_reason < kvm_vmx_max_exit_handlers + && kvm_vmx_exit_handlers[exit_reason]) + return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); + else { + kvm_run->exit_reason = KVM_EXIT_UNKNOWN; + kvm_run->hw.hardware_exit_reason = exit_reason; + } + return 0; +} + +static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u8 fail; + u16 fs_sel, gs_sel, ldt_sel; + int fs_gs_ldt_reload_needed; + +again: + /* + * Set host fs and gs selectors. Unfortunately, 22.2.3 does not + * allow segment selectors with cpl > 0 or ti == 1. + */ + fs_sel = read_fs(); + gs_sel = read_gs(); + ldt_sel = read_ldt(); + fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel; + if (!fs_gs_ldt_reload_needed) { + vmcs_write16(HOST_FS_SELECTOR, fs_sel); + vmcs_write16(HOST_GS_SELECTOR, gs_sel); + } else { + vmcs_write16(HOST_FS_SELECTOR, 0); + vmcs_write16(HOST_GS_SELECTOR, 0); + } + +#ifdef __x86_64__ + vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); + vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); +#else + vmcs_writel(HOST_FS_BASE, segment_base(fs_sel)); + vmcs_writel(HOST_GS_BASE, segment_base(gs_sel)); +#endif + + if (vcpu->irq_summary && + !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) + kvm_try_inject_irq(vcpu); + + if (vcpu->guest_debug.enabled) + kvm_guest_debug_pre(vcpu); + + fx_save(vcpu->host_fx_image); + fx_restore(vcpu->guest_fx_image); + + save_msrs(vcpu->host_msrs, vcpu->nmsrs); + load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + + asm ( + /* Store host registers */ + "pushf \n\t" +#ifdef __x86_64__ + "push %%rax; push %%rbx; push %%rdx;" + "push %%rsi; push %%rdi; push %%rbp;" + "push %%r8; push %%r9; push %%r10; push %%r11;" + "push %%r12; push %%r13; push %%r14; push %%r15;" + "push %%rcx \n\t" + ASM_VMX_VMWRITE_RSP_RDX "\n\t" +#else + "pusha; push %%ecx \n\t" + ASM_VMX_VMWRITE_RSP_RDX "\n\t" +#endif + /* Check if vmlaunch of vmresume is needed */ + "cmp $0, %1 \n\t" + /* Load guest registers. Don't clobber flags. */ +#ifdef __x86_64__ + "mov %c[cr2](%3), %%rax \n\t" + "mov %%rax, %%cr2 \n\t" + "mov %c[rax](%3), %%rax \n\t" + "mov %c[rbx](%3), %%rbx \n\t" + "mov %c[rdx](%3), %%rdx \n\t" + "mov %c[rsi](%3), %%rsi \n\t" + "mov %c[rdi](%3), %%rdi \n\t" + "mov %c[rbp](%3), %%rbp \n\t" + "mov %c[r8](%3), %%r8 \n\t" + "mov %c[r9](%3), %%r9 \n\t" + "mov %c[r10](%3), %%r10 \n\t" + "mov %c[r11](%3), %%r11 \n\t" + "mov %c[r12](%3), %%r12 \n\t" + "mov %c[r13](%3), %%r13 \n\t" + "mov %c[r14](%3), %%r14 \n\t" + "mov %c[r15](%3), %%r15 \n\t" + "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */ +#else + "mov %c[cr2](%3), %%eax \n\t" + "mov %%eax, %%cr2 \n\t" + "mov %c[rax](%3), %%eax \n\t" + "mov %c[rbx](%3), %%ebx \n\t" + "mov %c[rdx](%3), %%edx \n\t" + "mov %c[rsi](%3), %%esi \n\t" + "mov %c[rdi](%3), %%edi \n\t" + "mov %c[rbp](%3), %%ebp \n\t" + "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */ +#endif + /* Enter guest mode */ + "jne launched \n\t" + ASM_VMX_VMLAUNCH "\n\t" + "jmp kvm_vmx_return \n\t" + "launched: " ASM_VMX_VMRESUME "\n\t" + ".globl kvm_vmx_return \n\t" + "kvm_vmx_return: " + /* Save guest registers, load host registers, keep flags */ +#ifdef __x86_64__ + "xchg %3, 0(%%rsp) \n\t" + "mov %%rax, %c[rax](%3) \n\t" + "mov %%rbx, %c[rbx](%3) \n\t" + "pushq 0(%%rsp); popq %c[rcx](%3) \n\t" + "mov %%rdx, %c[rdx](%3) \n\t" + "mov %%rsi, %c[rsi](%3) \n\t" + "mov %%rdi, %c[rdi](%3) \n\t" + "mov %%rbp, %c[rbp](%3) \n\t" + "mov %%r8, %c[r8](%3) \n\t" + "mov %%r9, %c[r9](%3) \n\t" + "mov %%r10, %c[r10](%3) \n\t" + "mov %%r11, %c[r11](%3) \n\t" + "mov %%r12, %c[r12](%3) \n\t" + "mov %%r13, %c[r13](%3) \n\t" + "mov %%r14, %c[r14](%3) \n\t" + "mov %%r15, %c[r15](%3) \n\t" + "mov %%cr2, %%rax \n\t" + "mov %%rax, %c[cr2](%3) \n\t" + "mov 0(%%rsp), %3 \n\t" + + "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;" + "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" + "pop %%rbp; pop %%rdi; pop %%rsi;" + "pop %%rdx; pop %%rbx; pop %%rax \n\t" +#else + "xchg %3, 0(%%esp) \n\t" + "mov %%eax, %c[rax](%3) \n\t" + "mov %%ebx, %c[rbx](%3) \n\t" + "pushl 0(%%esp); popl %c[rcx](%3) \n\t" + "mov %%edx, %c[rdx](%3) \n\t" + "mov %%esi, %c[rsi](%3) \n\t" + "mov %%edi, %c[rdi](%3) \n\t" + "mov %%ebp, %c[rbp](%3) \n\t" + "mov %%cr2, %%eax \n\t" + "mov %%eax, %c[cr2](%3) \n\t" + "mov 0(%%esp), %3 \n\t" + + "pop %%ecx; popa \n\t" +#endif + "setbe %0 \n\t" + "popf \n\t" + : "=g" (fail) + : "r"(vcpu->launched), "d"((unsigned long)HOST_RSP), + "c"(vcpu), + [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])), + [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])), + [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])), + [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])), + [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])), + [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])), + [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])), +#ifdef __x86_64__ + [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])), + [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])), + [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])), + [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])), + [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])), + [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])), + [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])), + [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])), +#endif + [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) + : "cc", "memory" ); + + ++kvm_stat.exits; + + save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + load_msrs(vcpu->host_msrs, NR_BAD_MSRS); + + fx_save(vcpu->guest_fx_image); + fx_restore(vcpu->host_fx_image); + +#ifndef __x86_64__ + asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); +#endif + + kvm_run->exit_type = 0; + if (fail) { + kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; + kvm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR); + } else { + if (fs_gs_ldt_reload_needed) { + load_ldt(ldt_sel); + load_fs(fs_sel); + /* + * If we have to reload gs, we must take care to + * preserve our gs base. + */ + local_irq_disable(); + load_gs(gs_sel); +#ifdef __x86_64__ + wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); +#endif + local_irq_enable(); + + reload_tss(); + } + vcpu->launched = 1; + kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT; + if (kvm_handle_exit(kvm_run, vcpu)) { + /* Give scheduler a change to reschedule. */ + if (signal_pending(current)) { + ++kvm_stat.signal_exits; + return -EINTR; + } + kvm_resched(vcpu); + goto again; + } + } + return 0; +} + +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +{ + vmcs_writel(GUEST_CR3, vmcs_readl(GUEST_CR3)); +} + +static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, + unsigned long addr, + u32 err_code) +{ + u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); + + ++kvm_stat.pf_guest; + + if (is_page_fault(vect_info)) { + printk(KERN_DEBUG "inject_page_fault: " + "double fault 0x%lx @ 0x%lx\n", + addr, vmcs_readl(GUEST_RIP)); + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + DF_VECTOR | + INTR_TYPE_EXCEPTION | + INTR_INFO_DELIEVER_CODE_MASK | + INTR_INFO_VALID_MASK); + return; + } + vcpu->cr2 = addr; + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + PF_VECTOR | + INTR_TYPE_EXCEPTION | + INTR_INFO_DELIEVER_CODE_MASK | + INTR_INFO_VALID_MASK); + +} + +static void vmx_free_vmcs(struct kvm_vcpu *vcpu) +{ + if (vcpu->vmcs) { + on_each_cpu(__vcpu_clear, vcpu, 0, 1); + free_vmcs(vcpu->vmcs); + vcpu->vmcs = NULL; + } +} + +static void vmx_free_vcpu(struct kvm_vcpu *vcpu) +{ + vmx_free_vmcs(vcpu); +} + +static int vmx_create_vcpu(struct kvm_vcpu *vcpu) +{ + struct vmcs *vmcs; + + vmcs = alloc_vmcs(); + if (!vmcs) + return -ENOMEM; + vmcs_clear(vmcs); + vcpu->vmcs = vmcs; + vcpu->launched = 0; + return 0; +} + +static struct kvm_arch_ops vmx_arch_ops = { + .cpu_has_kvm_support = cpu_has_kvm_support, + .disabled_by_bios = vmx_disabled_by_bios, + .hardware_setup = hardware_setup, + .hardware_unsetup = hardware_unsetup, + .hardware_enable = hardware_enable, + .hardware_disable = hardware_disable, + + .vcpu_create = vmx_create_vcpu, + .vcpu_free = vmx_free_vcpu, + + .vcpu_load = vmx_vcpu_load, + .vcpu_put = vmx_vcpu_put, + + .set_guest_debug = set_guest_debug, + .get_msr = vmx_get_msr, + .set_msr = vmx_set_msr, + .get_segment_base = vmx_get_segment_base, + .get_segment = vmx_get_segment, + .set_segment = vmx_set_segment, + .is_long_mode = vmx_is_long_mode, + .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .set_cr0 = vmx_set_cr0, + .set_cr0_no_modeswitch = vmx_set_cr0_no_modeswitch, + .set_cr3 = vmx_set_cr3, + .set_cr4 = vmx_set_cr4, +#ifdef __x86_64__ + .set_efer = vmx_set_efer, +#endif + .get_idt = vmx_get_idt, + .set_idt = vmx_set_idt, + .get_gdt = vmx_get_gdt, + .set_gdt = vmx_set_gdt, + .cache_regs = vcpu_load_rsp_rip, + .decache_regs = vcpu_put_rsp_rip, + .get_rflags = vmx_get_rflags, + .set_rflags = vmx_set_rflags, + + .tlb_flush = vmx_flush_tlb, + .inject_page_fault = vmx_inject_page_fault, + + .inject_gp = vmx_inject_gp, + + .run = vmx_vcpu_run, + .skip_emulated_instruction = skip_emulated_instruction, + .vcpu_setup = vmx_vcpu_setup, +}; + +static int __init vmx_init(void) +{ + kvm_init_arch(&vmx_arch_ops, THIS_MODULE); + return 0; +} + +static void __exit vmx_exit(void) +{ + kvm_exit_arch(); +} + +module_init(vmx_init) +module_exit(vmx_exit) diff --git a/drivers/kvm/vmx.h b/drivers/kvm/vmx.h new file mode 100644 index 000000000000..797278341581 --- /dev/null +++ b/drivers/kvm/vmx.h @@ -0,0 +1,296 @@ +#ifndef VMX_H +#define VMX_H + +/* + * vmx.h: VMX Architecture related definitions + * Copyright (c) 2004, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * A few random additions are: + * Copyright (C) 2006 Qumranet + * Avi Kivity + * Yaniv Kamay + * + */ + +#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 +#define CPU_BASED_USE_TSC_OFFSETING 0x00000008 +#define CPU_BASED_HLT_EXITING 0x00000080 +#define CPU_BASED_INVDPG_EXITING 0x00000200 +#define CPU_BASED_MWAIT_EXITING 0x00000400 +#define CPU_BASED_RDPMC_EXITING 0x00000800 +#define CPU_BASED_RDTSC_EXITING 0x00001000 +#define CPU_BASED_CR8_LOAD_EXITING 0x00080000 +#define CPU_BASED_CR8_STORE_EXITING 0x00100000 +#define CPU_BASED_TPR_SHADOW 0x00200000 +#define CPU_BASED_MOV_DR_EXITING 0x00800000 +#define CPU_BASED_UNCOND_IO_EXITING 0x01000000 +#define CPU_BASED_ACTIVATE_IO_BITMAP 0x02000000 +#define CPU_BASED_MSR_BITMAPS 0x10000000 +#define CPU_BASED_MONITOR_EXITING 0x20000000 +#define CPU_BASED_PAUSE_EXITING 0x40000000 + +#define PIN_BASED_EXT_INTR_MASK 0x1 +#define PIN_BASED_NMI_EXITING 0x8 + +#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 +#define VM_EXIT_HOST_ADD_SPACE_SIZE 0x00000200 + + +/* VMCS Encodings */ +enum vmcs_field { + GUEST_ES_SELECTOR = 0x00000800, + GUEST_CS_SELECTOR = 0x00000802, + GUEST_SS_SELECTOR = 0x00000804, + GUEST_DS_SELECTOR = 0x00000806, + GUEST_FS_SELECTOR = 0x00000808, + GUEST_GS_SELECTOR = 0x0000080a, + GUEST_LDTR_SELECTOR = 0x0000080c, + GUEST_TR_SELECTOR = 0x0000080e, + HOST_ES_SELECTOR = 0x00000c00, + HOST_CS_SELECTOR = 0x00000c02, + HOST_SS_SELECTOR = 0x00000c04, + HOST_DS_SELECTOR = 0x00000c06, + HOST_FS_SELECTOR = 0x00000c08, + HOST_GS_SELECTOR = 0x00000c0a, + HOST_TR_SELECTOR = 0x00000c0c, + IO_BITMAP_A = 0x00002000, + IO_BITMAP_A_HIGH = 0x00002001, + IO_BITMAP_B = 0x00002002, + IO_BITMAP_B_HIGH = 0x00002003, + MSR_BITMAP = 0x00002004, + MSR_BITMAP_HIGH = 0x00002005, + VM_EXIT_MSR_STORE_ADDR = 0x00002006, + VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007, + VM_EXIT_MSR_LOAD_ADDR = 0x00002008, + VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009, + VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, + VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b, + TSC_OFFSET = 0x00002010, + TSC_OFFSET_HIGH = 0x00002011, + VIRTUAL_APIC_PAGE_ADDR = 0x00002012, + VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, + VMCS_LINK_POINTER = 0x00002800, + VMCS_LINK_POINTER_HIGH = 0x00002801, + GUEST_IA32_DEBUGCTL = 0x00002802, + GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, + PIN_BASED_VM_EXEC_CONTROL = 0x00004000, + CPU_BASED_VM_EXEC_CONTROL = 0x00004002, + EXCEPTION_BITMAP = 0x00004004, + PAGE_FAULT_ERROR_CODE_MASK = 0x00004006, + PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008, + CR3_TARGET_COUNT = 0x0000400a, + VM_EXIT_CONTROLS = 0x0000400c, + VM_EXIT_MSR_STORE_COUNT = 0x0000400e, + VM_EXIT_MSR_LOAD_COUNT = 0x00004010, + VM_ENTRY_CONTROLS = 0x00004012, + VM_ENTRY_MSR_LOAD_COUNT = 0x00004014, + VM_ENTRY_INTR_INFO_FIELD = 0x00004016, + VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018, + VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, + TPR_THRESHOLD = 0x0000401c, + SECONDARY_VM_EXEC_CONTROL = 0x0000401e, + VM_INSTRUCTION_ERROR = 0x00004400, + VM_EXIT_REASON = 0x00004402, + VM_EXIT_INTR_INFO = 0x00004404, + VM_EXIT_INTR_ERROR_CODE = 0x00004406, + IDT_VECTORING_INFO_FIELD = 0x00004408, + IDT_VECTORING_ERROR_CODE = 0x0000440a, + VM_EXIT_INSTRUCTION_LEN = 0x0000440c, + VMX_INSTRUCTION_INFO = 0x0000440e, + GUEST_ES_LIMIT = 0x00004800, + GUEST_CS_LIMIT = 0x00004802, + GUEST_SS_LIMIT = 0x00004804, + GUEST_DS_LIMIT = 0x00004806, + GUEST_FS_LIMIT = 0x00004808, + GUEST_GS_LIMIT = 0x0000480a, + GUEST_LDTR_LIMIT = 0x0000480c, + GUEST_TR_LIMIT = 0x0000480e, + GUEST_GDTR_LIMIT = 0x00004810, + GUEST_IDTR_LIMIT = 0x00004812, + GUEST_ES_AR_BYTES = 0x00004814, + GUEST_CS_AR_BYTES = 0x00004816, + GUEST_SS_AR_BYTES = 0x00004818, + GUEST_DS_AR_BYTES = 0x0000481a, + GUEST_FS_AR_BYTES = 0x0000481c, + GUEST_GS_AR_BYTES = 0x0000481e, + GUEST_LDTR_AR_BYTES = 0x00004820, + GUEST_TR_AR_BYTES = 0x00004822, + GUEST_INTERRUPTIBILITY_INFO = 0x00004824, + GUEST_ACTIVITY_STATE = 0X00004826, + GUEST_SYSENTER_CS = 0x0000482A, + HOST_IA32_SYSENTER_CS = 0x00004c00, + CR0_GUEST_HOST_MASK = 0x00006000, + CR4_GUEST_HOST_MASK = 0x00006002, + CR0_READ_SHADOW = 0x00006004, + CR4_READ_SHADOW = 0x00006006, + CR3_TARGET_VALUE0 = 0x00006008, + CR3_TARGET_VALUE1 = 0x0000600a, + CR3_TARGET_VALUE2 = 0x0000600c, + CR3_TARGET_VALUE3 = 0x0000600e, + EXIT_QUALIFICATION = 0x00006400, + GUEST_LINEAR_ADDRESS = 0x0000640a, + GUEST_CR0 = 0x00006800, + GUEST_CR3 = 0x00006802, + GUEST_CR4 = 0x00006804, + GUEST_ES_BASE = 0x00006806, + GUEST_CS_BASE = 0x00006808, + GUEST_SS_BASE = 0x0000680a, + GUEST_DS_BASE = 0x0000680c, + GUEST_FS_BASE = 0x0000680e, + GUEST_GS_BASE = 0x00006810, + GUEST_LDTR_BASE = 0x00006812, + GUEST_TR_BASE = 0x00006814, + GUEST_GDTR_BASE = 0x00006816, + GUEST_IDTR_BASE = 0x00006818, + GUEST_DR7 = 0x0000681a, + GUEST_RSP = 0x0000681c, + GUEST_RIP = 0x0000681e, + GUEST_RFLAGS = 0x00006820, + GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822, + GUEST_SYSENTER_ESP = 0x00006824, + GUEST_SYSENTER_EIP = 0x00006826, + HOST_CR0 = 0x00006c00, + HOST_CR3 = 0x00006c02, + HOST_CR4 = 0x00006c04, + HOST_FS_BASE = 0x00006c06, + HOST_GS_BASE = 0x00006c08, + HOST_TR_BASE = 0x00006c0a, + HOST_GDTR_BASE = 0x00006c0c, + HOST_IDTR_BASE = 0x00006c0e, + HOST_IA32_SYSENTER_ESP = 0x00006c10, + HOST_IA32_SYSENTER_EIP = 0x00006c12, + HOST_RSP = 0x00006c14, + HOST_RIP = 0x00006c16, +}; + +#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 + +#define EXIT_REASON_EXCEPTION_NMI 0 +#define EXIT_REASON_EXTERNAL_INTERRUPT 1 + +#define EXIT_REASON_PENDING_INTERRUPT 7 + +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define EXIT_REASON_VMOFF 26 +#define EXIT_REASON_VMON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_IO_INSTRUCTION 30 +#define EXIT_REASON_MSR_READ 31 +#define EXIT_REASON_MSR_WRITE 32 +#define EXIT_REASON_MWAIT_INSTRUCTION 36 + +/* + * Interruption-information format + */ +#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ +#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ +#define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */ +#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ + +#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK +#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK +#define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK +#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK + +#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ +#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ + +/* + * Exit Qualifications for MOV for Control Register Access + */ +#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */ +#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */ +#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose register */ +#define LMSW_SOURCE_DATA_SHIFT 16 +#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */ +#define REG_EAX (0 << 8) +#define REG_ECX (1 << 8) +#define REG_EDX (2 << 8) +#define REG_EBX (3 << 8) +#define REG_ESP (4 << 8) +#define REG_EBP (5 << 8) +#define REG_ESI (6 << 8) +#define REG_EDI (7 << 8) +#define REG_R8 (8 << 8) +#define REG_R9 (9 << 8) +#define REG_R10 (10 << 8) +#define REG_R11 (11 << 8) +#define REG_R12 (12 << 8) +#define REG_R13 (13 << 8) +#define REG_R14 (14 << 8) +#define REG_R15 (15 << 8) + +/* + * Exit Qualifications for MOV for Debug Register Access + */ +#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */ +#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ +#define TYPE_MOV_TO_DR (0 << 4) +#define TYPE_MOV_FROM_DR (1 << 4) +#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose register */ + + +/* segment AR */ +#define SEGMENT_AR_L_MASK (1 << 13) + +/* entry controls */ +#define VM_ENTRY_CONTROLS_IA32E_MASK (1 << 9) + +#define AR_TYPE_ACCESSES_MASK 1 +#define AR_TYPE_READABLE_MASK (1 << 1) +#define AR_TYPE_WRITEABLE_MASK (1 << 2) +#define AR_TYPE_CODE_MASK (1 << 3) +#define AR_TYPE_MASK 0x0f +#define AR_TYPE_BUSY_64_TSS 11 +#define AR_TYPE_BUSY_32_TSS 11 +#define AR_TYPE_BUSY_16_TSS 3 +#define AR_TYPE_LDT 2 + +#define AR_UNUSABLE_MASK (1 << 16) +#define AR_S_MASK (1 << 4) +#define AR_P_MASK (1 << 7) +#define AR_L_MASK (1 << 13) +#define AR_DB_MASK (1 << 14) +#define AR_G_MASK (1 << 15) +#define AR_DPL_SHIFT 5 +#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3) + +#define AR_RESERVD_MASK 0xfffe0f00 + +#define CR4_VMXE 0x2000 + +#define MSR_IA32_VMX_BASIC_MSR 0x480 +#define MSR_IA32_FEATURE_CONTROL 0x03a +#define MSR_IA32_VMX_PINBASED_CTLS_MSR 0x481 +#define MSR_IA32_VMX_PROCBASED_CTLS_MSR 0x482 +#define MSR_IA32_VMX_EXIT_CTLS_MSR 0x483 +#define MSR_IA32_VMX_ENTRY_CTLS_MSR 0x484 + +#endif diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c new file mode 100644 index 000000000000..7e838bf0592d --- /dev/null +++ b/drivers/kvm/x86_emulate.c @@ -0,0 +1,1409 @@ +/****************************************************************************** + * x86_emulate.c + * + * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. + * + * Copyright (c) 2005 Keir Fraser + * + * Linux coding style, mod r/m decoder, segment base fixes, real-mode + * privieged instructions: + * + * Copyright (C) 2006 Qumranet + * + * Avi Kivity + * Yaniv Kamay + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 + */ + +#ifndef __KERNEL__ +#include +#include +#include +#define DPRINTF(_f, _a ...) printf( _f , ## _a ) +#else +#include "kvm.h" +#define DPRINTF(x...) do {} while (0) +#endif +#include "x86_emulate.h" +#include + +/* + * Opcode effective-address decode tables. + * Note that we only emulate instructions that have at least one memory + * operand (excluding implicit stack references). We assume that stack + * references and instruction fetches will never occur in special memory + * areas that require emulation. So, for example, 'mov ,' need + * not be handled. + */ + +/* Operand sizes: 8-bit operands or specified/overridden size. */ +#define ByteOp (1<<0) /* 8-bit operands. */ +/* Destination operand type. */ +#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ +#define DstReg (2<<1) /* Register operand. */ +#define DstMem (3<<1) /* Memory operand. */ +#define DstMask (3<<1) +/* Source operand type. */ +#define SrcNone (0<<3) /* No source operand. */ +#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ +#define SrcReg (1<<3) /* Register operand. */ +#define SrcMem (2<<3) /* Memory operand. */ +#define SrcMem16 (3<<3) /* Memory operand (16-bit). */ +#define SrcMem32 (4<<3) /* Memory operand (32-bit). */ +#define SrcImm (5<<3) /* Immediate operand. */ +#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ +#define SrcMask (7<<3) +/* Generic ModRM decode. */ +#define ModRM (1<<6) +/* Destination is only written; never read. */ +#define Mov (1<<7) + +static u8 opcode_table[256] = { + /* 0x00 - 0x07 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x08 - 0x0F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x10 - 0x17 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x18 - 0x1F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x20 - 0x27 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x28 - 0x2F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x30 - 0x37 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x38 - 0x3F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x40 - 0x4F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x50 - 0x5F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x60 - 0x6F */ + 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x70 - 0x7F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x80 - 0x87 */ + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + /* 0x88 - 0x8F */ + ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, + ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + 0, 0, 0, DstMem | SrcNone | ModRM | Mov, + /* 0x90 - 0x9F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xA0 - 0xA7 */ + ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov, + ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov, + ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, + ByteOp | ImplicitOps, ImplicitOps, + /* 0xA8 - 0xAF */ + 0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, + ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, + ByteOp | ImplicitOps, ImplicitOps, + /* 0xB0 - 0xBF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xC0 - 0xC7 */ + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 0, 0, + 0, 0, ByteOp | DstMem | SrcImm | ModRM | Mov, + DstMem | SrcImm | ModRM | Mov, + /* 0xC8 - 0xCF */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xD0 - 0xD7 */ + ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, + ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, + 0, 0, 0, 0, + /* 0xD8 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xEF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xF0 - 0xF7 */ + 0, 0, 0, 0, + 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + /* 0xF8 - 0xFF */ + 0, 0, 0, 0, + 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM +}; + +static u8 twobyte_table[256] = { + /* 0x00 - 0x0F */ + 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, + 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, + /* 0x10 - 0x1F */ + 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, + /* 0x20 - 0x2F */ + ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x30 - 0x3F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x40 - 0x47 */ + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + /* 0x48 - 0x4F */ + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + /* 0x50 - 0x5F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x60 - 0x6F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x70 - 0x7F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x80 - 0x8F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x90 - 0x9F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xA0 - 0xA7 */ + 0, 0, 0, DstMem | SrcReg | ModRM, 0, 0, 0, 0, + /* 0xA8 - 0xAF */ + 0, 0, 0, DstMem | SrcReg | ModRM, 0, 0, 0, 0, + /* 0xB0 - 0xB7 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, + DstMem | SrcReg | ModRM, + 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem16 | ModRM | Mov, + /* 0xB8 - 0xBF */ + 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM, + 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem16 | ModRM | Mov, + /* 0xC0 - 0xCF */ + 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xD0 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xEF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xF0 - 0xFF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* + * Tell the emulator that of the Group 7 instructions (sgdt, lidt, etc.) we + * are interested only in invlpg and not in any of the rest. + * + * invlpg is a special instruction in that the data it references may not + * be mapped. + */ +void kvm_emulator_want_group7_invlpg(void) +{ + twobyte_table[1] &= ~SrcMem; +} +EXPORT_SYMBOL_GPL(kvm_emulator_want_group7_invlpg); + +/* Type, address-of, and value of an instruction's operand. */ +struct operand { + enum { OP_REG, OP_MEM, OP_IMM } type; + unsigned int bytes; + unsigned long val, orig_val, *ptr; +}; + +/* EFLAGS bit definitions. */ +#define EFLG_OF (1<<11) +#define EFLG_DF (1<<10) +#define EFLG_SF (1<<7) +#define EFLG_ZF (1<<6) +#define EFLG_AF (1<<4) +#define EFLG_PF (1<<2) +#define EFLG_CF (1<<0) + +/* + * Instruction emulation: + * Most instructions are emulated directly via a fragment of inline assembly + * code. This allows us to save/restore EFLAGS and thus very easily pick up + * any modified flags. + */ + +#if defined(__x86_64__) +#define _LO32 "k" /* force 32-bit operand */ +#define _STK "%%rsp" /* stack pointer */ +#elif defined(__i386__) +#define _LO32 "" /* force 32-bit operand */ +#define _STK "%%esp" /* stack pointer */ +#endif + +/* + * These EFLAGS bits are restored from saved value during emulation, and + * any changes are written back to the saved value after emulation. + */ +#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) + +/* Before executing instruction: restore necessary bits in EFLAGS. */ +#define _PRE_EFLAGS(_sav, _msk, _tmp) \ + /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */ \ + "push %"_sav"; " \ + "movl %"_msk",%"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",("_STK"); " \ + "pushf; " \ + "notl %"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",("_STK"); " \ + "pop %"_tmp"; " \ + "orl %"_LO32 _tmp",("_STK"); " \ + "popf; " \ + /* _sav &= ~msk; */ \ + "movl %"_msk",%"_LO32 _tmp"; " \ + "notl %"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",%"_sav"; " + +/* After executing instruction: write-back necessary bits in EFLAGS. */ +#define _POST_EFLAGS(_sav, _msk, _tmp) \ + /* _sav |= EFLAGS & _msk; */ \ + "pushf; " \ + "pop %"_tmp"; " \ + "andl %"_msk",%"_LO32 _tmp"; " \ + "orl %"_LO32 _tmp",%"_sav"; " + +/* Raw emulation: instruction has two explicit operands. */ +#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ + do { \ + unsigned long _tmp; \ + \ + switch ((_dst).bytes) { \ + case 2: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","4","2") \ + _op"w %"_wx"3,%1; " \ + _POST_EFLAGS("0","4","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _wy ((_src).val), "i" (EFLAGS_MASK) ); \ + break; \ + case 4: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","4","2") \ + _op"l %"_lx"3,%1; " \ + _POST_EFLAGS("0","4","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _ly ((_src).val), "i" (EFLAGS_MASK) ); \ + break; \ + case 8: \ + __emulate_2op_8byte(_op, _src, _dst, \ + _eflags, _qx, _qy); \ + break; \ + } \ + } while (0) + +#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ + do { \ + unsigned long _tmp; \ + switch ( (_dst).bytes ) \ + { \ + case 1: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","4","2") \ + _op"b %"_bx"3,%1; " \ + _POST_EFLAGS("0","4","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _by ((_src).val), "i" (EFLAGS_MASK) ); \ + break; \ + default: \ + __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + _wx, _wy, _lx, _ly, _qx, _qy); \ + break; \ + } \ + } while (0) + +/* Source operand is byte-sized and may be restricted to just %cl. */ +#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ + __emulate_2op(_op, _src, _dst, _eflags, \ + "b", "c", "b", "c", "b", "c", "b", "c") + +/* Source operand is byte, word, long or quad sized. */ +#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ + __emulate_2op(_op, _src, _dst, _eflags, \ + "b", "q", "w", "r", _LO32, "r", "", "r") + +/* Source operand is word, long or quad sized. */ +#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ + __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + "w", "r", _LO32, "r", "", "r") + +/* Instruction has only one explicit operand (no source operand). */ +#define emulate_1op(_op, _dst, _eflags) \ + do { \ + unsigned long _tmp; \ + \ + switch ( (_dst).bytes ) \ + { \ + case 1: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","3","2") \ + _op"b %1; " \ + _POST_EFLAGS("0","3","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK) ); \ + break; \ + case 2: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","3","2") \ + _op"w %1; " \ + _POST_EFLAGS("0","3","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK) ); \ + break; \ + case 4: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","3","2") \ + _op"l %1; " \ + _POST_EFLAGS("0","3","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK) ); \ + break; \ + case 8: \ + __emulate_1op_8byte(_op, _dst, _eflags); \ + break; \ + } \ + } while (0) + +/* Emulate an instruction with quadword operands (x86/64 only). */ +#if defined(__x86_64__) +#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ + do { \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","4","2") \ + _op"q %"_qx"3,%1; " \ + _POST_EFLAGS("0","4","2") \ + : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ + : _qy ((_src).val), "i" (EFLAGS_MASK) ); \ + } while (0) + +#define __emulate_1op_8byte(_op, _dst, _eflags) \ + do { \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","3","2") \ + _op"q %1; " \ + _POST_EFLAGS("0","3","2") \ + : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ + : "i" (EFLAGS_MASK) ); \ + } while (0) + +#elif defined(__i386__) +#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) +#define __emulate_1op_8byte(_op, _dst, _eflags) +#endif /* __i386__ */ + +/* Fetch next part of the instruction being emulated. */ +#define insn_fetch(_type, _size, _eip) \ +({ unsigned long _x; \ + rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \ + (_size), ctxt); \ + if ( rc != 0 ) \ + goto done; \ + (_eip) += (_size); \ + (_type)_x; \ +}) + +/* Access/update address held in a register, based on addressing mode. */ +#define register_address(base, reg) \ + ((base) + ((ad_bytes == sizeof(unsigned long)) ? (reg) : \ + ((reg) & ((1UL << (ad_bytes << 3)) - 1)))) + +#define register_address_increment(reg, inc) \ + do { \ + /* signed type ensures sign extension to long */ \ + int _inc = (inc); \ + if ( ad_bytes == sizeof(unsigned long) ) \ + (reg) += _inc; \ + else \ + (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \ + (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \ + } while (0) + +void *decode_register(u8 modrm_reg, unsigned long *regs, + int highbyte_regs) +{ + void *p; + + p = ®s[modrm_reg]; + if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) + p = (unsigned char *)®s[modrm_reg & 3] + 1; + return p; +} + +static int read_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *ptr, + u16 *size, unsigned long *address, int op_bytes) +{ + int rc; + + if (op_bytes == 2) + op_bytes = 3; + *address = 0; + rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, ctxt); + if (rc) + return rc; + rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, ctxt); + return rc; +} + +int +x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +{ + u8 b, d, sib, twobyte = 0, rex_prefix = 0; + u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0; + unsigned long *override_base = NULL; + unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i; + int rc = 0; + struct operand src, dst; + unsigned long cr2 = ctxt->cr2; + int mode = ctxt->mode; + unsigned long modrm_ea; + int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0; + + /* Shadow copy of register state. Committed on successful emulation. */ + unsigned long _regs[NR_VCPU_REGS]; + unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags; + unsigned long modrm_val = 0; + + memcpy(_regs, ctxt->vcpu->regs, sizeof _regs); + + switch (mode) { + case X86EMUL_MODE_REAL: + case X86EMUL_MODE_PROT16: + op_bytes = ad_bytes = 2; + break; + case X86EMUL_MODE_PROT32: + op_bytes = ad_bytes = 4; + break; +#ifdef __x86_64__ + case X86EMUL_MODE_PROT64: + op_bytes = 4; + ad_bytes = 8; + break; +#endif + default: + return -1; + } + + /* Legacy prefixes. */ + for (i = 0; i < 8; i++) { + switch (b = insn_fetch(u8, 1, _eip)) { + case 0x66: /* operand-size override */ + op_bytes ^= 6; /* switch between 2/4 bytes */ + break; + case 0x67: /* address-size override */ + if (mode == X86EMUL_MODE_PROT64) + ad_bytes ^= 12; /* switch between 4/8 bytes */ + else + ad_bytes ^= 6; /* switch between 2/4 bytes */ + break; + case 0x2e: /* CS override */ + override_base = &ctxt->cs_base; + break; + case 0x3e: /* DS override */ + override_base = &ctxt->ds_base; + break; + case 0x26: /* ES override */ + override_base = &ctxt->es_base; + break; + case 0x64: /* FS override */ + override_base = &ctxt->fs_base; + break; + case 0x65: /* GS override */ + override_base = &ctxt->gs_base; + break; + case 0x36: /* SS override */ + override_base = &ctxt->ss_base; + break; + case 0xf0: /* LOCK */ + lock_prefix = 1; + break; + case 0xf3: /* REP/REPE/REPZ */ + rep_prefix = 1; + break; + case 0xf2: /* REPNE/REPNZ */ + break; + default: + goto done_prefixes; + } + } + +done_prefixes: + + /* REX prefix. */ + if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) { + rex_prefix = b; + if (b & 8) + op_bytes = 8; /* REX.W */ + modrm_reg = (b & 4) << 1; /* REX.R */ + index_reg = (b & 2) << 2; /* REX.X */ + modrm_rm = base_reg = (b & 1) << 3; /* REG.B */ + b = insn_fetch(u8, 1, _eip); + } + + /* Opcode byte(s). */ + d = opcode_table[b]; + if (d == 0) { + /* Two-byte opcode? */ + if (b == 0x0f) { + twobyte = 1; + b = insn_fetch(u8, 1, _eip); + d = twobyte_table[b]; + } + + /* Unrecognised? */ + if (d == 0) + goto cannot_emulate; + } + + /* ModRM and SIB bytes. */ + if (d & ModRM) { + modrm = insn_fetch(u8, 1, _eip); + modrm_mod |= (modrm & 0xc0) >> 6; + modrm_reg |= (modrm & 0x38) >> 3; + modrm_rm |= (modrm & 0x07); + modrm_ea = 0; + use_modrm_ea = 1; + + if (modrm_mod == 3) { + modrm_val = *(unsigned long *) + decode_register(modrm_rm, _regs, d & ByteOp); + goto modrm_done; + } + + if (ad_bytes == 2) { + unsigned bx = _regs[VCPU_REGS_RBX]; + unsigned bp = _regs[VCPU_REGS_RBP]; + unsigned si = _regs[VCPU_REGS_RSI]; + unsigned di = _regs[VCPU_REGS_RDI]; + + /* 16-bit ModR/M decode. */ + switch (modrm_mod) { + case 0: + if (modrm_rm == 6) + modrm_ea += insn_fetch(u16, 2, _eip); + break; + case 1: + modrm_ea += insn_fetch(s8, 1, _eip); + break; + case 2: + modrm_ea += insn_fetch(u16, 2, _eip); + break; + } + switch (modrm_rm) { + case 0: + modrm_ea += bx + si; + break; + case 1: + modrm_ea += bx + di; + break; + case 2: + modrm_ea += bp + si; + break; + case 3: + modrm_ea += bp + di; + break; + case 4: + modrm_ea += si; + break; + case 5: + modrm_ea += di; + break; + case 6: + if (modrm_mod != 0) + modrm_ea += bp; + break; + case 7: + modrm_ea += bx; + break; + } + if (modrm_rm == 2 || modrm_rm == 3 || + (modrm_rm == 6 && modrm_mod != 0)) + if (!override_base) + override_base = &ctxt->ss_base; + modrm_ea = (u16)modrm_ea; + } else { + /* 32/64-bit ModR/M decode. */ + switch (modrm_rm) { + case 4: + case 12: + sib = insn_fetch(u8, 1, _eip); + index_reg |= (sib >> 3) & 7; + base_reg |= sib & 7; + scale = sib >> 6; + + switch (base_reg) { + case 5: + if (modrm_mod != 0) + modrm_ea += _regs[base_reg]; + else + modrm_ea += insn_fetch(s32, 4, _eip); + break; + default: + modrm_ea += _regs[base_reg]; + } + switch (index_reg) { + case 4: + break; + default: + modrm_ea += _regs[index_reg] << scale; + + } + break; + case 5: + if (modrm_mod != 0) + modrm_ea += _regs[modrm_rm]; + else if (mode == X86EMUL_MODE_PROT64) + rip_relative = 1; + break; + default: + modrm_ea += _regs[modrm_rm]; + break; + } + switch (modrm_mod) { + case 0: + if (modrm_rm == 5) + modrm_ea += insn_fetch(s32, 4, _eip); + break; + case 1: + modrm_ea += insn_fetch(s8, 1, _eip); + break; + case 2: + modrm_ea += insn_fetch(s32, 4, _eip); + break; + } + } + if (!override_base) + override_base = &ctxt->ds_base; + if (mode == X86EMUL_MODE_PROT64 && + override_base != &ctxt->fs_base && + override_base != &ctxt->gs_base) + override_base = NULL; + + if (override_base) + modrm_ea += *override_base; + + if (rip_relative) { + modrm_ea += _eip; + switch (d & SrcMask) { + case SrcImmByte: + modrm_ea += 1; + break; + case SrcImm: + if (d & ByteOp) + modrm_ea += 1; + else + if (op_bytes == 8) + modrm_ea += 4; + else + modrm_ea += op_bytes; + } + } + if (ad_bytes != 8) + modrm_ea = (u32)modrm_ea; + cr2 = modrm_ea; + modrm_done: + ; + } + + /* Decode and fetch the destination operand: register or memory. */ + switch (d & DstMask) { + case ImplicitOps: + /* Special instructions do their own operand decoding. */ + goto special_insn; + case DstReg: + dst.type = OP_REG; + if ((d & ByteOp) + && !(twobyte_table && (b == 0xb6 || b == 0xb7))) { + dst.ptr = decode_register(modrm_reg, _regs, + (rex_prefix == 0)); + dst.val = *(u8 *) dst.ptr; + dst.bytes = 1; + } else { + dst.ptr = decode_register(modrm_reg, _regs, 0); + switch ((dst.bytes = op_bytes)) { + case 2: + dst.val = *(u16 *)dst.ptr; + break; + case 4: + dst.val = *(u32 *)dst.ptr; + break; + case 8: + dst.val = *(u64 *)dst.ptr; + break; + } + } + break; + case DstMem: + dst.type = OP_MEM; + dst.ptr = (unsigned long *)cr2; + dst.bytes = (d & ByteOp) ? 1 : op_bytes; + if (!(d & Mov) && /* optimisation - avoid slow emulated read */ + ((rc = ops->read_emulated((unsigned long)dst.ptr, + &dst.val, dst.bytes, ctxt)) != 0)) + goto done; + break; + } + dst.orig_val = dst.val; + + /* + * Decode and fetch the source operand: register, memory + * or immediate. + */ + switch (d & SrcMask) { + case SrcNone: + break; + case SrcReg: + src.type = OP_REG; + if (d & ByteOp) { + src.ptr = decode_register(modrm_reg, _regs, + (rex_prefix == 0)); + src.val = src.orig_val = *(u8 *) src.ptr; + src.bytes = 1; + } else { + src.ptr = decode_register(modrm_reg, _regs, 0); + switch ((src.bytes = op_bytes)) { + case 2: + src.val = src.orig_val = *(u16 *) src.ptr; + break; + case 4: + src.val = src.orig_val = *(u32 *) src.ptr; + break; + case 8: + src.val = src.orig_val = *(u64 *) src.ptr; + break; + } + } + break; + case SrcMem16: + src.bytes = 2; + goto srcmem_common; + case SrcMem32: + src.bytes = 4; + goto srcmem_common; + case SrcMem: + src.bytes = (d & ByteOp) ? 1 : op_bytes; + srcmem_common: + src.type = OP_MEM; + src.ptr = (unsigned long *)cr2; + if ((rc = ops->read_emulated((unsigned long)src.ptr, + &src.val, src.bytes, ctxt)) != 0) + goto done; + src.orig_val = src.val; + break; + case SrcImm: + src.type = OP_IMM; + src.ptr = (unsigned long *)_eip; + src.bytes = (d & ByteOp) ? 1 : op_bytes; + if (src.bytes == 8) + src.bytes = 4; + /* NB. Immediates are sign-extended as necessary. */ + switch (src.bytes) { + case 1: + src.val = insn_fetch(s8, 1, _eip); + break; + case 2: + src.val = insn_fetch(s16, 2, _eip); + break; + case 4: + src.val = insn_fetch(s32, 4, _eip); + break; + } + break; + case SrcImmByte: + src.type = OP_IMM; + src.ptr = (unsigned long *)_eip; + src.bytes = 1; + src.val = insn_fetch(s8, 1, _eip); + break; + } + + if (twobyte) + goto twobyte_insn; + + switch (b) { + case 0x00 ... 0x05: + add: /* add */ + emulate_2op_SrcV("add", src, dst, _eflags); + break; + case 0x08 ... 0x0d: + or: /* or */ + emulate_2op_SrcV("or", src, dst, _eflags); + break; + case 0x10 ... 0x15: + adc: /* adc */ + emulate_2op_SrcV("adc", src, dst, _eflags); + break; + case 0x18 ... 0x1d: + sbb: /* sbb */ + emulate_2op_SrcV("sbb", src, dst, _eflags); + break; + case 0x20 ... 0x25: + and: /* and */ + emulate_2op_SrcV("and", src, dst, _eflags); + break; + case 0x28 ... 0x2d: + sub: /* sub */ + emulate_2op_SrcV("sub", src, dst, _eflags); + break; + case 0x30 ... 0x35: + xor: /* xor */ + emulate_2op_SrcV("xor", src, dst, _eflags); + break; + case 0x38 ... 0x3d: + cmp: /* cmp */ + emulate_2op_SrcV("cmp", src, dst, _eflags); + break; + case 0x63: /* movsxd */ + if (mode != X86EMUL_MODE_PROT64) + goto cannot_emulate; + dst.val = (s32) src.val; + break; + case 0x80 ... 0x83: /* Grp1 */ + switch (modrm_reg) { + case 0: + goto add; + case 1: + goto or; + case 2: + goto adc; + case 3: + goto sbb; + case 4: + goto and; + case 5: + goto sub; + case 6: + goto xor; + case 7: + goto cmp; + } + break; + case 0x84 ... 0x85: + test: /* test */ + emulate_2op_SrcV("test", src, dst, _eflags); + break; + case 0x86 ... 0x87: /* xchg */ + /* Write back the register source. */ + switch (dst.bytes) { + case 1: + *(u8 *) src.ptr = (u8) dst.val; + break; + case 2: + *(u16 *) src.ptr = (u16) dst.val; + break; + case 4: + *src.ptr = (u32) dst.val; + break; /* 64b reg: zero-extend */ + case 8: + *src.ptr = dst.val; + break; + } + /* + * Write back the memory destination with implicit LOCK + * prefix. + */ + dst.val = src.val; + lock_prefix = 1; + break; + case 0xa0 ... 0xa1: /* mov */ + dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; + dst.val = src.val; + _eip += ad_bytes; /* skip src displacement */ + break; + case 0xa2 ... 0xa3: /* mov */ + dst.val = (unsigned long)_regs[VCPU_REGS_RAX]; + _eip += ad_bytes; /* skip dst displacement */ + break; + case 0x88 ... 0x8b: /* mov */ + case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ + dst.val = src.val; + break; + case 0x8f: /* pop (sole member of Grp1a) */ + /* 64-bit mode: POP always pops a 64-bit operand. */ + if (mode == X86EMUL_MODE_PROT64) + dst.bytes = 8; + if ((rc = ops->read_std(register_address(ctxt->ss_base, + _regs[VCPU_REGS_RSP]), + &dst.val, dst.bytes, ctxt)) != 0) + goto done; + register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes); + break; + case 0xc0 ... 0xc1: + grp2: /* Grp2 */ + switch (modrm_reg) { + case 0: /* rol */ + emulate_2op_SrcB("rol", src, dst, _eflags); + break; + case 1: /* ror */ + emulate_2op_SrcB("ror", src, dst, _eflags); + break; + case 2: /* rcl */ + emulate_2op_SrcB("rcl", src, dst, _eflags); + break; + case 3: /* rcr */ + emulate_2op_SrcB("rcr", src, dst, _eflags); + break; + case 4: /* sal/shl */ + case 6: /* sal/shl */ + emulate_2op_SrcB("sal", src, dst, _eflags); + break; + case 5: /* shr */ + emulate_2op_SrcB("shr", src, dst, _eflags); + break; + case 7: /* sar */ + emulate_2op_SrcB("sar", src, dst, _eflags); + break; + } + break; + case 0xd0 ... 0xd1: /* Grp2 */ + src.val = 1; + goto grp2; + case 0xd2 ... 0xd3: /* Grp2 */ + src.val = _regs[VCPU_REGS_RCX]; + goto grp2; + case 0xf6 ... 0xf7: /* Grp3 */ + switch (modrm_reg) { + case 0 ... 1: /* test */ + /* + * Special case in Grp3: test has an immediate + * source operand. + */ + src.type = OP_IMM; + src.ptr = (unsigned long *)_eip; + src.bytes = (d & ByteOp) ? 1 : op_bytes; + if (src.bytes == 8) + src.bytes = 4; + switch (src.bytes) { + case 1: + src.val = insn_fetch(s8, 1, _eip); + break; + case 2: + src.val = insn_fetch(s16, 2, _eip); + break; + case 4: + src.val = insn_fetch(s32, 4, _eip); + break; + } + goto test; + case 2: /* not */ + dst.val = ~dst.val; + break; + case 3: /* neg */ + emulate_1op("neg", dst, _eflags); + break; + default: + goto cannot_emulate; + } + break; + case 0xfe ... 0xff: /* Grp4/Grp5 */ + switch (modrm_reg) { + case 0: /* inc */ + emulate_1op("inc", dst, _eflags); + break; + case 1: /* dec */ + emulate_1op("dec", dst, _eflags); + break; + case 6: /* push */ + /* 64-bit mode: PUSH always pushes a 64-bit operand. */ + if (mode == X86EMUL_MODE_PROT64) { + dst.bytes = 8; + if ((rc = ops->read_std((unsigned long)dst.ptr, + &dst.val, 8, + ctxt)) != 0) + goto done; + } + register_address_increment(_regs[VCPU_REGS_RSP], + -dst.bytes); + if ((rc = ops->write_std( + register_address(ctxt->ss_base, + _regs[VCPU_REGS_RSP]), + dst.val, dst.bytes, ctxt)) != 0) + goto done; + dst.val = dst.orig_val; /* skanky: disable writeback */ + break; + default: + goto cannot_emulate; + } + break; + } + +writeback: + if ((d & Mov) || (dst.orig_val != dst.val)) { + switch (dst.type) { + case OP_REG: + /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ + switch (dst.bytes) { + case 1: + *(u8 *)dst.ptr = (u8)dst.val; + break; + case 2: + *(u16 *)dst.ptr = (u16)dst.val; + break; + case 4: + *dst.ptr = (u32)dst.val; + break; /* 64b: zero-ext */ + case 8: + *dst.ptr = dst.val; + break; + } + break; + case OP_MEM: + if (lock_prefix) + rc = ops->cmpxchg_emulated((unsigned long)dst. + ptr, dst.orig_val, + dst.val, dst.bytes, + ctxt); + else + rc = ops->write_emulated((unsigned long)dst.ptr, + dst.val, dst.bytes, + ctxt); + if (rc != 0) + goto done; + default: + break; + } + } + + /* Commit shadow register state. */ + memcpy(ctxt->vcpu->regs, _regs, sizeof _regs); + ctxt->eflags = _eflags; + ctxt->vcpu->rip = _eip; + +done: + return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; + +special_insn: + if (twobyte) + goto twobyte_special_insn; + if (rep_prefix) { + if (_regs[VCPU_REGS_RCX] == 0) { + ctxt->vcpu->rip = _eip; + goto done; + } + _regs[VCPU_REGS_RCX]--; + _eip = ctxt->vcpu->rip; + } + switch (b) { + case 0xa4 ... 0xa5: /* movs */ + dst.type = OP_MEM; + dst.bytes = (d & ByteOp) ? 1 : op_bytes; + dst.ptr = (unsigned long *)register_address(ctxt->es_base, + _regs[VCPU_REGS_RDI]); + if ((rc = ops->read_emulated(register_address( + override_base ? *override_base : ctxt->ds_base, + _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt)) != 0) + goto done; + register_address_increment(_regs[VCPU_REGS_RSI], + (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); + register_address_increment(_regs[VCPU_REGS_RDI], + (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); + break; + case 0xa6 ... 0xa7: /* cmps */ + DPRINTF("Urk! I don't handle CMPS.\n"); + goto cannot_emulate; + case 0xaa ... 0xab: /* stos */ + dst.type = OP_MEM; + dst.bytes = (d & ByteOp) ? 1 : op_bytes; + dst.ptr = (unsigned long *)cr2; + dst.val = _regs[VCPU_REGS_RAX]; + register_address_increment(_regs[VCPU_REGS_RDI], + (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); + break; + case 0xac ... 0xad: /* lods */ + dst.type = OP_REG; + dst.bytes = (d & ByteOp) ? 1 : op_bytes; + dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; + if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes, ctxt)) != 0) + goto done; + register_address_increment(_regs[VCPU_REGS_RSI], + (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); + break; + case 0xae ... 0xaf: /* scas */ + DPRINTF("Urk! I don't handle SCAS.\n"); + goto cannot_emulate; + } + goto writeback; + +twobyte_insn: + switch (b) { + case 0x01: /* lgdt, lidt, lmsw */ + switch (modrm_reg) { + u16 size; + unsigned long address; + + case 2: /* lgdt */ + rc = read_descriptor(ctxt, ops, src.ptr, + &size, &address, op_bytes); + if (rc) + goto done; + realmode_lgdt(ctxt->vcpu, size, address); + break; + case 3: /* lidt */ + rc = read_descriptor(ctxt, ops, src.ptr, + &size, &address, op_bytes); + if (rc) + goto done; + realmode_lidt(ctxt->vcpu, size, address); + break; + case 4: /* smsw */ + if (modrm_mod != 3) + goto cannot_emulate; + *(u16 *)&_regs[modrm_rm] + = realmode_get_cr(ctxt->vcpu, 0); + break; + case 6: /* lmsw */ + if (modrm_mod != 3) + goto cannot_emulate; + realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags); + break; + case 7: /* invlpg*/ + emulate_invlpg(ctxt->vcpu, cr2); + break; + default: + goto cannot_emulate; + } + break; + case 0x21: /* mov from dr to reg */ + if (modrm_mod != 3) + goto cannot_emulate; + rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]); + break; + case 0x23: /* mov from reg to dr */ + if (modrm_mod != 3) + goto cannot_emulate; + rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]); + break; + case 0x40 ... 0x4f: /* cmov */ + dst.val = dst.orig_val = src.val; + d &= ~Mov; /* default to no move */ + /* + * First, assume we're decoding an even cmov opcode + * (lsb == 0). + */ + switch ((b & 15) >> 1) { + case 0: /* cmovo */ + d |= (_eflags & EFLG_OF) ? Mov : 0; + break; + case 1: /* cmovb/cmovc/cmovnae */ + d |= (_eflags & EFLG_CF) ? Mov : 0; + break; + case 2: /* cmovz/cmove */ + d |= (_eflags & EFLG_ZF) ? Mov : 0; + break; + case 3: /* cmovbe/cmovna */ + d |= (_eflags & (EFLG_CF | EFLG_ZF)) ? Mov : 0; + break; + case 4: /* cmovs */ + d |= (_eflags & EFLG_SF) ? Mov : 0; + break; + case 5: /* cmovp/cmovpe */ + d |= (_eflags & EFLG_PF) ? Mov : 0; + break; + case 7: /* cmovle/cmovng */ + d |= (_eflags & EFLG_ZF) ? Mov : 0; + /* fall through */ + case 6: /* cmovl/cmovnge */ + d |= (!(_eflags & EFLG_SF) != + !(_eflags & EFLG_OF)) ? Mov : 0; + break; + } + /* Odd cmov opcodes (lsb == 1) have inverted sense. */ + d ^= (b & 1) ? Mov : 0; + break; + case 0xb0 ... 0xb1: /* cmpxchg */ + /* + * Save real source value, then compare EAX against + * destination. + */ + src.orig_val = src.val; + src.val = _regs[VCPU_REGS_RAX]; + emulate_2op_SrcV("cmp", src, dst, _eflags); + /* Always write back. The question is: where to? */ + d |= Mov; + if (_eflags & EFLG_ZF) { + /* Success: write back to memory. */ + dst.val = src.orig_val; + } else { + /* Failure: write the value we saw to EAX. */ + dst.type = OP_REG; + dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; + } + break; + case 0xa3: + bt: /* bt */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ + emulate_2op_SrcV_nobyte("bt", src, dst, _eflags); + break; + case 0xb3: + btr: /* btr */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ + emulate_2op_SrcV_nobyte("btr", src, dst, _eflags); + break; + case 0xab: + bts: /* bts */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ + emulate_2op_SrcV_nobyte("bts", src, dst, _eflags); + break; + case 0xb6 ... 0xb7: /* movzx */ + dst.bytes = op_bytes; + dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val; + break; + case 0xbb: + btc: /* btc */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ + emulate_2op_SrcV_nobyte("btc", src, dst, _eflags); + break; + case 0xba: /* Grp8 */ + switch (modrm_reg & 3) { + case 0: + goto bt; + case 1: + goto bts; + case 2: + goto btr; + case 3: + goto btc; + } + break; + case 0xbe ... 0xbf: /* movsx */ + dst.bytes = op_bytes; + dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val; + break; + } + goto writeback; + +twobyte_special_insn: + /* Disable writeback. */ + dst.orig_val = dst.val; + switch (b) { + case 0x0d: /* GrpP (prefetch) */ + case 0x18: /* Grp16 (prefetch/nop) */ + break; + case 0x06: + emulate_clts(ctxt->vcpu); + break; + case 0x20: /* mov cr, reg */ + if (modrm_mod != 3) + goto cannot_emulate; + _regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg); + break; + case 0x22: /* mov reg, cr */ + if (modrm_mod != 3) + goto cannot_emulate; + realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags); + break; + case 0xc7: /* Grp9 (cmpxchg8b) */ +#if defined(__i386__) + { + unsigned long old_lo, old_hi; + if (((rc = ops->read_emulated(cr2 + 0, &old_lo, 4, + ctxt)) != 0) + || ((rc = ops->read_emulated(cr2 + 4, &old_hi, 4, + ctxt)) != 0)) + goto done; + if ((old_lo != _regs[VCPU_REGS_RAX]) + || (old_hi != _regs[VCPU_REGS_RDI])) { + _regs[VCPU_REGS_RAX] = old_lo; + _regs[VCPU_REGS_RDX] = old_hi; + _eflags &= ~EFLG_ZF; + } else if (ops->cmpxchg8b_emulated == NULL) { + rc = X86EMUL_UNHANDLEABLE; + goto done; + } else { + if ((rc = ops->cmpxchg8b_emulated(cr2, old_lo, + old_hi, + _regs[VCPU_REGS_RBX], + _regs[VCPU_REGS_RCX], + ctxt)) != 0) + goto done; + _eflags |= EFLG_ZF; + } + break; + } +#elif defined(__x86_64__) + { + unsigned long old, new; + if ((rc = ops->read_emulated(cr2, &old, 8, ctxt)) != 0) + goto done; + if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) || + ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) { + _regs[VCPU_REGS_RAX] = (u32) (old >> 0); + _regs[VCPU_REGS_RDX] = (u32) (old >> 32); + _eflags &= ~EFLG_ZF; + } else { + new = (_regs[VCPU_REGS_RCX] << 32) | (u32) _regs[VCPU_REGS_RBX]; + if ((rc = ops->cmpxchg_emulated(cr2, old, + new, 8, ctxt)) != 0) + goto done; + _eflags |= EFLG_ZF; + } + break; + } +#endif + } + goto writeback; + +cannot_emulate: + DPRINTF("Cannot emulate %02x\n", b); + return -1; +} + +#ifdef __XEN__ + +#include +#include + +int +x86_emulate_read_std(unsigned long addr, + unsigned long *val, + unsigned int bytes, struct x86_emulate_ctxt *ctxt) +{ + unsigned int rc; + + *val = 0; + + if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) { + propagate_page_fault(addr + bytes - rc, 0); /* read fault */ + return X86EMUL_PROPAGATE_FAULT; + } + + return X86EMUL_CONTINUE; +} + +int +x86_emulate_write_std(unsigned long addr, + unsigned long val, + unsigned int bytes, struct x86_emulate_ctxt *ctxt) +{ + unsigned int rc; + + if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) { + propagate_page_fault(addr + bytes - rc, PGERR_write_access); + return X86EMUL_PROPAGATE_FAULT; + } + + return X86EMUL_CONTINUE; +} + +#endif diff --git a/drivers/kvm/x86_emulate.h b/drivers/kvm/x86_emulate.h new file mode 100644 index 000000000000..658b58de30fc --- /dev/null +++ b/drivers/kvm/x86_emulate.h @@ -0,0 +1,185 @@ +/****************************************************************************** + * x86_emulate.h + * + * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. + * + * Copyright (c) 2005 Keir Fraser + * + * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 + */ + +#ifndef __X86_EMULATE_H__ +#define __X86_EMULATE_H__ + +struct x86_emulate_ctxt; + +/* + * x86_emulate_ops: + * + * These operations represent the instruction emulator's interface to memory. + * There are two categories of operation: those that act on ordinary memory + * regions (*_std), and those that act on memory regions known to require + * special treatment or emulation (*_emulated). + * + * The emulator assumes that an instruction accesses only one 'emulated memory' + * location, that this location is the given linear faulting address (cr2), and + * that this is one of the instruction's data operands. Instruction fetches and + * stack operations are assumed never to access emulated memory. The emulator + * automatically deduces which operand of a string-move operation is accessing + * emulated memory, and assumes that the other operand accesses normal memory. + * + * NOTES: + * 1. The emulator isn't very smart about emulated vs. standard memory. + * 'Emulated memory' access addresses should be checked for sanity. + * 'Normal memory' accesses may fault, and the caller must arrange to + * detect and handle reentrancy into the emulator via recursive faults. + * Accesses may be unaligned and may cross page boundaries. + * 2. If the access fails (cannot emulate, or a standard access faults) then + * it is up to the memop to propagate the fault to the guest VM via + * some out-of-band mechanism, unknown to the emulator. The memop signals + * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will + * then immediately bail. + * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only + * cmpxchg8b_emulated need support 8-byte accesses. + * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system. + */ +/* Access completed successfully: continue emulation as normal. */ +#define X86EMUL_CONTINUE 0 +/* Access is unhandleable: bail from emulation and return error to caller. */ +#define X86EMUL_UNHANDLEABLE 1 +/* Terminate emulation but return success to the caller. */ +#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ +#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ +#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ +struct x86_emulate_ops { + /* + * read_std: Read bytes of standard (non-emulated/special) memory. + * Used for instruction fetch, stack operations, and others. + * @addr: [IN ] Linear address from which to read. + * @val: [OUT] Value read from memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*read_std)(unsigned long addr, + unsigned long *val, + unsigned int bytes, struct x86_emulate_ctxt * ctxt); + + /* + * write_std: Write bytes of standard (non-emulated/special) memory. + * Used for stack operations, and others. + * @addr: [IN ] Linear address to which to write. + * @val: [IN ] Value to write to memory (low-order bytes used as + * required). + * @bytes: [IN ] Number of bytes to write to memory. + */ + int (*write_std)(unsigned long addr, + unsigned long val, + unsigned int bytes, struct x86_emulate_ctxt * ctxt); + + /* + * read_emulated: Read bytes from emulated/special memory area. + * @addr: [IN ] Linear address from which to read. + * @val: [OUT] Value read from memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*read_emulated) (unsigned long addr, + unsigned long *val, + unsigned int bytes, + struct x86_emulate_ctxt * ctxt); + + /* + * write_emulated: Read bytes from emulated/special memory area. + * @addr: [IN ] Linear address to which to write. + * @val: [IN ] Value to write to memory (low-order bytes used as + * required). + * @bytes: [IN ] Number of bytes to write to memory. + */ + int (*write_emulated) (unsigned long addr, + unsigned long val, + unsigned int bytes, + struct x86_emulate_ctxt * ctxt); + + /* + * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an + * emulated/special memory area. + * @addr: [IN ] Linear address to access. + * @old: [IN ] Value expected to be current at @addr. + * @new: [IN ] Value to write to @addr. + * @bytes: [IN ] Number of bytes to access using CMPXCHG. + */ + int (*cmpxchg_emulated) (unsigned long addr, + unsigned long old, + unsigned long new, + unsigned int bytes, + struct x86_emulate_ctxt * ctxt); + + /* + * cmpxchg8b_emulated: Emulate an atomic (LOCKed) CMPXCHG8B operation on an + * emulated/special memory area. + * @addr: [IN ] Linear address to access. + * @old: [IN ] Value expected to be current at @addr. + * @new: [IN ] Value to write to @addr. + * NOTES: + * 1. This function is only ever called when emulating a real CMPXCHG8B. + * 2. This function is *never* called on x86/64 systems. + * 2. Not defining this function (i.e., specifying NULL) is equivalent + * to defining a function that always returns X86EMUL_UNHANDLEABLE. + */ + int (*cmpxchg8b_emulated) (unsigned long addr, + unsigned long old_lo, + unsigned long old_hi, + unsigned long new_lo, + unsigned long new_hi, + struct x86_emulate_ctxt * ctxt); +}; + +struct cpu_user_regs; + +struct x86_emulate_ctxt { + /* Register state before/after emulation. */ + struct kvm_vcpu *vcpu; + + /* Linear faulting address (if emulating a page-faulting instruction). */ + unsigned long eflags; + unsigned long cr2; + + /* Emulated execution mode, represented by an X86EMUL_MODE value. */ + int mode; + + unsigned long cs_base; + unsigned long ds_base; + unsigned long es_base; + unsigned long ss_base; + unsigned long gs_base; + unsigned long fs_base; +}; + +/* Execution mode, passed to the emulator. */ +#define X86EMUL_MODE_REAL 0 /* Real mode. */ +#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ +#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ +#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ + +/* Host execution mode. */ +#if defined(__i386__) +#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 +#elif defined(__x86_64__) +#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 +#endif + +/* + * x86_emulate_memop: Emulate an instruction that faulted attempting to + * read/write a 'special' memory area. + * Returns -1 on failure, 0 on success. + */ +int x86_emulate_memop(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops); + +/* + * Given the 'reg' portion of a ModRM byte, and a register block, return a + * pointer into the block that addresses the relevant register. + * @highbyte_regs specifies whether to decode AH,CH,DH,BH. + */ +void *decode_register(u8 modrm_reg, unsigned long *regs, + int highbyte_regs); + +#endif /* __X86_EMULATE_H__ */ diff --git a/include/linux/kvm.h b/include/linux/kvm.h new file mode 100644 index 000000000000..5bb2c3c585c1 --- /dev/null +++ b/include/linux/kvm.h @@ -0,0 +1,227 @@ +#ifndef __LINUX_KVM_H +#define __LINUX_KVM_H + +/* + * Userspace interface for /dev/kvm - kernel based virtual machine + * + * Note: this interface is considered experimental and may change without + * notice. + */ + +#include +#include + +/* + * Architectural interrupt line count, and the size of the bitmap needed + * to hold them. + */ +#define KVM_NR_INTERRUPTS 256 +#define KVM_IRQ_BITMAP_SIZE_BYTES ((KVM_NR_INTERRUPTS + 7) / 8) +#define KVM_IRQ_BITMAP_SIZE(type) (KVM_IRQ_BITMAP_SIZE_BYTES / sizeof(type)) + + +/* for KVM_CREATE_MEMORY_REGION */ +struct kvm_memory_region { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ +}; + +/* for kvm_memory_region::flags */ +#define KVM_MEM_LOG_DIRTY_PAGES 1UL + + +#define KVM_EXIT_TYPE_FAIL_ENTRY 1 +#define KVM_EXIT_TYPE_VM_EXIT 2 + +enum kvm_exit_reason { + KVM_EXIT_UNKNOWN = 0, + KVM_EXIT_EXCEPTION = 1, + KVM_EXIT_IO = 2, + KVM_EXIT_CPUID = 3, + KVM_EXIT_DEBUG = 4, + KVM_EXIT_HLT = 5, + KVM_EXIT_MMIO = 6, +}; + +/* for KVM_RUN */ +struct kvm_run { + /* in */ + __u32 vcpu; + __u32 emulated; /* skip current instruction */ + __u32 mmio_completed; /* mmio request completed */ + + /* out */ + __u32 exit_type; + __u32 exit_reason; + __u32 instruction_length; + union { + /* KVM_EXIT_UNKNOWN */ + struct { + __u32 hardware_exit_reason; + } hw; + /* KVM_EXIT_EXCEPTION */ + struct { + __u32 exception; + __u32 error_code; + } ex; + /* KVM_EXIT_IO */ + struct { +#define KVM_EXIT_IO_IN 0 +#define KVM_EXIT_IO_OUT 1 + __u8 direction; + __u8 size; /* bytes */ + __u8 string; + __u8 string_down; + __u8 rep; + __u8 pad; + __u16 port; + __u64 count; + union { + __u64 address; + __u32 value; + }; + } io; + struct { + } debug; + /* KVM_EXIT_MMIO */ + struct { + __u64 phys_addr; + __u8 data[8]; + __u32 len; + __u8 is_write; + } mmio; + }; +}; + +/* for KVM_GET_REGS and KVM_SET_REGS */ +struct kvm_regs { + /* in */ + __u32 vcpu; + __u32 padding; + + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 rax, rbx, rcx, rdx; + __u64 rsi, rdi, rsp, rbp; + __u64 r8, r9, r10, r11; + __u64 r12, r13, r14, r15; + __u64 rip, rflags; +}; + +struct kvm_segment { + __u64 base; + __u32 limit; + __u16 selector; + __u8 type; + __u8 present, dpl, db, s, l, g, avl; + __u8 unusable; + __u8 padding; +}; + +struct kvm_dtable { + __u64 base; + __u16 limit; + __u16 padding[3]; +}; + +/* for KVM_GET_SREGS and KVM_SET_SREGS */ +struct kvm_sregs { + /* in */ + __u32 vcpu; + __u32 padding; + + /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */ + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 interrupt_bitmap[KVM_IRQ_BITMAP_SIZE(__u64)]; +}; + +struct kvm_msr_entry { + __u32 index; + __u32 reserved; + __u64 data; +}; + +/* for KVM_GET_MSRS and KVM_SET_MSRS */ +struct kvm_msrs { + __u32 vcpu; + __u32 nmsrs; /* number of msrs in entries */ + + struct kvm_msr_entry entries[0]; +}; + +/* for KVM_GET_MSR_INDEX_LIST */ +struct kvm_msr_list { + __u32 nmsrs; /* number of msrs in entries */ + __u32 indices[0]; +}; + +/* for KVM_TRANSLATE */ +struct kvm_translation { + /* in */ + __u64 linear_address; + __u32 vcpu; + __u32 padding; + + /* out */ + __u64 physical_address; + __u8 valid; + __u8 writeable; + __u8 usermode; +}; + +/* for KVM_INTERRUPT */ +struct kvm_interrupt { + /* in */ + __u32 vcpu; + __u32 irq; +}; + +struct kvm_breakpoint { + __u32 enabled; + __u32 padding; + __u64 address; +}; + +/* for KVM_DEBUG_GUEST */ +struct kvm_debug_guest { + /* int */ + __u32 vcpu; + __u32 enabled; + struct kvm_breakpoint breakpoints[4]; + __u32 singlestep; +}; + +/* for KVM_GET_DIRTY_LOG */ +struct kvm_dirty_log { + __u32 slot; + __u32 padding; + union { + void __user *dirty_bitmap; /* one bit per page */ + __u64 padding; + }; +}; + +#define KVMIO 0xAE + +#define KVM_RUN _IOWR(KVMIO, 2, struct kvm_run) +#define KVM_GET_REGS _IOWR(KVMIO, 3, struct kvm_regs) +#define KVM_SET_REGS _IOW(KVMIO, 4, struct kvm_regs) +#define KVM_GET_SREGS _IOWR(KVMIO, 5, struct kvm_sregs) +#define KVM_SET_SREGS _IOW(KVMIO, 6, struct kvm_sregs) +#define KVM_TRANSLATE _IOWR(KVMIO, 7, struct kvm_translation) +#define KVM_INTERRUPT _IOW(KVMIO, 8, struct kvm_interrupt) +#define KVM_DEBUG_GUEST _IOW(KVMIO, 9, struct kvm_debug_guest) +#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 10, struct kvm_memory_region) +#define KVM_CREATE_VCPU _IOW(KVMIO, 11, int /* vcpu_slot */) +#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 12, struct kvm_dirty_log) +#define KVM_GET_MSRS _IOWR(KVMIO, 13, struct kvm_msrs) +#define KVM_SET_MSRS _IOWR(KVMIO, 14, struct kvm_msrs) +#define KVM_GET_MSR_INDEX_LIST _IOWR(KVMIO, 15, struct kvm_msr_list) + +#endif