59 files changed, 4475 insertions, 1053 deletions
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index f20ca84537f5..9ac4970882fe 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -615,7 +615,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) * Enter the guest */ trace_kvm_entry(*vcpu_pc(vcpu)); - __kvm_guest_enter(); + guest_enter_irqoff(); vcpu->mode = IN_GUEST_MODE; ret = kvm_call_hyp(__kvm_vcpu_run, vcpu); @@ -641,14 +641,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) local_irq_enable(); /* - * We do local_irq_enable() before calling kvm_guest_exit() so + * We do local_irq_enable() before calling guest_exit() so * that if a timer interrupt hits while running the guest we * account that tick as being spent in the guest. We enable - * preemption after calling kvm_guest_exit() so that if we get + * preemption after calling guest_exit() so that if we get * preempted we make sure ticks after that is not counted as * guest time. */ - kvm_guest_exit(); + guest_exit(); trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); /* diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index b0773c6d622f..b54bcadd8aec 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -400,7 +400,7 @@ static inline void _kvm_atomic_set_c0_guest_reg(unsigned long *reg, unsigned long temp; do { __asm__ __volatile__( - " .set mips3 \n" + " .set "MIPS_ISA_ARCH_LEVEL" \n" " " __LL "%0, %1 \n" " or %0, %2 \n" " " __SC "%0, %1 \n" @@ -416,7 +416,7 @@ static inline void _kvm_atomic_clear_c0_guest_reg(unsigned long *reg, unsigned long temp; do { __asm__ __volatile__( - " .set mips3 \n" + " .set "MIPS_ISA_ARCH_LEVEL" \n" " " __LL "%0, %1 \n" " and %0, %2 \n" " " __SC "%0, %1 \n" @@ -433,7 +433,7 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg, unsigned long temp; do { __asm__ __volatile__( - " .set mips3 \n" + " .set "MIPS_ISA_ARCH_LEVEL" \n" " " __LL "%0, %1 \n" " and %0, %2 \n" " or %0, %3 \n" @@ -533,8 +533,13 @@ int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks); /* Debug: dump vcpu state */ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu); -/* Trampoline ASM routine to start running in "Guest" context */ -extern int __kvm_mips_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu); +extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu); + +/* Building of entry/exception code */ +int kvm_mips_entry_setup(void); +void *kvm_mips_build_vcpu_run(void *addr); +void *kvm_mips_build_exception(void *addr, void *handler); +void *kvm_mips_build_exit(void *addr); /* FPU/MSA context management */ void __kvm_save_fpu(struct kvm_vcpu_arch *vcpu); diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h index b6ecfeee4dbe..f7929f65f7ca 100644 --- a/arch/mips/include/asm/uasm.h +++ b/arch/mips/include/asm/uasm.h @@ -104,8 +104,13 @@ Ip_u1s2(_bltz); Ip_u1s2(_bltzl); Ip_u1u2s3(_bne); Ip_u2s3u1(_cache); +Ip_u1u2(_cfc1); +Ip_u2u1(_cfcmsa); +Ip_u1u2(_ctc1); +Ip_u2u1(_ctcmsa); Ip_u2u1s3(_daddiu); Ip_u3u1u2(_daddu); +Ip_u1(_di); Ip_u2u1msbu3(_dins); Ip_u2u1msbu3(_dinsm); Ip_u1u2(_divu); @@ -141,6 +146,8 @@ Ip_u1(_mfhi); Ip_u1(_mflo); Ip_u1u2u3(_mtc0); Ip_u1u2u3(_mthc0); +Ip_u1(_mthi); +Ip_u1(_mtlo); Ip_u3u1u2(_mul); Ip_u3u1u2(_or); Ip_u2u1u3(_ori); diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h index a1ebf973725c..77429d1622b3 100644 --- a/arch/mips/include/uapi/asm/inst.h +++ b/arch/mips/include/uapi/asm/inst.h @@ -21,20 
+21,20 @@ enum major_op { spec_op, bcond_op, j_op, jal_op, beq_op, bne_op, blez_op, bgtz_op, - addi_op, cbcond0_op = addi_op, addiu_op, slti_op, sltiu_op, + addi_op, pop10_op = addi_op, addiu_op, slti_op, sltiu_op, andi_op, ori_op, xori_op, lui_op, cop0_op, cop1_op, cop2_op, cop1x_op, beql_op, bnel_op, blezl_op, bgtzl_op, - daddi_op, cbcond1_op = daddi_op, daddiu_op, ldl_op, ldr_op, + daddi_op, pop30_op = daddi_op, daddiu_op, ldl_op, ldr_op, spec2_op, jalx_op, mdmx_op, msa_op = mdmx_op, spec3_op, lb_op, lh_op, lwl_op, lw_op, lbu_op, lhu_op, lwr_op, lwu_op, sb_op, sh_op, swl_op, sw_op, sdl_op, sdr_op, swr_op, cache_op, ll_op, lwc1_op, lwc2_op, bc6_op = lwc2_op, pref_op, - lld_op, ldc1_op, ldc2_op, beqzcjic_op = ldc2_op, ld_op, + lld_op, ldc1_op, ldc2_op, pop66_op = ldc2_op, ld_op, sc_op, swc1_op, swc2_op, balc6_op = swc2_op, major_3b_op, - scd_op, sdc1_op, sdc2_op, bnezcjialc_op = sdc2_op, sd_op + scd_op, sdc1_op, sdc2_op, pop76_op = sdc2_op, sd_op }; /* @@ -93,6 +93,50 @@ enum spec3_op { }; /* + * Bits 10-6 minor opcode for r6 spec mult/div encodings + */ +enum mult_op { + mult_mult_op = 0x0, + mult_mul_op = 0x2, + mult_muh_op = 0x3, +}; +enum multu_op { + multu_multu_op = 0x0, + multu_mulu_op = 0x2, + multu_muhu_op = 0x3, +}; +enum div_op { + div_div_op = 0x0, + div_div6_op = 0x2, + div_mod_op = 0x3, +}; +enum divu_op { + divu_divu_op = 0x0, + divu_divu6_op = 0x2, + divu_modu_op = 0x3, +}; +enum dmult_op { + dmult_dmult_op = 0x0, + dmult_dmul_op = 0x2, + dmult_dmuh_op = 0x3, +}; +enum dmultu_op { + dmultu_dmultu_op = 0x0, + dmultu_dmulu_op = 0x2, + dmultu_dmuhu_op = 0x3, +}; +enum ddiv_op { + ddiv_ddiv_op = 0x0, + ddiv_ddiv6_op = 0x2, + ddiv_dmod_op = 0x3, +}; +enum ddivu_op { + ddivu_ddivu_op = 0x0, + ddivu_ddivu6_op = 0x2, + ddivu_dmodu_op = 0x3, +}; + +/* * rt field of bcond opcodes. */ enum rt_op { @@ -238,6 +282,21 @@ enum bshfl_func { }; /* + * MSA minor opcodes. + */ +enum msa_func { + msa_elm_op = 0x19, +}; + +/* + * MSA ELM opcodes. + */ +enum msa_elm { + msa_ctc_op = 0x3e, + msa_cfc_op = 0x7e, +}; + +/* * func field for MSA MI10 format. */ enum msa_mi10_func { @@ -264,7 +323,7 @@ enum mm_major_op { mm_pool32b_op, mm_pool16b_op, mm_lhu16_op, mm_andi16_op, mm_addiu32_op, mm_lhu32_op, mm_sh32_op, mm_lh32_op, mm_pool32i_op, mm_pool16c_op, mm_lwsp16_op, mm_pool16d_op, - mm_ori32_op, mm_pool32f_op, mm_reserved1_op, mm_reserved2_op, + mm_ori32_op, mm_pool32f_op, mm_pool32s_op, mm_reserved2_op, mm_pool32c_op, mm_lwgp16_op, mm_lw16_op, mm_pool16e_op, mm_xori32_op, mm_jals32_op, mm_addiupc_op, mm_reserved3_op, mm_reserved4_op, mm_pool16f_op, mm_sb16_op, mm_beqz16_op, @@ -360,7 +419,10 @@ enum mm_32axf_minor_op { mm_mflo32_op = 0x075, mm_jalrhb_op = 0x07c, mm_tlbwi_op = 0x08d, + mm_mthi32_op = 0x0b5, mm_tlbwr_op = 0x0cd, + mm_mtlo32_op = 0x0f5, + mm_di_op = 0x11d, mm_jalrs_op = 0x13c, mm_jalrshb_op = 0x17c, mm_sync_op = 0x1ad, @@ -479,6 +541,13 @@ enum mm_32f_73_minor_op { }; /* + * (microMIPS) POOL32S minor opcodes. + */ +enum mm_32s_minor_op { + mm_32s_elm_op = 0x16, +}; + +/* * (microMIPS) POOL16C minor opcodes. */ enum mm_16c_minor_op { diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index a1263d188a5a..fae2f9447792 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -339,67 +339,9 @@ void output_pm_defines(void) } #endif -void output_cpuinfo_defines(void) -{ - COMMENT(" MIPS cpuinfo offsets. 
"); - DEFINE(CPUINFO_SIZE, sizeof(struct cpuinfo_mips)); -#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE - OFFSET(CPUINFO_ASID_MASK, cpuinfo_mips, asid_mask); -#endif -} - void output_kvm_defines(void) { COMMENT(" KVM/MIPS Specfic offsets. "); - DEFINE(VCPU_ARCH_SIZE, sizeof(struct kvm_vcpu_arch)); - OFFSET(VCPU_RUN, kvm_vcpu, run); - OFFSET(VCPU_HOST_ARCH, kvm_vcpu, arch); - - OFFSET(VCPU_GUEST_EBASE, kvm_vcpu_arch, guest_ebase); - - OFFSET(VCPU_HOST_STACK, kvm_vcpu_arch, host_stack); - OFFSET(VCPU_HOST_GP, kvm_vcpu_arch, host_gp); - - OFFSET(VCPU_HOST_CP0_BADVADDR, kvm_vcpu_arch, host_cp0_badvaddr); - OFFSET(VCPU_HOST_CP0_CAUSE, kvm_vcpu_arch, host_cp0_cause); - OFFSET(VCPU_HOST_EPC, kvm_vcpu_arch, host_cp0_epc); - - OFFSET(VCPU_R0, kvm_vcpu_arch, gprs[0]); - OFFSET(VCPU_R1, kvm_vcpu_arch, gprs[1]); - OFFSET(VCPU_R2, kvm_vcpu_arch, gprs[2]); - OFFSET(VCPU_R3, kvm_vcpu_arch, gprs[3]); - OFFSET(VCPU_R4, kvm_vcpu_arch, gprs[4]); - OFFSET(VCPU_R5, kvm_vcpu_arch, gprs[5]); - OFFSET(VCPU_R6, kvm_vcpu_arch, gprs[6]); - OFFSET(VCPU_R7, kvm_vcpu_arch, gprs[7]); - OFFSET(VCPU_R8, kvm_vcpu_arch, gprs[8]); - OFFSET(VCPU_R9, kvm_vcpu_arch, gprs[9]); - OFFSET(VCPU_R10, kvm_vcpu_arch, gprs[10]); - OFFSET(VCPU_R11, kvm_vcpu_arch, gprs[11]); - OFFSET(VCPU_R12, kvm_vcpu_arch, gprs[12]); - OFFSET(VCPU_R13, kvm_vcpu_arch, gprs[13]); - OFFSET(VCPU_R14, kvm_vcpu_arch, gprs[14]); - OFFSET(VCPU_R15, kvm_vcpu_arch, gprs[15]); - OFFSET(VCPU_R16, kvm_vcpu_arch, gprs[16]); - OFFSET(VCPU_R17, kvm_vcpu_arch, gprs[17]); - OFFSET(VCPU_R18, kvm_vcpu_arch, gprs[18]); - OFFSET(VCPU_R19, kvm_vcpu_arch, gprs[19]); - OFFSET(VCPU_R20, kvm_vcpu_arch, gprs[20]); - OFFSET(VCPU_R21, kvm_vcpu_arch, gprs[21]); - OFFSET(VCPU_R22, kvm_vcpu_arch, gprs[22]); - OFFSET(VCPU_R23, kvm_vcpu_arch, gprs[23]); - OFFSET(VCPU_R24, kvm_vcpu_arch, gprs[24]); - OFFSET(VCPU_R25, kvm_vcpu_arch, gprs[25]); - OFFSET(VCPU_R26, kvm_vcpu_arch, gprs[26]); - OFFSET(VCPU_R27, kvm_vcpu_arch, gprs[27]); - OFFSET(VCPU_R28, kvm_vcpu_arch, gprs[28]); - OFFSET(VCPU_R29, kvm_vcpu_arch, gprs[29]); - OFFSET(VCPU_R30, kvm_vcpu_arch, gprs[30]); - OFFSET(VCPU_R31, kvm_vcpu_arch, gprs[31]); - OFFSET(VCPU_LO, kvm_vcpu_arch, lo); - OFFSET(VCPU_HI, kvm_vcpu_arch, hi); - OFFSET(VCPU_PC, kvm_vcpu_arch, pc); - BLANK(); OFFSET(VCPU_FPR0, kvm_vcpu_arch, fpu.fpr[0]); OFFSET(VCPU_FPR1, kvm_vcpu_arch, fpu.fpr[1]); @@ -437,14 +379,6 @@ void output_kvm_defines(void) OFFSET(VCPU_FCR31, kvm_vcpu_arch, fpu.fcr31); OFFSET(VCPU_MSA_CSR, kvm_vcpu_arch, fpu.msacsr); BLANK(); - - OFFSET(VCPU_COP0, kvm_vcpu_arch, cop0); - OFFSET(VCPU_GUEST_KERNEL_ASID, kvm_vcpu_arch, guest_kernel_asid); - OFFSET(VCPU_GUEST_USER_ASID, kvm_vcpu_arch, guest_user_asid); - - OFFSET(COP0_TLB_HI, mips_coproc, reg[MIPS_CP0_TLB_HI][0]); - OFFSET(COP0_STATUS, mips_coproc, reg[MIPS_CP0_STATUS][0]); - BLANK(); } #ifdef CONFIG_MIPS_CPS diff --git a/arch/mips/kernel/branch.c b/arch/mips/kernel/branch.c index 6dc3f1fdaccc..46c227fc98f5 100644 --- a/arch/mips/kernel/branch.c +++ b/arch/mips/kernel/branch.c @@ -790,7 +790,7 @@ int __compute_return_epc_for_insn(struct pt_regs *regs, epc += 4 + (insn.i_format.simmediate << 2); regs->cp0_epc = epc; break; - case beqzcjic_op: + case pop66_op: if (!cpu_has_mips_r6) { ret = -SIGILL; break; @@ -798,7 +798,7 @@ int __compute_return_epc_for_insn(struct pt_regs *regs, /* Compact branch: BEQZC || JIC */ regs->cp0_epc += 8; break; - case bnezcjialc_op: + case pop76_op: if (!cpu_has_mips_r6) { ret = -SIGILL; break; @@ -809,8 +809,8 @@ int __compute_return_epc_for_insn(struct pt_regs *regs, 
regs->cp0_epc += 8; break; #endif - case cbcond0_op: - case cbcond1_op: + case pop10_op: + case pop30_op: /* Only valid for MIPS R6 */ if (!cpu_has_mips_r6) { ret = -SIGILL; diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 2ae12825529f..7c56d6b124d1 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -17,6 +17,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM + select EXPORT_UASM select PREEMPT_NOTIFIERS select ANON_INODES select KVM_MMIO diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile index 0aabe40fcac9..847429de780d 100644 --- a/arch/mips/kvm/Makefile +++ b/arch/mips/kvm/Makefile @@ -7,7 +7,7 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o -kvm-objs := $(common-objs-y) mips.o emulate.o locore.o \ +kvm-objs := $(common-objs-y) mips.o emulate.o entry.o \ interrupt.o stats.o commpage.o \ dyntrans.o trap_emul.o fpu.o kvm-objs += mmu.o diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c index 8a1833b9eb38..91ebd2b6034f 100644 --- a/arch/mips/kvm/dyntrans.c +++ b/arch/mips/kvm/dyntrans.c @@ -72,7 +72,10 @@ int kvm_mips_trans_cache_va(union mips_instruction inst, u32 *opc, synci_inst.i_format.opcode = bcond_op; synci_inst.i_format.rs = inst.i_format.rs; synci_inst.i_format.rt = synci_op; - synci_inst.i_format.simmediate = inst.i_format.simmediate; + if (cpu_has_mips_r6) + synci_inst.i_format.simmediate = inst.spec3_format.simmediate; + else + synci_inst.i_format.simmediate = inst.i_format.simmediate; return kvm_mips_trans_replace(vcpu, opc, synci_inst); } diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 5f0354c80c8e..be18dfe9ecaa 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -161,9 +161,12 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, nextpc = epc; break; - case blez_op: /* not really i_format */ - case blezl_op: - /* rt field assumed to be zero */ + case blez_op: /* POP06 */ +#ifndef CONFIG_CPU_MIPSR6 + case blezl_op: /* removed in R6 */ +#endif + if (insn.i_format.rt != 0) + goto compact_branch; if ((long)arch->gprs[insn.i_format.rs] <= 0) epc = epc + 4 + (insn.i_format.simmediate << 2); else @@ -171,9 +174,12 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, nextpc = epc; break; - case bgtz_op: - case bgtzl_op: - /* rt field assumed to be zero */ + case bgtz_op: /* POP07 */ +#ifndef CONFIG_CPU_MIPSR6 + case bgtzl_op: /* removed in R6 */ +#endif + if (insn.i_format.rt != 0) + goto compact_branch; if ((long)arch->gprs[insn.i_format.rs] > 0) epc = epc + 4 + (insn.i_format.simmediate << 2); else @@ -185,6 +191,40 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, case cop1_op: kvm_err("%s: unsupported cop1_op\n", __func__); break; + +#ifdef CONFIG_CPU_MIPSR6 + /* R6 added the following compact branches with forbidden slots */ + case blezl_op: /* POP26 */ + case bgtzl_op: /* POP27 */ + /* only rt == 0 isn't compact branch */ + if (insn.i_format.rt != 0) + goto compact_branch; + break; + case pop10_op: + case pop30_op: + /* only rs == rt == 0 is reserved, rest are compact branches */ + if (insn.i_format.rs != 0 || insn.i_format.rt != 0) + goto compact_branch; + break; + case pop66_op: + case pop76_op: + /* only rs == 0 isn't compact branch */ + if (insn.i_format.rs != 0) + goto compact_branch; + break; +compact_branch: + /* + * If we've hit an exception on the forbidden slot, then + * the branch must not have been taken. 
+ */ + epc += 8; + nextpc = epc; + break; +#else +compact_branch: + /* Compact branches not supported before R6 */ + break; +#endif } return nextpc; @@ -1561,7 +1601,10 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, base = inst.i_format.rs; op_inst = inst.i_format.rt; - offset = inst.i_format.simmediate; + if (cpu_has_mips_r6) + offset = inst.spec3_format.simmediate; + else + offset = inst.i_format.simmediate; cache = op_inst & CacheOp_Cache; op = op_inst & CacheOp_Op; @@ -1724,11 +1767,27 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, er = kvm_mips_emulate_load(inst, cause, run, vcpu); break; +#ifndef CONFIG_CPU_MIPSR6 case cache_op: ++vcpu->stat.cache_exits; trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); er = kvm_mips_emulate_cache(inst, opc, cause, run, vcpu); break; +#else + case spec3_op: + switch (inst.spec3_format.func) { + case cache6_op: + ++vcpu->stat.cache_exits; + trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); + er = kvm_mips_emulate_cache(inst, opc, cause, run, + vcpu); + break; + default: + goto unknown; + }; + break; +unknown: +#endif default: kvm_err("Instruction emulation not supported (%p/%#x)\n", opc, @@ -2298,7 +2357,9 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, } if (inst.r_format.opcode == spec3_op && - inst.r_format.func == rdhwr_op) { + inst.r_format.func == rdhwr_op && + inst.r_format.rs == 0 && + (inst.r_format.re >> 3) == 0) { int usermode = !KVM_GUEST_KERNEL_MODE(vcpu); int rd = inst.r_format.rd; int rt = inst.r_format.rt; diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c new file mode 100644 index 000000000000..75ba7c2ecb3d --- /dev/null +++ b/arch/mips/kvm/entry.c @@ -0,0 +1,676 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Generation of main entry point for the guest, exception handling. + * + * Copyright (C) 2012 MIPS Technologies, Inc. + * Authors: Sanjay Lal <[email protected]> + * + * Copyright (C) 2016 Imagination Technologies Ltd. 
+ */ + +#include <linux/kvm_host.h> +#include <asm/msa.h> +#include <asm/setup.h> +#include <asm/uasm.h> + +/* Register names */ +#define ZERO 0 +#define AT 1 +#define V0 2 +#define V1 3 +#define A0 4 +#define A1 5 + +#if _MIPS_SIM == _MIPS_SIM_ABI32 +#define T0 8 +#define T1 9 +#define T2 10 +#define T3 11 +#endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */ + +#if _MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32 +#define T0 12 +#define T1 13 +#define T2 14 +#define T3 15 +#endif /* _MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32 */ + +#define S0 16 +#define S1 17 +#define T9 25 +#define K0 26 +#define K1 27 +#define GP 28 +#define SP 29 +#define RA 31 + +/* Some CP0 registers */ +#define C0_HWRENA 7, 0 +#define C0_BADVADDR 8, 0 +#define C0_ENTRYHI 10, 0 +#define C0_STATUS 12, 0 +#define C0_CAUSE 13, 0 +#define C0_EPC 14, 0 +#define C0_EBASE 15, 1 +#define C0_CONFIG5 16, 5 +#define C0_DDATA_LO 28, 3 +#define C0_ERROREPC 30, 0 + +#define CALLFRAME_SIZ 32 + +static unsigned int scratch_vcpu[2] = { C0_DDATA_LO }; +static unsigned int scratch_tmp[2] = { C0_ERROREPC }; + +enum label_id { + label_fpu_1 = 1, + label_msa_1, + label_return_to_host, + label_kernel_asid, + label_exit_common, +}; + +UASM_L_LA(_fpu_1) +UASM_L_LA(_msa_1) +UASM_L_LA(_return_to_host) +UASM_L_LA(_kernel_asid) +UASM_L_LA(_exit_common) + +static void *kvm_mips_build_enter_guest(void *addr); +static void *kvm_mips_build_ret_from_exit(void *addr); +static void *kvm_mips_build_ret_to_guest(void *addr); +static void *kvm_mips_build_ret_to_host(void *addr); + +/** + * kvm_mips_entry_setup() - Perform global setup for entry code. + * + * Perform global setup for entry code, such as choosing a scratch register. + * + * Returns: 0 on success. + * -errno on failure. + */ +int kvm_mips_entry_setup(void) +{ + /* + * We prefer to use KScratchN registers if they are available over the + * defaults above, which may not work on all cores. + */ + unsigned int kscratch_mask = cpu_data[0].kscratch_mask & 0xfc; + + /* Pick a scratch register for storing VCPU */ + if (kscratch_mask) { + scratch_vcpu[0] = 31; + scratch_vcpu[1] = ffs(kscratch_mask) - 1; + kscratch_mask &= ~BIT(scratch_vcpu[1]); + } + + /* Pick a scratch register to use as a temp for saving state */ + if (kscratch_mask) { + scratch_tmp[0] = 31; + scratch_tmp[1] = ffs(kscratch_mask) - 1; + kscratch_mask &= ~BIT(scratch_tmp[1]); + } + + return 0; +} + +static void kvm_mips_build_save_scratch(u32 **p, unsigned int tmp, + unsigned int frame) +{ + /* Save the VCPU scratch register value in cp0_epc of the stack frame */ + uasm_i_mfc0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]); + UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame); + + /* Save the temp scratch register value in cp0_cause of stack frame */ + if (scratch_tmp[0] == 31) { + uasm_i_mfc0(p, tmp, scratch_tmp[0], scratch_tmp[1]); + UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame); + } +} + +static void kvm_mips_build_restore_scratch(u32 **p, unsigned int tmp, + unsigned int frame) +{ + /* + * Restore host scratch register values saved by + * kvm_mips_build_save_scratch(). + */ + UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame); + uasm_i_mtc0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]); + + if (scratch_tmp[0] == 31) { + UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame); + uasm_i_mtc0(p, tmp, scratch_tmp[0], scratch_tmp[1]); + } +} + +/** + * kvm_mips_build_vcpu_run() - Assemble function to start running a guest VCPU. + * @addr: Address to start writing code. 
+ * + * Assemble the start of the vcpu_run function to run a guest VCPU. The function + * conforms to the following prototype: + * + * int vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu); + * + * The exit from the guest and return to the caller is handled by the code + * generated by kvm_mips_build_ret_to_host(). + * + * Returns: Next address after end of written function. + */ +void *kvm_mips_build_vcpu_run(void *addr) +{ + u32 *p = addr; + unsigned int i; + + /* + * A0: run + * A1: vcpu + */ + + /* k0/k1 not being used in host kernel context */ + uasm_i_addiu(&p, K1, SP, -(int)sizeof(struct pt_regs)); + for (i = 16; i < 32; ++i) { + if (i == 24) + i = 28; + UASM_i_SW(&p, i, offsetof(struct pt_regs, regs[i]), K1); + } + + /* Save host status */ + uasm_i_mfc0(&p, V0, C0_STATUS); + UASM_i_SW(&p, V0, offsetof(struct pt_regs, cp0_status), K1); + + /* Save scratch registers, will be used to store pointer to vcpu etc */ + kvm_mips_build_save_scratch(&p, V1, K1); + + /* VCPU scratch register has pointer to vcpu */ + uasm_i_mtc0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]); + + /* Offset into vcpu->arch */ + uasm_i_addiu(&p, K1, A1, offsetof(struct kvm_vcpu, arch)); + + /* + * Save the host stack to VCPU, used for exception processing + * when we exit from the Guest + */ + UASM_i_SW(&p, SP, offsetof(struct kvm_vcpu_arch, host_stack), K1); + + /* Save the kernel gp as well */ + UASM_i_SW(&p, GP, offsetof(struct kvm_vcpu_arch, host_gp), K1); + + /* + * Setup status register for running the guest in UM, interrupts + * are disabled + */ + UASM_i_LA(&p, K0, ST0_EXL | KSU_USER | ST0_BEV); + uasm_i_mtc0(&p, K0, C0_STATUS); + uasm_i_ehb(&p); + + /* load up the new EBASE */ + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1); + uasm_i_mtc0(&p, K0, C0_EBASE); + + /* + * Now that the new EBASE has been loaded, unset BEV, set + * interrupt mask as it was but make sure that timer interrupts + * are enabled + */ + uasm_i_addiu(&p, K0, ZERO, ST0_EXL | KSU_USER | ST0_IE); + uasm_i_andi(&p, V0, V0, ST0_IM); + uasm_i_or(&p, K0, K0, V0); + uasm_i_mtc0(&p, K0, C0_STATUS); + uasm_i_ehb(&p); + + p = kvm_mips_build_enter_guest(p); + + return p; +} + +/** + * kvm_mips_build_enter_guest() - Assemble code to resume guest execution. + * @addr: Address to start writing code. + * + * Assemble the code to resume guest execution. This code is common between the + * initial entry into the guest from the host, and returning from the exit + * handler back to the guest. + * + * Returns: Next address after end of written function. 
+ */ +static void *kvm_mips_build_enter_guest(void *addr) +{ + u32 *p = addr; + unsigned int i; + struct uasm_label labels[2]; + struct uasm_reloc relocs[2]; + struct uasm_label *l = labels; + struct uasm_reloc *r = relocs; + + memset(labels, 0, sizeof(labels)); + memset(relocs, 0, sizeof(relocs)); + + /* Set Guest EPC */ + UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, pc), K1); + uasm_i_mtc0(&p, T0, C0_EPC); + + /* Set the ASID for the Guest Kernel */ + UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, cop0), K1); + UASM_i_LW(&p, T0, offsetof(struct mips_coproc, reg[MIPS_CP0_STATUS][0]), + T0); + uasm_i_andi(&p, T0, T0, KSU_USER | ST0_ERL | ST0_EXL); + uasm_i_xori(&p, T0, T0, KSU_USER); + uasm_il_bnez(&p, &r, T0, label_kernel_asid); + uasm_i_addiu(&p, T1, K1, + offsetof(struct kvm_vcpu_arch, guest_kernel_asid)); + /* else user */ + uasm_i_addiu(&p, T1, K1, + offsetof(struct kvm_vcpu_arch, guest_user_asid)); + uasm_l_kernel_asid(&l, p); + + /* t1: contains the base of the ASID array, need to get the cpu id */ + /* smp_processor_id */ + UASM_i_LW(&p, T2, offsetof(struct thread_info, cpu), GP); + /* x4 */ + uasm_i_sll(&p, T2, T2, 2); + UASM_i_ADDU(&p, T3, T1, T2); + UASM_i_LW(&p, K0, 0, T3); +#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE + /* x sizeof(struct cpuinfo_mips)/4 */ + uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/4); + uasm_i_mul(&p, T2, T2, T3); + + UASM_i_LA_mostly(&p, AT, (long)&cpu_data[0].asid_mask); + UASM_i_ADDU(&p, AT, AT, T2); + UASM_i_LW(&p, T2, uasm_rel_lo((long)&cpu_data[0].asid_mask), AT); + uasm_i_and(&p, K0, K0, T2); +#else + uasm_i_andi(&p, K0, K0, MIPS_ENTRYHI_ASID); +#endif + uasm_i_mtc0(&p, K0, C0_ENTRYHI); + uasm_i_ehb(&p); + + /* Disable RDHWR access */ + uasm_i_mtc0(&p, ZERO, C0_HWRENA); + + /* load the guest context from VCPU and return */ + for (i = 1; i < 32; ++i) { + /* Guest k0/k1 loaded later */ + if (i == K0 || i == K1) + continue; + UASM_i_LW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1); + } + +#ifndef CONFIG_CPU_MIPSR6 + /* Restore hi/lo */ + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, hi), K1); + uasm_i_mthi(&p, K0); + + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, lo), K1); + uasm_i_mtlo(&p, K0); +#endif + + /* Restore the guest's k0/k1 registers */ + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1); + UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1); + + /* Jump to guest */ + uasm_i_eret(&p); + + uasm_resolve_relocs(relocs, labels); + + return p; +} + +/** + * kvm_mips_build_exception() - Assemble first level guest exception handler. + * @addr: Address to start writing code. + * @handler: Address of common handler (within range of @addr). + * + * Assemble exception vector code for guest execution. The generated vector will + * branch to the common exception handler generated by kvm_mips_build_exit(). + * + * Returns: Next address after end of written function. 
+ */ +void *kvm_mips_build_exception(void *addr, void *handler) +{ + u32 *p = addr; + struct uasm_label labels[2]; + struct uasm_reloc relocs[2]; + struct uasm_label *l = labels; + struct uasm_reloc *r = relocs; + + memset(labels, 0, sizeof(labels)); + memset(relocs, 0, sizeof(relocs)); + + /* Save guest k1 into scratch register */ + uasm_i_mtc0(&p, K1, scratch_tmp[0], scratch_tmp[1]); + + /* Get the VCPU pointer from the VCPU scratch register */ + uasm_i_mfc0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]); + uasm_i_addiu(&p, K1, K1, offsetof(struct kvm_vcpu, arch)); + + /* Save guest k0 into VCPU structure */ + UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1); + + /* Branch to the common handler */ + uasm_il_b(&p, &r, label_exit_common); + uasm_i_nop(&p); + + uasm_l_exit_common(&l, handler); + uasm_resolve_relocs(relocs, labels); + + return p; +} + +/** + * kvm_mips_build_exit() - Assemble common guest exit handler. + * @addr: Address to start writing code. + * + * Assemble the generic guest exit handling code. This is called by the + * exception vectors (generated by kvm_mips_build_exception()), and calls + * kvm_mips_handle_exit(), then either resumes the guest or returns to the host + * depending on the return value. + * + * Returns: Next address after end of written function. + */ +void *kvm_mips_build_exit(void *addr) +{ + u32 *p = addr; + unsigned int i; + struct uasm_label labels[3]; + struct uasm_reloc relocs[3]; + struct uasm_label *l = labels; + struct uasm_reloc *r = relocs; + + memset(labels, 0, sizeof(labels)); + memset(relocs, 0, sizeof(relocs)); + + /* + * Generic Guest exception handler. We end up here when the guest + * does something that causes a trap to kernel mode. + * + * Both k0/k1 registers will have already been saved (k0 into the vcpu + * structure, and k1 into the scratch_tmp register). + * + * The k1 register will already contain the kvm_vcpu_arch pointer. 
+ */ + + /* Start saving Guest context to VCPU */ + for (i = 0; i < 32; ++i) { + /* Guest k0/k1 saved later */ + if (i == K0 || i == K1) + continue; + UASM_i_SW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1); + } + +#ifndef CONFIG_CPU_MIPSR6 + /* We need to save hi/lo and restore them on the way out */ + uasm_i_mfhi(&p, T0); + UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, hi), K1); + + uasm_i_mflo(&p, T0); + UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, lo), K1); +#endif + + /* Finally save guest k1 to VCPU */ + uasm_i_ehb(&p); + uasm_i_mfc0(&p, T0, scratch_tmp[0], scratch_tmp[1]); + UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1); + + /* Now that context has been saved, we can use other registers */ + + /* Restore vcpu */ + uasm_i_mfc0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]); + uasm_i_move(&p, S1, A1); + + /* Restore run (vcpu->run) */ + UASM_i_LW(&p, A0, offsetof(struct kvm_vcpu, run), A1); + /* Save pointer to run in s0, will be saved by the compiler */ + uasm_i_move(&p, S0, A0); + + /* + * Save Host level EPC, BadVaddr and Cause to VCPU, useful to process + * the exception + */ + uasm_i_mfc0(&p, K0, C0_EPC); + UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, pc), K1); + + uasm_i_mfc0(&p, K0, C0_BADVADDR); + UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_badvaddr), + K1); + + uasm_i_mfc0(&p, K0, C0_CAUSE); + uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_cause), K1); + + /* Now restore the host state just enough to run the handlers */ + + /* Switch EBASE to the one used by Linux */ + /* load up the host EBASE */ + uasm_i_mfc0(&p, V0, C0_STATUS); + + uasm_i_lui(&p, AT, ST0_BEV >> 16); + uasm_i_or(&p, K0, V0, AT); + + uasm_i_mtc0(&p, K0, C0_STATUS); + uasm_i_ehb(&p); + + UASM_i_LA_mostly(&p, K0, (long)&ebase); + UASM_i_LW(&p, K0, uasm_rel_lo((long)&ebase), K0); + uasm_i_mtc0(&p, K0, C0_EBASE); + + if (raw_cpu_has_fpu) { + /* + * If FPU is enabled, save FCR31 and clear it so that later + * ctc1's don't trigger FPE for pending exceptions. + */ + uasm_i_lui(&p, AT, ST0_CU1 >> 16); + uasm_i_and(&p, V1, V0, AT); + uasm_il_beqz(&p, &r, V1, label_fpu_1); + uasm_i_nop(&p); + uasm_i_cfc1(&p, T0, 31); + uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.fcr31), + K1); + uasm_i_ctc1(&p, ZERO, 31); + uasm_l_fpu_1(&l, p); + } + + if (cpu_has_msa) { + /* + * If MSA is enabled, save MSACSR and clear it so that later + * instructions don't trigger MSAFPE for pending exceptions. 
+ */ + uasm_i_mfc0(&p, T0, C0_CONFIG5); + uasm_i_ext(&p, T0, T0, 27, 1); /* MIPS_CONF5_MSAEN */ + uasm_il_beqz(&p, &r, T0, label_msa_1); + uasm_i_nop(&p); + uasm_i_cfcmsa(&p, T0, MSA_CSR); + uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.msacsr), + K1); + uasm_i_ctcmsa(&p, MSA_CSR, ZERO); + uasm_l_msa_1(&l, p); + } + + /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */ + uasm_i_addiu(&p, AT, ZERO, ~(ST0_EXL | KSU_USER | ST0_IE)); + uasm_i_and(&p, V0, V0, AT); + uasm_i_lui(&p, AT, ST0_CU0 >> 16); + uasm_i_or(&p, V0, V0, AT); + uasm_i_mtc0(&p, V0, C0_STATUS); + uasm_i_ehb(&p); + + /* Load up host GP */ + UASM_i_LW(&p, GP, offsetof(struct kvm_vcpu_arch, host_gp), K1); + + /* Need a stack before we can jump to "C" */ + UASM_i_LW(&p, SP, offsetof(struct kvm_vcpu_arch, host_stack), K1); + + /* Saved host state */ + uasm_i_addiu(&p, SP, SP, -(int)sizeof(struct pt_regs)); + + /* + * XXXKYMA do we need to load the host ASID, maybe not because the + * kernel entries are marked GLOBAL, need to verify + */ + + /* Restore host scratch registers, as we'll have clobbered them */ + kvm_mips_build_restore_scratch(&p, K0, SP); + + /* Restore RDHWR access */ + UASM_i_LA_mostly(&p, K0, (long)&hwrena); + uasm_i_lw(&p, K0, uasm_rel_lo((long)&hwrena), K0); + uasm_i_mtc0(&p, K0, C0_HWRENA); + + /* Jump to handler */ + /* + * XXXKYMA: not sure if this is safe, how large is the stack?? + * Now jump to the kvm_mips_handle_exit() to see if we can deal + * with this in the kernel + */ + UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit); + uasm_i_jalr(&p, RA, T9); + uasm_i_addiu(&p, SP, SP, -CALLFRAME_SIZ); + + uasm_resolve_relocs(relocs, labels); + + p = kvm_mips_build_ret_from_exit(p); + + return p; +} + +/** + * kvm_mips_build_ret_from_exit() - Assemble guest exit return handler. + * @addr: Address to start writing code. + * + * Assemble the code to handle the return from kvm_mips_handle_exit(), either + * resuming the guest or returning to the host depending on the return value. + * + * Returns: Next address after end of written function. + */ +static void *kvm_mips_build_ret_from_exit(void *addr) +{ + u32 *p = addr; + struct uasm_label labels[2]; + struct uasm_reloc relocs[2]; + struct uasm_label *l = labels; + struct uasm_reloc *r = relocs; + + memset(labels, 0, sizeof(labels)); + memset(relocs, 0, sizeof(relocs)); + + /* Return from handler Make sure interrupts are disabled */ + uasm_i_di(&p, ZERO); + uasm_i_ehb(&p); + + /* + * XXXKYMA: k0/k1 could have been blown away if we processed + * an exception while we were handling the exception from the + * guest, reload k1 + */ + + uasm_i_move(&p, K1, S1); + uasm_i_addiu(&p, K1, K1, offsetof(struct kvm_vcpu, arch)); + + /* + * Check return value, should tell us if we are returning to the + * host (handle I/O etc)or resuming the guest + */ + uasm_i_andi(&p, T0, V0, RESUME_HOST); + uasm_il_bnez(&p, &r, T0, label_return_to_host); + uasm_i_nop(&p); + + p = kvm_mips_build_ret_to_guest(p); + + uasm_l_return_to_host(&l, p); + p = kvm_mips_build_ret_to_host(p); + + uasm_resolve_relocs(relocs, labels); + + return p; +} + +/** + * kvm_mips_build_ret_to_guest() - Assemble code to return to the guest. + * @addr: Address to start writing code. + * + * Assemble the code to handle return from the guest exit handler + * (kvm_mips_handle_exit()) back to the guest. + * + * Returns: Next address after end of written function. 
+ */ +static void *kvm_mips_build_ret_to_guest(void *addr) +{ + u32 *p = addr; + + /* Put the saved pointer to vcpu (s1) back into the scratch register */ + uasm_i_mtc0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]); + + /* Load up the Guest EBASE to minimize the window where BEV is set */ + UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1); + + /* Switch EBASE back to the one used by KVM */ + uasm_i_mfc0(&p, V1, C0_STATUS); + uasm_i_lui(&p, AT, ST0_BEV >> 16); + uasm_i_or(&p, K0, V1, AT); + uasm_i_mtc0(&p, K0, C0_STATUS); + uasm_i_ehb(&p); + uasm_i_mtc0(&p, T0, C0_EBASE); + + /* Setup status register for running guest in UM */ + uasm_i_ori(&p, V1, V1, ST0_EXL | KSU_USER | ST0_IE); + UASM_i_LA(&p, AT, ~(ST0_CU0 | ST0_MX)); + uasm_i_and(&p, V1, V1, AT); + uasm_i_mtc0(&p, V1, C0_STATUS); + uasm_i_ehb(&p); + + p = kvm_mips_build_enter_guest(p); + + return p; +} + +/** + * kvm_mips_build_ret_to_host() - Assemble code to return to the host. + * @addr: Address to start writing code. + * + * Assemble the code to handle return from the guest exit handler + * (kvm_mips_handle_exit()) back to the host, i.e. to the caller of the vcpu_run + * function generated by kvm_mips_build_vcpu_run(). + * + * Returns: Next address after end of written function. + */ +static void *kvm_mips_build_ret_to_host(void *addr) +{ + u32 *p = addr; + unsigned int i; + + /* EBASE is already pointing to Linux */ + UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, host_stack), K1); + uasm_i_addiu(&p, K1, K1, -(int)sizeof(struct pt_regs)); + + /* + * r2/v0 is the return code, shift it down by 2 (arithmetic) + * to recover the err code + */ + uasm_i_sra(&p, K0, V0, 2); + uasm_i_move(&p, V0, K0); + + /* Load context saved on the host stack */ + for (i = 16; i < 31; ++i) { + if (i == 24) + i = 28; + UASM_i_LW(&p, i, offsetof(struct pt_regs, regs[i]), K1); + } + + /* Restore RDHWR access */ + UASM_i_LA_mostly(&p, K0, (long)&hwrena); + uasm_i_lw(&p, K0, uasm_rel_lo((long)&hwrena), K0); + uasm_i_mtc0(&p, K0, C0_HWRENA); + + /* Restore RA, which is the address we will return to */ + UASM_i_LW(&p, RA, offsetof(struct pt_regs, regs[RA]), K1); + uasm_i_jr(&p, RA); + uasm_i_nop(&p); + + return p; +} + diff --git a/arch/mips/kvm/fpu.S b/arch/mips/kvm/fpu.S index 531fbf5131c0..16f17c6390dd 100644 --- a/arch/mips/kvm/fpu.S +++ b/arch/mips/kvm/fpu.S @@ -14,13 +14,16 @@ #include <asm/mipsregs.h> #include <asm/regdef.h> +/* preprocessor replaces the fp in ".set fp=64" with $30 otherwise */ +#undef fp + .set noreorder .set noat LEAF(__kvm_save_fpu) .set push - .set mips64r2 SET_HARDFLOAT + .set fp=64 mfc0 t0, CP0_STATUS sll t0, t0, 5 # is Status.FR set? bgez t0, 1f # no: skip odd doubles @@ -63,8 +66,8 @@ LEAF(__kvm_save_fpu) LEAF(__kvm_restore_fpu) .set push - .set mips64r2 SET_HARDFLOAT + .set fp=64 mfc0 t0, CP0_STATUS sll t0, t0, 5 # is Status.FR set? 
bgez t0, 1f # no: skip odd doubles diff --git a/arch/mips/kvm/interrupt.h b/arch/mips/kvm/interrupt.h index d661c100b219..fb118a2c8379 100644 --- a/arch/mips/kvm/interrupt.h +++ b/arch/mips/kvm/interrupt.h @@ -28,10 +28,6 @@ #define MIPS_EXC_MAX 12 /* XXXSL More to follow */ -extern char __kvm_mips_vcpu_run_end[]; -extern char mips32_exception[], mips32_exceptionEnd[]; -extern char mips32_GuestException[], mips32_GuestExceptionEnd[]; - #define C_TI (_ULCAST_(1) << 30) #define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (0) diff --git a/arch/mips/kvm/locore.S b/arch/mips/kvm/locore.S deleted file mode 100644 index 698286c0f732..000000000000 --- a/arch/mips/kvm/locore.S +++ /dev/null @@ -1,602 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Main entry point for the guest, exception handling. - * - * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. - * Authors: Sanjay Lal <[email protected]> - */ - -#include <asm/asm.h> -#include <asm/asmmacro.h> -#include <asm/regdef.h> -#include <asm/mipsregs.h> -#include <asm/stackframe.h> -#include <asm/asm-offsets.h> - -#define _C_LABEL(x) x -#define MIPSX(name) mips32_ ## name -#define CALLFRAME_SIZ 32 - -/* - * VECTOR - * exception vector entrypoint - */ -#define VECTOR(x, regmask) \ - .ent _C_LABEL(x),0; \ - EXPORT(x); - -#define VECTOR_END(x) \ - EXPORT(x); - -/* Overload, Danger Will Robinson!! */ -#define PT_HOST_USERLOCAL PT_EPC - -#define CP0_DDATA_LO $28,3 - -/* Resume Flags */ -#define RESUME_FLAG_HOST (1<<1) /* Resume host? */ - -#define RESUME_GUEST 0 -#define RESUME_HOST RESUME_FLAG_HOST - -/* - * __kvm_mips_vcpu_run: entry point to the guest - * a0: run - * a1: vcpu - */ - .set noreorder - -FEXPORT(__kvm_mips_vcpu_run) - /* k0/k1 not being used in host kernel context */ - INT_ADDIU k1, sp, -PT_SIZE - LONG_S $16, PT_R16(k1) - LONG_S $17, PT_R17(k1) - LONG_S $18, PT_R18(k1) - LONG_S $19, PT_R19(k1) - LONG_S $20, PT_R20(k1) - LONG_S $21, PT_R21(k1) - LONG_S $22, PT_R22(k1) - LONG_S $23, PT_R23(k1) - - LONG_S $28, PT_R28(k1) - LONG_S $29, PT_R29(k1) - LONG_S $30, PT_R30(k1) - LONG_S $31, PT_R31(k1) - - /* Save hi/lo */ - mflo v0 - LONG_S v0, PT_LO(k1) - mfhi v1 - LONG_S v1, PT_HI(k1) - - /* Save host status */ - mfc0 v0, CP0_STATUS - LONG_S v0, PT_STATUS(k1) - - /* Save DDATA_LO, will be used to store pointer to vcpu */ - mfc0 v1, CP0_DDATA_LO - LONG_S v1, PT_HOST_USERLOCAL(k1) - - /* DDATA_LO has pointer to vcpu */ - mtc0 a1, CP0_DDATA_LO - - /* Offset into vcpu->arch */ - INT_ADDIU k1, a1, VCPU_HOST_ARCH - - /* - * Save the host stack to VCPU, used for exception processing - * when we exit from the Guest - */ - LONG_S sp, VCPU_HOST_STACK(k1) - - /* Save the kernel gp as well */ - LONG_S gp, VCPU_HOST_GP(k1) - - /* - * Setup status register for running the guest in UM, interrupts - * are disabled - */ - li k0, (ST0_EXL | KSU_USER | ST0_BEV) - mtc0 k0, CP0_STATUS - ehb - - /* load up the new EBASE */ - LONG_L k0, VCPU_GUEST_EBASE(k1) - mtc0 k0, CP0_EBASE - - /* - * Now that the new EBASE has been loaded, unset BEV, set - * interrupt mask as it was but make sure that timer interrupts - * are enabled - */ - li k0, (ST0_EXL | KSU_USER | ST0_IE) - andi v0, v0, ST0_IM - or k0, k0, v0 - mtc0 k0, CP0_STATUS - ehb - - /* Set Guest EPC */ - LONG_L t0, VCPU_PC(k1) - mtc0 t0, CP0_EPC - -FEXPORT(__kvm_mips_load_asid) - /* Set the ASID for the Guest Kernel */ - PTR_L t0, VCPU_COP0(k1) - LONG_L t0, 
COP0_STATUS(t0) - andi t0, KSU_USER | ST0_ERL | ST0_EXL - xori t0, KSU_USER - bnez t0, 1f /* If kernel */ - INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */ - INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID /* else user */ -1: - /* t1: contains the base of the ASID array, need to get the cpu id */ - LONG_L t2, TI_CPU($28) /* smp_processor_id */ - INT_SLL t2, t2, 2 /* x4 */ - REG_ADDU t3, t1, t2 - LONG_L k0, (t3) -#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE - li t3, CPUINFO_SIZE/4 - mul t2, t2, t3 /* x sizeof(struct cpuinfo_mips)/4 */ - LONG_L t2, (cpu_data + CPUINFO_ASID_MASK)(t2) - and k0, k0, t2 -#else - andi k0, k0, MIPS_ENTRYHI_ASID -#endif - mtc0 k0, CP0_ENTRYHI - ehb - - /* Disable RDHWR access */ - mtc0 zero, CP0_HWRENA - - .set noat - /* Now load up the Guest Context from VCPU */ - LONG_L $1, VCPU_R1(k1) - LONG_L $2, VCPU_R2(k1) - LONG_L $3, VCPU_R3(k1) - - LONG_L $4, VCPU_R4(k1) - LONG_L $5, VCPU_R5(k1) - LONG_L $6, VCPU_R6(k1) - LONG_L $7, VCPU_R7(k1) - - LONG_L $8, VCPU_R8(k1) - LONG_L $9, VCPU_R9(k1) - LONG_L $10, VCPU_R10(k1) - LONG_L $11, VCPU_R11(k1) - LONG_L $12, VCPU_R12(k1) - LONG_L $13, VCPU_R13(k1) - LONG_L $14, VCPU_R14(k1) - LONG_L $15, VCPU_R15(k1) - LONG_L $16, VCPU_R16(k1) - LONG_L $17, VCPU_R17(k1) - LONG_L $18, VCPU_R18(k1) - LONG_L $19, VCPU_R19(k1) - LONG_L $20, VCPU_R20(k1) - LONG_L $21, VCPU_R21(k1) - LONG_L $22, VCPU_R22(k1) - LONG_L $23, VCPU_R23(k1) - LONG_L $24, VCPU_R24(k1) - LONG_L $25, VCPU_R25(k1) - - /* k0/k1 loaded up later */ - - LONG_L $28, VCPU_R28(k1) - LONG_L $29, VCPU_R29(k1) - LONG_L $30, VCPU_R30(k1) - LONG_L $31, VCPU_R31(k1) - - /* Restore hi/lo */ - LONG_L k0, VCPU_LO(k1) - mtlo k0 - - LONG_L k0, VCPU_HI(k1) - mthi k0 - -FEXPORT(__kvm_mips_load_k0k1) - /* Restore the guest's k0/k1 registers */ - LONG_L k0, VCPU_R26(k1) - LONG_L k1, VCPU_R27(k1) - - /* Jump to guest */ - eret -EXPORT(__kvm_mips_vcpu_run_end) - -VECTOR(MIPSX(exception), unknown) -/* Find out what mode we came from and jump to the proper handler. */ - mtc0 k0, CP0_ERROREPC #01: Save guest k0 - ehb #02: - - mfc0 k0, CP0_EBASE #02: Get EBASE - INT_SRL k0, k0, 10 #03: Get rid of CPUNum - INT_SLL k0, k0, 10 #04 - LONG_S k1, 0x3000(k0) #05: Save k1 @ offset 0x3000 - INT_ADDIU k0, k0, 0x2000 #06: Exception handler is - # installed @ offset 0x2000 - j k0 #07: jump to the function - nop #08: branch delay slot -VECTOR_END(MIPSX(exceptionEnd)) -.end MIPSX(exception) - -/* - * Generic Guest exception handler. We end up here when the guest - * does something that causes a trap to kernel mode. 
- */ -NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra) - /* Get the VCPU pointer from DDTATA_LO */ - mfc0 k1, CP0_DDATA_LO - INT_ADDIU k1, k1, VCPU_HOST_ARCH - - /* Start saving Guest context to VCPU */ - LONG_S $0, VCPU_R0(k1) - LONG_S $1, VCPU_R1(k1) - LONG_S $2, VCPU_R2(k1) - LONG_S $3, VCPU_R3(k1) - LONG_S $4, VCPU_R4(k1) - LONG_S $5, VCPU_R5(k1) - LONG_S $6, VCPU_R6(k1) - LONG_S $7, VCPU_R7(k1) - LONG_S $8, VCPU_R8(k1) - LONG_S $9, VCPU_R9(k1) - LONG_S $10, VCPU_R10(k1) - LONG_S $11, VCPU_R11(k1) - LONG_S $12, VCPU_R12(k1) - LONG_S $13, VCPU_R13(k1) - LONG_S $14, VCPU_R14(k1) - LONG_S $15, VCPU_R15(k1) - LONG_S $16, VCPU_R16(k1) - LONG_S $17, VCPU_R17(k1) - LONG_S $18, VCPU_R18(k1) - LONG_S $19, VCPU_R19(k1) - LONG_S $20, VCPU_R20(k1) - LONG_S $21, VCPU_R21(k1) - LONG_S $22, VCPU_R22(k1) - LONG_S $23, VCPU_R23(k1) - LONG_S $24, VCPU_R24(k1) - LONG_S $25, VCPU_R25(k1) - - /* Guest k0/k1 saved later */ - - LONG_S $28, VCPU_R28(k1) - LONG_S $29, VCPU_R29(k1) - LONG_S $30, VCPU_R30(k1) - LONG_S $31, VCPU_R31(k1) - - .set at - - /* We need to save hi/lo and restore them on the way out */ - mfhi t0 - LONG_S t0, VCPU_HI(k1) - - mflo t0 - LONG_S t0, VCPU_LO(k1) - - /* Finally save guest k0/k1 to VCPU */ - mfc0 t0, CP0_ERROREPC - LONG_S t0, VCPU_R26(k1) - - /* Get GUEST k1 and save it in VCPU */ - PTR_LI t1, ~0x2ff - mfc0 t0, CP0_EBASE - and t0, t0, t1 - LONG_L t0, 0x3000(t0) - LONG_S t0, VCPU_R27(k1) - - /* Now that context has been saved, we can use other registers */ - - /* Restore vcpu */ - mfc0 a1, CP0_DDATA_LO - move s1, a1 - - /* Restore run (vcpu->run) */ - LONG_L a0, VCPU_RUN(a1) - /* Save pointer to run in s0, will be saved by the compiler */ - move s0, a0 - - /* - * Save Host level EPC, BadVaddr and Cause to VCPU, useful to - * process the exception - */ - mfc0 k0,CP0_EPC - LONG_S k0, VCPU_PC(k1) - - mfc0 k0, CP0_BADVADDR - LONG_S k0, VCPU_HOST_CP0_BADVADDR(k1) - - mfc0 k0, CP0_CAUSE - sw k0, VCPU_HOST_CP0_CAUSE(k1) - - /* Now restore the host state just enough to run the handlers */ - - /* Switch EBASE to the one used by Linux */ - /* load up the host EBASE */ - mfc0 v0, CP0_STATUS - - or k0, v0, ST0_BEV - - mtc0 k0, CP0_STATUS - ehb - - LONG_L k0, ebase - mtc0 k0,CP0_EBASE - - /* - * If FPU is enabled, save FCR31 and clear it so that later ctc1's don't - * trigger FPE for pending exceptions. - */ - and v1, v0, ST0_CU1 - beqz v1, 1f - nop - .set push - SET_HARDFLOAT - cfc1 t0, fcr31 - sw t0, VCPU_FCR31(k1) - ctc1 zero,fcr31 - .set pop -1: - -#ifdef CONFIG_CPU_HAS_MSA - /* - * If MSA is enabled, save MSACSR and clear it so that later - * instructions don't trigger MSAFPE for pending exceptions. 
- */ - mfc0 t0, CP0_CONFIG3 - ext t0, t0, 28, 1 /* MIPS_CONF3_MSAP */ - beqz t0, 1f - nop - mfc0 t0, CP0_CONFIG5 - ext t0, t0, 27, 1 /* MIPS_CONF5_MSAEN */ - beqz t0, 1f - nop - _cfcmsa t0, MSA_CSR - sw t0, VCPU_MSA_CSR(k1) - _ctcmsa MSA_CSR, zero -1: -#endif - - /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */ - and v0, v0, ~(ST0_EXL | KSU_USER | ST0_IE) - or v0, v0, ST0_CU0 - mtc0 v0, CP0_STATUS - ehb - - /* Load up host GP */ - LONG_L gp, VCPU_HOST_GP(k1) - - /* Need a stack before we can jump to "C" */ - LONG_L sp, VCPU_HOST_STACK(k1) - - /* Saved host state */ - INT_ADDIU sp, sp, -PT_SIZE - - /* - * XXXKYMA do we need to load the host ASID, maybe not because the - * kernel entries are marked GLOBAL, need to verify - */ - - /* Restore host DDATA_LO */ - LONG_L k0, PT_HOST_USERLOCAL(sp) - mtc0 k0, CP0_DDATA_LO - - /* Restore RDHWR access */ - INT_L k0, hwrena - mtc0 k0, CP0_HWRENA - - /* Jump to handler */ -FEXPORT(__kvm_mips_jump_to_handler) - /* - * XXXKYMA: not sure if this is safe, how large is the stack?? - * Now jump to the kvm_mips_handle_exit() to see if we can deal - * with this in the kernel - */ - PTR_LA t9, kvm_mips_handle_exit - jalr.hb t9 - INT_ADDIU sp, sp, -CALLFRAME_SIZ /* BD Slot */ - - /* Return from handler Make sure interrupts are disabled */ - di - ehb - - /* - * XXXKYMA: k0/k1 could have been blown away if we processed - * an exception while we were handling the exception from the - * guest, reload k1 - */ - - move k1, s1 - INT_ADDIU k1, k1, VCPU_HOST_ARCH - - /* - * Check return value, should tell us if we are returning to the - * host (handle I/O etc)or resuming the guest - */ - andi t0, v0, RESUME_HOST - bnez t0, __kvm_mips_return_to_host - nop - -__kvm_mips_return_to_guest: - /* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */ - mtc0 s1, CP0_DDATA_LO - - /* Load up the Guest EBASE to minimize the window where BEV is set */ - LONG_L t0, VCPU_GUEST_EBASE(k1) - - /* Switch EBASE back to the one used by KVM */ - mfc0 v1, CP0_STATUS - or k0, v1, ST0_BEV - mtc0 k0, CP0_STATUS - ehb - mtc0 t0, CP0_EBASE - - /* Setup status register for running guest in UM */ - or v1, v1, (ST0_EXL | KSU_USER | ST0_IE) - and v1, v1, ~(ST0_CU0 | ST0_MX) - mtc0 v1, CP0_STATUS - ehb - - /* Set Guest EPC */ - LONG_L t0, VCPU_PC(k1) - mtc0 t0, CP0_EPC - - /* Set the ASID for the Guest Kernel */ - PTR_L t0, VCPU_COP0(k1) - LONG_L t0, COP0_STATUS(t0) - andi t0, KSU_USER | ST0_ERL | ST0_EXL - xori t0, KSU_USER - bnez t0, 1f /* If kernel */ - INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */ - INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID /* else user */ -1: - /* t1: contains the base of the ASID array, need to get the cpu id */ - LONG_L t2, TI_CPU($28) /* smp_processor_id */ - INT_SLL t2, t2, 2 /* x4 */ - REG_ADDU t3, t1, t2 - LONG_L k0, (t3) -#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE - li t3, CPUINFO_SIZE/4 - mul t2, t2, t3 /* x sizeof(struct cpuinfo_mips)/4 */ - LONG_L t2, (cpu_data + CPUINFO_ASID_MASK)(t2) - and k0, k0, t2 -#else - andi k0, k0, MIPS_ENTRYHI_ASID -#endif - mtc0 k0, CP0_ENTRYHI - ehb - - /* Disable RDHWR access */ - mtc0 zero, CP0_HWRENA - - .set noat - /* load the guest context from VCPU and return */ - LONG_L $0, VCPU_R0(k1) - LONG_L $1, VCPU_R1(k1) - LONG_L $2, VCPU_R2(k1) - LONG_L $3, VCPU_R3(k1) - LONG_L $4, VCPU_R4(k1) - LONG_L $5, VCPU_R5(k1) - LONG_L $6, VCPU_R6(k1) - LONG_L $7, VCPU_R7(k1) - LONG_L $8, VCPU_R8(k1) - LONG_L $9, VCPU_R9(k1) - LONG_L $10, VCPU_R10(k1) - LONG_L $11, VCPU_R11(k1) - LONG_L $12, VCPU_R12(k1) - LONG_L $13, 
VCPU_R13(k1) - LONG_L $14, VCPU_R14(k1) - LONG_L $15, VCPU_R15(k1) - LONG_L $16, VCPU_R16(k1) - LONG_L $17, VCPU_R17(k1) - LONG_L $18, VCPU_R18(k1) - LONG_L $19, VCPU_R19(k1) - LONG_L $20, VCPU_R20(k1) - LONG_L $21, VCPU_R21(k1) - LONG_L $22, VCPU_R22(k1) - LONG_L $23, VCPU_R23(k1) - LONG_L $24, VCPU_R24(k1) - LONG_L $25, VCPU_R25(k1) - - /* $/k1 loaded later */ - LONG_L $28, VCPU_R28(k1) - LONG_L $29, VCPU_R29(k1) - LONG_L $30, VCPU_R30(k1) - LONG_L $31, VCPU_R31(k1) - -FEXPORT(__kvm_mips_skip_guest_restore) - LONG_L k0, VCPU_HI(k1) - mthi k0 - - LONG_L k0, VCPU_LO(k1) - mtlo k0 - - LONG_L k0, VCPU_R26(k1) - LONG_L k1, VCPU_R27(k1) - - eret - .set at - -__kvm_mips_return_to_host: - /* EBASE is already pointing to Linux */ - LONG_L k1, VCPU_HOST_STACK(k1) - INT_ADDIU k1,k1, -PT_SIZE - - /* Restore host DDATA_LO */ - LONG_L k0, PT_HOST_USERLOCAL(k1) - mtc0 k0, CP0_DDATA_LO - - /* - * r2/v0 is the return code, shift it down by 2 (arithmetic) - * to recover the err code - */ - INT_SRA k0, v0, 2 - move $2, k0 - - /* Load context saved on the host stack */ - LONG_L $16, PT_R16(k1) - LONG_L $17, PT_R17(k1) - LONG_L $18, PT_R18(k1) - LONG_L $19, PT_R19(k1) - LONG_L $20, PT_R20(k1) - LONG_L $21, PT_R21(k1) - LONG_L $22, PT_R22(k1) - LONG_L $23, PT_R23(k1) - - LONG_L $28, PT_R28(k1) - LONG_L $29, PT_R29(k1) - LONG_L $30, PT_R30(k1) - - LONG_L k0, PT_HI(k1) - mthi k0 - - LONG_L k0, PT_LO(k1) - mtlo k0 - - /* Restore RDHWR access */ - INT_L k0, hwrena - mtc0 k0, CP0_HWRENA - - /* Restore RA, which is the address we will return to */ - LONG_L ra, PT_R31(k1) - j ra - nop - -VECTOR_END(MIPSX(GuestExceptionEnd)) -.end MIPSX(GuestException) - -MIPSX(exceptions): - #### - ##### The exception handlers. - ##### - .word _C_LABEL(MIPSX(GuestException)) # 0 - .word _C_LABEL(MIPSX(GuestException)) # 1 - .word _C_LABEL(MIPSX(GuestException)) # 2 - .word _C_LABEL(MIPSX(GuestException)) # 3 - .word _C_LABEL(MIPSX(GuestException)) # 4 - .word _C_LABEL(MIPSX(GuestException)) # 5 - .word _C_LABEL(MIPSX(GuestException)) # 6 - .word _C_LABEL(MIPSX(GuestException)) # 7 - .word _C_LABEL(MIPSX(GuestException)) # 8 - .word _C_LABEL(MIPSX(GuestException)) # 9 - .word _C_LABEL(MIPSX(GuestException)) # 10 - .word _C_LABEL(MIPSX(GuestException)) # 11 - .word _C_LABEL(MIPSX(GuestException)) # 12 - .word _C_LABEL(MIPSX(GuestException)) # 13 - .word _C_LABEL(MIPSX(GuestException)) # 14 - .word _C_LABEL(MIPSX(GuestException)) # 15 - .word _C_LABEL(MIPSX(GuestException)) # 16 - .word _C_LABEL(MIPSX(GuestException)) # 17 - .word _C_LABEL(MIPSX(GuestException)) # 18 - .word _C_LABEL(MIPSX(GuestException)) # 19 - .word _C_LABEL(MIPSX(GuestException)) # 20 - .word _C_LABEL(MIPSX(GuestException)) # 21 - .word _C_LABEL(MIPSX(GuestException)) # 22 - .word _C_LABEL(MIPSX(GuestException)) # 23 - .word _C_LABEL(MIPSX(GuestException)) # 24 - .word _C_LABEL(MIPSX(GuestException)) # 25 - .word _C_LABEL(MIPSX(GuestException)) # 26 - .word _C_LABEL(MIPSX(GuestException)) # 27 - .word _C_LABEL(MIPSX(GuestException)) # 28 - .word _C_LABEL(MIPSX(GuestException)) # 29 - .word _C_LABEL(MIPSX(GuestException)) # 30 - .word _C_LABEL(MIPSX(GuestException)) # 31 diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 5a2b9034a05c..414b00074e29 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -245,10 +245,27 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, } } +static inline void dump_handler(const char *symbol, void *start, void *end) +{ + u32 *p; + + pr_debug("LEAF(%s)\n", symbol); + + pr_debug("\t.set push\n"); + 
pr_debug("\t.set noreorder\n"); + + for (p = start; p < (u32 *)end; ++p) + pr_debug("\t.word\t0x%08x\t\t# %p\n", *p, p); + + pr_debug("\t.set\tpop\n"); + + pr_debug("\tEND(%s)\n", symbol); +} + struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { - int err, size, offset; - void *gebase; + int err, size; + void *gebase, *p, *handler; int i; struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); @@ -286,41 +303,38 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) /* Save new ebase */ vcpu->arch.guest_ebase = gebase; - /* Copy L1 Guest Exception handler to correct offset */ + /* Build guest exception vectors dynamically in unmapped memory */ + handler = gebase + 0x2000; /* TLB Refill, EXL = 0 */ - memcpy(gebase, mips32_exception, - mips32_exceptionEnd - mips32_exception); + kvm_mips_build_exception(gebase, handler); /* General Exception Entry point */ - memcpy(gebase + 0x180, mips32_exception, - mips32_exceptionEnd - mips32_exception); + kvm_mips_build_exception(gebase + 0x180, handler); /* For vectored interrupts poke the exception code @ all offsets 0-7 */ for (i = 0; i < 8; i++) { kvm_debug("L1 Vectored handler @ %p\n", gebase + 0x200 + (i * VECTORSPACING)); - memcpy(gebase + 0x200 + (i * VECTORSPACING), mips32_exception, - mips32_exceptionEnd - mips32_exception); + kvm_mips_build_exception(gebase + 0x200 + i * VECTORSPACING, + handler); } - /* General handler, relocate to unmapped space for sanity's sake */ - offset = 0x2000; - kvm_debug("Installing KVM Exception handlers @ %p, %#x bytes\n", - gebase + offset, - mips32_GuestExceptionEnd - mips32_GuestException); + /* General exit handler */ + p = handler; + p = kvm_mips_build_exit(p); - memcpy(gebase + offset, mips32_GuestException, - mips32_GuestExceptionEnd - mips32_GuestException); + /* Guest entry routine */ + vcpu->arch.vcpu_run = p; + p = kvm_mips_build_vcpu_run(p); -#ifdef MODULE - offset += mips32_GuestExceptionEnd - mips32_GuestException; - memcpy(gebase + offset, (char *)__kvm_mips_vcpu_run, - __kvm_mips_vcpu_run_end - (char *)__kvm_mips_vcpu_run); - vcpu->arch.vcpu_run = gebase + offset; -#else - vcpu->arch.vcpu_run = __kvm_mips_vcpu_run; -#endif + /* Dump the generated code */ + pr_debug("#include <asm/asm.h>\n"); + pr_debug("#include <asm/regdef.h>\n"); + pr_debug("\n"); + dump_handler("kvm_vcpu_run", vcpu->arch.vcpu_run, p); + dump_handler("kvm_gen_exc", gebase + 0x180, gebase + 0x200); + dump_handler("kvm_exit", gebase + 0x2000, vcpu->arch.vcpu_run); /* Invalidate the icache for these ranges */ local_flush_icache_range((unsigned long)gebase, @@ -406,7 +420,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_mips_deliver_interrupts(vcpu, kvm_read_c0_guest_cause(vcpu->arch.cop0)); - __kvm_guest_enter(); + guest_enter_irqoff(); /* Disable hardware page table walking while in guest */ htw_stop(); @@ -418,7 +432,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) /* Re-enable HTW before enabling interrupts */ htw_start(); - __kvm_guest_exit(); + guest_exit_irqoff(); local_irq_enable(); if (vcpu->sigset_active) @@ -507,8 +521,10 @@ static u64 kvm_mips_get_one_regs[] = { KVM_REG_MIPS_R30, KVM_REG_MIPS_R31, +#ifndef CONFIG_CPU_MIPSR6 KVM_REG_MIPS_HI, KVM_REG_MIPS_LO, +#endif KVM_REG_MIPS_PC, KVM_REG_MIPS_CP0_INDEX, @@ -652,12 +668,14 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_R0 ... 
KVM_REG_MIPS_R31: v = (long)vcpu->arch.gprs[reg->id - KVM_REG_MIPS_R0]; break; +#ifndef CONFIG_CPU_MIPSR6 case KVM_REG_MIPS_HI: v = (long)vcpu->arch.hi; break; case KVM_REG_MIPS_LO: v = (long)vcpu->arch.lo; break; +#endif case KVM_REG_MIPS_PC: v = (long)vcpu->arch.pc; break; @@ -873,12 +891,14 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_R1 ... KVM_REG_MIPS_R31: vcpu->arch.gprs[reg->id - KVM_REG_MIPS_R0] = v; break; +#ifndef CONFIG_CPU_MIPSR6 case KVM_REG_MIPS_HI: vcpu->arch.hi = v; break; case KVM_REG_MIPS_LO: vcpu->arch.lo = v; break; +#endif case KVM_REG_MIPS_PC: vcpu->arch.pc = v; break; @@ -1763,6 +1783,10 @@ static int __init kvm_mips_init(void) { int ret; + ret = kvm_mips_entry_setup(); + if (ret) + return ret; + ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (ret) diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h index a38bdab68574..c858cf168078 100644 --- a/arch/mips/kvm/trace.h +++ b/arch/mips/kvm/trace.h @@ -20,50 +20,32 @@ /* * Tracepoints for VM enters */ -TRACE_EVENT(kvm_enter, - TP_PROTO(struct kvm_vcpu *vcpu), - TP_ARGS(vcpu), - TP_STRUCT__entry( - __field(unsigned long, pc) - ), - - TP_fast_assign( - __entry->pc = vcpu->arch.pc; - ), - - TP_printk("PC: 0x%08lx", - __entry->pc) -); - -TRACE_EVENT(kvm_reenter, - TP_PROTO(struct kvm_vcpu *vcpu), - TP_ARGS(vcpu), - TP_STRUCT__entry( - __field(unsigned long, pc) - ), - - TP_fast_assign( - __entry->pc = vcpu->arch.pc; - ), - - TP_printk("PC: 0x%08lx", - __entry->pc) +DECLARE_EVENT_CLASS(kvm_transition, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), + TP_STRUCT__entry( + __field(unsigned long, pc) + ), + + TP_fast_assign( + __entry->pc = vcpu->arch.pc; + ), + + TP_printk("PC: 0x%08lx", + __entry->pc) ); -TRACE_EVENT(kvm_out, - TP_PROTO(struct kvm_vcpu *vcpu), - TP_ARGS(vcpu), - TP_STRUCT__entry( - __field(unsigned long, pc) - ), +DEFINE_EVENT(kvm_transition, kvm_enter, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu)); - TP_fast_assign( - __entry->pc = vcpu->arch.pc; - ), +DEFINE_EVENT(kvm_transition, kvm_reenter, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu)); - TP_printk("PC: 0x%08lx", - __entry->pc) -); +DEFINE_EVENT(kvm_transition, kvm_out, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu)); /* The first 32 exit reasons correspond to Cause.ExcCode */ #define KVM_TRACE_EXIT_INT 0 diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 00e8dc3d36cb..091553942bcb 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -431,9 +431,15 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) /* * Arch specific stuff, set up config registers properly so that the - * guest will come up as expected, for now we simulate a MIPS 24kc + * guest will come up as expected */ +#ifndef CONFIG_CPU_MIPSR6 + /* r2-r5, simulate a MIPS 24kc */ kvm_write_c0_guest_prid(cop0, 0x00019300); +#else + /* r6+, simulate a generic QEMU machine */ + kvm_write_c0_guest_prid(cop0, 0x00010000); +#endif /* * Have config1, Cacheable, noncoherent, write-back, write allocate. 
* Endianness, arch revision & virtually tagged icache should match diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c index d96e912b9d44..6dc07fba187f 100644 --- a/arch/mips/math-emu/cp1emu.c +++ b/arch/mips/math-emu/cp1emu.c @@ -627,8 +627,8 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn, dec_insn.pc_inc + dec_insn.next_pc_inc; return 1; - case cbcond0_op: - case cbcond1_op: + case pop10_op: + case pop30_op: if (!cpu_has_mips_r6) break; if (insn.i_format.rt && !insn.i_format.rs) @@ -683,14 +683,14 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn, dec_insn.next_pc_inc; return 1; - case beqzcjic_op: + case pop66_op: if (!cpu_has_mips_r6) break; *contpc = regs->cp0_epc + dec_insn.pc_inc + dec_insn.next_pc_inc; return 1; - case bnezcjialc_op: + case pop76_op: if (!cpu_has_mips_r6) break; if (!insn.i_format.rs) diff --git a/arch/mips/mm/uasm-micromips.c b/arch/mips/mm/uasm-micromips.c index d78178daea4b..277cf52d80e1 100644 --- a/arch/mips/mm/uasm-micromips.c +++ b/arch/mips/mm/uasm-micromips.c @@ -53,8 +53,13 @@ static struct insn insn_table_MM[] = { { insn_bltzl, 0, 0 }, { insn_bne, M(mm_bne32_op, 0, 0, 0, 0, 0), RT | RS | BIMM }, { insn_cache, M(mm_pool32b_op, 0, 0, mm_cache_func, 0, 0), RT | RS | SIMM }, + { insn_cfc1, M(mm_pool32f_op, 0, 0, 0, mm_cfc1_op, mm_32f_73_op), RT | RS }, + { insn_cfcmsa, M(mm_pool32s_op, 0, msa_cfc_op, 0, 0, mm_32s_elm_op), RD | RE }, + { insn_ctc1, M(mm_pool32f_op, 0, 0, 0, mm_ctc1_op, mm_32f_73_op), RT | RS }, + { insn_ctcmsa, M(mm_pool32s_op, 0, msa_ctc_op, 0, 0, mm_32s_elm_op), RD | RE }, { insn_daddu, 0, 0 }, { insn_daddiu, 0, 0 }, + { insn_di, M(mm_pool32a_op, 0, 0, 0, mm_di_op, mm_pool32axf_op), RS }, { insn_divu, M(mm_pool32a_op, 0, 0, 0, mm_divu_op, mm_pool32axf_op), RT | RS }, { insn_dmfc0, 0, 0 }, { insn_dmtc0, 0, 0 }, @@ -84,6 +89,8 @@ static struct insn insn_table_MM[] = { { insn_mfhi, M(mm_pool32a_op, 0, 0, 0, mm_mfhi32_op, mm_pool32axf_op), RS }, { insn_mflo, M(mm_pool32a_op, 0, 0, 0, mm_mflo32_op, mm_pool32axf_op), RS }, { insn_mtc0, M(mm_pool32a_op, 0, 0, 0, mm_mtc0_op, mm_pool32axf_op), RT | RS | RD }, + { insn_mthi, M(mm_pool32a_op, 0, 0, 0, mm_mthi32_op, mm_pool32axf_op), RS }, + { insn_mtlo, M(mm_pool32a_op, 0, 0, 0, mm_mtlo32_op, mm_pool32axf_op), RS }, { insn_mul, M(mm_pool32a_op, 0, 0, 0, 0, mm_mul_op), RT | RS | RD }, { insn_or, M(mm_pool32a_op, 0, 0, 0, 0, mm_or32_op), RT | RS | RD }, { insn_ori, M(mm_ori32_op, 0, 0, 0, 0, 0), RT | RS | UIMM }, @@ -166,13 +173,15 @@ static void build_insn(u32 **buf, enum opcode opc, ...) 
op = ip->match; va_start(ap, opc); if (ip->fields & RS) { - if (opc == insn_mfc0 || opc == insn_mtc0) + if (opc == insn_mfc0 || opc == insn_mtc0 || + opc == insn_cfc1 || opc == insn_ctc1) op |= build_rt(va_arg(ap, u32)); else op |= build_rs(va_arg(ap, u32)); } if (ip->fields & RT) { - if (opc == insn_mfc0 || opc == insn_mtc0) + if (opc == insn_mfc0 || opc == insn_mtc0 || + opc == insn_cfc1 || opc == insn_ctc1) op |= build_rs(va_arg(ap, u32)); else op |= build_rt(va_arg(ap, u32)); diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c index 9c2220a45189..cec524167822 100644 --- a/arch/mips/mm/uasm-mips.c +++ b/arch/mips/mm/uasm-mips.c @@ -67,9 +67,14 @@ static struct insn insn_table[] = { #else { insn_cache, M6(cache_op, 0, 0, 0, cache6_op), RS | RT | SIMM9 }, #endif + { insn_cfc1, M(cop1_op, cfc_op, 0, 0, 0, 0), RT | RD }, + { insn_cfcmsa, M(msa_op, 0, msa_cfc_op, 0, 0, msa_elm_op), RD | RE }, + { insn_ctc1, M(cop1_op, ctc_op, 0, 0, 0, 0), RT | RD }, + { insn_ctcmsa, M(msa_op, 0, msa_ctc_op, 0, 0, msa_elm_op), RD | RE }, { insn_daddiu, M(daddiu_op, 0, 0, 0, 0, 0), RS | RT | SIMM }, { insn_daddu, M(spec_op, 0, 0, 0, 0, daddu_op), RS | RT | RD }, { insn_dinsm, M(spec3_op, 0, 0, 0, 0, dinsm_op), RS | RT | RD | RE }, + { insn_di, M(cop0_op, mfmc0_op, 0, 12, 0, 0), RT }, { insn_dins, M(spec3_op, 0, 0, 0, 0, dins_op), RS | RT | RD | RE }, { insn_divu, M(spec_op, 0, 0, 0, 0, divu_op), RS | RT }, { insn_dmfc0, M(cop0_op, dmfc_op, 0, 0, 0, 0), RT | RD | SET}, @@ -114,7 +119,13 @@ static struct insn insn_table[] = { { insn_mflo, M(spec_op, 0, 0, 0, 0, mflo_op), RD }, { insn_mtc0, M(cop0_op, mtc_op, 0, 0, 0, 0), RT | RD | SET}, { insn_mthc0, M(cop0_op, mthc0_op, 0, 0, 0, 0), RT | RD | SET}, + { insn_mthi, M(spec_op, 0, 0, 0, 0, mthi_op), RS }, + { insn_mtlo, M(spec_op, 0, 0, 0, 0, mtlo_op), RS }, +#ifndef CONFIG_CPU_MIPSR6 { insn_mul, M(spec2_op, 0, 0, 0, 0, mul_op), RS | RT | RD}, +#else + { insn_mul, M(spec_op, 0, 0, 0, mult_mul_op, mult_op), RS | RT | RD}, +#endif { insn_ori, M(ori_op, 0, 0, 0, 0, 0), RS | RT | UIMM }, { insn_or, M(spec_op, 0, 0, 0, 0, or_op), RS | RT | RD }, #ifndef CONFIG_CPU_MIPSR6 diff --git a/arch/mips/mm/uasm.c b/arch/mips/mm/uasm.c index ad718debc35a..3e0282d301d6 100644 --- a/arch/mips/mm/uasm.c +++ b/arch/mips/mm/uasm.c @@ -49,18 +49,19 @@ enum opcode { insn_invalid, insn_addiu, insn_addu, insn_and, insn_andi, insn_bbit0, insn_bbit1, insn_beq, insn_beql, insn_bgez, insn_bgezl, insn_bltz, insn_bltzl, - insn_bne, insn_cache, insn_daddiu, insn_daddu, insn_dins, insn_dinsm, - insn_divu, insn_dmfc0, insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll, + insn_bne, insn_cache, insn_cfc1, insn_cfcmsa, insn_ctc1, insn_ctcmsa, + insn_daddiu, insn_daddu, insn_di, insn_dins, insn_dinsm, insn_divu, + insn_dmfc0, insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll, insn_dsll32, insn_dsra, insn_dsrl, insn_dsrl32, insn_dsubu, insn_eret, insn_ext, insn_ins, insn_j, insn_jal, insn_jalr, insn_jr, insn_lb, insn_ld, insn_ldx, insn_lh, insn_ll, insn_lld, insn_lui, insn_lw, insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi, insn_mflo, insn_mtc0, - insn_mthc0, insn_mul, insn_or, insn_ori, insn_pref, insn_rfe, - insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll, insn_sllv, insn_slt, - insn_sltiu, insn_sltu, insn_sra, insn_srl, insn_srlv, insn_subu, - insn_sw, insn_sync, insn_syscall, insn_tlbp, insn_tlbr, insn_tlbwi, - insn_tlbwr, insn_wait, insn_wsbh, insn_xor, insn_xori, insn_yield, - insn_lddir, insn_ldpte, + insn_mthc0, insn_mthi, insn_mtlo, insn_mul, insn_or, insn_ori, + insn_pref, insn_rfe, 
insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll, + insn_sllv, insn_slt, insn_sltiu, insn_sltu, insn_sra, insn_srl, + insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall, insn_tlbp, + insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh, insn_xor, + insn_xori, insn_yield, insn_lddir, insn_ldpte, }; struct insn { @@ -268,10 +269,15 @@ I_u1s2(_bltz) I_u1s2(_bltzl) I_u1u2s3(_bne) I_u2s3u1(_cache) +I_u1u2(_cfc1) +I_u2u1(_cfcmsa) +I_u1u2(_ctc1) +I_u2u1(_ctcmsa) I_u1u2u3(_dmfc0) I_u1u2u3(_dmtc0) I_u2u1s3(_daddiu) I_u3u1u2(_daddu) +I_u1(_di); I_u1u2(_divu) I_u2u1u3(_dsll) I_u2u1u3(_dsll32) @@ -301,6 +307,8 @@ I_u1(_mfhi) I_u1(_mflo) I_u1u2u3(_mtc0) I_u1u2u3(_mthc0) +I_u1(_mthi) +I_u1(_mtlo) I_u3u1u2(_mul) I_u2u1u3(_ori) I_u3u1u2(_or) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index bc27c4d6383c..2fd5580c8f6e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2523,7 +2523,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) spin_unlock(&pvc->lock); - kvm_guest_enter(); + guest_enter(); srcu_idx = srcu_read_lock(&vc->kvm->srcu); @@ -2571,7 +2571,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) /* make sure updates to secondary vcpu structs are visible now */ smp_mb(); - kvm_guest_exit(); + guest_exit(); for (sub = 0; sub < core_info.n_subcores; ++sub) list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub], diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index a910fef86bba..b6cd730c33ef 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -914,7 +914,7 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, /* We get here with MSR.EE=1 */ trace_kvm_exit(exit_nr, vcpu); - kvm_guest_exit(); + guest_exit(); switch (exit_nr) { case BOOK3S_INTERRUPT_INST_STORAGE: @@ -1541,7 +1541,7 @@ static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvmppc_clear_debug(vcpu); - /* No need for kvm_guest_exit. It's done in handle_exit. + /* No need for guest_exit. It's done in handle_exit. We also get here with interrupts enabled. */ /* Make sure we save the guest FPU/Altivec/VSX state */ diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 4afae695899a..02b4672f7347 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -776,7 +776,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) ret = __kvmppc_vcpu_run(kvm_run, vcpu); - /* No need for kvm_guest_exit. It's done in handle_exit. + /* No need for guest_exit. It's done in handle_exit. We also get here with interrupts enabled. 
*/ /* Switch back to user space debug context */ @@ -1012,7 +1012,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, } trace_kvm_exit(exit_nr, vcpu); - __kvm_guest_exit(); + guest_exit_irqoff(); local_irq_enable(); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 02416fea7653..1ac036e45ed4 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -119,7 +119,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) continue; } - __kvm_guest_enter(); + guest_enter_irqoff(); return 1; } diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index d054c1b07a3c..741ddba0bf11 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -10,14 +10,25 @@ /** * struct gmap_struct - guest address space + * @list: list head for the mm->context gmap list * @crst_list: list of all crst tables used in the guest address space * @mm: pointer to the parent mm_struct * @guest_to_host: radix tree with guest to host address translation * @host_to_guest: radix tree with pointer to segment table entries * @guest_table_lock: spinlock to protect all entries in the guest page table + * @ref_count: reference counter for the gmap structure * @table: pointer to the page directory * @asce: address space control element for gmap page table * @pfault_enabled: defines if pfaults are applicable for the guest + * @host_to_rmap: radix tree with gmap_rmap lists + * @children: list of shadow gmap structures + * @pt_list: list of all page tables used in the shadow guest address space + * @shadow_lock: spinlock to protect the shadow gmap list + * @parent: pointer to the parent gmap for shadow guest address spaces + * @orig_asce: ASCE for which the shadow page table has been created + * @edat_level: edat level to be used for the shadow translation + * @removed: flag to indicate if a shadow guest address space has been removed + * @initialized: flag to indicate if a shadow guest address space can be used */ struct gmap { struct list_head list; @@ -26,26 +37,64 @@ struct gmap { struct radix_tree_root guest_to_host; struct radix_tree_root host_to_guest; spinlock_t guest_table_lock; + atomic_t ref_count; unsigned long *table; unsigned long asce; unsigned long asce_end; void *private; bool pfault_enabled; + /* Additional data for shadow guest address spaces */ + struct radix_tree_root host_to_rmap; + struct list_head children; + struct list_head pt_list; + spinlock_t shadow_lock; + struct gmap *parent; + unsigned long orig_asce; + int edat_level; + bool removed; + bool initialized; }; /** + * struct gmap_rmap - reverse mapping for shadow page table entries + * @next: pointer to next rmap in the list + * @raddr: virtual rmap address in the shadow guest address space + */ +struct gmap_rmap { + struct gmap_rmap *next; + unsigned long raddr; +}; + +#define gmap_for_each_rmap(pos, head) \ + for (pos = (head); pos; pos = pos->next) + +#define gmap_for_each_rmap_safe(pos, n, head) \ + for (pos = (head); n = pos ? 
pos->next : NULL, pos; pos = n) + +/** * struct gmap_notifier - notify function block for page invalidation * @notifier_call: address of callback function */ struct gmap_notifier { struct list_head list; - void (*notifier_call)(struct gmap *gmap, unsigned long gaddr); + struct rcu_head rcu; + void (*notifier_call)(struct gmap *gmap, unsigned long start, + unsigned long end); }; -struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit); -void gmap_free(struct gmap *gmap); +static inline int gmap_is_shadow(struct gmap *gmap) +{ + return !!gmap->parent; +} + +struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit); +void gmap_remove(struct gmap *gmap); +struct gmap *gmap_get(struct gmap *gmap); +void gmap_put(struct gmap *gmap); + void gmap_enable(struct gmap *gmap); void gmap_disable(struct gmap *gmap); +struct gmap *gmap_get_enabled(void); int gmap_map_segment(struct gmap *gmap, unsigned long from, unsigned long to, unsigned long len); int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); @@ -57,8 +106,29 @@ void gmap_discard(struct gmap *, unsigned long from, unsigned long to); void __gmap_zap(struct gmap *, unsigned long gaddr); void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr); -void gmap_register_ipte_notifier(struct gmap_notifier *); -void gmap_unregister_ipte_notifier(struct gmap_notifier *); -int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len); +int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val); + +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, + int edat_level); +int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level); +int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, + int fake); +int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, + int fake); +int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, + int fake); +int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, + int fake); +int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, + unsigned long *pgt, int *dat_protection, int *fake); +int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte); + +void gmap_register_pte_notifier(struct gmap_notifier *); +void gmap_unregister_pte_notifier(struct gmap_notifier *); +void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *, + unsigned long bits); + +int gmap_mprotect_notify(struct gmap *, unsigned long start, + unsigned long len, int prot); #endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 9eed5c18a61c..946fc86202fd 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -145,7 +145,7 @@ struct kvm_s390_sie_block { __u64 cputm; /* 0x0028 */ __u64 ckc; /* 0x0030 */ __u64 epoch; /* 0x0038 */ - __u8 reserved40[4]; /* 0x0040 */ + __u32 svcc; /* 0x0040 */ #define LCTL_CR0 0x8000 #define LCTL_CR6 0x0200 #define LCTL_CR9 0x0040 @@ -167,6 +167,9 @@ struct kvm_s390_sie_block { #define ICPT_INST 0x04 #define ICPT_PROGI 0x08 #define ICPT_INSTPROGI 0x0C +#define ICPT_EXTINT 0x14 +#define ICPT_VALIDITY 0x20 +#define ICPT_STOP 0x28 #define ICPT_OPEREXC 0x2C #define ICPT_PARTEXEC 0x38 #define ICPT_IOINST 0x40 @@ -226,7 +229,7 @@ struct kvm_s390_sie_block { __u8 reserved1e6[2]; /* 0x01e6 */ __u64 itdba; /* 0x01e8 */ __u64 riccbd; /* 0x01f0 */ - __u8 reserved1f8[8]; /* 0x01f8 */ + __u64 gvrd; /* 0x01f8 */ 
} __attribute__((packed)); struct kvm_s390_itdb { @@ -281,6 +284,7 @@ struct kvm_vcpu_stat { u32 instruction_stsi; u32 instruction_stfl; u32 instruction_tprot; + u32 instruction_sie; u32 instruction_essa; u32 instruction_sthyi; u32 instruction_sigp_sense; @@ -545,12 +549,16 @@ struct kvm_guestdbg_info_arch { struct kvm_vcpu_arch { struct kvm_s390_sie_block *sie_block; + /* if vsie is active, currently executed shadow sie control block */ + struct kvm_s390_sie_block *vsie_block; unsigned int host_acrs[NUM_ACRS]; struct fpu host_fpregs; struct kvm_s390_local_interrupt local_int; struct hrtimer ckc_timer; struct kvm_s390_pgm_info pgm; struct gmap *gmap; + /* backup location for the currently enabled gmap when scheduled out */ + struct gmap *enabled_gmap; struct kvm_guestdbg_info_arch guestdbg; unsigned long pfault_token; unsigned long pfault_select; @@ -635,6 +643,14 @@ struct sie_page2 { u8 reserved900[0x1000 - 0x900]; /* 0x0900 */ } __packed; +struct kvm_s390_vsie { + struct mutex mutex; + struct radix_tree_root addr_to_page; + int page_count; + int next; + struct page *pages[KVM_MAX_VCPUS]; +}; + struct kvm_arch{ void *sca; int use_esca; @@ -659,6 +675,7 @@ struct kvm_arch{ struct sie_page2 *sie_page2; struct kvm_s390_cpu_model model; struct kvm_s390_crypto crypto; + struct kvm_s390_vsie vsie; u64 epoch; /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index 081b2ad99d73..b941528cc49e 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -8,8 +8,9 @@ typedef struct { cpumask_t cpu_attach_mask; atomic_t attach_count; unsigned int flush_mm; - spinlock_t list_lock; + spinlock_t pgtable_lock; struct list_head pgtable_list; + spinlock_t gmap_lock; struct list_head gmap_list; unsigned long asce; unsigned long asce_limit; @@ -22,9 +23,11 @@ typedef struct { unsigned int use_skey:1; } mm_context_t; -#define INIT_MM_CONTEXT(name) \ - .context.list_lock = __SPIN_LOCK_UNLOCKED(name.context.list_lock), \ - .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \ +#define INIT_MM_CONTEXT(name) \ + .context.pgtable_lock = \ + __SPIN_LOCK_UNLOCKED(name.context.pgtable_lock), \ + .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \ + .context.gmap_lock = __SPIN_LOCK_UNLOCKED(name.context.gmap_lock), \ .context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list), static inline int tprot(unsigned long addr) diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index c837b79b455d..3ce3854b7a41 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -15,8 +15,9 @@ static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { - spin_lock_init(&mm->context.list_lock); + spin_lock_init(&mm->context.pgtable_lock); INIT_LIST_HEAD(&mm->context.pgtable_list); + spin_lock_init(&mm->context.gmap_lock); INIT_LIST_HEAD(&mm->context.gmap_list); cpumask_clear(&mm->context.cpu_attach_mask); atomic_set(&mm->context.attach_count, 0); diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index f874e7d51c19..b5edff32e547 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -147,6 +147,8 @@ static inline int devmem_is_allowed(unsigned long pfn) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) #define virt_addr_valid(kaddr) 
pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define pfn_to_virt(pfn) __va((pfn) << PAGE_SHIFT) +#define page_to_virt(page) pfn_to_virt(page_to_pfn(page)) #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index da34cb6b1f3b..f4eb9843eed4 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -19,8 +19,10 @@ unsigned long *crst_table_alloc(struct mm_struct *); void crst_table_free(struct mm_struct *, unsigned long *); unsigned long *page_table_alloc(struct mm_struct *); +struct page *page_table_alloc_pgste(struct mm_struct *mm); void page_table_free(struct mm_struct *, unsigned long *); void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long); +void page_table_free_pgste(struct page *page); extern int page_table_allocate_pgste; static inline void clear_table(unsigned long *s, unsigned long val, size_t n) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 9951e7e59756..c7ebba483f09 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -256,6 +256,7 @@ static inline int is_module_addr(void *addr) /* Bits in the region table entry */ #define _REGION_ENTRY_ORIGIN ~0xfffUL/* region/segment table origin */ #define _REGION_ENTRY_PROTECT 0x200 /* region protection bit */ +#define _REGION_ENTRY_OFFSET 0xc0 /* region table offset */ #define _REGION_ENTRY_INVALID 0x20 /* invalid region table entry */ #define _REGION_ENTRY_TYPE_MASK 0x0c /* region/segment table type mask */ #define _REGION_ENTRY_TYPE_R1 0x0c /* region first table type */ @@ -327,6 +328,7 @@ static inline int is_module_addr(void *addr) #define PGSTE_GC_BIT 0x0002000000000000UL #define PGSTE_UC_BIT 0x0000800000000000UL /* user dirty (migration) */ #define PGSTE_IN_BIT 0x0000400000000000UL /* IPTE notify bit */ +#define PGSTE_VSIE_BIT 0x0000200000000000UL /* ref'd in a shadow table */ /* Guest Page State used for virtualization */ #define _PGSTE_GPS_ZERO 0x0000000080000000UL @@ -885,10 +887,16 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t entry); void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void ptep_notify(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long bits); +int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr, + pte_t *ptep, int prot, unsigned long bit); void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, pte_t *ptep , int reset); void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, + pte_t *sptep, pte_t *tptep, pte_t pte); +void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep); bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address); int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 9d4d311d7e52..8c2922f540f9 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -109,6 +109,8 @@ struct thread_struct { unsigned long ksp; /* kernel stack pointer */ mm_segment_t mm_segment; unsigned long gmap_addr; /* address of last gmap fault. 
*/ + unsigned int gmap_write_flag; /* gmap fault write indication */ + unsigned int gmap_int_code; /* int code of last gmap fault */ unsigned int gmap_pfault; /* signal of a pending guest pfault */ struct per_regs per_user; /* User specified PER registers */ struct per_event per_event; /* Cause of the last PER trap */ diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index f0818d70d73d..a2ffec4139ad 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -98,6 +98,18 @@ struct kvm_s390_vm_cpu_machine { #define KVM_S390_VM_CPU_FEAT_NR_BITS 1024 #define KVM_S390_VM_CPU_FEAT_ESOP 0 +#define KVM_S390_VM_CPU_FEAT_SIEF2 1 +#define KVM_S390_VM_CPU_FEAT_64BSCAO 2 +#define KVM_S390_VM_CPU_FEAT_SIIF 3 +#define KVM_S390_VM_CPU_FEAT_GPERE 4 +#define KVM_S390_VM_CPU_FEAT_GSLS 5 +#define KVM_S390_VM_CPU_FEAT_IB 6 +#define KVM_S390_VM_CPU_FEAT_CEI 7 +#define KVM_S390_VM_CPU_FEAT_IBS 8 +#define KVM_S390_VM_CPU_FEAT_SKEY 9 +#define KVM_S390_VM_CPU_FEAT_CMMA 10 +#define KVM_S390_VM_CPU_FEAT_PFMFI 11 +#define KVM_S390_VM_CPU_FEAT_SIGPIF 12 struct kvm_s390_vm_cpu_feat { __u64 feat[16]; }; diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 82e73e2b953d..09a9e6dfc09f 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -12,6 +12,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqch ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o +kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o vsie.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 8e245e764c21..54200208bf24 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -8,6 +8,7 @@ #include <linux/vmalloc.h> #include <linux/err.h> #include <asm/pgtable.h> +#include <asm/gmap.h> #include "kvm-s390.h" #include "gaccess.h" #include <asm/switch_to.h> @@ -946,3 +947,241 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra) return 0; return trans_exc(vcpu, PGM_PROTECTION, gra, 0, GACC_STORE, PROT_TYPE_LA); } + +/** + * kvm_s390_shadow_tables - walk the guest page table and create shadow tables + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pgt: pointer to the page table address result + * @fake: pgt references contiguous guest memory block, not a pgtable + */ +static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, + unsigned long *pgt, int *dat_protection, + int *fake) +{ + struct gmap *parent; + union asce asce; + union vaddress vaddr; + unsigned long ptr; + int rc; + + *fake = 0; + *dat_protection = 0; + parent = sg->parent; + vaddr.addr = saddr; + asce.val = sg->orig_asce; + ptr = asce.origin * 4096; + if (asce.r) { + *fake = 1; + asce.dt = ASCE_TYPE_REGION1; + } + switch (asce.dt) { + case ASCE_TYPE_REGION1: + if (vaddr.rfx01 > asce.tl && !asce.r) + return PGM_REGION_FIRST_TRANS; + break; + case ASCE_TYPE_REGION2: + if (vaddr.rfx) + return PGM_ASCE_TYPE; + if (vaddr.rsx01 > asce.tl) + return PGM_REGION_SECOND_TRANS; + break; + case ASCE_TYPE_REGION3: + if (vaddr.rfx || vaddr.rsx) + return PGM_ASCE_TYPE; + if (vaddr.rtx01 > asce.tl) + return PGM_REGION_THIRD_TRANS; + break; + case ASCE_TYPE_SEGMENT: + if (vaddr.rfx || vaddr.rsx || vaddr.rtx) + return PGM_ASCE_TYPE; + if (vaddr.sx01 > asce.tl) + return PGM_SEGMENT_TRANSLATION; + break; + } + + switch (asce.dt) { 
+ case ASCE_TYPE_REGION1: { + union region1_table_entry rfte; + + if (*fake) { + /* offset in 16EB guest memory block */ + ptr = ptr + ((unsigned long) vaddr.rsx << 53UL); + rfte.val = ptr; + goto shadow_r2t; + } + rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val); + if (rc) + return rc; + if (rfte.i) + return PGM_REGION_FIRST_TRANS; + if (rfte.tt != TABLE_TYPE_REGION1) + return PGM_TRANSLATION_SPEC; + if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl) + return PGM_REGION_SECOND_TRANS; + if (sg->edat_level >= 1) + *dat_protection |= rfte.p; + ptr = rfte.rto << 12UL; +shadow_r2t: + rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); + if (rc) + return rc; + /* fallthrough */ + } + case ASCE_TYPE_REGION2: { + union region2_table_entry rste; + + if (*fake) { + /* offset in 8PB guest memory block */ + ptr = ptr + ((unsigned long) vaddr.rtx << 42UL); + rste.val = ptr; + goto shadow_r3t; + } + rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val); + if (rc) + return rc; + if (rste.i) + return PGM_REGION_SECOND_TRANS; + if (rste.tt != TABLE_TYPE_REGION2) + return PGM_TRANSLATION_SPEC; + if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl) + return PGM_REGION_THIRD_TRANS; + if (sg->edat_level >= 1) + *dat_protection |= rste.p; + ptr = rste.rto << 12UL; +shadow_r3t: + rste.p |= *dat_protection; + rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); + if (rc) + return rc; + /* fallthrough */ + } + case ASCE_TYPE_REGION3: { + union region3_table_entry rtte; + + if (*fake) { + /* offset in 4TB guest memory block */ + ptr = ptr + ((unsigned long) vaddr.sx << 31UL); + rtte.val = ptr; + goto shadow_sgt; + } + rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val); + if (rc) + return rc; + if (rtte.i) + return PGM_REGION_THIRD_TRANS; + if (rtte.tt != TABLE_TYPE_REGION3) + return PGM_TRANSLATION_SPEC; + if (rtte.cr && asce.p && sg->edat_level >= 2) + return PGM_TRANSLATION_SPEC; + if (rtte.fc && sg->edat_level >= 2) { + *dat_protection |= rtte.fc0.p; + *fake = 1; + ptr = rtte.fc1.rfaa << 31UL; + rtte.val = ptr; + goto shadow_sgt; + } + if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl) + return PGM_SEGMENT_TRANSLATION; + if (sg->edat_level >= 1) + *dat_protection |= rtte.fc0.p; + ptr = rtte.fc0.sto << 12UL; +shadow_sgt: + rtte.fc0.p |= *dat_protection; + rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); + if (rc) + return rc; + /* fallthrough */ + } + case ASCE_TYPE_SEGMENT: { + union segment_table_entry ste; + + if (*fake) { + /* offset in 2G guest memory block */ + ptr = ptr + ((unsigned long) vaddr.sx << 20UL); + ste.val = ptr; + goto shadow_pgt; + } + rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val); + if (rc) + return rc; + if (ste.i) + return PGM_SEGMENT_TRANSLATION; + if (ste.tt != TABLE_TYPE_SEGMENT) + return PGM_TRANSLATION_SPEC; + if (ste.cs && asce.p) + return PGM_TRANSLATION_SPEC; + *dat_protection |= ste.fc0.p; + if (ste.fc && sg->edat_level >= 1) { + *fake = 1; + ptr = ste.fc1.sfaa << 20UL; + ste.val = ptr; + goto shadow_pgt; + } + ptr = ste.fc0.pto << 11UL; +shadow_pgt: + ste.fc0.p |= *dat_protection; + rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); + if (rc) + return rc; + } + } + /* Return the parent address of the page table */ + *pgt = ptr; + return 0; +} + +/** + * kvm_s390_shadow_fault - handle fault on a shadow page table + * @vcpu: virtual cpu + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * + * Returns: - 0 if the shadow fault was successfully resolved + * - > 0 (pgm exception code) 
on exceptions while faulting + * - -EAGAIN if the caller can retry immediately + * - -EFAULT when accessing invalid guest addresses + * - -ENOMEM if out of memory + */ +int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, + unsigned long saddr) +{ + union vaddress vaddr; + union page_table_entry pte; + unsigned long pgt; + int dat_protection, fake; + int rc; + + down_read(&sg->mm->mmap_sem); + /* + * We don't want any guest-2 tables to change - so the parent + * tables/pointers we read stay valid - unshadowing is however + * always possible - only guest_table_lock protects us. + */ + ipte_lock(vcpu); + + rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); + if (rc) + rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection, + &fake); + + vaddr.addr = saddr; + if (fake) { + /* offset in 1MB guest memory block */ + pte.val = pgt + ((unsigned long) vaddr.px << 12UL); + goto shadow_page; + } + if (!rc) + rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val); + if (!rc && pte.i) + rc = PGM_PAGE_TRANSLATION; + if (!rc && (pte.z || (pte.co && sg->edat_level < 1))) + rc = PGM_TRANSLATION_SPEC; +shadow_page: + pte.p |= dat_protection; + if (!rc) + rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); + ipte_unlock(vcpu); + up_read(&sg->mm->mmap_sem); + return rc; +} diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index df0a79dd8159..8756569ad938 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -361,4 +361,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu); int ipte_lock_held(struct kvm_vcpu *vcpu); int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); +int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow, + unsigned long saddr); + #endif /* __KVM_S390_GACCESS_H */ diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index 1e0849e20965..31a05330d11c 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -439,6 +439,23 @@ exit_required: #define guest_per_enabled(vcpu) \ (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) +int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu) +{ + const u8 ilen = kvm_s390_get_ilen(vcpu); + struct kvm_s390_pgm_info pgm_info = { + .code = PGM_PER, + .per_code = PER_EVENT_IFETCH >> 24, + .per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen), + }; + + /* + * The PSW points to the next instruction, therefore the intercepted + * instruction generated a PER i-fetch event. PER address therefore + * points at the previous PSW address (could be an EXECUTE function). 
+ */ + return kvm_s390_inject_prog_irq(vcpu, &pgm_info); +} + static void filter_guest_per_event(struct kvm_vcpu *vcpu) { u32 perc = vcpu->arch.sie_block->perc << 24; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 9359f65c8634..850be47c4cc9 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -364,6 +364,8 @@ static int handle_operexc(struct kvm_vcpu *vcpu) int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) { + int rc, per_rc = 0; + if (kvm_is_ucontrol(vcpu->kvm)) return -EOPNOTSUPP; @@ -372,7 +374,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) case 0x18: return handle_noop(vcpu); case 0x04: - return handle_instruction(vcpu); + rc = handle_instruction(vcpu); + break; case 0x08: return handle_prog(vcpu); case 0x14: @@ -384,10 +387,18 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) case 0x28: return handle_stop(vcpu); case 0x2c: - return handle_operexc(vcpu); + rc = handle_operexc(vcpu); + break; case 0x38: - return handle_partial_execution(vcpu); + rc = handle_partial_execution(vcpu); + break; default: return -EOPNOTSUPP; } + + /* process PER, also if the instrution is processed in user space */ + if (vcpu->arch.sie_block->icptstatus & 0x02 && + (!rc || rc == -EOPNOTSUPP)) + per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu); + return per_rc ? per_rc : rc; } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index d72c4a877622..ca19627779db 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -995,6 +995,11 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) swake_up(&vcpu->wq); vcpu->stat.halt_wakeup++; } + /* + * The VCPU might not be sleeping but is executing the VSIE. Let's + * kick it, so it leaves the SIE to process the request. + */ + kvm_s390_vsie_kick(vcpu); } enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0dcf9b8fc12c..d42428c11794 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -21,6 +21,7 @@ #include <linux/init.h> #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <linux/mman.h> #include <linux/module.h> #include <linux/random.h> #include <linux/slab.h> @@ -98,6 +99,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "instruction_stfl", VCPU_STAT(instruction_stfl) }, { "instruction_tprot", VCPU_STAT(instruction_tprot) }, { "instruction_sthyi", VCPU_STAT(instruction_sthyi) }, + { "instruction_sie", VCPU_STAT(instruction_sie) }, { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) }, { "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) }, { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) }, @@ -123,6 +125,11 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; +/* allow nested virtualization in KVM (if enabled by user space) */ +static int nested; +module_param(nested, int, S_IRUGO); +MODULE_PARM_DESC(nested, "Nested virtualization support"); + /* upper facilities limit for kvm */ unsigned long kvm_s390_fac_list_mask[16] = { 0xffe6000000000000UL, @@ -141,6 +148,7 @@ static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS) static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc; static struct gmap_notifier gmap_notifier; +static struct gmap_notifier vsie_gmap_notifier; debug_info_t *kvm_s390_dbf; /* Section: not file related */ @@ -150,7 +158,8 @@ int kvm_arch_hardware_enable(void) return 0; } -static void kvm_gmap_notifier(struct gmap *gmap, 
unsigned long address); +static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end); /* * This callback is executed during stop_machine(). All CPUs are therefore @@ -172,6 +181,8 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val, vcpu->arch.sie_block->epoch -= *delta; if (vcpu->arch.cputm_enabled) vcpu->arch.cputm_start += *delta; + if (vcpu->arch.vsie_block) + vcpu->arch.vsie_block->epoch -= *delta; } } return NOTIFY_OK; @@ -184,7 +195,9 @@ static struct notifier_block kvm_clock_notifier = { int kvm_arch_hardware_setup(void) { gmap_notifier.notifier_call = kvm_gmap_notifier; - gmap_register_ipte_notifier(&gmap_notifier); + gmap_register_pte_notifier(&gmap_notifier); + vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; + gmap_register_pte_notifier(&vsie_gmap_notifier); atomic_notifier_chain_register(&s390_epoch_delta_notifier, &kvm_clock_notifier); return 0; @@ -192,7 +205,8 @@ int kvm_arch_hardware_setup(void) void kvm_arch_hardware_unsetup(void) { - gmap_unregister_ipte_notifier(&gmap_notifier); + gmap_unregister_pte_notifier(&gmap_notifier); + gmap_unregister_pte_notifier(&vsie_gmap_notifier); atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, &kvm_clock_notifier); } @@ -250,6 +264,46 @@ static void kvm_s390_cpu_feat_init(void) if (MACHINE_HAS_ESOP) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP); + /* + * We need SIE support, ESOP (PROT_READ protection for gmap_shadow), + * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing). + */ + if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao || + !test_facility(3) || !nested) + return; + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2); + if (sclp.has_64bscao) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO); + if (sclp.has_siif) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF); + if (sclp.has_gpere) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE); + if (sclp.has_gsls) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS); + if (sclp.has_ib) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB); + if (sclp.has_cei) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI); + if (sclp.has_ibs) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS); + /* + * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make + * all skey handling functions read/set the skey from the PGSTE + * instead of the real storage key. + * + * KVM_S390_VM_CPU_FEAT_CMMA: Wrong shadow of PTE.I bits will make + * pages being detected as preserved although they are resident. + * + * KVM_S390_VM_CPU_FEAT_PFMFI: Wrong shadow of PTE.I bits will + * have the same effect as for KVM_S390_VM_CPU_FEAT_SKEY. + * + * For KVM_S390_VM_CPU_FEAT_SKEY, KVM_S390_VM_CPU_FEAT_CMMA and + * KVM_S390_VM_CPU_FEAT_PFMFI, all PTE.I and PGSTE bits have to be + * correctly shadowed. We can do that for the PGSTE but not for PTE.I. + * + * KVM_S390_VM_CPU_FEAT_SIGPIF: Wrong SCB addresses in the SCA. We + * cannot easily shadow the SCA because of the ipte lock. 
+ */ } int kvm_arch_init(void *opaque) @@ -530,20 +584,20 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att if (!new_limit) return -EINVAL; - /* gmap_alloc takes last usable address */ + /* gmap_create takes last usable address */ if (new_limit != KVM_S390_NO_MEM_LIMIT) new_limit -= 1; ret = -EBUSY; mutex_lock(&kvm->lock); if (!kvm->created_vcpus) { - /* gmap_alloc will round the limit up */ - struct gmap *new = gmap_alloc(current->mm, new_limit); + /* gmap_create will round the limit up */ + struct gmap *new = gmap_create(current->mm, new_limit); if (!new) { ret = -ENOMEM; } else { - gmap_free(kvm->arch.gmap); + gmap_remove(kvm->arch.gmap); new->private = kvm; kvm->arch.gmap = new; ret = 0; @@ -1392,7 +1446,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) else kvm->arch.mem_limit = min_t(unsigned long, TASK_MAX_SIZE, sclp.hamax + 1); - kvm->arch.gmap = gmap_alloc(current->mm, kvm->arch.mem_limit - 1); + kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1); if (!kvm->arch.gmap) goto out_err; kvm->arch.gmap->private = kvm; @@ -1404,6 +1458,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.epoch = 0; spin_lock_init(&kvm->arch.start_stop_lock); + kvm_s390_vsie_init(kvm); KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid); return 0; @@ -1425,7 +1480,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) sca_del_vcpu(vcpu); if (kvm_is_ucontrol(vcpu->kvm)) - gmap_free(vcpu->arch.gmap); + gmap_remove(vcpu->arch.gmap); if (vcpu->kvm->arch.use_cmma) kvm_s390_vcpu_unsetup_cmma(vcpu); @@ -1458,16 +1513,17 @@ void kvm_arch_destroy_vm(struct kvm *kvm) debug_unregister(kvm->arch.dbf); free_page((unsigned long)kvm->arch.sie_page2); if (!kvm_is_ucontrol(kvm)) - gmap_free(kvm->arch.gmap); + gmap_remove(kvm->arch.gmap); kvm_s390_destroy_adapters(kvm); kvm_s390_clear_float_irqs(kvm); + kvm_s390_vsie_destroy(kvm); KVM_EVENT(3, "vm 0x%pK destroyed", kvm); } /* Section: vcpu related */ static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu) { - vcpu->arch.gmap = gmap_alloc(current->mm, -1UL); + vcpu->arch.gmap = gmap_create(current->mm, -1UL); if (!vcpu->arch.gmap) return -ENOMEM; vcpu->arch.gmap->private = vcpu->kvm; @@ -1717,7 +1773,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) save_access_regs(vcpu->arch.host_acrs); restore_access_regs(vcpu->run->s.regs.acrs); - gmap_enable(vcpu->arch.gmap); + gmap_enable(vcpu->arch.enabled_gmap); atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu)) __start_cpu_timer_accounting(vcpu); @@ -1730,7 +1786,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu)) __stop_cpu_timer_accounting(vcpu); atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); - gmap_disable(vcpu->arch.gmap); + vcpu->arch.enabled_gmap = gmap_get_enabled(); + gmap_disable(vcpu->arch.enabled_gmap); /* Save guest register state */ save_fpu_regs(); @@ -1779,7 +1836,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) vcpu->arch.gmap = vcpu->kvm->arch.gmap; sca_add_vcpu(vcpu); } - + /* make vcpu_load load the right gmap on the first trigger */ + vcpu->arch.enabled_gmap = vcpu->arch.gmap; } static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) @@ -1976,16 +2034,25 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu) kvm_s390_vcpu_request(vcpu); } -static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address) +static void kvm_gmap_notifier(struct gmap 
*gmap, unsigned long start, + unsigned long end) { - int i; struct kvm *kvm = gmap->private; struct kvm_vcpu *vcpu; + unsigned long prefix; + int i; + if (gmap_is_shadow(gmap)) + return; + if (start >= 1UL << 31) + /* We are only interested in prefix pages */ + return; kvm_for_each_vcpu(i, vcpu, kvm) { /* match against both prefix pages */ - if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) { - VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address); + prefix = kvm_s390_get_prefix(vcpu); + if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) { + VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx", + start, end); kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu); } } @@ -2264,16 +2331,16 @@ retry: return 0; /* * We use MMU_RELOAD just to re-arm the ipte notifier for the - * guest prefix page. gmap_ipte_notify will wait on the ptl lock. + * guest prefix page. gmap_mprotect_notify will wait on the ptl lock. * This ensures that the ipte instruction for this request has * already finished. We might race against a second unmapper that * wants to set the blocking bit. Lets just retry the request loop. */ if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) { int rc; - rc = gmap_ipte_notify(vcpu->arch.gmap, - kvm_s390_get_prefix(vcpu), - PAGE_SIZE * 2); + rc = gmap_mprotect_notify(vcpu->arch.gmap, + kvm_s390_get_prefix(vcpu), + PAGE_SIZE * 2, PROT_WRITE); if (rc) return rc; goto retry; @@ -2556,14 +2623,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) * guest_enter and guest_exit should be no uaccess. */ local_irq_disable(); - __kvm_guest_enter(); + guest_enter_irqoff(); __disable_cpu_timer_accounting(vcpu); local_irq_enable(); exit_reason = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); local_irq_disable(); __enable_cpu_timer_accounting(vcpu); - __kvm_guest_exit(); + guest_exit_irqoff(); local_irq_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 52aa47e112d8..b8432862a817 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -56,7 +56,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) { - return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_WAIT; + return test_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask); } static inline int kvm_is_ucontrol(struct kvm *kvm) @@ -238,6 +238,8 @@ static inline void kvm_s390_forward_psw(struct kvm_vcpu *vcpu, int ilen) } static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu) { + /* don't inject PER events if we re-execute the instruction */ + vcpu->arch.sie_block->icptstatus &= ~0x02; kvm_s390_rewind_psw(vcpu, kvm_s390_get_ilen(vcpu)); } @@ -252,6 +254,14 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu); int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu); int kvm_s390_handle_eb(struct kvm_vcpu *vcpu); +/* implemented in vsie.c */ +int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu); +void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu); +void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end); +void kvm_s390_vsie_init(struct kvm *kvm); +void kvm_s390_vsie_destroy(struct kvm *kvm); + /* implemented in sigp.c */ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); @@ -369,6 +379,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg); void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu); void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu); +int 
kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu); void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu); /* support for Basic/Extended SCA handling */ diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 3db3be139992..c77ad2dc334f 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -719,6 +719,7 @@ static const intercept_handler_t b2_handlers[256] = { [0x10] = handle_set_prefix, [0x11] = handle_store_prefix, [0x12] = handle_store_cpu_address, + [0x14] = kvm_s390_handle_vsie, [0x21] = handle_ipte_interlock, [0x29] = handle_iske, [0x2a] = handle_rrbe, diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 28ea0cab1f1b..1a252f537081 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -77,18 +77,18 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, const u64 psw_int_mask = PSW_MASK_IO | PSW_MASK_EXT; u16 p_asn, s_asn; psw_t *psw; - u32 flags; + bool idle; - flags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags); + idle = is_vcpu_idle(vcpu); psw = &dst_vcpu->arch.sie_block->gpsw; p_asn = dst_vcpu->arch.sie_block->gcr[4] & 0xffff; /* Primary ASN */ s_asn = dst_vcpu->arch.sie_block->gcr[3] & 0xffff; /* Secondary ASN */ /* Inject the emergency signal? */ - if (!(flags & CPUSTAT_STOPPED) + if (!is_vcpu_stopped(vcpu) || (psw->mask & psw_int_mask) != psw_int_mask - || ((flags & CPUSTAT_WAIT) && psw->addr != 0) - || (!(flags & CPUSTAT_WAIT) && (asn == p_asn || asn == s_asn))) { + || (idle && psw->addr != 0) + || (!idle && (asn == p_asn || asn == s_asn))) { return __inject_sigp_emergency(vcpu, dst_vcpu); } else { *reg &= 0xffffffff00000000UL; diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c new file mode 100644 index 000000000000..c106488b4137 --- /dev/null +++ b/arch/s390/kvm/vsie.c @@ -0,0 +1,1091 @@ +/* + * kvm nested virtualization support for s390x + * + * Copyright IBM Corp. 2016 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. 
+ * + * Author(s): David Hildenbrand <[email protected]> + */ +#include <linux/vmalloc.h> +#include <linux/kvm_host.h> +#include <linux/bug.h> +#include <linux/list.h> +#include <linux/bitmap.h> +#include <asm/gmap.h> +#include <asm/mmu_context.h> +#include <asm/sclp.h> +#include <asm/nmi.h> +#include <asm/dis.h> +#include "kvm-s390.h" +#include "gaccess.h" + +struct vsie_page { + struct kvm_s390_sie_block scb_s; /* 0x0000 */ + /* the pinned originial scb */ + struct kvm_s390_sie_block *scb_o; /* 0x0200 */ + /* the shadow gmap in use by the vsie_page */ + struct gmap *gmap; /* 0x0208 */ + /* address of the last reported fault to guest2 */ + unsigned long fault_addr; /* 0x0210 */ + __u8 reserved[0x0700 - 0x0218]; /* 0x0218 */ + struct kvm_s390_crypto_cb crycb; /* 0x0700 */ + __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ +} __packed; + +/* trigger a validity icpt for the given scb */ +static int set_validity_icpt(struct kvm_s390_sie_block *scb, + __u16 reason_code) +{ + scb->ipa = 0x1000; + scb->ipb = ((__u32) reason_code) << 16; + scb->icptcode = ICPT_VALIDITY; + return 1; +} + +/* mark the prefix as unmapped, this will block the VSIE */ +static void prefix_unmapped(struct vsie_page *vsie_page) +{ + atomic_or(PROG_REQUEST, &vsie_page->scb_s.prog20); +} + +/* mark the prefix as unmapped and wait until the VSIE has been left */ +static void prefix_unmapped_sync(struct vsie_page *vsie_page) +{ + prefix_unmapped(vsie_page); + if (vsie_page->scb_s.prog0c & PROG_IN_SIE) + atomic_or(CPUSTAT_STOP_INT, &vsie_page->scb_s.cpuflags); + while (vsie_page->scb_s.prog0c & PROG_IN_SIE) + cpu_relax(); +} + +/* mark the prefix as mapped, this will allow the VSIE to run */ +static void prefix_mapped(struct vsie_page *vsie_page) +{ + atomic_andnot(PROG_REQUEST, &vsie_page->scb_s.prog20); +} + +/* test if the prefix is mapped into the gmap shadow */ +static int prefix_is_mapped(struct vsie_page *vsie_page) +{ + return !(atomic_read(&vsie_page->scb_s.prog20) & PROG_REQUEST); +} + +/* copy the updated intervention request bits into the shadow scb */ +static void update_intervention_requests(struct vsie_page *vsie_page) +{ + const int bits = CPUSTAT_STOP_INT | CPUSTAT_IO_INT | CPUSTAT_EXT_INT; + int cpuflags; + + cpuflags = atomic_read(&vsie_page->scb_o->cpuflags); + atomic_andnot(bits, &vsie_page->scb_s.cpuflags); + atomic_or(cpuflags & bits, &vsie_page->scb_s.cpuflags); +} + +/* shadow (filter and validate) the cpuflags */ +static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + int newflags, cpuflags = atomic_read(&scb_o->cpuflags); + + /* we don't allow ESA/390 guests */ + if (!(cpuflags & CPUSTAT_ZARCH)) + return set_validity_icpt(scb_s, 0x0001U); + + if (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS)) + return set_validity_icpt(scb_s, 0x0001U); + else if (cpuflags & (CPUSTAT_SLSV | CPUSTAT_SLSR)) + return set_validity_icpt(scb_s, 0x0007U); + + /* intervention requests will be set later */ + newflags = CPUSTAT_ZARCH; + if (cpuflags & CPUSTAT_GED && test_kvm_facility(vcpu->kvm, 8)) + newflags |= CPUSTAT_GED; + if (cpuflags & CPUSTAT_GED2 && test_kvm_facility(vcpu->kvm, 78)) { + if (cpuflags & CPUSTAT_GED) + return set_validity_icpt(scb_s, 0x0001U); + newflags |= CPUSTAT_GED2; + } + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GPERE)) + newflags |= cpuflags & CPUSTAT_P; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GSLS)) + newflags |= cpuflags & 
CPUSTAT_SM; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IBS)) + newflags |= cpuflags & CPUSTAT_IBS; + + atomic_set(&scb_s->cpuflags, newflags); + return 0; +} + +/* + * Create a shadow copy of the crycb block and setup key wrapping, if + * requested for guest 3 and enabled for guest 2. + * + * We only accept format-1 (no AP in g2), but convert it into format-2 + * There is nothing to do for format-0. + * + * Returns: - 0 if shadowed or nothing to do + * - > 0 if control has to be given to guest 2 + */ +static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + u32 crycb_addr = scb_o->crycbd & 0x7ffffff8U; + unsigned long *b1, *b2; + u8 ecb3_flags; + + scb_s->crycbd = 0; + if (!(scb_o->crycbd & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1)) + return 0; + /* format-1 is supported with message-security-assist extension 3 */ + if (!test_kvm_facility(vcpu->kvm, 76)) + return 0; + /* we may only allow it if enabled for guest 2 */ + ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 & + (ECB3_AES | ECB3_DEA); + if (!ecb3_flags) + return 0; + + if ((crycb_addr & PAGE_MASK) != ((crycb_addr + 128) & PAGE_MASK)) + return set_validity_icpt(scb_s, 0x003CU); + else if (!crycb_addr) + return set_validity_icpt(scb_s, 0x0039U); + + /* copy only the wrapping keys */ + if (read_guest_real(vcpu, crycb_addr + 72, &vsie_page->crycb, 56)) + return set_validity_icpt(scb_s, 0x0035U); + + scb_s->ecb3 |= ecb3_flags; + scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT1 | + CRYCB_FORMAT2; + + /* xor both blocks in one run */ + b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask; + b2 = (unsigned long *) + vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask; + /* as 56%8 == 0, bitmap_xor won't overwrite any data */ + bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56); + return 0; +} + +/* shadow (round up/down) the ibc to avoid validity icpt */ +static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + __u64 min_ibc = (sclp.ibc >> 16) & 0x0fffU; + + scb_s->ibc = 0; + /* ibc installed in g2 and requested for g3 */ + if (vcpu->kvm->arch.model.ibc && (scb_o->ibc & 0x0fffU)) { + scb_s->ibc = scb_o->ibc & 0x0fffU; + /* takte care of the minimum ibc level of the machine */ + if (scb_s->ibc < min_ibc) + scb_s->ibc = min_ibc; + /* take care of the maximum ibc level set for the guest */ + if (scb_s->ibc > vcpu->kvm->arch.model.ibc) + scb_s->ibc = vcpu->kvm->arch.model.ibc; + } +} + +/* unshadow the scb, copying parameters back to the real scb */ +static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + + /* interception */ + scb_o->icptcode = scb_s->icptcode; + scb_o->icptstatus = scb_s->icptstatus; + scb_o->ipa = scb_s->ipa; + scb_o->ipb = scb_s->ipb; + scb_o->gbea = scb_s->gbea; + + /* timer */ + scb_o->cputm = scb_s->cputm; + scb_o->ckc = scb_s->ckc; + scb_o->todpr = scb_s->todpr; + + /* guest state */ + scb_o->gpsw = scb_s->gpsw; + scb_o->gg14 = scb_s->gg14; + scb_o->gg15 = scb_s->gg15; + memcpy(scb_o->gcr, scb_s->gcr, 128); + scb_o->pp = scb_s->pp; + + /* interrupt intercept */ + switch (scb_s->icptcode) { + case ICPT_PROGI: + case ICPT_INSTPROGI: + case ICPT_EXTINT: + memcpy((void *)((u64)scb_o 
+ 0xc0), + (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0); + break; + case ICPT_PARTEXEC: + /* MVPG only */ + memcpy((void *)((u64)scb_o + 0xc0), + (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0); + break; + } + + if (scb_s->ihcpu != 0xffffU) + scb_o->ihcpu = scb_s->ihcpu; +} + +/* + * Setup the shadow scb by copying and checking the relevant parts of the g2 + * provided scb. + * + * Returns: - 0 if the scb has been shadowed + * - > 0 if control has to be given to guest 2 + */ +static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + bool had_tx = scb_s->ecb & 0x10U; + unsigned long new_mso = 0; + int rc; + + /* make sure we don't have any leftovers when reusing the scb */ + scb_s->icptcode = 0; + scb_s->eca = 0; + scb_s->ecb = 0; + scb_s->ecb2 = 0; + scb_s->ecb3 = 0; + scb_s->ecd = 0; + scb_s->fac = 0; + + rc = prepare_cpuflags(vcpu, vsie_page); + if (rc) + goto out; + + /* timer */ + scb_s->cputm = scb_o->cputm; + scb_s->ckc = scb_o->ckc; + scb_s->todpr = scb_o->todpr; + scb_s->epoch = scb_o->epoch; + + /* guest state */ + scb_s->gpsw = scb_o->gpsw; + scb_s->gg14 = scb_o->gg14; + scb_s->gg15 = scb_o->gg15; + memcpy(scb_s->gcr, scb_o->gcr, 128); + scb_s->pp = scb_o->pp; + + /* interception / execution handling */ + scb_s->gbea = scb_o->gbea; + scb_s->lctl = scb_o->lctl; + scb_s->svcc = scb_o->svcc; + scb_s->ictl = scb_o->ictl; + /* + * SKEY handling functions can't deal with false setting of PTE invalid + * bits. Therefore we cannot provide interpretation and would later + * have to provide own emulation handlers. + */ + scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; + scb_s->icpua = scb_o->icpua; + + if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM)) + new_mso = scb_o->mso & 0xfffffffffff00000UL; + /* if the hva of the prefix changes, we have to remap the prefix */ + if (scb_s->mso != new_mso || scb_s->prefix != scb_o->prefix) + prefix_unmapped(vsie_page); + /* SIE will do mso/msl validity and exception checks for us */ + scb_s->msl = scb_o->msl & 0xfffffffffff00000UL; + scb_s->mso = new_mso; + scb_s->prefix = scb_o->prefix; + + /* We have to definetly flush the tlb if this scb never ran */ + if (scb_s->ihcpu != 0xffffU) + scb_s->ihcpu = scb_o->ihcpu; + + /* MVPG and Protection Exception Interpretation are always available */ + scb_s->eca |= scb_o->eca & 0x01002000U; + /* Host-protection-interruption introduced with ESOP */ + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP)) + scb_s->ecb |= scb_o->ecb & 0x02U; + /* transactional execution */ + if (test_kvm_facility(vcpu->kvm, 73)) { + /* remap the prefix is tx is toggled on */ + if ((scb_o->ecb & 0x10U) && !had_tx) + prefix_unmapped(vsie_page); + scb_s->ecb |= scb_o->ecb & 0x10U; + } + /* SIMD */ + if (test_kvm_facility(vcpu->kvm, 129)) { + scb_s->eca |= scb_o->eca & 0x00020000U; + scb_s->ecd |= scb_o->ecd & 0x20000000U; + } + /* Run-time-Instrumentation */ + if (test_kvm_facility(vcpu->kvm, 64)) + scb_s->ecb3 |= scb_o->ecb3 & 0x01U; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF)) + scb_s->eca |= scb_o->eca & 0x00000001U; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB)) + scb_s->eca |= scb_o->eca & 0x40000000U; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI)) + scb_s->eca |= scb_o->eca & 0x80000000U; + + prepare_ibc(vcpu, vsie_page); + rc = shadow_crycb(vcpu, vsie_page); +out: + if (rc) + unshadow_scb(vcpu, vsie_page); + return rc; +} + +void 
kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end) +{ + struct kvm *kvm = gmap->private; + struct vsie_page *cur; + unsigned long prefix; + struct page *page; + int i; + + if (!gmap_is_shadow(gmap)) + return; + if (start >= 1UL << 31) + /* We are only interested in prefix pages */ + return; + + /* + * Only new shadow blocks are added to the list during runtime, + * therefore we can safely reference them all the time. + */ + for (i = 0; i < kvm->arch.vsie.page_count; i++) { + page = READ_ONCE(kvm->arch.vsie.pages[i]); + if (!page) + continue; + cur = page_to_virt(page); + if (READ_ONCE(cur->gmap) != gmap) + continue; + prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT; + /* with mso/msl, the prefix lies at an offset */ + prefix += cur->scb_s.mso; + if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1) + prefix_unmapped_sync(cur); + } +} + +/* + * Map the first prefix page and if tx is enabled also the second prefix page. + * + * The prefix will be protected, a gmap notifier will inform about unmaps. + * The shadow scb must not be executed until the prefix is remapped, this is + * guaranteed by properly handling PROG_REQUEST. + * + * Returns: - 0 if successfully mapped or already mapped + * - > 0 if control has to be given to guest 2 + * - -EAGAIN if the caller can retry immediately + * - -ENOMEM if out of memory + */ +static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT; + int rc; + + if (prefix_is_mapped(vsie_page)) + return 0; + + /* mark it as mapped so we can catch any concurrent unmappers */ + prefix_mapped(vsie_page); + + /* with mso/msl, the prefix lies at offset *mso* */ + prefix += scb_s->mso; + + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix); + if (!rc && (scb_s->ecb & 0x10U)) + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, + prefix + PAGE_SIZE); + /* + * We don't have to mprotect, we will be called for all unshadows. + * SIE will detect if protection applies and trigger a validity. + */ + if (rc) + prefix_unmapped(vsie_page); + if (rc > 0 || rc == -EFAULT) + rc = set_validity_icpt(scb_s, 0x0037U); + return rc; +} + +/* + * Pin the guest page given by gpa and set hpa to the pinned host address. + * Will always be pinned writable. + * + * Returns: - 0 on success + * - -EINVAL if the gpa is not valid guest storage + * - -ENOMEM if out of memory + */ +static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) +{ + struct page *page; + hva_t hva; + int rc; + + hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); + if (kvm_is_error_hva(hva)) + return -EINVAL; + rc = get_user_pages_fast(hva, 1, 1, &page); + if (rc < 0) + return rc; + else if (rc != 1) + return -ENOMEM; + *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK); + return 0; +} + +/* Unpins a page previously pinned via pin_guest_page, marking it as dirty.
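pin_guest_page() above resolves the guest physical address to a host virtual address via the memslots (gfn_to_hva), pins that page, and then re-adds the sub-page offset to build the host address stored in the shadow control block. A stand-alone sketch of that offset arithmetic (hypothetical helper names, not from this patch, assuming 4 KiB pages):

#include <stdint.h>

#define SKETCH_PAGE_SIZE 4096ULL
#define SKETCH_PAGE_MASK (~(SKETCH_PAGE_SIZE - 1))

/* Illustrative only: frame number of a guest physical address, and the
 * host address rebuilt from a pinned page plus the sub-page offset. */
static inline uint64_t sketch_gpa_to_gfn(uint64_t gpa)
{
	return gpa >> 12;
}

static inline uint64_t sketch_hpa(uint64_t pinned_page_base, uint64_t gpa)
{
	return pinned_page_base + (gpa & ~SKETCH_PAGE_MASK);
}
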
*/ +static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa) +{ + struct page *page; + + page = virt_to_page(hpa); + set_page_dirty_lock(page); + put_page(page); + /* mark the page always as dirty for migration */ + mark_page_dirty(kvm, gpa_to_gfn(gpa)); +} + +/* unpin all blocks previously pinned by pin_blocks(), marking them dirty */ +static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + hpa_t hpa; + gpa_t gpa; + + hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol; + if (hpa) { + gpa = scb_o->scaol & ~0xfUL; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO)) + gpa |= (u64) scb_o->scaoh << 32; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->scaol = 0; + scb_s->scaoh = 0; + } + + hpa = scb_s->itdba; + if (hpa) { + gpa = scb_o->itdba & ~0xffUL; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->itdba = 0; + } + + hpa = scb_s->gvrd; + if (hpa) { + gpa = scb_o->gvrd & ~0x1ffUL; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->gvrd = 0; + } + + hpa = scb_s->riccbd; + if (hpa) { + gpa = scb_o->riccbd & ~0x3fUL; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->riccbd = 0; + } +} + +/* + * Instead of shadowing some blocks, we can simply forward them because the + * addresses in the scb are 64 bit long. + * + * This works as long as the data lies in one page. If blocks ever exceed one + * page, we have to fall back to shadowing. + * + * As we reuse the sca, the vcpu pointers contained in it are invalid. We must + * therefore not enable any facilities that access these pointers (e.g. SIGPIF). + * + * Returns: - 0 if all blocks were pinned. + * - > 0 if control has to be given to guest 2 + * - -ENOMEM if out of memory + */ +static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + hpa_t hpa; + gpa_t gpa; + int rc = 0; + + gpa = scb_o->scaol & ~0xfUL; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO)) + gpa |= (u64) scb_o->scaoh << 32; + if (gpa) { + if (!(gpa & ~0x1fffUL)) + rc = set_validity_icpt(scb_s, 0x0038U); + else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu)) + rc = set_validity_icpt(scb_s, 0x0011U); + else if ((gpa & PAGE_MASK) != + ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK)) + rc = set_validity_icpt(scb_s, 0x003bU); + if (!rc) { + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x0034U); + } + if (rc) + goto unpin; + scb_s->scaoh = (u32)((u64)hpa >> 32); + scb_s->scaol = (u32)(u64)hpa; + } + + gpa = scb_o->itdba & ~0xffUL; + if (gpa && (scb_s->ecb & 0x10U)) { + if (!(gpa & ~0x1fffU)) { + rc = set_validity_icpt(scb_s, 0x0080U); + goto unpin; + } + /* 256 bytes cannot cross page boundaries */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x0080U); + if (rc) + goto unpin; + scb_s->itdba = hpa; + } + + gpa = scb_o->gvrd & ~0x1ffUL; + if (gpa && (scb_s->eca & 0x00020000U) && + !(scb_s->ecd & 0x20000000U)) { + if (!(gpa & ~0x1fffUL)) { + rc = set_validity_icpt(scb_s, 0x1310U); + goto unpin; + } + /* + * 512 bytes vector registers cannot cross page boundaries + * if this block gets bigger, we have to shadow it. 
+ */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x1310U); + if (rc) + goto unpin; + scb_s->gvrd = hpa; + } + + gpa = scb_o->riccbd & ~0x3fUL; + if (gpa && (scb_s->ecb3 & 0x01U)) { + if (!(gpa & ~0x1fffUL)) { + rc = set_validity_icpt(scb_s, 0x0043U); + goto unpin; + } + /* 64 bytes cannot cross page boundaries */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x0043U); + /* Validity 0x0044 will be checked by SIE */ + if (rc) + goto unpin; + scb_s->riccbd = hpa; + } + return 0; +unpin: + unpin_blocks(vcpu, vsie_page); + return rc; +} + +/* unpin the scb provided by guest 2, marking it as dirty */ +static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, + gpa_t gpa) +{ + hpa_t hpa = (hpa_t) vsie_page->scb_o; + + if (hpa) + unpin_guest_page(vcpu->kvm, gpa, hpa); + vsie_page->scb_o = NULL; +} + +/* + * Pin the scb at gpa provided by guest 2 at vsie_page->scb_o. + * + * Returns: - 0 if the scb was pinned. + * - > 0 if control has to be given to guest 2 + * - -ENOMEM if out of memory + */ +static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, + gpa_t gpa) +{ + hpa_t hpa; + int rc; + + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) { + rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + if (!rc) + rc = 1; + } + if (!rc) + vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa; + return rc; +} + +/* + * Inject a fault into guest 2. + * + * Returns: - > 0 if control has to be given to guest 2 + * < 0 if an error occurred during injection. + */ +static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr, + bool write_flag) +{ + struct kvm_s390_pgm_info pgm = { + .code = code, + .trans_exc_code = + /* 0-51: virtual address */ + (vaddr & 0xfffffffffffff000UL) | + /* 52-53: store / fetch */ + (((unsigned int) !write_flag) + 1) << 10, + /* 62-63: asce id (always primary == 0) */ + .exc_access_id = 0, /* always primary */ + .op_access_id = 0, /* not MVPG */ + }; + int rc; + + if (code == PGM_PROTECTION) + pgm.trans_exc_code |= 0x4UL; + + rc = kvm_s390_inject_prog_irq(vcpu, &pgm); + return rc ? rc : 1; +} + +/* + * Handle a fault during vsie execution on a gmap shadow. + * + * Returns: - 0 if the fault was resolved + * - > 0 if control has to be given to guest 2 + * - < 0 if an error occurred + */ +static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + int rc; + + if (current->thread.gmap_int_code == PGM_PROTECTION) + /* we can directly forward all protection exceptions */ + return inject_fault(vcpu, PGM_PROTECTION, + current->thread.gmap_addr, 1); + + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, + current->thread.gmap_addr); + if (rc > 0) { + rc = inject_fault(vcpu, rc, + current->thread.gmap_addr, + current->thread.gmap_write_flag); + if (rc >= 0) + vsie_page->fault_addr = current->thread.gmap_addr; + } + return rc; +} + +/* + * Retry the previous fault that required guest 2 intervention. This avoids + * one superfluous SIE re-entry and direct exit. + * + * Will ignore any errors. The next SIE fault will do proper fault handling.
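inject_fault() above packs the translation-exception identification by hand: the page-aligned virtual address, a two-bit access indicator at bits 52-53 (1 for a store, 2 for a fetch) and, for a forwarded protection exception, the low protection-indication bit. A stand-alone sketch of that packing (hypothetical helper, not from this patch):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative only: mirror the field packing used for the injected
 * program interrupt's translation-exception identification. */
static inline uint64_t sketch_trans_exc_code(uint64_t vaddr, bool write_flag,
					     bool is_protection)
{
	uint64_t teid = vaddr & 0xfffffffffffff000ULL;	/* bits 0-51: page address */

	teid |= (uint64_t)(write_flag ? 1 : 2) << 10;	/* bits 52-53: store=1 / fetch=2 */
	if (is_protection)
		teid |= 0x4ULL;				/* protection indication */
	return teid;
}
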
+ */ +static void handle_last_fault(struct kvm_vcpu *vcpu, + struct vsie_page *vsie_page) +{ + if (vsie_page->fault_addr) + kvm_s390_shadow_fault(vcpu, vsie_page->gmap, + vsie_page->fault_addr); + vsie_page->fault_addr = 0; +} + +static inline void clear_vsie_icpt(struct vsie_page *vsie_page) +{ + vsie_page->scb_s.icptcode = 0; +} + +/* rewind the psw and clear the vsie icpt, so we can retry execution */ +static void retry_vsie_icpt(struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + int ilen = insn_length(scb_s->ipa >> 8); + + /* take care of EXECUTE instructions */ + if (scb_s->icptstatus & 1) { + ilen = (scb_s->icptstatus >> 4) & 0x6; + if (!ilen) + ilen = 4; + } + scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, ilen); + clear_vsie_icpt(vsie_page); +} + +/* + * Try to shadow + enable the guest 2 provided facility list. + * Retry instruction execution if enabled for and provided by guest 2. + * + * Returns: - 0 if handled (retry or guest 2 icpt) + * - > 0 if control has to be given to guest 2 + */ +static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + __u32 fac = vsie_page->scb_o->fac & 0x7ffffff8U; + + if (fac && test_kvm_facility(vcpu->kvm, 7)) { + retry_vsie_icpt(vsie_page); + if (read_guest_real(vcpu, fac, &vsie_page->fac, + sizeof(vsie_page->fac))) + return set_validity_icpt(scb_s, 0x1090U); + scb_s->fac = (__u32)(__u64) &vsie_page->fac; + } + return 0; +} + +/* + * Run the vsie on a shadow scb and a shadow gmap, without any further + * sanity checks, handling SIE faults. + * + * Returns: - 0 everything went fine + * - > 0 if control has to be given to guest 2 + * - < 0 if an error occurred + */ +static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + int rc; + + handle_last_fault(vcpu, vsie_page); + + if (need_resched()) + schedule(); + if (test_cpu_flag(CIF_MCCK_PENDING)) + s390_handle_mcck(); + + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + local_irq_disable(); + guest_enter_irqoff(); + local_irq_enable(); + + rc = sie64a(scb_s, vcpu->run->s.regs.gprs); + + local_irq_disable(); + guest_exit_irqoff(); + local_irq_enable(); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + + if (rc > 0) + rc = 0; /* we could still have an icpt */ + else if (rc == -EFAULT) + return handle_fault(vcpu, vsie_page); + + switch (scb_s->icptcode) { + case ICPT_INST: + if (scb_s->ipa == 0xb2b0) + rc = handle_stfle(vcpu, vsie_page); + break; + case ICPT_STOP: + /* stop not requested by g2 - must have been a kick */ + if (!(atomic_read(&scb_o->cpuflags) & CPUSTAT_STOP_INT)) + clear_vsie_icpt(vsie_page); + break; + case ICPT_VALIDITY: + if ((scb_s->ipa & 0xf000) != 0xf000) + scb_s->ipa += 0x1000; + break; + } + return rc; +} + +static void release_gmap_shadow(struct vsie_page *vsie_page) +{ + if (vsie_page->gmap) + gmap_put(vsie_page->gmap); + WRITE_ONCE(vsie_page->gmap, NULL); + prefix_unmapped(vsie_page); +} + +static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, + struct vsie_page *vsie_page) +{ + unsigned long asce; + union ctlreg0 cr0; + struct gmap *gmap; + int edat; + + asce = vcpu->arch.sie_block->gcr[1]; + cr0.val = vcpu->arch.sie_block->gcr[0]; + edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8); + edat += edat && test_kvm_facility(vcpu->kvm, 78); + + /* + * ASCE or EDAT could have changed since last icpt, or the gmap + * we're holding 
has been unshadowed. If the gmap is still valid, + * we can safely reuse it. + */ + if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) + return 0; + + /* release the old shadow - if any, and mark the prefix as unmapped */ + release_gmap_shadow(vsie_page); + gmap = gmap_shadow(vcpu->arch.gmap, asce, edat); + if (IS_ERR(gmap)) + return PTR_ERR(gmap); + gmap->private = vcpu->kvm; + WRITE_ONCE(vsie_page->gmap, gmap); + return 0; +} + +/* + * Register the shadow scb at the VCPU, e.g. for kicking out of vsie. + */ +static void register_shadow_scb(struct kvm_vcpu *vcpu, + struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + + WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s); + /* + * External calls have to lead to a kick of the vcpu and + * therefore the vsie -> Simulate Wait state. + */ + atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags); + /* + * We have to adjust the g3 epoch by the g2 epoch. The epoch will + * automatically be adjusted on tod clock changes via kvm_sync_clock. + */ + preempt_disable(); + scb_s->epoch += vcpu->kvm->arch.epoch; + preempt_enable(); +} + +/* + * Unregister a shadow scb from a VCPU. + */ +static void unregister_shadow_scb(struct kvm_vcpu *vcpu) +{ + atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags); + WRITE_ONCE(vcpu->arch.vsie_block, NULL); +} + +/* + * Run the vsie on a shadowed scb, managing the gmap shadow, handling + * prefix pages and faults. + * + * Returns: - 0 if no errors occurred + * - > 0 if control has to be given to guest 2 + * - -ENOMEM if out of memory + */ +static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + int rc = 0; + + while (1) { + rc = acquire_gmap_shadow(vcpu, vsie_page); + if (!rc) + rc = map_prefix(vcpu, vsie_page); + if (!rc) { + gmap_enable(vsie_page->gmap); + update_intervention_requests(vsie_page); + rc = do_vsie_run(vcpu, vsie_page); + gmap_enable(vcpu->arch.gmap); + } + atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20); + + if (rc == -EAGAIN) + rc = 0; + if (rc || scb_s->icptcode || signal_pending(current) || + kvm_s390_vcpu_has_irq(vcpu, 0)) + break; + } + + if (rc == -EFAULT) { + /* + * Addressing exceptions are always presented as intercepts. + * As addressing exceptions are suppressing and our guest 3 PSW + * points at the responsible instruction, we have to + * forward the PSW and set the ilc. If we can't read guest 3 + * instruction, we can use an arbitrary ilc. Let's always use + * ilen = 4 for now, so we can avoid reading in guest 3 virtual + * memory. (we could also fake the shadow so the hardware + * handles it). + */ + scb_s->icptcode = ICPT_PROGI; + scb_s->iprcc = PGM_ADDRESSING; + scb_s->pgmilc = 4; + scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4); + } + return rc; +} + +/* + * Get or create a vsie page for a scb address. + * + * Returns: - address of a vsie page (cached or new one) + * - NULL if the same scb address is already used by another VCPU + * - ERR_PTR(-ENOMEM) if out of memory + */ +static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) +{ + struct vsie_page *vsie_page; + struct page *page; + int nr_vcpus; + + rcu_read_lock(); + page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9); + rcu_read_unlock(); + if (page) { + if (page_ref_inc_return(page) == 2) + return page_to_virt(page); + page_ref_dec(page); + } + + /* + * We want at least #online_vcpus shadows, so every VCPU can execute + * the VSIE in parallel.
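get_vsie_page() above keys its radix tree with the control block origin shifted right by nine bits (the origin is 512-byte aligned, so those bits are always zero) and treats a page reference count of exactly two as "cached and claimed by one user". A stand-alone sketch of those two conventions (hypothetical names, not from this patch, and without the atomics of the real page refcount):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative only: radix-tree index for a 512-byte aligned scb origin. */
static inline uint64_t sketch_vsie_key(uint64_t scb_origin)
{
	return scb_origin >> 9;
}

/* Illustrative only: a count of 1 means cached but unused; the first
 * caller to raise it to 2 owns the entry, everybody else backs off. */
static inline bool sketch_vsie_try_claim(int *refcount)
{
	if (++*refcount == 2)
		return true;
	--*refcount;
	return false;
}
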
+ */ + nr_vcpus = atomic_read(&kvm->online_vcpus); + + mutex_lock(&kvm->arch.vsie.mutex); + if (kvm->arch.vsie.page_count < nr_vcpus) { + page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA); + if (!page) { + mutex_unlock(&kvm->arch.vsie.mutex); + return ERR_PTR(-ENOMEM); + } + page_ref_inc(page); + kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page; + kvm->arch.vsie.page_count++; + } else { + /* reuse an existing entry that belongs to nobody */ + while (true) { + page = kvm->arch.vsie.pages[kvm->arch.vsie.next]; + if (page_ref_inc_return(page) == 2) + break; + page_ref_dec(page); + kvm->arch.vsie.next++; + kvm->arch.vsie.next %= nr_vcpus; + } + radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); + } + page->index = addr; + /* double use of the same address */ + if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) { + page_ref_dec(page); + mutex_unlock(&kvm->arch.vsie.mutex); + return NULL; + } + mutex_unlock(&kvm->arch.vsie.mutex); + + vsie_page = page_to_virt(page); + memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block)); + release_gmap_shadow(vsie_page); + vsie_page->fault_addr = 0; + vsie_page->scb_s.ihcpu = 0xffffU; + return vsie_page; +} + +/* put a vsie page acquired via get_vsie_page */ +static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page) +{ + struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT); + + page_ref_dec(page); +} + +int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) +{ + struct vsie_page *vsie_page; + unsigned long scb_addr; + int rc; + + vcpu->stat.instruction_sie++; + if (!test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIEF2)) + return -EOPNOTSUPP; + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + BUILD_BUG_ON(sizeof(struct vsie_page) != 4096); + scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL); + + /* 512 byte alignment */ + if (unlikely(scb_addr & 0x1ffUL)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0)) + return 0; + + vsie_page = get_vsie_page(vcpu->kvm, scb_addr); + if (IS_ERR(vsie_page)) + return PTR_ERR(vsie_page); + else if (!vsie_page) + /* double use of sie control block - simply do nothing */ + return 0; + + rc = pin_scb(vcpu, vsie_page, scb_addr); + if (rc) + goto out_put; + rc = shadow_scb(vcpu, vsie_page); + if (rc) + goto out_unpin_scb; + rc = pin_blocks(vcpu, vsie_page); + if (rc) + goto out_unshadow; + register_shadow_scb(vcpu, vsie_page); + rc = vsie_run(vcpu, vsie_page); + unregister_shadow_scb(vcpu); + unpin_blocks(vcpu, vsie_page); +out_unshadow: + unshadow_scb(vcpu, vsie_page); +out_unpin_scb: + unpin_scb(vcpu, vsie_page, scb_addr); +out_put: + put_vsie_page(vcpu->kvm, vsie_page); + + return rc < 0 ? rc : 0; +} + +/* Init the vsie data structures. To be called when a vm is initialized. */ +void kvm_s390_vsie_init(struct kvm *kvm) +{ + mutex_init(&kvm->arch.vsie.mutex); + INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL); +} + +/* Destroy the vsie data structures. To be called when a vm is destroyed. 
*/ +void kvm_s390_vsie_destroy(struct kvm *kvm) +{ + struct vsie_page *vsie_page; + struct page *page; + int i; + + mutex_lock(&kvm->arch.vsie.mutex); + for (i = 0; i < kvm->arch.vsie.page_count; i++) { + page = kvm->arch.vsie.pages[i]; + kvm->arch.vsie.pages[i] = NULL; + vsie_page = page_to_virt(page); + release_gmap_shadow(vsie_page); + /* free the radix tree entry */ + radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); + __free_page(page); + } + kvm->arch.vsie.page_count = 0; + mutex_unlock(&kvm->arch.vsie.mutex); +} + +void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_sie_block *scb = READ_ONCE(vcpu->arch.vsie_block); + + /* + * Even if the VCPU lets go of the shadow sie block reference, it is + * still valid in the cache. So we can safely kick it. + */ + if (scb) { + atomic_or(PROG_BLOCK_SIE, &scb->prog20); + if (scb->prog0c & PROG_IN_SIE) + atomic_or(CPUSTAT_STOP_INT, &scb->cpuflags); + } +} diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 19288c1b36d3..730e0d3aa840 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -418,6 +418,8 @@ static inline int do_exception(struct pt_regs *regs, int access) (struct gmap *) S390_lowcore.gmap : NULL; if (gmap) { current->thread.gmap_addr = address; + current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE); + current->thread.gmap_int_code = regs->int_code & 0xffff; address = __gmap_translate(gmap, address); if (address == -EFAULT) { fault = VM_FAULT_BADMAP; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index cace818d86eb..af0ae6d7ac59 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -20,14 +20,16 @@ #include <asm/gmap.h> #include <asm/tlb.h> +#define GMAP_SHADOW_FAKE_TABLE 1ULL + /** - * gmap_alloc - allocate a guest address space + * gmap_alloc - allocate and initialize a guest address space * @mm: pointer to the parent mm_struct * @limit: maximum address of the gmap address space * * Returns a guest address space structure. */ -struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) +static struct gmap *gmap_alloc(unsigned long limit) { struct gmap *gmap; struct page *page; @@ -55,10 +57,14 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) if (!gmap) goto out; INIT_LIST_HEAD(&gmap->crst_list); + INIT_LIST_HEAD(&gmap->children); + INIT_LIST_HEAD(&gmap->pt_list); INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL); INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC); + INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC); spin_lock_init(&gmap->guest_table_lock); - gmap->mm = mm; + spin_lock_init(&gmap->shadow_lock); + atomic_set(&gmap->ref_count, 1); page = alloc_pages(GFP_KERNEL, 2); if (!page) goto out_free; @@ -70,9 +76,6 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) gmap->asce = atype | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | __pa(table); gmap->asce_end = limit; - down_write(&mm->mmap_sem); - list_add(&gmap->list, &mm->context.gmap_list); - up_write(&mm->mmap_sem); return gmap; out_free: @@ -80,7 +83,28 @@ out_free: out: return NULL; } -EXPORT_SYMBOL_GPL(gmap_alloc); + +/** + * gmap_create - create a guest address space + * @mm: pointer to the parent mm_struct + * @limit: maximum size of the gmap address space + * + * Returns a guest address space structure. 
+ */ +struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit) +{ + struct gmap *gmap; + + gmap = gmap_alloc(limit); + if (!gmap) + return NULL; + gmap->mm = mm; + spin_lock(&mm->context.gmap_lock); + list_add_rcu(&gmap->list, &mm->context.gmap_list); + spin_unlock(&mm->context.gmap_lock); + return gmap; +} +EXPORT_SYMBOL_GPL(gmap_create); static void gmap_flush_tlb(struct gmap *gmap) { @@ -114,31 +138,117 @@ static void gmap_radix_tree_free(struct radix_tree_root *root) } while (nr > 0); } +static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) +{ + struct gmap_rmap *rmap, *rnext, *head; + struct radix_tree_iter iter; + unsigned long indices[16]; + unsigned long index; + void **slot; + int i, nr; + + /* A radix tree is freed by deleting all of its entries */ + index = 0; + do { + nr = 0; + radix_tree_for_each_slot(slot, root, &iter, index) { + indices[nr] = iter.index; + if (++nr == 16) + break; + } + for (i = 0; i < nr; i++) { + index = indices[i]; + head = radix_tree_delete(root, index); + gmap_for_each_rmap_safe(rmap, rnext, head) + kfree(rmap); + } + } while (nr > 0); +} + /** * gmap_free - free a guest address space * @gmap: pointer to the guest address space structure + * + * No locks required. There are no references to this gmap anymore. */ -void gmap_free(struct gmap *gmap) +static void gmap_free(struct gmap *gmap) { struct page *page, *next; - /* Flush tlb. */ - if (MACHINE_HAS_IDTE) - __tlb_flush_asce(gmap->mm, gmap->asce); - else - __tlb_flush_global(); - + /* Flush tlb of all gmaps (if not already done for shadows) */ + if (!(gmap_is_shadow(gmap) && gmap->removed)) + gmap_flush_tlb(gmap); /* Free all segment & region tables. */ list_for_each_entry_safe(page, next, &gmap->crst_list, lru) __free_pages(page, 2); gmap_radix_tree_free(&gmap->guest_to_host); gmap_radix_tree_free(&gmap->host_to_guest); - down_write(&gmap->mm->mmap_sem); - list_del(&gmap->list); - up_write(&gmap->mm->mmap_sem); + + /* Free additional data for a shadow gmap */ + if (gmap_is_shadow(gmap)) { + /* Free all page tables. */ + list_for_each_entry_safe(page, next, &gmap->pt_list, lru) + page_table_free_pgste(page); + gmap_rmap_radix_tree_free(&gmap->host_to_rmap); + /* Release reference to the parent */ + gmap_put(gmap->parent); + } + kfree(gmap); } -EXPORT_SYMBOL_GPL(gmap_free); + +/** + * gmap_get - increase reference counter for guest address space + * @gmap: pointer to the guest address space structure + * + * Returns the gmap pointer + */ +struct gmap *gmap_get(struct gmap *gmap) +{ + atomic_inc(&gmap->ref_count); + return gmap; +} +EXPORT_SYMBOL_GPL(gmap_get); + +/** + * gmap_put - decrease reference counter for guest address space + * @gmap: pointer to the guest address space structure + * + * If the reference counter reaches zero the guest address space is freed. 
+ */ +void gmap_put(struct gmap *gmap) +{ + if (atomic_dec_return(&gmap->ref_count) == 0) + gmap_free(gmap); +} +EXPORT_SYMBOL_GPL(gmap_put); + +/** + * gmap_remove - remove a guest address space but do not free it yet + * @gmap: pointer to the guest address space structure + */ +void gmap_remove(struct gmap *gmap) +{ + struct gmap *sg, *next; + + /* Remove all shadow gmaps linked to this gmap */ + if (!list_empty(&gmap->children)) { + spin_lock(&gmap->shadow_lock); + list_for_each_entry_safe(sg, next, &gmap->children, list) { + list_del(&sg->list); + gmap_put(sg); + } + spin_unlock(&gmap->shadow_lock); + } + /* Remove gmap from the per-mm list */ + spin_lock(&gmap->mm->context.gmap_lock); + list_del_rcu(&gmap->list); + spin_unlock(&gmap->mm->context.gmap_lock); + synchronize_rcu(); + /* Put reference */ + gmap_put(gmap); +} +EXPORT_SYMBOL_GPL(gmap_remove); /** * gmap_enable - switch primary space to the guest address space @@ -160,6 +270,17 @@ void gmap_disable(struct gmap *gmap) } EXPORT_SYMBOL_GPL(gmap_disable); +/** + * gmap_get_enabled - get a pointer to the currently enabled gmap + * + * Returns a pointer to the currently enabled gmap. 0 if none is enabled. + */ +struct gmap *gmap_get_enabled(void) +{ + return (struct gmap *) S390_lowcore.gmap; +} +EXPORT_SYMBOL_GPL(gmap_get_enabled); + /* * gmap_alloc_table is assumed to be called with mmap_sem held */ @@ -175,7 +296,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, return -ENOMEM; new = (unsigned long *) page_to_phys(page); crst_table_init(new, init); - spin_lock(&gmap->mm->page_table_lock); + spin_lock(&gmap->guest_table_lock); if (*table & _REGION_ENTRY_INVALID) { list_add(&page->lru, &gmap->crst_list); *table = (unsigned long) new | _REGION_ENTRY_LENGTH | @@ -183,7 +304,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, page->index = gaddr; page = NULL; } - spin_unlock(&gmap->mm->page_table_lock); + spin_unlock(&gmap->guest_table_lock); if (page) __free_pages(page, 2); return 0; @@ -219,6 +340,7 @@ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) unsigned long *entry; int flush = 0; + BUG_ON(gmap_is_shadow(gmap)); spin_lock(&gmap->guest_table_lock); entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); if (entry) { @@ -258,6 +380,7 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) unsigned long off; int flush; + BUG_ON(gmap_is_shadow(gmap)); if ((to | len) & (PMD_SIZE - 1)) return -EINVAL; if (len == 0 || to + len < to) @@ -289,6 +412,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from, unsigned long off; int flush; + BUG_ON(gmap_is_shadow(gmap)); if ((from | to | len) & (PMD_SIZE - 1)) return -EINVAL; if (len == 0 || from + len < from || to + len < to || @@ -326,6 +450,8 @@ EXPORT_SYMBOL_GPL(gmap_map_segment); * This function does not establish potentially missing page table entries. * The mmap_sem of the mm that belongs to the address space must be held * when this function gets called. + * + * Note: Can also be called for shadow gmaps. */ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) { @@ -333,6 +459,7 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); + /* Note: guest_to_host is empty for a shadow gmap */ return vmaddr ?
(vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; } EXPORT_SYMBOL_GPL(__gmap_translate); @@ -369,11 +496,13 @@ void gmap_unlink(struct mm_struct *mm, unsigned long *table, struct gmap *gmap; int flush; - list_for_each_entry(gmap, &mm->context.gmap_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { flush = __gmap_unlink_by_vmaddr(gmap, vmaddr); if (flush) gmap_flush_tlb(gmap); } + rcu_read_unlock(); } /** @@ -397,6 +526,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) pmd_t *pmd; int rc; + BUG_ON(gmap_is_shadow(gmap)); /* Create higher level tables in the gmap page table */ table = gmap->table; if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { @@ -549,116 +679,1412 @@ static LIST_HEAD(gmap_notifier_list); static DEFINE_SPINLOCK(gmap_notifier_lock); /** - * gmap_register_ipte_notifier - register a pte invalidation callback + * gmap_register_pte_notifier - register a pte invalidation callback * @nb: pointer to the gmap notifier block */ -void gmap_register_ipte_notifier(struct gmap_notifier *nb) +void gmap_register_pte_notifier(struct gmap_notifier *nb) { spin_lock(&gmap_notifier_lock); - list_add(&nb->list, &gmap_notifier_list); + list_add_rcu(&nb->list, &gmap_notifier_list); spin_unlock(&gmap_notifier_lock); } -EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier); +EXPORT_SYMBOL_GPL(gmap_register_pte_notifier); /** - * gmap_unregister_ipte_notifier - remove a pte invalidation callback + * gmap_unregister_pte_notifier - remove a pte invalidation callback * @nb: pointer to the gmap notifier block */ -void gmap_unregister_ipte_notifier(struct gmap_notifier *nb) +void gmap_unregister_pte_notifier(struct gmap_notifier *nb) { spin_lock(&gmap_notifier_lock); - list_del_init(&nb->list); + list_del_rcu(&nb->list); spin_unlock(&gmap_notifier_lock); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier); + +/** + * gmap_call_notifier - call all registered invalidation callbacks + * @gmap: pointer to guest mapping meta data structure + * @start: start virtual address in the guest address space + * @end: end virtual address in the guest address space + */ +static void gmap_call_notifier(struct gmap *gmap, unsigned long start, + unsigned long end) +{ + struct gmap_notifier *nb; + + list_for_each_entry(nb, &gmap_notifier_list, list) + nb->notifier_call(gmap, start, end); +} + +/** + * gmap_table_walk - walk the gmap page tables + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @level: page table level to stop at + * + * Returns a table entry pointer for the given guest address and @level + * @level=0 : returns a pointer to a page table entry (or NULL) + * @level=1 : returns a pointer to a segment table entry (or NULL) + * @level=2 : returns a pointer to a region-3 table entry (or NULL) + * @level=3 : returns a pointer to a region-2 table entry (or NULL) + * @level=4 : returns a pointer to a region-1 table entry (or NULL) + * + * Returns NULL if the gmap page tables could not be walked to the + * requested level. + * + * Note: Can also be called for shadow gmaps.
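gmap_table_walk() below indexes one table per level with fixed guest-address bit ranges: 11 bits each for the region-1, region-2, region-3 and segment tables, and 8 bits for the final page table. A stand-alone sketch of the index extraction (hypothetical helper, not from this patch):

#include <stdint.h>

/* Illustrative only: index into the table at @level (4 = region-1 down to
 * 1 = segment, 0 = page table), mirroring the shifts used in the walk. */
static inline unsigned int sketch_walk_index(uint64_t gaddr, int level)
{
	static const unsigned int shift[] = { 12, 20, 31, 42, 53 };
	static const unsigned int mask[] = { 0xff, 0x7ff, 0x7ff, 0x7ff, 0x7ff };

	return (unsigned int)(gaddr >> shift[level]) & mask[level];
}
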
+ */ +static inline unsigned long *gmap_table_walk(struct gmap *gmap, + unsigned long gaddr, int level) +{ + unsigned long *table; + + if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4)) + return NULL; + if (gmap_is_shadow(gmap) && gmap->removed) + return NULL; + if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11))) + return NULL; + table = gmap->table; + switch (gmap->asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: + table += (gaddr >> 53) & 0x7ff; + if (level == 4) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* Fallthrough */ + case _ASCE_TYPE_REGION2: + table += (gaddr >> 42) & 0x7ff; + if (level == 3) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* Fallthrough */ + case _ASCE_TYPE_REGION3: + table += (gaddr >> 31) & 0x7ff; + if (level == 2) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* Fallthrough */ + case _ASCE_TYPE_SEGMENT: + table += (gaddr >> 20) & 0x7ff; + if (level == 1) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); + table += (gaddr >> 12) & 0xff; + } + return table; +} + +/** + * gmap_pte_op_walk - walk the gmap page table, get the page table lock + * and return the pte pointer + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @ptl: pointer to the spinlock pointer + * + * Returns a pointer to the locked pte for a guest address, or NULL + * + * Note: Can also be called for shadow gmaps. + */ +static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, + spinlock_t **ptl) +{ + unsigned long *table; + + if (gmap_is_shadow(gmap)) + spin_lock(&gmap->guest_table_lock); + /* Walk the gmap page table, lock and get pte pointer */ + table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */ + if (!table || *table & _SEGMENT_ENTRY_INVALID) { + if (gmap_is_shadow(gmap)) + spin_unlock(&gmap->guest_table_lock); + return NULL; + } + if (gmap_is_shadow(gmap)) { + *ptl = &gmap->guest_table_lock; + return pte_offset_map((pmd_t *) table, gaddr); + } + return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl); +} + +/** + * gmap_pte_op_fixup - force a page in and connect the gmap page table + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @vmaddr: address in the host process address space + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * + * Returns 0 if the caller can retry __gmap_translate (might fail again), + * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing + * up or connecting the gmap page table. + */ +static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, + unsigned long vmaddr, int prot) +{ + struct mm_struct *mm = gmap->mm; + unsigned int fault_flags; + bool unlocked = false; + + BUG_ON(gmap_is_shadow(gmap)); + fault_flags = (prot == PROT_WRITE) ? 
FAULT_FLAG_WRITE : 0; + if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked)) + return -EFAULT; + if (unlocked) + /* lost mmap_sem, caller has to retry __gmap_translate */ + return 0; + /* Connect the page tables */ + return __gmap_link(gmap, gaddr, vmaddr); } -EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier); /** - * gmap_ipte_notify - mark a range of ptes for invalidation notification + * gmap_pte_op_end - release the page table lock + * @ptl: pointer to the spinlock pointer + */ +static void gmap_pte_op_end(spinlock_t *ptl) +{ + spin_unlock(ptl); +} + +/* + * gmap_protect_range - remove access rights to memory and set pgste bits * @gmap: pointer to guest mapping meta data structure * @gaddr: virtual address in the guest address space * @len: size of area + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: pgste notification bits to set * - * Returns 0 if for each page in the given range a gmap mapping exists and - * the invalidation notification could be set. If the gmap mapping is missing - * for one or more pages -EFAULT is returned. If no memory could be allocated - * -ENOMEM is returned. This function establishes missing page table entries. + * Returns 0 if successfully protected, -ENOMEM if out of memory and + * -EFAULT if gaddr is invalid (or mapping for shadows is missing). + * + * Called with sg->mm->mmap_sem in read. + * + * Note: Can also be called for shadow gmaps. */ -int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len) +static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, + unsigned long len, int prot, unsigned long bits) { - unsigned long addr; + unsigned long vmaddr; spinlock_t *ptl; pte_t *ptep; - bool unlocked; - int rc = 0; + int rc; + + while (len) { + rc = -EAGAIN; + ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); + if (ptep) { + rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits); + gmap_pte_op_end(ptl); + } + if (rc) { + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) + return vmaddr; + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot); + if (rc) + return rc; + continue; + } + gaddr += PAGE_SIZE; + len -= PAGE_SIZE; + } + return 0; +} + +/** + * gmap_mprotect_notify - change access rights for a range of ptes and + * call the notifier if any pte changes again + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @len: size of area + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * + * Returns 0 if for each page in the given range a gmap mapping exists, + * the new access rights could be set and the notifier could be armed. + * If the gmap mapping is missing for one or more pages -EFAULT is + * returned. If no memory could be allocated -ENOMEM is returned. + * This function establishes missing page table entries. 
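gmap_protect_range() above does not advance to the next page after a failed walk: it translates the guest address, faults the backing page in via gmap_pte_op_fixup(), and retries the same page. A condensed stand-alone sketch of that control flow (hypothetical helpers standing in for the pte walk/protect and the fixup step, not from this patch):

#include <errno.h>

/* Illustrative only: hypothetical stand-ins for the two steps above. */
static int sketch_protect_one(unsigned long gaddr);	/* -EAGAIN if no pte */
static int sketch_fault_in_and_link(unsigned long gaddr);

static int sketch_protect_range(unsigned long gaddr, unsigned long len)
{
	int rc;

	while (len) {
		rc = sketch_protect_one(gaddr);
		if (rc == -EAGAIN) {
			rc = sketch_fault_in_and_link(gaddr);
			if (rc)
				return rc;
			continue;		/* retry the same page */
		}
		if (rc)
			return rc;
		gaddr += 4096;
		len -= 4096;
	}
	return 0;
}
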
+ */ +int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, + unsigned long len, int prot) +{ + int rc; - if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK)) + if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap)) + return -EINVAL; + if (!MACHINE_HAS_ESOP && prot == PROT_READ) return -EINVAL; down_read(&gmap->mm->mmap_sem); - while (len) { - unlocked = false; - /* Convert gmap address and connect the page tables */ - addr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(addr)) { - rc = addr; + rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT); + up_read(&gmap->mm->mmap_sem); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_mprotect_notify); + +/** + * gmap_read_table - get an unsigned long value from a guest page table using + * absolute addressing, without marking the page referenced. + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @val: pointer to the unsigned long value to return + * + * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT + * if reading using the virtual address failed. + * + * Called with gmap->mm->mmap_sem in read. + */ +int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) +{ + unsigned long address, vmaddr; + spinlock_t *ptl; + pte_t *ptep, pte; + int rc; + + while (1) { + rc = -EAGAIN; + ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); + if (ptep) { + pte = *ptep; + if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { + address = pte_val(pte) & PAGE_MASK; + address += gaddr & ~PAGE_MASK; + *val = *(unsigned long *) address; + pte_val(*ptep) |= _PAGE_YOUNG; + /* Do *NOT* clear the _PAGE_INVALID bit! */ + rc = 0; + } + gmap_pte_op_end(ptl); + } + if (!rc) + break; + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; break; } - /* Get the page mapped */ - if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE, - &unlocked)) { - rc = -EFAULT; + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ); + if (rc) break; + } + return rc; +} +EXPORT_SYMBOL_GPL(gmap_read_table); + +/** + * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree + * @sg: pointer to the shadow guest address space structure + * @vmaddr: vm address associated with the rmap + * @rmap: pointer to the rmap structure + * + * Called with the sg->guest_table_lock + */ +static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, + struct gmap_rmap *rmap) +{ + void **slot; + + BUG_ON(!gmap_is_shadow(sg)); + slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); + if (slot) { + rmap->next = radix_tree_deref_slot_protected(slot, + &sg->guest_table_lock); + radix_tree_replace_slot(slot, rmap); + } else { + rmap->next = NULL; + radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, + rmap); + } +} + +/** + * gmap_protect_rmap - modify access rights to memory and create an rmap + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow gmap + * @paddr: address in the parent guest address space + * @len: length of the memory area to protect + * @prot: indicates access rights: none, read-only or read-write + * + * Returns 0 if successfully protected and the rmap was created, -ENOMEM + * if out of memory and -EFAULT if paddr is invalid. 
+ */ +static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, + unsigned long paddr, unsigned long len, int prot) +{ + struct gmap *parent; + struct gmap_rmap *rmap; + unsigned long vmaddr; + spinlock_t *ptl; + pte_t *ptep; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + parent = sg->parent; + while (len) { + vmaddr = __gmap_translate(parent, paddr); + if (IS_ERR_VALUE(vmaddr)) + return vmaddr; + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); + if (!rmap) + return -ENOMEM; + rmap->raddr = raddr; + rc = radix_tree_preload(GFP_KERNEL); + if (rc) { + kfree(rmap); + return rc; + } + rc = -EAGAIN; + ptep = gmap_pte_op_walk(parent, paddr, &ptl); + if (ptep) { + spin_lock(&sg->guest_table_lock); + rc = ptep_force_prot(parent->mm, paddr, ptep, prot, + PGSTE_VSIE_BIT); + if (!rc) + gmap_insert_rmap(sg, vmaddr, rmap); + spin_unlock(&sg->guest_table_lock); + gmap_pte_op_end(ptl); } - /* While trying to map mmap_sem got unlocked. Let us retry */ - if (unlocked) + radix_tree_preload_end(); + if (rc) { + kfree(rmap); + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); + if (rc) + return rc; continue; - rc = __gmap_link(gmap, gaddr, addr); + } + paddr += PAGE_SIZE; + len -= PAGE_SIZE; + } + return 0; +} + +#define _SHADOW_RMAP_MASK 0x7 +#define _SHADOW_RMAP_REGION1 0x5 +#define _SHADOW_RMAP_REGION2 0x4 +#define _SHADOW_RMAP_REGION3 0x3 +#define _SHADOW_RMAP_SEGMENT 0x2 +#define _SHADOW_RMAP_PGTABLE 0x1 + +/** + * gmap_idte_one - invalidate a single region or segment table entry + * @asce: region or segment table *origin* + table-type bits + * @vaddr: virtual address to identify the table entry to flush + * + * The invalid bit of a single region or segment table entry is set + * and the associated TLB entries depending on the entry are flushed. + * The table-type of the @asce identifies the portion of the @vaddr + * that is used as the invalidation index. 
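The _SHADOW_RMAP_* values defined above are carried in the otherwise-zero low bits of the recorded rmap address, so that when a protected parent page changes, the invalidation path can tell which kind of shadow table entry the rmap points back to. A stand-alone sketch of that tagging (hypothetical helpers, not from this patch):

#include <stdint.h>

#define SKETCH_RMAP_MASK 0x7ULL		/* mirrors _SHADOW_RMAP_MASK */

/* Illustrative only: pack a level tag into the aligned rmap address and
 * recover it again on the invalidation path. */
static inline uint64_t sketch_rmap_make(uint64_t raddr, unsigned int tag)
{
	return (raddr & ~SKETCH_RMAP_MASK) | tag;
}

static inline unsigned int sketch_rmap_level(uint64_t raddr)
{
	return (unsigned int)(raddr & SKETCH_RMAP_MASK);
}
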
+ */ +static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr) +{ + asm volatile( + " .insn rrf,0xb98e0000,%0,%1,0,0" + : : "a" (asce), "a" (vaddr) : "cc", "memory"); +} + +/** + * gmap_unshadow_page - remove a page from a shadow page table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) +{ + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ + if (!table || *table & _PAGE_INVALID) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1); + ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); +} + +/** + * __gmap_unshadow_pgt - remove all entries from a shadow page table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @pgt: pointer to the start of a shadow page table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, + unsigned long *pgt) +{ + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < 256; i++, raddr += 1UL << 12) + pgt[i] = _PAGE_INVALID; +} + +/** + * gmap_unshadow_pgt - remove a shadow page table from a segment entry + * @sg: pointer to the shadow guest address space structure + * @raddr: address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) +{ + unsigned long sto, *ste, *pgt; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ + if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1); + sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff)); + gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); + pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); + *ste = _SEGMENT_ENTRY_EMPTY; + __gmap_unshadow_pgt(sg, raddr, pgt); + /* Free page table */ + page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); + list_del(&page->lru); + page_table_free_pgste(page); +} + +/** + * __gmap_unshadow_sgt - remove all entries from a shadow segment table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @sgt: pointer to the start of a shadow segment table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, + unsigned long *sgt) +{ + unsigned long asce, *pgt; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT; + for (i = 0; i < 2048; i++, raddr += 1UL << 20) { + if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) + continue; + pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); + sgt[i] = _SEGMENT_ENTRY_EMPTY; + __gmap_unshadow_pgt(sg, raddr, pgt); + /* Free page table */ + page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); + list_del(&page->lru); + page_table_free_pgste(page); + } +} + +/** + * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the shadow->guest_table_lock + */ +static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) +{ + unsigned long r3o, *r3e, *sgt; + struct page *page; + + 
BUG_ON(!gmap_is_shadow(sg)); + r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */ + if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1); + r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff)); + gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr); + sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); + *r3e = _REGION3_ENTRY_EMPTY; + __gmap_unshadow_sgt(sg, raddr, sgt); + /* Free segment table */ + page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); +} + +/** + * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table + * @sg: pointer to the shadow guest address space structure + * @raddr: address in the shadow guest address space + * @r3t: pointer to the start of a shadow region-3 table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, + unsigned long *r3t) +{ + unsigned long asce, *sgt; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) r3t | _ASCE_TYPE_REGION3; + for (i = 0; i < 2048; i++, raddr += 1UL << 31) { + if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) + continue; + sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); + r3t[i] = _REGION3_ENTRY_EMPTY; + __gmap_unshadow_sgt(sg, raddr, sgt); + /* Free segment table */ + page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); + } +} + +/** + * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) +{ + unsigned long r2o, *r2e, *r3t; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */ + if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1); + r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff)); + gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr); + r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); + *r2e = _REGION2_ENTRY_EMPTY; + __gmap_unshadow_r3t(sg, raddr, r3t); + /* Free region 3 table */ + page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); +} + +/** + * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @r2t: pointer to the start of a shadow region-2 table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, + unsigned long *r2t) +{ + unsigned long asce, *r3t; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) r2t | _ASCE_TYPE_REGION2; + for (i = 0; i < 2048; i++, raddr += 1UL << 42) { + if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) + continue; + r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); + r2t[i] = _REGION2_ENTRY_EMPTY; + __gmap_unshadow_r3t(sg, raddr, r3t); + /* Free region 3 table */ + page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); + } +} + +/** + * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * 
Called with the sg->guest_table_lock + */ +static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) +{ + unsigned long r1o, *r1e, *r2t; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ + if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1); + r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff)); + gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr); + r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); + *r1e = _REGION1_ENTRY_EMPTY; + __gmap_unshadow_r2t(sg, raddr, r2t); + /* Free region 2 table */ + page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); +} + +/** + * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @r1t: pointer to the start of a shadow region-1 table + * + * Called with the shadow->guest_table_lock + */ +static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, + unsigned long *r1t) +{ + unsigned long asce, *r2t; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; + for (i = 0; i < 2048; i++, raddr += 1UL << 53) { + if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) + continue; + r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); + __gmap_unshadow_r2t(sg, raddr, r2t); + /* Clear entry and flush translation r1t -> r2t */ + gmap_idte_one(asce, raddr); + r1t[i] = _REGION1_ENTRY_EMPTY; + /* Free region 2 table */ + page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); + } +} + +/** + * gmap_unshadow - remove a shadow page table completely + * @sg: pointer to the shadow guest address space structure + * + * Called with sg->guest_table_lock + */ +static void gmap_unshadow(struct gmap *sg) +{ + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + if (sg->removed) + return; + sg->removed = 1; + gmap_call_notifier(sg, 0, -1UL); + gmap_flush_tlb(sg); + table = (unsigned long *)(sg->asce & _ASCE_ORIGIN); + switch (sg->asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: + __gmap_unshadow_r1t(sg, 0, table); + break; + case _ASCE_TYPE_REGION2: + __gmap_unshadow_r2t(sg, 0, table); + break; + case _ASCE_TYPE_REGION3: + __gmap_unshadow_r3t(sg, 0, table); + break; + case _ASCE_TYPE_SEGMENT: + __gmap_unshadow_sgt(sg, 0, table); + break; + } +} + +/** + * gmap_find_shadow - find a specific asce in the list of shadow tables + * @parent: pointer to the parent gmap + * @asce: ASCE for which the shadow table is created + * @edat_level: edat level to be used for the shadow translation + * + * Returns the pointer to a gmap if a shadow table with the given asce is + * already available, ERR_PTR(-EAGAIN) if another one is just being created, + * otherwise NULL + */ +static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, + int edat_level) +{ + struct gmap *sg; + + list_for_each_entry(sg, &parent->children, list) { + if (sg->orig_asce != asce || sg->edat_level != edat_level || + sg->removed) + continue; + if (!sg->initialized) + return ERR_PTR(-EAGAIN); + atomic_inc(&sg->ref_count); + return sg; + } + return NULL; +} + +/** + * gmap_shadow_valid - check if a shadow guest address space matches the + * given properties and is still valid + * @sg: pointer to the shadow guest address space structure + * @asce: ASCE for which the shadow table is requested + 
* @edat_level: edat level to be used for the shadow translation + * + * Returns 1 if the gmap shadow is still valid and matches the given + * properties, the caller can continue using it. Returns 0 otherwise, the + * caller has to request a new shadow gmap in this case. + * + */ +int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) +{ + if (sg->removed) + return 0; + return sg->orig_asce == asce && sg->edat_level == edat_level; +} +EXPORT_SYMBOL_GPL(gmap_shadow_valid); + +/** + * gmap_shadow - create/find a shadow guest address space + * @parent: pointer to the parent gmap + * @asce: ASCE for which the shadow table is created + * @edat_level: edat level to be used for the shadow translation + * + * The pages of the top level page table referred by the asce parameter + * will be set to read-only and marked in the PGSTEs of the kvm process. + * The shadow table will be removed automatically on any change to the + * PTE mapping for the source table. + * + * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, + * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the + * parent gmap table could not be protected. + */ +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, + int edat_level) +{ + struct gmap *sg, *new; + unsigned long limit; + int rc; + + BUG_ON(gmap_is_shadow(parent)); + spin_lock(&parent->shadow_lock); + sg = gmap_find_shadow(parent, asce, edat_level); + spin_unlock(&parent->shadow_lock); + if (sg) + return sg; + /* Create a new shadow gmap */ + limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); + if (asce & _ASCE_REAL_SPACE) + limit = -1UL; + new = gmap_alloc(limit); + if (!new) + return ERR_PTR(-ENOMEM); + new->mm = parent->mm; + new->parent = gmap_get(parent); + new->orig_asce = asce; + new->edat_level = edat_level; + new->initialized = false; + spin_lock(&parent->shadow_lock); + /* Recheck if another CPU created the same shadow */ + sg = gmap_find_shadow(parent, asce, edat_level); + if (sg) { + spin_unlock(&parent->shadow_lock); + gmap_free(new); + return sg; + } + if (asce & _ASCE_REAL_SPACE) { + /* only allow one real-space gmap shadow */ + list_for_each_entry(sg, &parent->children, list) { + if (sg->orig_asce & _ASCE_REAL_SPACE) { + spin_lock(&sg->guest_table_lock); + gmap_unshadow(sg); + spin_unlock(&sg->guest_table_lock); + list_del(&sg->list); + gmap_put(sg); + break; + } + } + } + atomic_set(&new->ref_count, 2); + list_add(&new->list, &parent->children); + if (asce & _ASCE_REAL_SPACE) { + /* nothing to protect, return right away */ + new->initialized = true; + spin_unlock(&parent->shadow_lock); + return new; + } + spin_unlock(&parent->shadow_lock); + /* protect after insertion, so it will get properly invalidated */ + down_read(&parent->mm->mmap_sem); + rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, + ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096, + PROT_READ, PGSTE_VSIE_BIT); + up_read(&parent->mm->mmap_sem); + spin_lock(&parent->shadow_lock); + new->initialized = true; + if (rc) { + list_del(&new->list); + gmap_free(new); + new = ERR_PTR(rc); + } + spin_unlock(&parent->shadow_lock); + return new; +} +EXPORT_SYMBOL_GPL(gmap_shadow); + +/** + * gmap_shadow_r2t - create an empty shadow region 2 table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @r2t: parent gmap address of the region 2 table to get shadowed + * @fake: r2t references contiguous guest memory block, not a r2t + * + * The r2t parameter specifies the 
address of the source table. The + * four pages of the source table are made read-only in the parent gmap + * address space. A write to the source table area @r2t will automatically + * remove the shadow r2 table and all of its descendants. + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *s_r2t, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + /* Allocate a shadow region second table */ + page = alloc_pages(GFP_KERNEL, 2); + if (!page) + return -ENOMEM; + page->index = r2t & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_r2t = (unsigned long *) page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= (r2t & _REGION_ENTRY_PROTECT); + list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make r2t read-only in parent gmap page table */ + raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1; + origin = r2t & _REGION_ENTRY_ORIGIN; + offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096; + len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 4); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != + (unsigned long) s_r2t) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_r2t(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, 2); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_r2t); + +/** + * gmap_shadow_r3t - create a shadow region 3 table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @r3t: parent gmap address of the region 3 table to get shadowed + * @fake: r3t references contiguous guest memory block, not a r3t + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. 
+ */ +int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *s_r3t, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + /* Allocate a shadow region third table */ + page = alloc_pages(GFP_KERNEL, 2); + if (!page) + return -ENOMEM; + page->index = r3t & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_r3t = (unsigned long *) page_to_phys(page); + /* Install shadow region third table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= (r3t & _REGION_ENTRY_PROTECT); + list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make r3t read-only in parent gmap page table */ + raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2; + origin = r3t & _REGION_ENTRY_ORIGIN; + offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096; + len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 3); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != + (unsigned long) s_r3t) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_r3t(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, 2); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_r3t); + +/** + * gmap_shadow_sgt - create a shadow segment table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @sgt: parent gmap address of the segment table to get shadowed + * @fake: sgt references contiguous guest memory block, not a sgt + * + * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. 
+ */ +int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *s_sgt, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); + /* Allocate a shadow segment table */ + page = alloc_pages(GFP_KERNEL, 2); + if (!page) + return -ENOMEM; + page->index = sgt & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_sgt = (unsigned long *) page_to_phys(page); + /* Install shadow segment table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= sgt & _REGION_ENTRY_PROTECT; + list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make sgt read-only in parent gmap page table */ + raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3; + origin = sgt & _REGION_ENTRY_ORIGIN; + offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096; + len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 2); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != + (unsigned long) s_sgt) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_sgt(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, 2); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_sgt); + +/** + * gmap_shadow_pgt_lookup - find a shadow page table + * @sg: pointer to the shadow guest address space structure + * @saddr: the address in the shadow guest address space + * @pgt: parent gmap address of the page table to get shadowed + * @dat_protection: if the pgtable is marked as protected by dat + * @fake: pgt references contiguous guest memory block, not a pgtable + * + * Returns 0 if the shadow page table was found and -EAGAIN if the page + * table was not found. + * + * Called with sg->mm->mmap_sem in read. 
+ */ +int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, + unsigned long *pgt, int *dat_protection, + int *fake) +{ + unsigned long *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ + if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { + /* Shadow page tables are full pages (pte+pgste) */ + page = pfn_to_page(*table >> PAGE_SHIFT); + *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE; + *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); + *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE); + rc = 0; + } else { + rc = -EAGAIN; + } + spin_unlock(&sg->guest_table_lock); + return rc; + +} +EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); + +/** + * gmap_shadow_pgt - instantiate a shadow page table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pgt: parent gmap address of the page table to get shadowed + * @fake: pgt references contiguous guest memory block, not a pgtable + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, + int fake) +{ + unsigned long raddr, origin; + unsigned long *s_pgt, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); + /* Allocate a shadow page table */ + page = page_table_alloc_pgste(sg->mm); + if (!page) + return -ENOMEM; + page->index = pgt & _SEGMENT_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_pgt = (unsigned long *) page_to_phys(page); + /* Install shadow page table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _SEGMENT_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _SEGMENT_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | + (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; + list_add(&page->lru, &sg->pt_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_SEGMENT_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make pgt read-only in parent gmap page table (not the pgste) */ + raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT; + origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; + rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 1); + if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != + (unsigned long) s_pgt) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_SEGMENT_ENTRY_INVALID; + } else { + gmap_unshadow_pgt(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + page_table_free_pgste(page); + return rc; + +} +EXPORT_SYMBOL_GPL(gmap_shadow_pgt); + +/** + * gmap_shadow_page - create a shadow page mapping + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting 
address in the shadow gmap + * @pte: pte in parent gmap address space to get shadowed + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) +{ + struct gmap *parent; + struct gmap_rmap *rmap; + unsigned long vmaddr, paddr; + spinlock_t *ptl; + pte_t *sptep, *tptep; + int prot; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + parent = sg->parent; + prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE; + + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); + if (!rmap) + return -ENOMEM; + rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE; + + while (1) { + paddr = pte_val(pte) & PAGE_MASK; + vmaddr = __gmap_translate(parent, paddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + break; + } + rc = radix_tree_preload(GFP_KERNEL); if (rc) break; - /* Walk the process page table, lock and get pte pointer */ - ptep = get_locked_pte(gmap->mm, addr, &ptl); - VM_BUG_ON(!ptep); - /* Set notification bit in the pgste of the pte */ - if ((pte_val(*ptep) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) { - ptep_set_notify(gmap->mm, addr, ptep); - gaddr += PAGE_SIZE; - len -= PAGE_SIZE; + rc = -EAGAIN; + sptep = gmap_pte_op_walk(parent, paddr, &ptl); + if (sptep) { + spin_lock(&sg->guest_table_lock); + /* Get page table pointer */ + tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); + if (!tptep) { + spin_unlock(&sg->guest_table_lock); + gmap_pte_op_end(ptl); + radix_tree_preload_end(); + break; + } + rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte); + if (rc > 0) { + /* Success and a new mapping */ + gmap_insert_rmap(sg, vmaddr, rmap); + rmap = NULL; + rc = 0; + } + gmap_pte_op_end(ptl); + spin_unlock(&sg->guest_table_lock); } - pte_unmap_unlock(ptep, ptl); + radix_tree_preload_end(); + if (!rc) + break; + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); + if (rc) + break; } - up_read(&gmap->mm->mmap_sem); + kfree(rmap); return rc; } -EXPORT_SYMBOL_GPL(gmap_ipte_notify); +EXPORT_SYMBOL_GPL(gmap_shadow_page); + +/** + * gmap_shadow_notify - handle notifications for shadow gmap + * + * Called with sg->parent->shadow_lock. + */ +static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, + unsigned long offset, pte_t *pte) +{ + struct gmap_rmap *rmap, *rnext, *head; + unsigned long gaddr, start, end, bits, raddr; + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + spin_lock(&sg->parent->guest_table_lock); + table = radix_tree_lookup(&sg->parent->host_to_guest, + vmaddr >> PMD_SHIFT); + gaddr = table ? 
__gmap_segment_gaddr(table) + offset : 0; + spin_unlock(&sg->parent->guest_table_lock); + if (!table) + return; + + spin_lock(&sg->guest_table_lock); + if (sg->removed) { + spin_unlock(&sg->guest_table_lock); + return; + } + /* Check for top level table */ + start = sg->orig_asce & _ASCE_ORIGIN; + end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096; + if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start && + gaddr < end) { + /* The complete shadow table has to go */ + gmap_unshadow(sg); + spin_unlock(&sg->guest_table_lock); + list_del(&sg->list); + gmap_put(sg); + return; + } + /* Remove the page table tree from on specific entry */ + head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12); + gmap_for_each_rmap_safe(rmap, rnext, head) { + bits = rmap->raddr & _SHADOW_RMAP_MASK; + raddr = rmap->raddr ^ bits; + switch (bits) { + case _SHADOW_RMAP_REGION1: + gmap_unshadow_r2t(sg, raddr); + break; + case _SHADOW_RMAP_REGION2: + gmap_unshadow_r3t(sg, raddr); + break; + case _SHADOW_RMAP_REGION3: + gmap_unshadow_sgt(sg, raddr); + break; + case _SHADOW_RMAP_SEGMENT: + gmap_unshadow_pgt(sg, raddr); + break; + case _SHADOW_RMAP_PGTABLE: + gmap_unshadow_page(sg, raddr); + break; + } + kfree(rmap); + } + spin_unlock(&sg->guest_table_lock); +} /** * ptep_notify - call all invalidation callbacks for a specific pte. * @mm: pointer to the process mm_struct * @addr: virtual address in the process address space * @pte: pointer to the page table entry + * @bits: bits from the pgste that caused the notify call * * This function is assumed to be called with the page table lock held * for the pte to notify. */ -void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte) +void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, + pte_t *pte, unsigned long bits) { unsigned long offset, gaddr; unsigned long *table; - struct gmap_notifier *nb; - struct gmap *gmap; + struct gmap *gmap, *sg, *next; offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); offset = offset * (4096 / sizeof(pte_t)); - spin_lock(&gmap_notifier_lock); - list_for_each_entry(gmap, &mm->context.gmap_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { + spin_lock(&gmap->shadow_lock); + list_for_each_entry_safe(sg, next, + &gmap->children, list) + gmap_shadow_notify(sg, vmaddr, offset, pte); + spin_unlock(&gmap->shadow_lock); + } + if (!(bits & PGSTE_IN_BIT)) + continue; + spin_lock(&gmap->guest_table_lock); table = radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); - if (!table) - continue; - gaddr = __gmap_segment_gaddr(table) + offset; - list_for_each_entry(nb, &gmap_notifier_list, list) - nb->notifier_call(gmap, gaddr); + if (table) + gaddr = __gmap_segment_gaddr(table) + offset; + spin_unlock(&gmap->guest_table_lock); + if (table) + gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1); } - spin_unlock(&gmap_notifier_lock); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ptep_notify); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index e8b5962ac12a..9c57a295a045 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -137,6 +137,29 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) return new; } +#ifdef CONFIG_PGSTE + +struct page *page_table_alloc_pgste(struct mm_struct *mm) +{ + struct page *page; + unsigned long *table; + + page = alloc_page(GFP_KERNEL|__GFP_REPEAT); + if (page) { + table = (unsigned long *) page_to_phys(page); + 
clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); + clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2); + } + return page; +} + +void page_table_free_pgste(struct page *page) +{ + __free_page(page); +} + +#endif /* CONFIG_PGSTE */ + /* * page table entry allocation/free routines. */ @@ -149,7 +172,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) /* Try to get a fragment of a 4K page as a 2K page table */ if (!mm_alloc_pgste(mm)) { table = NULL; - spin_lock_bh(&mm->context.list_lock); + spin_lock_bh(&mm->context.pgtable_lock); if (!list_empty(&mm->context.pgtable_list)) { page = list_first_entry(&mm->context.pgtable_list, struct page, lru); @@ -164,7 +187,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) list_del(&page->lru); } } - spin_unlock_bh(&mm->context.list_lock); + spin_unlock_bh(&mm->context.pgtable_lock); if (table) return table; } @@ -187,9 +210,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm) /* Return the first 2K fragment of the page */ atomic_set(&page->_mapcount, 1); clear_table(table, _PAGE_INVALID, PAGE_SIZE); - spin_lock_bh(&mm->context.list_lock); + spin_lock_bh(&mm->context.pgtable_lock); list_add(&page->lru, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.list_lock); + spin_unlock_bh(&mm->context.pgtable_lock); } return table; } @@ -203,13 +226,13 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) if (!mm_alloc_pgste(mm)) { /* Free 2K page table fragment of a 4K page */ bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.list_lock); + spin_lock_bh(&mm->context.pgtable_lock); mask = atomic_xor_bits(&page->_mapcount, 1U << bit); if (mask & 3) list_add(&page->lru, &mm->context.pgtable_list); else list_del(&page->lru); - spin_unlock_bh(&mm->context.list_lock); + spin_unlock_bh(&mm->context.pgtable_lock); if (mask != 0) return; } @@ -235,13 +258,13 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, return; } bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.list_lock); + spin_lock_bh(&mm->context.pgtable_lock); mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit); if (mask & 3) list_add_tail(&page->lru, &mm->context.pgtable_list); else list_del(&page->lru); - spin_unlock_bh(&mm->context.list_lock); + spin_unlock_bh(&mm->context.pgtable_lock); table = (unsigned long *) (__pa(table) | (1U << bit)); tlb_remove_table(tlb, table); } diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index fa286d0c0f2d..293130b5aee7 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -179,14 +179,17 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) return pgste; } -static inline pgste_t pgste_ipte_notify(struct mm_struct *mm, - unsigned long addr, - pte_t *ptep, pgste_t pgste) +static inline pgste_t pgste_pte_notify(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, pgste_t pgste) { #ifdef CONFIG_PGSTE - if (pgste_val(pgste) & PGSTE_IN_BIT) { - pgste_val(pgste) &= ~PGSTE_IN_BIT; - ptep_notify(mm, addr, ptep); + unsigned long bits; + + bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT); + if (bits) { + pgste_val(pgste) ^= bits; + ptep_notify(mm, addr, ptep, bits); } #endif return pgste; @@ -199,7 +202,7 @@ static inline pgste_t ptep_xchg_start(struct mm_struct *mm, if (mm_has_pgste(mm)) { pgste = pgste_get_lock(ptep); - pgste = pgste_ipte_notify(mm, addr, ptep, pgste); + pgste = pgste_pte_notify(mm, addr, ptep, pgste); } return pgste; } @@ -414,6 +417,90 @@ void 
ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep) pgste_set_unlock(ptep, pgste); } +/** + * ptep_force_prot - change access rights of a locked pte + * @mm: pointer to the process mm_struct + * @addr: virtual address in the guest address space + * @ptep: pointer to the page table entry + * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bit: pgste bit to set (e.g. for notification) + * + * Returns 0 if the access rights were changed and -EAGAIN if the current + * and requested access rights are incompatible. + */ +int ptep_force_prot(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int prot, unsigned long bit) +{ + pte_t entry; + pgste_t pgste; + int pte_i, pte_p; + + pgste = pgste_get_lock(ptep); + entry = *ptep; + /* Check pte entry after all locks have been acquired */ + pte_i = pte_val(entry) & _PAGE_INVALID; + pte_p = pte_val(entry) & _PAGE_PROTECT; + if ((pte_i && (prot != PROT_NONE)) || + (pte_p && (prot & PROT_WRITE))) { + pgste_set_unlock(ptep, pgste); + return -EAGAIN; + } + /* Change access rights and set pgste bit */ + if (prot == PROT_NONE && !pte_i) { + ptep_flush_direct(mm, addr, ptep); + pgste = pgste_update_all(entry, pgste, mm); + pte_val(entry) |= _PAGE_INVALID; + } + if (prot == PROT_READ && !pte_p) { + ptep_flush_direct(mm, addr, ptep); + pte_val(entry) &= ~_PAGE_INVALID; + pte_val(entry) |= _PAGE_PROTECT; + } + pgste_val(pgste) |= bit; + pgste = pgste_set_pte(ptep, pgste, entry); + pgste_set_unlock(ptep, pgste); + return 0; +} + +int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, + pte_t *sptep, pte_t *tptep, pte_t pte) +{ + pgste_t spgste, tpgste; + pte_t spte, tpte; + int rc = -EAGAIN; + + if (!(pte_val(*tptep) & _PAGE_INVALID)) + return 0; /* already shadowed */ + spgste = pgste_get_lock(sptep); + spte = *sptep; + if (!(pte_val(spte) & _PAGE_INVALID) && + !((pte_val(spte) & _PAGE_PROTECT) && + !(pte_val(pte) & _PAGE_PROTECT))) { + pgste_val(spgste) |= PGSTE_VSIE_BIT; + tpgste = pgste_get_lock(tptep); + pte_val(tpte) = (pte_val(spte) & PAGE_MASK) | + (pte_val(pte) & _PAGE_PROTECT); + /* don't touch the storage key - it belongs to parent pgste */ + tpgste = pgste_set_pte(tptep, tpgste, tpte); + pgste_set_unlock(tptep, tpgste); + rc = 1; + } + pgste_set_unlock(sptep, spgste); + return rc; +} + +void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) +{ + pgste_t pgste; + + pgste = pgste_get_lock(ptep); + /* notifier is called by the caller */ + ptep_flush_direct(mm, saddr, ptep); + /* don't touch the storage key - it belongs to parent pgste */ + pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID)); + pgste_set_unlock(ptep, pgste); +} + static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) { if (!non_swap_entry(entry)) @@ -483,7 +570,7 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) pgste_val(pgste) &= ~PGSTE_UC_BIT; pte = *ptep; if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { - pgste = pgste_ipte_notify(mm, addr, ptep, pgste); + pgste = pgste_pte_notify(mm, addr, ptep, pgste); __ptep_ipte(addr, ptep); if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) pte_val(pte) |= _PAGE_PROTECT; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 360c5171ea1a..7a628fb6a2c2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -598,6 +598,7 @@ struct kvm_vcpu_arch { u64 mcg_cap; u64 mcg_status; u64 mcg_ctl; + u64 mcg_ext_ctl; u64 *mce_banks; /* Cache MMIO info */ @@ -1008,6 
+1009,8 @@ struct kvm_x86_ops { int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc); void (*cancel_hv_timer)(struct kvm_vcpu *vcpu); + + void (*setup_mce)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { @@ -1082,6 +1085,8 @@ extern u64 kvm_max_tsc_scaling_ratio; /* 1ull << kvm_tsc_scaling_ratio_frac_bits */ extern u64 kvm_default_tsc_scaling_ratio; +extern u64 kvm_mce_cap_supported; + enum emulation_result { EMULATE_DONE, /* no further processing */ EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index fdc05ae08bac..22a6474af220 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1349,41 +1349,52 @@ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); +static void cancel_hv_tscdeadline(struct kvm_lapic *apic) +{ + kvm_x86_ops->cancel_hv_timer(apic->vcpu); + apic->lapic_timer.hv_timer_in_use = false; +} + void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; WARN_ON(!apic->lapic_timer.hv_timer_in_use); WARN_ON(swait_active(&vcpu->wq)); - kvm_x86_ops->cancel_hv_timer(vcpu); - apic->lapic_timer.hv_timer_in_use = false; + cancel_hv_tscdeadline(apic); apic_timer_expired(apic); } EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); +static bool start_hv_tscdeadline(struct kvm_lapic *apic) +{ + u64 tscdeadline = apic->lapic_timer.tscdeadline; + + if (atomic_read(&apic->lapic_timer.pending) || + kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { + if (apic->lapic_timer.hv_timer_in_use) + cancel_hv_tscdeadline(apic); + } else { + apic->lapic_timer.hv_timer_in_use = true; + hrtimer_cancel(&apic->lapic_timer.timer); + + /* In case the sw timer triggered in the window */ + if (atomic_read(&apic->lapic_timer.pending)) + cancel_hv_tscdeadline(apic); + } + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, + apic->lapic_timer.hv_timer_in_use); + return apic->lapic_timer.hv_timer_in_use; +} + void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; WARN_ON(apic->lapic_timer.hv_timer_in_use); - if (apic_lvtt_tscdeadline(apic) && - !atomic_read(&apic->lapic_timer.pending)) { - u64 tscdeadline = apic->lapic_timer.tscdeadline; - - if (!kvm_x86_ops->set_hv_timer(vcpu, tscdeadline)) { - apic->lapic_timer.hv_timer_in_use = true; - hrtimer_cancel(&apic->lapic_timer.timer); - - /* In case the sw timer triggered in the window */ - if (atomic_read(&apic->lapic_timer.pending)) { - apic->lapic_timer.hv_timer_in_use = false; - kvm_x86_ops->cancel_hv_timer(apic->vcpu); - } - } - trace_kvm_hv_timer_state(vcpu->vcpu_id, - apic->lapic_timer.hv_timer_in_use); - } + if (apic_lvtt_tscdeadline(apic)) + start_hv_tscdeadline(apic); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); @@ -1395,8 +1406,7 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) if (!apic->lapic_timer.hv_timer_in_use) return; - kvm_x86_ops->cancel_hv_timer(vcpu); - apic->lapic_timer.hv_timer_in_use = false; + cancel_hv_tscdeadline(apic); if (atomic_read(&apic->lapic_timer.pending)) return; @@ -1451,15 +1461,7 @@ static void start_apic_timer(struct kvm_lapic *apic) ktime_to_ns(ktime_add_ns(now, apic->lapic_timer.period))); } else if (apic_lvtt_tscdeadline(apic)) { - /* lapic timer in tsc deadline mode */ - u64 tscdeadline = apic->lapic_timer.tscdeadline; - - if (kvm_x86_ops->set_hv_timer && - !kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { - apic->lapic_timer.hv_timer_in_use = true; - 
trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, - apic->lapic_timer.hv_timer_in_use); - } else + if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic))) start_sw_tscdeadline(apic); } } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5ff292778110..5bfdbbf1ce79 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4935,6 +4935,12 @@ out: static void svm_handle_external_intr(struct kvm_vcpu *vcpu) { local_irq_enable(); + /* + * We must have an instruction with interrupts enabled, so + * the timer interrupt isn't delayed by the interrupt shadow. + */ + asm("nop"); + local_irq_disable(); } static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e185649fb8b7..e51503063181 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -428,7 +428,6 @@ struct nested_vmx { struct pi_desc *pi_desc; bool pi_pending; u16 posted_intr_nv; - u64 msr_ia32_feature_control; struct hrtimer preemption_timer; bool preemption_timer_expired; @@ -612,6 +611,14 @@ struct vcpu_vmx { bool guest_pkru_valid; u32 guest_pkru; u32 host_pkru; + + /* + * Only bits masked by msr_ia32_feature_control_valid_bits can be set in + * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included + * in msr_ia32_feature_control_valid_bits. + */ + u64 msr_ia32_feature_control; + u64 msr_ia32_feature_control_valid_bits; }; enum segment_cache_field { @@ -1105,7 +1112,7 @@ static inline bool cpu_has_broken_vmx_preemption_timer(void) /* Clear the reserved bits */ eax &= ~(0x3U << 14 | 0xfU << 28); - for (i = 0; i < sizeof(vmx_preemption_cpu_tfms)/sizeof(u32); i++) + for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) if (eax == vmx_preemption_cpu_tfms[i]) return true; @@ -1114,9 +1121,6 @@ static inline bool cpu_has_broken_vmx_preemption_timer(void) static inline bool cpu_has_vmx_preemption_timer(void) { - if (cpu_has_broken_vmx_preemption_timer()) - return false; - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VMX_PREEMPTION_TIMER; } @@ -1668,6 +1672,11 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) __vmcs_writel(field, __vmcs_readl(field) | mask); } +static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx) +{ + vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS); +} + static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) { vmcs_write32(VM_ENTRY_CONTROLS, val); @@ -1696,6 +1705,11 @@ static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); } +static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx) +{ + vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS); +} + static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) { vmcs_write32(VM_EXIT_CONTROLS, val); @@ -2928,6 +2942,14 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) return 0; } +static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, + uint64_t val) +{ + uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits; + + return !(val & ~valid_bits); +} + /* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. 
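The feature-control handling in the VMX hunks above boils down to a simple mask test: a guest write to IA32_FEATURE_CONTROL is accepted only if it sets no bits outside msr_ia32_feature_control_valid_bits, and the mask itself is widened or narrowed as nested VMX and LMCE support are enabled for the vCPU. The standalone sketch below is not part of the patch; it redefines the constants locally (bit positions assumed to match the architectural MSR layout: LOCKED = bit 0, VMXON outside SMX = bit 2, LMCE = bit 20) so it compiles on its own and is purely illustrative.

/* Illustrative sketch of the valid-bits gate on IA32_FEATURE_CONTROL writes. */
#include <stdint.h>
#include <stdio.h>

#define FEATURE_CONTROL_LOCKED				(1ULL << 0)
#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX	(1ULL << 2)
#define FEATURE_CONTROL_LMCE				(1ULL << 20)

/* same shape as the new vmx_feature_control_msr_valid(): reject any write
 * that sets a bit outside the per-vCPU valid_bits mask */
static int feature_control_write_valid(uint64_t valid_bits, uint64_t val)
{
	return !(val & ~valid_bits);
}

int main(void)
{
	/* FEATURE_CONTROL_LOCKED is always part of the mask */
	uint64_t valid_bits = FEATURE_CONTROL_LOCKED;

	printf("%d\n", feature_control_write_valid(valid_bits,
			FEATURE_CONTROL_LOCKED));			/* 1: accepted */
	printf("%d\n", feature_control_write_valid(valid_bits,
			FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE));/* 0: rejected */

	/* what vmx_cpuid_update()/vmx_setup_mce() conceptually do: widen the
	 * mask once nested VMX resp. MCG_LMCE_P become available */
	valid_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
	valid_bits |= FEATURE_CONTROL_LMCE;
	printf("%d\n", feature_control_write_valid(valid_bits,
			FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE));/* 1: accepted */
	return 0;
}

Run as-is it prints 1, 0, 1: the locked-only write is accepted, the LMCE write is rejected until the mask has been widened, and accepted afterwards.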
@@ -2969,10 +2991,15 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; - case MSR_IA32_FEATURE_CONTROL: - if (!nested_vmx_allowed(vcpu)) + case MSR_IA32_MCG_EXT_CTL: + if (!msr_info->host_initiated && + !(to_vmx(vcpu)->msr_ia32_feature_control & + FEATURE_CONTROL_LMCE)) return 1; - msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control; + msr_info->data = vcpu->arch.mcg_ext_ctl; + break; + case MSR_IA32_FEATURE_CONTROL: + msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control; break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) @@ -3062,12 +3089,20 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC_ADJUST: ret = kvm_set_msr_common(vcpu, msr_info); break; + case MSR_IA32_MCG_EXT_CTL: + if ((!msr_info->host_initiated && + !(to_vmx(vcpu)->msr_ia32_feature_control & + FEATURE_CONTROL_LMCE)) || + (data & ~MCG_EXT_CTL_LMCE_EN)) + return 1; + vcpu->arch.mcg_ext_ctl = data; + break; case MSR_IA32_FEATURE_CONTROL: - if (!nested_vmx_allowed(vcpu) || - (to_vmx(vcpu)->nested.msr_ia32_feature_control & + if (!vmx_feature_control_msr_valid(vcpu, data) || + (to_vmx(vcpu)->msr_ia32_feature_control & FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) return 1; - vmx->nested.msr_ia32_feature_control = data; + vmx->msr_ia32_feature_control = data; if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); break; @@ -3362,12 +3397,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmx_capability.ept, vmx_capability.vpid); } - min = VM_EXIT_SAVE_DEBUG_CONTROLS; + min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | - VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS; + VM_EXIT_CLEAR_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; @@ -3379,9 +3414,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) &_pin_based_exec_control) < 0) return -EIO; + if (cpu_has_broken_vmx_preemption_timer()) + _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; if (!(_cpu_based_2nd_exec_control & - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || - !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; min = VM_ENTRY_LOAD_DEBUG_CONTROLS; @@ -6471,6 +6507,8 @@ static __init int hardware_setup(void) kvm_set_posted_intr_wakeup_handler(wakeup_handler); + kvm_mce_cap_supported |= MCG_LMCE_P; + return alloc_kvm_area(); out8: @@ -6939,7 +6977,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES) + if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) != VMXON_NEEDED_FEATURES) { kvm_inject_gp(vcpu, 0); return 1; @@ -8012,6 +8050,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); case EXIT_REASON_PCOMMIT: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT); + case EXIT_REASON_PREEMPTION_TIMER: + return false; default: return true; } @@ -8545,7 +8585,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) "push %[sp]\n\t" #endif "pushf\n\t" - "orl $0x200, (%%" _ASM_SP ")\n\t" __ASM_SIZE(push) " $%c[cs]\n\t" "call *%[entry]\n\t" : @@ -8558,8 +8597,7 @@ static void vmx_handle_external_intr(struct 
kvm_vcpu *vcpu) [ss]"i"(__KERNEL_DS), [cs]"i"(__KERNEL_CS) ); - } else - local_irq_enable(); + } } static bool vmx_has_high_real_mode_segbase(void) @@ -9056,6 +9094,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vmcs; } + vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; + return &vmx->vcpu; free_vmcs: @@ -9203,6 +9243,13 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) vmx->nested.nested_vmx_secondary_ctls_high &= ~SECONDARY_EXEC_PCOMMIT; } + + if (nested_vmx_allowed(vcpu)) + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + else + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= + ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -9759,9 +9806,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(VMCS_LINK_POINTER, -1ull); exec_control = vmcs12->pin_based_vm_exec_control; - exec_control |= vmcs_config.pin_based_exec_ctrl; + + /* Preemption timer setting is only taken from vmcs01. */ exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + exec_control |= vmcs_config.pin_based_exec_ctrl; + if (vmx->hv_deadline_tsc == -1) + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + /* Posted interrupts setting is only taken from vmcs12. */ if (nested_cpu_has_posted_intr(vmcs12)) { /* * Note that we use L0's vector here and in @@ -10680,8 +10732,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs12->vm_exit_intr_error_code, KVM_ISA_VMX); - vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS)); - vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS)); + vm_entry_controls_reset_shadow(vmx); + vm_exit_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); /* if no vmcs02 cache requested, remove the one we used */ @@ -10690,8 +10742,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, load_vmcs12_host_state(vcpu, vmcs12); - /* Update TSC_OFFSET if TSC was changed while L2 ran */ + /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); + if (vmx->hv_deadline_tsc == -1) + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + else + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); /* This is needed for same reason as it was needed in prepare_vmcs02 */ vmx->host_rsp = 0; @@ -10793,9 +10851,9 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift, static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 tscl = rdtsc(), delta_tsc; - - delta_tsc = guest_deadline_tsc - kvm_read_l1_tsc(vcpu, tscl); + u64 tscl = rdtsc(); + u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); + u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; /* Convert to host delta tsc if tsc scaling is enabled */ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && @@ -11087,6 +11145,16 @@ out: return ret; } +static void vmx_setup_mce(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.mcg_cap & MCG_LMCE_P) + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + FEATURE_CONTROL_LMCE; + else + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= + ~FEATURE_CONTROL_LMCE; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -11216,6 +11284,8 @@ static struct kvm_x86_ops 
vmx_x86_ops = { .set_hv_timer = vmx_set_hv_timer, .cancel_hv_timer = vmx_cancel_hv_timer, #endif + + .setup_mce = vmx_setup_mce, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 299219630c94..0cc6cf834cdd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -70,7 +70,8 @@ #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 -#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) +u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; +EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); #define emul_to_vcpu(ctxt) \ container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) @@ -984,6 +985,7 @@ static u32 emulated_msrs[] = { MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, + MSR_IA32_MCG_EXT_CTL, MSR_IA32_SMBASE, }; @@ -1163,7 +1165,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) int version; int r; struct pvclock_wall_clock wc; - struct timespec boot; + struct timespec64 boot; if (!wall_clock) return; @@ -1186,13 +1188,13 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) * wall clock specified here. guest system time equals host * system time for us, thus we must fill in host boot time here. */ - getboottime(&boot); + getboottime64(&boot); if (kvm->arch.kvmclock_offset) { - struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset); - boot = timespec_sub(boot, ts); + struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset); + boot = timespec64_sub(boot, ts); } - wc.sec = boot.tv_sec; + wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */ wc.nsec = boot.tv_nsec; wc.version = version; @@ -2685,11 +2687,9 @@ long kvm_arch_dev_ioctl(struct file *filp, break; } case KVM_X86_GET_MCE_CAP_SUPPORTED: { - u64 mce_cap; - - mce_cap = KVM_MCE_CAP_SUPPORTED; r = -EFAULT; - if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) + if (copy_to_user(argp, &kvm_mce_cap_supported, + sizeof(kvm_mce_cap_supported))) goto out; r = 0; break; @@ -2872,7 +2872,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, r = -EINVAL; if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) goto out; - if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) + if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) goto out; r = 0; vcpu->arch.mcg_cap = mcg_cap; @@ -2882,6 +2882,9 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, /* Init IA32_MCi_CTL to all 1s */ for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; + + if (kvm_x86_ops->setup_mce) + kvm_x86_ops->setup_mce(vcpu); out: return r; } @@ -6655,7 +6658,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) trace_kvm_entry(vcpu->vcpu_id); wait_lapic_expire(vcpu); - __kvm_guest_enter(); + guest_enter_irqoff(); if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); @@ -6706,16 +6709,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) ++vcpu->stat.exits; - /* - * We must have an instruction between local_irq_enable() and - * kvm_guest_exit(), so the timer interrupt isn't delayed by - * the interrupt shadow. The stat.exits increment will do nicely. 
- * But we need to prevent reordering, hence this barrier(): - */ - barrier(); - - kvm_guest_exit(); + guest_exit_irqoff(); + local_irq_enable(); preempt_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index d259274238db..ff4a32d24d56 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -84,7 +84,8 @@ static inline void context_tracking_init(void) { } #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static inline void guest_enter(void) +/* must be called with irqs disabled */ +static inline void guest_enter_irqoff(void) { if (vtime_accounting_cpu_enabled()) vtime_guest_enter(current); @@ -93,9 +94,19 @@ static inline void guest_enter(void) if (context_tracking_is_enabled()) __context_tracking_enter(CONTEXT_GUEST); + + /* KVM does not hold any references to rcu protected data when it + * switches CPU into a guest mode. In fact switching to a guest mode + * is very similar to exiting to userspace from rcu point of view. In + * addition CPU may stay in a guest mode for quite a long time (up to + * one time slice). Lets treat guest mode as quiescent state, just like + * we do with user-mode execution. + */ + if (!context_tracking_cpu_is_enabled()) + rcu_virt_note_context_switch(smp_processor_id()); } -static inline void guest_exit(void) +static inline void guest_exit_irqoff(void) { if (context_tracking_is_enabled()) __context_tracking_exit(CONTEXT_GUEST); @@ -107,7 +118,7 @@ static inline void guest_exit(void) } #else -static inline void guest_enter(void) +static inline void guest_enter_irqoff(void) { /* * This is running in ioctl context so its safe @@ -116,9 +127,10 @@ static inline void guest_enter(void) */ vtime_account_system(current); current->flags |= PF_VCPU; + rcu_virt_note_context_switch(smp_processor_id()); } -static inline void guest_exit(void) +static inline void guest_exit_irqoff(void) { /* Flush the guest cputime we spent on the guest */ vtime_account_system(current); @@ -126,4 +138,22 @@ static inline void guest_exit(void) } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ +static inline void guest_enter(void) +{ + unsigned long flags; + + local_irq_save(flags); + guest_enter_irqoff(); + local_irq_restore(flags); +} + +static inline void guest_exit(void) +{ + unsigned long flags; + + local_irq_save(flags); + guest_exit_irqoff(); + local_irq_restore(flags); +} + #endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0640ee92b978..66b2f6159aad 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -875,45 +875,6 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm, } #endif -/* must be called with irqs disabled */ -static inline void __kvm_guest_enter(void) -{ - guest_enter(); - /* KVM does not hold any references to rcu protected data when it - * switches CPU into a guest mode. In fact switching to a guest mode - * is very similar to exiting to userspace from rcu point of view. In - * addition CPU may stay in a guest mode for quite a long time (up to - * one time slice). Lets treat guest mode as quiescent state, just like - * we do with user-mode execution. 
- */ - if (!context_tracking_cpu_is_enabled()) - rcu_virt_note_context_switch(smp_processor_id()); -} - -/* must be called with irqs disabled */ -static inline void __kvm_guest_exit(void) -{ - guest_exit(); -} - -static inline void kvm_guest_enter(void) -{ - unsigned long flags; - - local_irq_save(flags); - __kvm_guest_enter(); - local_irq_restore(flags); -} - -static inline void kvm_guest_exit(void) -{ - unsigned long flags; - - local_irq_save(flags); - __kvm_guest_exit(); - local_irq_restore(flags); -} - /* * search_memslots() and __gfn_to_memslot() are here because they are * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c. diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 8b5e0a9f2431..610e13271918 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -124,6 +124,15 @@ static inline int page_ref_sub_and_test(struct page *page, int nr) return ret; } +static inline int page_ref_inc_return(struct page *page) +{ + int ret = atomic_inc_return(&page->_refcount); + + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return)) + __page_ref_mod_and_return(page, 1, ret); + return ret; +} + static inline int page_ref_dec_and_test(struct page *page) { int ret = atomic_dec_and_test(&page->_refcount); @@ -720,6 +720,7 @@ retry: } return 0; } +EXPORT_SYMBOL_GPL(fixup_user_fault); static __always_inline long __get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ef54b4c31792..154b9ab459b0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1442,6 +1442,52 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) return true; } +static int hva_to_pfn_remapped(struct vm_area_struct *vma, + unsigned long addr, bool *async, + bool write_fault, kvm_pfn_t *p_pfn) +{ + unsigned long pfn; + int r; + + r = follow_pfn(vma, addr, &pfn); + if (r) { + /* + * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does + * not call the fault handler, so do it here. + */ + bool unlocked = false; + r = fixup_user_fault(current, current->mm, addr, + (write_fault ? FAULT_FLAG_WRITE : 0), + &unlocked); + if (unlocked) + return -EAGAIN; + if (r) + return r; + + r = follow_pfn(vma, addr, &pfn); + if (r) + return r; + + } + + + /* + * Get a reference here because callers of *hva_to_pfn* and + * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the + * returned pfn. This is only needed if the VMA has VM_MIXEDMAP + * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will + * simply do nothing for reserved pfns. + * + * Whoever called remap_pfn_range is also going to call e.g. + * unmap_mapping_range before the underlying pages are freed, + * causing a call to our MMU notifier. + */ + kvm_get_pfn(pfn); + + *p_pfn = pfn; + return 0; +} + /* * Pin guest page in memory and return its pfn. 
* @addr: host virtual address which maps memory to the guest @@ -1461,7 +1507,7 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, { struct vm_area_struct *vma; kvm_pfn_t pfn = 0; - int npages; + int npages, r; /* we can do it either atomically or asynchronously, not both */ BUG_ON(atomic && async); @@ -1483,14 +1529,17 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, goto exit; } +retry: vma = find_vma_intersection(current->mm, addr, addr + 1); if (vma == NULL) pfn = KVM_PFN_ERR_FAULT; - else if ((vma->vm_flags & VM_PFNMAP)) { - pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + - vma->vm_pgoff; - BUG_ON(!kvm_is_reserved_pfn(pfn)); + else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { + r = hva_to_pfn_remapped(vma, addr, async, write_fault, &pfn); + if (r == -EAGAIN) + goto retry; + if (r < 0) + pfn = KVM_PFN_ERR_FAULT; } else { if (async && vma_is_valid(vma, write_fault)) *async = true; |
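The remapped-VMA path that closes the series follows a retry idiom: hva_to_pfn_remapped() reports -EAGAIN when fixup_user_fault() had to drop mmap_sem (unlocked set), because the vma pointer the caller cached may no longer be valid, and hva_to_pfn() reacts by jumping back to retry: and redoing find_vma_intersection(). The sketch below only illustrates that shape in plain userspace C; lookup_vma(), fault_in() and resolve_pfn() are hypothetical stand-ins, not kernel APIs.

/* Illustrative stand-alone sketch of the retry pattern used by
 * hva_to_pfn()/hva_to_pfn_remapped(); none of this is kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define EAGAIN 11

struct vma { int generation; };		/* toy stand-in for vm_area_struct */

static int current_generation;		/* bumped whenever "the lock" is dropped */
static bool first_fault = true;

/* stand-in for find_vma_intersection(): only valid for the current generation */
static struct vma lookup_vma(void)
{
	struct vma v = { .generation = current_generation };
	return v;
}

/* stand-in for fixup_user_fault(): the first call has to "drop the lock",
 * which invalidates any previously looked-up VMA */
static int fault_in(bool *unlocked)
{
	if (first_fault) {
		first_fault = false;
		current_generation++;
		*unlocked = true;
	}
	return 0;
}

/* stand-in for hva_to_pfn_remapped(): -EAGAIN means "your VMA went stale
 * while the page was faulted in, look it up again" */
static int resolve_pfn(const struct vma *v, unsigned long *pfn)
{
	bool unlocked = false;
	int r = fault_in(&unlocked);

	if (r)
		return r;
	if (unlocked)
		return -EAGAIN;
	*pfn = 0x1234 + (unsigned long)v->generation;	/* pretend translation */
	return 0;
}

int main(void)
{
	unsigned long pfn = 0;
	int r, tries = 0;
	struct vma v;

retry:					/* mirrors the retry: label in hva_to_pfn() */
	tries++;
	v = lookup_vma();		/* always re-look up after -EAGAIN */
	r = resolve_pfn(&v, &pfn);
	if (r == -EAGAIN)
		goto retry;
	printf("pfn=%#lx after %d lookup(s), vma generation %d\n",
	       pfn, tries, v.generation);
	return r ? 1 : 0;
}

The point of the pattern is that the stale lookup is redone rather than patched up: once the lock that protected it has been dropped, only a fresh lookup can be trusted.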