/*
 * bsc1203067_vmx
 *
 * Fix for CVE-2022-39189, bsc#1203067 (arch/x86/kvm/vmx/vmx.c part)
 *
 *  Copyright (c) 2022 SUSE
 *  Author: Nicolai Stange <nstange@suse.de>
 *
 *  Based on the original Linux kernel code. Other copyrights apply.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#if IS_ENABLED(CONFIG_X86_64)

#if !IS_MODULE(CONFIG_KVM_INTEL)
#error "Live patch supports only CONFIG_KVM_INTEL=m"
#endif

#include "bsc1203067_common.h"

/* klp-ccp: from arch/x86/kvm/vmx/vmx.c */
#include <linux/frame.h>
#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/kernel.h>
#include <linux/kvm_host.h>

/* klp-ccp: from include/linux/kvm_host.h */
/*
 * Pointer to the kvm_rebooting flag. It is filled in at runtime through the
 * klp_funcs relocation table, which looks the symbol up in the "kvm" module.
 * The fault fixup code in klpr_____kvm_handle_fault_on_reboot dereferences it.
 */
static bool (*klpe_kvm_rebooting);

/* klp-ccp: from arch/x86/include/asm/kvm_host.h */
/*
 * Pointer to kvm_spurious_fault(), also filled in through klp_funcs from the
 * "kvm" module. klpr___vmcs_readl() hands it to its asm statement as the
 * thunk target.
 */
static void (*klpe_kvm_spurious_fault)(void);

/*
 * Build the inline-asm text that runs @insn with a fault fixup attached.
 *
 * Label 666 holds @insn; when it completes normally, execution jumps to
 * label 668 and continues. The exception table entry at the end maps a fault
 * at 666 to the fixup code at label 700 (placed in the .fixup section). The
 * fixup runs @cleanup_insn and then tests the byte that klpe_kvm_rebooting
 * points to, preserving %rax around the test:
 *   - byte == 0: jump back to 667 and execute CALL_NOSPEC (the call target is
 *     supplied by the operands of the asm statement using this macro);
 *   - byte != 0: jump to 668, i.e. resume after @insn.
 *
 * Register names are written with "%%", so the result is meant to be used
 * in an asm statement that has operands.
 */
#define klpr_____kvm_handle_fault_on_reboot(insn, cleanup_insn)	\
	"666: \n\t"							\
	insn "\n\t"							\
	"jmp	668f \n\t"						\
	"667: \n\t"							\
	CALL_NOSPEC "\n\t"						\
	"668: \n\t"							\
	".pushsection .fixup, \"ax\" \n\t"				\
	"700: \n\t"							\
	cleanup_insn "\n\t"						\
	"pushq %%rax \n\t"						\
	"movq klpe_kvm_rebooting, %%rax \n\t"				\
	"cmpb	$0, (%%rax)\n\t"					\
	"popq %%rax \n\t"						\
	"je	667b \n\t"						\
	"jmp	668b \n\t"						\
	".popsection \n\t"						\
	_ASM_EXTABLE(666b, 700b)


/* klp-ccp: from arch/x86/kvm/vmx/vmx.c */
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <asm/apic.h>
#include <asm/asm.h>

#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/io.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>

#include <asm/virtext.h>
#include <asm/vmx.h>
/* klp-ccp: from arch/x86/kvm/vmx/capabilities.h */
#include <asm/vmx.h>

/* klp-ccp: from arch/x86/kvm/lapic.h */
#include <linux/kvm_host.h>

/* klp-ccp: from arch/x86/kvm/vmx/capabilities.h */
/*
 * Values of the VMX capability MSRs. This struct is embedded by value in
 * struct nested_vmx (member "msrs").
 *
 * NOTE(review): this is a local copy of the definition from
 * arch/x86/kvm/vmx/capabilities.h; presumably it must stay identical to the
 * one the patched kernel was built with so that member offsets in the
 * enclosing structs agree -- confirm before editing.
 */
struct nested_vmx_msrs {
	/*
	 * We only store the "true" versions of the VMX capability MSRs. We
	 * generate the "non-true" versions by setting the must-be-1 bits
	 * according to the SDM.
	 */
	u32 procbased_ctls_low;
	u32 procbased_ctls_high;
	u32 secondary_ctls_low;
	u32 secondary_ctls_high;
	u32 pinbased_ctls_low;
	u32 pinbased_ctls_high;
	u32 exit_ctls_low;
	u32 exit_ctls_high;
	u32 entry_ctls_low;
	u32 entry_ctls_high;
	u32 misc_low;
	u32 misc_high;
	u32 ept_caps;
	u32 vpid_caps;
	u64 basic;
	u64 cr0_fixed0;
	u64 cr0_fixed1;
	u64 cr4_fixed0;
	u64 cr4_fixed1;
	u64 vmcs_enum;
	u64 vmfunc_controls;
};

/* klp-ccp: from arch/x86/kvm/x86.h */
#include <linux/kvm_host.h>
/* klp-ccp: from arch/x86/kvm/kvm_cache_regs.h */
#include <linux/kvm_host.h>

/* klp-ccp: from arch/x86/kvm/x86.h */
/*
 * Pointer to the per-CPU current_vcpu variable. It is filled in at runtime
 * through the klp_funcs relocation table, which looks the symbol up in the
 * "kvm" module. Access the per-CPU slot with this_cpu_ptr().
 */
static struct kvm_vcpu * (__percpu *klpe_current_vcpu);

/*
 * Publish @vcpu in this CPU's current_vcpu slot (reached through the
 * klpe_current_vcpu pointer) before an interrupt handler is invoked on
 * behalf of the guest.
 */
static inline void klpr_kvm_before_interrupt(struct kvm_vcpu *vcpu)
{
	struct kvm_vcpu **slot = this_cpu_ptr(klpe_current_vcpu);

	WRITE_ONCE(*slot, vcpu);
}

/*
 * Clear this CPU's current_vcpu slot (reached through the
 * klpe_current_vcpu pointer) again. @vcpu is not used; the slot is
 * reset to NULL unconditionally.
 */
static inline void klpr_kvm_after_interrupt(struct kvm_vcpu *vcpu)
{
	struct kvm_vcpu **slot = this_cpu_ptr(klpe_current_vcpu);

	WRITE_ONCE(*slot, NULL);
}

/* klp-ccp: from arch/x86/kvm/cpuid.h */
#include <asm/cpu.h>
#include <asm/processor.h>
/* klp-ccp: from arch/x86/kvm/vmx/evmcs.h */
#include <linux/jump_label.h>
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
#include <asm/vmx.h>
/* klp-ccp: from arch/x86/kvm/vmx/vmcs.h */
#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/nospec.h>
#include <asm/kvm.h>
#include <asm/vmx.h>

/*
 * Host register state kept alongside a VMCS. This struct is embedded by
 * value in struct loaded_vmcs (member "host_state").
 *
 * NOTE(review): local copy of a kernel-internal definition; presumably the
 * layout must match the patched kernel's -- confirm before editing.
 */
struct vmcs_host_state {
	unsigned long cr3;	/* May not match real cr3 */
	unsigned long cr4;	/* May not match real cr4 */
	unsigned long gs_base;
	unsigned long fs_base;
	unsigned long rsp;

	u16           fs_sel, gs_sel, ldt_sel;
	/* Only the 64-bit variant is supported; any other build errors out. */
#ifdef CONFIG_X86_64
	u16           ds_sel, es_sel;
#else
#error "klp-ccp: a preceeding branch should have been taken"
#endif
};

/*
 * Five 32-bit control values (VM-entry, VM-exit, pin-based, execution and
 * secondary execution, going by the member names). This struct is embedded
 * by value in struct loaded_vmcs (member "controls_shadow").
 */
struct vmcs_controls_shadow {
	u32 vm_entry;
	u32 vm_exit;
	u32 pin;
	u32 exec;
	u32 secondary_exec;
};

/*
 * Bookkeeping for one VMCS. struct vcpu_vmx embeds one instance as "vmcs01"
 * and points at the active one through "loaded_vmcs"; struct nested_vmx
 * embeds another as "vmcs02".
 *
 * NOTE(review): local copy of a kernel-internal definition; presumably the
 * layout must match the patched kernel's -- confirm before editing.
 */
struct loaded_vmcs {
	struct vmcs *vmcs;
	struct vmcs *shadow_vmcs;
	int cpu;
	bool launched;
	bool nmi_known_unmasked;
	bool hv_timer_soft_disabled;
	/* Support for vnmi-less CPUs */
	int soft_vnmi_blocked;
	ktime_t entry_time;
	s64 vnmi_blocked_time;
	unsigned long *msr_bitmap;
	struct list_head loaded_vmcss_on_cpu_link;
	struct vmcs_host_state host_state;
	struct vmcs_controls_shadow controls_shadow;
};

/*
 * Return true when @intr_info describes a valid hardware exception whose
 * vector equals @vector. Only the interruption-type, vector and valid
 * fields take part in the comparison; all other bits are ignored.
 */
static inline bool is_exception_n(u32 intr_info, u8 vector)
{
	const u32 fields = INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			   INTR_INFO_VALID_MASK;
	const u32 wanted = INTR_TYPE_HARD_EXCEPTION | vector |
			   INTR_INFO_VALID_MASK;

	return (intr_info & fields) == wanted;
}

/*
 * Return true when @intr_info describes a valid hardware exception with
 * vector PF_VECTOR (see is_exception_n()).
 */
static inline bool is_page_fault(u32 intr_info)
{
	return is_exception_n(intr_info, PF_VECTOR);
}

/*
 * Return true when @intr_info describes a valid hardware exception with
 * vector MC_VECTOR.
 *
 * This uses is_exception_n() rather than repeating its mask-and-compare,
 * which keeps this helper in step with is_page_fault().
 */
static inline bool is_machine_check(u32 intr_info)
{
	return is_exception_n(intr_info, MC_VECTOR);
}

/*
 * Return true when @intr_info is valid and its interruption type is NMI.
 * The vector field is deliberately not part of the comparison.
 */
static inline bool is_nmi(u32 intr_info)
{
	const u32 fields = INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK;
	const u32 wanted = INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK;

	return (intr_info & fields) == wanted;
}

/* klp-ccp: from arch/x86/kvm/vmx/evmcs.h */
/*
 * Pointer to the enable_evmcs static key. It is filled in at runtime through
 * the klp_funcs relocation table, which looks the symbol up in the
 * "kvm_intel" module. klpr_vmcs_read32() tests it to pick the read path.
 */
static struct static_key_false (*klpe_enable_evmcs);

/* This live patch is only meant to be built with CONFIG_HYPERV enabled. */
#if IS_ENABLED(CONFIG_HYPERV)

/*
 * Pointer to evmcs_read32(), also filled in through klp_funcs from the
 * "kvm_intel" module.
 */
static u32 (*klpe_evmcs_read32)(unsigned long field);

#else /* !IS_ENABLED(CONFIG_HYPERV) */
#error "klp-ccp: non-taken branch"
#endif /* IS_ENABLED(CONFIG_HYPERV) */

/* klp-ccp: from arch/x86/kvm/irq.h */
#include <linux/mm_types.h>
#include <linux/hrtimer.h>
#include <linux/kvm_host.h>
#include <linux/spinlock.h>
#include <kvm/iodev.h>
/* klp-ccp: from arch/x86/kvm/ioapic.h */
#include <linux/kvm_host.h>
#include <kvm/iodev.h>
/* klp-ccp: from arch/x86/kvm/mmu.h */
#include <linux/kvm_host.h>
/* klp-ccp: from arch/x86/kvm/vmx/vmcs12.h */
#include <linux/build_bug.h>
/* klp-ccp: from arch/x86/kvm/vmx/vmx.h */
#include <linux/kvm_host.h>
#include <asm/kvm.h>

/* klp-ccp: from arch/x86/include/asm/intel_pt.h */
/* Together these two constants size the caps[] array in struct pt_desc. */
#define PT_CPUID_LEAVES		2
#define PT_CPUID_REGS_NUM	4 /* number of registers (eax, ebx, ecx, edx) */

/* klp-ccp: from arch/x86/kvm/vmx/ops.h */
#include <linux/nospec.h>
#include <asm/kvm_host.h>
#include <asm/vmx.h>

/*
 * Wrap instruction string @x with the fault fixup from
 * klpr_____kvm_handle_fault_on_reboot, using "xor reg, reg" as the cleanup
 * instruction, so that @reg is zeroed when @x faults.
 */
#define klpr___ex_clear(x, reg) \
	klpr_____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg)

/*
 * Compile-time sanity check for the 32-bit VMCS accessor.
 *
 * When @field is a compile-time constant, the build fails if bits 0x6000 of
 * the encoding are both clear (reported as a 16-bit field) or both set
 * (reported as a natural width field). For non-constant @field neither
 * check can trigger. The function has no runtime effect.
 */
static __always_inline void vmcs_check32(unsigned long field)
{
	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
			 "32-bit accessor invalid for 16-bit field");
	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
			 "32-bit accessor invalid for natural width field");
}

/*
 * Read VMCS field @field with the VMREAD instruction and return its value.
 *
 * The instruction is wrapped by klpr___ex_clear(), so a fault on VMREAD is
 * routed to the fixup code, which xors the output register (named through
 * the %k0 operand modifier) with itself and then either calls the thunk
 * target or resumes, depending on the flag behind klpe_kvm_rebooting.
 */
static __always_inline unsigned long klpr___vmcs_readl(unsigned long field)
{
	unsigned long value;

	/*
	 * NOTE(review): THUNK_TARGET() presumably supplies the operand that
	 * CALL_NOSPEC inside the macro calls through -- confirm against
	 * asm/nospec-branch.h.
	 */
	asm volatile (klpr___ex_clear("vmread %1, %0", "%k0")
		      : "=r"(value) : "r"(field),
			THUNK_TARGET(klpe_kvm_spurious_fault));
	return value;
}

/*
 * Read a 32-bit VMCS field.
 *
 * @field is first validated at compile time by vmcs_check32(). When the
 * enable_evmcs static key is enabled, the value comes from evmcs_read32()
 * (called through klpe_evmcs_read32); otherwise it comes from a plain VMREAD
 * via klpr___vmcs_readl(), truncated to 32 bits by the return type.
 */
static __always_inline u32 klpr_vmcs_read32(unsigned long field)
{
	u32 value;

	vmcs_check32(field);

	if (static_key_enabled(&klpe_enable_evmcs->key))
		value = klpe_evmcs_read32(field);
	else
		value = klpr___vmcs_readl(field);

	return value;
}

/* klp-ccp: from arch/x86/kvm/vmx/vmx.h */
/* Capacity of the val[] array in struct vmx_msrs. */
#define NR_AUTOLOAD_MSRS 8

/*
 * Fixed-capacity list of MSR entries. struct vcpu_vmx embeds two of them
 * ("guest" and "host") in its msr_autoload member.
 */
struct vmx_msrs {
	unsigned int		nr;	/* NOTE(review): presumably the number of used val[] slots */
	struct vmx_msr_entry	val[NR_AUTOLOAD_MSRS];
};

/*
 * Posted-interrupt descriptor, 64-byte aligned. struct vcpu_vmx embeds one
 * by value and struct nested_vmx holds a pointer to one.
 *
 * The union lets the bitfields that follow pir[] also be accessed as a
 * single 64-bit "control" word.
 */
struct pi_desc {
	u32 pir[8];     /* Posted interrupt requested */
	union {
		struct {
				/* bit 256 - Outstanding Notification */
			u16	on	: 1,
				/* bit 257 - Suppress Notification */
				sn	: 1,
				/* bit 271:258 - Reserved */
				rsvd_1	: 14;
				/* bit 279:272 - Notification Vector */
			u8	nv;
				/* bit 287:280 - Reserved */
			u8	rsvd_2;
				/* bit 319:288 - Notification Destination */
			u32	ndst;
		};
		u64 control;
	};
	u32 rsvd[6];
} __aligned(64);

/* Number of entries in the addr_a[] and addr_b[] arrays of struct pt_ctx. */
#define RTIT_ADDR_RANGE		4

/*
 * One set of saved values, all 64 bits wide. struct pt_desc keeps two
 * instances, "host" and "guest".
 */
struct pt_ctx {
	u64 ctl;
	u64 status;
	u64 output_base;
	u64 output_mask;
	u64 cr3_match;
	u64 addr_a[RTIT_ADDR_RANGE];
	u64 addr_b[RTIT_ADDR_RANGE];
};

/*
 * Descriptor embedded by value as the last member ("pt_desc") of
 * struct vcpu_vmx. It holds a host and a guest struct pt_ctx plus a caps[]
 * array sized for PT_CPUID_LEAVES leaves of PT_CPUID_REGS_NUM registers each.
 */
struct pt_desc {
	u64 ctl_bitmask;
	u32 addr_range;
	u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
	struct pt_ctx host;
	struct pt_ctx guest;
};

/*
 * Per-vCPU state for nested VMX. This struct is embedded by value in
 * struct vcpu_vmx (member "nested").
 *
 * NOTE(review): local copy of a kernel-internal definition. None of the
 * code in this file reads these members directly; the struct is presumably
 * needed only so that struct vcpu_vmx has the same layout as in the patched
 * kernel -- confirm before changing any member.
 */
struct nested_vmx {
	/* Has the level1 guest done vmxon? */
	bool vmxon;
	gpa_t vmxon_ptr;
	bool pml_full;

	/* The guest-physical address of the current VMCS L1 keeps for L2 */
	gpa_t current_vmptr;
	/*
	 * Cache of the guest's VMCS, existing outside of guest memory.
	 * Loaded from guest memory during VMPTRLD. Flushed to guest
	 * memory during VMCLEAR and VMPTRLD.
	 */
	struct vmcs12 *cached_vmcs12;
	/*
	 * Cache of the guest's shadow VMCS, existing outside of guest
	 * memory. Loaded from guest memory during VM entry. Flushed
	 * to guest memory during VM exit.
	 */
	struct vmcs12 *cached_shadow_vmcs12;

	/*
	 * Indicates if the shadow vmcs or enlightened vmcs must be updated
	 * with the data held by struct vmcs12.
	 */
	bool need_vmcs12_to_shadow_sync;
	bool dirty_vmcs12;

	/*
	 * Indicates lazily loaded guest state has not yet been decached from
	 * vmcs02.
	 */
	bool need_sync_vmcs02_to_vmcs12_rare;

	/*
	 * vmcs02 has been initialized, i.e. state that is constant for
	 * vmcs02 has been written to the backing VMCS.  Initialization
	 * is delayed until L1 actually attempts to run a nested VM.
	 */
	bool vmcs02_initialized;

	bool change_vmcs01_virtual_apic_mode;

	/*
	 * Enlightened VMCS has been enabled. It does not mean that L1 has to
	 * use it. However, VMX features available to L1 will be limited based
	 * on what the enlightened VMCS supports.
	 */
	bool enlightened_vmcs_enabled;

	/* L2 must run next, and mustn't decide to exit to L1. */
	bool nested_run_pending;

	/* Pending MTF VM-exit into L1.  */
	bool mtf_pending;

	struct loaded_vmcs vmcs02;

	/*
	 * Guest pages referred to in the vmcs02 with host-physical
	 * pointers, so we must keep them pinned while L2 runs.
	 */
	struct page *apic_access_page;
	struct kvm_host_map virtual_apic_map;
	struct kvm_host_map pi_desc_map;

	struct kvm_host_map msr_bitmap_map;

	struct pi_desc *pi_desc;
	bool pi_pending;
	u16 posted_intr_nv;

	struct hrtimer preemption_timer;
	bool preemption_timer_expired;

	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
	u64 vmcs01_debugctl;
	u64 vmcs01_guest_bndcfgs;

	u16 vpid02;
	u16 last_vpid;

	struct nested_vmx_msrs msrs;

	/* SMM related state */
	struct {
		/* in VMX operation on SMM entry? */
		bool vmxon;
		/* in guest mode on SMM entry? */
		bool guest_mode;
	} smm;

	gpa_t hv_evmcs_vmptr;
	struct kvm_host_map hv_evmcs_map;
	struct hv_enlightened_vmcs *hv_evmcs;
};

/*
 * VMX-specific vCPU container. The generic struct kvm_vcpu is the first
 * member; to_vmx() converts a struct kvm_vcpu pointer back to the enclosing
 * struct vcpu_vmx with container_of().
 *
 * Of all members, the code in this file only touches "vcpu",
 * "exit_intr_info" (written by klpr_handle_exception_nmi_irqoff()) and
 * "exit_reason" (read by klpp_vmx_handle_exit_irqoff()).
 *
 * NOTE(review): local copy of a kernel-internal definition; the remaining
 * members presumably exist only to keep the offsets of the used ones
 * identical to the patched kernel -- confirm before changing any member.
 */
struct vcpu_vmx {
	struct kvm_vcpu       vcpu;
	u8                    fail;
	u8		      msr_bitmap_mode;

	/*
	 * If true, host state has been stored in vmx->loaded_vmcs for
	 * the CPU registers that only need to be switched when transitioning
	 * to/from the kernel, and the registers have been loaded with guest
	 * values.  If false, host state is loaded in the CPU registers
	 * and vmx->loaded_vmcs->host_state is invalid.
	 */
	bool		      guest_state_loaded;

	u32                   exit_intr_info;
	u32                   idt_vectoring_info;
	ulong                 rflags;

	struct shared_msr_entry *guest_msrs;
	int                   nmsrs;
	int                   save_nmsrs;
	bool                  guest_msrs_ready;
	/* Only the 64-bit variant is supported; any other build errors out. */
#ifdef CONFIG_X86_64
	u64		      msr_host_kernel_gs_base;
	u64		      msr_guest_kernel_gs_base;
#else
#error "klp-ccp: a preceeding branch should have been taken"
#endif
	u64		      spec_ctrl;
	u32		      msr_ia32_umwait_control;

	u32 secondary_exec_control;

	/*
	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
	 * non-nested (L1) guest, it always points to vmcs01. For a nested
	 * guest (L2), it points to a different VMCS.
	 */
	struct loaded_vmcs    vmcs01;
	struct loaded_vmcs   *loaded_vmcs;

	struct msr_autoload {
		struct vmx_msrs guest;
		struct vmx_msrs host;
	} msr_autoload;

	struct {
		int vm86_active;
		ulong save_rflags;
		struct kvm_segment segs[8];
	} rmode;
	struct {
		u32 bitmask; /* 4 bits per segment (1 bit per field) */
		struct kvm_save_segment {
			u16 selector;
			unsigned long base;
			u32 limit;
			u32 ar;
		} seg[8];
	} segment_cache;
	int vpid;
	bool emulation_required;

	u32 exit_reason;

	/* Posted interrupt descriptor */
	struct pi_desc pi_desc;

	/* Support for a guest hypervisor (nested VMX) */
	struct nested_vmx nested;

	/* Dynamic PLE window. */
	int ple_window;
	bool ple_window_dirty;

	bool req_immediate_exit;

	/* Support for PML */
	struct page *pml_pg;

	/* apic deadline value in host tsc */
	u64 hv_deadline_tsc;

	u64 current_tsc_ratio;

	u32 host_pkru;

	unsigned long host_debugctlmsr;

	/*
	 * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
	 * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
	 * in msr_ia32_feature_control_valid_bits.
	 */
	u64 msr_ia32_feature_control;
	u64 msr_ia32_feature_control_valid_bits;
	u64 ept_pointer;

	struct pt_desc pt_desc;
};

/*
 * Map a generic vCPU pointer to the struct vcpu_vmx that contains it.
 * @vcpu must point at the "vcpu" member of a struct vcpu_vmx.
 */
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = container_of(vcpu, struct vcpu_vmx, vcpu);

	return vmx;
}

/* klp-ccp: from arch/x86/kvm/pmu.h */
#include <linux/nospec.h>

/* klp-ccp: from arch/x86/kvm/trace.h */
#include <linux/tracepoint.h>
#include <asm/vmx.h>
#include <asm/svm.h>
#include <asm/clocksource.h>
#include <asm/pvclock-abi.h>

/* klp-ccp: from arch/x86/kvm/vmx/vmx.c */
/*
 * Pointer to kvm_machine_check(). It is filled in at runtime through the
 * klp_funcs relocation table, which looks the symbol up in the "kvm_intel"
 * module.
 */
static void (*klpe_kvm_machine_check)(void);

static void klpr_handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
{
	vmx->exit_intr_info = klpr_vmcs_read32(VM_EXIT_INTR_INFO);

	/* if exit due to PF check for async PF */
	if (is_page_fault(vmx->exit_intr_info))
		vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();

	/* Handle machine checks before interrupts are enabled */
	if (is_machine_check(vmx->exit_intr_info))
		(*klpe_kvm_machine_check)();

	/* We need to handle NMIs before interrupts are enabled */
	if (is_nmi(vmx->exit_intr_info)) {
		klpr_kvm_before_interrupt(&vmx->vcpu);
		asm("int $2");
		klpr_kvm_after_interrupt(&vmx->vcpu);
	}
}

/*
 * Pointer to handle_external_interrupt_irqoff(). It is filled in at runtime
 * through the klp_funcs relocation table, which looks the symbol up in the
 * "kvm_intel" module.
 */
static void (*klpe_handle_external_interrupt_irqoff)(struct kvm_vcpu *vcpu);

/*
 * Live-patched replacement for vmx_handle_exit_irqoff().
 *
 * Dispatches on the VM exit reason: external interrupts go to
 * handle_external_interrupt_irqoff(), exceptions/NMIs to
 * klpr_handle_exception_nmi_irqoff(). All other exit reasons are ignored
 * here.
 */
void klpp_vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	switch (vmx->exit_reason) {
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		(*klpe_handle_external_interrupt_irqoff)(vcpu);
		/*
		 * Fix CVE-2022-39189: after an external-interrupt exit has
		 * been handled, set KLPP_HF_AT_INSN_BOUNDARY_MASK in the
		 * vCPU's hflags.
		 */
		vcpu->arch.hflags |= KLPP_HF_AT_INSN_BOUNDARY_MASK;
		break;
	case EXIT_REASON_EXCEPTION_NMI:
		klpr_handle_exception_nmi_irqoff(vmx);
		break;
	default:
		break;
	}
}



#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include "livepatch_bsc1203067.h"
#include "../kallsyms_relocs.h"

/* Name of the module whose loading triggers symbol resolution. */
#define LIVEPATCHED_MODULE "kvm_intel"

/*
 * Relocation table handed to __klp_resolve_kallsyms_relocs(). Each entry
 * gives a symbol name, the address of the klpe_* pointer that receives the
 * resolved address and, as third field, a module name.
 *
 * NOTE(review): the third field is presumably the module that owns the
 * symbol -- confirm against struct klp_kallsyms_reloc in kallsyms_relocs.h.
 */
static struct klp_kallsyms_reloc klp_funcs[] = {
	{ "current_vcpu", (void *)&klpe_current_vcpu, "kvm" },
	{ "kvm_rebooting", (void *)&klpe_kvm_rebooting, "kvm" },
	{ "kvm_spurious_fault", (void *)&klpe_kvm_spurious_fault, "kvm" },
	{ "enable_evmcs", (void *)&klpe_enable_evmcs, "kvm_intel" },
	{ "evmcs_read32", (void *)&klpe_evmcs_read32, "kvm_intel" },
	{ "handle_external_interrupt_irqoff",
	  (void *)&klpe_handle_external_interrupt_irqoff, "kvm_intel" },
	{ "kvm_machine_check", (void *)&klpe_kvm_machine_check, "kvm_intel" },
};

static int livepatch_bsc1203067_module_notify(struct notifier_block *nb,
					      unsigned long action, void *data)
{
	struct module *mod = data;
	int ret;

	if (action != MODULE_STATE_COMING || strcmp(mod->name, LIVEPATCHED_MODULE))
		return 0;

	mutex_lock(&module_mutex);
	ret = __klp_resolve_kallsyms_relocs(klp_funcs, ARRAY_SIZE(klp_funcs));
	mutex_unlock(&module_mutex);
	WARN(ret, "livepatch: delayed kallsyms lookup failed. System is broken and can crash.\n");

	return ret;
}

/*
 * Notifier block registered by bsc1203067_vmx_init() and removed again by
 * bsc1203067_vmx_cleanup().
 *
 * NOTE(review): the priority is one above the minimum, presumably so that
 * this callback runs after nearly all other module notifiers -- confirm
 * the intended ordering.
 */
static struct notifier_block livepatch_bsc1203067_module_nb = {
	.notifier_call = livepatch_bsc1203067_module_notify,
	.priority = INT_MIN+1,
};

/*
 * Set up the vmx.c part of the live patch.
 *
 * With module_mutex held: if LIVEPATCHED_MODULE is already loaded, resolve
 * the klp_funcs symbols right away; a failure there is returned without
 * registering anything. Otherwise, or after successful resolution, register
 * the module notifier so that a later load of the module is handled too.
 *
 * Returns 0 on success, or the error from symbol resolution or notifier
 * registration.
 */
int bsc1203067_vmx_init(void)
{
	int ret = 0;

	mutex_lock(&module_mutex);

	if (find_module(LIVEPATCHED_MODULE))
		ret = __klp_resolve_kallsyms_relocs(klp_funcs,
						    ARRAY_SIZE(klp_funcs));

	if (!ret)
		ret = register_module_notifier(&livepatch_bsc1203067_module_nb);

	mutex_unlock(&module_mutex);
	return ret;
}

/*
 * Tear down the vmx.c part of the live patch by unregistering the module
 * notifier that bsc1203067_vmx_init() registered.
 */
void bsc1203067_vmx_cleanup(void)
{
	unregister_module_notifier(&livepatch_bsc1203067_module_nb);
}

#endif /* IS_ENABLED(CONFIG_X86_64) */
