From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/stubs: Introduce place_ret() to abstract away raw 0xc3's

The Indirect Target Selection speculative vulnerability means that indirect
branches (including RETs) are unsafe when in the first half of a cacheline.
This means it's not safe for logic using the stubs to write raw 0xc3's.

Introduce place_ret() which, for now, writes a raw 0xc3 but will contain
additional logic when return thunks are in use.

stub_selftest() doesn't strictly need to be converted as they only run on
boot, but doing so gets us a partial test of place_ret() too.

No functional change.

This is part of XSA-469 / CVE-2024-28956

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -42,6 +42,12 @@
 
 #define is_canonical_address(x) (((int64_t)(x) >> 47) == ((int64_t)(x) >> 63))
 
+static inline void *place_ret(void *ptr)
+{
+    *(uint8_t *)ptr = 0xc3;
+    return ptr + 1;
+}
+
 struct cpuid_leaf
 {
     uint32_t a, b, c, d;
--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -7,9 +7,7 @@ subdir-$(CONFIG_XENOPROF) += oprofile
 subdir-$(CONFIG_PV) += pv
 subdir-y += x86_64
 
-alternative-y := alternative.init.o
-alternative-$(CONFIG_LIVEPATCH) :=
-obj-bin-y += $(alternative-y)
+obj-y += alternative.o
 obj-y += apic.o
 obj-y += bhb-thunk.o
 obj-y += bitops.o
@@ -35,7 +33,7 @@ obj-y += hypercall.o
 obj-y += i387.o
 obj-y += i8259.o
 obj-y += io_apic.o
-obj-$(CONFIG_LIVEPATCH) += alternative.o livepatch.o
+obj-$(CONFIG_LIVEPATCH) += livepatch.o
 obj-y += msi.o
 obj-$(CONFIG_INDIRECT_THUNK) += indirect-thunk.o
 obj-y += ioport_emulate.o
--- a/xen/arch/x86/alternative.c
+++ b/xen/arch/x86/alternative.c
@@ -115,6 +115,20 @@ void init_or_livepatch add_nops(void *in
 }
 
 /*
+ * Place a return at @ptr.  @ptr must be in the writable alias of a stub.
+ *
+ * Returns the next position to write into the stub.
+ */
+void *place_ret(void *ptr)
+{
+    uint8_t *p = ptr;
+
+    *p++ = 0xc3;
+
+    return p;
+}
+
+/*
  * text_poke - Update instructions on a live kernel or non-executed code.
  * @addr: address to modify
  * @opcode: source of the copy
--- a/xen/arch/x86/extable.c
+++ b/xen/arch/x86/extable.c
@@ -128,19 +128,19 @@ search_exception_table(const struct cpu_
 static int __init stub_selftest(void)
 {
     static const struct {
-        uint8_t opc[4];
+        uint8_t opc[3];
         uint64_t rax;
         union stub_exception_token res;
     } tests[] __initconst = {
-        { .opc = { 0x0f, 0xb9, 0xc3, 0xc3 }, /* ud1 */
+        { .opc = { 0x0f, 0xb9, 0x90 }, /* ud1 */
           .res.fields.trapnr = TRAP_invalid_op },
-        { .opc = { 0x90, 0x02, 0x00, 0xc3 }, /* nop; add (%rax),%al */
+        { .opc = { 0x90, 0x02, 0x00 }, /* nop; add (%rax),%al */
           .rax = 0x0123456789abcdef,
           .res.fields.trapnr = TRAP_gp_fault },
-        { .opc = { 0x02, 0x04, 0x04, 0xc3 }, /* add (%rsp,%rax),%al */
+        { .opc = { 0x02, 0x04, 0x04 }, /* add (%rsp,%rax),%al */
           .rax = 0xfedcba9876543210,
           .res.fields.trapnr = TRAP_stack_error },
-        { .opc = { 0xcc, 0xc3, 0xc3, 0xc3 }, /* int3 */
+        { .opc = { 0xcc, 0x90, 0x90 }, /* int3 */
           .res.fields.trapnr = TRAP_int3 },
     };
     unsigned long addr = this_cpu(stubs.addr) + STUB_BUF_SIZE / 2;
@@ -156,6 +156,7 @@ static int __init stub_selftest(void)
 
         memset(ptr, 0xcc, STUB_BUF_SIZE / 2);
         memcpy(ptr, tests[i].opc, ARRAY_SIZE(tests[i].opc));
+        place_ret(ptr + ARRAY_SIZE(tests[i].opc));
         unmap_domain_page(ptr);
 
         asm volatile ( "INDIRECT_CALL %[stb]\n"
--- a/xen/arch/x86/ioport_emulate.c
+++ b/xen/arch/x86/ioport_emulate.c
@@ -8,14 +8,14 @@
 #include <xen/sched.h>
 #include <xen/dmi.h>
 
-static void ioemul_handle_proliant_quirk(
+static unsigned int ioemul_handle_proliant_quirk(
     u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs)
 {
     uint16_t port = regs->dx;
     uint8_t value = regs->al;
 
     if ( (opcode != 0xee) || (port != 0xcd4) || !(value & 0x80) )
-        return;
+        return 0;
 
     /*    pushf */
     io_emul_stub[0] = 0x9c;
@@ -33,8 +33,8 @@ static void ioemul_handle_proliant_quirk
     io_emul_stub[7] = 0xfb;
     /*    popf */
     io_emul_stub[8] = 0x9d;
-    /*    ret */
-    io_emul_stub[9] = 0xc3;
+
+    return 9;
 }
 
 static int __init proliant_quirk(struct dmi_system_id *d)
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -104,7 +104,7 @@ idt_entry_t __section(".bss.page_aligned
 /* Pointer to the IDT of every CPU. */
 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
 
-void (*ioemul_handle_quirk)(
+unsigned int (*ioemul_handle_quirk)(
     u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
 
 static int debug_stack_lines = 20;
@@ -2169,6 +2169,7 @@ static io_emul_stub_t *io_emul_stub_setu
 {
     struct stubs *this_stubs = &this_cpu(stubs);
     unsigned long stub_va = this_stubs->addr + STUB_BUF_SIZE / 2;
+    unsigned quirk_stub_used;
 
     if ( !ctxt->io_emul_stub )
         ctxt->io_emul_stub =
@@ -2198,16 +2199,20 @@ static io_emul_stub_t *io_emul_stub_setu
     ctxt->io_emul_stub[16] = opcode;
     /* imm8 or nop */
     ctxt->io_emul_stub[17] = !(opcode & 8) ? port : 0x90;
-    /* ret (jumps to guest_to_host_gpr_switch) */
-    ctxt->io_emul_stub[18] = 0xc3;
+    quirk_stub_used = 18;
     BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 19);
 
     if ( ioemul_handle_quirk )
     {
         BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 15 + 10);
-        ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[15], ctxt->ctxt.regs);
+        quirk_stub_used = ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[15],
+                                              ctxt->ctxt.regs);
+        quirk_stub_used += quirk_stub_used ? 15 : 18;
     }
 
+    /* ret (jumps to guest_to_host_gpr_switch) */
+    place_ret(&ctxt->io_emul_stub[quirk_stub_used]);
+
     asm volatile ( "lfence" ::: "memory" ); /* SCSB */
 
     /* Handy function-typed pointer to the stub. */
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1158,19 +1158,23 @@ static inline bool fpu_check_write(void)
 
 #define emulate_fpu_insn_stub(bytes...)                                 \
 do {                                                                    \
+    void *_p = get_stub(stub);                                          \
     unsigned int nr_ = sizeof((uint8_t[]){ bytes });                    \
     fic.insn_bytes = nr_;                                               \
-    memcpy(get_stub(stub), ((uint8_t[]){ bytes, 0xc3 }), nr_ + 1);      \
+    memcpy(_p, ((uint8_t[]){ bytes }), nr_); _p += nr_;                 \
+    place_ret(_p);                                                      \
     invoke_stub("", "", "=m" (fic) : "m" (fic));                        \
     put_stub(stub);                                                     \
 } while (0)
 
 #define emulate_fpu_insn_stub_eflags(bytes...)                          \
 do {                                                                    \
+    void *_p = get_stub(stub);                                          \
     unsigned int nr_ = sizeof((uint8_t[]){ bytes });                    \
     unsigned long tmp_;                                                 \
     fic.insn_bytes = nr_;                                               \
-    memcpy(get_stub(stub), ((uint8_t[]){ bytes, 0xc3 }), nr_ + 1);      \
+    memcpy(_p, ((uint8_t[]){ bytes }), nr_); _p += nr_;                 \
+    place_ret(_p);                                                      \
     invoke_stub(_PRE_EFLAGS("[eflags]", "[mask]", "[tmp]"),             \
                 _POST_EFLAGS("[eflags]", "[mask]", "[tmp]"),            \
                 [eflags] "+g" (_regs.eflags), [tmp] "=&r" (tmp_),       \
@@ -5653,7 +5657,7 @@ x86_emulate(
         if ( !mode_64bit() )
             vex.w = 0;
         fic.insn_bytes = PFX_BYTES + 2;
-        opc[2] = 0xc3;
+        place_ret(&opc[2]);
 
         copy_REX_VEX(opc, rex_prefix, vex);
         ea.reg = decode_register(modrm_reg, &_regs, 0);
@@ -5701,7 +5705,7 @@ x86_emulate(
             opc[1] &= 0x38;
         }
         fic.insn_bytes = PFX_BYTES + 2;
-        opc[2] = 0xc3;
+        place_ret(&opc[2]);
 
         copy_REX_VEX(opc, rex_prefix, vex);
         invoke_stub(_PRE_EFLAGS("[eflags]", "[mask]", "[tmp]"),
@@ -5854,7 +5858,7 @@ x86_emulate(
         opc[1] = modrm & 0xc7;
         fic.insn_bytes = PFX_BYTES + 2;
     simd_0f_to_gpr:
-        opc[fic.insn_bytes - PFX_BYTES] = 0xc3;
+        place_ret(&opc[fic.insn_bytes - PFX_BYTES]);
 
         generate_exception_if(ea.type != OP_REG, EXC_UD);
 
@@ -6091,7 +6095,7 @@ x86_emulate(
             vex.w = 0;
         opc[1] = modrm & 0x38;
         fic.insn_bytes = PFX_BYTES + 2;
-        opc[2] = 0xc3;
+        place_ret(&opc[2]);
 
         copy_REX_VEX(opc, rex_prefix, vex);
         invoke_stub("", "", "+m" (src.val), "+m" (fic.exn_raised)
@@ -6235,7 +6239,7 @@ x86_emulate(
         opc[2] = imm1;
         fic.insn_bytes = PFX_BYTES + 3;
     simd_0f_reg_only:
-        opc[fic.insn_bytes - PFX_BYTES] = 0xc3;
+        place_ret(&opc[fic.insn_bytes - PFX_BYTES]);
 
         copy_REX_VEX(opc, rex_prefix, vex);
         invoke_stub("", "", [dummy_out] "=g" (dummy) : [dummy_in] "i" (0) );
@@ -7053,7 +7057,7 @@ x86_emulate(
             vex.w = 0;
         opc[1] = modrm & 0xc7;
         fic.insn_bytes = PFX_BYTES + 2;
-        opc[2] = 0xc3;
+        place_ret(&opc[2]);
 
         copy_REX_VEX(opc, rex_prefix, vex);
         invoke_stub("", "", "=a" (ea.val) : [dummy] "i" (0));
@@ -7196,7 +7200,7 @@ x86_emulate(
             opc[1] &= 0x38;
         }
         fic.insn_bytes = PFX_BYTES + 2;
-        opc[2] = 0xc3;
+        place_ret(&opc[2]);
         if ( vex.opcx == vex_none )
         {
             /* Cover for extra prefix byte. */
@@ -7352,7 +7356,7 @@ x86_emulate(
         pvex->reg = 0xf; /* rAX */
         buf[3] = b;
         buf[4] = 0x09; /* reg=rCX r/m=(%rCX) */
-        buf[5] = 0xc3;
+        place_ret(&buf[5]);
 
         src.reg = decode_vex_gpr(vex.reg, &_regs, ctxt);
         emulate_stub([dst] "=&c" (dst.val), "[dst]" (&src.val), "a" (*src.reg));
@@ -7386,7 +7390,7 @@ x86_emulate(
         pvex->reg = 0xf; /* rAX */
         buf[3] = b;
         buf[4] = (modrm & 0x38) | 0x01; /* r/m=(%rCX) */
-        buf[5] = 0xc3;
+        place_ret(&buf[5]);
 
         dst.reg = decode_vex_gpr(vex.reg, &_regs, ctxt);
         emulate_stub("=&a" (dst.val), "c" (&src.val));
@@ -7500,7 +7504,7 @@ x86_emulate(
         opc[1] = modrm & 0x38;
         opc[2] = imm1;
         fic.insn_bytes = PFX_BYTES + 3;
-        opc[3] = 0xc3;
+        place_ret(&opc[3]);
         if ( vex.opcx == vex_none )
         {
             /* Cover for extra prefix byte. */
@@ -7626,7 +7630,7 @@ x86_emulate(
         }
         opc[2] = imm1;
         fic.insn_bytes = PFX_BYTES + 3;
-        opc[3] = 0xc3;
+        place_ret(&opc[3]);
         if ( vex.opcx == vex_none )
         {
             /* Cover for extra prefix byte. */
@@ -7707,7 +7711,7 @@ x86_emulate(
         pxop->reg = 0xf; /* rAX */
         buf[3] = b;
         buf[4] = (modrm & 0x38) | 0x01; /* r/m=(%rCX) */
-        buf[5] = 0xc3;
+        place_ret(&buf[5]);
 
         dst.reg = decode_vex_gpr(vex.reg, &_regs, ctxt);
         emulate_stub([dst] "=&a" (dst.val), "c" (&src.val));
@@ -7747,7 +7751,7 @@ x86_emulate(
         buf[3] = b;
         buf[4] = 0x09; /* reg=rCX r/m=(%rCX) */
         *(uint32_t *)(buf + 5) = imm1;
-        buf[9] = 0xc3;
+        place_ret(&buf[9]);
 
         emulate_stub([dst] "=&c" (dst.val), "[dst]" (&src.val));
 
@@ -7769,7 +7773,7 @@ x86_emulate(
 
         if ( !opc )
             BUG();
-        opc[fic.insn_bytes - PFX_BYTES] = 0xc3;
+        place_ret(&opc[fic.insn_bytes - PFX_BYTES]);
         copy_REX_VEX(opc, rex_prefix, vex);
 
         if ( ea.type == OP_MEM )
--- a/xen/include/asm-x86/alternative.h
+++ b/xen/include/asm-x86/alternative.h
@@ -28,6 +28,8 @@ struct alt_instr {
 #define ALT_REPL_PTR(a)     __ALT_PTR(a, repl_offset)
 
 extern void add_nops(void *insns, unsigned int len);
+void *place_ret(void *ptr);
+
 /* Similar to alternative_instructions except it can be run with IRQs enabled. */
 extern void apply_alternatives(const struct alt_instr *start,
                                const struct alt_instr *end);
--- a/xen/include/asm-x86/io.h
+++ b/xen/include/asm-x86/io.h
@@ -51,7 +51,7 @@ __OUT(l,,int)
 extern void (*pv_post_outb_hook)(unsigned int port, u8 value);
 
 /* Function pointer used to handle platform specific I/O port emulation. */
-extern void (*ioemul_handle_quirk)(
+extern unsigned int (*ioemul_handle_quirk)(
     u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
 
 #endif
