From 8eb56eb959a50bf9afd0fd590ec394e9145970a4 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 3 Apr 2023 19:06:02 +0100
Subject: x86/boot: Merge CPUID policy initialisation logic into cpu-policy.c

Switch to the newer cpu_policy nomenclature.  Do some easy cleanup of
includes.

No practical change.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/cpu-policy.c
+++ b/xen/arch/x86/cpu-policy.c
@@ -4,14 +4,599 @@
 #include <xen/sched.h>
 
 #include <asm/cpu-policy.h>
+#include <asm/hvm/nestedhvm.h>
+#include <asm/hvm/svm/svm.h>
 #include <asm/msr-index.h>
+#include <asm/paging.h>
 #include <asm/setup.h>
+#include <asm/xstate.h>
 
 struct cpu_policy __read_mostly       raw_cpu_policy;
 struct cpu_policy __read_mostly      host_cpu_policy;
 struct cpu_policy __read_mostly    pv_max_cpu_policy;
 struct cpu_policy __read_mostly   hvm_max_cpu_policy;
 
+const uint32_t known_features[] = INIT_KNOWN_FEATURES;
+
+static const uint32_t pv_featuremask[] = INIT_PV_FEATURES;
+static const uint32_t hvm_shadow_featuremask[] = INIT_HVM_SHADOW_FEATURES;
+static const uint32_t hvm_hap_featuremask[] = INIT_HVM_HAP_FEATURES;
+static const uint32_t deep_features[] = INIT_DEEP_FEATURES;
+
+static const struct feature_name {
+    const char *name;
+    unsigned int bit;
+} feature_names[] __initconstrel = INIT_FEATURE_NAMES;
+
+/*
+ * Parse a list of cpuid feature names -> bool, calling the callback for any
+ * matches found.
+ *
+ * always_inline, because this is init code only and we really don't want a
+ * function pointer call in the middle of the loop.
+ */
+static int __init always_inline parse_cpuid(
+    const char *s, void (*callback)(unsigned int feat, bool val))
+{
+    const char *ss;
+    int val, rc = 0;
+
+    do {
+        const struct feature_name *lhs, *rhs, *mid = NULL /* GCC... */;
+        const char *feat;
+
+        ss = strchr(s, ',');
+        if ( !ss )
+            ss = strchr(s, '\0');
+
+        /* Skip the 'no-' prefix for name comparisons. */
+        feat = s;
+        if ( strncmp(s, "no-", 3) == 0 )
+            feat += 3;
+
+        /* (Re)initalise lhs and rhs for binary search. */
+        lhs = feature_names;
+        rhs = feature_names + ARRAY_SIZE(feature_names);
+
+        while ( lhs < rhs )
+        {
+            int res;
+
+            mid = lhs + (rhs - lhs) / 2;
+            res = cmdline_strcmp(feat, mid->name);
+
+            if ( res < 0 )
+            {
+                rhs = mid;
+                continue;
+            }
+            if ( res > 0 )
+            {
+                lhs = mid + 1;
+                continue;
+            }
+
+            if ( (val = parse_boolean(mid->name, s, ss)) >= 0 )
+            {
+                callback(mid->bit, val);
+                mid = NULL;
+            }
+
+            break;
+        }
+
+        /*
+         * Mid being NULL means that the name and boolean were successfully
+         * identified.  Everything else is an error.
+         */
+        if ( mid )
+            rc = -EINVAL;
+
+        s = ss + 1;
+    } while ( *ss );
+
+    return rc;
+}
+
+static void __init _parse_xen_cpuid(unsigned int feat, bool val)
+{
+    if ( !val )
+        setup_clear_cpu_cap(feat);
+}
+
+static int __init parse_xen_cpuid(const char *s)
+{
+    return parse_cpuid(s, _parse_xen_cpuid);
+}
+custom_param("cpuid", parse_xen_cpuid);
+
+static bool __initdata dom0_cpuid_cmdline;
+static uint32_t __initdata dom0_enable_feat[FSCAPINTS];
+static uint32_t __initdata dom0_disable_feat[FSCAPINTS];
+
+static void __init _parse_dom0_cpuid(unsigned int feat, bool val)
+{
+    __set_bit  (feat, val ? dom0_enable_feat  : dom0_disable_feat);
+    __clear_bit(feat, val ? dom0_disable_feat : dom0_enable_feat );
+}
+
+static int __init parse_dom0_cpuid(const char *s)
+{
+    dom0_cpuid_cmdline = true;
+
+    return parse_cpuid(s, _parse_dom0_cpuid);
+}
+custom_param("dom0-cpuid", parse_dom0_cpuid);
+
+#define EMPTY_LEAF ((struct cpuid_leaf){})
+static void zero_leaves(struct cpuid_leaf *l,
+                        unsigned int first, unsigned int last)
+{
+    memset(&l[first], 0, sizeof(*l) * (last - first + 1));
+}
+
+static void sanitise_featureset(uint32_t *fs)
+{
+    /* for_each_set_bit() uses unsigned longs.  Extend with zeroes. */
+    uint32_t disabled_features[
+        ROUNDUP(FSCAPINTS, sizeof(unsigned long)/sizeof(uint32_t))] = {};
+    unsigned int i;
+
+    for ( i = 0; i < FSCAPINTS; ++i )
+    {
+        /* Clamp to known mask. */
+        fs[i] &= known_features[i];
+
+        /*
+         * Identify which features with deep dependencies have been
+         * disabled.
+         */
+        disabled_features[i] = ~fs[i] & deep_features[i];
+    }
+
+    for_each_set_bit(i, (void *)disabled_features,
+                     sizeof(disabled_features) * 8)
+    {
+        const uint32_t *dfs = lookup_deep_deps(i);
+        unsigned int j;
+
+        ASSERT(dfs); /* deep_features[] should guarentee this. */
+
+        for ( j = 0; j < FSCAPINTS; ++j )
+        {
+            fs[j] &= ~dfs[j];
+            disabled_features[j] &= ~dfs[j];
+        }
+    }
+}
+
+static void recalculate_xstate(struct cpu_policy *p)
+{
+    uint64_t xstates = XSTATE_FP_SSE;
+    uint32_t xstate_size = XSTATE_AREA_MIN_SIZE;
+    unsigned int i, Da1 = p->xstate.Da1;
+
+    /*
+     * The Da1 leaf is the only piece of information preserved in the common
+     * case.  Everything else is derived from other feature state.
+     */
+    memset(&p->xstate, 0, sizeof(p->xstate));
+
+    if ( !p->basic.xsave )
+        return;
+
+    if ( p->basic.avx )
+    {
+        xstates |= XSTATE_YMM;
+        xstate_size = max(xstate_size,
+                          xstate_offsets[_XSTATE_YMM] +
+                          xstate_sizes[_XSTATE_YMM]);
+    }
+
+    if ( p->feat.mpx )
+    {
+        xstates |= XSTATE_BNDREGS | XSTATE_BNDCSR;
+        xstate_size = max(xstate_size,
+                          xstate_offsets[_XSTATE_BNDCSR] +
+                          xstate_sizes[_XSTATE_BNDCSR]);
+    }
+
+    if ( p->feat.avx512f )
+    {
+        xstates |= XSTATE_OPMASK | XSTATE_ZMM | XSTATE_HI_ZMM;
+        xstate_size = max(xstate_size,
+                          xstate_offsets[_XSTATE_HI_ZMM] +
+                          xstate_sizes[_XSTATE_HI_ZMM]);
+    }
+
+    if ( p->feat.pku )
+    {
+        xstates |= XSTATE_PKRU;
+        xstate_size = max(xstate_size,
+                          xstate_offsets[_XSTATE_PKRU] +
+                          xstate_sizes[_XSTATE_PKRU]);
+    }
+
+    if ( p->extd.lwp )
+    {
+        xstates |= XSTATE_LWP;
+        xstate_size = max(xstate_size,
+                          xstate_offsets[_XSTATE_LWP] +
+                          xstate_sizes[_XSTATE_LWP]);
+    }
+
+    p->xstate.max_size  =  xstate_size;
+    p->xstate.xcr0_low  =  xstates & ~XSTATE_XSAVES_ONLY;
+    p->xstate.xcr0_high = (xstates & ~XSTATE_XSAVES_ONLY) >> 32;
+
+    p->xstate.Da1 = Da1;
+    if ( p->xstate.xsaves )
+    {
+        p->xstate.xss_low   =  xstates & XSTATE_XSAVES_ONLY;
+        p->xstate.xss_high  = (xstates & XSTATE_XSAVES_ONLY) >> 32;
+    }
+    else
+        xstates &= ~XSTATE_XSAVES_ONLY;
+
+    for ( i = 2; i < min(63ul, ARRAY_SIZE(p->xstate.comp)); ++i )
+    {
+        uint64_t curr_xstate = 1ul << i;
+
+        if ( !(xstates & curr_xstate) )
+            continue;
+
+        p->xstate.comp[i].size   = xstate_sizes[i];
+        p->xstate.comp[i].offset = xstate_offsets[i];
+        p->xstate.comp[i].xss    = curr_xstate & XSTATE_XSAVES_ONLY;
+        p->xstate.comp[i].align  = curr_xstate & xstate_align;
+    }
+}
+
+/*
+ * Misc adjustments to the policy.  Mostly clobbering reserved fields and
+ * duplicating shared fields.  Intentionally hidden fields are annotated.
+ */
+static void recalculate_misc(struct cpu_policy *p)
+{
+    p->basic.raw_fms &= 0x0fff0fff; /* Clobber Processor Type on Intel. */
+    p->basic.apic_id = 0; /* Dynamic. */
+
+    p->basic.raw[0x5] = EMPTY_LEAF; /* MONITOR not exposed to guests. */
+    p->basic.raw[0x6] = EMPTY_LEAF; /* Therm/Power not exposed to guests. */
+
+    p->basic.raw[0x8] = EMPTY_LEAF;
+    p->basic.raw[0xb] = EMPTY_LEAF; /* TODO: Rework topology logic. */
+    p->basic.raw[0xc] = EMPTY_LEAF;
+
+    p->extd.e1d &= ~CPUID_COMMON_1D_FEATURES;
+
+    /* Most of Power/RAS hidden from guests. */
+    p->extd.raw[0x7].a = p->extd.raw[0x7].b = p->extd.raw[0x7].c = 0;
+
+    p->extd.raw[0x8].d = 0;
+
+    switch ( p->x86_vendor )
+    {
+    case X86_VENDOR_INTEL:
+        p->basic.l2_nr_queries = 1; /* Fixed to 1 query. */
+        p->basic.raw[0x3] = EMPTY_LEAF; /* PSN - always hidden. */
+        p->basic.raw[0x9] = EMPTY_LEAF; /* DCA - always hidden. */
+
+        p->extd.vendor_ebx = 0;
+        p->extd.vendor_ecx = 0;
+        p->extd.vendor_edx = 0;
+
+        p->extd.raw[0x1].a = p->extd.raw[0x1].b = 0;
+
+        p->extd.raw[0x5] = EMPTY_LEAF;
+        p->extd.raw[0x6].a = p->extd.raw[0x6].b = p->extd.raw[0x6].d = 0;
+
+        p->extd.raw[0x8].a &= 0x0000ffff;
+        p->extd.raw[0x8].c = 0;
+        break;
+
+    case X86_VENDOR_AMD:
+        zero_leaves(p->basic.raw, 0x2, 0x3);
+        memset(p->cache.raw, 0, sizeof(p->cache.raw));
+        zero_leaves(p->basic.raw, 0x9, 0xa);
+
+        p->extd.vendor_ebx = p->basic.vendor_ebx;
+        p->extd.vendor_ecx = p->basic.vendor_ecx;
+        p->extd.vendor_edx = p->basic.vendor_edx;
+
+        p->extd.raw_fms = p->basic.raw_fms;
+        p->extd.raw[0x1].b &= 0xff00ffff;
+        p->extd.e1d |= p->basic._1d & CPUID_COMMON_1D_FEATURES;
+
+        p->extd.raw[0x8].a &= 0x0000ffff; /* GuestMaxPhysAddr hidden. */
+        p->extd.raw[0x8].c &= 0x0003f0ff;
+
+        p->extd.raw[0x9] = EMPTY_LEAF;
+
+        zero_leaves(p->extd.raw, 0xb, 0x18);
+
+        /* 0x19 - TLB details.  Pass through. */
+        /* 0x1a - Perf hints.   Pass through. */
+
+        p->extd.raw[0x1b] = EMPTY_LEAF; /* IBS - not supported. */
+
+        p->extd.raw[0x1c].a = 0; /* LWP.a entirely dynamic. */
+
+        p->extd.raw[0x1d] = EMPTY_LEAF; /* TopoExt Cache */
+        p->extd.raw[0x1e] = EMPTY_LEAF; /* TopoExt APIC ID/Core/Node */
+        p->extd.raw[0x1f] = EMPTY_LEAF; /* SEV */
+        p->extd.raw[0x20] = EMPTY_LEAF; /* Platform QoS */
+        break;
+    }
+}
+
+static void __init calculate_raw_policy(void)
+{
+    struct cpu_policy *p = &raw_cpu_policy;
+    unsigned int i;
+
+    cpuid_leaf(0, &p->basic.raw[0]);
+    for ( i = 1; i < min(ARRAY_SIZE(p->basic.raw),
+                         p->basic.max_leaf + 1ul); ++i )
+    {
+        switch ( i )
+        {
+        case 0x4: case 0x7: case 0xd:
+            /* Multi-invocation leaves.  Deferred. */
+            continue;
+        }
+
+        cpuid_leaf(i, &p->basic.raw[i]);
+    }
+
+    if ( p->basic.max_leaf >= 4 )
+    {
+        for ( i = 0; i < ARRAY_SIZE(p->cache.raw); ++i )
+        {
+            union {
+                struct cpuid_leaf l;
+                struct cpuid_cache_leaf c;
+            } u;
+
+            cpuid_count_leaf(4, i, &u.l);
+
+            if ( u.c.type == 0 )
+                break;
+
+            p->cache.subleaf[i] = u.c;
+        }
+
+        /*
+         * The choice of CPUID_GUEST_NR_CACHE is arbitrary.  It is expected
+         * that it will eventually need increasing for future hardware.
+         */
+        if ( i == ARRAY_SIZE(p->cache.raw) )
+            printk(XENLOG_WARNING
+                   "CPUID: Insufficient Leaf 4 space for this hardware\n");
+    }
+
+    if ( p->basic.max_leaf >= 7 )
+    {
+        cpuid_count_leaf(7, 0, &p->feat.raw[0]);
+
+        for ( i = 1; i < min(ARRAY_SIZE(p->feat.raw),
+                             p->feat.max_subleaf + 1ul); ++i )
+            cpuid_count_leaf(7, i, &p->feat.raw[i]);
+    }
+
+    if ( p->basic.max_leaf >= XSTATE_CPUID )
+    {
+        uint64_t xstates;
+
+        cpuid_count_leaf(XSTATE_CPUID, 0, &p->xstate.raw[0]);
+        cpuid_count_leaf(XSTATE_CPUID, 1, &p->xstate.raw[1]);
+
+        xstates = ((uint64_t)(p->xstate.xcr0_high | p->xstate.xss_high) << 32) |
+            (p->xstate.xcr0_low | p->xstate.xss_low);
+
+        for ( i = 2; i < min(63ul, ARRAY_SIZE(p->xstate.raw)); ++i )
+        {
+            if ( xstates & (1ul << i) )
+                cpuid_count_leaf(XSTATE_CPUID, i, &p->xstate.raw[i]);
+        }
+    }
+
+    /* Extended leaves. */
+    cpuid_leaf(0x80000000, &p->extd.raw[0]);
+    for ( i = 1; i < min(ARRAY_SIZE(p->extd.raw),
+                         p->extd.max_leaf + 1 - 0x80000000ul); ++i )
+        cpuid_leaf(0x80000000 + i, &p->extd.raw[i]);
+
+    p->x86_vendor = boot_cpu_data.x86_vendor;
+
+    if ( cpu_has_arch_caps )
+        rdmsrl(MSR_ARCH_CAPABILITIES, p->arch_caps.raw);
+}
+
+static void __init calculate_host_policy(void)
+{
+    struct cpu_policy *p = &host_cpu_policy;
+    unsigned int max_extd_leaf;
+
+    *p = raw_cpu_policy;
+
+    p->basic.max_leaf =
+        min_t(uint32_t, p->basic.max_leaf,   ARRAY_SIZE(p->basic.raw) - 1);
+    p->feat.max_subleaf =
+        min_t(uint32_t, p->feat.max_subleaf, ARRAY_SIZE(p->feat.raw) - 1);
+
+    max_extd_leaf = p->extd.max_leaf;
+
+    /*
+     * For AMD/Hygon hardware before Zen3, we unilaterally modify LFENCE to be
+     * dispatch serialising for Spectre mitigations.  Extend max_extd_leaf
+     * beyond what hardware supports, to include the feature leaf containing
+     * this information.
+     */
+    if ( cpu_has_lfence_dispatch )
+        max_extd_leaf = max(max_extd_leaf, 0x80000021);
+
+    p->extd.max_leaf = 0x80000000 | min_t(uint32_t, max_extd_leaf & 0xffff,
+                                          ARRAY_SIZE(p->extd.raw) - 1);
+
+    x86_cpu_featureset_to_policy(boot_cpu_data.x86_capability, p);
+    recalculate_xstate(p);
+    recalculate_misc(p);
+
+    if ( p->extd.svm )
+    {
+        /* Clamp to implemented features which require hardware support. */
+        p->extd.raw[0xa].d &= ((1u << SVM_FEATURE_NPT) |
+                               (1u << SVM_FEATURE_LBRV) |
+                               (1u << SVM_FEATURE_NRIPS) |
+                               (1u << SVM_FEATURE_PAUSEFILTER) |
+                               (1u << SVM_FEATURE_DECODEASSISTS));
+        /* Enable features which are always emulated. */
+        p->extd.raw[0xa].d |= ((1u << SVM_FEATURE_VMCBCLEAN) |
+                               (1u << SVM_FEATURE_TSCRATEMSR));
+    }
+
+    /* Temporary, until we have known_features[] for feature bits in MSRs. */
+    p->arch_caps.raw &=
+        (ARCH_CAPABILITIES_RDCL_NO | ARCH_CAPABILITIES_IBRS_ALL | ARCH_CAPS_RSBA |
+         ARCH_CAPS_SKIP_L1DFL | ARCH_CAPS_SSB_NO | ARCH_CAPS_MDS_NO |
+         ARCH_CAPS_IF_PSCHANGE_MC_NO | ARCH_CAPS_TSX_CTRL | ARCH_CAPS_TAA_NO |
+         ARCH_CAPS_SBDR_SSDP_NO | ARCH_CAPS_FBSDP_NO | ARCH_CAPS_PSDP_NO |
+         ARCH_CAPS_FB_CLEAR | ARCH_CAPS_RRSBA | ARCH_CAPS_BHI_NO |
+         ARCH_CAPS_PBRSB_NO);
+}
+
+static void __init guest_common_feature_adjustments(uint32_t *fs)
+{
+    /* Unconditionally claim to be able to set the hypervisor bit. */
+    __set_bit(X86_FEATURE_HYPERVISOR, fs);
+
+    /*
+     * If IBRS is offered to the guest, unconditionally offer STIBP.  It is a
+     * nop on non-HT hardware, and has this behaviour to make heterogeneous
+     * setups easier to manage.
+     */
+    if ( test_bit(X86_FEATURE_IBRSB, fs) )
+        __set_bit(X86_FEATURE_STIBP, fs);
+    if ( test_bit(X86_FEATURE_IBRS, fs) )
+        __set_bit(X86_FEATURE_AMD_STIBP, fs);
+
+    /*
+     * On hardware which supports IBRS/IBPB, we can offer IBPB independently
+     * of IBRS by using the AMD feature bit.  An administrator may wish for
+     * performance reasons to offer IBPB without IBRS.
+     */
+    if ( boot_cpu_has(X86_FEATURE_IBRSB) )
+        __set_bit(X86_FEATURE_IBPB, fs);
+}
+
+static void __init calculate_pv_max_policy(void)
+{
+    struct cpu_policy *p = &pv_max_cpu_policy;
+    uint32_t fs[FSCAPINTS];
+    unsigned int i;
+
+    *p = host_cpu_policy;
+    x86_cpu_policy_to_featureset(p, fs);
+
+    for ( i = 0; i < ARRAY_SIZE(fs); ++i )
+        fs[i] &= pv_featuremask[i];
+
+    /*
+     * If Xen isn't virtualising MSR_SPEC_CTRL for PV guests (functional
+     * availability, or admin choice), hide the feature.
+     */
+    if ( !boot_cpu_has(X86_FEATURE_SC_MSR_PV) )
+    {
+        __clear_bit(X86_FEATURE_IBRSB, fs);
+        __clear_bit(X86_FEATURE_IBRS, fs);
+    }
+
+    guest_common_feature_adjustments(fs);
+
+    sanitise_featureset(fs);
+    x86_cpu_featureset_to_policy(fs, p);
+    recalculate_xstate(p);
+
+    p->extd.raw[0xa] = EMPTY_LEAF; /* No SVM for PV guests. */
+
+    p->arch_caps.raw = 0; /* Not supported yet. */
+}
+
+static void __init calculate_hvm_max_policy(void)
+{
+    struct cpu_policy *p = &hvm_max_cpu_policy;
+    uint32_t fs[FSCAPINTS];
+    unsigned int i;
+    const uint32_t *mask;
+
+    *p = host_cpu_policy;
+    x86_cpu_policy_to_featureset(p, fs);
+
+    mask = hvm_funcs.hap_supported ?
+        hvm_hap_featuremask : hvm_shadow_featuremask;
+
+    for ( i = 0; i < ARRAY_SIZE(fs); ++i )
+        fs[i] &= mask[i];
+
+    /*
+     * Xen can provide an APIC emulation to HVM guests even if the host's APIC
+     * isn't enabled.
+     */
+    __set_bit(X86_FEATURE_APIC, fs);
+
+    /*
+     * On AMD, PV guests are entirely unable to use SYSENTER as Xen runs in
+     * long mode (and init_amd() has cleared it out of host capabilities), but
+     * HVM guests are able if running in protected mode.
+     */
+    if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+         raw_cpu_policy.basic.sep )
+        __set_bit(X86_FEATURE_SEP, fs);
+
+    /*
+     * If Xen isn't virtualising MSR_SPEC_CTRL for HVM guests (functional
+     * availability, or admin choice), hide the feature.
+     */
+    if ( !boot_cpu_has(X86_FEATURE_SC_MSR_HVM) )
+    {
+        __clear_bit(X86_FEATURE_IBRSB, fs);
+        __clear_bit(X86_FEATURE_IBRS, fs);
+    }
+
+    /*
+     * With VT-x, some features are only supported by Xen if dedicated
+     * hardware support is also available.
+     */
+    if ( cpu_has_vmx )
+    {
+        if ( !cpu_has_vmx_mpx )
+            __clear_bit(X86_FEATURE_MPX, fs);
+
+        if ( !cpu_has_vmx_xsaves )
+            __clear_bit(X86_FEATURE_XSAVES, fs);
+    }
+
+    guest_common_feature_adjustments(fs);
+
+    sanitise_featureset(fs);
+    x86_cpu_featureset_to_policy(fs, p);
+    recalculate_xstate(p);
+
+    p->arch_caps.raw = 0; /* Not supported yet. */
+}
+
+void __init init_guest_cpu_policies(void)
+{
+    calculate_raw_policy();
+    calculate_host_policy();
+
+    calculate_pv_max_policy();
+
+    if ( hvm_enabled )
+        calculate_hvm_max_policy();
+}
+
 int init_domain_cpu_policy(struct domain *d)
 {
     struct cpu_policy *p = xmalloc(struct cpu_policy);
@@ -27,3 +612,224 @@ int init_domain_cpu_policy(struct domain
 
     return 0;
 }
+
+void recalculate_cpuid_policy(struct domain *d)
+{
+    struct cpu_policy *p = d->arch.cpuid;
+    const struct cpu_policy *max =
+        is_pv_domain(d) ? &pv_max_cpu_policy : &hvm_max_cpu_policy;
+    uint32_t fs[FSCAPINTS], max_fs[FSCAPINTS];
+    unsigned int i;
+
+    p->x86_vendor = get_cpu_vendor(p->basic.vendor_ebx, p->basic.vendor_ecx,
+                                   p->basic.vendor_edx, gcv_guest);
+
+    p->basic.max_leaf   = min(p->basic.max_leaf,   max->basic.max_leaf);
+    p->feat.max_subleaf = min(p->feat.max_subleaf, max->feat.max_subleaf);
+    p->extd.max_leaf    = 0x80000000 | min(p->extd.max_leaf & 0xffff,
+                                           (p->x86_vendor == X86_VENDOR_AMD
+                                            ? CPUID_GUEST_NR_EXTD_AMD
+                                            : CPUID_GUEST_NR_EXTD_INTEL) - 1);
+
+    x86_cpu_policy_to_featureset(p, fs);
+    x86_cpu_policy_to_featureset(max, max_fs);
+
+    if ( is_hvm_domain(d) )
+    {
+        /*
+         * HVM domains using Shadow paging have further restrictions on their
+         * available paging features.
+         */
+        if ( !hap_enabled(d) )
+        {
+            for ( i = 0; i < ARRAY_SIZE(max_fs); i++ )
+                max_fs[i] &= hvm_shadow_featuremask[i];
+        }
+
+        /* Hide nested-virt if it hasn't been explicitly configured. */
+        if ( !nestedhvm_enabled(d) )
+        {
+            __clear_bit(X86_FEATURE_VMX, max_fs);
+            __clear_bit(X86_FEATURE_SVM, max_fs);
+        }
+
+        /*
+         * MPX is disabled by default, but we can support incoming VMs which
+         * have seen it, if the hardware is capable.
+         */
+        if ( cpu_has_mpx && cpu_has_vmx_mpx )
+            __set_bit(X86_FEATURE_MPX, max_fs);
+    }
+
+    /*
+     * Allow the toolstack to set HTT, X2APIC and CMP_LEGACY.  These bits
+     * affect how to interpret topology information in other cpuid leaves.
+     */
+    __set_bit(X86_FEATURE_HTT, max_fs);
+    __set_bit(X86_FEATURE_X2APIC, max_fs);
+    __set_bit(X86_FEATURE_CMP_LEGACY, max_fs);
+
+    /*
+     * 32bit PV domains can't use any Long Mode features, and cannot use
+     * SYSCALL on non-AMD hardware.
+     */
+    if ( is_pv_32bit_domain(d) )
+    {
+        __clear_bit(X86_FEATURE_LM, max_fs);
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+            __clear_bit(X86_FEATURE_SYSCALL, max_fs);
+    }
+
+    /*
+     * ITSC is masked by default (so domains are safe to migrate), but a
+     * toolstack which has configured disable_migrate or vTSC for a domain may
+     * safely select it, and needs a way of doing so.
+     */
+    if ( cpu_has_itsc && (d->disable_migrate || d->arch.vtsc) )
+        __set_bit(X86_FEATURE_ITSC, max_fs);
+
+    /*
+     * On hardware with MSR_TSX_CTRL, the admin may have elected to disable
+     * TSX and hide the feature bits.  Migrating-in VMs may have been booted
+     * pre-mitigation when the TSX features were visbile.
+     *
+     * This situation is compatible (albeit with a perf hit to any TSX code in
+     * the guest), so allow the feature bits to remain set.
+     */
+    if ( cpu_has_tsx_ctrl )
+    {
+        __set_bit(X86_FEATURE_HLE, max_fs);
+        __set_bit(X86_FEATURE_RTM, max_fs);
+    }
+
+    /* Clamp the toolstacks choices to reality. */
+    for ( i = 0; i < ARRAY_SIZE(fs); i++ )
+        fs[i] &= max_fs[i];
+
+    if ( p->basic.max_leaf < XSTATE_CPUID )
+        __clear_bit(X86_FEATURE_XSAVE, fs);
+
+    sanitise_featureset(fs);
+
+    /* Fold host's FDP_EXCP_ONLY and NO_FPU_SEL into guest's view. */
+    fs[FEATURESET_7b0] &= ~(cpufeat_mask(X86_FEATURE_FDP_EXCP_ONLY) |
+                            cpufeat_mask(X86_FEATURE_NO_FPU_SEL));
+    fs[FEATURESET_7b0] |= (host_cpu_policy.feat._7b0 &
+                           (cpufeat_mask(X86_FEATURE_FDP_EXCP_ONLY) |
+                            cpufeat_mask(X86_FEATURE_NO_FPU_SEL)));
+
+    x86_cpu_featureset_to_policy(fs, p);
+
+    /* Pass host cacheline size through to guests. */
+    p->basic.clflush_size = max->basic.clflush_size;
+
+    p->extd.maxphysaddr = min(p->extd.maxphysaddr, max->extd.maxphysaddr);
+    p->extd.maxphysaddr = min_t(uint8_t, p->extd.maxphysaddr,
+                                paging_max_paddr_bits(d));
+    p->extd.maxphysaddr = max_t(uint8_t, p->extd.maxphysaddr,
+                                (p->basic.pae || p->basic.pse36) ? 36 : 32);
+
+    p->extd.maxlinaddr = p->extd.lm ? 48 : 32;
+
+    recalculate_xstate(p);
+    recalculate_misc(p);
+
+    for ( i = 0; i < ARRAY_SIZE(p->cache.raw); ++i )
+    {
+        if ( p->cache.subleaf[i].type >= 1 &&
+             p->cache.subleaf[i].type <= 3 )
+        {
+            /* Subleaf has a valid cache type. Zero reserved fields. */
+            p->cache.raw[i].a &= 0xffffc3ffu;
+            p->cache.raw[i].d &= 0x00000007u;
+        }
+        else
+        {
+            /* Subleaf is not valid.  Zero the rest of the union. */
+            zero_leaves(p->cache.raw, i, ARRAY_SIZE(p->cache.raw) - 1);
+            break;
+        }
+    }
+
+    if ( !p->extd.svm )
+        p->extd.raw[0xa] = EMPTY_LEAF;
+
+    if ( !p->extd.page1gb )
+        p->extd.raw[0x19] = EMPTY_LEAF;
+
+    if ( p->extd.lwp )
+        p->extd.raw[0x1c].d &= max->extd.raw[0x1c].d;
+    else
+        p->extd.raw[0x1c] = EMPTY_LEAF;
+}
+
+void __init init_dom0_cpuid_policy(struct domain *d)
+{
+    struct cpu_policy *p = d->arch.cpuid;
+
+    /* dom0 can't migrate.  Give it ITSC if available. */
+    if ( cpu_has_itsc )
+        p->extd.itsc = true;
+
+    /*
+     * Expose the "hardware speculation behaviour" bits of ARCH_CAPS to dom0,
+     * so dom0 can turn off workarounds as appropriate.  Temporary, until the
+     * domain policy logic gains a better understanding of MSRs.
+     */
+    if ( cpu_has_arch_caps )
+        p->feat.arch_caps = true;
+
+    /* Apply dom0-cpuid= command line settings, if provided. */
+    if ( dom0_cpuid_cmdline )
+    {
+        uint32_t fs[FSCAPINTS];
+        unsigned int i;
+
+        x86_cpu_policy_to_featureset(p, fs);
+
+        for ( i = 0; i < ARRAY_SIZE(fs); ++i )
+        {
+            fs[i] |=  dom0_enable_feat [i];
+            fs[i] &= ~dom0_disable_feat[i];
+        }
+
+        x86_cpu_featureset_to_policy(fs, p);
+    }
+
+    recalculate_cpuid_policy(d);
+
+    if ( p->feat.arch_caps )
+    {
+        uint64_t val;
+
+        rdmsrl(MSR_ARCH_CAPABILITIES, val);
+
+        p->arch_caps.raw = val &
+            (ARCH_CAPABILITIES_RDCL_NO | ARCH_CAPABILITIES_IBRS_ALL | ARCH_CAPS_RSBA |
+             ARCH_CAPS_SSB_NO | ARCH_CAPS_MDS_NO | ARCH_CAPS_IF_PSCHANGE_MC_NO |
+             ARCH_CAPS_TAA_NO | ARCH_CAPS_SBDR_SSDP_NO | ARCH_CAPS_FBSDP_NO |
+             ARCH_CAPS_PSDP_NO | ARCH_CAPS_FB_CLEAR | ARCH_CAPS_RRSBA |
+             ARCH_CAPS_BHI_NO | ARCH_CAPS_PBRSB_NO);
+    }
+}
+
+static void __init __maybe_unused build_assertions(void)
+{
+    BUILD_BUG_ON(ARRAY_SIZE(known_features) != FSCAPINTS);
+    BUILD_BUG_ON(ARRAY_SIZE(pv_featuremask) != FSCAPINTS);
+    BUILD_BUG_ON(ARRAY_SIZE(hvm_shadow_featuremask) != FSCAPINTS);
+    BUILD_BUG_ON(ARRAY_SIZE(hvm_hap_featuremask) != FSCAPINTS);
+    BUILD_BUG_ON(ARRAY_SIZE(deep_features) != FSCAPINTS);
+
+    /* Find some more clever allocation scheme if this trips. */
+    BUILD_BUG_ON(sizeof(struct cpu_policy) > PAGE_SIZE);
+
+    BUILD_BUG_ON(sizeof(raw_cpu_policy.basic) !=
+                 sizeof(raw_cpu_policy.basic.raw));
+    BUILD_BUG_ON(sizeof(raw_cpu_policy.feat) !=
+                 sizeof(raw_cpu_policy.feat.raw));
+    BUILD_BUG_ON(sizeof(raw_cpu_policy.xstate) !=
+                 sizeof(raw_cpu_policy.xstate.raw));
+    BUILD_BUG_ON(sizeof(raw_cpu_policy.extd) !=
+                 sizeof(raw_cpu_policy.extd.raw));
+}
--- a/xen/arch/x86/cpuid.c
+++ b/xen/arch/x86/cpuid.c
@@ -1,590 +1,14 @@
-#include <xen/init.h>
-#include <xen/lib.h>
 #include <xen/sched.h>
-#include <asm/cpu-policy.h>
-#include <asm/hvm/hvm.h>
-#include <asm/hvm/nestedhvm.h>
-#include <asm/hvm/svm/svm.h>
-#include <asm/hvm/vmx/vmcs.h>
-#include <asm/msr.h>
-#include <asm/paging.h>
-#include <asm/processor.h>
-#include <asm/xstate.h>
-
-const uint32_t known_features[] = INIT_KNOWN_FEATURES;
-
-static const uint32_t pv_featuremask[] = INIT_PV_FEATURES;
-static const uint32_t hvm_shadow_featuremask[] = INIT_HVM_SHADOW_FEATURES;
-static const uint32_t hvm_hap_featuremask[] = INIT_HVM_HAP_FEATURES;
-static const uint32_t deep_features[] = INIT_DEEP_FEATURES;
-
-static const struct feature_name {
-    const char *name;
-    unsigned int bit;
-} feature_names[] __initconstrel = INIT_FEATURE_NAMES;
-
-/*
- * Parse a list of cpuid feature names -> bool, calling the callback for any
- * matches found.
- *
- * always_inline, because this is init code only and we really don't want a
- * function pointer call in the middle of the loop.
- */
-static int __init always_inline parse_cpuid(
-    const char *s, void (*callback)(unsigned int feat, bool val))
-{
-    const char *ss;
-    int val, rc = 0;
-
-    do {
-        const struct feature_name *lhs, *rhs, *mid = NULL /* GCC... */;
-        const char *feat;
-
-        ss = strchr(s, ',');
-        if ( !ss )
-            ss = strchr(s, '\0');
-
-        /* Skip the 'no-' prefix for name comparisons. */
-        feat = s;
-        if ( strncmp(s, "no-", 3) == 0 )
-            feat += 3;
-
-        /* (Re)initalise lhs and rhs for binary search. */
-        lhs = feature_names;
-        rhs = feature_names + ARRAY_SIZE(feature_names);
-
-        while ( lhs < rhs )
-        {
-            int res;
-
-            mid = lhs + (rhs - lhs) / 2;
-            res = cmdline_strcmp(feat, mid->name);
-
-            if ( res < 0 )
-            {
-                rhs = mid;
-                continue;
-            }
-            if ( res > 0 )
-            {
-                lhs = mid + 1;
-                continue;
-            }
-
-            if ( (val = parse_boolean(mid->name, s, ss)) >= 0 )
-            {
-                callback(mid->bit, val);
-                mid = NULL;
-            }
-
-            break;
-        }
-
-        /*
-         * Mid being NULL means that the name and boolean were successfully
-         * identified.  Everything else is an error.
-         */
-        if ( mid )
-            rc = -EINVAL;
-
-        s = ss + 1;
-    } while ( *ss );
-
-    return rc;
-}
+#include <xen/types.h>
 
-static void __init _parse_xen_cpuid(unsigned int feat, bool val)
-{
-    if ( !val )
-        setup_clear_cpu_cap(feat);
-    else if ( feat == X86_FEATURE_RDRAND &&
-              (cpuid_ecx(1) & cpufeat_mask(X86_FEATURE_RDRAND)) )
-        setup_force_cpu_cap(X86_FEATURE_RDRAND);
-}
-
-static int __init parse_xen_cpuid(const char *s)
-{
-    return parse_cpuid(s, _parse_xen_cpuid);
-}
-custom_param("cpuid", parse_xen_cpuid);
-
-static bool __initdata dom0_cpuid_cmdline;
-static uint32_t __initdata dom0_enable_feat[FSCAPINTS];
-static uint32_t __initdata dom0_disable_feat[FSCAPINTS];
-
-static void __init _parse_dom0_cpuid(unsigned int feat, bool val)
-{
-    __set_bit  (feat, val ? dom0_enable_feat  : dom0_disable_feat);
-    __clear_bit(feat, val ? dom0_disable_feat : dom0_enable_feat );
-}
+#include <public/hvm/params.h>
 
-static int __init parse_dom0_cpuid(const char *s)
-{
-    dom0_cpuid_cmdline = true;
-
-    return parse_cpuid(s, _parse_dom0_cpuid);
-}
-custom_param("dom0-cpuid", parse_dom0_cpuid);
+#include <asm/cpu-policy.h>
+#include <asm/cpuid.h>
+#include <asm/hvm/viridian.h>
+#include <asm/xstate.h>
 
 #define EMPTY_LEAF ((struct cpuid_leaf){})
-static void zero_leaves(struct cpuid_leaf *l,
-                        unsigned int first, unsigned int last)
-{
-    memset(&l[first], 0, sizeof(*l) * (last - first + 1));
-}
-
-static void sanitise_featureset(uint32_t *fs)
-{
-    /* for_each_set_bit() uses unsigned longs.  Extend with zeroes. */
-    uint32_t disabled_features[
-        ROUNDUP(FSCAPINTS, sizeof(unsigned long)/sizeof(uint32_t))] = {};
-    unsigned int i;
-
-    for ( i = 0; i < FSCAPINTS; ++i )
-    {
-        /* Clamp to known mask. */
-        fs[i] &= known_features[i];
-
-        /*
-         * Identify which features with deep dependencies have been
-         * disabled.
-         */
-        disabled_features[i] = ~fs[i] & deep_features[i];
-    }
-
-    for_each_set_bit(i, (void *)disabled_features,
-                     sizeof(disabled_features) * 8)
-    {
-        const uint32_t *dfs = lookup_deep_deps(i);
-        unsigned int j;
-
-        ASSERT(dfs); /* deep_features[] should guarentee this. */
-
-        for ( j = 0; j < FSCAPINTS; ++j )
-        {
-            fs[j] &= ~dfs[j];
-            disabled_features[j] &= ~dfs[j];
-        }
-    }
-}
-
-static void recalculate_xstate(struct cpuid_policy *p)
-{
-    uint64_t xstates = XSTATE_FP_SSE;
-    uint32_t xstate_size = XSTATE_AREA_MIN_SIZE;
-    unsigned int i, Da1 = p->xstate.Da1;
-
-    /*
-     * The Da1 leaf is the only piece of information preserved in the common
-     * case.  Everything else is derived from other feature state.
-     */
-    memset(&p->xstate, 0, sizeof(p->xstate));
-
-    if ( !p->basic.xsave )
-        return;
-
-    if ( p->basic.avx )
-    {
-        xstates |= XSTATE_YMM;
-        xstate_size = max(xstate_size,
-                          xstate_offsets[_XSTATE_YMM] +
-                          xstate_sizes[_XSTATE_YMM]);
-    }
-
-    if ( p->feat.mpx )
-    {
-        xstates |= XSTATE_BNDREGS | XSTATE_BNDCSR;
-        xstate_size = max(xstate_size,
-                          xstate_offsets[_XSTATE_BNDCSR] +
-                          xstate_sizes[_XSTATE_BNDCSR]);
-    }
-
-    if ( p->feat.avx512f )
-    {
-        xstates |= XSTATE_OPMASK | XSTATE_ZMM | XSTATE_HI_ZMM;
-        xstate_size = max(xstate_size,
-                          xstate_offsets[_XSTATE_HI_ZMM] +
-                          xstate_sizes[_XSTATE_HI_ZMM]);
-    }
-
-    if ( p->feat.pku )
-    {
-        xstates |= XSTATE_PKRU;
-        xstate_size = max(xstate_size,
-                          xstate_offsets[_XSTATE_PKRU] +
-                          xstate_sizes[_XSTATE_PKRU]);
-    }
-
-    if ( p->extd.lwp )
-    {
-        xstates |= XSTATE_LWP;
-        xstate_size = max(xstate_size,
-                          xstate_offsets[_XSTATE_LWP] +
-                          xstate_sizes[_XSTATE_LWP]);
-    }
-
-    p->xstate.max_size  =  xstate_size;
-    p->xstate.xcr0_low  =  xstates & ~XSTATE_XSAVES_ONLY;
-    p->xstate.xcr0_high = (xstates & ~XSTATE_XSAVES_ONLY) >> 32;
-
-    p->xstate.Da1 = Da1;
-    if ( p->xstate.xsaves )
-    {
-        p->xstate.xss_low   =  xstates & XSTATE_XSAVES_ONLY;
-        p->xstate.xss_high  = (xstates & XSTATE_XSAVES_ONLY) >> 32;
-    }
-    else
-        xstates &= ~XSTATE_XSAVES_ONLY;
-
-    for ( i = 2; i < min(63ul, ARRAY_SIZE(p->xstate.comp)); ++i )
-    {
-        uint64_t curr_xstate = 1ul << i;
-
-        if ( !(xstates & curr_xstate) )
-            continue;
-
-        p->xstate.comp[i].size   = xstate_sizes[i];
-        p->xstate.comp[i].offset = xstate_offsets[i];
-        p->xstate.comp[i].xss    = curr_xstate & XSTATE_XSAVES_ONLY;
-        p->xstate.comp[i].align  = curr_xstate & xstate_align;
-    }
-}
-
-/*
- * Misc adjustments to the policy.  Mostly clobbering reserved fields and
- * duplicating shared fields.  Intentionally hidden fields are annotated.
- */
-static void recalculate_misc(struct cpuid_policy *p)
-{
-    p->basic.raw_fms &= 0x0fff0fff; /* Clobber Processor Type on Intel. */
-    p->basic.apic_id = 0; /* Dynamic. */
-
-    p->basic.raw[0x5] = EMPTY_LEAF; /* MONITOR not exposed to guests. */
-    p->basic.raw[0x6] = EMPTY_LEAF; /* Therm/Power not exposed to guests. */
-
-    p->basic.raw[0x8] = EMPTY_LEAF;
-    p->basic.raw[0xb] = EMPTY_LEAF; /* TODO: Rework topology logic. */
-    p->basic.raw[0xc] = EMPTY_LEAF;
-
-    p->extd.e1d &= ~CPUID_COMMON_1D_FEATURES;
-
-    /* Most of Power/RAS hidden from guests. */
-    p->extd.raw[0x7].a = p->extd.raw[0x7].b = p->extd.raw[0x7].c = 0;
-
-    p->extd.raw[0x8].d = 0;
-
-    switch ( p->x86_vendor )
-    {
-    case X86_VENDOR_INTEL:
-        p->basic.l2_nr_queries = 1; /* Fixed to 1 query. */
-        p->basic.raw[0x3] = EMPTY_LEAF; /* PSN - always hidden. */
-        p->basic.raw[0x9] = EMPTY_LEAF; /* DCA - always hidden. */
-
-        p->extd.vendor_ebx = 0;
-        p->extd.vendor_ecx = 0;
-        p->extd.vendor_edx = 0;
-
-        p->extd.raw[0x1].a = p->extd.raw[0x1].b = 0;
-
-        p->extd.raw[0x5] = EMPTY_LEAF;
-        p->extd.raw[0x6].a = p->extd.raw[0x6].b = p->extd.raw[0x6].d = 0;
-
-        p->extd.raw[0x8].a &= 0x0000ffff;
-        p->extd.raw[0x8].c = 0;
-        break;
-
-    case X86_VENDOR_AMD:
-        zero_leaves(p->basic.raw, 0x2, 0x3);
-        memset(p->cache.raw, 0, sizeof(p->cache.raw));
-        zero_leaves(p->basic.raw, 0x9, 0xa);
-
-        p->extd.vendor_ebx = p->basic.vendor_ebx;
-        p->extd.vendor_ecx = p->basic.vendor_ecx;
-        p->extd.vendor_edx = p->basic.vendor_edx;
-
-        p->extd.raw_fms = p->basic.raw_fms;
-        p->extd.raw[0x1].b &= 0xff00ffff;
-        p->extd.e1d |= p->basic._1d & CPUID_COMMON_1D_FEATURES;
-
-        p->extd.raw[0x8].a &= 0x0000ffff; /* GuestMaxPhysAddr hidden. */
-        p->extd.raw[0x8].c &= 0x0003f0ff;
-
-        p->extd.raw[0x9] = EMPTY_LEAF;
-
-        zero_leaves(p->extd.raw, 0xb, 0x18);
-
-        /* 0x19 - TLB details.  Pass through. */
-        /* 0x1a - Perf hints.   Pass through. */
-
-        p->extd.raw[0x1b] = EMPTY_LEAF; /* IBS - not supported. */
-
-        p->extd.raw[0x1c].a = 0; /* LWP.a entirely dynamic. */
-        p->extd.raw[0x1d] = EMPTY_LEAF; /* TopoExt Cache */
-        p->extd.raw[0x1e] = EMPTY_LEAF; /* TopoExt APIC ID/Core/Node */
-        p->extd.raw[0x1f] = EMPTY_LEAF; /* SEV */
-        p->extd.raw[0x20] = EMPTY_LEAF; /* Platform QoS */
-        break;
-    }
-}
-
-static void __init calculate_raw_policy(void)
-{
-    struct cpuid_policy *p = &raw_cpu_policy;
-    unsigned int i;
-
-    cpuid_leaf(0, &p->basic.raw[0]);
-    for ( i = 1; i < min(ARRAY_SIZE(p->basic.raw),
-                         p->basic.max_leaf + 1ul); ++i )
-    {
-        switch ( i )
-        {
-        case 0x4: case 0x7: case 0xd:
-            /* Multi-invocation leaves.  Deferred. */
-            continue;
-        }
-
-        cpuid_leaf(i, &p->basic.raw[i]);
-    }
-
-    if ( p->basic.max_leaf >= 4 )
-    {
-        for ( i = 0; i < ARRAY_SIZE(p->cache.raw); ++i )
-        {
-            union {
-                struct cpuid_leaf l;
-                struct cpuid_cache_leaf c;
-            } u;
-
-            cpuid_count_leaf(4, i, &u.l);
-
-            if ( u.c.type == 0 )
-                break;
-
-            p->cache.subleaf[i] = u.c;
-        }
-
-        /*
-         * The choice of CPUID_GUEST_NR_CACHE is arbitrary.  It is expected
-         * that it will eventually need increasing for future hardware.
-         */
-        if ( i == ARRAY_SIZE(p->cache.raw) )
-            printk(XENLOG_WARNING
-                   "CPUID: Insufficient Leaf 4 space for this hardware\n");
-    }
-
-    if ( p->basic.max_leaf >= 7 )
-    {
-        cpuid_count_leaf(7, 0, &p->feat.raw[0]);
-
-        for ( i = 1; i < min(ARRAY_SIZE(p->feat.raw),
-                             p->feat.max_subleaf + 1ul); ++i )
-            cpuid_count_leaf(7, i, &p->feat.raw[i]);
-    }
-
-    if ( p->basic.max_leaf >= XSTATE_CPUID )
-    {
-        uint64_t xstates;
-
-        cpuid_count_leaf(XSTATE_CPUID, 0, &p->xstate.raw[0]);
-        cpuid_count_leaf(XSTATE_CPUID, 1, &p->xstate.raw[1]);
-
-        xstates = ((uint64_t)(p->xstate.xcr0_high | p->xstate.xss_high) << 32) |
-            (p->xstate.xcr0_low | p->xstate.xss_low);
-
-        for ( i = 2; i < min(63ul, ARRAY_SIZE(p->xstate.raw)); ++i )
-        {
-            if ( xstates & (1ul << i) )
-                cpuid_count_leaf(XSTATE_CPUID, i, &p->xstate.raw[i]);
-        }
-    }
-
-    /* Extended leaves. */
-    cpuid_leaf(0x80000000, &p->extd.raw[0]);
-    for ( i = 1; i < min(ARRAY_SIZE(p->extd.raw),
-                         p->extd.max_leaf + 1 - 0x80000000ul); ++i )
-        cpuid_leaf(0x80000000 + i, &p->extd.raw[i]);
-
-    p->x86_vendor = boot_cpu_data.x86_vendor;
-
-    if ( cpu_has_arch_caps )
-        rdmsrl(MSR_ARCH_CAPABILITIES, p->arch_caps.raw);
-}
-
-static void __init calculate_host_policy(void)
-{
-    struct cpuid_policy *p = &host_cpu_policy;
-    unsigned int max_extd_leaf;
-
-    *p = raw_cpu_policy;
-
-    p->basic.max_leaf =
-        min_t(uint32_t, p->basic.max_leaf,   ARRAY_SIZE(p->basic.raw) - 1);
-    p->feat.max_subleaf =
-        min_t(uint32_t, p->feat.max_subleaf, ARRAY_SIZE(p->feat.raw) - 1);
-
-    max_extd_leaf = p->extd.max_leaf;
-
-    /*
-     * For AMD/Hygon hardware before Zen3, we unilaterally modify LFENCE to be
-     * dispatch serialising for Spectre mitigations.  Extend max_extd_leaf
-     * beyond what hardware supports, to include the feature leaf containing
-     * this information.
-     */
-    if ( cpu_has_lfence_dispatch )
-        max_extd_leaf = max(max_extd_leaf, 0x80000021);
-
-    p->extd.max_leaf = 0x80000000 | min_t(uint32_t, max_extd_leaf & 0xffff,
-                                          ARRAY_SIZE(p->extd.raw) - 1);
-
-    x86_cpu_featureset_to_policy(boot_cpu_data.x86_capability, p);
-    recalculate_xstate(p);
-    recalculate_misc(p);
-
-    if ( p->extd.svm )
-    {
-        /* Clamp to implemented features which require hardware support. */
-        p->extd.raw[0xa].d &= ((1u << SVM_FEATURE_NPT) |
-                               (1u << SVM_FEATURE_LBRV) |
-                               (1u << SVM_FEATURE_NRIPS) |
-                               (1u << SVM_FEATURE_PAUSEFILTER) |
-                               (1u << SVM_FEATURE_DECODEASSISTS));
-        /* Enable features which are always emulated. */
-        p->extd.raw[0xa].d |= ((1u << SVM_FEATURE_VMCBCLEAN) |
-                               (1u << SVM_FEATURE_TSCRATEMSR));
-    }
-
-    /* Temporary, until we have known_features[] for feature bits in MSRs. */
-    p->arch_caps.raw &=
-        (ARCH_CAPABILITIES_RDCL_NO | ARCH_CAPABILITIES_IBRS_ALL | ARCH_CAPS_RSBA |
-         ARCH_CAPS_SKIP_L1DFL | ARCH_CAPS_SSB_NO | ARCH_CAPS_MDS_NO |
-         ARCH_CAPS_IF_PSCHANGE_MC_NO | ARCH_CAPS_TSX_CTRL | ARCH_CAPS_TAA_NO |
-         ARCH_CAPS_SBDR_SSDP_NO | ARCH_CAPS_FBSDP_NO | ARCH_CAPS_PSDP_NO |
-         ARCH_CAPS_FB_CLEAR | ARCH_CAPS_RRSBA | ARCH_CAPS_BHI_NO |
-         ARCH_CAPS_PBRSB_NO);
-}
-
-static void __init guest_common_feature_adjustments(uint32_t *fs)
-{
-    /* Unconditionally claim to be able to set the hypervisor bit. */
-    __set_bit(X86_FEATURE_HYPERVISOR, fs);
-
-    /*
-     * If IBRS is offered to the guest, unconditionally offer STIBP.  It is a
-     * nop on non-HT hardware, and has this behaviour to make heterogeneous
-     * setups easier to manage.
-     */
-    if ( test_bit(X86_FEATURE_IBRSB, fs) )
-        __set_bit(X86_FEATURE_STIBP, fs);
-
-    /*
-     * On hardware which supports IBRS/IBPB, we can offer IBPB independently
-     * of IBRS by using the AMD feature bit.  An administrator may wish for
-     * performance reasons to offer IBPB without IBRS.
-     */
-    if ( boot_cpu_has(X86_FEATURE_IBRSB) )
-        __set_bit(X86_FEATURE_IBPB, fs);
-}
-
-static void __init calculate_pv_max_policy(void)
-{
-    struct cpuid_policy *p = &pv_max_cpu_policy;
-    uint32_t pv_featureset[FSCAPINTS];
-    unsigned int i;
-
-    *p = host_cpu_policy;
-    x86_cpu_policy_to_featureset(p, pv_featureset);
-
-    for ( i = 0; i < ARRAY_SIZE(pv_featureset); ++i )
-        pv_featureset[i] &= pv_featuremask[i];
-
-    /*
-     * If Xen isn't virtualising MSR_SPEC_CTRL for PV guests because of
-     * administrator choice, hide the feature.
-     */
-    if ( !boot_cpu_has(X86_FEATURE_SC_MSR_PV) )
-        __clear_bit(X86_FEATURE_IBRSB, pv_featureset);
-
-    guest_common_feature_adjustments(pv_featureset);
-
-    sanitise_featureset(pv_featureset);
-    x86_cpu_featureset_to_policy(pv_featureset, p);
-    recalculate_xstate(p);
-
-    p->extd.raw[0xa] = EMPTY_LEAF; /* No SVM for PV guests. */
-
-    p->arch_caps.raw = 0; /* Not supported yet. */
-}
-
-static void __init calculate_hvm_max_policy(void)
-{
-    struct cpuid_policy *p = &hvm_max_cpu_policy;
-    uint32_t hvm_featureset[FSCAPINTS];
-    unsigned int i;
-    const uint32_t *hvm_featuremask;
-
-    if ( !hvm_enabled )
-        return;
-
-    *p = host_cpu_policy;
-    x86_cpu_policy_to_featureset(p, hvm_featureset);
-
-    hvm_featuremask = hvm_funcs.hap_supported ?
-        hvm_hap_featuremask : hvm_shadow_featuremask;
-
-    for ( i = 0; i < ARRAY_SIZE(hvm_featureset); ++i )
-        hvm_featureset[i] &= hvm_featuremask[i];
-
-    /*
-     * Xen can provide an APIC emulation to HVM guests even if the host's APIC
-     * isn't enabled.
-     */
-    __set_bit(X86_FEATURE_APIC, hvm_featureset);
-
-    /*
-     * On AMD, PV guests are entirely unable to use SYSENTER as Xen runs in
-     * long mode (and init_amd() has cleared it out of host capabilities), but
-     * HVM guests are able if running in protected mode.
-     */
-    if ( (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
-         raw_cpu_policy.basic.sep )
-        __set_bit(X86_FEATURE_SEP, hvm_featureset);
-
-    /*
-     * If Xen isn't virtualising MSR_SPEC_CTRL for HVM guests because of
-     * administrator choice, hide the feature.
-     */
-    if ( !boot_cpu_has(X86_FEATURE_SC_MSR_HVM) )
-        __clear_bit(X86_FEATURE_IBRSB, hvm_featureset);
-
-    /*
-     * With VT-x, some features are only supported by Xen if dedicated
-     * hardware support is also available.
-     */
-    if ( cpu_has_vmx )
-    {
-        if ( !cpu_has_vmx_mpx )
-            __clear_bit(X86_FEATURE_MPX, hvm_featureset);
-
-        if ( !cpu_has_vmx_xsaves )
-            __clear_bit(X86_FEATURE_XSAVES, hvm_featureset);
-    }
-
-    guest_common_feature_adjustments(hvm_featureset);
-
-    sanitise_featureset(hvm_featureset);
-    x86_cpu_featureset_to_policy(hvm_featureset, p);
-    recalculate_xstate(p);
-
-    p->arch_caps.raw = 0; /* Not supported yet. */
-}
-
-void __init init_guest_cpuid(void)
-{
-    calculate_raw_policy();
-    calculate_host_policy();
-    calculate_pv_max_policy();
-    calculate_hvm_max_policy();
-}
 
 /*
  * Caller to confirm that MSR_SPEC_CTRL is available.  Intel and AMD have
@@ -608,6 +32,7 @@ uint64_t msr_spec_ctrl_valid_bits(const
 
 const uint32_t *lookup_deep_deps(uint32_t feature)
 {
+    static const uint32_t deep_features[] = INIT_DEEP_FEATURES;
     static const struct {
         uint32_t feature;
         uint32_t fs[FSCAPINTS];
@@ -636,198 +61,6 @@ const uint32_t *lookup_deep_deps(uint32_
     return NULL;
 }
 
-void recalculate_cpuid_policy(struct domain *d)
-{
-    struct cpuid_policy *p = d->arch.cpuid;
-    const struct cpuid_policy *max =
-        is_pv_domain(d) ? &pv_max_cpu_policy : &hvm_max_cpu_policy;
-    uint32_t fs[FSCAPINTS], max_fs[FSCAPINTS];
-    unsigned int i;
-
-    p->x86_vendor = get_cpu_vendor(p->basic.vendor_ebx, p->basic.vendor_ecx,
-                                   p->basic.vendor_edx, gcv_guest);
-
-    p->basic.max_leaf   = min(p->basic.max_leaf,   max->basic.max_leaf);
-    p->feat.max_subleaf = min(p->feat.max_subleaf, max->feat.max_subleaf);
-    p->extd.max_leaf    = 0x80000000 | min(p->extd.max_leaf & 0xffff,
-                                           (p->x86_vendor == X86_VENDOR_AMD
-                                            ? CPUID_GUEST_NR_EXTD_AMD
-                                            : CPUID_GUEST_NR_EXTD_INTEL) - 1);
-
-    x86_cpu_policy_to_featureset(p, fs);
-    x86_cpu_policy_to_featureset(max, max_fs);
-
-    if ( is_hvm_domain(d) )
-    {
-        /*
-         * HVM domains using Shadow paging have further restrictions on their
-         * available paging features.
-         */
-        if ( !hap_enabled(d) )
-        {
-            for ( i = 0; i < ARRAY_SIZE(max_fs); i++ )
-                max_fs[i] &= hvm_shadow_featuremask[i];
-        }
-
-        /* Hide nested-virt if it hasn't been explicitly configured. */
-        if ( !nestedhvm_enabled(d) )
-        {
-            __clear_bit(X86_FEATURE_VMX, max_fs);
-            __clear_bit(X86_FEATURE_SVM, max_fs);
-        }
-    }
-
-    /*
-     * Allow the toolstack to set HTT, X2APIC and CMP_LEGACY.  These bits
-     * affect how to interpret topology information in other cpuid leaves.
-     */
-    __set_bit(X86_FEATURE_HTT, max_fs);
-    __set_bit(X86_FEATURE_X2APIC, max_fs);
-    __set_bit(X86_FEATURE_CMP_LEGACY, max_fs);
-
-    /*
-     * 32bit PV domains can't use any Long Mode features, and cannot use
-     * SYSCALL on non-AMD hardware.
-     */
-    if ( is_pv_32bit_domain(d) )
-    {
-        __clear_bit(X86_FEATURE_LM, max_fs);
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
-            __clear_bit(X86_FEATURE_SYSCALL, max_fs);
-    }
-
-    /*
-     * ITSC is masked by default (so domains are safe to migrate), but a
-     * toolstack which has configured disable_migrate or vTSC for a domain may
-     * safely select it, and needs a way of doing so.
-     */
-    if ( cpu_has_itsc && (d->disable_migrate || d->arch.vtsc) )
-        __set_bit(X86_FEATURE_ITSC, max_fs);
-
-    /*
-     * On hardware with MSR_TSX_CTRL, the admin may have elected to disable
-     * TSX and hide the feature bits.  Migrating-in VMs may have been booted
-     * pre-mitigation when the TSX features were visbile.
-     *
-     * This situation is compatible (albeit with a perf hit to any TSX code in
-     * the guest), so allow the feature bits to remain set.
-     */
-    if ( cpu_has_tsx_ctrl )
-    {
-        __set_bit(X86_FEATURE_HLE, max_fs);
-        __set_bit(X86_FEATURE_RTM, max_fs);
-    }
-
-    /* Clamp the toolstacks choices to reality. */
-    for ( i = 0; i < ARRAY_SIZE(fs); i++ )
-        fs[i] &= max_fs[i];
-
-    if ( p->basic.max_leaf < XSTATE_CPUID )
-        __clear_bit(X86_FEATURE_XSAVE, fs);
-
-    sanitise_featureset(fs);
-
-    /* Fold host's FDP_EXCP_ONLY and NO_FPU_SEL into guest's view. */
-    fs[FEATURESET_7b0] &= ~(cpufeat_mask(X86_FEATURE_FDP_EXCP_ONLY) |
-                            cpufeat_mask(X86_FEATURE_NO_FPU_SEL));
-    fs[FEATURESET_7b0] |= (host_cpu_policy.feat._7b0 &
-                           (cpufeat_mask(X86_FEATURE_FDP_EXCP_ONLY) |
-                            cpufeat_mask(X86_FEATURE_NO_FPU_SEL)));
-
-    x86_cpu_featureset_to_policy(fs, p);
-
-    /* Pass host cacheline size through to guests. */
-    p->basic.clflush_size = max->basic.clflush_size;
-
-    p->extd.maxphysaddr = min(p->extd.maxphysaddr, max->extd.maxphysaddr);
-    p->extd.maxphysaddr = min_t(uint8_t, p->extd.maxphysaddr,
-                                paging_max_paddr_bits(d));
-    p->extd.maxphysaddr = max_t(uint8_t, p->extd.maxphysaddr,
-                                (p->basic.pae || p->basic.pse36) ? 36 : 32);
-
-    p->extd.maxlinaddr = p->extd.lm ? 48 : 32;
-
-    recalculate_xstate(p);
-    recalculate_misc(p);
-
-    for ( i = 0; i < ARRAY_SIZE(p->cache.raw); ++i )
-    {
-        if ( p->cache.subleaf[i].type >= 1 &&
-             p->cache.subleaf[i].type <= 3 )
-        {
-            /* Subleaf has a valid cache type. Zero reserved fields. */
-            p->cache.raw[i].a &= 0xffffc3ffu;
-            p->cache.raw[i].d &= 0x00000007u;
-        }
-        else
-        {
-            /* Subleaf is not valid.  Zero the rest of the union. */
-            zero_leaves(p->cache.raw, i, ARRAY_SIZE(p->cache.raw) - 1);
-            break;
-        }
-    }
-
-    if ( !p->extd.svm )
-        p->extd.raw[0xa] = EMPTY_LEAF;
-
-    if ( !p->extd.page1gb )
-        p->extd.raw[0x19] = EMPTY_LEAF;
-
-    if ( p->extd.lwp )
-        p->extd.raw[0x1c].d &= max->extd.raw[0x1c].d;
-    else
-        p->extd.raw[0x1c] = EMPTY_LEAF;
-}
-
-void __init init_dom0_cpuid_policy(struct domain *d)
-{
-    struct cpuid_policy *p = d->arch.cpuid;
-
-    /* dom0 can't migrate.  Give it ITSC if available. */
-    if ( cpu_has_itsc )
-        p->extd.itsc = true;
-
-    /*
-     * Expose the "hardware speculation behaviour" bits of ARCH_CAPS to dom0,
-     * so dom0 can turn off workarounds as appropriate.  Temporary, until the
-     * domain policy logic gains a better understanding of MSRs.
-     */
-    if ( cpu_has_arch_caps )
-    {
-        uint64_t val;
-
-        p->feat.arch_caps = true;
-
-        rdmsrl(MSR_ARCH_CAPABILITIES, val);
-
-        d->arch.msr->arch_caps.raw = val &
-            (ARCH_CAPABILITIES_RDCL_NO | ARCH_CAPABILITIES_IBRS_ALL | ARCH_CAPS_RSBA |
-             ARCH_CAPS_SSB_NO | ARCH_CAPS_MDS_NO | ARCH_CAPS_IF_PSCHANGE_MC_NO |
-             ARCH_CAPS_TAA_NO | ARCH_CAPS_SBDR_SSDP_NO | ARCH_CAPS_FBSDP_NO |
-             ARCH_CAPS_PSDP_NO | ARCH_CAPS_FB_CLEAR | ARCH_CAPS_RRSBA |
-             ARCH_CAPS_BHI_NO | ARCH_CAPS_PBRSB_NO);
-    }
-
-    /* Apply dom0-cpuid= command line settings, if provided. */
-    if ( dom0_cpuid_cmdline )
-    {
-        uint32_t fs[FSCAPINTS];
-        unsigned int i;
-
-        x86_cpu_policy_to_featureset(p, fs);
-
-        for ( i = 0; i < ARRAY_SIZE(fs); ++i )
-        {
-            fs[i] |=  dom0_enable_feat [i];
-            fs[i] &= ~dom0_disable_feat[i];
-        }
-
-        x86_cpu_featureset_to_policy(fs, p);
-
-        recalculate_cpuid_policy(d);
-    }
-}
-
 void guest_cpuid(const struct vcpu *v, uint32_t leaf,
                  uint32_t subleaf, struct cpuid_leaf *res)
 {
@@ -1237,27 +470,6 @@ void x86_cpu_featureset_to_policy(
     p->feat._7d1             = fs[FEATURESET_7d1];
 }
 
-static void __init __maybe_unused build_assertions(void)
-{
-    BUILD_BUG_ON(ARRAY_SIZE(known_features) != FSCAPINTS);
-    BUILD_BUG_ON(ARRAY_SIZE(pv_featuremask) != FSCAPINTS);
-    BUILD_BUG_ON(ARRAY_SIZE(hvm_shadow_featuremask) != FSCAPINTS);
-    BUILD_BUG_ON(ARRAY_SIZE(hvm_hap_featuremask) != FSCAPINTS);
-    BUILD_BUG_ON(ARRAY_SIZE(deep_features) != FSCAPINTS);
-
-    /* Find some more clever allocation scheme if this trips. */
-    BUILD_BUG_ON(sizeof(struct cpuid_policy) > PAGE_SIZE);
-
-    BUILD_BUG_ON(sizeof(raw_cpu_policy.basic) !=
-                 sizeof(raw_cpu_policy.basic.raw));
-    BUILD_BUG_ON(sizeof(raw_cpu_policy.feat) !=
-                 sizeof(raw_cpu_policy.feat.raw));
-    BUILD_BUG_ON(sizeof(raw_cpu_policy.xstate) !=
-                 sizeof(raw_cpu_policy.xstate.raw));
-    BUILD_BUG_ON(sizeof(raw_cpu_policy.extd) !=
-                 sizeof(raw_cpu_policy.extd.raw));
-}
-
 /*
  * Local variables:
  * mode: C
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -71,7 +71,6 @@
 #include <public/memory.h>
 #include <public/vm_event.h>
 #include <public/arch-x86/cpuid.h>
-#include <asm/cpuid.h>
 
 bool_t __read_mostly hvm_enabled;
 
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -50,7 +50,7 @@
 #include <asm/nmi.h>
 #include <asm/alternative.h>
 #include <asm/mc146818rtc.h>
-#include <asm/cpuid.h>
+#include <asm/cpu-policy.h>
 #include <asm/spec_ctrl.h>
 
 /* opt_nosmp: If true, secondary processors are ignored. */
@@ -1607,7 +1607,7 @@ void __init noreturn __start_xen(unsigne
     if ( !tboot_protect_mem_regions() )
         panic("Could not protect TXT memory regions");
 
-    init_guest_cpuid();
+    init_guest_cpu_policies();
 
     if ( dom0_pvh )
     {
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -24,6 +24,7 @@
 #include <xen/symbols.h>
 #include <xen/keyhandler.h>
 #include <xen/guest_access.h>
+#include <asm/cpu-policy.h>
 #include <asm/io.h>
 #include <asm/msr.h>
 #include <asm/mpspec.h>
--- a/xen/include/asm-x86/cpu-policy.h
+++ b/xen/include/asm-x86/cpu-policy.h
@@ -334,9 +334,18 @@ struct domain;
 
 const uint32_t *lookup_deep_deps(uint32_t feature);
 
+/* Initialise the guest cpu_policy objects. */
+void init_guest_cpu_policies(void);
+
 /* Allocate and initialise a CPU policy suitable for the domain. */
 int init_domain_cpu_policy(struct domain *d);
 
+/* Apply dom0-specific tweaks to the CPUID policy. */
+void init_dom0_cpuid_policy(struct domain *d);
+
+/* Clamp the CPUID policy to reality. */
+void recalculate_cpuid_policy(struct domain *d);
+
 #endif /* __X86_CPU_POLICY_H__ */
 
 /*
--- a/xen/include/asm-x86/cpuid.h
+++ b/xen/include/asm-x86/cpuid.h
@@ -7,14 +7,11 @@
 #ifndef __ASSEMBLY__
 #include <xen/types.h>
 #include <xen/kernel.h>
-#include <asm/cpu-policy.h>
 #include <asm/x86_emulate.h>
 #include <public/sysctl.h>
 
 extern const uint32_t known_features[FSCAPINTS];
 
-void init_guest_cpuid(void);
-
 /*
  * Expected levelling capabilities (given cpuid vendor/family information),
  * and levelling capabilities actually available (given MSR probing).
@@ -48,12 +45,8 @@ extern struct cpuidmasks cpuidmask_defau
 /* Whether or not cpuid faulting is available for the current domain. */
 DECLARE_PER_CPU(bool, cpuid_faulting_enabled);
 
-/* Apply dom0-specific tweaks to the CPUID policy. */
-void init_dom0_cpuid_policy(struct domain *d);
-
-/* Clamp the CPUID policy to reality. */
-void recalculate_cpuid_policy(struct domain *d);
-
+struct vcpu;
+struct cpuid_leaf;
 void guest_cpuid(const struct vcpu *v, uint32_t leaf,
                  uint32_t subleaf, struct cpuid_leaf *res);