#!/bin/bash

# klp-kvm-l1tf-ctrl-smt
#
# Copyright (c) 2018 SUSE
# Author: Nicolai Stange <nstange@suse.de>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
#/


me=klp-kvm-l1tf-ctrl-smt

# Command line state: the requested action (enable or disable), the
# dry-run switch and whether failing sanity checks are to be ignored.
action=
simulate=no
force=no

# Every message gets prefixed with the program name, except when
# stdout is attached to a terminal.
if [ -t 1 ]; then
    pr_prefix=
else
    pr_prefix="${me}: "
fi

# Print the error message $1 to stderr.
pr_err() {
    local msg="$1"

    printf '%s\n' "${pr_prefix}error: ${msg}" >&2
}

# Print the warning message $1 to stderr.
pr_warn() {
    local msg="$1"

    printf '%s\n' "${pr_prefix}warning: ${msg}" >&2
}

# Print the informational message $1 to stdout.
#
# printf is used rather than echo: with an empty prefix, echo would
# take a message like '-n' or '-e ...' for one of its own options.
pr_info() {
    printf '%s\n' "${pr_prefix}$1"
}

# Print the one-line command synopsis to stdout.
usage() {
    printf '%s [-h] [-n] [-f] -d | -e\n' "${me}"
}

# Report the command line error $1, followed by the synopsis and a
# pointer to -h, on stderr and terminate with exit status 1.
usage_err() {
    pr_err "$1"
    cat 1>&2 <<EOF
Usage: $(usage)

See \`${me} -h' for more information.
EOF
    exit 1
}

# Print the synopsis followed by a description of each option to
# stdout.
show_help() {
    usage
    # The format is reused for each (option letter, description) pair.
    printf '\t-%s\t%s\n' \
	h 'Show this help.' \
	d 'Disable SMT.' \
	e 'Enable SMT. Makes the system vulnerable to CVE-2018-3646!' \
	n 'Dry run.' \
	f 'Ignore failing sanity checks.'
}


# Parse the command line. Note that usage_err() never returns: it
# terminates the script with exit status 1.
while getopts ":hdenf" opt; do
    case "$opt" in
	h)
	    show_help
	    exit 0
	    ;;

	n)
	    simulate=yes
	    ;;

	f)
	    force=yes
	    ;;

	d)
	    # Repeating -d is fine, combining it with -e is not.
	    [[ "$action" != enable ]] \
		|| usage_err "both of -d and -e given on command line"
	    action=disable
	    ;;

	e)
	    # Repeating -e is fine, combining it with -d is not.
	    [[ "$action" != disable ]] \
		|| usage_err "both of -d and -e given on command line"
	    action=enable
	    ;;

	\?)
	    # With the leading ':' in the option string, getopts stores
	    # the offending option character in OPTARG.
	    usage_err "command line option \`$OPTARG' not recognized"
	    ;;
    esac
done

# Exactly one of the two actions must have been requested.
[[ -n "$action" ]] \
    || usage_err "neither of -d or -e given on command line"

# Nothing to do unless /proc/cpuinfo reports a GenuineIntel CPU.
grep -q GenuineIntel /proc/cpuinfo || {
    pr_info "CPU not vulnerable"
    exit 0
}


# The value to be written to /sys/devices/system/cpu/cpuX/online:
# 0 takes a CPU offline, 1 brings it online.
case "$action" in
    disable)
	target_online_state=0
	;;
    *)
	target_online_state=1
	;;
esac


# The first step is to collect the threads to act on. The two actions
# are handled separately below, because offline logical CPUs don't
# provide the /sys/devices/system/cpu/cpuX/topology/ information the
# disable action relies on.
sysfs_cpu=/sys/devices/system/cpu

[ -d "$sysfs_cpu" ] || {
    pr_err "directory \`$sysfs_cpu' does not exist"
    exit 2
}

# Print the two thread IDs of the 'number,number' sibling pair $1 in
# ascending order, i.e. as 'low,high'.
order_sibling_pair() {
    local first="${1%,*}"
    local second="${1#*,}"

    if [ "$first" -gt "$second" ]; then
	echo "${second},${first}"
    else
	echo "${first},${second}"
    fi
}

# Collect the IDs of the CPUs to act on in $threads (one per line,
# sorted numerically). For the disable action, $thread_pairs
# additionally receives the 'low,high' sibling pairs.
threads=
if [ "$action" = disable ]; then
    thread_pairs=
    for tsl in "$sysfs_cpu"/cpu[0-9]*/topology/thread_siblings_list; do
	if [ "$(wc -l "$tsl" | cut -d' ' -f 1)" -ne 1 ]; then
	    # Not a single line
	    pr_err "unexpected format in \`$tsl'"
	    exit 2

	elif grep -q '^[0-9]\+$' "$tsl"; then
	    # A single number -> no sibling
	    continue

	elif ! grep -q '^[0-9]\+,[[:blank:]]*[0-9]\+$' "$tsl"; then
	    # Not in 'number,number' format.
	    # NOTE(review): a range style list such as '0-1' ends up
	    # here as well -- confirm whether that needs supporting.
	    pr_err "unexpected format in \`$tsl'"
	    exit 2
	fi

	# Order the thread siblings by their ID. They should already be,
	# but better be safe. The duplicate removal below relies on both
	# siblings of a core yielding the very same 'low,high' string.
	tsl="$(order_sibling_pair "$(tr -d '[:blank:]' < "$tsl")")"

	# The sibling with the higher ID is the one to be offlined.
	t2="${tsl#*,}"

	if [ ! -f "$sysfs_cpu/cpu$t2/online" ]; then
	    pr_err "file \`$sysfs_cpu/cpu$t2/online' does not exist"
	    exit 1
	fi

	thread_pairs="$thread_pairs $tsl"
	threads="$threads $t2"
    done

    # Remove duplicates and sort
    thread_pairs="$(echo $thread_pairs | tr ' ' '\n' | sort -n | uniq)"
    threads="$(echo $threads | tr ' ' '\n' | sort -n | uniq)"

else # $action = enable
    # No topology information for offline CPUs. Simply online all
    # threads that are currently offline.
    for o in "$sysfs_cpu"/cpu[0-9]*/online; do
	# If no CPU has an 'online' file, the glob doesn't match and is
	# left as is. Don't mistake the pattern for a CPU.
	[ -f "$o" ] || continue

	if [ "$(cat "$o")" -ne 0 ]; then
	    continue
	fi

	# .../cpu<N>/online -> <N>
	t="${o%/online}"
	t="${t##*/cpu}"
	threads="$threads $t"
    done

    # Sort by number
    threads="$(echo $threads | tr ' ' '\n' | sort -n)"
fi

# No thread to act on: the system already is in the requested state.
[ -n "$threads" ] || {
    pr_info "Nothing to do"
    exit 0
}


# check_fail() reports a failed sanity check. When run with -f, such
# failures are non-fatal and merely get reported as warnings,
# otherwise they are reported as errors.
case "$force" in
    yes)
	check_fail() {
	    pr_warn "$1"
	}
	;;
    *)
	check_fail() {
	    pr_err "$1"
	}
	;;
esac

# For the disable action: check that none of the CPUs about to be
# offlined is listed in the cpuset.cpus file of a non-toplevel cpuset
# cgroup.
#
# Returns 0 if no such membership has been found, 1 if a check failed
# (reported through check_fail) and 2 if the cgroup hierarchy could
# not be examined.
check_cg_cs() {
    pr_info "Checking cpuset cgroups..."

    sysfs_cg_cs=/sys/fs/cgroup/cpuset
    if [ ! -d "$sysfs_cg_cs" ]; then
	check_fail "directory \`$sysfs_cg_cs' does not exist"
	return 1
    fi

    # The directory walk runs in a subshell at the end of the pipeline;
    # its exit status becomes this function's return value.
    find "$sysfs_cg_cs" -type d -print0 | \
	( found=0
	  range_re='^[0-9]+(-[0-9]+)?$'
	  while IFS= read -r -d '' cg_dir; do
	      # Skip the toplevel cgroup.
	      [ "$cg_dir" != "$sysfs_cg_cs" ] || continue

	      cpus_file="${cg_dir}/cpuset.cpus"
	      if [ ! -f "$cpus_file" ]; then
		  pr_err "file \`${cpus_file}' does not exist"
		  exit 2
	      fi

	      # The file holds a comma separated list whose items are
	      # either a single CPU or a 'low-high' range.
	      cpus="$(cat "$cpus_file")"
	      for item in ${cpus//,/ }; do
		  if ! [[ "$item" =~ $range_re ]]; then
		      pr_err "unexpected format \`$item' in \`${cpus_file}'"
		      exit 2
		  fi
		  # For a single CPU, low and high are the same.
		  low="${item%-*}"
		  high="${item#*-}"

		  for t in $threads; do
		      if [ "$low" -le "$t" ] && [ "$t" -le "$high" ]; then
			  check_fail "about to be offlined CPU $t is a member of \`${cpus_file}'"
			  found=1
		      fi
		  done
	      done
	  done
	  exit $found
	)
}

# If the action is to disable SMT, check that for each to be offlined
# CPU, either both or none of the two siblings are a member of any
# task's affinity list. This guarantees two things:
# 1.) Consistent downscaling across all tasks in the system.
# 2.) A task's affinity list will not become empty and thus, it'll be
#     kept as is (otherwise it would get reset to the list of CPUs
#     remaining online). This makes sure that bringing the CPUs up
#     again will revert the change.
#
# Reads the sibling pairs from $thread_pairs. Kernel threads are
# skipped. Returns 0 if all tasks are fine and 1 if a check failed
# (reported through check_fail). Exits with status 2 on an affinity
# list in unexpected format.
check_task_affinities() {
    pr_info "Checking task affinities..."

    local r=0
    local range_re='^[0-9]+(-[0-9]+)?$'
    local tsk tsk_stat tsk_flags cal comm p c c_low c_high
    local t1 t2 t1_in_cal t2_in_cal _t _f

    # 'lwp=' makes ps omit the header line.
    for tsk in $(ps -AL -o lwp=); do
	# Task might have exited
	tsk_stat="$(cat "/proc/$tsk/stat" 2>/dev/null)" || continue

	# The line reads 'pid (comm) state ppid pgrp session tty_nr
	# tpgid flags ...'. comm may contain blanks and parentheses
	# itself, so strip everything through the last ') ' before
	# splitting the line into fields.
	tsk_stat="${tsk_stat##*) }"
	read -r _f _f _f _f _f _f tsk_flags _f <<<"$tsk_stat"
	[[ "$tsk_flags" =~ ^[0-9]+$ ]] || continue

	# Bit 0x200000 in the flags marks a kernel thread.
	if (( tsk_flags & 0x200000 )); then
	    continue
	fi

	# Task might have exited
	cal="$(grep '^Cpus_allowed_list:' "/proc/$tsk/status" 2>/dev/null)" \
	    || continue
	# Turn the comma separated list into a blank separated one; the
	# leading blanks vanish at word splitting.
	cal="${cal#Cpus_allowed_list:}"
	cal="${cal//,/ }"

	for p in $thread_pairs; do
	    t1="${p%,*}"
	    t2="${p#*,}"

	    t1_in_cal=no
	    t2_in_cal=no

	    for c in $cal; do
		if ! [[ "$c" =~ $range_re ]]; then
		    pr_err "unexpected format \`$c' in \`/proc/$tsk/status'"
		    exit 2
		fi
		# For a single CPU, low and high are the same.
		c_low="${c%-*}"
		c_high="${c#*-}"

		if [ "$c_low" -le "$t1" ] && [ "$t1" -le "$c_high" ]; then
		    t1_in_cal=yes
		fi

		if [ "$c_low" -le "$t2" ] && [ "$t2" -le "$c_high" ]; then
		    t2_in_cal=yes
		fi
	    done

	    # Exactly one of the two siblings is in the affinity list.
	    if [ "$t1_in_cal" != "$t2_in_cal" ]; then
		# Make t1 the sibling that is in the list.
		if [ "$t1_in_cal" = no ]; then
		    _t="$t1"
		    t1="$t2"
		    t2="$_t"
		fi

		# Task might have exited, no point in checking the
		# remaining pairs then.
		comm="$(cat "/proc/$tsk/comm" 2>/dev/null)" || continue 2

		check_fail "task \`$tsk $comm' has CPU $t1 in its affinity list but not its sibling $t2"
		r=1
	    fi
	done
    done

    return $r
}

# Check that irqbalance's configuration is trivial, i.e. that
# /etc/sysconfig/irqbalance sets neither IRQBALANCE_BANNED_CPUS nor
# IRQBALANCE_ARGS.
#
# Returns 0 if so (or if the file doesn't exist), 1 if the
# configuration is non-trivial and 2 if the file could not be sourced.
check_irqbalance_config() {
    pr_info "Checking irqbalance configuration..."

    [ -f /etc/sysconfig/irqbalance ] || return 0

    # Source the file in a subshell in order to keep its settings out
    # of this script's environment. The subshell's exit status is the
    # function's return value.
    (
	. /etc/sysconfig/irqbalance || exit 2

	if [ -z "$IRQBALANCE_BANNED_CPUS" ] && [ -z "$IRQBALANCE_ARGS" ]; then
	    exit 0
	fi

	if [ "$force" = yes ]; then
	    check_fail "non-trivial \`/etc/sysconfig/irqbalance'"
	else
	    check_fail "non-trivial \`/etc/sysconfig/irqbalance', can't proceed without \`-f' option"
	fi
	exit 1
    )
}

# Enabling SMT again later simply brings up every offline logical
# core. To make sure that this undoes exactly the current disable
# operation (and nothing more), check that all logical cores are
# online right now.
#
# Returns 0 if so, 1 if some CPU is offline (reported through
# check_fail) and 2 if a CPU other than cpu0 lacks its 'online' file.
check_all_cores_online() {
    pr_info "Checking that all logical cores are online..."

    local r=0
    for c in "$sysfs_cpu"/cpu[0-9]*; do
	if [ -f "$c/online" ]; then
	    if [ "$(< "$c/online")" -ne 1 ]; then
		# .../cpu<N> -> <N>
		t="${c##*/}"
		t="${t/cpu/}"
		check_fail "configuration unsupported: CPU $t is offline"
		r=1
	    fi

	elif [ "$c" != "$sysfs_cpu/cpu0" ]; then
	    pr_err "file \`$c/online' does not exist"
	    return 2
	fi
	# Otherwise: the BSP hasn't got an 'online' file.
    done
    return $r
}

# Run all the prerequisite checks for the disable action, one after
# another. A check returning 1 has detected an unsafe condition, the
# remaining checks are run nonetheless. A check returning 2 failed for
# some other reason, the script is terminated with exit status 2 then.
#
# Returns 0 if all checks passed and 1 if any detected an unsafe
# condition.
run_disable_checks() {
    local unsafe=0
    local check

    for check in check_cg_cs check_task_affinities \
		 check_irqbalance_config check_all_cores_online; do
	"$check"
	case $? in
	    1)
		# Unsafe condition detected
		unsafe=1
		;;
	    2)
		# Checks failed for some other reason.
		exit 2
		;;
	esac
    done

    return $unsafe
}

if [ "$action" = disable ]; then
    r=0
    run_disable_checks || r=1

    # With -f, unsafe conditions have been reported as warnings only
    # and don't keep the script from proceeding.
    if [ "$r" -ne 0 ] && [ "$force" != yes ]; then
	pr_err "prerequisite checks failed, aborting"
	exit 2
    fi
fi


# Finally, bring each of the collected threads into the target state,
# or merely announce doing so on a dry run.
for t in $threads; do
    if [ "$simulate" = yes ]; then
	case "$action" in
	    disable)
		pr_info "Would offline CPU $t"
		;;
	    *)
		pr_info "Would online CPU $t"
		;;
	esac
	continue
    fi

    # Progress message; the outcome is appended to the same line.
    case "$action" in
	disable)
	    printf '%sOfflining CPU %s...\t' "$pr_prefix" "$t"
	    ;;
	*)
	    printf '%sOnlining CPU %s...\t' "$pr_prefix" "$t"
	    ;;
    esac

    # The write is done in a subshell so that a failing redirection
    # can be silenced as well.
    if ( echo "$target_online_state" > "$sysfs_cpu/cpu$t/online" ) 2>/dev/null; then
	echo 'success'
	continue
    fi

    echo 'fail'
    case "$action" in
	disable)
	    pr_err "failed to offline CPU $t"
	    ;;
	*)
	    pr_err "failed to online CPU $t"
	    ;;
    esac
    exit 2
done

exit 0
