#!/bin/bash

#
# Check health state of the system
#
# Check whether important services are started and running. If this is not
# the case:
# - on first boot after update, roll back to the old snapshot
# - if it is not the first boot, reboot
# - if reboot does not help, log this and stop the services
#

# File holding LAST_WORKING_BTRFS_ID, written by save_working_snapshot
STATE_FILE="/var/lib/misc/health-check.state"
# Marker with a timestamp, written by error_decission before a retry reboot
REBOOTED_STATE="/var/lib/misc/health-check.rebooted"
# Directory whose entries are run with the argument "check" or "stop"
PLUGINDIR="/usr/lib/health-checker"
# Highest telemetry severity recorded by create_log so far
TELEM_SEVERITY=1
# Messages collected by create_log for the telemetry record
TELEM_PAYLOAD=""

# btrfs subvolume IDs, filled in by set_btrfs_id
BTRFS_ID_DEFAULT=0
BTRFS_ID_CURRENT=0

# Refresh the global btrfs subvolume IDs:
#   BTRFS_ID_DEFAULT - second field of "btrfs subvolume get-default /"
#   BTRFS_ID_CURRENT - the subvolid= value in the mount options of /
set_btrfs_id()
{
    BTRFS_ID_DEFAULT=$(
        btrfs subvolume get-default / \
            | awk '{print $2}'
    )
    BTRFS_ID_CURRENT=$(
        findmnt --output OPTIONS --noheadings / \
            | sed -e 's|.*subvolid=\([0-9]\+\).*|\1|g'
    )
}

# Log a message to syslog and queue it for the telemetry record.
#   $1 - syslog priority in facility.level form (e.g. user.crit)
#   $2 - message text
# Globals written:
#   TELEM_SEVERITY - raised to the mapped severity if that is higher
#   TELEM_PAYLOAD  - message appended; entries are joined by a literal "\n"
#                    that telem_send_record expands with "echo -e"
create_log()
{
    local SEVERITY=1

    # Quote both arguments so the message reaches logger as one word:
    # unquoted, it would be word-split and glob-expanded first.
    logger -s -p "$1" -- "$2"

    # Create severity and payload for telemetrics if available
    case "$1" in
        user.emerg)
            SEVERITY=4
            ;;
        user.crit)
            SEVERITY=3
            ;;
        user.alert)
            SEVERITY=2
            ;;
        *)
            SEVERITY=1
            ;;
    esac

    # Keep only the highest severity seen so far
    if [ "${SEVERITY}" -gt "${TELEM_SEVERITY}" ]; then
        TELEM_SEVERITY=${SEVERITY}
    fi

    if [ -z "${TELEM_PAYLOAD}" ]; then
        TELEM_PAYLOAD=$2
    else
        TELEM_PAYLOAD="${TELEM_PAYLOAD}\n$2"
    fi
}

# Submit the collected payload through telem-record-gen, if it is installed.
#   $1 - "1" to pause for two seconds afterwards (used before a reboot)
# Reads TELEM_PAYLOAD and TELEM_SEVERITY.
telem_send_record()
{
    # Log via telemetrics if available; otherwise there is nothing to do
    [ -x /usr/bin/telem-record-gen ] || return 0

    echo -e "${TELEM_PAYLOAD}" \
        | /usr/bin/telem-record-gen -s $TELEM_SEVERITY -c "org.opensuse/health/boot"

    # Communication is async, give daemon time to send data
    # before we reboot
    [ "$1" = "1" ] && sleep 2
}

# Record the running snapshot as the last known-good one.
# Writes LAST_WORKING_BTRFS_ID=<id> to STATE_FILE, but only when the current
# subvolume ID equals the default subvolume ID.
save_working_snapshot()
{
    local id_re='^[0-9]+$'

    set_btrfs_id

    # Both IDs must be numeric before they are compared. With empty values
    # an unquoted test collapses to "[ -eq ]", which is true, and an empty
    # ID would overwrite the saved state.
    if [[ ! ${BTRFS_ID_DEFAULT} =~ ${id_re} || ! ${BTRFS_ID_CURRENT} =~ ${id_re} ]]; then
        echo "Cannot determine btrfs subvolume IDs, not saving state" >&2
        return 0
    fi

    if [ "${BTRFS_ID_DEFAULT}" -eq "${BTRFS_ID_CURRENT}" ]; then
        echo "LAST_WORKING_BTRFS_ID=${BTRFS_ID_DEFAULT}" > "${STATE_FILE}"
    fi
}

# Make the last known-good snapshot the default btrfs subvolume again.
# Sources STATE_FILE to obtain LAST_WORKING_BTRFS_ID. If set-default fails,
# the error is logged, the telemetry record is sent with a delay, and the
# whole script exits with status 1.
rollback()
{
    . "${STATE_FILE}"

    # The ID is quoted so that an empty value is passed as an (invalid) ID
    # rather than vanishing and letting /.snapshots become the first argument.
    if ! btrfs subvolume set-default "${LAST_WORKING_BTRFS_ID}" /.snapshots; then
        # Report the ID that was actually passed to set-default
        create_log user.crit "ERROR: btrfs set-default ${LAST_WORKING_BTRFS_ID} failed!"
        telem_send_record 1
        exit 1
    fi
}

# Ask every health-checker plugin to stop its services.
# Each entry found in PLUGINDIR is invoked with the single argument "stop".
stop_services()
{
    local script

    # stop all services
    for script in "${PLUGINDIR}"/* ; do
        # With no match the glob stays a literal pattern; skip it instead
        # of trying to execute a path that does not exist.
        [ -e "${script}" ] || continue
        "${script}" stop
    done
}

# Decide how to react to a failed health check:
#   - no state file (no successful boot recorded): stop services
#   - default subvolume differs from the current one: reboot
#   - last working subvolume differs from the default: roll back, reboot
#   - no rebooted-state marker yet: write the marker and reboot
#   - marker already present: stop services
error_decission()
{
    if [ ! -f "${STATE_FILE}" ]; then
        # No state file, no successful boot
        create_log user.emerg "Machine didn't come up correct, stop services"
        stop_services
        return
    fi

    # Provides LAST_WORKING_BTRFS_ID
    . "${STATE_FILE}"

    set_btrfs_id

    # The IDs are quoted on purpose: with empty values an unquoted test
    # collapses to "[ -ne ]", which is true and would trigger a reboot on
    # every boot. Quoted, a non-numeric ID makes the test fail, so control
    # falls through to the reboot-once / stop-services branches below.
    if [ "${BTRFS_ID_DEFAULT}" -ne "${BTRFS_ID_CURRENT}" ]; then
        # Don't tamper with system if not booted into default snapshot
        create_log user.alert "Machine didn't come up correct, trying rebooting into default snapshot"
        systemctl reboot
    elif [ "${LAST_WORKING_BTRFS_ID}" -ne "${BTRFS_ID_DEFAULT}" ]; then
        create_log user.alert "Machine didn't come up correct, do a rollback"
        # rollback exits the script itself when set-default fails
        if rollback; then
            telem_send_record 1
            systemctl reboot
        fi
    elif [ ! -f "${REBOOTED_STATE}" ]; then
        create_log user.crit "Machine didn't come up correct, try a reboot"
        # Leave a timestamped marker so the next failure is not retried
        date "+%Y-%m-%d %H:%M" > "${REBOOTED_STATE}"
        telem_send_record 1
        systemctl reboot
    else
        create_log user.emerg "Machine didn't come up correct, stop services"
        stop_services
    fi
}

# Clear GRUB flag (used to determine if system was able to boot at all)
echo "Clearing GRUB flag"
grub2-editenv - set health_checker_flag=0

echo "Starting health check"
FAILED=0
# Run every plugin with "check"; a single non-zero exit marks the boot as
# failed. Paths are quoted so a plugin name containing whitespace is not
# split into a command plus extra arguments.
for script in "${PLUGINDIR}"/* ; do
    if ! "${script}" check; then
        create_log user.crit "ERROR: \"${script} check\" failed"
        FAILED=1
    fi
done

if [ "${FAILED}" -ne 0 ]; then
    echo "Health check failed!"
    error_decission
    telem_send_record 0
    exit 1
fi

echo "Health check passed"
# Save good working state and remove old rebooted state file
save_working_snapshot
if [ -f "${REBOOTED_STATE}" ]; then
    create_log user.info "Health check passed after reboot"
    # The marker is a regular file (checked above), so no -r is needed
    rm -f -- "${REBOOTED_STATE}"
fi

# Send a record even when nothing was logged during this run
if [ -z "${TELEM_PAYLOAD}" ]; then
    TELEM_PAYLOAD="Health check passed"
fi
telem_send_record 0
exit 0
