# saphana-filesystem-lib
#
# Description:	Clone resource to check vitality of SAP HANA filesystem
#
###################################################################################################################
#
# saphana-filesystem-lib: (short shfs)
# Author:       Fabian Herschel, February 2023
# Support:      linux@sap.com
# License:      GNU General Public License (GPL)
# Copyright:    (c) 2023-2026 SUSE LLC
#
# An example usage:
#      See usage() function below for more details...
#
# OCF resource parameters:
#   OCF_RESKEY_SID              (LNX, SUS, SLE)
#   OCF_RESKEY_InstanceNumber   (00..99)
#   OCF_RESKEY_DIRECTORY        (path to be monitored, default /hana/shared/<SID>)
#   OCF_RESKEY_ON_FAIL_ACTION   (optional: fail, ignore; default is fail)
#
#######################################################################
#
saphana_filesystem_lib_version="1.3.1415927"
#
#######################################################################

#
# function: shfs_log_version - write the version of this library to the log
# params:   -
# globals:  saphana_filesystem_lib_version(r)
#
function shfs_log_version() {
    # called by: TODO
    local shfs_version_line
    printf -v shfs_version_line 'RA: saphana_filesystem_lib_version=%s' "$saphana_filesystem_lib_version"
    super_ocf_log info "$shfs_version_line"
} # end function shfs_log_version

#
# function: shfs_usage - short usage info
# params:   -
# globals:  $0(r)
#
function shfs_usage() {
    # called by: TODO
    # Prints a short usage text to stdout and returns 0.
    super_ocf_log info "FLOW ${FUNCNAME[0]} ($*)"
    local rc=0
    local methods
    # Use the method list of THIS library (shfs_methods). It is printed as a
    # blank-separated list with a trailing blank, so strip that blank before
    # turning the separators into '|'.
    methods=$(shfs_methods all)
    methods="${methods% }"
    methods="${methods// /|}"
    echo "usage: $0 ($methods)

    $0 manages a SAP HANA Instance as an HA resource.

    The 'start'        operation starts the pseudo resource (no filesystem gets mounted)
    The 'stop'         operation stops the pseudo resource (no filesystem gets unmounted)
    The 'status'       operation reports whether the HANA FS is accessible
    The 'monitor'      operation reports whether the HANA FS is accessible
    The 'notify'       operation always returns SUCCESS
    The 'validate-all' operation reports whether the parameters are valid
    The 'methods'      operation reports on the methods $0 supports
    The 'reload'       operation allows to change parameters without forcing a recover of all resource instances
    "
    return "$rc"
} # end function shfs_usage

#
# function: shfs_meta_data - print resource agent meta-data for cluster
# params:   -
# globals:  -
#
function shfs_meta_data() {
    # called by: TODO
    # Prints the resource agent meta-data (XML) to stdout and returns 0.
    super_ocf_log info "FLOW ${FUNCNAME[0]} ()"
    local rc=0
    # The XML is a single-quoted string: nothing inside is expanded by the shell.
    # "$SID" in the parameter descriptions is therefore printed literally (intended,
    # it is a placeholder for the reader) and must not be escaped with a backslash.
    # NOTE(review): $SAPHanaFilesystemVersion in the version attribute is printed
    # literally as well - confirm whether it should be expanded instead.
    # shellcheck disable=SC2016
    echo '<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="SAPHanaFilesystem" version="$SAPHanaFilesystemVersion">
    <version>1.0</version>
    <shortdesc lang="en">SAPHanaFilesystem - Monitors mounted SAP HANA filesystems.</shortdesc>
    <longdesc lang="en">The SAPHanaFilesystem resource agent (RA) monitors mounted SAP HANA filesystems
by checking read/write access. This RA does neither mount nor umount any
filesystem. In case the filesystem monitor fails, the RA decides how to proceed
based on HANA system replication status.  The monitor and action timeouts are
significantly shorter than SAPHanaController resource timeouts. This results in faster takeover actions.

* Behaviour on HANA primary sites

srHook=SOK: In case of monitor failure the Linux cluster tries to stop and
restart the SAPHanaFilesystem resource (not the real filesystem). If that stop
fails and the HANA system replication is in sync the node gets fenced.
In consequence an HANA sr_takeover will be triggered.

srHook=SFAIL: In case of monitor failure the Linux cluster tries to stop and
restart the SAPHanaFilesystem resource (not the real filesystem). If the HANA
system replication is not in sync this will be repeated until it gains success
or migration-threshold is reached.

* Behaviour on HANA secondary sites

In case of monitor failure the Linux cluster tries to stop and restart the
SAPHanaFilesystem resource (not the real filesystem). This will be repeated
until it gains success or migration-threshold is reached.
    </longdesc>
<parameters>
    <parameter name="SID" unique="0" required="1">
        <longdesc lang="en">The SAP System Identifier (SID)</longdesc>
        <shortdesc lang="en">The SAP System Identifier (SID)</shortdesc>
        <content type="string" default="" />
    </parameter>
    <parameter name="InstanceNumber" unique="0" required="0">
        <longdesc lang="en">The SAP Instance Number - currently not used (placeholder)</longdesc>
        <shortdesc lang="en">The SAP Instance Number</shortdesc>
        <content type="string" default="" />
    </parameter>
    <parameter name="DIRECTORY" unique="0" required="0">
        <longdesc lang="en">The directory to be checked. Default is "/hana/shared/$SID".
The RA creates data in an own subdirectory .suse_SAPHanaFilesystem. Do not touch this hidden subdirectory.</longdesc>
        <shortdesc lang="en">The directory to be checked. Default is "/hana/shared/$SID"</shortdesc>
        <content type="string" default="" />
    </parameter>
    <parameter name="ON_FAIL_ACTION" unique="0" required="0">
        <longdesc lang="en">Defines how the RA reacts, if the filesystem check fails while the HANA system replication is in sync.
"fail": the RA creates a poison pill file, so the following stop action reports an error.
"ignore": the RA does not create a poison pill file.
Default is "fail".</longdesc>
        <shortdesc lang="en">Reaction on a failed filesystem check: "fail" (default) or "ignore"</shortdesc>
        <content type="string" default="" />
    </parameter>
</parameters>
<actions>
    <action name="start" timeout="10" />
    <action name="stop" timeout="20" />
    <action name="status" timeout="60" />
    <action name="monitor" depth="0" timeout="120" interval="120" />
    <action name="validate-all" timeout="5" />
    <action name="meta-data" timeout="5" />
    <action name="methods" timeout="5" />
    <action name="reload" timeout="5" />
</actions>
</resource-agent>
'
    return "$rc"
} # end function shfs_meta_data

#
# function: shfs_methods - report supported cluster methods
# params:   all
# globals:  -
# methods: What methods/operations do we support?
#
function shfs_methods() {
    # called by: TODO
    # Prints the supported methods as a blank-separated list (each entry is
    # followed by one blank). $1 selects the list: 'all' (default) or 'actions'.
    # Any other selector yields an empty list, which is printed as a single blank.
    super_ocf_log info "FLOW ${FUNCNAME[0]} ($*)"
    local selector="${1:-all}"
    local -a supported=()
    if [[ "$selector" == "all" ]]; then
        supported=(start stop status monitor notify validate-all methods meta-data usage admin-setup reload)
    elif [[ "$selector" == "actions" ]]; then
        supported=(start stop status monitor reload)
    fi
    # printf re-uses the format for every list entry
    printf "%s " "${supported[@]}"
    return 0
} # end function shfs_methods

#
# function: shfs_init - initialize variables for the resource agent
# params:   -
# globals:  OCF_*(r), SID(w), sid(rw), sidadm(w), InstanceName(w), InstanceNr(w),
# globals:  ATTR_NAME_HANA_*
# globals:  nodelist(w)
# globals:  NODENAME(w), hdbver(w)
# shfs_init : Define global variables with default values, if optional parameters are not set
#
function shfs_init() {
    # called by: TODO
    # Fills the global variables used by the other shfs_* functions from the
    # OCF_RESKEY_* resource parameters and returns OCF_SUCCESS.
    super_ocf_log info "FLOW ${FUNCNAME[0]} ($*)"
    local rc=$OCF_SUCCESS
    local hdbANSWER=""
    local chkMethod=""
    # TODO PRIO3: NG - some vars are marked for export to avoid 'unused' error in shellcheck. Check later, if these vars are valuable again when combining Filesystem and Controller in the next step

    # name of the local cluster node
    NODENAME=$(crm_node -n)

    # resource parameters, optional ones get their default here
    SID="$OCF_RESKEY_SID"
    InstanceNr="$OCF_RESKEY_InstanceNumber"
    fs_hana_shared="${OCF_RESKEY_DIRECTORY:-/hana/shared/$SID}"
    HANA_CALL_TIMEOUT="${OCF_RESKEY_HANA_CALL_TIMEOUT:-90}"
    ON_FAIL_ACTION="${OCF_RESKEY_ON_FAIL_ACTION:-fail}"

    # values derived from SID, instance number and node name
    InstanceName="HDB${InstanceNr}"
    sid="${SID,,}"
    sidadm="${sid}adm"
    export sidadm
    runDir="/run/SAPHanaSR_${SID}"
    pp_hana_shared="/run/SAPHanaFileSystem_poison_pill_${SID}"              # poison pill file
    td_hana_shared="${fs_hana_shared}/.suse_SAPHanaFilesystem/${NODENAME}"  # node specific test directory

    # attribute definitions, filter attribute and list of the other cluster nodes
    saphana_init_attribute_definitions
    SAPHanaFilter=$(get_hana_attribute "X" "${ATTR_NAME_HANA_FILTER[@]}")
    mapfile -t otherNodes < <( cluster_get_other_nodes "${NODENAME}" ) # the syntax < <(..) is intended

    super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$OCF_SUCCESS"
    return "$OCF_SUCCESS"
} # end function shfs_init

#
#############################################################################
#
# function: shfs_start - start a hana pseudo FS resource
# params:   -
# globals:  OCF_*
# shfs_start : Start the SAP HANA pseudo FS resource instance
# TODO: should shfs_start trigger a start error, if the FS is not usable at this point in time (by running an internal shfs_monitor)?
#
function shfs_start() {
    # Removes a left-over poison pill and starts the pseudo resource.
    # Returns OCF_SUCCESS, if the pseudo resource start returned 0,
    # otherwise OCF_NOT_RUNNING.
    super_ocf_log info "FLOW ${FUNCNAME[0]} ()"
    local rc="$OCF_NOT_RUNNING"
    shfs_reset_poison_pill
    if ha_pseudo_resource "$OCF_RESOURCE_INSTANCE" start; then
        rc="$OCF_SUCCESS"
    else
        rc="$OCF_NOT_RUNNING"
    fi
    super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc"
    return "$rc"
} # end function shfs_start

#
# function: shfs_stop - stop a hana pseudo FS resource
# params:   -
# globals:  OCF_*(r), SAPCONTROL(r), SID(r), InstanceName(r)
# shfs_stop: Stop the SAP HANA pseudo FS resource instance
#
function shfs_stop() {
    # Stops the pseudo resource. If a poison pill file exists, it gets removed
    # and, after a 10s sleep, OCF_ERR_GENERIC is returned. Otherwise the return
    # code of the pseudo resource stop is handed back.
    super_ocf_log info "FLOW ${FUNCNAME[0]} ()"
    local rc=0
    local ha_ps_rc
    ha_pseudo_resource "$OCF_RESOURCE_INSTANCE" stop
    ha_ps_rc="$?"
    if [ ! -e "$pp_hana_shared" ]; then
        # no poison pill - report the result of the pseudo resource
        rc="$ha_ps_rc"
    else
        # poison pill found - consume it and report a stop error
        rm "$pp_hana_shared"
        super_ocf_log info "RA poison pill detected - reporting stop error - sleep 10s"
        sleep 10
        rc="$OCF_ERR_GENERIC"
    fi
    super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc"
    return "$rc"
} # end function shfs_stop

#
# function: shfs_reset_poison_pill - remove an existing poison pill file
# params:   -
# globals:  pp_hana_shared(r)
#
function shfs_reset_poison_pill() {
    # nothing to do without a poison pill file
    [ -e "$pp_hana_shared" ] || return 0
    super_ocf_log info "reset old RA poison pill"
    rm "$pp_hana_shared"
}

#
# function: shfs_error_simulation - check (and consume) the error simulation trigger file
# params:   -
# globals:  SID(r)
# returns:  0, if the trigger file /run/break_SAPHanaFilesystem_<SID> existed (it gets removed), 1 otherwise
#
function shfs_error_simulation() {
    local shfs_trigger_file="/run/break_SAPHanaFilesystem_${SID}"
    [ -e "$shfs_trigger_file" ] || return 1
    rm "$shfs_trigger_file"
    return 0
}

#
# function: shfs_test_directory - check write and read access to the test directory
# params:   -
# globals:  td_hana_shared(r)
# returns:  0, if the directory could be created, one 4k block could be written
#           with direct/synchronous I/O and the directory could be listed; 2 otherwise
# The function is exported by shfs_monitor and run in a sub-process, so it only
# relies on td_hana_shared.
#
function shfs_test_directory()
{
    local rc_mkdir=0 rc_write=0 rc_list=0
    # an array keeps the dd options as separate words without unquoted expansion
    local -a dd_opts=(oflag=direct,sync bs=4k count=1 conv=fsync,sync)
    local status_file="${td_hana_shared}/test"

    mkdir -p "${td_hana_shared}"; rc_mkdir="$?"
    dd if=/dev/zero of="${status_file}" "${dd_opts[@]}"; rc_write="$?"
    # read check: a failing directory listing counts as an error as well
    ls "${td_hana_shared}"; rc_list="$?"

    if (( rc_mkdir != 0 || rc_write != 0 || rc_list != 0 )); then
        return 2
    fi
    return 0
}

#
# function: shfs_monitor - monitor a hana pseudo FS resource instance
# params:   --
# globals:  OCF_*(r), SAPCONTROL(r), InstanceNr(r)
# shfs_monitor: monitor a hana pseudo FS resource instance - check FS for rw access
#
function shfs_monitor() {
    # called by: TODO
    # Monitors the pseudo resource and - for regular (non-probe) monitors - the
    # filesystem:
    #  - probe: returns the result of the pseudo resource monitor
    #  - otherwise: the directory test only runs, if the local srHook is PRIM and
    #    the remote srHook is SOK. A failed (or simulated failed) test results in
    #    OCF_ERR_GENERIC and - unless ON_FAIL_ACTION is 'ignore' - in a poison
    #    pill file, which makes the next stop report an error.
    # Finally the site cache file is refreshed in the background.
    super_ocf_log info "FLOW ${FUNCNAME[0]} ()"
    local rc=0
    local test_rc=""
    local ha_ps_rc
    local test_loc_sr="" test_rem_sr=""
    local timeout
    local actionTimeOut="$OCF_RESKEY_CRM_meta_timeout"
    if [ -z "$actionTimeOut" ]; then
        timeout=100
    else
        # timeout for FS IO should be 90% of the action timeout and in seconds (not ms) 90/(100000) => 9/10000
        ((timeout = actionTimeOut * 9/10000))
        # a duration of 0 would disable the limit of the timeout command - use at least 1s
        if (( timeout < 1 )); then
            timeout=1
        fi
    fi
    super_ocf_log info "DEC: Timeout for I/O action: timeout=$timeout"
    shfs_reset_poison_pill
    ha_pseudo_resource "$OCF_RESOURCE_INSTANCE" monitor; ha_ps_rc="$?"
    if ocf_is_probe; then
        rc="$ha_ps_rc"
    else
        #
        # only for PoC tests allow an error simulation
        #
        if shfs_error_simulation; then
            test_rc=42
        fi
        super_ocf_log info "RA monitor() test_rc=$test_rc"
        # NOTE(review): the site is looked up for ${HOSTNAME}, while shfs_init stores
        # the cluster node name in NODENAME - confirm that both always match
        gSite="$(get_hana_attribute "${HOSTNAME}" "${ATTR_NAME_HANA_SITE[@]}")"
        #
        # get remote site
        #
        super_ocf_log info "DEC: pre-get_remote_site-call"
        saphana_init_get_remote_site
        test_loc_sr=$(get_SRHOOK "$gSite")
        test_rem_sr=$(get_SRHOOK "$gRemSite")

        if [[ "${test_loc_sr}_${test_rem_sr}" == "PRIM_SOK" ]]; then
            if [[ -n "$test_rc" ]]; then
                # keep the simulated error instead of overwriting it with the real test result
                super_ocf_log info "DEC: SKIP check directory - using simulated error (test_rc=$test_rc)"
            else
                super_ocf_log info "DEC: check directory as secondary is in sync (timeout=$timeout)"
                export -f shfs_test_directory # shfs_test_directory to be usable for sub-processes
                export td_hana_shared
                timeout --foreground -s 9 "$timeout" bash -c "shfs_test_directory"; test_rc="$?"
            fi
        else
            super_ocf_log info "DEC: SKIP check directory (test_loc_sr=$test_loc_sr, test_rem_sr=$test_rem_sr)"
            test_rc=0
        fi
        super_ocf_log info "DEC: post-get_remote_site-call gSite=$gSite gRemSite=$gRemSite test_loc_sr=$test_loc_sr test_rem_sr=$test_rem_sr"
        super_ocf_log debug "INFO: site=$gSite, mode=$gSrMode"
        case "${test_rc}_${test_rem_sr}" in
            0_* )   rc="$OCF_SUCCESS"
                super_ocf_log info "RA monitor() ${test_rc}_${test_rem_sr} The sun is shining"
                ;;
            *_SOK ) rc="$OCF_ERR_GENERIC"
                super_ocf_log info "RA monitor() ${test_rc}_${test_rem_sr} Go out of here"
                case "$ON_FAIL_ACTION" in
                    ignore )
                        super_ocf_log info "RA monitor() ON_FAIL_ACTION=$ON_FAIL_ACTION => ignore FS error, do not create poison pill file"
                        ;;
                    * )
                        super_ocf_log info "RA monitor() ON_FAIL_ACTION=$ON_FAIL_ACTION => do not ignore FS error, create poison pill file"
                        touch "$pp_hana_shared"
                        ;;
                esac
                ;;
            *_* )   rc="$OCF_ERR_GENERIC"
                super_ocf_log info "RA monitor() ${test_rc}_${test_rem_sr} Thunderstorm"
                ;;
        esac
    fi

    # TODO PRIO2: Param to switch writing cache file on/off needed?
    # TODO PRIO2: Writing cache file only for scale-out?
    # TODO PRIO3: Also caching info for primary-secondary role, if needed
    # TODO PRIO3: Also caching info for system replication status (srHook) role, if needed
    # DONE PRIO1: Remove grep and sed to adjust output
    # refresh the site cache in the background, the monitor does not wait for it
    ( SAPHanaSR-showAttr --format=cache --select=sitelist > /run/crm/SAPHanaSR_site_cache; chmod 644 /run/crm/SAPHanaSR_site_cache )&
    super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc"
    return "$rc"
} # end function shfs_monitor

#
# function: shfs_status - get status of a hana pseudo FS resource instance
# params:   -
# globals:  SID(r), InstanceName(r), OCF_*(r), sidadm(r)
# shfs_status: status of a hana pseudo FS resource instance
#
function shfs_status() {
    # called by: TODO
    # The status is taken from shfs_monitor, its return code is handed back unchanged.
    super_ocf_log info "FLOW ${FUNCNAME[0]} ()"
    local monitor_rc
    shfs_monitor
    monitor_rc="$?"
    return "$monitor_rc"
} # end function shfs_status

#
# function: shfs_validate - validation of (some) variables/parameters
# params:   -
# globals:  OCF_*(r), SID(r), InstanceName(r), InstanceNr(r),
# shfs_validate: Check the semantic of the input parameters
#
function shfs_validate() {
    # called by: TODO
    # Returns OCF_SUCCESS, if SID and InstanceNr are syntactically valid,
    # otherwise OCF_ERR_ARGS (each invalid value is logged).
    super_ocf_log info "FLOW ${FUNCNAME[0]} ($*)"
    local rc="$OCF_SUCCESS"
    #
    # check, if SID does NOT match ^[A-Z][A-Z0-9][A-Z0-9]$
    # we 'subtract' the regular pattern from the string, if the result is not empty, it's not a 1:1 hit
    # An empty SID leaves an empty result as well, so it must be rejected explicitly
    # (SID is a required parameter).
    #
    if [[ -z "$SID" || -n "${SID/#[A-Z][A-Z0-9][A-Z0-9]/}" ]]
    then
        super_ocf_log err "ACT: Parsing instance profile name: '$SID' is not a valid SID!"
        rc="$OCF_ERR_ARGS"
    fi
    #
    # check, if InstanceNr does NOT match ^[0-9][0-9]$
    # we 'subtract' the regular pattern from the string, if the result is not empty, it's not a 1:1 hit
    # An empty InstanceNr is accepted, the parameter is optional.
    #
    if [[ -n  "${InstanceNr/#[0-9][0-9]/}" ]]
    then
        super_ocf_log err "ACT: Parsing instance profile name: '$InstanceNr' is not a valid instance number!"
        rc="$OCF_ERR_ARGS"
    fi

    super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc"
    return "$rc"
} # end function shfs_validate

#
# function: shfs_start_clone - start a hana pseudo FS clone instance
# params:   -
# globals:  OCF_*(r),
# shfs_start_clone
#
function shfs_start_clone() {
    # called by: TODO
    # Clone variant of start: runs shfs_start and hands back its return code.
    super_ocf_log info "FLOW ${FUNCNAME[0]} ($*)"
    local start_rc="$OCF_NOT_RUNNING"
    shfs_start
    start_rc="$?"
    return "$start_rc"
} # end function shfs_start_clone

#
# function: shfs_stop_clone - stop a hana pseudo FS clone instance
# params:   -
# globals:  NODENAME(r), HANA_STATE_*, ATTR_NAME_*
# shfs_stop_clone
#
function shfs_stop_clone() {
    # called by: TODO
    # Clone variant of stop: runs shfs_stop, logs its return code and hands it back.
    super_ocf_log info "FLOW ${FUNCNAME[0]} ($*)"
    local stop_rc=0
    shfs_stop
    stop_rc="$?"
    super_ocf_log info "RA: shfs_stop_clone got rc=$stop_rc from shfs_stop()"
    return "$stop_rc"
} # end function shfs_stop_clone

#
# function: shfs_monitor_clone - monitor a hana FS pseudo clone instance
# params:   -
# globals:  OCF_*, SID, InstanceNr, InstanceName, MAPPING(r)
# shfs_monitor_clone
#
function shfs_monitor_clone() {
    # called by: TODO
    # Clone variant of monitor: runs shfs_monitor, logs its return code and hands it back.
    super_ocf_log info "FLOW ${FUNCNAME[0]} ($*)"
    # keep rc local (as in the other clone wrappers), so it does not leak into the caller's scope
    local rc=0
    shfs_monitor; rc="$?"
    super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc"
    return "$rc"
} # end function shfs_monitor_clone
