/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */
/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */
// Abort superseded builds (e.g. after a force push) using milestones:
// every build after the first passes the previous build's milestone and then
// its own, so an older build that has not reached that milestone is cancelled.
def currentBuildNumber = env.BUILD_NUMBER as int
if (currentBuildNumber > 1) {
    milestone(currentBuildNumber - 1)
}
milestone(currentBuildNumber)


import groovy.transform.Field
// Script-wide flag read by the 'check build_ok' stage to set the final build
// result. NOTE(review): nothing visible in this file ever sets it to false.
@Field boolean build_ok = true


def get_portafiducia_download_path() {
    /*
     * Build the S3 URI of the stable PortaFiducia tarball.
     * The bucket name embeds the AWS account id of the current caller, which
     * is looked up with `aws sts get-caller-identity`; `tr -dc 0-9` keeps only
     * the digits, so the captured output carries no trailing newline.
     * return@: the s3:// URI of portafiducia.tar.gz
     */
    def account_query = "aws sts get-caller-identity --query Account --output text | tr -dc 0-9"
    def account_id = sh(script: account_query, returnStdout: true)
    return "s3://libfabric-ci-${account_id}-us-west-2/portafiducia/portafiducia.tar.gz"
}

def download_and_extract_portafiducia(outputDir) {
    /*
     * Fetch the PortaFiducia tarball from S3 and unpack it.
     * param@ outputDir: directory to extract into; created if missing.
     * The tarball is staged at a fixed path under /tmp before extraction.
     */
    def s3Uri = this.get_portafiducia_download_path()
    def localTarball = "/tmp/portafiducia.tar.gz"
    sh """
        mkdir -p ${outputDir}
        aws s3 cp ${s3Uri} ${localTarball}
        tar xf ${localTarball} -C ${outputDir}
    """
}

def install_porta_fiducia() {
    /*
     * Create a fresh Python virtual environment in ./venv, upgrade pip and
     * awscli inside it, and install the PortaFiducia checkout in editable mode.
     */
    def install_script = '''
        python3 -m venv venv
        . venv/bin/activate
        pip install --upgrade pip
        pip install --upgrade awscli
        pip install -e PortaFiducia
    '''
    sh(script: install_script)
}

def run_test_orchestrator_once(run_name, build_tag, os, instance_type, instance_count, region, test_config_file, addl_args) {
    /*
     * Run PortaFiducia/tests/test_orchestrator.py once inside the venv.
     * param@ run_name: name of the run (not used by this function)
     * param@ build_tag: Jenkins BUILD_TAG, used to derive the cluster name
     * param@ os: operating system passed as --os
     * param@ instance_type: instance type passed as --instance-type
     * param@ instance_count: number of instances passed as --instance-count
     * param@ region: AWS region passed as --region
     * param@ test_config_file: config file name, resolved under configs/
     * param@ addl_args: extra command line arguments inserted verbatim
     * The JUnit report is written to outputs/<cluster name>.xml.
     */
    def cluster = get_cluster_name(build_tag, os, instance_type)
    def orchestrator_args = "--config configs/${test_config_file}" +
        " --os ${os}" +
        " --instance-type ${instance_type}" +
        " --instance-count ${instance_count}" +
        " --region ${region}" +
        " --cluster-name ${cluster}" +
        " ${addl_args}" +
        " --junit-xml outputs/${cluster}.xml"
    sh ". venv/bin/activate; cd PortaFiducia/tests && ./test_orchestrator.py ${orchestrator_args}"
}

def get_random_string(len) {
    /*
     * Produce a random alphanumeric string by filtering /dev/urandom.
     * param@ len: number of characters to emit
     * return@: the captured shell output (exactly len characters of A-Za-z0-9)
     */
    return sh(
        script: "cat /dev/urandom | LC_ALL=C tr -dc A-Za-z0-9 | head -c ${len}",
        returnStdout: true
    )
}

def get_cluster_name(build_tag, os, instance_type) {
    /*
     * Compose a cluster name of the form <tag>-<os>-<instance type>-<random>.
     * Pcluster requires the name to stay under 60 characters and to contain
     * no ".", so each component is truncated (28/10/10 characters) and an
     * 8 character random suffix is appended.
     * Jenkins does not allow groovy to use the replace() method of string,
     * so sed/tr in the shell do the character stripping instead.
     * param@ build_tag: Jenkins BUILD_TAG; a leading "jenkins-" and spaces are removed
     * param@ os: operating system name
     * param@ instance_type: instance type name
     * return@: the cluster name with "." and newlines removed
     */
    def stripped_tag = sh(
        script: "echo ${build_tag} | sed \"s/^jenkins-//g\" | sed \"s/ //g\"",
        returnStdout: true
    )
    def random_suffix = get_random_string(8)

    // The captured tag may still end in a newline; the final tr removes it
    // together with every "." in the composed name.
    def compose_cmd = "echo '${stripped_tag.take(28)}-${os.take(10)}-${instance_type.take(10)}-'${random_suffix} | tr -d '.\\n'"
    return sh(script: compose_cmd, returnStdout: true)
}

def get_single_node_windows_test_stage_with_lock(stage_name, lock_label) {
    /*
     * Build the closure for the Windows test stage.
     * param@ stage_name: the name of the stage
     * param@ lock_label: lockable-resource label; one resource is held while the test runs
     * return@: a closure that runs test_orchestrator_windows.py for the
     *          current pull request (env.CHANGE_ID) inside the venv.
     */
    def windows_stage = {
        stage("${stage_name}") {
            lock(label: lock_label, quantity: 1) {
                sh """
                    . venv/bin/activate;
                    cd PortaFiducia/scripts;
                    export PULL_REQUEST_ID=${env.CHANGE_ID};
                    env AWS_DEFAULT_REGION=us-west-2 ./test_orchestrator_windows.py --ci public --s3-bucket-name libfabric-ci-windows-prod-test-output --pull-request-id ${env.CHANGE_ID};
                """
            }
        }
    }
    return windows_stage
}

def get_test_stage_with_lock(stage_name, build_tag, os, instance_type, instance_count, region, test_config, lock_label, addl_args) {
    /*
     * Build the closure for one test stage that runs test_orchestrator.py
     * with the given parameters while holding lockable resources.
     * param@ stage_name: the name of the stage (also passed on as the run name)
     * param@ build_tag: the BUILD_TAG env generated by Jenkins
     * param@ os: the operating system for the test stage
     * param@ instance_type: the instance type for the test stage
     * param@ instance_count: number of instances to use; the same number of
     *                        lockable resources is acquired
     * param@ region: the (default) aws region where the tests are run
     * param@ test_config: the name of test config file in PortaFiducia/tests/configs/
     * param@ lock_label: label of the lockable resources to acquire
     * param@ addl_args: additional arguments passed to test_orchestrator.py
     * return@: the test stage closure
     */
    def locked_stage = {
        stage("${stage_name}") {
            lock(label: lock_label, quantity: instance_count) {
                this.run_test_orchestrator_once(
                    stage_name, build_tag, os, instance_type,
                    instance_count, region, test_config, addl_args
                )
            }
        }
    }
    return locked_stage
}

/*
 * Declarative pipeline for libfabric pull request testing:
 * clean the workspace, check out the PR, fetch and install PortaFiducia,
 * run all provider test stages in parallel (each guarded by a lockable
 * resource), then publish JUnit results and clean up.
 */
pipeline {
    agent {
        // Run on an ECS agent inheriting from the 'fargate-large' template.
        ecs {
            inheritFrom 'fargate-large'
        }
    }
    options {
        buildDiscarder(logRotator(daysToKeepStr: "90"))
        timeout(time: 10, unit: 'HOURS')
    }
    environment {
        // AWS region where the cluster is created
        REGION="us-west-2"
    }
    stages {
        // Cleanup workspace before job start.
        stage("Clean up workspace") {
            steps{
                deleteDir()
            }
        }
        stage("Checkout SCM repo") {
            steps {
                checkout scm
            }
        }
        stage("Download and extract PortaFiducia") {
            steps {
                script {
                    // Dump the environment to the build log for debugging.
                    sh 'printenv'
                    download_and_extract_portafiducia('PortaFiducia')
                }
            }
        }
        stage("Install PortaFiducia") {
            steps {
                script {
                    install_porta_fiducia()
                }

            }
        }
        stage("Test EFA provider") {
            steps {
                script {
                    // Map of branch name -> stage closure, executed by `parallel` below.
                    def stages = [:]
                    // This needs the extra space at the end because provider-specific
                    // arguments are appended to it with "+".
                    // Set 12 hour timeout for all clusters.
                    // NOTE(review): the pipeline-level timeout above is 10 hours and fires first.
                    def addl_args_pr = "--timeout 720 --test-libfabric-pr $env.CHANGE_ID "
                    // Use lockable resources to limit the number of jobs that can get executed in parallel
                    def g4dn8x_lock_label = "g4dn8x"
                    def g4dn12x_lock_label  = "g4dn12x"
                    def c52x_lock_label  = "c52x"
                    def hpc6a48x_lock_label  = "hpc6a48x"
                    def c6gn16x_lock_label  = "c6gn16x"
                    def c5n18x_lock_label  = "c5n18x"
                    def c6g2x_lock_label  = "c6g2x"

                    // Single Node Tests - EFA
                    stages["1_g4dn_alinux2-efa"] = get_test_stage_with_lock("1_g4dn_alinux2_efa", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, addl_args_pr)
                    stages["1_g4dn_alinux2023-efa"] = get_test_stage_with_lock("1_g4dn_alinux2023_efa", env.BUILD_TAG, "alinux2023", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, addl_args_pr)
                    stages["1_g4dn_ubuntu2004-efa"] = get_test_stage_with_lock("1_g4dn_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, addl_args_pr)
                    stages["1_g4dn_rhel8-efa"] = get_test_stage_with_lock("1_g4dn_rhel8_efa", env.BUILD_TAG, "rhel8", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, addl_args_pr)

                    // Single Node Tests - SHM
                    stages["1_g4dn_alinux2_shm"] = get_test_stage_with_lock("1_g4dn_alinux2_shm", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, addl_args_pr + "--test-libfabric-provider shm")
                    stages["1_g4dn_alinux2023_shm"] = get_test_stage_with_lock("1_g4dn_alinux2023_shm", env.BUILD_TAG, "alinux2023", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, addl_args_pr + "--test-libfabric-provider shm")
                    stages["1_g4dn_ubuntu2004_shm"] = get_test_stage_with_lock("1_g4dn_ubuntu2004_shm", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, addl_args_pr + "--test-libfabric-provider shm")
                    stages["1_c5_rhel8_shm"] = get_test_stage_with_lock("1_c5_rhel8_shm", env.BUILD_TAG, "rhel8", "c5.2xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", c52x_lock_label, addl_args_pr + "--test-libfabric-provider shm --enable-efa false")
                    stages["1_c5_ubuntu2004_shm_disable-cma"] = get_test_stage_with_lock("1_c5_ubuntu2004_shm_disable-cma", env.BUILD_TAG, "ubuntu2004", "c5.2xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", c52x_lock_label, addl_args_pr + "--test-libfabric-provider shm --enable-cma false --enable-efa false")

                    // Single Node Windows Test
                    stages["EFA_Windows_Test"] = get_single_node_windows_test_stage_with_lock("EFA_Windows_Test", c5n18x_lock_label)

                    // Multi Node Tests - EFA
                    stages["2_hpc6a_alinux2_efa"] = get_test_stage_with_lock("2_hpc6a_alinux2_efa", env.BUILD_TAG, "alinux2", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", hpc6a48x_lock_label, addl_args_pr)
                    stages["2_hpc6a_alinux2023_efa"] = get_test_stage_with_lock("2_hpc6a_alinux2023_efa", env.BUILD_TAG, "alinux2023", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", hpc6a48x_lock_label, addl_args_pr)
                    stages["2_c6gn_alinux2_efa"] = get_test_stage_with_lock("2_c6gn_alinux2_efa", env.BUILD_TAG, "alinux2", "c6gn.16xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6gn16x_lock_label, addl_args_pr)
                    stages["2_c6gn_alinux2023_efa"] = get_test_stage_with_lock("2_c6gn_alinux2023_efa", env.BUILD_TAG, "alinux2023", "c6gn.16xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6gn16x_lock_label, addl_args_pr)
                    stages["2_c5n_alinux2_efa"] = get_test_stage_with_lock("2_c5n_alinux2_efa", env.BUILD_TAG, "alinux2", "c5n.18xlarge", 2, "us-east-1", "libfabric_pr_test.yaml", c5n18x_lock_label, addl_args_pr)
                    stages["2_c5n_alinux2023_efa"] = get_test_stage_with_lock("2_c5n_alinux2023_efa", env.BUILD_TAG, "alinux2023", "c5n.18xlarge", 2, "us-east-1", "libfabric_pr_test.yaml", c5n18x_lock_label, addl_args_pr)
                    stages["2_hpc6a_ubuntu2004_efa"] = get_test_stage_with_lock("2_hpc6a_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", hpc6a48x_lock_label, addl_args_pr)
                    stages["2_hpc6a_rhel8_efa"] = get_test_stage_with_lock("2_hpc6a_rhel8_efa", env.BUILD_TAG, "rhel8", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", hpc6a48x_lock_label, addl_args_pr)

                    // Multi Node Tests - TCP
                    stages["2_c6g_alinux2_tcp"] = get_test_stage_with_lock("2_c6g_alinux2_tcp", env.BUILD_TAG, "alinux2", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider tcp --enable-efa false")
                    stages["2_c6g_alinux2023_tcp"] = get_test_stage_with_lock("2_c6g_alinux2023_tcp", env.BUILD_TAG, "alinux2023", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider tcp --enable-efa false")
                    stages["2_c6g_ubuntu2004_tcp"] = get_test_stage_with_lock("2_c6g_ubuntu2004_tcp", env.BUILD_TAG, "ubuntu2004", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider tcp --enable-efa false")
                    stages["2_c6g_rhel8_tcp"] = get_test_stage_with_lock("2_c6g_rhel8_tcp", env.BUILD_TAG, "rhel8", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider tcp --enable-efa false")
                    stages["3_g4dn_alinux2_tcp"] = get_test_stage_with_lock("3_g4dn_alinux2_tcp", env.BUILD_TAG, "alinux2", "g4dn.12xlarge", 3, "us-east-1", "libfabric_pr_test.yaml", g4dn12x_lock_label, addl_args_pr + "--test-libfabric-provider tcp --enable-efa false --test-list test_nccl_tests")

                    // Multi Node Tests - SOCKETS
                    stages["2_c6g_alinux2_sockets"] = get_test_stage_with_lock("2_c6g_alinux2_sockets", env.BUILD_TAG, "alinux2", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider sockets --enable-efa false")
                    stages["2_c6g_alinux2023_sockets"] = get_test_stage_with_lock("2_c6g_alinux2023_sockets", env.BUILD_TAG, "alinux2023", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider sockets --enable-efa false")
                    stages["2_c6g_ubuntu2004_sockets"] = get_test_stage_with_lock("2_c6g_ubuntu2004_sockets", env.BUILD_TAG, "ubuntu2004", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider sockets --enable-efa false")
                    stages["2_c6g_rhel8_sockets"] = get_test_stage_with_lock("2_c6g_rhel8_sockets", env.BUILD_TAG, "rhel8", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider sockets --enable-efa false")

                    parallel stages
                }
            }
        }
        stage('check build_ok') {
            steps {
                script {
                    // build_ok is the script-level @Field flag.
                    // NOTE(review): nothing visible in this file sets it to false,
                    // so this stage appears to always record SUCCESS.
                    if (build_ok) {
                        currentBuild.result = "SUCCESS"
                    }
                    else {
                        currentBuild.result = "FAILURE"
                    }
                }
            }
        }
    }
    post {
        always {
            // Report the size of the JUnit XML files, then publish and archive them.
            sh 'find PortaFiducia/tests/outputs -name "*.xml" | xargs du -shc'
            junit testResults: 'PortaFiducia/tests/outputs/**/*.xml', keepLongStdio: false
            archiveArtifacts artifacts: 'PortaFiducia/tests/outputs/**/*.*'
        }
        failure {
            // Delete the Windows cluster(s) created for this pull request.
            // The script is a single-quoted Groovy string, so Groovy performs no
            // interpolation: the shell expands CHANGE_ID from the build
            // environment. (A Groovy-style ${env.CHANGE_ID} here would reach the
            // shell verbatim and fail as a bad substitution.) The wildcard is
            // quoted so it is passed to the script rather than globbed by the shell.
            sh '''
                . venv/bin/activate
                ./PortaFiducia/scripts/delete_manual_cluster.py --cluster-name "WindowsLibfabricCi_${CHANGE_ID}_*"
            '''
        }
        aborted {
            // Delete every cluster whose name starts with this build's tag;
            // BUILD_TAG and REGION are expanded by the shell, the '*' is passed literally.
            sh '. venv/bin/activate; ./PortaFiducia/scripts/delete_manual_cluster.py --cluster-name "$BUILD_TAG"\'*\' --region $REGION'
        }
        // Cleanup workspace after job completes.
        cleanup {
            deleteDir()
        }
    }
}
