Restoring RabbitMQ Deployment From A Failed And Irrecoverable State

How to safely reinstall RabbitMQ deployment within Automation Suite?

Issue Description:

In certain scenarios, it may be necessary to reinstall the RabbitMQ deployment within Automation Suite, particularly when it is stuck in a CrashLoopBackOff state and cannot be recovered. This deployment utilizes a Persistent Volume Claim (PVC) for its storage needs by way of the distributed block storage system Longhorn. However, data within the RabbitMQ deployment is not required to be maintained persistently; thus, it can be wiped without concern.

Root Cause: While there are many factors that can cause a RabbitMQ deployment to fail and enter a 'CrashLoopBackOff' state within Automation Suite, the two most common reasons are:

  • Persistent Volume Issues: Problems with Persistent Volumes (PV) and Persistent Volume Claims (PVC), such as access permissions, insufficient storage space, or issues with the underlying storage system (aka Longhorn), can lead to failure in stateful applications like RabbitMQ.
  • Corrupted Data or Full Disk: Corruption in the RabbitMQ data files or a full disk can prevent RabbitMQ from starting up properly.


Resolution:
The reinstallation process is made straightforward by the script below, which automates the complete deletion and reinstallation of the components. Execute it on a primary node within the cluster after saving to a local file:

#!/bin/bash

# Make the rke2-bundled binaries (kubectl, etc.) resolvable on PATH and
# point kubectl at the local rke2 cluster configuration.

export PATH="$PATH:/usr/local/bin:/var/lib/rancher/rke2/bin"
export KUBECONFIG="/etc/rancher/rke2/rke2.yaml"


execute_command() {
    echo "Executing: $1"
    
    # Create temporary files for stdout and stderr
    TMP_OUT=$(mktemp)
    TMP_ERR=$(mktemp)

    # Execute command and redirect outputs to temporary files
    eval "$1" > "$TMP_OUT" 2> "$TMP_ERR"
    CMD_EXIT_CODE=$?

    # Capture outputs from temporary files into the variables
    CMD_OUT=$(cat "$TMP_OUT")
    CMD_ERR=$(cat "$TMP_ERR")

    # Cleanup temporary files
    rm -f "$TMP_OUT" "$TMP_ERR"

}

test_tmp_write_access() {
    # Sanity check: execute_command depends on mktemp in /tmp, so abort
    # early if /tmp is not writable.
    echo "Testing write access to /tmp"
    # Nanosecond timestamp keeps the probe file name unique per run.
    TMP_FILE="/tmp/test_write_access_$(date +%s%N).tmp"

    # Guard-clause form: fail fast, then do the happy path un-nested.
    if ! echo "test" > "$TMP_FILE"; then
        echo "Failed to write to /tmp."
        exit 1
    fi

    echo "Write access to /tmp confirmed."
    # Cleanup the probe file
    rm -f "$TMP_FILE"

    echo ""
}

cleanup() {
    # EXIT-trap handler for the destructive section: if anything fails
    # part-way, restore the replica count saved by set_replicas so the
    # cluster is not left scaled to zero.
    # Fixed message typos: "Scaling backup" -> "Scaling back up",
    # "scaled called back up" -> "scaled back up".
    echo "Failure occurred. Scaling rabbitmq back up as a precaution"
    cmd=" kubectl -n rabbitmq patch rabbitmqcluster rabbitmq -p \"{\\\"spec\\\":{\\\"replicas\\\": $rabbitmqReplicas}}\" --type=merge"
    execute_command "$cmd"

    if [ "$CMD_EXIT_CODE" -ne 0 ]; then
        echo "Failed to patch rabbitmq replicas"
        echo "ERROR: $CMD_ERR"
        # Surface the exact command so the operator can retry by hand.
        echo "Try manually running the following command: $cmd"
        exit 1
    fi

    echo "Rabbitmq is scaled back up. Address the error that caused the exit and try re-running the script"
}

cleanup_post_secretDeletion() {
    # EXIT-trap handler for failures after the rabbitmq secret is gone:
    # the operator must resync via the argocd UI so credentials manager
    # recreates the secrets before this script can run again.
    printf '%s\n' \
        "-Failure occurred post successful secret deletion. See error for more details" \
        "--------IMPORTANT--------" \
        "At this point, try logging in to argocd and syncing sfcore (if on 21.10.X or 22.4.X) and UiPath (applies to all versions)" \
        "Credentials manager must recreate the secrets for rabbitmq before this script can be executed again"
    exit 1
}

set_replicas() {
    # Read the current replica count off the rabbitmqcluster CR and stash
    # it in the global $rabbitmqReplicas so it can be restored later.
    echo "Getting and saving rabbitmq replicas count"
    cmd=" kubectl -n rabbitmq get rabbitmqcluster rabbitmq -o json | jq -r '.spec.replicas'"
    execute_command "$cmd"

    # Arithmetic test instead of [ -ne ]; same outcome.
    if (( CMD_EXIT_CODE != 0 )); then
        echo "Failed to get rabbitmq replicas"
        echo "ERROR: $CMD_ERR"
        exit 1
    fi

    echo "Replica Count: $CMD_OUT"
    echo ""

    rabbitmqReplicas="$CMD_OUT"
}

patch_rabbitmq_cluster() {
    # Merge-patch spec.replicas on the rabbitmqcluster CR to $1
    # (0 to stop the cluster, the saved count to bring it back).
    local target=$1
    echo "Patching rabbitmq replicas to $target"
    cmd=" kubectl -n rabbitmq patch rabbitmqcluster rabbitmq -p \"{\\\"spec\\\":{\\\"replicas\\\": $target}}\" --type=merge"
    execute_command "$cmd"

    if (( CMD_EXIT_CODE != 0 )); then
        echo "Failed to patch rabbitmq replicas"
        echo "ERROR: $CMD_ERR"
        exit 1
    fi

    echo ""
}

scale_down_rabbitmq_sts () {
    # Scale the StatefulSet to zero, then force-delete any lingering pods
    # so the volumes can be released.
    echo "Scaling down rabbitmq sts to 0"
    cmd=" kubectl -n rabbitmq scale sts rabbitmq-server --replicas=0"
    execute_command "$cmd"

    if (( CMD_EXIT_CODE != 0 )); then
        echo "Failed to scale down rabbitmq sts"
        echo "ERROR: $CMD_ERR"
        exit 1
    fi

    echo "Force deleting rabbitmq pods"
    cmd=" kubectl -n rabbitmq delete pods --all --force --grace-period=0"
    execute_command "$cmd"

    if (( CMD_EXIT_CODE != 0 )); then
        echo "Failed to delete rabbitmq pods"
        echo "ERROR: $CMD_ERR"
        exit 1
    fi

    echo ""
}

delete_pvcs () {
    # Wipe the rabbitmq PVCs; queue data is disposable for this product,
    # so force deletion with no grace period is acceptable here.
    echo "Deleting rabbitmq pvc(s)"
    cmd=" kubectl -n rabbitmq delete pvc --all --force --grace-period=0"
    execute_command "$cmd"

    if (( CMD_EXIT_CODE != 0 )); then
        echo "Failed to delete rabbitmq pvc"
        echo "ERROR: $CMD_ERR"
        exit 1
    fi

    echo ""
}


#
#    Secret should be deleted by a hook. But if it fails, we need to delete it manually.
#
delete_uipath_secret () {
    # Remove the generated rabbitmq secret from the uipath namespace.
    # A sync hook normally deletes it, so this is best-effort only —
    # which is why a failure here is logged but not fatal.
    echo "Deleting uipath secret"
    cmd=" kubectl -n uipath get secret --template '{{range .items}}{{.metadata.name}}{{\"\\n\"}}{{end}}' | grep -i rabbitmq-secret | xargs -I{} kubectl -n uipath delete secret {}"
    execute_command "$cmd"

    if (( CMD_EXIT_CODE != 0 )); then
        echo "Failed to delete uipath secret"
        echo "ERROR: $CMD_ERR"
        echo "Its OK to ignore this"
    fi

    echo ""
}

delete_rabbitmq_secret () {
    # Remove the rabbitmq-secret objects from the rabbitmq namespace.
    # Unlike the uipath copy, a failure here is fatal.
    echo "Deleting rabbitmq secret"
    cmd=" kubectl -n rabbitmq get secret --template '{{range .items}}{{.metadata.name}}{{\"\\n\"}}{{end}}' | grep -i rabbitmq-secret | xargs -I{} kubectl -n rabbitmq delete secret {}"
    execute_command "$cmd"

    if (( CMD_EXIT_CODE != 0 )); then
        echo "Failed to delete rabbitmq secret"
        echo "ERROR: $CMD_ERR"
        exit 1
    fi
    echo ""
}

find_argocd () {
    # Locate the argocd CLI binary shipped inside the Automation Suite
    # install tree and store its path in the global $argocd. When several
    # candidates exist, the most recently modified one wins.
    echo "Finding argocd binary"
    cmd=" find /opt/UiPathAutomationSuite -type f -name argocd"
    execute_command "$cmd"

    if [ "$CMD_EXIT_CODE" -ne 0 ]; then
        # Message fixed: we are looking for a binary, not a pod.
        echo "Failed to find argocd binary"
        echo "ERROR: $CMD_ERR"
        exit 1
    fi
    local files=$CMD_OUT

    if [[ ! $files ]]; then
        echo "No argocd binary found."
        echo "Try manually syncing UiPath from the argocd UI"
        exit 1
    fi

    # Pick the newest candidate by mtime. Iterating line-by-line with -nt
    # replaces the original unquoted `ls -t $files`, which word-split and
    # globbed paths (broken for names containing spaces or wildcards).
    local candidate newest=""
    while IFS= read -r candidate; do
        [[ -n "$candidate" ]] || continue
        if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
            newest=$candidate
        fi
    done <<< "$files"
    argocd=$newest

    echo "Latest argocd binary: $argocd"
    echo ""

}

login_argocd () {
    # Log the argocd CLI in as admin, using the server host from the istio
    # VirtualService and the admin password from the argocd secret.
    echo "Logging into argocd"

    echo "Get the argocd server ip"
    cmd=" kubectl -n istio-system get vs argocdserver-vs  -o jsonpath='{.spec.hosts[0]}'"
    execute_command "$cmd"

    if [ "$CMD_EXIT_CODE" -ne 0 ]; then
        echo "Failed to get argocd server ip"
        echo "ERROR: $CMD_ERR"
        exit 1
    fi

    local argocdServerIp=$CMD_OUT
    echo "Argocd Server IP: $argocdServerIp"

    echo "Get the argocd admin password"
    cmd=" kubectl get secrets/argocd-admin-password -n argocd -o jsonpath='{.data.password}'"
    execute_command "$cmd"

    if [ "$CMD_EXIT_CODE" -ne 0 ]; then
        echo "Failed to get argocd admin password"
        echo "ERROR: $CMD_ERR"
        exit 1
    fi

    # Declaration split from assignment so a base64 failure is not masked
    # by `local` always returning 0.
    local argocdAdminPassword
    argocdAdminPassword=$(echo "$CMD_OUT" | base64 -d)
    # NOTE(review): the password is printed here and again inside
    # execute_command's "Executing:" line — acceptable for an interactive
    # recovery script, but redact before sharing console output.
    echo "Argocd Admin Password: $argocdAdminPassword"
    # The root-path quotes were previously spliced in outside the string
    # ( ... --grpc-web-root-path "/" ... ); escape them so the quoted "/"
    # actually survives into the eval'd command.
    cmd=" $argocd login $argocdServerIp:443 --username admin --password $argocdAdminPassword --grpc-web --grpc-web-root-path \"/\" --insecure"
    execute_command "$cmd"

    if [ "$CMD_EXIT_CODE" -ne 0 ]; then
        echo "Failed to login to argocd"
        echo "ERROR: $CMD_ERR"
        exit 1
    fi
    echo ""
}

sync_argocd_apps () {
    # Resync the argocd applications so the rabbitmq deployment and its
    # secrets are recreated: sfcore first on 21.10/22.4, then uipath.
    echo "Syncing argocd app(s) - This may take some time - The status in argocd UI will be more interesting to watch"

    echo "Checking Version"
    cmd=" kubectl -n argocd get applications orchestrator -o jsonpath={.spec.source.targetRevision}"
    execute_command "$cmd"
    if [ "$CMD_EXIT_CODE" -ne 0 ]; then
        echo "Failed to determine version"
        echo "ERROR: $CMD_ERR"
        exit 1
    fi

    local version=$CMD_OUT

    # sfcore only exists on 21.10.x / 22.4.x installs; newer versions skip it.
    if [[ $version == *"21.10"* ]] || [[ $version == *"22.4"* ]]; then
        echo "Version is 21.10 or 22.4"
        echo "Syncing sfcore"
        cmd=" $argocd app sync sfcore"
        execute_command "$cmd"

        if [ "$CMD_EXIT_CODE" -ne 0 ]; then
            echo "Failed to resync sfcore argocd app"
            echo "ERROR: $CMD_ERR"
            # BUG FIX: was 'zecho' — a command-not-found on this error path.
            echo "If failure occured because sync is already in progress, ignore"
            exit 1
        fi
        echo ""
    fi

    cmd=" $argocd app sync uipath"
    execute_command "$cmd"

    if [ "$CMD_EXIT_CODE" -ne 0 ]; then
        echo "Failed to resync uipath argocd app"
        echo "ERROR: $CMD_ERR"
        echo "If failure occured because sync is already in progress, ignore"
        exit 1
    fi
    echo ""
}

##############################################################################################################
#  Main
##############################################################################################################


#
#   Check that we have access to tmp so our error handling works.
#
test_tmp_write_access
# Save the current replica count, then scale the operator-managed cluster
# to zero so pods are not recreated while storage is wiped.
set_replicas
patch_rabbitmq_cluster 0

#
# if this section fails, we should scale back up rabbitmq as a precaution
#
trap cleanup EXIT
# FIX: dropped the spurious "$rabbitmqReplicas" argument — the function
# takes no parameters and always scales to 0.
scale_down_rabbitmq_sts
delete_pvcs
trap - EXIT
# Quoted to avoid word-splitting/globbing of the saved count.
patch_rabbitmq_cluster "$rabbitmqReplicas"

delete_rabbitmq_secret
# If a failure happens after a delete, they need to run a sync from the UI.
trap cleanup_post_secretDeletion EXIT
delete_uipath_secret
find_argocd
login_argocd
trap - EXIT

sync_argocd_apps