From e438946787b9cceec766fa4a721138d3b4e72956 Mon Sep 17 00:00:00 2001
From: Thomas Jones <thomas.jones@ibm.com>
Date: Fri, 30 May 2025 16:40:13 -0400
Subject: [PATCH] Fix: libcrmcommon: Add retries on connect to avoid fatal
 errors when sub-daemons communicate

Add pcmk__connect_ipc_retry_conrefused() and use it where it makes sense.

Add a retry loop to connect_and_send_attrd_request() that retries both the
connect and the send.

---
 daemons/controld/controld_schedulerd.c |  2 +-
 include/crm/common/ipc_internal.h      |  4 +++-
 lib/common/ipc_attrd.c                 | 19 ++++++++++++----
 lib/common/ipc_client.c                | 31 ++++++++++++++++++++++++++
 lib/pacemaker/pcmk_cluster_queries.c   |  2 +-
 5 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/daemons/controld/controld_schedulerd.c b/daemons/controld/controld_schedulerd.c
index 88f7ac1489..b716440701 100644
--- a/daemons/controld/controld_schedulerd.c
+++ b/daemons/controld/controld_schedulerd.c
@@ -198,7 +198,7 @@ new_schedulerd_ipc_connection(void)
 
     pcmk_register_ipc_callback(schedulerd_api, scheduler_event_callback, NULL);
 
-    rc = pcmk__connect_ipc(schedulerd_api, pcmk_ipc_dispatch_main, 3);
+    rc = pcmk__connect_ipc_retry_conrefused(schedulerd_api, pcmk_ipc_dispatch_main, 3);
     if (rc != pcmk_rc_ok) {
         crm_err("Error connecting to %s: %s",
                 pcmk_ipc_name(schedulerd_api, true), pcmk_rc_str(rc));
diff --git a/include/crm/common/ipc_internal.h b/include/crm/common/ipc_internal.h
index 72b6f7f189..b15df9fc3e 100644
--- a/include/crm/common/ipc_internal.h
+++ b/include/crm/common/ipc_internal.h
@@ -93,7 +93,9 @@ int pcmk__connect_generic_ipc(crm_ipc_t *ipc);
 int pcmk__ipc_fd(crm_ipc_t *ipc, int *fd);
 int pcmk__connect_ipc(pcmk_ipc_api_t *api, enum pcmk_ipc_dispatch dispatch_type,
                       int attempts);
-
+int pcmk__connect_ipc_retry_conrefused(pcmk_ipc_api_t *api,
+                                       enum pcmk_ipc_dispatch dispatch_type,
+                                       int attempts);
 /*
  * Server-related
  */
diff --git a/lib/common/ipc_attrd.c b/lib/common/ipc_attrd.c
index 394bf75c1d..7ff0ca7873 100644
--- a/lib/common/ipc_attrd.c
+++ b/lib/common/ipc_attrd.c
@@ -150,6 +150,8 @@ create_attrd_op(const char *user_name)
 static int
 connect_and_send_attrd_request(pcmk_ipc_api_t *api, const xmlNode *request)
 {
+    static const int max_retries = 5;
+    int remaining_attempts = max_retries;
     int rc = pcmk_rc_ok;
     bool created_api = false;
     enum pcmk_ipc_dispatch dispatch = pcmk_ipc_dispatch_sync;
@@ -164,10 +166,19 @@ connect_and_send_attrd_request(pcmk_ipc_api_t *api, const xmlNode *request)
         dispatch = api->dispatch_type;
     }
 
-    rc = pcmk__connect_ipc(api, dispatch, 5);
-    if (rc == pcmk_rc_ok) {
-        rc = pcmk__send_ipc_request(api, request);
-    }
+    // If attrd is killed and is being restarted we will temporarily get
+    // ECONNREFUSED on connect if it is already dead or ENOTCONN if it died
+    // after we connected to it. We should wait a bit and retry in those cases.
+    do {
+        if (rc == ENOTCONN || rc == ECONNREFUSED) {
+            sleep(max_retries - remaining_attempts);
+        }
+        rc = pcmk__connect_ipc(api, dispatch, remaining_attempts);
+        if (rc == pcmk_rc_ok) {
+            rc = pcmk__send_ipc_request(api, request);
+        }
+        remaining_attempts--;
+    } while ((rc == ENOTCONN || rc == ECONNREFUSED) && remaining_attempts >= 0);
 
     if (created_api) {
         pcmk_free_ipc_api(api);
diff --git a/lib/common/ipc_client.c b/lib/common/ipc_client.c
index 1f65ace114..9dd6501ddb 100644
--- a/lib/common/ipc_client.c
+++ b/lib/common/ipc_client.c
@@ -471,6 +471,37 @@ connect_without_main_loop(pcmk_ipc_api_t *api)
     return rc;
 }
 
+/*!
+ * \internal
+ * \brief Connect to a Pacemaker daemon via IPC, also retrying if refused
+ *
+ * \param[in,out] api            IPC API instance
+ * \param[in]     dispatch_type  How IPC replies should be dispatched
+ * \param[in]     attempts       Maximum number of pcmk__connect_ipc() calls
+ *
+ * \return Standard Pacemaker return code
+ */
+int
+pcmk__connect_ipc_retry_conrefused(pcmk_ipc_api_t *api,
+                                   enum pcmk_ipc_dispatch dispatch_type,
+                                   int attempts)
+{
+    int remaining = attempts;
+    int rc = pcmk_rc_ok;
+
+    do {
+        if (rc == ECONNREFUSED) {
+            // Back off 500ms longer after each refused connection
+            pcmk__sleep_ms((attempts - remaining) * 500);
+        }
+        rc = pcmk__connect_ipc(api, dispatch_type, remaining);
+        remaining--;
+    } while ((rc == ECONNREFUSED) && (remaining > 0)); // no 0-try call
+
+    return rc;
+}
+
+
 /*!
  * \internal
  * \brief Connect to a Pacemaker daemon via IPC (retrying after soft errors)
diff --git a/lib/pacemaker/pcmk_cluster_queries.c b/lib/pacemaker/pcmk_cluster_queries.c
index 2f91a68738..8a08d99180 100644
--- a/lib/pacemaker/pcmk_cluster_queries.c
+++ b/lib/pacemaker/pcmk_cluster_queries.c
@@ -360,7 +360,7 @@ ipc_connect(data_t *data, enum pcmk_ipc_server server, pcmk_ipc_callback_t cb,
         pcmk_register_ipc_callback(api, cb, data);
     }
 
-    rc = pcmk__connect_ipc(api, dispatch_type, 5);
+    rc = pcmk__connect_ipc_retry_conrefused(api, dispatch_type, 5);
     if (rc != pcmk_rc_ok) {
         if (rc == EREMOTEIO) {
             data->pcmkd_state = pcmk_pacemakerd_state_remote;
-- 
2.43.0

