From 85d7a70916c5fd85d89ad34396be5df0ed151669 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 24 Oct 2024 11:30:00 -0500
Subject: [PATCH 09/16] Refactor: pacemaker-attrd: track node CIB ID rather
 than cluster ID

Previously, attribute_value_t had a nodeid member to track the cluster
ID of the node that the value is for. However the only reason we need
it is to be able to get the node's XML ID in the CIB, for writing out
the value. Rename it to node_xml_id and track the XML ID directly.

In practice, there is no real change, since the CIB XML ID of Corosync
nodes is simply their cluster ID as a string. This allows us to keep
the same XML attribute and value in peer messages for backward
compatibility.

If we ever support node XML IDs that are *not* the string equivalent of
their cluster IDs, rolling upgrades will be possible only from versions
with this commit and later.
---
 daemons/attrd/attrd_alerts.c     | 12 ++++++++-
 daemons/attrd/attrd_attributes.c | 11 +++++++-
 daemons/attrd/attrd_cib.c        | 45 ++++++++++++++++++++------------
 daemons/attrd/attrd_corosync.c   | 42 +++++++++++++++--------------
 daemons/attrd/attrd_ipc.c        | 14 +++++-----
 daemons/attrd/attrd_messages.c   |  6 +++--
 daemons/attrd/attrd_utils.c      |  1 +
 daemons/attrd/pacemaker-attrd.h  |  4 +--
 8 files changed, 86 insertions(+), 49 deletions(-)

diff --git a/daemons/attrd/attrd_alerts.c b/daemons/attrd/attrd_alerts.c
index 55cb477c22..81d02d2ce2 100644
--- a/daemons/attrd/attrd_alerts.c
+++ b/daemons/attrd/attrd_alerts.c
@@ -124,12 +124,22 @@ attrd_read_options(gpointer user_data)
 }
 
 int
-attrd_send_attribute_alert(const char *node, int nodeid,
+attrd_send_attribute_alert(const char *node, const char *node_xml_id,
                            const char *attr, const char *value)
 {
+    uint32_t nodeid = 0U;
+    pcmk__node_status_t *node_status = NULL;
+
     if (attrd_alert_list == NULL) {
         return pcmk_ok;
     }
+    node_status = pcmk__search_node_caches(0, node, node_xml_id,
+                                           pcmk__node_search_remote
+                                           |pcmk__node_search_cluster_member
+                                           |pcmk__node_search_cluster_cib);
+    if (node_status != NULL) {
+        nodeid = node_status->cluster_layer_id;
+    }
     return lrmd_send_attribute_alert(attrd_lrmd_connect(), attrd_alert_list,
                                      node, nodeid, attr, value);
 }
diff --git a/daemons/attrd/attrd_attributes.c b/daemons/attrd/attrd_attributes.c
index 74301d678a..6d80acfce1 100644
--- a/daemons/attrd/attrd_attributes.c
+++ b/daemons/attrd/attrd_attributes.c
@@ -142,7 +142,16 @@ attrd_add_value_xml(xmlNode *parent, const attribute_t *a,
     crm_xml_add(xml, PCMK__XA_ATTR_SET_TYPE, a->set_type);
     crm_xml_add(xml, PCMK__XA_ATTR_SET, a->set_id);
     crm_xml_add(xml, PCMK__XA_ATTR_USER, a->user);
-    pcmk__xe_add_node(xml, v->nodename, v->nodeid);
+    crm_xml_add(xml, PCMK__XA_ATTR_HOST, v->nodename);
+
+    /* @COMPAT Prior to 2.1.10 and 3.0.1, the node's cluster ID was added
+     * instead of its XML ID. For Corosync and Pacemaker Remote nodes, those are
+     * the same, but if we ever support node XML IDs that differ from their
+     * cluster IDs, we will have to drop support for rolling upgrades from
+     * versions before those.
+     */
+    crm_xml_add(xml, PCMK__XA_ATTR_HOST_ID, v->node_xml_id);
+
     crm_xml_add(xml, PCMK__XA_ATTR_VALUE, v->current);
     crm_xml_add_int(xml, PCMK__XA_ATTR_DAMPENING,
                     pcmk__timeout_ms2s(a->timeout_ms));
diff --git a/daemons/attrd/attrd_cib.c b/daemons/attrd/attrd_cib.c
index ad2bf2052c..665af3625d 100644
--- a/daemons/attrd/attrd_cib.c
+++ b/daemons/attrd/attrd_cib.c
@@ -10,7 +10,6 @@
 #include <crm_internal.h>
 
 #include <errno.h>
-#include <inttypes.h>   // PRIu32
 #include <stdbool.h>
 #include <stdlib.h>
 #include <glib.h>
@@ -450,10 +449,12 @@ send_alert_attributes_value(attribute_t *a, GHashTable *t)
     g_hash_table_iter_init(&vIter, t);
 
     while (g_hash_table_iter_next(&vIter, NULL, (gpointer *) & at)) {
-        rc = attrd_send_attribute_alert(at->nodename, at->nodeid,
+        rc = attrd_send_attribute_alert(at->nodename, at->node_xml_id,
                                         a->id, at->current);
-        crm_trace("Sent alerts for %s[%s]=%s: nodeid=%d rc=%d",
-                  a->id, at->nodename, at->current, at->nodeid, rc);
+        crm_trace("Sent alerts for %s[%s]=%s with node XML ID %s "
+                  "(%s agents failed)",
+                  a->id, at->nodename, at->current, at->node_xml_id,
+                  ((rc == 0)? "no" : ((rc == -1)? "some" : "all")));
     }
 }
 
@@ -462,7 +463,7 @@ set_alert_attribute_value(GHashTable *t, attribute_value_t *v)
 {
     attribute_value_t *a_v = pcmk__assert_alloc(1, sizeof(attribute_value_t));
 
-    a_v->nodeid = v->nodeid;
+    a_v->node_xml_id = pcmk__str_copy(v->node_xml_id);
     a_v->nodename = pcmk__str_copy(v->nodename);
     a_v->current = pcmk__str_copy(v->current);
 
@@ -551,26 +552,25 @@ write_attribute(attribute_t *a, bool ignore_delay)
             continue;
         }
 
-        // Try to get the XML ID used for the node in the CIB
+        /* We need the node's CIB XML ID to write out its attributes, so look
+         * for it now. Check the node caches first, even if the ID was
+         * previously known (in case it changed), but use any previous value as
+         * a fallback.
+         */
+
         if (pcmk_is_set(v->flags, attrd_value_remote)) {
             // A Pacemaker Remote node's XML ID is the same as its name
             node_xml_id = v->nodename;
 
         } else {
-            /* Get cluster node XML IDs from the peer caches.
-             * This will create a cluster node cache entry if none exists.
-             */
-            pcmk__node_status_t *peer = pcmk__get_node(v->nodeid, v->nodename,
-                                                       NULL,
+            // This creates a cluster node cache entry if none exists
+            pcmk__node_status_t *peer = pcmk__get_node(0, v->nodename,
+                                                       v->node_xml_id,
                                                        pcmk__node_search_any);
 
             node_xml_id = pcmk__cluster_get_xml_id(peer);
-
-            // Remember peer's node ID if we're just now learning it
-            if ((peer->cluster_layer_id != 0) && (v->nodeid == 0)) {
-                crm_trace("Learned ID %" PRIu32 " for node %s",
-                          peer->cluster_layer_id, v->nodename);
-                v->nodeid = peer->cluster_layer_id;
+            if (node_xml_id == NULL) {
+                node_xml_id = v->node_xml_id;
             }
         }
 
@@ -583,6 +583,17 @@ write_attribute(attribute_t *a, bool ignore_delay)
             continue;
         }
 
+        /* Remember the XML ID and let peers know it (in case one of them
+         * becomes the writer later)
+         */
+        if (!pcmk__str_eq(v->node_xml_id, node_xml_id, pcmk__str_none)) {
+            crm_trace("Setting %s[%s] node XML ID to %s (was %s)",
+                      a->id, v->nodename, node_xml_id,
+                      pcmk__s(v->node_xml_id, "unknown"));
+            pcmk__str_update(&(v->node_xml_id), node_xml_id);
+            attrd_broadcast_value(a, v);
+        }
+
         // Update this value as part of the CIB transaction we're building
         rc = add_attr_update(a, v->current, node_xml_id);
         if (rc != pcmk_rc_ok) {
diff --git a/daemons/attrd/attrd_corosync.c b/daemons/attrd/attrd_corosync.c
index 72ebc1843b..02816b94d2 100644
--- a/daemons/attrd/attrd_corosync.c
+++ b/daemons/attrd/attrd_corosync.c
@@ -209,19 +209,6 @@ attrd_peer_change_cb(enum pcmk__node_update kind, pcmk__node_status_t *peer,
     }
 }
 
-static void
-record_peer_nodeid(attribute_value_t *v, const char *host)
-{
-    pcmk__node_status_t *known_peer =
-        pcmk__get_node(v->nodeid, host, NULL, pcmk__node_search_cluster_member);
-
-    crm_trace("Learned %s has XML ID %s",
-              known_peer->name, pcmk__cluster_node_uuid(known_peer));
-    if (attrd_election_won()) {
-        attrd_write_attributes(attrd_write_changed);
-    }
-}
-
 #define readable_value(rv_v) pcmk__s((rv_v)->current, "(unset)")
 
 #define readable_peer(p)    \
@@ -235,6 +222,7 @@ update_attr_on_host(attribute_t *a, const pcmk__node_status_t *peer,
     int is_remote = 0;
     bool changed = false;
     attribute_value_t *v = NULL;
+    const char *node_xml_id = crm_element_value(xml, PCMK__XA_ATTR_HOST_ID);
 
     // Create entry for value if not already existing
     v = g_hash_table_lookup(a->values, host);
@@ -245,6 +233,13 @@ update_attr_on_host(attribute_t *a, const pcmk__node_status_t *peer,
         g_hash_table_replace(a->values, v->nodename, v);
     }
 
+    /* If update doesn't contain the node XML ID, fall back to any previously
+     * known value (for logging)
+     */
+    if (node_xml_id == NULL) {
+        node_xml_id = v->node_xml_id;
+    }
+
     // If value is for a Pacemaker Remote node, remember that
     crm_element_value_int(xml, PCMK__XA_ATTR_IS_REMOTE, &is_remote);
     if (is_remote) {
@@ -270,11 +265,12 @@ update_attr_on_host(attribute_t *a, const pcmk__node_status_t *peer,
 
     } else if (changed) {
         crm_notice("Setting %s[%s]%s%s: %s -> %s "
-                   QB_XS " from %s with %s write delay",
+                   QB_XS " from %s with %s write delay and node XML ID %s",
                    attr, host, a->set_type ? " in " : "",
                    pcmk__s(a->set_type, ""), readable_value(v),
                    pcmk__s(value, "(unset)"), peer->name,
-                   (a->timeout_ms == 0)? "no" : pcmk__readable_interval(a->timeout_ms));
+                   (a->timeout_ms == 0)? "no" : pcmk__readable_interval(a->timeout_ms),
+                   pcmk__s(node_xml_id, "unknown"));
         pcmk__str_update(&v->current, value);
         attrd_set_attr_flags(a, attrd_attr_changed);
 
@@ -319,11 +315,17 @@ update_attr_on_host(attribute_t *a, const pcmk__node_status_t *peer,
     // This allows us to later detect local values that peer doesn't know about
     attrd_set_value_flags(v, attrd_value_from_peer);
 
-    /* If this is a cluster node whose node ID we are learning, remember it */
-    if ((v->nodeid == 0) && !pcmk_is_set(v->flags, attrd_value_remote)
-        && (crm_element_value_int(xml, PCMK__XA_ATTR_HOST_ID,
-                                  (int*)&v->nodeid) == 0) && (v->nodeid > 0)) {
-        record_peer_nodeid(v, host);
+    // Remember node's XML ID if we're just learning it
+    if ((node_xml_id != NULL)
+        && !pcmk__str_eq(node_xml_id, v->node_xml_id, pcmk__str_none)) {
+        crm_trace("Learned %s[%s] node XML ID is %s (was %s)",
+                  a->id, v->nodename, node_xml_id,
+                  pcmk__s(v->node_xml_id, "unknown"));
+        pcmk__str_update(&(v->node_xml_id), node_xml_id);
+        if (attrd_election_won()) {
+            // In case we couldn't write a value missing the XML ID before
+            attrd_write_attributes(attrd_write_changed);
+        }
     }
 }
 
diff --git a/daemons/attrd/attrd_ipc.c b/daemons/attrd/attrd_ipc.c
index 5ab2763dbf..fd917a37bb 100644
--- a/daemons/attrd/attrd_ipc.c
+++ b/daemons/attrd/attrd_ipc.c
@@ -12,6 +12,7 @@
 #include <errno.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <inttypes.h>   // PRIu32
 #include <sys/types.h>
 
 #include <crm/cluster.h>
@@ -232,12 +233,13 @@ attrd_client_refresh(pcmk__request_t *request)
 static void
 handle_missing_host(xmlNode *xml)
 {
-    const char *host = crm_element_value(xml, PCMK__XA_ATTR_HOST);
-
-    if (host == NULL) {
-        crm_trace("Inferring host");
-        pcmk__xe_add_node(xml, attrd_cluster->priv->node_name,
-                          attrd_cluster->priv->node_id);
+    if (crm_element_value(xml, PCMK__XA_ATTR_HOST) == NULL) {
+        crm_trace("Inferring local node %s with XML ID %s",
+                  attrd_cluster->priv->node_name,
+                  attrd_cluster->priv->node_xml_id);
+        crm_xml_add(xml, PCMK__XA_ATTR_HOST, attrd_cluster->priv->node_name);
+        crm_xml_add(xml, PCMK__XA_ATTR_HOST_ID,
+                    attrd_cluster->priv->node_xml_id);
     }
 }
 
diff --git a/daemons/attrd/attrd_messages.c b/daemons/attrd/attrd_messages.c
index b6eebc66cb..e1038a820b 100644
--- a/daemons/attrd/attrd_messages.c
+++ b/daemons/attrd/attrd_messages.c
@@ -9,6 +9,7 @@
 
 #include <crm_internal.h>
 
+#include <inttypes.h>   // PRIu32
 #include <glib.h>
 
 #include <crm/common/messages_internal.h>
@@ -314,8 +315,9 @@ attrd_broadcast_protocol(void)
     crm_xml_add(attrd_op, PCMK__XA_ATTR_NAME, CRM_ATTR_PROTOCOL);
     crm_xml_add(attrd_op, PCMK__XA_ATTR_VALUE, ATTRD_PROTOCOL_VERSION);
     crm_xml_add_int(attrd_op, PCMK__XA_ATTR_IS_PRIVATE, 1);
-    pcmk__xe_add_node(attrd_op, attrd_cluster->priv->node_name,
-                      attrd_cluster->priv->node_id);
+    crm_xml_add(attrd_op, PCMK__XA_ATTR_HOST, attrd_cluster->priv->node_name);
+    crm_xml_add(attrd_op, PCMK__XA_ATTR_HOST_ID,
+                attrd_cluster->priv->node_xml_id);
 
     crm_debug("Broadcasting attrd protocol version %s for node %s",
               ATTRD_PROTOCOL_VERSION, attrd_cluster->priv->node_name);
diff --git a/daemons/attrd/attrd_utils.c b/daemons/attrd/attrd_utils.c
index f219b8862d..3621f5f354 100644
--- a/daemons/attrd/attrd_utils.c
+++ b/daemons/attrd/attrd_utils.c
@@ -232,6 +232,7 @@ attrd_free_attribute_value(gpointer data)
     free(v->nodename);
     free(v->current);
     free(v->requested);
+    free(v->node_xml_id);
     free(v);
 }
 
diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h
index 13646b8e51..07103a6b01 100644
--- a/daemons/attrd/pacemaker-attrd.h
+++ b/daemons/attrd/pacemaker-attrd.h
@@ -99,7 +99,7 @@ extern crm_trigger_t *attrd_config_read;
 
 void attrd_lrmd_disconnect(void);
 gboolean attrd_read_options(gpointer user_data);
-int attrd_send_attribute_alert(const char *node, int nodeid,
+int attrd_send_attribute_alert(const char *node, const char *node_xml_id,
                                const char *attr, const char *value);
 
 // Elections
@@ -155,7 +155,7 @@ typedef struct attribute_value_s {
     char *nodename;     // Node that this value is for
     char *current;      // Attribute value
     char *requested;    // Value specified in pending CIB write, if any
-    uint32_t nodeid;    // Cluster node ID of node that this value is for
+    char *node_xml_id;  // XML ID used for node in CIB
     uint32_t flags;     // Group of attrd_value_flags
 } attribute_value_t;
 
-- 
2.43.0

