Commit f7f2f424 authored by Rocky Automation

import pacemaker-2.0.5-9.el8_4.3

parent 6148bf76
From 5c2d8665773254ff8b9676ac359a1210e34640e3 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Mon, 1 Mar 2021 14:02:52 +0100
Subject: [PATCH] API: add pcmk__mainloop_timer_get_period() to internal API
---
include/crm/common/internal.h | 1 +
lib/common/mainloop.c | 34 +++++++++++++++++++++++++---------
2 files changed, 26 insertions(+), 9 deletions(-)
diff --git a/include/crm/common/internal.h b/include/crm/common/internal.h
index f69abe8..63bfd2c 100644
--- a/include/crm/common/internal.h
+++ b/include/crm/common/internal.h
@@ -96,6 +96,7 @@ pcmk__open_devnull(int flags)
int pcmk__add_mainloop_ipc(crm_ipc_t *ipc, int priority, void *userdata,
struct ipc_client_callbacks *callbacks,
mainloop_io_t **source);
+guint pcmk__mainloop_timer_get_period(mainloop_timer_t *timer);
/* internal messaging utilities (from messages.c) */
diff --git a/lib/common/mainloop.c b/lib/common/mainloop.c
index 2f00e31..75f24e2 100644
--- a/lib/common/mainloop.c
+++ b/lib/common/mainloop.c
@@ -49,6 +49,15 @@ struct trigger_s {
};
+struct mainloop_timer_s {
+ guint id;
+ guint period_ms;
+ bool repeat;
+ char *name;
+ GSourceFunc cb;
+ void *userdata;
+};
+
static gboolean
crm_trigger_prepare(GSource * source, gint * timeout)
{
@@ -875,6 +884,22 @@ pcmk__add_mainloop_ipc(crm_ipc_t *ipc, int priority, void *userdata,
return pcmk_rc_ok;
}
+/*!
+ * \brief Get period for mainloop timer
+ *
+ * \param[in] timer Timer
+ *
+ * \return Period in ms
+ */
+guint
+pcmk__mainloop_timer_get_period(mainloop_timer_t *timer)
+{
+ if (timer) {
+ return timer->period_ms;
+ }
+ return 0;
+}
+
mainloop_io_t *
mainloop_add_ipc_client(const char *name, int priority, size_t max_size,
void *userdata, struct ipc_client_callbacks *callbacks)
@@ -1252,15 +1277,6 @@ mainloop_child_add(pid_t pid, int timeout, const char *desc, void *privatedata,
mainloop_child_add_with_flags(pid, timeout, desc, privatedata, 0, callback);
}
-struct mainloop_timer_s {
- guint id;
- guint period_ms;
- bool repeat;
- char *name;
- GSourceFunc cb;
- void *userdata;
-};
-
static gboolean
mainloop_timer_cb(gpointer user_data)
{
--
1.8.3.1
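Together with the existing public setter mainloop_timer_set_period() (used by the fenced patch below), the new internal getter enables read-modify-write adjustments of a running timer's period. A minimal caller-side sketch; double_period_capped is a hypothetical helper for illustration, not part of the patch:

```c
/* Hypothetical helper showing how the new getter pairs with the existing
 * mainloop_timer_set_period(): read the current period, then double it
 * up to a cap. This is the pattern the fenced retry patch below uses.
 */
static void
double_period_capped(mainloop_timer_t *timer, guint max_ms)
{
    guint period_ms = pcmk__mainloop_timer_get_period(timer);

    /* period_ms is 0 when timer is NULL, so the guard skips that case too */
    if ((period_ms > 0) && (period_ms < max_ms)) {
        mainloop_timer_set_period(timer, 2 * period_ms);
    }
}
```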
From 1d33712201e42f0e8ee108999cd4cb8fa0eeca95 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Fri, 19 Feb 2021 12:34:04 +0100
Subject: [PATCH] Feature: fenced: retry getting metadata until we get it
---
daemons/fenced/fenced_commands.c | 35 +++++++++++++++++++++++++++++++++++
daemons/fenced/pacemaker-fenced.h | 1 +
2 files changed, 36 insertions(+)
diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c
index 41901e5..65b41c5 100644
--- a/daemons/fenced/fenced_commands.c
+++ b/daemons/fenced/fenced_commands.c
@@ -69,6 +69,9 @@ static void stonith_send_reply(xmlNode * reply, int call_options, const char *re
static void search_devices_record_result(struct device_search_s *search, const char *device,
gboolean can_fence);
+static xmlNode * get_agent_metadata(const char *agent);
+static void read_action_metadata(stonith_device_t *device);
+
typedef struct async_command_s {
int id;
@@ -323,6 +326,25 @@ fork_cb(GPid pid, gpointer user_data)
cmd->activating_on = NULL;
}
+static int
+get_agent_metadata_cb(gpointer data) {
+ stonith_device_t *device = data;
+
+ device->agent_metadata = get_agent_metadata(device->agent);
+ if (device->agent_metadata) {
+ read_action_metadata(device);
+ stonith__device_parameter_flags(&(device->flags), device->id,
+ device->agent_metadata);
+ return G_SOURCE_REMOVE;
+ } else {
+ guint period_ms = pcmk__mainloop_timer_get_period(device->timer);
+ if (period_ms < 160 * 1000) {
+ mainloop_timer_set_period(device->timer, 2 * period_ms);
+ }
+ return G_SOURCE_CONTINUE;
+ }
+}
+
static gboolean
stonith_device_execute(stonith_device_t * device)
{
@@ -569,6 +591,11 @@ free_device(gpointer data)
g_list_free_full(device->targets, free);
+ if (device->timer) {
+ mainloop_timer_stop(device->timer);
+ mainloop_timer_del(device->timer);
+ }
+
mainloop_destroy_trigger(device->work);
free_xml(device->agent_metadata);
@@ -916,6 +943,14 @@ build_device_from_xml(xmlNode * msg)
read_action_metadata(device);
stonith__device_parameter_flags(&(device->flags), device->id,
device->agent_metadata);
+ } else {
+ if (device->timer == NULL) {
+ device->timer = mainloop_timer_add("get_agent_metadata", 10 * 1000,
+ TRUE, get_agent_metadata_cb, device);
+ }
+ if (!mainloop_timer_running(device->timer)) {
+ mainloop_timer_start(device->timer);
+ }
}
value = g_hash_table_lookup(device->params, "nodeid");
diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h
index 13cf6dc..e342692 100644
--- a/daemons/fenced/pacemaker-fenced.h
+++ b/daemons/fenced/pacemaker-fenced.h
@@ -41,6 +41,7 @@ typedef struct stonith_device_s {
GHashTable *params;
GHashTable *aliases;
GList *pending_ops;
+ mainloop_timer_t *timer;
crm_trigger_t *work;
xmlNode *agent_metadata;
--
1.8.3.1
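The net effect is an exponential backoff: build_device_from_xml() starts the retry timer at 10 seconds, and the callback doubles the period after each failed fetch until it reaches 160 seconds, after which it keeps retrying at that interval. A self-contained GLib sketch of the same pattern, with try_fetch_metadata() as a hypothetical stand-in for get_agent_metadata():

```c
/* Standalone GLib sketch of the retry-with-backoff pattern above: start
 * at 10s, double the interval after each failed attempt, stop doubling
 * once it reaches 160s. try_fetch_metadata() is a hypothetical stand-in
 * for get_agent_metadata().
 * Build: gcc backoff.c $(pkg-config --cflags --libs glib-2.0)
 */
#include <glib.h>

#define START_MS (10 * 1000)
#define MAX_MS   (160 * 1000)

typedef struct {
    guint period_ms;
} retry_state_t;

static gboolean
try_fetch_metadata(void)
{
    return FALSE;   /* pretend the agent never becomes available */
}

static gboolean
retry_cb(gpointer data)
{
    retry_state_t *state = data;

    if (try_fetch_metadata()) {
        g_free(state);
        return G_SOURCE_REMOVE;     /* success: stop retrying */
    }
    if (state->period_ms < MAX_MS) {
        state->period_ms *= 2;      /* exponential backoff */
    }
    /* A plain g_timeout source has a fixed interval, so re-arm with the
     * new period instead of returning G_SOURCE_CONTINUE (pacemaker's
     * mainloop_timer_set_period() hides this detail). */
    g_timeout_add(state->period_ms, retry_cb, state);
    return G_SOURCE_REMOVE;
}

int
main(void)
{
    GMainLoop *loop = g_main_loop_new(NULL, FALSE);
    retry_state_t *state = g_new0(retry_state_t, 1);

    state->period_ms = START_MS;
    g_timeout_add(state->period_ms, retry_cb, state);
    g_main_loop_run(loop);
    return 0;
}
```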
From ee7eba6a7a05bdf0a12d60ebabb334d8ee021101 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 9 Aug 2021 14:48:57 -0500
Subject: [PATCH] Fix: controller: ensure lost node's transient attributes are
 cleared without DC

Previously, peer_update_callback() cleared a lost node's transient attributes
if either the local node is DC, or there is no DC.

However, that left the possibility of the DC being lost at the same time as
another node -- the local node would still have fsa_our_dc set while processing
the leave notifications, so no node would clear the attributes for the non-DC
node.

Now, the controller has its own CPG configuration change callback, which sets a
global boolean before calling the usual one, so that peer_update_callback() can
know when the DC has been lost.
---
daemons/controld/controld_callbacks.c | 4 ++-
daemons/controld/controld_corosync.c | 57 ++++++++++++++++++++++++++++++++++-
2 files changed, 59 insertions(+), 2 deletions(-)
diff --git a/daemons/controld/controld_callbacks.c b/daemons/controld/controld_callbacks.c
index af24856..e564b3d 100644
--- a/daemons/controld/controld_callbacks.c
+++ b/daemons/controld/controld_callbacks.c
@@ -99,6 +99,8 @@ node_alive(const crm_node_t *node)
#define state_text(state) ((state)? (const char *)(state) : "in unknown state")
+bool controld_dc_left = false;
+
void
peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data)
{
@@ -217,7 +219,7 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d
cib_scope_local);
}
- } else if (AM_I_DC || (fsa_our_dc == NULL)) {
+ } else if (AM_I_DC || controld_dc_left || (fsa_our_dc == NULL)) {
/* This only needs to be done once, so normally the DC should do
* it. However if there is no DC, every node must do it, since
* there is no other way to ensure some one node does it.
diff --git a/daemons/controld/controld_corosync.c b/daemons/controld/controld_corosync.c
index db99630..c5ab658 100644
--- a/daemons/controld/controld_corosync.c
+++ b/daemons/controld/controld_corosync.c
@@ -87,6 +87,61 @@ crmd_cs_destroy(gpointer user_data)
}
}
+extern bool controld_dc_left;
+
+/*!
+ * \brief Handle a Corosync notification of a CPG configuration change
+ *
+ * \param[in] handle CPG connection
+ * \param[in] cpg_name CPG group name
+ * \param[in] member_list List of current CPG members
+ * \param[in] member_list_entries Number of entries in \p member_list
+ * \param[in] left_list List of CPG members that left
+ * \param[in] left_list_entries Number of entries in \p left_list
+ * \param[in] joined_list List of CPG members that joined
+ * \param[in] joined_list_entries Number of entries in \p joined_list
+ */
+static void
+cpg_membership_callback(cpg_handle_t handle, const struct cpg_name *cpg_name,
+ const struct cpg_address *member_list,
+ size_t member_list_entries,
+ const struct cpg_address *left_list,
+ size_t left_list_entries,
+ const struct cpg_address *joined_list,
+ size_t joined_list_entries)
+{
+ /* When nodes leave CPG, the DC clears their transient node attributes.
+ *
+ * However if there is no DC, or the DC is among the nodes that left, each
+ * remaining node needs to do the clearing, to ensure it gets done.
+ * Otherwise, the attributes would persist when the nodes rejoin, which
+ * could have serious consequences for unfencing, agents that use attributes
+ * for internal logic, etc.
+ *
+ * Here, we set a global boolean if the DC is among the nodes that left, for
+ * use by the peer callback.
+ */
+ if (fsa_our_dc != NULL) {
+ crm_node_t *peer = crm_find_peer(0, fsa_our_dc);
+
+ if (peer != NULL) {
+ for (int i = 0; i < left_list_entries; ++i) {
+ if (left_list[i].nodeid == peer->id) {
+ controld_dc_left = true;
+ break;
+ }
+ }
+ }
+ }
+
+ // Process the change normally, which will call the peer callback as needed
+ pcmk_cpg_membership(handle, cpg_name, member_list, member_list_entries,
+ left_list, left_list_entries,
+ joined_list, joined_list_entries);
+
+ controld_dc_left = false;
+}
+
extern gboolean crm_connect_corosync(crm_cluster_t * cluster);
gboolean
@@ -95,7 +150,7 @@ crm_connect_corosync(crm_cluster_t * cluster)
if (is_corosync_cluster()) {
crm_set_status_callback(&peer_update_callback);
cluster->cpg.cpg_deliver_fn = crmd_cs_dispatch;
- cluster->cpg.cpg_confchg_fn = pcmk_cpg_membership;
+ cluster->cpg.cpg_confchg_fn = cpg_membership_callback;
cluster->destroy = crmd_cs_destroy;
if (crm_cluster_connect(cluster)) {
--
1.8.3.1
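The core of the wrapper is the membership scan: set the flag only when the current DC's node id appears in left_list, dispatch normally so peer_update_callback() sees the flag, then reset it. A minimal sketch of the scan itself, assuming corosync's <corosync/cpg.h> types and a dc_nodeid obtained elsewhere (e.g. via the crm_find_peer() lookup shown in the patch):

```c
/* Minimal sketch of the "did the DC leave?" scan, using corosync's CPG
 * types; dc_nodeid is assumed to come from the peer-cache lookup in the
 * patch above.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <corosync/cpg.h>

static bool
nodeid_in_left_list(uint32_t dc_nodeid,
                    const struct cpg_address *left_list,
                    size_t left_list_entries)
{
    for (size_t i = 0; i < left_list_entries; ++i) {
        if (left_list[i].nodeid == dc_nodeid) {
            return true;
        }
    }
    return false;
}
```

Setting the boolean before delegating to pcmk_cpg_membership() and clearing it immediately afterwards keeps the "DC left" state scoped to exactly one configuration-change dispatch, so later peer updates are unaffected.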
@@ -226,7 +226,7 @@
Name: pacemaker
Summary: Scalable High-Availability cluster resource manager
Version: %{pcmkversion}
-Release: %{pcmk_release}%{?dist}.1
+Release: %{pcmk_release}%{?dist}.3
%if %{defined _unitdir}
License: GPLv2+ and LGPLv2+
%else
@@ -289,6 +289,10 @@ Patch38: 038-feature-set.patch
Patch39: 039-crm_mon.patch
Patch40: 040-crm_mon-shutdown.patch
Patch41: 041-crm_mon-shutdown.patch
+Patch42: 042-unfencing-loop.patch
+Patch43: 043-retry-metadata.patch
+Patch44: 044-sbd.patch
+Patch45: 045-controller-attribute.patch
# downstream-only commits
Patch100: 100-default-to-syncing-with-sbd.patch
@@ -811,6 +815,7 @@ exit 0
%{_sbindir}/crm_attribute
%{_sbindir}/crm_master
+%{_sbindir}/fence_watchdog
%doc %{_mandir}/man7/pacemaker-controld.*
%doc %{_mandir}/man7/pacemaker-schedulerd.*
@@ -819,6 +824,7 @@ exit 0
%doc %{_mandir}/man7/ocf_pacemaker_remote.*
%doc %{_mandir}/man8/crm_attribute.*
%doc %{_mandir}/man8/crm_master.*
+%doc %{_mandir}/man8/fence_watchdog.*
%doc %{_mandir}/man8/pacemakerd.*
%doc %{_datadir}/pacemaker/alerts
@@ -893,6 +899,7 @@ exit 0
%doc %{_mandir}/man8/*
%exclude %{_mandir}/man8/crm_attribute.*
%exclude %{_mandir}/man8/crm_master.*
+%exclude %{_mandir}/man8/fence_watchdog.*
%exclude %{_mandir}/man8/pacemakerd.*
%exclude %{_mandir}/man8/pacemaker-remoted.*
@@ -986,6 +993,18 @@ exit 0
%license %{nagios_name}-%{nagios_hash}/COPYING
%changelog
+* Mon Aug 9 2021 Klaus Wenninger <kwenning@redhat.com> - 2.0.5-9.3
+- retry fence-agent metadata
+- assure transient attributes of lost node are cleared
+- added configurable watchdog-fencing feature
+- Resolves: rhbz1992014
+- Resolves: rhbz1989622
+- Resolves: rhbz1993891
+
+* Thu Jun 24 2021 Ken Gaillot <kgaillot@redhat.com> - 2.0.5-9.2
+- Avoid remote node unfencing loop
+- Resolves: rhbz1972273
+
* Mon Apr 19 2021 Ken Gaillot <kgaillot@redhat.com> - 2.0.5-9.1
- Fix regression in crm_mon during cluster shutdown that affects ocf:heartbeat:pgsql agent
- Resolves: rhbz1951098