pacemaker 2.1.5-a3f44794f94
Scalable High-Availability cluster resource manager
failcounts.c
Go to the documentation of this file.
/*
 * Copyright 2008-2022 the Pacemaker project contributors
 *
 * This source code is licensed under the GNU Lesser General Public License
 * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
 */
7
8#include <crm_internal.h>
9
10#include <sys/types.h>
11#include <regex.h>
12#include <glib.h>
13
14#include <crm/crm.h>
15#include <crm/msg_xml.h>
16#include <crm/common/xml.h>
17#include <crm/common/util.h>
19
20static gboolean
21is_matched_failure(const char *rsc_id, xmlNode *conf_op_xml,
22 xmlNode *lrm_op_xml)
23{
24 gboolean matched = FALSE;
25 const char *conf_op_name = NULL;
26 const char *lrm_op_task = NULL;
27 const char *conf_op_interval_spec = NULL;
28 guint conf_op_interval_ms = 0;
29 guint lrm_op_interval_ms = 0;
30 const char *lrm_op_id = NULL;
31 char *last_failure_key = NULL;
32
33 if (rsc_id == NULL || conf_op_xml == NULL || lrm_op_xml == NULL) {
34 return FALSE;
35 }
36
37 // Get name and interval from configured op
38 conf_op_name = crm_element_value(conf_op_xml, "name");
39 conf_op_interval_spec = crm_element_value(conf_op_xml,
41 conf_op_interval_ms = crm_parse_interval_spec(conf_op_interval_spec);
42
43 // Get name and interval from op history entry
44 lrm_op_task = crm_element_value(lrm_op_xml, XML_LRM_ATTR_TASK);
46 &lrm_op_interval_ms);
47
48 if ((conf_op_interval_ms != lrm_op_interval_ms)
49 || !pcmk__str_eq(conf_op_name, lrm_op_task, pcmk__str_casei)) {
50 return FALSE;
51 }
52
53 lrm_op_id = ID(lrm_op_xml);
54 last_failure_key = pcmk__op_key(rsc_id, "last_failure", 0);
55
56 if (pcmk__str_eq(last_failure_key, lrm_op_id, pcmk__str_casei)) {
57 matched = TRUE;
58
59 } else {
60 char *expected_op_key = pcmk__op_key(rsc_id, conf_op_name,
61 conf_op_interval_ms);
62
63 if (pcmk__str_eq(expected_op_key, lrm_op_id, pcmk__str_casei)) {
64 int rc = 0;
65 int target_rc = pe__target_rc_from_xml(lrm_op_xml);
66
67 crm_element_value_int(lrm_op_xml, XML_LRM_ATTR_RC, &rc);
68 if (rc != target_rc) {
69 matched = TRUE;
70 }
71 }
72 free(expected_op_key);
73 }
74
75 free(last_failure_key);
76 return matched;
77}
78
79static gboolean
80block_failure(pe_node_t *node, pe_resource_t *rsc, xmlNode *xml_op,
82{
83 char *xml_name = clone_strip(rsc->id);
84
85 /* @TODO This xpath search occurs after template expansion, but it is unable
86 * to properly detect on-fail in id-ref, operation meta-attributes, or
87 * op_defaults, or evaluate rules.
88 *
89 * Also, on-fail defaults to block (in unpack_operation()) for stop actions
90 * when stonith is disabled.
91 *
92 * Ideally, we'd unpack the operation before this point, and pass in a
93 * meta-attributes table that takes all that into consideration.
94 */
95 char *xpath = crm_strdup_printf("//primitive[@id='%s']//op[@on-fail='block']",
96 xml_name);
97
98 xmlXPathObject *xpathObj = xpath_search(rsc->xml, xpath);
99 gboolean should_block = FALSE;
100
101 free(xpath);
102
103 if (xpathObj) {
104 int max = numXpathResults(xpathObj);
105 int lpc = 0;
106
107 for (lpc = 0; lpc < max; lpc++) {
108 xmlNode *pref = getXpathResult(xpathObj, lpc);
109
110 if (xml_op) {
111 should_block = is_matched_failure(xml_name, pref, xml_op);
112 if (should_block) {
113 break;
114 }
115
116 } else {
117 const char *conf_op_name = NULL;
118 const char *conf_op_interval_spec = NULL;
119 guint conf_op_interval_ms = 0;
120 char *lrm_op_xpath = NULL;
121 xmlXPathObject *lrm_op_xpathObj = NULL;
122
123 // Get name and interval from configured op
124 conf_op_name = crm_element_value(pref, "name");
125 conf_op_interval_spec = crm_element_value(pref, XML_LRM_ATTR_INTERVAL);
126 conf_op_interval_ms = crm_parse_interval_spec(conf_op_interval_spec);
127
128 lrm_op_xpath = crm_strdup_printf("//node_state[@uname='%s']"
129 "//lrm_resource[@id='%s']"
130 "/lrm_rsc_op[@operation='%s'][@interval='%u']",
131 node->details->uname, xml_name,
132 conf_op_name, conf_op_interval_ms);
133 lrm_op_xpathObj = xpath_search(data_set->input, lrm_op_xpath);
134
135 free(lrm_op_xpath);
136
137 if (lrm_op_xpathObj) {
138 int max2 = numXpathResults(lrm_op_xpathObj);
139 int lpc2 = 0;
140
141 for (lpc2 = 0; lpc2 < max2; lpc2++) {
142 xmlNode *lrm_op_xml = getXpathResult(lrm_op_xpathObj,
143 lpc2);
144
145 should_block = is_matched_failure(xml_name, pref,
146 lrm_op_xml);
147 if (should_block) {
148 break;
149 }
150 }
151 }
152 freeXpathObject(lrm_op_xpathObj);
153
154 if (should_block) {
155 break;
156 }
157 }
158 }
159 }
160
161 free(xml_name);
162 freeXpathObject(xpathObj);
163
164 return should_block;
165}
166
176static inline char *
177rsc_fail_name(pe_resource_t *rsc)
178{
179 const char *name = (rsc->clone_name? rsc->clone_name : rsc->id);
180
181 return pcmk_is_set(rsc->flags, pe_rsc_unique)? strdup(name) : clone_strip(name);
182}
183
197static void
198generate_fail_regex(const char *prefix, const char *rsc_name,
199 gboolean is_legacy, gboolean is_unique, regex_t *re)
200{
201 char *pattern;
202
203 /* @COMPAT DC < 1.1.17: Fail counts used to be per-resource rather than
204 * per-operation.
205 */
206 const char *op_pattern = (is_legacy? "" : "#.+_[0-9]+");
207
208 /* Ignore instance numbers for anything other than globally unique clones.
209 * Anonymous clone fail counts could contain an instance number if the
210 * clone was initially unique, failed, then was converted to anonymous.
211 * @COMPAT Also, before 1.1.8, anonymous clone fail counts always contained
212 * clone instance numbers.
213 */
214 const char *instance_pattern = (is_unique? "" : "(:[0-9]+)?");
215
216 pattern = crm_strdup_printf("^%s-%s%s%s$", prefix, rsc_name,
217 instance_pattern, op_pattern);
218 CRM_LOG_ASSERT(regcomp(re, pattern, REG_EXTENDED|REG_NOSUB) == 0);
219 free(pattern);
220}
221
233static void
234generate_fail_regexes(pe_resource_t *rsc, pe_working_set_t *data_set,
235 regex_t *failcount_re, regex_t *lastfailure_re)
236{
237 char *rsc_name = rsc_fail_name(rsc);
239 gboolean is_legacy = (compare_version(version, "3.0.13") < 0);
240
241 generate_fail_regex(PCMK__FAIL_COUNT_PREFIX, rsc_name, is_legacy,
242 pcmk_is_set(rsc->flags, pe_rsc_unique), failcount_re);
243
244 generate_fail_regex(PCMK__LAST_FAILURE_PREFIX, rsc_name, is_legacy,
245 pcmk_is_set(rsc->flags, pe_rsc_unique), lastfailure_re);
246
247 free(rsc_name);
248}
249
250int
251pe_get_failcount(pe_node_t *node, pe_resource_t *rsc, time_t *last_failure,
252 uint32_t flags, xmlNode *xml_op, pe_working_set_t *data_set)
253{
254 char *key = NULL;
255 const char *value = NULL;
256 regex_t failcount_re, lastfailure_re;
257 int failcount = 0;
258 time_t last = 0;
259 GHashTableIter iter;
260
261 generate_fail_regexes(rsc, data_set, &failcount_re, &lastfailure_re);
262
263 /* Resource fail count is sum of all matching operation fail counts */
264 g_hash_table_iter_init(&iter, node->details->attrs);
265 while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) {
266 if (regexec(&failcount_re, key, 0, NULL, 0) == 0) {
267 failcount = pcmk__add_scores(failcount, char2score(value));
268 } else if (regexec(&lastfailure_re, key, 0, NULL, 0) == 0) {
269 long long last_ll;
270
271 if (pcmk__scan_ll(value, &last_ll, 0LL) == pcmk_rc_ok) {
272 last = (time_t) QB_MAX(last, last_ll);
273 }
274 }
275 }
276
277 regfree(&failcount_re);
278 regfree(&lastfailure_re);
279
280 if ((failcount > 0) && (last > 0) && (last_failure != NULL)) {
281 *last_failure = last;
282 }
283
284 /* If failure blocks the resource, disregard any failure timeout */
285 if ((failcount > 0) && rsc->failure_timeout
286 && block_failure(node, rsc, xml_op, data_set)) {
287
288 pe_warn("Ignoring failure timeout %d for %s because it conflicts with on-fail=block",
289 rsc->failure_timeout, rsc->id);
290 rsc->failure_timeout = 0;
291 }
292
293 /* If all failures have expired, ignore fail count */
294 if (pcmk_is_set(flags, pe_fc_effective) && (failcount > 0) && (last > 0)
295 && rsc->failure_timeout) {
296
297 time_t now = get_effective_time(data_set);
298
299 if (now > (last + rsc->failure_timeout)) {
300 crm_debug("Failcount for %s on %s expired after %ds",
301 rsc->id, pe__node_name(node), rsc->failure_timeout);
302 failcount = 0;
303 }
304 }
305
306 /* We never want the fail counts of a bundle container's fillers to
307 * count towards the container's fail count.
308 *
309 * Most importantly, a Pacemaker Remote connection to a bundle container
310 * is a filler of the container, but can reside on a different node than the
311 * container itself. Counting its fail count on its node towards the
312 * container's fail count on that node could lead to attempting to stop the
313 * container on the wrong node.
314 */
315
317 && !pe_rsc_is_bundled(rsc)) {
318
319 GList *gIter = NULL;
320
321 for (gIter = rsc->fillers; gIter != NULL; gIter = gIter->next) {
322 pe_resource_t *filler = (pe_resource_t *) gIter->data;
323 time_t filler_last_failure = 0;
324
325 failcount += pe_get_failcount(node, filler, &filler_last_failure,
326 flags, xml_op, data_set);
327
328 if (last_failure && filler_last_failure > *last_failure) {
329 *last_failure = filler_last_failure;
330 }
331 }
332
333 if (failcount > 0) {
334 crm_info("Container %s and the resources within it "
335 "have failed %s time%s on %s",
336 rsc->id, pcmk_readable_score(failcount),
337 pcmk__plural_s(failcount), pe__node_name(node));
338 }
339
340 } else if (failcount > 0) {
341 crm_info("%s has failed %s time%s on %s",
342 rsc->id, pcmk_readable_score(failcount),
343 pcmk__plural_s(failcount), pe__node_name(node));
344 }
345
346 return failcount;
347}
348
361 const char *reason, pe_working_set_t *data_set)
362{
363 char *key = NULL;
364 pe_action_t *clear = NULL;
365
366 CRM_CHECK(rsc && node && reason && data_set, return NULL);
367
368 key = pcmk__op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
369 clear = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE,
370 data_set);
372 crm_notice("Clearing failure of %s on %s because %s " CRM_XS " %s",
373 rsc->id, pe__node_name(node), reason, clear->uuid);
374 return clear;
375}
const char * name
Definition: cib.c:24
#define PCMK__LAST_FAILURE_PREFIX
Definition: internal.h:312
#define PCMK__FAIL_COUNT_PREFIX
Definition: internal.h:311
char * pcmk__op_key(const char *rsc_id, const char *op_type, guint interval_ms)
Generate an operation key (RESOURCE_ACTION_INTERVAL)
Definition: operations.c:45
uint64_t flags
Definition: remote.c:3
uint32_t version
Definition: remote.c:1
Utility functions.
char guint crm_parse_interval_spec(const char *input)
Parse milliseconds from a Pacemaker interval specification.
Definition: utils.c:271
int pcmk__add_scores(int score1, int score2)
Definition: scores.c:113
const char * pcmk_readable_score(int score)
Return a displayable static string for a score value.
Definition: scores.c:86
int char2score(const char *score)
Get the integer value of a score string.
Definition: scores.c:36
char * crm_strdup_printf(char const *format,...) G_GNUC_PRINTF(1
int compare_version(const char *version1, const char *version2)
Definition: utils.c:189
#define pcmk_is_set(g, f)
Convenience alias for pcmk_all_flags_set(), to check single flag.
Definition: util.h:121
A dumping ground.
#define CRM_OP_CLEAR_FAILCOUNT
Definition: crm.h:153
int pe_get_failcount(pe_node_t *node, pe_resource_t *rsc, time_t *last_failure, uint32_t flags, xmlNode *xml_op, pe_working_set_t *data_set)
Definition: failcounts.c:251
pe_action_t * pe__clear_failcount(pe_resource_t *rsc, pe_node_t *node, const char *reason, pe_working_set_t *data_set)
Schedule a controller operation to clear a fail count.
Definition: failcounts.c:360
#define crm_info(fmt, args...)
Definition: logging.h:362
#define CRM_XS
Definition: logging.h:55
#define CRM_LOG_ASSERT(expr)
Definition: logging.h:211
#define crm_notice(fmt, args...)
Definition: logging.h:361
#define CRM_CHECK(expr, failure_action)
Definition: logging.h:227
#define crm_debug(fmt, args...)
Definition: logging.h:364
#define ID(x)
Definition: msg_xml.h:468
#define XML_ATTR_CRM_VERSION
Definition: msg_xml.h:118
#define XML_BOOLEAN_TRUE
Definition: msg_xml.h:146
#define XML_ATTR_TE_NOWAIT
Definition: msg_xml.h:415
#define XML_LRM_ATTR_INTERVAL
Definition: msg_xml.h:294
#define XML_LRM_ATTR_TASK
Definition: msg_xml.h:300
#define XML_LRM_ATTR_RC
Definition: msg_xml.h:311
#define XML_LRM_ATTR_INTERVAL_MS
Definition: msg_xml.h:298
pe_working_set_t * data_set
const char * crm_element_value(const xmlNode *data, const char *name)
Retrieve the value of an XML attribute.
Definition: nvpair.c:517
int crm_element_value_int(const xmlNode *data, const char *name, int *dest)
Retrieve the integer value of an XML attribute.
Definition: nvpair.c:553
int crm_element_value_ms(const xmlNode *data, const char *name, guint *dest)
Retrieve the millisecond value of an XML attribute.
Definition: nvpair.c:610
#define pe_rsc_unique
Definition: pe_types.h:262
int pe__target_rc_from_xml(xmlNode *xml_op)
Definition: unpack.c:3813
pe_action_t * custom_action(pe_resource_t *rsc, char *key, const char *task, const pe_node_t *on_node, gboolean optional, gboolean foo, pe_working_set_t *data_set)
Create or update an action object.
Definition: pe_actions.c:940
#define pe_warn(fmt...)
Definition: internal.h:54
time_t get_effective_time(pe_working_set_t *data_set)
Definition: utils.c:421
char * clone_strip(const char *last_rsc_id)
Definition: unpack.c:1647
@ pe_fc_effective
Definition: internal.h:345
@ pe_fc_fillers
Definition: internal.h:346
void add_hash_param(GHashTable *hash, const char *name, const char *value)
Definition: common.c:504
@ pcmk_rc_ok
Definition: results.h:148
#define pcmk__plural_s(i)
int pcmk__scan_ll(const char *text, long long *result, long long default_value)
Definition: strings.c:97
@ pcmk__str_casei
char * uuid
Definition: pe_types.h:411
GHashTable * meta
Definition: pe_types.h:420
struct pe_node_shared_s * details
Definition: pe_types.h:252
GHashTable * attrs
Definition: pe_types.h:241
const char * uname
Definition: pe_types.h:216
int failure_timeout
Definition: pe_types.h:350
char * clone_name
Definition: pe_types.h:330
char * id
Definition: pe_types.h:329
xmlNode * xml
Definition: pe_types.h:331
unsigned long long flags
Definition: pe_types.h:355
GList * fillers
Definition: pe_types.h:388
xmlNode * input
Definition: pe_types.h:144
Wrappers for and extensions to libxml2.
xmlXPathObjectPtr xpath_search(xmlNode *xml_top, const char *path)
Definition: xpath.c:139
xmlNode * getXpathResult(xmlXPathObjectPtr xpathObj, int index)
Definition: xpath.c:58
void freeXpathObject(xmlXPathObjectPtr xpathObj)
Definition: xpath.c:39