pacemaker 2.1.5-a3f44794f94
Scalable High-Availability cluster resource manager
watchdog.c
Go to the documentation of this file.
1/*
2 * Copyright 2013-2020 the Pacemaker project contributors
3 *
4 * The version control history for this file may have further details.
5 *
6 * This source code is licensed under the GNU Lesser General Public License
7 * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
8 */
9
10#include <crm_internal.h>
11
12#include <sched.h>
13#include <sys/ioctl.h>
14#include <sys/reboot.h>
15
16#include <sys/types.h>
17#include <sys/stat.h>
18#include <unistd.h>
19#include <ctype.h>
20#include <dirent.h>
21#include <signal.h>
22
23#ifdef _POSIX_MEMLOCK
24# include <sys/mman.h>
25#endif
26
/* Cached PID of the sbd daemon; 0 until pcmk__locate_sbd() has found one */
static pid_t sbd_pid;
28
/*!
 * \internal
 * \brief Write a single command character to /proc/sysrq-trigger
 *
 * This is best-effort: any failure is logged as a warning and the function
 * returns normally so the caller can fall back to another mechanism. When
 * HAVE_LINUX_PROCFS is not set, this function does nothing.
 *
 * \param[in] t  SysRq command character to write (its effect is defined by
 *               the kernel, not by this function)
 */
static void
sysrq_trigger(char t)
{
#if HAVE_LINUX_PROCFS
    FILE *procf;

    // Root can always write here, regardless of kernel.sysrq value
    procf = fopen("/proc/sysrq-trigger", "a");
    if (!procf) {
        crm_perror(LOG_WARNING, "Opening sysrq-trigger failed");
        return;
    }
    crm_info("sysrq-trigger: %c", t);

    /* stdio buffers the write, so a failure may surface only when the stream
     * is flushed by fclose(); check both calls rather than failing silently.
     */
    if (fprintf(procf, "%c\n", t) < 0) {
        crm_perror(LOG_WARNING, "Writing to sysrq-trigger failed");
    }
    if (fclose(procf) != 0) {
        crm_perror(LOG_WARNING, "Closing sysrq-trigger failed");
    }
#endif // HAVE_LINUX_PROCFS
    return;
}
47
48
53static void
54panic_local(void)
55{
56 int rc = pcmk_ok;
57 uid_t uid = geteuid();
58 pid_t ppid = getppid();
59
60 if(uid != 0 && ppid > 1) {
61 /* We're a non-root pacemaker daemon (pacemaker-based,
62 * pacemaker-controld, pacemaker-schedulerd, pacemaker-attrd, etc.) with
63 * the original pacemakerd parent.
64 *
65 * Of these, only the controller is likely to be initiating resets.
66 */
67 crm_emerg("Signaling parent %lld to panic", (long long) ppid);
69 return;
70
71 } else if (uid != 0) {
72#if HAVE_LINUX_PROCFS
73 /*
74 * No permissions, and no pacemakerd parent to escalate to.
75 * Track down the new pacemakerd process and send a signal instead.
76 */
77 union sigval signal_value;
78
79 memset(&signal_value, 0, sizeof(signal_value));
80 ppid = pcmk__procfs_pid_of("pacemakerd");
81 crm_emerg("Signaling pacemakerd[%lld] to panic", (long long) ppid);
82
83 if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
84 crm_perror(LOG_EMERG, "Cannot signal pacemakerd[%lld] to panic",
85 (long long) ppid);
86 }
87#endif // HAVE_LINUX_PROCFS
88
89 /* The best we can do now is die */
91 return;
92 }
93
94 /* We're either pacemakerd, or a pacemaker daemon running as root */
95
96 if (pcmk__str_eq("crash", getenv("PCMK_panic_action"), pcmk__str_casei)) {
97 sysrq_trigger('c');
98 } else if (pcmk__str_eq("sync-crash", getenv("PCMK_panic_action"), pcmk__str_casei)) {
99 sync();
100 sysrq_trigger('c');
101 } else {
102 if (pcmk__str_eq("sync-reboot", getenv("PCMK_panic_action"), pcmk__str_casei)) {
103 sync();
104 }
105 sysrq_trigger('b');
106 }
107 /* reboot(RB_HALT_SYSTEM); rc = errno; */
108 reboot(RB_AUTOBOOT);
109 rc = errno;
110
111 crm_emerg("Reboot failed, escalating to parent %lld: %s " CRM_XS " rc=%d",
112 (long long) ppid, pcmk_rc_str(rc), rc);
113
114 if(ppid > 1) {
115 /* child daemon */
116 exit(CRM_EX_PANIC);
117 } else {
118 /* pacemakerd or orphan child */
119 exit(CRM_EX_FATAL);
120 }
121}
122
127static void
128panic_sbd(void)
129{
130 union sigval signal_value;
131 pid_t ppid = getppid();
132
133 crm_emerg("Signaling sbd[%lld] to panic", (long long) sbd_pid);
134
135 memset(&signal_value, 0, sizeof(signal_value));
136 /* TODO: Arrange for a slightly less brutal option? */
137 if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
138 crm_perror(LOG_EMERG, "Cannot signal sbd[%lld] to terminate",
139 (long long) sbd_pid);
140 panic_local();
141 }
142
143 if(ppid > 1) {
144 /* child daemon */
145 exit(CRM_EX_PANIC);
146 } else {
147 /* pacemakerd or orphan child */
148 exit(CRM_EX_FATAL);
149 }
150}
151
161void
162pcmk__panic(const char *origin)
163{
164 static struct qb_log_callsite *panic_cs = NULL;
165
166 if (panic_cs == NULL) {
167 panic_cs = qb_log_callsite_get(__func__, __FILE__, "panic-delay",
168 LOG_TRACE, __LINE__, crm_trace_nonlog);
169 }
170
171 /* Ensure sbd_pid is set */
172 (void) pcmk__locate_sbd();
173
174 if (panic_cs && panic_cs->targets) {
175 /* getppid() == 1 means our original parent no longer exists */
176 crm_emerg("Shutting down instead of panicking the node "
177 CRM_XS " origin=%s sbd=%lld parent=%d",
178 origin, (long long) sbd_pid, getppid());
180 return;
181 }
182
183 if(sbd_pid > 1) {
184 crm_emerg("Signaling sbd[%lld] to panic the system: %s",
185 (long long) sbd_pid, origin);
186 panic_sbd();
187
188 } else {
189 crm_emerg("Panicking the system directly: %s", origin);
190 panic_local();
191 }
192}
193
198pid_t
200{
201 char *pidfile = NULL;
202 char *sbd_path = NULL;
203 int rc;
204
205 if(sbd_pid > 1) {
206 return sbd_pid;
207 }
208
209 /* Look for the pid file */
210 pidfile = crm_strdup_printf(PCMK_RUN_DIR "/sbd.pid");
211 sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
212
213 /* Read the pid file */
214 rc = pcmk__pidfile_matches(pidfile, 0, sbd_path, &sbd_pid);
215 if (rc == pcmk_rc_ok) {
216 crm_trace("SBD detected at pid %lld (via PID file %s)",
217 (long long) sbd_pid, pidfile);
218
219#if HAVE_LINUX_PROCFS
220 } else {
221 /* Fall back to /proc for systems that support it */
222 sbd_pid = pcmk__procfs_pid_of("sbd");
223 crm_trace("SBD detected at pid %lld (via procfs)",
224 (long long) sbd_pid);
225#endif // HAVE_LINUX_PROCFS
226 }
227
228 if(sbd_pid < 0) {
229 sbd_pid = 0;
230 crm_trace("SBD not detected");
231 }
232
233 free(pidfile);
234 free(sbd_path);
235
236 return sbd_pid;
237}
238
/*!
 * \internal
 * \brief Get the sbd watchdog timeout from the environment
 *
 * The SBD_WATCHDOG_TIMEOUT environment variable is parsed with
 * crm_get_msec() on the first call only; later calls return the cached
 * value.
 *
 * \return Result of parsing SBD_WATCHDOG_TIMEOUT, in milliseconds
 * \note crm_get_msec() returns long long; the value is stored in a long.
 */
long
pcmk__get_sbd_timeout(void)
{
    // -2 marks "not parsed yet"
    // NOTE(review): assumes crm_get_msec() never returns -2 itself -- confirm
    static long sbd_timeout = -2;

    if (sbd_timeout == -2) {
        sbd_timeout = crm_get_msec(getenv("SBD_WATCHDOG_TIMEOUT"));
    }
    return sbd_timeout;
}
249
250bool
252{
253 static int sync_resource_startup = PCMK__SBD_SYNC_DEFAULT;
254 static bool checked_sync_resource_startup = false;
255
256 if (!checked_sync_resource_startup) {
257 const char *sync_env = getenv("SBD_SYNC_RESOURCE_STARTUP");
258
259 if (sync_env == NULL) {
260 crm_trace("Defaulting to %sstart-up synchronization with sbd",
261 (PCMK__SBD_SYNC_DEFAULT? "" : "no "));
262
263 } else if (crm_str_to_boolean(sync_env, &sync_resource_startup) < 0) {
264 crm_warn("Defaulting to %sstart-up synchronization with sbd "
265 "because environment value '%s' is invalid",
266 (PCMK__SBD_SYNC_DEFAULT? "" : "no "), sync_env);
267 }
268 checked_sync_resource_startup = true;
269 }
270 return sync_resource_startup != 0;
271}
272
/*!
 * \internal
 * \brief Calculate a stonith-watchdog-timeout from the sbd watchdog timeout
 *
 * \return Twice the value of pcmk__get_sbd_timeout(), or 0 if that value is
 *         not positive
 */
long
pcmk__auto_watchdog_timeout(void)
{
    long sbd_timeout = pcmk__get_sbd_timeout();

    return (sbd_timeout <= 0)? 0 : (2 * sbd_timeout);
}
280
281bool
282pcmk__valid_sbd_timeout(const char *value)
283{
284 long st_timeout = value? crm_get_msec(value) : 0;
285
286 if (st_timeout < 0) {
287 st_timeout = pcmk__auto_watchdog_timeout();
288 crm_debug("Using calculated value %ld for stonith-watchdog-timeout (%s)",
289 st_timeout, value);
290 }
291
292 if (st_timeout == 0) {
293 crm_debug("Watchdog may be enabled but stonith-watchdog-timeout is disabled (%s)",
294 value? value : "default");
295
296 } else if (pcmk__locate_sbd() == 0) {
297 crm_emerg("Shutting down: stonith-watchdog-timeout configured (%s) "
298 "but SBD not active", (value? value : "auto"));
300 return false;
301
302 } else {
303 long sbd_timeout = pcmk__get_sbd_timeout();
304
305 if (st_timeout < sbd_timeout) {
306 crm_emerg("Shutting down: stonith-watchdog-timeout (%s) too short "
307 "(must be >%ldms)", value, sbd_timeout);
309 return false;
310 }
311 crm_info("Watchdog configured with stonith-watchdog-timeout %s and SBD timeout %ldms",
312 value, sbd_timeout);
313 }
314 return true;
315}
int pcmk__pidfile_matches(const char *filename, pid_t expected_pid, const char *expected_name, pid_t *pid)
Definition: pid.c:165
pid_t pcmk__procfs_pid_of(const char *name)
Definition: procfs.c:111
long long crm_get_msec(const char *input)
Parse a time+units string and return milliseconds equivalent.
Definition: strings.c:364
char * crm_strdup_printf(char const *format,...) G_GNUC_PRINTF(1, 2)
int crm_str_to_boolean(const char *s, int *ret)
Definition: strings.c:427
#define SBIN_DIR
Definition: config.h:556
#define PCMK__SBD_SYNC_DEFAULT
Definition: config.h:550
#define PCMK_RUN_DIR
Definition: config.h:529
#define crm_info(fmt, args...)
Definition: logging.h:362
#define crm_warn(fmt, args...)
Definition: logging.h:360
#define CRM_XS
Definition: logging.h:55
unsigned int crm_trace_nonlog
Definition: logging.c:46
#define crm_perror(level, fmt, args...)
Send a system error message to both the log and stderr.
Definition: logging.h:310
#define crm_debug(fmt, args...)
Definition: logging.h:364
#define crm_trace(fmt, args...)
Definition: logging.h:365
#define LOG_TRACE
Definition: logging.h:37
#define crm_emerg(fmt, args...)
Definition: logging.h:357
const char * pcmk_rc_str(int rc)
Get a user-friendly description of a return code.
Definition: results.c:476
@ CRM_EX_PANIC
Panic the local host.
Definition: results.h:266
@ CRM_EX_FATAL
Do not respawn.
Definition: results.h:265
_Noreturn crm_exit_t crm_exit(crm_exit_t rc)
Definition: results.c:856
@ pcmk_rc_ok
Definition: results.h:148
#define pcmk_ok
Definition: results.h:68
@ pcmk__str_casei
pid_t pcmk__locate_sbd(void)
Definition: watchdog.c:199
long pcmk__get_sbd_timeout(void)
Definition: watchdog.c:240
bool pcmk__get_sbd_sync_resource_startup(void)
Definition: watchdog.c:251
bool pcmk__valid_sbd_timeout(const char *value)
Definition: watchdog.c:282
long pcmk__auto_watchdog_timeout(void)
Definition: watchdog.c:274
void pcmk__panic(const char *origin)
Definition: watchdog.c:162