1 /*        $NetBSD: clvmd-corosync.c,v 1.1.1.2 2009/12/02 00:27:02 haad Exp $    */
2 
3 /*
4  * Copyright (C) 2009 Red Hat, Inc. All rights reserved.
5  *
6  * This file is part of LVM2.
7  *
8  * This copyrighted material is made available to anyone wishing to use,
9  * modify, copy, or redistribute it subject to the terms and conditions
10  * of the GNU Lesser General Public License v.2.1.
11  *
12  * You should have received a copy of the GNU Lesser General Public License
13  * along with this program; if not, write to the Free Software Foundation,
14  * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
15  */
16 
17 /*
18  * This provides the interface between clvmd and corosync/DLM as the cluster
19  * and lock manager.
20  */
21 
22 #define _GNU_SOURCE
23 #define _FILE_OFFSET_BITS 64
24 
25 #include <configure.h>
26 #include <pthread.h>
27 #include <sys/types.h>
28 #include <sys/utsname.h>
29 #include <sys/ioctl.h>
30 #include <sys/socket.h>
31 #include <sys/stat.h>
32 #include <sys/file.h>
33 #include <sys/socket.h>
34 #include <netinet/in.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <stdint.h>
38 #include <signal.h>
39 #include <fcntl.h>
40 #include <string.h>
41 #include <stddef.h>
42 #include <stdint.h>
43 #include <unistd.h>
44 #include <errno.h>
45 #include <utmpx.h>
46 #include <syslog.h>
47 #include <assert.h>
48 #include <libdevmapper.h>
49 
50 #include <corosync/corotypes.h>
51 #include <corosync/cpg.h>
52 #include <corosync/quorum.h>
53 #include <corosync/confdb.h>
54 #include <libdlm.h>
55 
56 #include "locking.h"
57 #include "lvm-logging.h"
58 #include "clvm.h"
59 #include "clvmd-comms.h"
60 #include "lvm-functions.h"
61 #include "clvmd.h"
62 
/* Name of the DLM lockspace in which the clvmd LV and VG locks live */
64 #define LOCKSPACE_NAME "clvmd"
65 
66 static void corosync_cpg_deliver_callback (cpg_handle_t handle,
67                                           const struct cpg_name *groupName,
68                                           uint32_t nodeid,
69                                           uint32_t pid,
70                                           void *msg,
71                                           size_t msg_len);
72 static void corosync_cpg_confchg_callback(cpg_handle_t handle,
73                                          const struct cpg_name *groupName,
74                                          const struct cpg_address *member_list, size_t member_list_entries,
75                                          const struct cpg_address *left_list, size_t left_list_entries,
76                                          const struct cpg_address *joined_list, size_t joined_list_entries);
77 static void _cluster_closedown(void);
78 
79 /* Hash list of nodes in the cluster */
80 static struct dm_hash_table *node_hash;
81 
82 /* Number of active nodes */
83 static int num_nodes;
84 static unsigned int our_nodeid;
85 
86 static struct local_client *cluster_client;
87 
88 /* Corosync handles */
89 static cpg_handle_t cpg_handle;
90 static quorum_handle_t quorum_handle;
91 
92 /* DLM Handle */
93 static dlm_lshandle_t *lockspace;
94 
95 static struct cpg_name cpg_group_name;
96 
97 /* Corosync callback structs */
98 cpg_callbacks_t corosync_cpg_callbacks = {
99           .cpg_deliver_fn =            corosync_cpg_deliver_callback,
100           .cpg_confchg_fn =            corosync_cpg_confchg_callback,
101 };
102 
103 quorum_callbacks_t quorum_callbacks = {
104           .quorum_notify_fn = NULL,
105 };
106 
/*
 * Per-node record stored in node_hash.
 */
struct node_info {
	enum {
		NODE_UNKNOWN,
		NODE_DOWN,	/* node appeared in a CPG "left" list */
		NODE_UP,
		NODE_CLVMD	/* node is a member of the clvmd CPG group */
	} state;
	int nodeid;		/* also used as the binary hash key */
};
112 
113 
114 /* Set errno to something approximating the right value and return 0 or -1 */
cs_to_errno(cs_error_t err)115 static int cs_to_errno(cs_error_t err)
116 {
117           switch(err)
118           {
119           case CS_OK:
120                     return 0;
121         case CS_ERR_LIBRARY:
122                     errno = EINVAL;
123                     break;
124         case CS_ERR_VERSION:
125                     errno = EINVAL;
126                     break;
127         case CS_ERR_INIT:
128                     errno = EINVAL;
129                     break;
130         case CS_ERR_TIMEOUT:
131                     errno = ETIME;
132                     break;
133         case CS_ERR_TRY_AGAIN:
134                     errno = EAGAIN;
135                     break;
136         case CS_ERR_INVALID_PARAM:
137                     errno = EINVAL;
138                     break;
139         case CS_ERR_NO_MEMORY:
140                     errno = ENOMEM;
141                     break;
142         case CS_ERR_BAD_HANDLE:
143                     errno = EINVAL;
144                     break;
145         case CS_ERR_BUSY:
146                     errno = EBUSY;
147                     break;
148         case CS_ERR_ACCESS:
149                     errno = EPERM;
150                     break;
151         case CS_ERR_NOT_EXIST:
152                     errno = ENOENT;
153                     break;
154         case CS_ERR_NAME_TOO_LONG:
155                     errno = ENAMETOOLONG;
156                     break;
157         case CS_ERR_EXIST:
158                     errno = EEXIST;
159                     break;
160         case CS_ERR_NO_SPACE:
161                     errno = ENOSPC;
162                     break;
163         case CS_ERR_INTERRUPT:
164                     errno = EINTR;
165                     break;
166           case CS_ERR_NAME_NOT_FOUND:
167                     errno = ENOENT;
168                     break;
169         case CS_ERR_NO_RESOURCES:
170                     errno = ENOMEM;
171                     break;
172         case CS_ERR_NOT_SUPPORTED:
173                     errno = EOPNOTSUPP;
174                     break;
175         case CS_ERR_BAD_OPERATION:
176                     errno = EINVAL;
177                     break;
178         case CS_ERR_FAILED_OPERATION:
179                     errno = EIO;
180                     break;
181         case CS_ERR_MESSAGE_ERROR:
182                     errno = EIO;
183                     break;
184         case CS_ERR_QUEUE_FULL:
185                     errno = EXFULL;
186                     break;
187         case CS_ERR_QUEUE_NOT_AVAILABLE:
188                     errno = EINVAL;
189                     break;
190         case CS_ERR_BAD_FLAGS:
191                     errno = EINVAL;
192                     break;
193         case CS_ERR_TOO_BIG:
194                     errno = E2BIG;
195                     break;
196         case CS_ERR_NO_SECTIONS:
197                     errno = ENOMEM;
198                     break;
199           default:
200                     errno = EINVAL;
201                     break;
202           }
203           return -1;
204 }
205 
/*
 * Render a binary csid (the first sizeof(int) bytes are read as a node id)
 * as a signed decimal string.
 *
 * The result lives in a function-local static buffer, so it is overwritten
 * by the next call and must not be freed.
 */
static char *print_corosync_csid(const char *csid)
{
	static char text[128];
	int nodeid;

	memcpy(&nodeid, csid, sizeof nodeid);
	snprintf(text, sizeof text, "%d", nodeid);

	return text;
}
215 
/*
 * CPG delivery callback: invoked for each message multicast to the group.
 *
 * Every message starts with a COROSYNC_CSID_LEN-byte target node id
 * (0 means "all nodes"), followed by the clvmd payload.  Messages sent by
 * this node, or addressed to some other node, are ignored; everything else
 * is handed to process_message() with the sender's node id as its csid.
 *
 * Messages too short to hold the target id are dropped.  Without that
 * check the header copy would read past the message and the unsigned
 * payload length (msg_len - COROSYNC_CSID_LEN) would wrap around.
 */
static void corosync_cpg_deliver_callback(cpg_handle_t handle,
					  const struct cpg_name *groupName,
					  uint32_t nodeid,
					  uint32_t pid,
					  void *msg,
					  size_t msg_len)
{
	int target_nodeid = 0;

	if (!msg || msg_len < COROSYNC_CSID_LEN) {
		DEBUGLOG("%u ignoring short message (%zu bytes) from nodeid %u\n",
			 our_nodeid, msg_len, nodeid);
		return;
	}

	memcpy(&target_nodeid, msg, COROSYNC_CSID_LEN);

	DEBUGLOG("%u got message from nodeid %u for %d. len %zu\n",
		 our_nodeid, nodeid, target_nodeid,
		 msg_len - COROSYNC_CSID_LEN);

	/* Our own multicasts come back to us; nothing to do for those. */
	if (nodeid == our_nodeid)
		return;

	/* Only accept broadcasts (target 0) and messages addressed to us. */
	if (target_nodeid != 0 && (unsigned int)target_nodeid != our_nodeid)
		return;

	process_message(cluster_client, (char *)msg + COROSYNC_CSID_LEN,
			msg_len - COROSYNC_CSID_LEN, (char *)&nodeid);
}
235 
corosync_cpg_confchg_callback(cpg_handle_t handle,const struct cpg_name * groupName,const struct cpg_address * member_list,size_t member_list_entries,const struct cpg_address * left_list,size_t left_list_entries,const struct cpg_address * joined_list,size_t joined_list_entries)236 static void corosync_cpg_confchg_callback(cpg_handle_t handle,
237                                          const struct cpg_name *groupName,
238                                          const struct cpg_address *member_list, size_t member_list_entries,
239                                          const struct cpg_address *left_list, size_t left_list_entries,
240                                          const struct cpg_address *joined_list, size_t joined_list_entries)
241 {
242           int i;
243           struct node_info *ninfo;
244 
245           DEBUGLOG("confchg callback. %zd joined, %zd left, %zd members\n",
246                      joined_list_entries, left_list_entries, member_list_entries);
247 
248           for (i=0; i<joined_list_entries; i++) {
249                     ninfo = dm_hash_lookup_binary(node_hash,
250                                                         (char *)&joined_list[i].nodeid,
251                                                         COROSYNC_CSID_LEN);
252                     if (!ninfo) {
253                               ninfo = malloc(sizeof(struct node_info));
254                               if (!ninfo) {
255                                         break;
256                               }
257                               else {
258                                         ninfo->nodeid = joined_list[i].nodeid;
259                                         dm_hash_insert_binary(node_hash,
260                                                                   (char *)&ninfo->nodeid,
261                                                                   COROSYNC_CSID_LEN, ninfo);
262                               }
263                     }
264                     ninfo->state = NODE_CLVMD;
265           }
266 
267           for (i=0; i<left_list_entries; i++) {
268                     ninfo = dm_hash_lookup_binary(node_hash,
269                                                         (char *)&left_list[i].nodeid,
270                                                         COROSYNC_CSID_LEN);
271                     if (ninfo)
272                               ninfo->state = NODE_DOWN;
273           }
274 
275           for (i=0; i<member_list_entries; i++) {
276                     if (member_list[i].nodeid == 0) continue;
277                     ninfo = dm_hash_lookup_binary(node_hash,
278                                         (char *)&member_list[i].nodeid,
279                                         COROSYNC_CSID_LEN);
280                     if (!ninfo) {
281                               ninfo = malloc(sizeof(struct node_info));
282                               if (!ninfo) {
283                                         break;
284                               }
285                               else {
286                                         ninfo->nodeid = member_list[i].nodeid;
287                                         dm_hash_insert_binary(node_hash,
288                                                             (char *)&ninfo->nodeid,
289                                                             COROSYNC_CSID_LEN, ninfo);
290                               }
291                     }
292                     ninfo->state = NODE_CLVMD;
293           }
294 
295           num_nodes = member_list_entries;
296 }
297 
_init_cluster(void)298 static int _init_cluster(void)
299 {
300           cs_error_t err;
301 
302           node_hash = dm_hash_create(100);
303 
304           err = cpg_initialize(&cpg_handle,
305                                    &corosync_cpg_callbacks);
306           if (err != CS_OK) {
307                     syslog(LOG_ERR, "Cannot initialise Corosync CPG service: %d",
308                            err);
309                     DEBUGLOG("Cannot initialise Corosync CPG service: %d", err);
310                     return cs_to_errno(err);
311           }
312 
313           err = quorum_initialize(&quorum_handle,
314                                         &quorum_callbacks);
315           if (err != CS_OK) {
316                     syslog(LOG_ERR, "Cannot initialise Corosync quorum service: %d",
317                            err);
318                     DEBUGLOG("Cannot initialise Corosync quorum service: %d", err);
319                     return cs_to_errno(err);
320           }
321 
322 
323           /* Create a lockspace for LV & VG locks to live in */
324           lockspace = dlm_create_lockspace(LOCKSPACE_NAME, 0600);
325           if (!lockspace) {
326                     if (errno == EEXIST) {
327                               lockspace = dlm_open_lockspace(LOCKSPACE_NAME);
328                     }
329                     if (!lockspace) {
330                               syslog(LOG_ERR, "Unable to create lockspace for CLVM: %m");
331                               quorum_finalize(quorum_handle);
332                               return -1;
333                     }
334           }
335           dlm_ls_pthread_init(lockspace);
336           DEBUGLOG("DLM initialisation complete\n");
337 
338           /* Connect to the clvmd group */
339           strcpy((char *)cpg_group_name.value, "clvmd");
340           cpg_group_name.length = strlen((char *)cpg_group_name.value);
341           err = cpg_join(cpg_handle, &cpg_group_name);
342           if (err != CS_OK) {
343                     cpg_finalize(cpg_handle);
344                     quorum_finalize(quorum_handle);
345                     dlm_release_lockspace(LOCKSPACE_NAME, lockspace, 1);
346                     syslog(LOG_ERR, "Cannot join clvmd process group");
347                     DEBUGLOG("Cannot join clvmd process group: %d\n", err);
348                     return cs_to_errno(err);
349           }
350 
351           err = cpg_local_get(cpg_handle,
352                                   &our_nodeid);
353           if (err != CS_OK) {
354                     cpg_finalize(cpg_handle);
355                     quorum_finalize(quorum_handle);
356                     dlm_release_lockspace(LOCKSPACE_NAME, lockspace, 1);
357                     syslog(LOG_ERR, "Cannot get local node id\n");
358                     return cs_to_errno(err);
359           }
360           DEBUGLOG("Our local node id is %d\n", our_nodeid);
361 
362           DEBUGLOG("Connected to Corosync\n");
363 
364           return 0;
365 }
366 
_cluster_closedown(void)367 static void _cluster_closedown(void)
368 {
369           DEBUGLOG("cluster_closedown\n");
370           destroy_lvhash();
371 
372           dlm_release_lockspace(LOCKSPACE_NAME, lockspace, 1);
373           cpg_finalize(cpg_handle);
374           quorum_finalize(quorum_handle);
375 }
376 
_get_our_csid(char * csid)377 static void _get_our_csid(char *csid)
378 {
379           memcpy(csid, &our_nodeid, sizeof(int));
380 }
381 
382 /* Corosync doesn't really have nmode names so we
383    just use the node ID in hex instead */
_csid_from_name(char * csid,const char * name)384 static int _csid_from_name(char *csid, const char *name)
385 {
386           int nodeid;
387           struct node_info *ninfo;
388 
389           if (sscanf(name, "%x", &nodeid) == 1) {
390                     ninfo = dm_hash_lookup_binary(node_hash, csid, COROSYNC_CSID_LEN);
391                     if (ninfo)
392                               return nodeid;
393           }
394           return -1;
395 }
396 
_name_from_csid(const char * csid,char * name)397 static int _name_from_csid(const char *csid, char *name)
398 {
399           struct node_info *ninfo;
400 
401           ninfo = dm_hash_lookup_binary(node_hash, csid, COROSYNC_CSID_LEN);
402           if (!ninfo)
403           {
404                     sprintf(name, "UNKNOWN %s", print_corosync_csid(csid));
405                     return -1;
406           }
407 
408           sprintf(name, "%x", ninfo->nodeid);
409           return 0;
410 }
411 
_get_num_nodes()412 static int _get_num_nodes()
413 {
414           DEBUGLOG("num_nodes = %d\n", num_nodes);
415           return num_nodes;
416 }
417 
418 /* Node is now known to be running a clvmd */
_add_up_node(const char * csid)419 static void _add_up_node(const char *csid)
420 {
421           struct node_info *ninfo;
422 
423           ninfo = dm_hash_lookup_binary(node_hash, csid, COROSYNC_CSID_LEN);
424           if (!ninfo) {
425                     DEBUGLOG("corosync_add_up_node no node_hash entry for csid %s\n",
426                                print_corosync_csid(csid));
427                     return;
428           }
429 
430           DEBUGLOG("corosync_add_up_node %d\n", ninfo->nodeid);
431 
432           ninfo->state = NODE_CLVMD;
433 
434           return;
435 }
436 
437 /* Call a callback for each node, so the caller knows whether it's up or down */
_cluster_do_node_callback(struct local_client * master_client,void (* callback)(struct local_client *,const char * csid,int node_up))438 static int _cluster_do_node_callback(struct local_client *master_client,
439                                              void (*callback)(struct local_client *,
440                                                                   const char *csid, int node_up))
441 {
442           struct dm_hash_node *hn;
443           struct node_info *ninfo;
444           int somedown = 0;
445 
446           dm_hash_iterate(hn, node_hash)
447           {
448                     char csid[COROSYNC_CSID_LEN];
449 
450                     ninfo = dm_hash_get_data(node_hash, hn);
451                     memcpy(csid, dm_hash_get_key(node_hash, hn), COROSYNC_CSID_LEN);
452 
453                     DEBUGLOG("down_callback. node %d, state = %d\n", ninfo->nodeid,
454                                ninfo->state);
455 
456                     if (ninfo->state != NODE_DOWN)
457                               callback(master_client, csid, ninfo->state == NODE_CLVMD);
458                     if (ninfo->state != NODE_CLVMD)
459                               somedown = -1;
460           }
461           return somedown;
462 }
463 
464 /* Real locking */
_lock_resource(const char * resource,int mode,int flags,int * lockid)465 static int _lock_resource(const char *resource, int mode, int flags, int *lockid)
466 {
467           struct dlm_lksb lksb;
468           int err;
469 
470           DEBUGLOG("lock_resource '%s', flags=%d, mode=%d\n", resource, flags, mode);
471 
472           if (flags & LKF_CONVERT)
473                     lksb.sb_lkid = *lockid;
474 
475           err = dlm_ls_lock_wait(lockspace,
476                                      mode,
477                                      &lksb,
478                                      flags,
479                                      resource,
480                                      strlen(resource),
481                                      0,
482                                      NULL, NULL, NULL);
483 
484           if (err != 0)
485           {
486                     DEBUGLOG("dlm_ls_lock returned %d\n", errno);
487                     return err;
488           }
489           if (lksb.sb_status != 0)
490           {
491                     DEBUGLOG("dlm_ls_lock returns lksb.sb_status %d\n", lksb.sb_status);
492                     errno = lksb.sb_status;
493                     return -1;
494           }
495 
496           DEBUGLOG("lock_resource returning %d, lock_id=%x\n", err, lksb.sb_lkid);
497 
498           *lockid = lksb.sb_lkid;
499 
500           return 0;
501 }
502 
503 
_unlock_resource(const char * resource,int lockid)504 static int _unlock_resource(const char *resource, int lockid)
505 {
506           struct dlm_lksb lksb;
507           int err;
508 
509           DEBUGLOG("unlock_resource: %s lockid: %x\n", resource, lockid);
510           lksb.sb_lkid = lockid;
511 
512           err = dlm_ls_unlock_wait(lockspace,
513                                          lockid,
514                                          0,
515                                          &lksb);
516           if (err != 0)
517           {
518                     DEBUGLOG("Unlock returned %d\n", err);
519                     return err;
520           }
521           if (lksb.sb_status != EUNLOCK)
522           {
523                     DEBUGLOG("dlm_ls_unlock_wait returns lksb.sb_status: %d\n", lksb.sb_status);
524                     errno = lksb.sb_status;
525                     return -1;
526           }
527 
528 
529           return 0;
530 }
531 
_is_quorate()532 static int _is_quorate()
533 {
534           int quorate;
535           if (quorum_getquorate(quorum_handle, &quorate) == CS_OK)
536                     return quorate;
537           else
538                     return 0;
539 }
540 
_get_main_cluster_fd(void)541 static int _get_main_cluster_fd(void)
542 {
543           int select_fd;
544 
545           cpg_fd_get(cpg_handle, &select_fd);
546           return select_fd;
547 }
548 
_cluster_fd_callback(struct local_client * fd,char * buf,int len,const char * csid,struct local_client ** new_client)549 static int _cluster_fd_callback(struct local_client *fd, char *buf, int len,
550                                         const char *csid,
551                                         struct local_client **new_client)
552 {
553           cluster_client = fd;
554           *new_client = NULL;
555           cpg_dispatch(cpg_handle, CS_DISPATCH_ONE);
556           return 1;
557 }
558 
_cluster_send_message(const void * buf,int msglen,const char * csid,const char * errtext)559 static int _cluster_send_message(const void *buf, int msglen, const char *csid,
560                                          const char *errtext)
561 {
562           struct iovec iov[2];
563           cs_error_t err;
564           int target_node;
565 
566           if (csid)
567                     memcpy(&target_node, csid, COROSYNC_CSID_LEN);
568           else
569                     target_node = 0;
570 
571           iov[0].iov_base = &target_node;
572           iov[0].iov_len = sizeof(int);
573           iov[1].iov_base = (char *)buf;
574           iov[1].iov_len = msglen;
575 
576           err = cpg_mcast_joined(cpg_handle, CPG_TYPE_AGREED, iov, 2);
577           return cs_to_errno(err);
578 }
579 
580 /*
581  * We are not necessarily connected to a Red Hat Cluster system,
582  * but if we are, this returns the cluster name from cluster.conf.
583  * I've used confdb rather than ccs to reduce the inter-package
584  * dependancies as well as to allow people to set a cluster name
585  * for themselves even if they are not running on RH cluster.
586  */
_get_cluster_name(char * buf,int buflen)587 static int _get_cluster_name(char *buf, int buflen)
588 {
589           confdb_handle_t handle;
590           int result;
591           size_t namelen = buflen;
592           hdb_handle_t cluster_handle;
593           confdb_callbacks_t callbacks = {
594                     .confdb_key_change_notify_fn = NULL,
595                     .confdb_object_create_change_notify_fn = NULL,
596                     .confdb_object_delete_change_notify_fn = NULL
597           };
598 
599           /* This is a default in case everything else fails */
600           strncpy(buf, "Corosync", buflen);
601 
602           /* Look for a cluster name in confdb */
603           result = confdb_initialize (&handle, &callbacks);
604         if (result != CS_OK)
605                     return 0;
606 
607         result = confdb_object_find_start(handle, OBJECT_PARENT_HANDLE);
608           if (result != CS_OK)
609                     goto out;
610 
611         result = confdb_object_find(handle, OBJECT_PARENT_HANDLE, (void *)"cluster", strlen("cluster"), &cluster_handle);
612         if (result != CS_OK)
613                     goto out;
614 
615         result = confdb_key_get(handle, cluster_handle, (void *)"name", strlen("name"), buf, &namelen);
616         if (result != CS_OK)
617                     goto out;
618 
619           buf[namelen] = '\0';
620 
621 out:
622           confdb_finalize(handle);
623           return 0;
624 }
625 
626 static struct cluster_ops _cluster_corosync_ops = {
627           .cluster_init_completed   = NULL,
628           .cluster_send_message     = _cluster_send_message,
629           .name_from_csid           = _name_from_csid,
630           .csid_from_name           = _csid_from_name,
631           .get_num_nodes            = _get_num_nodes,
632           .cluster_fd_callback      = _cluster_fd_callback,
633           .get_main_cluster_fd      = _get_main_cluster_fd,
634           .cluster_do_node_callback = _cluster_do_node_callback,
635           .is_quorate               = _is_quorate,
636           .get_our_csid             = _get_our_csid,
637           .add_up_node              = _add_up_node,
638           .reread_config            = NULL,
639           .cluster_closedown        = _cluster_closedown,
640           .get_cluster_name         = _get_cluster_name,
641           .sync_lock                = _lock_resource,
642           .sync_unlock              = _unlock_resource,
643 };
644 
init_corosync_cluster(void)645 struct cluster_ops *init_corosync_cluster(void)
646 {
647           if (!_init_cluster())
648                     return &_cluster_corosync_ops;
649           else
650                     return NULL;
651 }
652