blob: 280cf2bdbeff5b7478ff33cad21aaad68d2b7c3f [file] [log] [blame]
/*
* Copyright (C) 2014-2015 Red Hat, Inc.
*
* This file is part of LVM2.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
* of the GNU Lesser General Public License v.2.1.
*/
#define _XOPEN_SOURCE 500 /* pthread */
#define _ISOC99_SOURCE
#include "tool.h"
#include "daemon-server.h"
#include "xlate.h"
#include "lvmlockd-internal.h"
#include "lvmlockd-client.h"
/*
* Using synchronous _wait dlm apis so do not define _REENTRANT and
* link with non-threaded version of library, libdlm_lt.
*/
#include "libdlm.h"
#include <stddef.h>
#include <poll.h>
#include <errno.h>
#include <endian.h>
#include <fcntl.h>
#include <byteswap.h>
#include <syslog.h>
#include <dirent.h>
struct lm_dlm {
dlm_lshandle_t *dh;
};
struct rd_dlm {
struct dlm_lksb lksb;
struct val_blk *vb;
};
int lm_data_size_dlm(void)
{
return sizeof(struct rd_dlm);
}
/*
* lock_args format
*
* vg_lock_args format for dlm is
* vg_version_string:undefined:cluster_name
*
* lv_lock_args are not used for dlm
*
* version_string is MAJOR.MINOR.PATCH
* undefined may contain ":"
*/
#define VG_LOCK_ARGS_MAJOR 1
#define VG_LOCK_ARGS_MINOR 0
#define VG_LOCK_ARGS_PATCH 0
static int dlm_has_lvb_bug;
static int cluster_name_from_args(char *vg_args, char *clustername)
{
return last_string_from_args(vg_args, clustername);
}
static int check_args_version(char *vg_args)
{
unsigned int major = 0;
int rv;
rv = version_from_args(vg_args, &major, NULL, NULL);
if (rv < 0) {
log_error("check_args_version %s error %d", vg_args, rv);
return rv;
}
if (major > VG_LOCK_ARGS_MAJOR) {
log_error("check_args_version %s major %d %d", vg_args, major, VG_LOCK_ARGS_MAJOR);
return -1;
}
return 0;
}
/* This will be set after dlm_controld is started. */
#define DLM_CLUSTER_NAME_PATH "/sys/kernel/config/dlm/cluster/cluster_name"
static int read_cluster_name(char *clustername)
{
static const char close_error_msg[] = "read_cluster_name: close_error %d";
char *n;
int fd;
int rv;
if (daemon_test) {
sprintf(clustername, "%s", "test");
return 0;
}
fd = open(DLM_CLUSTER_NAME_PATH, O_RDONLY);
if (fd < 0) {
log_debug("read_cluster_name: open error %d, check dlm_controld", fd);
return fd;
}
rv = read(fd, clustername, MAX_ARGS);
if (rv < 0) {
log_error("read_cluster_name: cluster name read error %d, check dlm_controld", fd);
if (close(fd))
log_error(close_error_msg, fd);
return rv;
}
n = strstr(clustername, "\n");
if (n)
*n = '\0';
if (close(fd))
log_error(close_error_msg, fd);
return 0;
}
int lm_init_vg_dlm(char *ls_name, char *vg_name, uint32_t flags, char *vg_args)
{
char clustername[MAX_ARGS+1];
char lock_args_version[MAX_ARGS+1];
int rv;
memset(clustername, 0, sizeof(clustername));
memset(lock_args_version, 0, sizeof(lock_args_version));
snprintf(lock_args_version, MAX_ARGS, "%u.%u.%u",
VG_LOCK_ARGS_MAJOR, VG_LOCK_ARGS_MINOR, VG_LOCK_ARGS_PATCH);
rv = read_cluster_name(clustername);
if (rv < 0)
return -EMANAGER;
if (strlen(clustername) + strlen(lock_args_version) + 2 > MAX_ARGS) {
log_error("init_vg_dlm args too long");
return -EARGS;
}
snprintf(vg_args, MAX_ARGS, "%s:%s", lock_args_version, clustername);
rv = 0;
log_debug("init_vg_dlm done %s vg_args %s", ls_name, vg_args);
return rv;
}
int lm_prepare_lockspace_dlm(struct lockspace *ls)
{
char sys_clustername[MAX_ARGS+1];
char arg_clustername[MAX_ARGS+1];
uint32_t major = 0, minor = 0, patch = 0;
struct lm_dlm *lmd;
int rv;
if (daemon_test)
goto skip_args;
memset(sys_clustername, 0, sizeof(sys_clustername));
memset(arg_clustername, 0, sizeof(arg_clustername));
rv = read_cluster_name(sys_clustername);
if (rv < 0)
return -EMANAGER;
rv = dlm_kernel_version(&major, &minor, &patch);
if (rv < 0) {
log_error("prepare_lockspace_dlm kernel_version not detected %d", rv);
dlm_has_lvb_bug = 1;
}
if ((major == 6) && (minor == 0) && (patch == 1)) {
log_debug("dlm kernel version %u.%u.%u has lvb bug", major, minor, patch);
dlm_has_lvb_bug = 1;
}
if (!ls->vg_args[0]) {
/* global lockspace has no vg args */
goto skip_args;
}
rv = check_args_version(ls->vg_args);
if (rv < 0)
return -EARGS;
rv = cluster_name_from_args(ls->vg_args, arg_clustername);
if (rv < 0) {
log_error("prepare_lockspace_dlm %s no cluster name from args %s", ls->name, ls->vg_args);
return -EARGS;
}
if (strcmp(sys_clustername, arg_clustername)) {
log_error("prepare_lockspace_dlm %s mismatching cluster names sys %s arg %s",
ls->name, sys_clustername, arg_clustername);
return -EARGS;
}
skip_args:
lmd = malloc(sizeof(struct lm_dlm));
if (!lmd)
return -ENOMEM;
ls->lm_data = lmd;
return 0;
}
int lm_add_lockspace_dlm(struct lockspace *ls, int adopt)
{
struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data;
if (daemon_test)
return 0;
if (adopt)
lmd->dh = dlm_open_lockspace(ls->name);
else
lmd->dh = dlm_new_lockspace(ls->name, 0600, DLM_LSFL_NEWEXCL);
if (!lmd->dh) {
log_error("add_lockspace_dlm %s adopt %d error", ls->name, adopt);
free(lmd);
ls->lm_data = NULL;
return -1;
}
return 0;
}
int lm_rem_lockspace_dlm(struct lockspace *ls, int free_vg)
{
struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data;
int rv;
if (daemon_test)
goto out;
/*
* If free_vg is set, it means we are doing vgremove, and we may want
* to tell any other nodes to leave the lockspace. This is not really
* necessary since there should be no harm in having an unused
* lockspace sitting around. A new "notification lock" would need to
* be added with a callback to signal this.
*/
rv = dlm_release_lockspace(ls->name, lmd->dh, 1);
if (rv < 0) {
log_error("rem_lockspace_dlm error %d", rv);
return rv;
}
out:
free(lmd);
ls->lm_data = NULL;
return 0;
}
static int lm_add_resource_dlm(struct lockspace *ls, struct resource *r, int with_lock_nl)
{
struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data;
struct rd_dlm *rdd = (struct rd_dlm *)r->lm_data;
uint32_t flags = 0;
char *buf;
int rv;
if (r->type == LD_RT_GL || r->type == LD_RT_VG) {
buf = malloc(sizeof(struct val_blk) + DLM_LVB_LEN);
if (!buf)
return -ENOMEM;
memset(buf, 0, sizeof(struct val_blk) + DLM_LVB_LEN);
rdd->vb = (struct val_blk *)buf;
rdd->lksb.sb_lvbptr = buf + sizeof(struct val_blk);
flags |= LKF_VALBLK;
}
if (!with_lock_nl)
goto out;
/* because this is a new NL lock request */
flags |= LKF_EXPEDITE;
if (daemon_test)
goto out;
rv = dlm_ls_lock_wait(lmd->dh, LKM_NLMODE, &rdd->lksb, flags,
r->name, strlen(r->name),
0, NULL, NULL, NULL);
if (rv < 0) {
log_error("S %s R %s add_resource_dlm lock error %d", ls->name, r->name, rv);
return rv;
}
out:
return 0;
}
int lm_rem_resource_dlm(struct lockspace *ls, struct resource *r)
{
struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data;
struct rd_dlm *rdd = (struct rd_dlm *)r->lm_data;
struct dlm_lksb *lksb;
int rv = 0;
if (daemon_test)
goto out;
lksb = &rdd->lksb;
if (!lksb->sb_lkid)
goto out;
rv = dlm_ls_unlock_wait(lmd->dh, lksb->sb_lkid, 0, lksb);
if (rv < 0) {
log_error("S %s R %s rem_resource_dlm unlock error %d", ls->name, r->name, rv);
}
out:
if (rdd->vb)
free(rdd->vb);
memset(rdd, 0, sizeof(struct rd_dlm));
r->lm_init = 0;
return rv;
}
static int to_dlm_mode(int ld_mode)
{
switch (ld_mode) {
case LD_LK_EX:
return LKM_EXMODE;
case LD_LK_SH:
return LKM_PRMODE;
};
return -1;
}
static int lm_adopt_dlm(struct lockspace *ls, struct resource *r, int ld_mode,
struct val_blk *vb_out)
{
struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data;
struct rd_dlm *rdd = (struct rd_dlm *)r->lm_data;
struct dlm_lksb *lksb;
uint32_t flags = 0;
int mode;
int rv;
memset(vb_out, 0, sizeof(struct val_blk));
if (!r->lm_init) {
rv = lm_add_resource_dlm(ls, r, 0);
if (rv < 0)
return rv;
r->lm_init = 1;
}
lksb = &rdd->lksb;
flags |= LKF_PERSISTENT;
flags |= LKF_ORPHAN;
if (rdd->vb)
flags |= LKF_VALBLK;
mode = to_dlm_mode(ld_mode);
if (mode < 0) {
log_error("adopt_dlm invalid mode %d", ld_mode);
rv = -EINVAL;
goto fail;
}
log_debug("S %s R %s adopt_dlm", ls->name, r->name);
if (daemon_test)
return 0;
/*
* dlm returns 0 for success, -EAGAIN if an orphan is
* found with another mode, and -ENOENT if no orphan.
*
* cast/bast/param are (void *)1 because the kernel
* returns errors if some are null.
*/
rv = dlm_ls_lockx(lmd->dh, mode, lksb, flags,
r->name, strlen(r->name), 0,
(void *)1, (void *)1, (void *)1,
NULL, NULL);
if (rv == -1 && errno == -EAGAIN) {
log_debug("S %s R %s adopt_dlm adopt mode %d try other mode",
ls->name, r->name, ld_mode);
rv = -EUCLEAN;
goto fail;
}
if (rv < 0) {
log_debug("S %s R %s adopt_dlm mode %d flags %x error %d errno %d",
ls->name, r->name, mode, flags, rv, errno);
goto fail;
}
/*
* FIXME: For GL/VG locks we probably want to read the lvb,
* especially if adopting an ex lock, because when we
* release this adopted ex lock we may want to write new
* lvb values based on the current lvb values (at lease
* in the GL case where we increment the current values.)
*
* It should be possible to read the lvb by requesting
* this lock in the same mode it's already in.
*/
return rv;
fail:
lm_rem_resource_dlm(ls, r);
return rv;
}
/*
* Use PERSISTENT so that if lvmlockd exits while holding locks,
* the locks will remain orphaned in the dlm, still protecting what
* they were acquired to protect.
*/
int lm_lock_dlm(struct lockspace *ls, struct resource *r, int ld_mode,
struct val_blk *vb_out, int adopt)
{
struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data;
struct rd_dlm *rdd = (struct rd_dlm *)r->lm_data;
struct dlm_lksb *lksb;
struct val_blk vb;
uint32_t flags = 0;
int mode;
int rv;
if (adopt) {
/* When adopting, we don't follow the normal method
of acquiring a NL lock then converting it to the
desired mode. */
return lm_adopt_dlm(ls, r, ld_mode, vb_out);
}
if (!r->lm_init) {
rv = lm_add_resource_dlm(ls, r, 1);
if (rv < 0)
return rv;
r->lm_init = 1;
}
lksb = &rdd->lksb;
flags |= LKF_CONVERT;
flags |= LKF_NOQUEUE;
flags |= LKF_PERSISTENT;
if (rdd->vb)
flags |= LKF_VALBLK;
mode = to_dlm_mode(ld_mode);
if (mode < 0) {
log_error("lock_dlm invalid mode %d", ld_mode);
return -EINVAL;
}
log_debug("S %s R %s lock_dlm", ls->name, r->name);
if (daemon_test) {
if (rdd->vb) {
vb_out->version = le16_to_cpu(rdd->vb->version);
vb_out->flags = le16_to_cpu(rdd->vb->flags);
vb_out->r_version = le32_to_cpu(rdd->vb->r_version);
}
return 0;
}
/*
* The dlm lvb bug means that converting NL->EX will not return
* the latest lvb, so we have to convert NL->PR->EX to reread it.
*/
if (dlm_has_lvb_bug && (ld_mode == LD_LK_EX)) {
rv = dlm_ls_lock_wait(lmd->dh, LKM_PRMODE, lksb, flags,
r->name, strlen(r->name),
0, NULL, NULL, NULL);
if (rv == -1) {
log_debug("S %s R %s lock_dlm acquire mode PR for %d rv %d",
ls->name, r->name, mode, rv);
goto lockrv;
}
/* Fall through to request EX. */
}
rv = dlm_ls_lock_wait(lmd->dh, mode, lksb, flags,
r->name, strlen(r->name),
0, NULL, NULL, NULL);
lockrv:
if (rv == -1 && errno == EAGAIN) {
log_debug("S %s R %s lock_dlm acquire mode %d rv EAGAIN", ls->name, r->name, mode);
return -EAGAIN;
}
if (rv < 0) {
log_error("S %s R %s lock_dlm acquire error %d errno %d", ls->name, r->name, rv, errno);
return rv;
}
if (rdd->vb) {
if (lksb->sb_flags & DLM_SBF_VALNOTVALID) {
log_debug("S %s R %s lock_dlm VALNOTVALID", ls->name, r->name);
memset(rdd->vb, 0, sizeof(struct val_blk));
memset(vb_out, 0, sizeof(struct val_blk));
goto out;
}
/*
* 'vb' contains disk endian values, not host endian.
* It is copied directly to rdd->vb which is also kept
* in disk endian form.
* vb_out is returned to the caller in host endian form.
*/
memcpy(&vb, lksb->sb_lvbptr, sizeof(struct val_blk));
memcpy(rdd->vb, &vb, sizeof(vb));
vb_out->version = le16_to_cpu(vb.version);
vb_out->flags = le16_to_cpu(vb.flags);
vb_out->r_version = le32_to_cpu(vb.r_version);
}
out:
return 0;
}
int lm_convert_dlm(struct lockspace *ls, struct resource *r,
int ld_mode, uint32_t r_version)
{
struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data;
struct rd_dlm *rdd = (struct rd_dlm *)r->lm_data;
struct dlm_lksb *lksb = &rdd->lksb;
uint32_t mode;
uint32_t flags = 0;
int rv;
log_debug("S %s R %s convert_dlm", ls->name, r->name);
flags |= LKF_CONVERT;
flags |= LKF_NOQUEUE;
flags |= LKF_PERSISTENT;
if (rdd->vb && r_version && (r->mode == LD_LK_EX)) {
if (!rdd->vb->version) {
/* first time vb has been written */
rdd->vb->version = cpu_to_le16(VAL_BLK_VERSION);
}
rdd->vb->r_version = cpu_to_le32(r_version);
memcpy(lksb->sb_lvbptr, rdd->vb, sizeof(struct val_blk));
log_debug("S %s R %s convert_dlm set r_version %u",
ls->name, r->name, r_version);
flags |= LKF_VALBLK;
}
mode = to_dlm_mode(ld_mode);
if (daemon_test)
return 0;
rv = dlm_ls_lock_wait(lmd->dh, mode, lksb, flags,
r->name, strlen(r->name),
0, NULL, NULL, NULL);
if (rv == -1 && errno == EAGAIN) {
/* FIXME: When does this happen? Should something different be done? */
log_error("S %s R %s convert_dlm mode %d rv EAGAIN", ls->name, r->name, mode);
return -EAGAIN;
}
if (rv < 0) {
log_error("S %s R %s convert_dlm error %d", ls->name, r->name, rv);
}
return rv;
}
int lm_unlock_dlm(struct lockspace *ls, struct resource *r,
uint32_t r_version, uint32_t lmu_flags)
{
struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data;
struct rd_dlm *rdd = (struct rd_dlm *)r->lm_data;
struct dlm_lksb *lksb = &rdd->lksb;
struct val_blk vb_prev;
struct val_blk vb_next;
uint32_t flags = 0;
int new_vb = 0;
int rv;
/*
* Do not set PERSISTENT, because we don't need an orphan
* NL lock to protect anything.
*/
flags |= LKF_CONVERT;
if (rdd->vb && (r->mode == LD_LK_EX)) {
/* vb_prev and vb_next are in disk endian form */
memcpy(&vb_prev, rdd->vb, sizeof(struct val_blk));
memcpy(&vb_next, rdd->vb, sizeof(struct val_blk));
if (!vb_prev.version) {
vb_next.version = cpu_to_le16(VAL_BLK_VERSION);
new_vb = 1;
}
if ((lmu_flags & LMUF_FREE_VG) && (r->type == LD_RT_VG)) {
vb_next.flags = cpu_to_le16(VBF_REMOVED);
new_vb = 1;
}
if (r_version) {
vb_next.r_version = cpu_to_le32(r_version);
new_vb = 1;
}
if (new_vb) {
memcpy(rdd->vb, &vb_next, sizeof(struct val_blk));
memcpy(lksb->sb_lvbptr, &vb_next, sizeof(struct val_blk));
log_debug("S %s R %s unlock_dlm vb old %x %x %u new %x %x %u",
ls->name, r->name,
le16_to_cpu(vb_prev.version),
le16_to_cpu(vb_prev.flags),
le32_to_cpu(vb_prev.r_version),
le16_to_cpu(vb_next.version),
le16_to_cpu(vb_next.flags),
le32_to_cpu(vb_next.r_version));
} else {
log_debug("S %s R %s unlock_dlm vb unchanged", ls->name, r->name);
}
flags |= LKF_VALBLK;
} else {
log_debug("S %s R %s unlock_dlm", ls->name, r->name);
}
if (daemon_test)
return 0;
rv = dlm_ls_lock_wait(lmd->dh, LKM_NLMODE, lksb, flags,
r->name, strlen(r->name),
0, NULL, NULL, NULL);
if (rv < 0) {
log_error("S %s R %s unlock_dlm error %d", ls->name, r->name, rv);
}
return rv;
}
/*
* This list could be read from dlm_controld via libdlmcontrol,
* but it's simpler to get it from sysfs.
*/
#define DLM_LOCKSPACES_PATH "/sys/kernel/config/dlm/cluster/spaces"
/*
* FIXME: this should be implemented differently.
* It's not nice to use an aspect of the dlm clustering
* implementation, which could change. It would be
* better to do something like use a special lock in the
* lockspace that was held PR by all nodes, and then an
* EX request on it could check if it's started (and
* possibly also notify others to stop it automatically).
* Or, possibly an enhancement to libdlm that would give
* info about lockspace members.
*
* (We could let the VG be removed while others still
* have the lockspace running, which largely works, but
* introduces problems if another VG with the same name is
* recreated while others still have the lockspace running
* for the previous VG. We'd also want a way to clean up
* the stale lockspaces on the others eventually.)
*/
int lm_hosts_dlm(struct lockspace *ls, int notify)
{
static const char closedir_err_msg[] = "lm_hosts_dlm: closedir failed";
char ls_nodes_path[PATH_MAX];
struct dirent *de;
DIR *ls_dir;
int count = 0;
if (daemon_test)
return 0;
memset(ls_nodes_path, 0, sizeof(ls_nodes_path));
snprintf(ls_nodes_path, PATH_MAX-1, "%s/%s/nodes",
DLM_LOCKSPACES_PATH, ls->name);
if (!(ls_dir = opendir(ls_nodes_path)))
return -ECONNREFUSED;
while ((de = readdir(ls_dir))) {
if (de->d_name[0] == '.')
continue;
count++;
}
if (closedir(ls_dir))
log_error(closedir_err_msg);
if (!count) {
log_error("lm_hosts_dlm found no nodes in %s", ls_nodes_path);
return 0;
}
/*
* Assume that a count of one node represents ourself,
* and any value over one represents other nodes.
*/
return count - 1;
}
int lm_get_lockspaces_dlm(struct list_head *ls_rejoin)
{
static const char closedir_err_msg[] = "lm_get_lockspace_dlm: closedir failed";
struct lockspace *ls;
struct dirent *de;
DIR *ls_dir;
if (!(ls_dir = opendir(DLM_LOCKSPACES_PATH)))
return -ECONNREFUSED;
while ((de = readdir(ls_dir))) {
if (de->d_name[0] == '.')
continue;
if (strncmp(de->d_name, LVM_LS_PREFIX, strlen(LVM_LS_PREFIX)))
continue;
if (!(ls = alloc_lockspace())) {
if (closedir(ls_dir))
log_error(closedir_err_msg);
return -ENOMEM;
}
ls->lm_type = LD_LM_DLM;
strncpy(ls->name, de->d_name, MAX_NAME);
strncpy(ls->vg_name, ls->name + strlen(LVM_LS_PREFIX), MAX_NAME);
list_add_tail(&ls->list, ls_rejoin);
}
if (closedir(ls_dir))
log_error(closedir_err_msg);
return 0;
}
int lm_is_running_dlm(void)
{
char sys_clustername[MAX_ARGS+1];
int rv;
if (daemon_test)
return gl_use_dlm;
memset(sys_clustername, 0, sizeof(sys_clustername));
rv = read_cluster_name(sys_clustername);
if (rv < 0)
return 0;
return 1;
}