daemons/cmirrord/functions.c - manifest_repos/lvm2 - Git at Google

 /*
  * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
  * of the GNU Lesser General Public License v.2.1.
  *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "logging.h"
 #include "functions.h"

 #include <dirent.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <signal.h>
 #include <stdio.h>
 #include <string.h>
 #include <sys/stat.h>
 #include <time.h>
 #include <unistd.h>

 #define BYTE_SHIFT 3

 /*
  * Magic for persistent mirrors: "MiRr"
  * Following on-disk header information is stolen from
  * drivers/md/dm-log.c
  */
 #define MIRROR_MAGIC 0x4D695272
 #define MIRROR_DISK_VERSION 2
 #define LOG_OFFSET 2

 #define RESYNC_HISTORY 50
 #define RESYNC_BUFLEN 128
 //static char resync_history[RESYNC_HISTORY][128];
 //static int idx = 0;
 /*
  * LOG_SPRINT - record a formatted message in a log's resync history
  * @_lc: pointer to the log context whose history ring is updated
  * @f:   printf-style format string, followed by its arguments
  *
  * Advances _lc->idx to the next slot (wrapping at RESYNC_HISTORY) and
  * formats the message into that slot, limited to RESYNC_BUFLEN bytes.
  * The macro operates on its _lc argument rather than on whatever variable
  * named 'lc' happens to be in the caller's scope.  _lc is evaluated more
  * than once, so it must not have side effects.
  */
 #define LOG_SPRINT(_lc, f, arg...) do {					\
 		(_lc)->idx++;						\
 		(_lc)->idx = (_lc)->idx % RESYNC_HISTORY;		\
 		snprintf((_lc)->resync_history[(_lc)->idx], RESYNC_BUFLEN, f, ## arg); \
 	} while (0)

 /*
  * Header kept at the start of the disk log buffer.  header_to_disk() and
  * header_from_disk() copy it to and from lc->disk_buffer verbatim.
  */
 struct log_header {
         uint32_t magic;		/* compared against MIRROR_MAGIC by read_log() */
         uint32_t version;	/* set to MIRROR_DISK_VERSION by write_log() */
         uint64_t nr_regions;	/* region count recorded when the log was written */
 };

 /*
  * Per-log context.  One instance exists for each log created by
  * _clog_ctr(); it lives on either log_pending_list or log_list.
  */
 struct log_c {
 	struct dm_list list;	/* links this log into log_list or log_pending_list */

 	char uuid[DM_UUID_LEN];	/* matched (with luid) by get_log()/get_pending_log() */
 	uint64_t luid;

 	time_t delay; /* limits how fast a resume can happen after suspend */
 	int touched;	/* set by log_set_bit()/log_clear_bit(), cleared after a flush */
 	int in_sync;  /* An in-sync that stays set until suspend/resume */
 	uint32_t region_size;
 	uint32_t region_count;
 	uint64_t sync_count;	/* number of bits set in sync_bits */

 	dm_bitset_t clean_bits;	/* the bits read from / written to the disk log */
 	dm_bitset_t sync_bits;
 	uint32_t recoverer;	/* node id that was handed recovering_region */
 	uint64_t recovering_region; /* -1 means not recovering */
 	uint64_t skip_bit_warning; /* used to warn if region skipped */
 	int sync_search;	/* region index at which the next resync search starts */

 	int resume_override;	/* state switched on by clog_resume() */

 	uint32_t block_on_error;	/* set when "block_on_error" is a ctr argument */
         enum sync {
                 DEFAULTSYNC,    /* Synchronize if necessary */
                 NOSYNC,         /* Devices known to be already in sync */
                 FORCESYNC,      /* Force a sync to happen */
         } sync;

 	uint32_t state;         /* current operational state of the log */

 	struct dm_list mark_list;	/* list of struct mark_entry */

 	uint32_t recovery_halted;	/* set in presuspend, cleared in resume */
 	struct recovery_request *recovery_request_list;	/* singly linked, consumed by clog_get_resync_work() */

 	int disk_fd;            /* -1 means no disk log */
 	int log_dev_failed;	/* set by write_log() when the disk write fails */
 	uint64_t disk_nr_regions;	/* nr_regions taken from the on-disk header */
 	size_t disk_size;       /* size of disk_buffer in bytes */
 	void *disk_buffer;      /* aligned memory for O_DIRECT */
 	int idx;		/* slot in resync_history written most recently by LOG_SPRINT */
 	char resync_history[RESYNC_HISTORY][RESYNC_BUFLEN];
 };

 /*
  * One entry on a log's mark_list: records that node 'nodeid' has
  * marked 'region' (see mark_region() and clear_region()).
  */
 struct mark_entry {
 	struct dm_list list;
 	uint32_t nodeid;
 	uint64_t region;
 };

 /*
  * Node of the singly linked lc->recovery_request_list.  Entries are
  * popped from the head and freed by clog_get_resync_work().
  */
 struct recovery_request {
 	uint64_t region;
 	struct recovery_request *next;
 };

 /* Logs searched by get_log(); local_resume() moves logs here. */
 static DM_LIST_INIT(log_list);
 /* Logs searched by get_pending_log(); _clog_ctr() adds new logs here. */
 static DM_LIST_INIT(log_pending_list);

 /* Report whether 'bit' is set in 'bs', normalised to exactly 0 or 1. */
 static int log_test_bit(dm_bitset_t bs, int bit)
 {
 	if (dm_bit(bs, bit))
 		return 1;

 	return 0;
 }

 /* Set 'bit' in 'bs' and flag the log context as touched. */
 static void log_set_bit(struct log_c *lc, dm_bitset_t bs, int bit)
 {
 	lc->touched = 1;
 	dm_bit_set(bs, bit);
 }

 /* Clear 'bit' in 'bs' and flag the log context as touched. */
 static void log_clear_bit(struct log_c *lc, dm_bitset_t bs, int bit)
 {
 	lc->touched = 1;
 	dm_bit_clear(bs, bit);
 }

 /*
  * find_next_zero_bit
  * @bs:    bitset to search; its first word (*bs) is used as the bit count
  * @start: first bit index to examine
  *
  * The bound is checked before a bit is tested, so no bit at or beyond
  * *bs is ever read.
  *
  * Returns: index of the first clear bit in [start, *bs), or (uint64_t)-1
  *          if there is none
  */
 static uint64_t find_next_zero_bit(dm_bitset_t bs, unsigned start)
 {
 	unsigned nbits = *bs;

 	for (; start < nbits; start++)
 		if (!dm_bit(bs, start))
 			return start;

 	return (uint64_t)-1;
 }

 /*
  * Sum hweight32() over the data words of 'bs'.  Words 1 through
  * bs[0] / DM_BITS_PER_INT + 1 are visited; bs[0] itself is skipped.
  */
 static uint64_t count_bits32(dm_bitset_t bs)
 {
 	unsigned words = bs[0] / (unsigned)DM_BITS_PER_INT + 1;
 	unsigned total = 0;
 	unsigned w = 1;

 	while (w <= words) {
 		total += hweight32(bs[w]);
 		w++;
 	}

 	return (uint64_t)total;
 }

 /*
  * get_log
  *
  * Returns: log if found, NULL otherwise
  */
 static struct log_c *get_log(const char *uuid, uint64_t luid)
 {
 	struct log_c *lc;

 	dm_list_iterate_items(lc, &log_list)
 		if (!strcmp(lc->uuid, uuid) &&
 		    (!luid || (luid == lc->luid)))
 			return lc;

 	return NULL;
 }

 /*
  * get_pending_log
  *
  * Pending logs are logs that have been 'clog_ctr'ed, but
  * have not joined the CPG (via clog_resume).
  *
  * Returns: log if found, NULL otherwise
  */
 static struct log_c *get_pending_log(const char *uuid, uint64_t luid)
 {
 	struct log_c *lc;

 	dm_list_iterate_items(lc, &log_pending_list)
 		if (!strcmp(lc->uuid, uuid) &&
 		    (!luid || (luid == lc->luid)))
 			return lc;

 	return NULL;
 }

 /* Copy the in-memory header 'mem' byte-for-byte into 'disk'. */
 static void header_to_disk(struct log_header *mem, struct log_header *disk)
 {
 	memcpy(disk, mem, sizeof(*mem));
 }

 /* Copy the header found at 'disk' byte-for-byte into 'mem'. */
 static void header_from_disk(struct log_header *mem, struct log_header *disk)
 {
 	memcpy(mem, disk, sizeof(*mem));
 }

 /*
  * rw_log
  * @lc:       log whose disk_fd / disk_buffer / disk_size are used
  * @do_write: non-zero to write disk_buffer out, zero to read into it
  *
  * Seeks to the start of the log device and transfers the whole of
  * lc->disk_buffer in a single read() or write().  A transfer that moves
  * fewer than lc->disk_size bytes is treated as a failure in both
  * directions, so a partially written log is never reported as success.
  *
  * Returns: 0 on success, -errno if the seek fails, -EIO if the
  *          transfer fails or is short
  */
 static int rw_log(struct log_c *lc, int do_write)
 {
 	int saved_errno;
 	ssize_t n;

 	if (lseek(lc->disk_fd, 0, SEEK_SET) == (off_t)-1) {
 		/* Capture errno before logging has a chance to change it */
 		saved_errno = errno;
 		LOG_ERROR("[%s] rw_log:  lseek failure: %s",
 			  SHORT_UUID(lc->uuid), strerror(saved_errno));
 		return -saved_errno;
 	}

 	if (do_write) {
 		n = write(lc->disk_fd, lc->disk_buffer, lc->disk_size);
 		if (n < 0) {
 			LOG_ERROR("[%s] rw_log:  write failure: %s",
 				  SHORT_UUID(lc->uuid), strerror(errno));
 			return -EIO; /* Failed disk write */
 		}
 		if ((size_t)n != lc->disk_size) {
 			LOG_ERROR("[%s] rw_log:  short write (%lld of %llu bytes)",
 				  SHORT_UUID(lc->uuid), (long long)n,
 				  (unsigned long long)lc->disk_size);
 			return -EIO; /* Incomplete disk write */
 		}
 		return 0;
 	}

 	/* Read */
 	n = read(lc->disk_fd, lc->disk_buffer, lc->disk_size);
 	if (n < 0) {
 		LOG_ERROR("[%s] rw_log:  read failure: %s",
 			  SHORT_UUID(lc->uuid), strerror(errno));
 		return -EIO; /* Failed disk read */
 	}
 	if ((size_t)n != lc->disk_size)
 		return -EIO; /* Incomplete disk read */

 	return 0;
 }

 /*
  * read_log
  * @lc
  *
  * Valid return codes:
  *   -EINVAL:  Invalid header, bits not copied
  *   -EIO:     Unable to read disk log
  *    0:       Valid header, disk bit -> lc->clean_bits
  *
  * Returns: 0 on success, -EXXX on failure
  */
 static int read_log(struct log_c *lc)
 {
 	struct log_header lh = { 0 };
 	size_t bitset_size;

 	if (rw_log(lc, 0))
 		return -EIO; /* Failed disk read */

 	header_from_disk(&lh, lc->disk_buffer);
 	if (lh.magic != MIRROR_MAGIC)
 		return -EINVAL;

 	lc->disk_nr_regions = lh.nr_regions;

 	/* Read disk bits into sync_bits */
 	bitset_size = lc->region_count / 8;
 	bitset_size += (lc->region_count % 8) ? 1 : 0;

 	/* 'lc->clean_bits + 1' becasue dm_bitset_t leads with a uint32_t */
 	memcpy(lc->clean_bits + 1, (char *)lc->disk_buffer + 1024, bitset_size);

 	return 0;
 }

 /*
  * write_log
  * @lc
  *
  * Returns: 0 on success, -EIO on failure
  */
 static int write_log(struct log_c *lc)
 {
 	struct log_header lh;
 	size_t bitset_size;

 	lh.magic = MIRROR_MAGIC;
 	lh.version = MIRROR_DISK_VERSION;
 	lh.nr_regions = lc->region_count;

 	header_to_disk(&lh, lc->disk_buffer);

 	/* Write disk bits from clean_bits */
 	bitset_size = lc->region_count / 8;
 	bitset_size += (lc->region_count % 8) ? 1 : 0;

 	/* 'lc->clean_bits + 1' becasue dm_bitset_t leads with a uint32_t */
 	memcpy((char *)lc->disk_buffer + 1024, lc->clean_bits + 1, bitset_size);

 	if (rw_log(lc, 1)) {
 		lc->log_dev_failed = 1;
 		return -EIO; /* Failed disk write */
 	}
 	return 0;
 }

 /*
  * Size of the buffer that callers hand in as 'path_rtn'.  The signature
  * carries no length, so this must stay in step with the 'disk_path'
  * array declared in _clog_ctr().
  */
 #define DISK_PATH_BUFLEN 128

 /*
  * find_disk_path
  * @major_minor_str: either a path to a block device, or "<major>:<minor>"
  * @path_rtn:        receives the device path (DISK_PATH_BUFLEN bytes)
  * @unlink_path:     unused
  *
  * If the string holds no ':', it is taken as a path and accepted when it
  * names a block device.  Otherwise /dev/mapper is scanned for a block
  * device with the requested major/minor numbers.  Every write into
  * path_rtn is bounded; names that do not fit are rejected or skipped
  * rather than overflowing the caller's buffer.
  *
  * Returns: 0 on success, -EXXX on failure (-ENOENT when no device matches,
  *          -ENAMETOOLONG when a directly supplied path does not fit)
  *
  * FIXME Rewrite this function taking advantage of the udev changes
  * (where in use) to improve its efficiency!
  */
 static int find_disk_path(char *major_minor_str, char *path_rtn, int *unlink_path __attribute__((unused)))
 {
 	int r, n;
 	DIR *dp;
 	struct dirent *dep;
 	struct stat statbuf;
 	int major, minor;

 	if (!strchr(major_minor_str, ':')) {
 		if (stat(major_minor_str, &statbuf))
 			return -errno;
 		if (!S_ISBLK(statbuf.st_mode))
 			return -EINVAL;
 		n = snprintf(path_rtn, DISK_PATH_BUFLEN, "%s", major_minor_str);
 		if (n < 0 || n >= DISK_PATH_BUFLEN)
 			return -ENAMETOOLONG;
 		return 0;
 	}

 	if (sscanf(major_minor_str, "%d:%d", &major, &minor) != 2)
 		return -EINVAL;

 	/* FIXME dm_dir() */
 	LOG_DBG("Checking /dev/mapper for device %d:%d", major, minor);
 	/* Check /dev/mapper dir */
 	dp = opendir("/dev/mapper");
 	if (!dp)
 		return -ENOENT;

 	r = -ENOENT;
 	while ((dep = readdir(dp)) != NULL) {
 		/*
 		 * FIXME: This is racy.  By the time the path is used,
 		 * it may point to something else.  'fstat' will be
 		 * required upon opening to ensure we got what we
 		 * wanted.
 		 */
 		n = snprintf(path_rtn, DISK_PATH_BUFLEN, "/dev/mapper/%s",
 			     dep->d_name);
 		if (n < 0 || n >= DISK_PATH_BUFLEN) {
 			/* Would not fit in the caller's buffer */
 			LOG_DBG("Skipping over-long name %s", dep->d_name);
 			continue;
 		}

 		if (stat(path_rtn, &statbuf) < 0) {
 			LOG_DBG("Unable to stat %s", path_rtn);
 			continue;
 		}

 		if (S_ISBLK(statbuf.st_mode) &&
 		    (major(statbuf.st_rdev) == major) &&
 		    (minor(statbuf.st_rdev) == minor)) {
 			LOG_DBG("  %s: YES", dep->d_name);
 			r = 0;
 			break;
 		}

 		LOG_DBG("  %s: NO", dep->d_name);
 	}

 	if (closedir(dp))
 		LOG_DBG("Unable to closedir /dev/mapper %s",
 			strerror(errno));

 	/*
 	 * FIXME A disabled fallback used to mknod /dev/mapper/<major>-<minor>
 	 * here and ask the caller (via *unlink_path) to unlink it after
 	 * opening.  Find out why it was needed and deal with the underlying
 	 * problem.
 	 */
 	if (r)
 		LOG_DBG("Path not found for %d/%d", major, minor);

 	return r;
 }

 /*
  * _clog_ctr
  * @uuid:        uuid of the new log
  * @luid:        luid of the new log
  * @argc:        number of entries in @argv
  * @argv:        "[disk] <region_size> [[no]sync] [block_on_error]"
  * @device_size: size of the mirrored device, used to derive region_count
  *
  * If argv[0] does not parse as a non-zero number it is taken to name the
  * log device (a disk log); otherwise a core log is created.  On success
  * the new log context is placed on log_pending_list.
  *
  * Returns: 0 on success, negative errno value on failure
  */
 static int _clog_ctr(char *uuid, uint64_t luid,
 		     int argc, char **argv, uint64_t device_size)
 {
 	int i;
 	int r = 0;
 	int saved_errno;
 	char *p;
 	uint64_t region_size;
 	uint64_t region_count;
 	struct log_c *lc = NULL;
 	enum sync log_sync = DEFAULTSYNC;
 	uint32_t block_on_error = 0;

 	int disk_log;
 	char disk_path[128];
 	int unlink_path = 0;
 	long page_size;
 	int pages;

 	/* argv[0] is inspected below, so it has to exist */
 	if (argc < 1) {
 		LOG_ERROR("Too few arguments to clustered log type");
 		return -EINVAL;
 	}

 	/* If core log request, then argv[0] will be region_size */
 	if (!strtoll(argv[0], &p, 0) || *p) {
 		disk_log = 1;

 		if ((argc < 2) || (argc > 4)) {
 			LOG_ERROR("Too %s arguments to clustered-disk log type",
 				  (argc < 2) ? "few" : "many");
 			r = -EINVAL;
 			goto fail;
 		}

 		r = find_disk_path(argv[0], disk_path, &unlink_path);
 		if (r) {
 			LOG_ERROR("Unable to find path to device %s", argv[0]);
 			goto fail;
 		}
 		LOG_DBG("Clustered log disk is %s", disk_path);
 	} else {
 		disk_log = 0;

 		if (argc > 3) {
 			LOG_ERROR("Too many arguments to clustered-core log type");
 			r = -EINVAL;
 			goto fail;
 		}
 	}

 	if (!(region_size = strtoll(argv[disk_log], &p, 0)) || *p) {
 		LOG_ERROR("Invalid region_size argument to clustered-%s log type",
 			  (disk_log) ? "disk" : "core");
 		r = -EINVAL;
 		goto fail;
 	}

 	region_count = device_size / region_size;
 	if (device_size % region_size) {
 		/*
 		 * I can't remember if device_size must be a multiple
 		 * of region_size, so check it anyway.
 		 */
 		region_count++;
 	}

 	for (i = 0; i < argc; i++) {
 		if (!strcmp(argv[i], "sync"))
 			log_sync = FORCESYNC;
 		else if (!strcmp(argv[i], "nosync"))
 			log_sync = NOSYNC;
 		else if (!strcmp(argv[i], "block_on_error"))
 			block_on_error = 1;
 	}

 	lc = dm_zalloc(sizeof(*lc));
 	if (!lc) {
 		LOG_ERROR("Unable to allocate cluster log context");
 		r = -ENOMEM;
 		goto fail;
 	}

 	lc->region_size = region_size;
 	lc->region_count = region_count;
 	lc->sync = log_sync;
 	lc->block_on_error = block_on_error;
 	lc->sync_search = 0;
 	lc->recovering_region = (uint64_t)-1;
 	lc->skip_bit_warning = region_count;
 	lc->disk_fd = -1;
 	lc->log_dev_failed = 0;
 	/* lc is zero-filled, so copying one byte less keeps uuid terminated */
 	strncpy(lc->uuid, uuid, DM_UUID_LEN - 1);
 	lc->luid = luid;

 	if (get_log(lc->uuid, lc->luid) ||
 	    get_pending_log(lc->uuid, lc->luid)) {
 		LOG_ERROR("[%s/%" PRIu64 "] Log already exists, unable to create.",
 			  SHORT_UUID(lc->uuid), lc->luid);
 		r = -EINVAL;
 		goto fail;
 	}

 	dm_list_init(&lc->mark_list);

 	lc->clean_bits = dm_bitset_create(NULL, region_count);
 	if (!lc->clean_bits) {
 		LOG_ERROR("Unable to allocate clean bitset");
 		r = -ENOMEM;
 		goto fail;
 	}

 	lc->sync_bits = dm_bitset_create(NULL, region_count);
 	if (!lc->sync_bits) {
 		LOG_ERROR("Unable to allocate sync bitset");
 		r = -ENOMEM;
 		goto fail;
 	}
 	if (log_sync == NOSYNC)
 		dm_bit_set_all(lc->sync_bits);

 	lc->sync_count = (log_sync == NOSYNC) ? region_count : 0;

 	if (disk_log) {
 		errno = 0;
 		if ((page_size = sysconf(_SC_PAGESIZE)) < 0) {
 			saved_errno = errno;
 			LOG_ERROR("Unable to read pagesize: %s",
 				  strerror(saved_errno));
 			/* Never report success from a failure path */
 			r = saved_errno ? -saved_errno : -EINVAL;
 			goto fail;
 		}
 		pages = *(lc->clean_bits) / page_size;
 		pages += *(lc->clean_bits) % page_size ? 1 : 0;
 		pages += 1; /* for header */

 		r = open(disk_path, O_RDWR | O_DIRECT);
 		if (r < 0) {
 			saved_errno = errno;
 			LOG_ERROR("Unable to open log device, %s: %s",
 				  disk_path, strerror(saved_errno));
 			r = -saved_errno;
 			goto fail;
 		}
 		if (unlink_path && (unlink(disk_path) < 0))
 			LOG_DBG("Warning: Unable to unlink log device, %s: %s",
 				disk_path, strerror(errno));

 		lc->disk_fd = r;
 		lc->disk_size = pages * page_size;

 		/* posix_memalign() returns a positive error number */
 		r = posix_memalign(&(lc->disk_buffer), page_size,
 				   lc->disk_size);
 		if (r) {
 			LOG_ERROR("Unable to allocate memory for disk_buffer");
 			lc->disk_buffer = NULL;
 			r = -r;
 			goto fail;
 		}
 		memset(lc->disk_buffer, 0, lc->disk_size);
 		LOG_DBG("Disk log ready");
 	}

 	dm_list_add(&log_pending_list, &lc->list);

 	return 0;
 fail:
 	if (lc) {
 		if (lc->disk_fd >= 0 && close(lc->disk_fd))
 			LOG_ERROR("Close device error, %s: %s",
 				  disk_path, strerror(errno));
 		free(lc->disk_buffer);
 		dm_free(lc->sync_bits);
 		dm_free(lc->clean_bits);
 		dm_free(lc);
 	}
 	return r;
 }

 /*
  * clog_ctr
  * @rq
  *
  * rq->data holds the constructor string as space-separated tokens.
  * This function reads the device size from the first token and the
  * log type ("clustered-disk" or "clustered-core") from the second; the
  * remaining tokens ("[disk] <region_size> [[no]sync] ...") are handed
  * to _clog_ctr().  The string is split in place.
  *
  * For request versions above 1 and a "clustered-disk" log, the token
  * following the log type is moved to the front of rq->data and
  * rq->data_size is set to its length including the terminator;
  * otherwise rq->data_size is set to 0.
  *
  * FIXME: Currently relies on caller to fill in rq->error
  *
  * Returns: 0 on success, -EXXX on failure
  */
 static int clog_dtr(struct dm_ulog_request *rq);
 static int clog_ctr(struct dm_ulog_request *rq)
 {
 	int argc, i, r = 0;
 	char *p, **argv = NULL;
 	char *dev_size_str;
 	uint64_t device_size;
 	size_t len;

 	/* Sanity checks */
 	if (!rq->data_size) {
 		LOG_ERROR("Received constructor request with no data");
 		return -EINVAL;
 	}

 	if (strlen(rq->data) > rq->data_size) {
 		LOG_ERROR("Received constructor request with bad data");
 		LOG_ERROR("strlen(rq->data)[%d] != rq->data_size[%llu]",
 			  (int)strlen(rq->data),
 			  (unsigned long long)rq->data_size);
 		LOG_ERROR("rq->data = '%s' [%d]",
 			  rq->data, (int)strlen(rq->data));
 		return -EINVAL;
 	}

 	/* Split up args: every space becomes a terminator */
 	for (argc = 0, p = rq->data; (p = strchr(p, ' ')); p++, argc++)
 		*p = '\0';

 	if (!argc) {
 		LOG_ERROR("Received constructor request with bad data %s",
 			  rq->data);
 		return -EINVAL;
 	}

 	argv = malloc(argc * sizeof(char *));
 	if (!argv)
 		return -ENOMEM;

 	/* First token is the device size; argv[] covers the tokens after it */
 	p = dev_size_str = rq->data;
 	p += strlen(p) + 1;
 	for (i = 0; i < argc; i++, p = p + strlen(p) + 1)
 		argv[i] = p;

 	if (strcmp(argv[0], "clustered-disk") &&
 	    strcmp(argv[0], "clustered-core")) {
 		LOG_ERROR("Unsupported userspace log type, \"%s\"", argv[0]);
 		free(argv);
 		return -EINVAL;
 	}

 	if (!(device_size = strtoll(dev_size_str, &p, 0)) || *p) {
 		LOG_ERROR("Invalid device size argument: %s", dev_size_str);
 		free(argv);
 		return -EINVAL;
 	}

 	r = _clog_ctr(rq->uuid, rq->luid, argc - 1, argv + 1, device_size);

 	/* We join the CPG when we resume */

 	/* Report while argv[] still points at the untouched arguments */
 	if (r) {
 		LOG_ERROR("Failed to create cluster log (%s)", rq->uuid);
 		for (i = 0; i < argc; i++)
 			LOG_ERROR("argv[%d] = %s", i, argv[i]);
 	} else
 		LOG_DBG("[%s] Cluster log created",
 			SHORT_UUID(rq->uuid));

 	/*
 	 * argv[1] lives inside rq->data, so source and destination may
 	 * overlap: memmove() is required.  argv[1] only exists when
 	 * argc > 1.
 	 */
 	if ((rq->version > 1) && (argc > 1) &&
 	    !strcmp(argv[0], "clustered-disk")) {
 		len = strlen(argv[1]) + 1;
 		memmove(rq->data, argv[1], len);
 		rq->data_size = len;
 	} else
 		rq->data_size = 0;

 	free(argv);
 	return r;
 }

 /*
  * clog_dtr
  * @rq
  *
  * Removes the log named by rq->uuid / rq->luid from whichever list it
  * is on and releases its disk descriptor, disk buffer and bitsets.
  *
  * Returns: 0 on success, -EINVAL if the log is on neither list
  */
 static int clog_dtr(struct dm_ulog_request *rq)
 {
 	struct log_c *lc;

 	lc = get_log(rq->uuid, rq->luid);
 	if (!lc) {
 		lc = get_pending_log(rq->uuid, rq->luid);
 		if (!lc) {
 			LOG_ERROR("clog_dtr called on log that is not official or pending");
 			return -EINVAL;
 		}
 	} else {
 		/*
 		 * The log should not be on the official list.  There
 		 * should have been a suspend first.
 		 */
 		LOG_ERROR("[%s] DTR before SUS: leaving CPG",
 			  SHORT_UUID(rq->uuid));
 		destroy_cluster_cpg(rq->uuid);
 	}

 	LOG_DBG("[%s] Cluster log removed", SHORT_UUID(lc->uuid));

 	dm_list_del(&lc->list);

 	if (lc->disk_fd != -1) {
 		if (close(lc->disk_fd))
 			LOG_ERROR("Failed to close disk log: %s",
 				  strerror(errno));
 	}

 	/* free(NULL) is a no-op, so no guard is needed */
 	free(lc->disk_buffer);
 	dm_free(lc->clean_bits);
 	dm_free(lc->sync_bits);
 	dm_free(lc);

 	return 0;
 }

 /*
  * clog_presuspend
  * @rq
  *
  * Marks recovery as halted on the log.
  *
  * Returns: 0 on success, -EINVAL if the log is not on log_list
  */
 static int clog_presuspend(struct dm_ulog_request *rq)
 {
 	struct log_c *log = get_log(rq->uuid, rq->luid);

 	if (log == NULL)
 		return -EINVAL;

 	if (log->touched != 0)
 		LOG_DBG("WARNING: log still marked as 'touched' during suspend");

 	log->recovery_halted = 1;
 	return 0;
 }

 /*
  * clog_postsuspend
  * @rq
  *
  * Leaves the CPG, marks the log suspended, forgets any region being
  * recovered and records the current time in lc->delay.
  *
  * Returns: 0 on success, -EINVAL if the log is not on log_list
  */
 static int clog_postsuspend(struct dm_ulog_request *rq)
 {
 	struct log_c *log = get_log(rq->uuid, rq->luid);

 	if (log == NULL)
 		return -EINVAL;

 	LOG_DBG("[%s] clog_postsuspend: leaving CPG", SHORT_UUID(log->uuid));
 	destroy_cluster_cpg(rq->uuid);

 	log->state = LOG_SUSPENDED;
 	log->recoverer = (uint32_t)-1;
 	log->recovering_region = (uint64_t)-1;
 	log->delay = time(NULL);

 	return 0;
 }

 /*
  * cluster_postsuspend
  * @rq
  *
  */
 int cluster_postsuspend(char *uuid, uint64_t luid)
 {
 	struct log_c *lc = get_log(uuid, luid);

 	if (!lc)
 		return -EINVAL;

 	LOG_DBG("[%s] clog_postsuspend: finalizing", SHORT_UUID(lc->uuid));
 	lc->resume_override = 0;

 	/* move log to pending list */
 	dm_list_del(&lc->list);
 	dm_list_add(&log_pending_list, &lc->list);

 	return 0;
 }

 /*
  * clog_resume
  * @rq
  *
  * Does the main work of resuming.
  *
  * What happens depends on lc->resume_override:
  *   0     - the bits are (re)built here: read from the disk log when
  *           one is open, otherwise initialised from lc->sync
  *   3     - the bits are used as they are; only the tail bits past
  *           region_count are cleared
  *   1000  - a resume has already run since the last suspend; nothing
  *           is done
  *   other - reported as an error
  *
  * Returns: rq->error on the paths that reach the end of the function,
  *          0 for a repeated resume, -EINVAL if the log is not on
  *          log_list or resume_override holds an unexpected value
  */
 static int clog_resume(struct dm_ulog_request *rq)
 {
 	uint32_t i;
 	int commit_log = 0;
 	struct log_c *lc = get_log(rq->uuid, rq->luid);

 	if (!lc)
 		return -EINVAL;

 	lc->in_sync = 0;
 	switch (lc->resume_override) {
 	case 1000:
 		LOG_ERROR("[%s] Additional resume issued before suspend",
 			  SHORT_UUID(rq->uuid));
 #ifdef DEBUG
 		kill(getpid(), SIGUSR1);
 #endif
 		return 0;
 	case 0:
 		lc->resume_override = 1000;
 		if (lc->disk_fd == -1) {
 			LOG_DBG("[%s] Master resume.",
 				SHORT_UUID(lc->uuid));
 			goto no_disk;
 		}

 		LOG_DBG("[%s] Master resume: reading disk log",
 			SHORT_UUID(lc->uuid));
 		/* The log is written back below once the bits are settled */
 		commit_log = 1;
 		break;
 	case 1:
 		LOG_ERROR("Error:: partial bit loading (just sync_bits)");
 		return -EINVAL;
 	case 2:
 		LOG_ERROR("Error:: partial bit loading (just clean_bits)");
 		return -EINVAL;
 	case 3:
 		LOG_DBG("[%s] Non-master resume: bits pre-loaded",
 			SHORT_UUID(lc->uuid));
 		lc->resume_override = 1000;
 		goto out;
 	default:
 		LOG_ERROR("Error:: multiple loading of bits (%d)",
 			  lc->resume_override);
 		return -EINVAL;
 	}

 	/* Only reached for resume_override == 0 with a disk log open */
 	if (lc->log_dev_failed) {
 		LOG_ERROR("Log device has failed, unable to read bits");
 		rq->error = 0;  /* We can handle this so far */
 		lc->disk_nr_regions = 0;
 	} else
 		rq->error = read_log(lc);

 	/*
 	 * On any read failure disk_nr_regions is zeroed, so the loops
 	 * below initialise every region's bit.
 	 */
 	switch (rq->error) {
 	case 0:
 		if (lc->disk_nr_regions < lc->region_count)
 			LOG_DBG("[%s] Mirror has grown, updating log bits",
 				SHORT_UUID(lc->uuid));
 		else if (lc->disk_nr_regions > lc->region_count)
 			LOG_DBG("[%s] Mirror has shrunk, updating log bits",
 				SHORT_UUID(lc->uuid));
 		break;
 	case -EINVAL:
 		LOG_DBG("[%s] (Re)initializing mirror log - resync issued.",
 			SHORT_UUID(lc->uuid));
 		lc->disk_nr_regions = 0;
 		break;
 	default:
 		LOG_ERROR("Failed to read disk log");
 		lc->disk_nr_regions = 0;
 		break;
 	}

 no_disk:
 	/* If mirror has grown, set bits appropriately */
 	if (lc->sync == NOSYNC)
 		for (i = lc->disk_nr_regions; i < lc->region_count; i++)
 			log_set_bit(lc, lc->clean_bits, i);
 	else
 		for (i = lc->disk_nr_regions; i < lc->region_count; i++)
 			log_clear_bit(lc, lc->clean_bits, i);

 	/* Clear any old bits if device has shrunk */
 	/* (runs from region_count up to the next multiple of 32) */
 	for (i = lc->region_count; i % 32; i++)
 		log_clear_bit(lc, lc->clean_bits, i);

 	/* copy clean across to sync */
 	dm_bit_copy(lc->sync_bits, lc->clean_bits);

 	if (commit_log && (lc->disk_fd >= 0)) {
 		rq->error = write_log(lc);
 		if (rq->error)
 			LOG_ERROR("Failed initial disk log write");
 		else
 			LOG_DBG("Disk log initialized");
 		lc->touched = 0;
 	}
 out:
 	/*
 	 * Clear any old bits if device has shrunk - necessary
 	 * for non-master resume
 	 */
 	for (i = lc->region_count; i % 32; i++) {
 		log_clear_bit(lc, lc->clean_bits, i);
 		log_clear_bit(lc, lc->sync_bits, i);
 	}

 	lc->sync_count = count_bits32(lc->sync_bits);

 	LOG_SPRINT(lc, "[%s] Initial sync_count = %llu",
 		   SHORT_UUID(lc->uuid), (unsigned long long)lc->sync_count);
 	lc->sync_search = 0;
 	lc->state = LOG_RESUMED;
 	lc->recovery_halted = 0;

 	/*
 	 * NOTE(review): rq->error is not assigned in this function on the
 	 * 'goto no_disk' and 'goto out' paths, so whatever value it already
 	 * held is returned - confirm callers initialise it.
 	 */
 	return rq->error;
 }

 /*
  * local_resume
  * @rq
  *
  * If the log is pending, we must first join the cpg and
  * put the log in the official list.  A log that is already
  * on the official list is left alone.
  *
  * Returns: 0 on success, -EINVAL if the log is on neither list,
  *          or the error from create_cluster_cpg()
  */
 int local_resume(struct dm_ulog_request *rq)
 {
 	int r;
 	time_t elapsed;
 	struct log_c *pending;

 	/* Already official: nothing to do */
 	if (get_log(rq->uuid, rq->luid))
 		return 0;

 	/* Is the log in the pending list? */
 	pending = get_pending_log(rq->uuid, rq->luid);
 	if (!pending) {
 		LOG_ERROR("clog_resume called on log that is not official or pending");
 		return -EINVAL;
 	}

 	elapsed = time(NULL);
 	elapsed -= pending->delay;

 	/*
 	 * This should be considered a temporary fix.  It addresses
 	 * a problem that exists when nodes suspend/resume in rapid
 	 * succession.  While the problem is very rare, it has been
 	 * seen to happen in real-world-like testing.
 	 *
 	 * The problem:
 	 * - Node A joins cluster
 	 * - Node B joins cluster
 	 * - Node A prepares checkpoint
 	 * - Node A gets ready to write checkpoint
 	 * - Node B leaves
 	 * - Node B joins
 	 * - Node A finishes write of checkpoint
 	 * - Node B receives checkpoint meant for previous session
 	 * -- Node B can now be non-coherent
 	 *
 	 * This timer will solve the problem for now, but could be
 	 * replaced by a generation number sent with the resume
 	 * command from the kernel.  The generation number would
 	 * be included in the name of the checkpoint to prevent
 	 * reading stale data.
 	 */
 	if ((elapsed >= 0) && (elapsed < 3))
 		sleep(3 - elapsed);

 	/* Join the CPG */
 	r = create_cluster_cpg(rq->uuid, rq->luid);
 	if (r) {
 		LOG_ERROR("clog_resume:  Failed to create cluster CPG");
 		return r;
 	}

 	/* move log to official list */
 	dm_list_del(&pending->list);
 	dm_list_add(&log_list, &pending->list);

 	return 0;
 }

 /*
  * clog_get_region_size
  * @rq
  *
  * Since this value doesn't change, the kernel
  * should not need to talk to server to get this
  * The function is here for completeness.
  *
  * The region size is written to rq->data as a uint64_t; both the
  * official and the pending list are searched for the log.
  *
  * Returns: 0 on success, -EXXX on failure
  */
 static int clog_get_region_size(struct dm_ulog_request *rq)
 {
 	uint64_t *out = (uint64_t *)rq->data;
 	struct log_c *log;

 	log = get_log(rq->uuid, rq->luid);
 	if (!log)
 		log = get_pending_log(rq->uuid, rq->luid);
 	if (!log)
 		return -EINVAL;

 	*out = log->region_size;
 	rq->data_size = sizeof(*out);

 	return 0;
 }

 /*
  * clog_is_clean
  * @rq
  *
  * rq->data carries the region number on entry (uint64_t) and the
  * answer on return (int64_t: 1 if the region's clean bit is set,
  * 0 otherwise).  The region is range-checked the same way
  * clog_in_sync() checks it, so the bitset is never indexed past
  * its last region.
  *
  * Returns: 0 on success, -EINVAL if the log is not on log_list or
  *          the region is out of range
  */
 static int clog_is_clean(struct dm_ulog_request *rq)
 {
 	int64_t *rtn = (int64_t *)rq->data;
 	uint64_t *region_p = (uint64_t *)rq->data;
 	uint64_t region;
 	struct log_c *lc = get_log(rq->uuid, rq->luid);

 	if (!lc)
 		return -EINVAL;

 	/* Input and output share rq->data: read the region before writing */
 	region = *region_p;

 	/* Valid regions are 0 .. region_count - 1 */
 	if (region >= lc->region_count)
 		return -EINVAL;

 	*rtn = log_test_bit(lc->clean_bits, region);
 	rq->data_size = sizeof(*rtn);

 	return 0;
 }

 /*
  * clog_in_sync
  * @rq
  *
  * We ignore any request for non-block.  That
  * should be handled elsewhere.  (If the request
  * has come this far, it has already blocked.)
  *
  * rq->data carries the region number on entry (uint64_t) and the
  * answer on return (int64_t: 1 if in-sync, 0 otherwise).
  *
  * Returns: 0 on success, -EINVAL if the log is not on log_list or
  *          the region is out of range
  */
 static int clog_in_sync(struct dm_ulog_request *rq)
 {
 	int64_t *rtn = (int64_t *)rq->data;
 	uint64_t *region_p = (uint64_t *)rq->data;
 	uint64_t region = *region_p;
 	struct log_c *lc = get_log(rq->uuid, rq->luid);

 	if (!lc)
 		return -EINVAL;

 	/* Valid regions are 0 .. region_count - 1 */
 	if (region >= lc->region_count)
 		return -EINVAL;

 	*rtn = log_test_bit(lc->sync_bits, region);

 	/*
 	 * If the mirror was successfully recovered, we want to always
 	 * force every machine to write to all devices - otherwise,
 	 * corruption will occur.  Here's how:
 	 *    Node1 suffers a failure and marks a region out-of-sync
 	 *    Node2 attempts a write, gets by is_remote_recovering,
 	 *          and queries the sync status of the region - finding
 	 *	    it out-of-sync.
 	 *    Node2 thinks the write should be a nosync write, but it
 	 *          hasn't suffered the drive failure that Node1 has yet.
 	 *          It then issues a generic_make_request directly to
 	 *          the primary image only - which is exactly the device
 	 *          that has suffered the failure.
 	 *    Node2 suffers a lost write - which completely bypasses the
 	 *          mirror layer because it had gone through generic_m_r.
 	 *    The file system will likely explode at this point due to
 	 *    I/O errors.  If it wasn't the primary that failed, it is
 	 *    easily possible in this case to issue writes to just one
 	 *    of the remaining images - also leaving the mirror inconsistent.
 	 *
 	 * We let in_sync() return 1 in a cluster regardless of what is
 	 * in the bitmap once recovery has successfully completed on a
 	 * mirror.  This ensures the mirroring code will continue to
 	 * attempt to write to all mirror images.  The worst that can
 	 * happen for reads is that additional read attempts may be
 	 * taken.
 	 *
 	 * Further investigation may be required to determine if there are
 	 * similar possible outcomes when the mirror is in the process of
 	 * recovering.  In that case, lc->in_sync would not have been set
 	 * yet.
 	 */
 	if (!*rtn && lc->in_sync)
 		*rtn = 1;

 	if (*rtn)
 		LOG_DBG("[%s] Region is in-sync: %llu",
 			SHORT_UUID(lc->uuid), (unsigned long long)region);
 	else
 		LOG_DBG("[%s] Region is not in-sync: %llu",
 			SHORT_UUID(lc->uuid), (unsigned long long)region);

 	rq->data_size = sizeof(*rtn);

 	return 0;
 }

 /*
  * clog_flush
  * @rq
  *
  */
 static int clog_flush(struct dm_ulog_request *rq, int server)
 {
 	int r = 0;
 	struct log_c *lc = get_log(rq->uuid, rq->luid);

 	if (!lc)
 		return -EINVAL;

 	if (!lc->touched)
 		return 0;

 	/*
 	 * Do the actual flushing of the log only
 	 * if we are the server.
 	 */
 	if (server && (lc->disk_fd >= 0)) {
 		r = rq->error = write_log(lc);
 		if (r)
 			LOG_ERROR("[%s] Error writing to disk log",
 				  SHORT_UUID(lc->uuid));
 		else
 			LOG_DBG("[%s] Disk log written", SHORT_UUID(lc->uuid));
 	}

 	lc->touched = 0;

 	return r;

 }

 /*
  * mark_region
  * @lc
  * @region
  * @who
  *
  * Put a mark region request in the tree for tracking.
  *
  * Returns: 0 on success, -EXXX on error
  */
 static int mark_region(struct log_c *lc, uint64_t region, uint32_t who)
 {
 	int found = 0;
 	struct mark_entry *m;

 	dm_list_iterate_items(m, &lc->mark_list)
 		if (m->region == region) {
 			found = 1;
 			if (m->nodeid == who)
 				return 0;
 		}

 	if (!found)
 		log_clear_bit(lc, lc->clean_bits, region);

 	/*
 	 * Save allocation until here - if there is a failure,
 	 * at least we have cleared the bit.
 	 */
 	m = malloc(sizeof(*m));
 	if (!m) {
 		LOG_ERROR("Unable to allocate space for mark_entry: %llu/%u",
 			  (unsigned long long)region, who);
 		return -ENOMEM;
 	}

 	m->nodeid = who;
 	m->region = region;
 	dm_list_add(&lc->mark_list, &m->list);

 	return 0;
 }

 /*
  * clog_mark_region
  * @rq
  *
  * rq may contain more than one mark request.  We
  * can determine the number from the 'data_size' field.
  *
  * Returns: 0 on success, -EXXX on failure
  */
 static int clog_mark_region(struct dm_ulog_request *rq, uint32_t originator)
 {
 	int r;
 	int count;
 	uint64_t *region;
 	struct log_c *lc = get_log(rq->uuid, rq->luid);

 	if (!lc)
 		return -EINVAL;

 	if (rq->data_size % sizeof(uint64_t)) {
 		LOG_ERROR("Bad data size given for mark_region request");
 		return -EINVAL;
 	}

 	count = rq->data_size / sizeof(uint64_t);
 	region = (uint64_t *)&rq->data;

 	for (; count > 0; count--, region++) {
 		r = mark_region(lc, *region, originator);
 		if (r)
 			return r;
 	}

 	rq->data_size = 0;

 	return 0;
 }

 static int clear_region(struct log_c *lc, uint64_t region, uint32_t who)
 {
 	int other_matches = 0;
 	struct mark_entry *m, *n;

 	dm_list_iterate_items_safe(m, n, &lc->mark_list)
 		if (m->region == region) {
 			if (m->nodeid == who) {
 				dm_list_del(&m->list);
 				free(m);
 			} else
 				other_matches = 1;
 		}

 	/*
 	 * Clear region if:
 	 *  1) It is in-sync
 	 *  2) There are no other machines that have it marked
 	 */
 	if (!other_matches && log_test_bit(lc->sync_bits, region))
 		log_set_bit(lc, lc->clean_bits, region);

 	return 0;
 }

 /*
  * clog_clear_region
  * @rq
  *
  * rq may contain more than one clear request.  We
  * can determine the number from the 'data_size' field.
  *
  * Returns: 0 on success, -EXXX on failure
  */
 static int clog_clear_region(struct dm_ulog_request *rq, uint32_t originator)
 {
 	int r;
 	int count;
 	uint64_t *region;
 	struct log_c *lc = get_log(rq->uuid, rq->luid);

 	if (!lc)
 		return -EINVAL;

 	if (rq->data_size % sizeof(uint64_t)) {
 		LOG_ERROR("Bad data size given for clear_region request");
 		return -EINVAL;
 	}

 	count = rq->data_size / sizeof(uint64_t);
 	region = (uint64_t *)&rq->data;

 	for (; count > 0; count--, region++) {
 		r = clear_region(lc, *region, originator);
 		if (r)
 			return r;
 	}

 	rq->data_size = 0;

 	return 0;
 }

 /*
  * clog_get_resync_work
  * @rq
  * @originator: id of the node asking for work
  *
  * The reply written to rq->data is a pair: 'i' is 1 when a region
  * has been assigned and 0 otherwise, 'r' is the region.  Work is
  * chosen in this order: the region already being recovered (only if
  * the originator is its recoverer), the first out-of-sync region on
  * the recovery request list, then the next clear bit in sync_bits
  * at or after lc->sync_search.
  *
  * Returns: 0 on success, -EINVAL if the log is not on log_list
  */
 static int clog_get_resync_work(struct dm_ulog_request *rq, uint32_t originator)
 {
 	struct {
 		int64_t i;
 		uint64_t r;
 	} *pkg = (void *)rq->data;
 	struct recovery_request *req;
 	struct log_c *lc = get_log(rq->uuid, rq->luid);

 	if (!lc)
 		return -EINVAL;

 	rq->data_size = sizeof(*pkg);
 	pkg->i = 0;

 	if (lc->sync_search >= lc->region_count) {
 		/*
 		 * FIXME: handle intermittent errors during recovery
 		 * by resetting sync_search... but not to many times.
 		 */
 		LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
 			   "Recovery finished",
 			   rq->seq, SHORT_UUID(lc->uuid), originator);
 		return 0;
 	}

 	/* A region is already out for recovery */
 	if (lc->recovering_region != (uint64_t)-1) {
 		if (lc->recoverer != originator) {
 			LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
 				   "Someone already recovering (%llu)",
 				   rq->seq, SHORT_UUID(lc->uuid), originator,
 				   (unsigned long long)lc->recovering_region);
 			return 0;
 		}

 		LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
 			   "Re-requesting work (%llu)",
 			   rq->seq, SHORT_UUID(lc->uuid), originator,
 			   (unsigned long long)lc->recovering_region);
 		pkg->r = lc->recovering_region;
 		pkg->i = 1;
 		LOG_COND(log_resend_requests, "***** RE-REQUEST *****");
 		return 0;
 	}

 	/* Drain the priority list until an out-of-sync region turns up */
 	while ((req = lc->recovery_request_list) != NULL) {
 		lc->recovery_request_list = req->next;

 		pkg->r = req->region;
 		free(req);

 		if (log_test_bit(lc->sync_bits, pkg->r))
 			continue;

 		LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
 			   "Assigning priority resync work (%llu)",
 			   rq->seq, SHORT_UUID(lc->uuid), originator,
 			   (unsigned long long)pkg->r);
 		pkg->i = 1;
 		lc->recovering_region = pkg->r;
 		lc->recoverer = originator;
 		return 0;
 	}

 	pkg->r = find_next_zero_bit(lc->sync_bits, lc->sync_search);

 	if (pkg->r >= lc->region_count) {
 		LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
 			   "Resync work complete.",
 			   rq->seq, SHORT_UUID(lc->uuid), originator);
 		lc->sync_search = lc->region_count + 1;
 		return 0;
 	}

 	lc->sync_search = pkg->r + 1;

 	LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
 		   "Assigning resync work (%llu)",
 		   rq->seq, SHORT_UUID(lc->uuid), originator,
 		   (unsigned long long)pkg->r);
 	pkg->i = 1;
 	lc->recovering_region = pkg->r;
 	lc->recoverer = originator;

 	return 0;
 }

 /*
  * clog_set_region_sync
  * @rq: request whose payload is { uint64_t region; int64_t in_sync }
  * @originator: id of the requesting node (only recorded in the history)
  *
  * Sets (in_sync != 0) or clears (in_sync == 0) the bit for 'region' in
  * lc->sync_bits, adjusts lc->sync_count to match, and marks the log as
  * no longer recovering any region.  If the counter and the bitmap
  * disagree afterwards, the counter is reset from the bitmap.  Nothing is
  * returned in the payload (rq->data_size is set to 0).
  *
  * Returns: 0 on success, -EINVAL if the log cannot be found or 'region'
  * is not below lc->region_count
  */
 static int clog_set_region_sync(struct dm_ulog_request *rq, uint32_t originator)
 {
 	struct {
 		uint64_t region;
 		int64_t in_sync;
 	} *pkg = (void *)rq->data;
 	struct log_c *lc = get_log(rq->uuid, rq->luid);
 	unsigned long long bitmap_count;

 	if (!lc)
 		return -EINVAL;

 	/*
 	 * pkg->region is handed straight to the bit helpers below, so
 	 * refuse anything outside the log before touching any state.
 	 */
 	if (pkg->region >= lc->region_count) {
 		LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
 			   "Region out of range (%llu)",
 			   rq->seq, SHORT_UUID(lc->uuid), originator,
 			   (unsigned long long)pkg->region);
 		return -EINVAL;
 	}

 	/* Whatever was being recovered is considered finished now */
 	lc->recovering_region = (uint64_t)-1;

 	if (pkg->in_sync) {
 		if (log_test_bit(lc->sync_bits, pkg->region)) {
 			LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
 				   "Region already set (%llu)",
 				   rq->seq, SHORT_UUID(lc->uuid), originator,
 				   (unsigned long long)pkg->region);
 		} else {
 			log_set_bit(lc, lc->sync_bits, pkg->region);
 			lc->sync_count++;

 			/* The rest of this section is all for debugging */
 			LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
 				   "Setting region (%llu)",
 				   rq->seq, SHORT_UUID(lc->uuid), originator,
 				   (unsigned long long)pkg->region);

 			/* The region we were warning about has now been set */
 			if (pkg->region == lc->skip_bit_warning)
 				lc->skip_bit_warning = lc->region_count;

 			/* Recovery has moved well past a region left unset */
 			if (pkg->region > (lc->skip_bit_warning + 5)) {
 				LOG_SPRINT(lc, "*** Region #%llu skipped during recovery ***",
 					  (unsigned long long)lc->skip_bit_warning);
 				lc->skip_bit_warning = lc->region_count;
 #ifdef DEBUG
 				kill(getpid(), SIGUSR1);
 #endif
 			}

 			/* Remember the preceding region if it is still unset */
 			if (!log_test_bit(lc->sync_bits,
 					  (pkg->region) ? pkg->region - 1 : 0)) {
 				LOG_SPRINT(lc, "*** Previous bit not set ***");
 				lc->skip_bit_warning = (pkg->region) ?
 					pkg->region - 1 : 0;
 			}
 		}
 	} else if (log_test_bit(lc->sync_bits, pkg->region)) {
 		lc->sync_count--;
 		log_clear_bit(lc, lc->sync_bits, pkg->region);
 		LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
 			   "Unsetting region (%llu)",
 			   rq->seq, SHORT_UUID(lc->uuid), originator,
 			   (unsigned long long)pkg->region);
 	}

 	/* The bitmap is authoritative: resynchronise the counter if needed */
 	bitmap_count = count_bits32(lc->sync_bits);
 	if (lc->sync_count != bitmap_count) {
 		LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
 			   "sync_count(%llu) != bitmap count(%llu)",
 			   rq->seq, SHORT_UUID(lc->uuid), originator,
 			   (unsigned long long)lc->sync_count, bitmap_count);
 #ifdef DEBUG
 		kill(getpid(), SIGUSR1);
 #endif
 		lc->sync_count = bitmap_count;
 	}

 	if (lc->sync_count > lc->region_count)
 		LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
 			   "(lc->sync_count > lc->region_count) - this is bad",
 			   rq->seq, SHORT_UUID(lc->uuid), originator);

 	if (lc->sync_count == lc->region_count)
 		lc->in_sync = 1;

 	rq->data_size = 0;
 	return 0;
 }

 /*
  * clog_get_sync_count
  * @rq: request; on success rq->data receives the uint64_t sync count
  * @originator: id of the requesting node (only recorded in the history)
  *
  * The log is looked up with get_log() and, failing that, with
  * get_pending_log().
  *
  * FIXME: Mirror requires us to be able to ask for
  * the sync count while pending... but I don't like
  * it because other machines may not be suspended and
  * the stored value may not be accurate.
  *
  * The value placed in the reply is lc->sync_count as found on entry.
  * If it differs from the number of bits set in lc->sync_bits, the stored
  * counter is corrected only after the reply has been filled in.
  *
  * Returns: 0 on success, -EINVAL if no matching log exists
  */
 static int clog_get_sync_count(struct dm_ulog_request *rq, uint32_t originator)
 {
 	struct log_c *lc;
 	uint64_t *reply = (uint64_t *)rq->data;
 	unsigned long long bitmap_count;

 	lc = get_log(rq->uuid, rq->luid);
 	if (!lc) {
 		lc = get_pending_log(rq->uuid, rq->luid);
 		if (!lc)
 			return -EINVAL;
 	}

 	*reply = lc->sync_count;
 	rq->data_size = sizeof(*reply);

 	bitmap_count = count_bits32(lc->sync_bits);
 	if (lc->sync_count == bitmap_count)
 		return 0;

 	/* Counter and bitmap disagree: record it and trust the bitmap */
 	LOG_SPRINT(lc, "get_sync_count - SEQ#=%u, UUID=%s, nodeid = %u:: "
 		   "sync_count(%llu) != bitmap count(%llu)",
 		   rq->seq, SHORT_UUID(lc->uuid), originator,
 		   (unsigned long long)lc->sync_count, bitmap_count);
 #ifdef DEBUG
 	kill(getpid(), SIGUSR1);
 #endif
 	lc->sync_count = bitmap_count;

 	return 0;
 }

 /*
  * core_status_info
  * @lc: unused
  * @rq: request whose data area receives the status text
  *
  * Writes the fixed string "1 clustered-core" into rq->data and sets
  * rq->data_size to its length (not counting the terminating NUL).
  *
  * Returns: 0 on success, or the negative value returned by sprintf
  */
 static int core_status_info(struct log_c *lc __attribute__((unused)), struct dm_ulog_request *rq)
 {
 	const int len = sprintf((char *)rq->data, "1 clustered-core");

 	if (len < 0)
 		return len;

 	rq->data_size = len;

 	return 0;
 }

 static int disk_status_info(struct log_c *lc, struct dm_ulog_request *rq)
 {
 	int r;
 	char *data = (char *)rq->data;
 	struct stat statbuf;

 	if (fstat(lc->disk_fd, &statbuf)) {
 		rq->error = -errno;
 		return -errno;
 	}

 	r = sprintf(data, "3 clustered-disk %d:%d %c",
 		    major(statbuf.st_rdev), minor(statbuf.st_rdev),
 		    (lc->log_dev_failed) ? 'D' : 'A');
 	if (r < 0)
 		return r;

 	rq->data_size = r;

 	return 0;
 }

 /*
  * clog_status_info
  * @rq: request identifying the log; receives the status text
  *
  * Finds the log with get_log(), falling back to get_pending_log(), and
  * produces its "info" status: the core variant when lc->disk_fd is -1,
  * the disk variant otherwise.
  *
  * Returns: -EINVAL if no log matches, else the result of the variant
  * that was called
  */
 static int clog_status_info(struct dm_ulog_request *rq)
 {
 	struct log_c *lc;

 	lc = get_log(rq->uuid, rq->luid);
 	if (!lc) {
 		lc = get_pending_log(rq->uuid, rq->luid);
 		if (!lc)
 			return -EINVAL;
 	}

 	if (lc->disk_fd != -1)
 		return disk_status_info(lc, rq);

 	return core_status_info(lc, rq);
 }

 /*
  * core_status_table
  * @lc: log to describe
  * @rq: request whose data area receives the table text
  *
  * Writes "clustered-core <region_size> [nosync |sync ][block_on_error] "
  * into rq->data.  The sync word is omitted when lc->sync is DEFAULTSYNC;
  * "block_on_error" is present only when lc->block_on_error is set.
  * rq->data_size is set to the length of the text.
  *
  * Returns: 0 on success, or the negative value returned by sprintf
  */
 static int core_status_table(struct log_c *lc, struct dm_ulog_request *rq)
 {
 	const char *sync_arg;
 	const char *error_arg;
 	int len;

 	if (lc->sync == DEFAULTSYNC)
 		sync_arg = "";
 	else if (lc->sync == NOSYNC)
 		sync_arg = "nosync ";
 	else
 		sync_arg = "sync ";

 	error_arg = lc->block_on_error ? "block_on_error" : "";

 	len = sprintf((char *)rq->data, "clustered-core %u %s%s ",
 		      lc->region_size, sync_arg, error_arg);
 	if (len < 0)
 		return len;

 	rq->data_size = len;
 	return 0;
 }

 static int disk_status_table(struct log_c *lc, struct dm_ulog_request *rq)
 {
 	int r;
 	char *data = (char *)rq->data;
 	struct stat statbuf;

 	if (fstat(lc->disk_fd, &statbuf)) {
 		rq->error = -errno;
 		return -errno;
 	}

 	r = sprintf(data, "clustered-disk %d:%d %u %s%s ",
 		    major(statbuf.st_rdev), minor(statbuf.st_rdev),
 		    lc->region_size,
 		    (lc->sync == DEFAULTSYNC) ? "" :
 		    (lc->sync == NOSYNC) ? "nosync " : "sync ",
 		    (lc->block_on_error) ? "block_on_error" : "");
 	if (r < 0)
 		return r;

 	rq->data_size = r;
 	return 0;
 }

 /*
  * clog_status_table
  * @rq: request identifying the log; receives the table text
  *
  * Finds the log with get_log(), falling back to get_pending_log(), and
  * produces its "table" status: the core variant when lc->disk_fd is -1,
  * the disk variant otherwise.
  *
  * Returns: -EINVAL if no log matches, else the result of the variant
  * that was called
  */
 static int clog_status_table(struct dm_ulog_request *rq)
 {
 	struct log_c *lc;

 	lc = get_log(rq->uuid, rq->luid);
 	if (!lc) {
 		lc = get_pending_log(rq->uuid, rq->luid);
 		if (!lc)
 			return -EINVAL;
 	}

 	if (lc->disk_fd != -1)
 		return disk_status_table(lc, rq);

 	return core_status_table(lc, rq);
 }

 /*
  * clog_is_remote_recovering
  * @rq: request; rq->data carries the queried region (uint64_t) on entry
  *      and is overwritten with
  *      { int64_t is_recovering; uint64_t in_sync_hint } on success
  *
  * Reports whether 'region' still has to be recovered (its bit in
  * lc->sync_bits is clear) unless recovery is halted, in which case
  * nothing is reported as recovering.  A region that is waiting for
  * recovery, and is not the one currently being recovered, is put on
  * lc->recovery_request_list so it can be prioritised.
  *
  * Returns: 0 on success, -EINVAL if the log cannot be found or 'region'
  * is not below lc->region_count
  */
 static int clog_is_remote_recovering(struct dm_ulog_request *rq)
 {
 	uint64_t *region_p = (uint64_t *)rq->data;
 	/* Copy the input out first: 'pkg' below overlays the same buffer */
 	uint64_t region = *region_p;
 	struct {
 		int64_t is_recovering;
 		uint64_t in_sync_hint;
 	} *pkg = (void *)rq->data;
 	struct log_c *lc = get_log(rq->uuid, rq->luid);

 	if (!lc)
 		return -EINVAL;

 	/*
 	 * Valid regions are 0 .. region_count - 1.  Letting region_count
 	 * itself through would test a bit past the last region and could
 	 * queue a recovery request for a region that does not exist.
 	 */
 	if (region >= lc->region_count)
 		return -EINVAL;

 	if (lc->recovery_halted) {
 		LOG_DBG("[%s] Recovery halted... [not remote recovering]: %llu",
 			SHORT_UUID(lc->uuid), (unsigned long long)region);
 		pkg->is_recovering = 0;
 		pkg->in_sync_hint = lc->region_count; /* none are recovering */
 	} else {
 		pkg->is_recovering = !log_test_bit(lc->sync_bits, region);

 		/*
 		 * Remember, 'lc->sync_search' is 1 plus the region
 		 * currently being recovered.  So, we must take off 1
 		 * to account for that; but only if 'sync_search' is
 		 * non-zero.
 		 */
 		pkg->in_sync_hint = lc->sync_search ? (lc->sync_search - 1) : 0;
 		LOG_DBG("[%s] Region is %s: %llu",
 			SHORT_UUID(lc->uuid),
 			(region == lc->recovering_region) ?
 			"currently remote recovering" :
 			(pkg->is_recovering) ? "pending remote recovery" :
 			"not remote recovering", (unsigned long long)region);
 	}

 	if (pkg->is_recovering &&
 	    (region != lc->recovering_region)) {
 		struct recovery_request *rr;

 		/* Already in the list? */
 		for (rr = lc->recovery_request_list; rr; rr = rr->next)
 			if (rr->region == region)
 				goto out;

 		/* Failure to allocate simply means we can't prioritize it */
 		rr = malloc(sizeof(*rr));
 		if (!rr)
 			goto out;

 		LOG_DBG("[%s] Adding region to priority list: %llu",
 			SHORT_UUID(lc->uuid), (unsigned long long)region);
 		rr->region = region;
 		rr->next = lc->recovery_request_list;
 		lc->recovery_request_list = rr;
 	}

 out:

 	rq->data_size = sizeof(*pkg);

 	return 0;
 }


 /*
  * dispatch_ulog_request
  * @rq: the request (must not be NULL)
  * @server: passed on to the flush handler
  *
  * Calls the handler that matches rq->u_rq.request_type.  An unknown
  * type is logged and recorded as -EINVAL in rq->u_rq.error.
  *
  * Returns: the value returned by the selected handler, or -EINVAL for
  * an unknown request type
  */
 static int dispatch_ulog_request(struct clog_request *rq, int server)
 {
 	switch (rq->u_rq.request_type) {
 	case DM_ULOG_CTR:
 		return clog_ctr(&rq->u_rq);
 	case DM_ULOG_DTR:
 		return clog_dtr(&rq->u_rq);
 	case DM_ULOG_PRESUSPEND:
 		return clog_presuspend(&rq->u_rq);
 	case DM_ULOG_POSTSUSPEND:
 		return clog_postsuspend(&rq->u_rq);
 	case DM_ULOG_RESUME:
 		return clog_resume(&rq->u_rq);
 	case DM_ULOG_GET_REGION_SIZE:
 		return clog_get_region_size(&rq->u_rq);
 	case DM_ULOG_IS_CLEAN:
 		return clog_is_clean(&rq->u_rq);
 	case DM_ULOG_IN_SYNC:
 		return clog_in_sync(&rq->u_rq);
 	case DM_ULOG_FLUSH:
 		return clog_flush(&rq->u_rq, server);
 	case DM_ULOG_MARK_REGION:
 		return clog_mark_region(&rq->u_rq, rq->originator);
 	case DM_ULOG_CLEAR_REGION:
 		return clog_clear_region(&rq->u_rq, rq->originator);
 	case DM_ULOG_GET_RESYNC_WORK:
 		return clog_get_resync_work(&rq->u_rq, rq->originator);
 	case DM_ULOG_SET_REGION_SYNC:
 		return clog_set_region_sync(&rq->u_rq, rq->originator);
 	case DM_ULOG_GET_SYNC_COUNT:
 		return clog_get_sync_count(&rq->u_rq, rq->originator);
 	case DM_ULOG_STATUS_INFO:
 		return clog_status_info(&rq->u_rq);
 	case DM_ULOG_STATUS_TABLE:
 		return clog_status_table(&rq->u_rq);
 	case DM_ULOG_IS_REMOTE_RECOVERING:
 		return clog_is_remote_recovering(&rq->u_rq);
 	default:
 		break;
 	}

 	LOG_ERROR("Unknown request");
 	rq->u_rq.error = -EINVAL;

 	return rq->u_rq.error;
 }

 /*
  * do_request
  * @rq: the request
  * @server: is this request performed by the server
  *
  * Runs the handler for the request type and reports the outcome in
  * 'rq->u_rq.error': when a handler returns non-zero without having set
  * that field itself, its return value is stored there.  If an error is
  * recorded, any pending 'rq->u_rq.data_size' is cleared so that no
  * payload accompanies a failed request.
  *
  * 'rq' (or more correctly, rq->u_rq.data) should be of sufficient
  * size to hold any returning data.  Currently, local.c uses 2kiB
  * to hold 'rq' - leaving ~1.5kiB for return data... more than
  * enough for all the implemented functions here.
  *
  * Returns: 0 in every case (a NULL 'rq' is ignored); failures are
  * only visible through rq->u_rq.error
  */
 int do_request(struct clog_request *rq, int server)
 {
 	int rc;

 	if (!rq)
 		return 0;

 	if (rq->u_rq.error)
 		LOG_DBG("Programmer error: rq struct has error set");

 	rc = dispatch_ulog_request(rq, server);

 	if (rc != 0 && rq->u_rq.error == 0) {
 		rq->u_rq.error = rc;
 	} else if (rq->u_rq.error != rc) {
 		LOG_DBG("Warning:  error from function != rq->u_rq.error");
 	}

 	if (rq->u_rq.error && rq->u_rq.data_size) {
 		/* Make sure I'm handling errors correctly above */
 		LOG_DBG("Programmer error: rq->u_rq.error && rq->u_rq.data_size");
 		rq->u_rq.data_size = 0;
 	}

 	return 0;
 }

 /*
  * print_bits
  * @bs: bitset to dump; the first word is read as the number of bits and
  *      the bitmap bytes are taken from the words that follow it
  * @print: non-zero sends the output to LOG_PRINT, zero to LOG_DBG
  *
  * Dumps the bitmap as hex, 16 bytes per output line, each line prefixed
  * with the range of byte indexes it covers.  Nothing is printed for an
  * empty bitset.
  */
 static void print_bits(dm_bitset_t bs, int print)
 {
 	const unsigned char *bytes = (const unsigned char *)(bs + 1);
 	char line[128];
 	int nbytes, row, i, pos;

 	/* Number of bits rounded up to whole bytes */
 	nbytes = (*bs / 8) + ((*bs % 8) ? 1 : 0);

 	for (row = 0; row < nbytes; row += 16) {
 		memset(line, 0, sizeof(line));
 		pos = sprintf(line, "[%3d - %3d]", row, row + 15);

 		for (i = row; (i < nbytes) && (i < row + 16); i++)
 			pos += sprintf(line + pos, " %.2X", bytes[i]);

 		if (print)
 			LOG_PRINT("%s", line);
 		else
 			LOG_DBG("%s", line);
 	}
 }

 /*
  * push_state
  * @uuid: uuid of the log
  * @luid: luid of the log
  * @which: "recovering_region", "sync_bits" or "clean_bits"
  * @buf: on success, *buf points to a newly malloc'ed buffer holding the
  *       requested state; the caller takes ownership.  *buf is expected
  *       to be NULL on entry (a non-NULL value is logged and overwritten
  *       by any successful call).
  * @debug_who: node id, only recorded in the history log
  *
  * "recovering_region" yields a 64-byte buffer containing the text
  * "<recovering_region> <recoverer>".  The two bitmap selectors yield a
  * raw copy of the corresponding bitmap words.
  *
  * Returns: size of *buf in bytes on success; -EINVAL if the log is not
  * found or 'which' is not recognised; -ENOMEM on allocation failure
  */
 int push_state(const char *uuid, uint64_t luid,
 	       const char *which, char **buf, uint32_t debug_who)
 {
 	int bitset_size;
 	int want_sync, want_clean;
 	struct log_c *lc;

 	if (*buf)
 		LOG_ERROR("store_bits: *buf != NULL");

 	lc = get_log(uuid, luid);
 	if (!lc) {
 		LOG_ERROR("store_bits: No log found for %s", uuid);
 		return -EINVAL;
 	}

 	if (!strcmp(which, "recovering_region")) {
 		*buf = malloc(64); /* easily handles the 2 written numbers */
 		if (!*buf)
 			return -ENOMEM;
 		snprintf(*buf, 64, "%llu %u",
 			 (unsigned long long)lc->recovering_region,
 			 lc->recoverer);

 		LOG_SPRINT(lc, "CKPT SEND - SEQ#=X, UUID=%s, nodeid = %u:: "
 			   "recovering_region=%llu, recoverer=%u, sync_count=%llu",
 			   SHORT_UUID(lc->uuid), debug_who,
 			   (unsigned long long)lc->recovering_region,
 			   lc->recoverer,
 			   (unsigned long long)count_bits32(lc->sync_bits));
 		return 64;
 	}

 	/*
 	 * Settle which bitmap is wanted before allocating, so that an
 	 * unrecognised selector can never hand back an uninitialised
 	 * buffer with a positive size.
 	 */
 	want_sync = !strncmp(which, "sync_bits", sizeof("sync_bits") - 1);
 	want_clean = !want_sync &&
 		!strncmp(which, "clean_bits", sizeof("clean_bits") - 1);
 	if (!want_sync && !want_clean) {
 		LOG_ERROR("store_bits: Unknown state requested: %s", which);
 		return -EINVAL;
 	}

 	/* Size in 'int's */
 	bitset_size = (*(lc->clean_bits) / DM_BITS_PER_INT) + 1;

 	/* Size in bytes */
 	bitset_size *= 4;

 	*buf = malloc(bitset_size);

 	if (!*buf) {
 		LOG_ERROR("store_bits: Unable to allocate memory");
 		return -ENOMEM;
 	}

 	if (want_sync) {
 		memcpy(*buf, lc->sync_bits + 1, bitset_size);

 		LOG_DBG("[%s] storing sync_bits (sync_count = %llu):",
 			SHORT_UUID(uuid), (unsigned long long)
 			count_bits32(lc->sync_bits));

 		print_bits(lc->sync_bits, 0);
 	} else {
 		memcpy(*buf, lc->clean_bits + 1, bitset_size);

 		LOG_DBG("[%s] storing clean_bits:", SHORT_UUID(lc->uuid));

 		print_bits(lc->clean_bits, 0);
 	}

 	return bitset_size;
 }

 /*
  * pull_state
  * @uuid: uuid of the log
  * @luid: luid of the log
  * @which: "recovering_region", "sync_bits" or "clean_bits"
  * @buf: state to load; for "recovering_region" a NUL-terminated string
  *       "<recovering_region> <recoverer>", otherwise raw bitmap words
  * @size: size of 'buf' in bytes; must equal the local bitmap size for
  *        the two bitmap selectors (not examined for "recovering_region")
  *
  * Loads previously exported state into the log.  Loading "sync_bits"
  * adds 1 to lc->resume_override and loading "clean_bits" adds 2.
  *
  * Returns: 0 on success; -EINVAL if 'buf' is NULL, the log is not found,
  * 'which' is not recognised, the text cannot be parsed or the bitmap
  * size does not match
  */
 int pull_state(const char *uuid, uint64_t luid,
 	       const char *which, char *buf, int size)
 {
 	int bitset_size;
 	int want_sync, want_clean;
 	struct log_c *lc;

 	if (!buf) {
 		LOG_ERROR("pull_state: buf == NULL");
 		return -EINVAL;
 	}

 	lc = get_log(uuid, luid);
 	if (!lc) {
 		LOG_ERROR("pull_state: No log found for %s", uuid);
 		return -EINVAL;
 	}

 	if (!strncmp(which, "recovering_region", 17)) {
 		unsigned long long region;
 		unsigned int recoverer;

 		/*
 		 * Parse into temporaries of exactly the types the format
 		 * expects, and only commit once both fields were read, so
 		 * a malformed buffer leaves the log untouched.
 		 */
 		if (sscanf(buf, "%llu %u", &region, &recoverer) != 2) {
 			LOG_ERROR("cannot parse recovering region from: %s", buf);
 			return -EINVAL;
 		}
 		lc->recovering_region = region;
 		lc->recoverer = recoverer;

 		LOG_SPRINT(lc, "CKPT INIT - SEQ#=X, UUID=%s, nodeid = X:: "
 			   "recovering_region=%llu, recoverer=%u",
 			   SHORT_UUID(lc->uuid),
 			   (unsigned long long)lc->recovering_region, lc->recoverer);
 		return 0;
 	}

 	/* Anything else must name one of the two bitmaps */
 	want_sync = !strncmp(which, "sync_bits", sizeof("sync_bits") - 1);
 	want_clean = !want_sync &&
 		!strncmp(which, "clean_bits", sizeof("clean_bits") - 1);
 	if (!want_sync && !want_clean) {
 		LOG_ERROR("pull_state: Unknown state supplied: %s", which);
 		return -EINVAL;
 	}

 	/* Size in 'int's */
 	bitset_size = (*(lc->clean_bits) /DM_BITS_PER_INT) + 1;

 	/* Size in bytes */
 	bitset_size *= 4;

 	if (bitset_size != size) {
 		LOG_ERROR("pull_state(%s): bad bitset_size (%d vs %d)",
 			  which, size, bitset_size);
 		return -EINVAL;
 	}

 	if (want_sync) {
 		lc->resume_override += 1;
 		memcpy(lc->sync_bits + 1, buf, bitset_size);

 		LOG_DBG("[%s] loading sync_bits (sync_count = %llu):",
 			SHORT_UUID(lc->uuid),(unsigned long long)
 			count_bits32(lc->sync_bits));

 		print_bits(lc->sync_bits, 0);
 	} else {
 		lc->resume_override += 2;
 		memcpy(lc->clean_bits + 1, buf, bitset_size);

 		LOG_DBG("[%s] loading clean_bits:", SHORT_UUID(lc->uuid));

 		print_bits(lc->clean_bits, 0);
 	}

 	return 0;
 }

 /*
  * log_get_state
  * @rq: request identifying the log by uuid/luid
  *
  * Returns: the log's 'state' field converted to int, or -EINVAL when
  * get_log() finds no matching log
  */
 int log_get_state(struct dm_ulog_request *rq)
 {
 	struct log_c *lc = get_log(rq->uuid, rq->luid);

 	/* FIXME Callers are ignoring the -EINVAL case */
 	return lc ? (int)lc->state : -EINVAL;
 }

 /*
  * log_status
  *
  * Checks log_list first and log_pending_list only if the former is
  * empty.
  *
  * Returns: 1 if logs are still present, 0 otherwise
  */
 int log_status(void)
 {
 	return !(dm_list_empty(&log_list) && dm_list_empty(&log_pending_list));
 }

 void log_debug(void)
 {
 	struct log_c *lc;
 	uint64_t r;
 	int i;

 	LOG_ERROR("");
 	LOG_ERROR("LOG COMPONENT DEBUGGING::");
 	LOG_ERROR("Official log list:");
 	LOG_ERROR("Pending log list:");
 	dm_list_iterate_items(lc, &log_pending_list) {
 		LOG_ERROR("%s", lc->uuid);
 		LOG_ERROR("sync_bits:");
 		print_bits(lc->sync_bits, 1);
 		LOG_ERROR("clean_bits:");
 		print_bits(lc->clean_bits, 1);
 	}

 	dm_list_iterate_items(lc, &log_list) {
 		LOG_ERROR("%s", lc->uuid);
 		LOG_ERROR("  recoverer        : %" PRIu32, lc->recoverer);
 		LOG_ERROR("  recovering_region: %" PRIu64, lc->recovering_region);
 		LOG_ERROR("  recovery_halted  : %s", (lc->recovery_halted) ?
 			  "YES" : "NO");
 		LOG_ERROR("sync_bits:");
 		print_bits(lc->sync_bits, 1);
 		LOG_ERROR("clean_bits:");
 		print_bits(lc->clean_bits, 1);

 		LOG_ERROR("Validating %s::", SHORT_UUID(lc->uuid));
 		r = find_next_zero_bit(lc->sync_bits, 0);
 		LOG_ERROR("  lc->region_count = %" PRIu32, lc->region_count);
 		LOG_ERROR("  lc->sync_count = %" PRIu64, lc->sync_count);
 		LOG_ERROR("  next zero bit  = %" PRIu64, r);
 		if ((r > lc->region_count) ||
 		    ((r == lc->region_count) && (lc->sync_count > lc->region_count))) {
 			LOG_ERROR("ADJUSTING SYNC_COUNT");
 			lc->sync_count = lc->region_count;
 		}

 		LOG_ERROR("Resync request history:");
 		for (i = 0; i < RESYNC_HISTORY; i++) {
 			lc->idx++;
 			lc->idx = lc->idx % RESYNC_HISTORY;
 			if (lc->resync_history[lc->idx][0] == '\0')
 				continue;
 			LOG_ERROR("%d:%d) %s", i, lc->idx,
 				  lc->resync_history[lc->idx]);
 		}
 	}
 }