dect
/
linux-2.6
Archived
13
0
Fork 0

Merge branch 'for-3.8/drivers' of git://git.kernel.dk/linux-block

Pull block driver update from Jens Axboe:
 "Now that the core bits are in, here are the driver bits for 3.8.  The
  branch contains:

   - A huge pile of drbd bits that were dumped from the 3.7 merge
     window.  Following that, it was both made perfectly clear that
     there is going to be no more over-the-wall pulls and how the
     situation on individual pulls can be improved.

   - A few cleanups from Akinobu Mita for drbd and cciss.

   - Queue improvement for loop from Lukas.  This grew into adding a
     generic interface for waiting/checking an event with a specific
     lock, allowing this to be pulled out of md and now loop and drbd are
     also using it.

   - A few fixes for xen back/front block driver from Roger Pau Monne.

   - Partition improvements from Stephen Warren, allowing partition UUID
     to be used as an identifier."

* 'for-3.8/drivers' of git://git.kernel.dk/linux-block: (609 commits)
  drbd: update Kconfig to match current dependencies
  drbd: Fix drbdsetup wait-connect, wait-sync etc... commands
  drbd: close race between drbd_set_role and drbd_connect
  drbd: respect no-md-barriers setting also when changed online via disk-options
  drbd: Remove obsolete check
  drbd: fixup after wait_even_lock_irq() addition to generic code
  loop: Limit the number of requests in the bio list
  wait: add wait_event_lock_irq() interface
  xen-blkfront: free allocated page
  xen-blkback: move free persistent grants code
  block: partition: msdos: provide UUIDs for partitions
  init: reduce PARTUUID min length to 1 from 36
  block: store partition_meta_info.uuid as a string
  cciss: use check_signature()
  cciss: cleanup bitops usage
  drbd: use copy_highpage
  drbd: if the replication link breaks during handshake, keep retrying
  drbd: check return of kmalloc in receive_uuids
  drbd: Broadcast sync progress no more often than once per second
  drbd: don't try to clear bits once the disk has failed
  ...
This commit is contained in:
Linus Torvalds 2012-12-17 13:39:11 -08:00
commit 9228ff9038
49 changed files with 13021 additions and 8737 deletions

View File

@ -743,7 +743,6 @@ void __init printk_all_partitions(void)
struct hd_struct *part;
char name_buf[BDEVNAME_SIZE];
char devt_buf[BDEVT_SIZE];
char uuid_buf[PARTITION_META_INFO_UUIDLTH * 2 + 5];
/*
* Don't show empty devices or things that have been
@ -762,16 +761,11 @@ void __init printk_all_partitions(void)
while ((part = disk_part_iter_next(&piter))) {
bool is_part0 = part == &disk->part0;
uuid_buf[0] = '\0';
if (part->info)
snprintf(uuid_buf, sizeof(uuid_buf), "%pU",
part->info->uuid);
printk("%s%s %10llu %s %s", is_part0 ? "" : " ",
bdevt_str(part_devt(part), devt_buf),
(unsigned long long)part_nr_sects_read(part) >> 1
, disk_name(disk, part->partno, name_buf),
uuid_buf);
part->info ? part->info->uuid : "");
if (is_part0) {
if (disk->driverfs_dev != NULL &&
disk->driverfs_dev->driver != NULL)

View File

@ -620,7 +620,6 @@ int efi_partition(struct parsed_partitions *state)
gpt_entry *ptes = NULL;
u32 i;
unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
u8 unparsed_guid[37];
if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
kfree(gpt);
@ -649,11 +648,7 @@ int efi_partition(struct parsed_partitions *state)
state->parts[i + 1].flags = ADDPART_FLAG_RAID;
info = &state->parts[i + 1].info;
/* Instead of doing a manual swap to big endian, reuse the
* common ASCII hex format as the interim.
*/
efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid);
part_pack_uuid(unparsed_guid, info->uuid);
efi_guid_unparse(&ptes[i].unique_partition_guid, info->uuid);
/* Naively convert UTF16-LE to 7 bits. */
label_max = min(sizeof(info->volname) - 1,

View File

@ -94,6 +94,17 @@ static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
return ret;
}
/*
 * Fill in the partition meta info for @slot of an MS-DOS partition table.
 *
 * The uuid string is built as "<disksig as 8 hex digits>-<slot as 2 hex
 * digits>"; the volume name is left empty, and the slot is flagged as
 * carrying meta info.
 */
static void set_info(struct parsed_partitions *state, int slot,
		     u32 disksig)
{
	struct partition_meta_info *meta = &state->parts[slot].info;

	state->parts[slot].has_info = true;
	meta->volname[0] = 0;
	snprintf(meta->uuid, sizeof(meta->uuid), "%08x-%02x", disksig,
		 slot);
}
/*
* Create devices for each logical partition in an extended partition.
* The logical partitions form a linked list, with each entry being
@ -106,7 +117,8 @@ static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
*/
static void parse_extended(struct parsed_partitions *state,
sector_t first_sector, sector_t first_size)
sector_t first_sector, sector_t first_size,
u32 disksig)
{
struct partition *p;
Sector sect;
@ -166,6 +178,7 @@ static void parse_extended(struct parsed_partitions *state,
}
put_partition(state, state->next, next, size);
set_info(state, state->next, disksig);
if (SYS_IND(p) == LINUX_RAID_PARTITION)
state->parts[state->next].flags = ADDPART_FLAG_RAID;
loopct = 0;
@ -437,6 +450,7 @@ int msdos_partition(struct parsed_partitions *state)
struct partition *p;
struct fat_boot_sector *fb;
int slot;
u32 disksig;
data = read_part_sector(state, 0, &sect);
if (!data)
@ -491,6 +505,8 @@ int msdos_partition(struct parsed_partitions *state)
#endif
p = (struct partition *) (data + 0x1be);
disksig = le32_to_cpup((__le32 *)(data + 0x1b8));
/*
* Look for partitions in two passes:
* First find the primary and DOS-type extended partitions.
@ -515,11 +531,12 @@ int msdos_partition(struct parsed_partitions *state)
put_partition(state, slot, start, n);
strlcat(state->pp_buf, " <", PAGE_SIZE);
parse_extended(state, start, size);
parse_extended(state, start, size, disksig);
strlcat(state->pp_buf, " >", PAGE_SIZE);
continue;
}
put_partition(state, slot, start, size);
set_info(state, slot, disksig);
if (SYS_IND(p) == LINUX_RAID_PARTITION)
state->parts[slot].flags = ADDPART_FLAG_RAID;
if (SYS_IND(p) == DM6_PARTITION)

View File

@ -41,8 +41,9 @@
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/mutex.h>
#include <linux/bitmap.h>
#include <linux/io.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <linux/dma-mapping.h>
#include <linux/blkdev.h>
@ -978,8 +979,7 @@ static CommandList_struct *cmd_alloc(ctlr_info_t *h)
i = find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds);
if (i == h->nr_cmds)
return NULL;
} while (test_and_set_bit(i & (BITS_PER_LONG - 1),
h->cmd_pool_bits + (i / BITS_PER_LONG)) != 0);
} while (test_and_set_bit(i, h->cmd_pool_bits) != 0);
c = h->cmd_pool + i;
memset(c, 0, sizeof(CommandList_struct));
cmd_dma_handle = h->cmd_pool_dhandle + i * sizeof(CommandList_struct);
@ -1046,8 +1046,7 @@ static void cmd_free(ctlr_info_t *h, CommandList_struct *c)
int i;
i = c - h->cmd_pool;
clear_bit(i & (BITS_PER_LONG - 1),
h->cmd_pool_bits + (i / BITS_PER_LONG));
clear_bit(i, h->cmd_pool_bits);
h->nr_frees++;
}
@ -4268,10 +4267,7 @@ static void __devinit cciss_find_board_params(ctlr_info_t *h)
static inline bool CISS_signature_present(ctlr_info_t *h)
{
if ((readb(&h->cfgtable->Signature[0]) != 'C') ||
(readb(&h->cfgtable->Signature[1]) != 'I') ||
(readb(&h->cfgtable->Signature[2]) != 'S') ||
(readb(&h->cfgtable->Signature[3]) != 'S')) {
if (!check_signature(h->cfgtable->Signature, "CISS", 4)) {
dev_warn(&h->pdev->dev, "not a valid CISS config table\n");
return false;
}
@ -4812,8 +4808,7 @@ static __devinit int cciss_init_reset_devices(struct pci_dev *pdev)
static __devinit int cciss_allocate_cmd_pool(ctlr_info_t *h)
{
h->cmd_pool_bits = kmalloc(
DIV_ROUND_UP(h->nr_cmds, BITS_PER_LONG) *
h->cmd_pool_bits = kmalloc(BITS_TO_LONGS(h->nr_cmds) *
sizeof(unsigned long), GFP_KERNEL);
h->cmd_pool = pci_alloc_consistent(h->pdev,
h->nr_cmds * sizeof(CommandList_struct),
@ -5068,9 +5063,7 @@ reinit_after_soft_reset:
pci_set_drvdata(pdev, h);
/* command and error info recs zeroed out before
they are used */
memset(h->cmd_pool_bits, 0,
DIV_ROUND_UP(h->nr_cmds, BITS_PER_LONG)
* sizeof(unsigned long));
bitmap_zero(h->cmd_pool_bits, h->nr_cmds);
h->num_luns = 0;
h->highest_lun = -1;

View File

@ -2,13 +2,14 @@
# DRBD device driver configuration
#
comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected"
depends on PROC_FS='n' || INET='n' || CONNECTOR='n'
comment "DRBD disabled because PROC_FS or INET not selected"
depends on PROC_FS='n' || INET='n'
config BLK_DEV_DRBD
tristate "DRBD Distributed Replicated Block Device support"
depends on PROC_FS && INET && CONNECTOR
depends on PROC_FS && INET
select LRU_CACHE
select LIBCRC32C
default n
help
@ -58,7 +59,8 @@ config DRBD_FAULT_INJECTION
32 data read
64 read ahead
128 kmalloc of bitmap
256 allocation of EE (epoch_entries)
256 allocation of peer_requests
512 insert data corruption on receiving side
fault_devs: bitmask of minor numbers
fault_rate: frequency in percent

View File

@ -1,5 +1,7 @@
drbd-y := drbd_bitmap.o drbd_proc.o
drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
drbd-y += drbd_interval.o drbd_state.o
drbd-y += drbd_nla.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o

File diff suppressed because it is too large Load Diff

View File

@ -119,13 +119,9 @@ static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
if (!__ratelimit(&drbd_ratelimit_state))
return;
dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
current == mdev->receiver.task ? "receiver" :
current == mdev->asender.task ? "asender" :
current == mdev->worker.task ? "worker" : current->comm,
func, b->bm_why ?: "?",
b->bm_task == mdev->receiver.task ? "receiver" :
b->bm_task == mdev->asender.task ? "asender" :
b->bm_task == mdev->worker.task ? "worker" : "?");
drbd_task_to_thread_name(mdev->tconn, current),
func, b->bm_why ?: "?",
drbd_task_to_thread_name(mdev->tconn, b->bm_task));
}
void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags)
@ -142,13 +138,9 @@ void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags)
if (trylock_failed) {
dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
current == mdev->receiver.task ? "receiver" :
current == mdev->asender.task ? "asender" :
current == mdev->worker.task ? "worker" : current->comm,
why, b->bm_why ?: "?",
b->bm_task == mdev->receiver.task ? "receiver" :
b->bm_task == mdev->asender.task ? "asender" :
b->bm_task == mdev->worker.task ? "worker" : "?");
drbd_task_to_thread_name(mdev->tconn, current),
why, b->bm_why ?: "?",
drbd_task_to_thread_name(mdev->tconn, b->bm_task));
mutex_lock(&b->bm_change);
}
if (BM_LOCKED_MASK & b->bm_flags)
@ -196,6 +188,9 @@ void drbd_bm_unlock(struct drbd_conf *mdev)
/* to mark for lazy writeout once syncer cleared all clearable bits,
* we if bits have been cleared since last IO. */
#define BM_PAGE_LAZY_WRITEOUT 28
/* pages marked with this "HINT" will be considered for writeout
* on activity log transactions */
#define BM_PAGE_HINT_WRITEOUT 27
/* store_page_idx uses non-atomic assignment. It is only used directly after
* allocating the page. All other bm_set_page_* and bm_clear_page_* need to
@ -227,8 +222,7 @@ static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr)
{
struct drbd_bitmap *b = mdev->bitmap;
void *addr = &page_private(b->bm_pages[page_nr]);
clear_bit(BM_PAGE_IO_LOCK, addr);
smp_mb__after_clear_bit();
clear_bit_unlock(BM_PAGE_IO_LOCK, addr);
wake_up(&mdev->bitmap->bm_io_wait);
}
@ -246,6 +240,27 @@ static void bm_set_page_need_writeout(struct page *page)
set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
}
/**
 * drbd_bm_mark_for_writeout() - flag a bitmap page with the writeout "hint"
 * @mdev:	DRBD device.
 * @page_nr:	index of the bitmap page to flag
 *
 * Meant to be used from within an activity log transaction: a few pages get
 * flagged here, and a following drbd_bm_write_hinted() then only writes out
 * changed pages that carry this flag.  A @page_nr that is not below the
 * number of bitmap pages is warned about and otherwise ignored.
 */
void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr)
{
	if (page_nr < mdev->bitmap->bm_number_of_pages) {
		set_bit(BM_PAGE_HINT_WRITEOUT,
			&page_private(mdev->bitmap->bm_pages[page_nr]));
		return;
	}

	dev_warn(DEV, "BAD: page_nr: %u, number_of_pages: %u\n",
		 page_nr, (int)mdev->bitmap->bm_number_of_pages);
}
static int bm_test_page_unchanged(struct page *page)
{
volatile const unsigned long *addr = &page_private(page);
@ -373,14 +388,16 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
return old_pages;
/* Trying kmalloc first, falling back to vmalloc.
* GFP_KERNEL is ok, as this is done when a lower level disk is
* "attached" to the drbd. Context is receiver thread or cqueue
* thread. As we have no disk yet, we are not in the IO path,
* not even the IO path of the peer. */
* GFP_NOIO, as this is called while drbd IO is "suspended",
* and during resize or attach on diskless Primary,
* we must not block on IO to ourselves.
* Context is receiver thread or dmsetup. */
bytes = sizeof(struct page *)*want;
new_pages = kzalloc(bytes, GFP_KERNEL);
new_pages = kzalloc(bytes, GFP_NOIO);
if (!new_pages) {
new_pages = vzalloc(bytes);
new_pages = __vmalloc(bytes,
GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO,
PAGE_KERNEL);
if (!new_pages)
return NULL;
vmalloced = 1;
@ -390,7 +407,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
for (i = 0; i < have; i++)
new_pages[i] = old_pages[i];
for (; i < want; i++) {
page = alloc_page(GFP_HIGHUSER);
page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
if (!page) {
bm_free_pages(new_pages + have, i - have);
bm_vk_free(new_pages, vmalloced);
@ -439,7 +456,8 @@ int drbd_bm_init(struct drbd_conf *mdev)
sector_t drbd_bm_capacity(struct drbd_conf *mdev)
{
ERR_IF(!mdev->bitmap) return 0;
if (!expect(mdev->bitmap))
return 0;
return mdev->bitmap->bm_dev_capacity;
}
@ -447,7 +465,8 @@ sector_t drbd_bm_capacity(struct drbd_conf *mdev)
*/
void drbd_bm_cleanup(struct drbd_conf *mdev)
{
ERR_IF (!mdev->bitmap) return;
if (!expect(mdev->bitmap))
return;
bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags));
kfree(mdev->bitmap);
@ -610,7 +629,8 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
int err = 0, growing;
int opages_vmalloced;
ERR_IF(!b) return -ENOMEM;
if (!expect(b))
return -ENOMEM;
drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK);
@ -732,8 +752,10 @@ unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
unsigned long s;
unsigned long flags;
ERR_IF(!b) return 0;
ERR_IF(!b->bm_pages) return 0;
if (!expect(b))
return 0;
if (!expect(b->bm_pages))
return 0;
spin_lock_irqsave(&b->bm_lock, flags);
s = b->bm_set;
@ -756,8 +778,10 @@ unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)
size_t drbd_bm_words(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
ERR_IF(!b) return 0;
ERR_IF(!b->bm_pages) return 0;
if (!expect(b))
return 0;
if (!expect(b->bm_pages))
return 0;
return b->bm_words;
}
@ -765,7 +789,8 @@ size_t drbd_bm_words(struct drbd_conf *mdev)
unsigned long drbd_bm_bits(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
ERR_IF(!b) return 0;
if (!expect(b))
return 0;
return b->bm_bits;
}
@ -786,8 +811,10 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
end = offset + number;
ERR_IF(!b) return;
ERR_IF(!b->bm_pages) return;
if (!expect(b))
return;
if (!expect(b->bm_pages))
return;
if (number == 0)
return;
WARN_ON(offset >= b->bm_words);
@ -831,8 +858,10 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
end = offset + number;
ERR_IF(!b) return;
ERR_IF(!b->bm_pages) return;
if (!expect(b))
return;
if (!expect(b->bm_pages))
return;
spin_lock_irq(&b->bm_lock);
if ((offset >= b->bm_words) ||
@ -860,8 +889,10 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
void drbd_bm_set_all(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
ERR_IF(!b) return;
ERR_IF(!b->bm_pages) return;
if (!expect(b))
return;
if (!expect(b->bm_pages))
return;
spin_lock_irq(&b->bm_lock);
bm_memset(b, 0, 0xff, b->bm_words);
@ -874,8 +905,10 @@ void drbd_bm_set_all(struct drbd_conf *mdev)
void drbd_bm_clear_all(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
ERR_IF(!b) return;
ERR_IF(!b->bm_pages) return;
if (!expect(b))
return;
if (!expect(b->bm_pages))
return;
spin_lock_irq(&b->bm_lock);
bm_memset(b, 0, 0, b->bm_words);
@ -889,7 +922,8 @@ struct bm_aio_ctx {
unsigned int done;
unsigned flags;
#define BM_AIO_COPY_PAGES 1
#define BM_WRITE_ALL_PAGES 2
#define BM_AIO_WRITE_HINTED 2
#define BM_WRITE_ALL_PAGES 4
int error;
struct kref kref;
};
@ -977,17 +1011,11 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
bm_set_page_unchanged(b->bm_pages[page_nr]);
if (ctx->flags & BM_AIO_COPY_PAGES) {
void *src, *dest;
page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT);
dest = kmap_atomic(page);
src = kmap_atomic(b->bm_pages[page_nr]);
memcpy(dest, src, PAGE_SIZE);
kunmap_atomic(src);
kunmap_atomic(dest);
copy_highpage(page, b->bm_pages[page_nr]);
bm_store_page_idx(page, page_nr);
} else
page = b->bm_pages[page_nr];
bio->bi_bdev = mdev->ldev->md_bdev;
bio->bi_sector = on_disk_sector;
/* bio_add_page of a single page to an empty bio will always succeed,
@ -1060,6 +1088,11 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w
if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
break;
if (rw & WRITE) {
if ((flags & BM_AIO_WRITE_HINTED) &&
!test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
&page_private(b->bm_pages[i])))
continue;
if (!(flags & BM_WRITE_ALL_PAGES) &&
bm_test_page_unchanged(b->bm_pages[i])) {
dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
@ -1088,13 +1121,15 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w
* "in_flight reached zero, all done" event.
*/
if (!atomic_dec_and_test(&ctx->in_flight))
wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);
wait_until_done_or_force_detached(mdev, mdev->ldev, &ctx->done);
else
kref_put(&ctx->kref, &bm_aio_ctx_destroy);
dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
rw == WRITE ? "WRITE" : "READ",
count, jiffies - now);
/* summary for global bitmap IO */
if (flags == 0)
dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
rw == WRITE ? "WRITE" : "READ",
count, jiffies - now);
if (ctx->error) {
dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
@ -1103,7 +1138,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w
}
if (atomic_read(&ctx->in_flight))
err = -EIO; /* Disk failed during IO... */
err = -EIO; /* Disk timeout/force-detach during IO... */
now = jiffies;
if (rw == WRITE) {
@ -1115,8 +1150,9 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w
}
now = b->bm_set;
dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
if (flags == 0)
dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
kref_put(&ctx->kref, &bm_aio_ctx_destroy);
return err;
@ -1179,9 +1215,17 @@ int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local)
return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0);
}
/**
 * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed.
 * @mdev: DRBD device.
 *
 * Thin wrapper around bm_rw() for a WRITE with both BM_AIO_WRITE_HINTED and
 * BM_AIO_COPY_PAGES set and no lazy-writeout upper index (0).
 *
 * Returns whatever bm_rw() returns.
 */
int drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local)
{
return bm_rw(mdev, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
}
/**
* drbd_bm_write_page: Writes a PAGE_SIZE aligned piece of bitmap
* drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap
* @mdev: DRBD device.
* @idx: bitmap page index
*
@ -1222,11 +1266,11 @@ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(loc
}
bm_page_io_async(ctx, idx, WRITE_SYNC);
wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);
wait_until_done_or_force_detached(mdev, mdev->ldev, &ctx->done);
if (ctx->error)
drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
/* that should force detach, so the in memory bitmap will be
/* that causes us to detach, so the in memory bitmap will be
* gone in a moment as well. */
mdev->bm_writ_cnt++;
@ -1289,8 +1333,10 @@ static unsigned long bm_find_next(struct drbd_conf *mdev,
struct drbd_bitmap *b = mdev->bitmap;
unsigned long i = DRBD_END_OF_BITMAP;
ERR_IF(!b) return i;
ERR_IF(!b->bm_pages) return i;
if (!expect(b))
return i;
if (!expect(b->bm_pages))
return i;
spin_lock_irq(&b->bm_lock);
if (BM_DONT_TEST & b->bm_flags)
@ -1391,8 +1437,10 @@ static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
struct drbd_bitmap *b = mdev->bitmap;
int c = 0;
ERR_IF(!b) return 1;
ERR_IF(!b->bm_pages) return 0;
if (!expect(b))
return 1;
if (!expect(b->bm_pages))
return 0;
spin_lock_irqsave(&b->bm_lock, flags);
if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
@ -1423,13 +1471,21 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
{
int i;
int bits;
int changed = 0;
unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
for (i = first_word; i < last_word; i++) {
bits = hweight_long(paddr[i]);
paddr[i] = ~0UL;
b->bm_set += BITS_PER_LONG - bits;
changed += BITS_PER_LONG - bits;
}
kunmap_atomic(paddr);
if (changed) {
/* We only need lazy writeout, the information is still in the
* remote bitmap as well, and is reconstructed during the next
* bitmap exchange, if lost locally due to a crash. */
bm_set_page_lazy_writeout(b->bm_pages[page_nr]);
b->bm_set += changed;
}
}
/* Same thing as drbd_bm_set_bits,
@ -1524,8 +1580,10 @@ int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
unsigned long *p_addr;
int i;
ERR_IF(!b) return 0;
ERR_IF(!b->bm_pages) return 0;
if (!expect(b))
return 0;
if (!expect(b->bm_pages))
return 0;
spin_lock_irqsave(&b->bm_lock, flags);
if (BM_DONT_TEST & b->bm_flags)
@ -1559,8 +1617,10 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
* robust in case we screwed up elsewhere, in that case pretend there
* was one dirty bit in the requested area, so we won't try to do a
* local read there (no bitmap probably implies no disk) */
ERR_IF(!b) return 1;
ERR_IF(!b->bm_pages) return 1;
if (!expect(b))
return 1;
if (!expect(b->bm_pages))
return 1;
spin_lock_irqsave(&b->bm_lock, flags);
if (BM_DONT_TEST & b->bm_flags)
@ -1573,11 +1633,10 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
bm_unmap(p_addr);
p_addr = bm_map_pidx(b, idx);
}
ERR_IF (bitnr >= b->bm_bits) {
dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
} else {
if (expect(bitnr < b->bm_bits))
c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
}
else
dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
}
if (p_addr)
bm_unmap(p_addr);
@ -1607,8 +1666,10 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
unsigned long flags;
unsigned long *p_addr, *bm;
ERR_IF(!b) return 0;
ERR_IF(!b->bm_pages) return 0;
if (!expect(b))
return 0;
if (!expect(b->bm_pages))
return 0;
spin_lock_irqsave(&b->bm_lock, flags);
if (BM_DONT_TEST & b->bm_flags)
@ -1630,47 +1691,3 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
spin_unlock_irqrestore(&b->bm_lock, flags);
return count;
}
/* Set all bits covered by the AL-extent al_enr.
* Returns number of bits changed. */
unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
{
struct drbd_bitmap *b = mdev->bitmap;
unsigned long *p_addr, *bm;
unsigned long weight;
unsigned long s, e;
int count, i, do_now;
ERR_IF(!b) return 0;
ERR_IF(!b->bm_pages) return 0;
spin_lock_irq(&b->bm_lock);
if (BM_DONT_SET & b->bm_flags)
bm_print_lock_info(mdev);
weight = b->bm_set;
s = al_enr * BM_WORDS_PER_AL_EXT;
e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
/* assert that s and e are on the same page */
D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
== s >> (PAGE_SHIFT - LN2_BPL + 3));
count = 0;
if (s < b->bm_words) {
i = do_now = e-s;
p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
bm = p_addr + MLPP(s);
while (i--) {
count += hweight_long(*bm);
*bm = -1UL;
bm++;
}
bm_unmap(p_addr);
b->bm_set += do_now*BITS_PER_LONG - count;
if (e == b->bm_words)
b->bm_set -= bm_clear_surplus(b);
} else {
dev_err(DEV, "start offset (%lu) too large in drbd_bm_ALe_set_all\n", s);
}
weight = b->bm_set - weight;
spin_unlock_irq(&b->bm_lock);
return weight;
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,207 @@
#include <asm/bug.h>
#include <linux/rbtree_augmented.h>
#include "drbd_interval.h"
/**
 * interval_end - cached subtree end of the interval embedding @node
 * @node: rb-tree node embedded in a struct drbd_interval
 *
 * Returns the ->end member of the containing struct drbd_interval.
 */
static inline
sector_t interval_end(struct rb_node *node)
{
	return rb_entry(node, struct drbd_interval, rb)->end;
}
/**
 * compute_subtree_last - recompute the subtree end for @node
 * @node: interval whose own end and direct children are examined
 *
 * The result is the largest of: @node's own end (start sector plus size in
 * 512-byte sectors) and the cached ->end of its left and right child, where
 * present.  Called for @node and its parents whenever the end may have
 * changed.
 */
static inline sector_t
compute_subtree_last(struct drbd_interval *node)
{
	struct rb_node *children[2] = { node->rb.rb_left, node->rb.rb_right };
	sector_t last = node->sector + (node->size >> 9);
	int k;

	for (k = 0; k < 2; k++) {
		sector_t child_end;

		if (!children[k])
			continue;
		child_end = interval_end(children[k]);
		if (child_end > last)
			last = child_end;
	}
	return last;
}
/*
 * Walk from @rb towards the root, refreshing each interval's cached ->end.
 * The walk ends at @stop, or early at the first node whose cached value is
 * already up to date.
 */
static void augment_propagate(struct rb_node *rb, struct rb_node *stop)
{
	struct drbd_interval *cur;
	sector_t last;

	for (; rb != stop; rb = rb_parent(rb)) {
		cur = rb_entry(rb, struct drbd_interval, rb);
		last = compute_subtree_last(cur);
		if (cur->end == last)
			return;
		cur->end = last;
	}
}
/* Copy the cached subtree end from the interval at @rb_old to the one at @rb_new. */
static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new)
{
	rb_entry(rb_new, struct drbd_interval, rb)->end =
		rb_entry(rb_old, struct drbd_interval, rb)->end;
}
/*
 * Rotation callback: the interval at @rb_new inherits the cached subtree end
 * of the interval at @rb_old, after which @rb_old's cached end is recomputed
 * from its own end and its current children.
 */
static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new)
{
	struct drbd_interval *prev_top = rb_entry(rb_old, struct drbd_interval, rb);
	struct drbd_interval *next_top = rb_entry(rb_new, struct drbd_interval, rb);

	next_top->end = prev_top->end;
	prev_top->end = compute_subtree_last(prev_top);
}
/*
 * Callback table passed to rb_insert_augmented() and rb_erase_augmented() in
 * this file.  The initializers are positional, so their order has to match
 * the member order of struct rb_augment_callbacks (declared in
 * <linux/rbtree_augmented.h>, not visible here) -- presumably propagate,
 * copy, rotate; confirm against that header before reordering.
 */
static const struct rb_augment_callbacks augment_callbacks = {
augment_propagate,
augment_copy,
augment_rotate,
};
/**
 * drbd_insert_interval - insert a new interval into a tree
 * @root: root of the interval tree
 * @this: interval to insert; its size must be a multiple of 512 bytes
 *
 * Intervals are ordered by start sector; intervals with equal start sector
 * are ordered by their address.
 *
 * Returns %true after linking @this into the tree, %false if @this itself is
 * already found in the tree.
 */
bool
drbd_insert_interval(struct rb_root *root, struct drbd_interval *this)
{
	struct rb_node **link = &root->rb_node;
	struct rb_node *parent = NULL;

	BUG_ON(!IS_ALIGNED(this->size, 512));

	while (*link) {
		struct drbd_interval *here =
			rb_entry(*link, struct drbd_interval, rb);

		if (this == here)
			return false;

		parent = *link;
		if (this->sector < here->sector ||
		    (this->sector == here->sector && this < here))
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}

	rb_link_node(&this->rb, parent, link);
	rb_insert_augmented(&this->rb, root, &augment_callbacks);
	return true;
}
/**
 * drbd_contains_interval - check if a tree contains a given interval
 * @root: root of the interval tree
 * @sector: start sector of @interval
 * @interval: may not be a valid pointer
 *
 * Returns %true if the tree contains the node @interval with start sector
 * @sector.  @interval is only ever compared by address and never
 * dereferenced.  Returns %false if @interval is in the tree but with a
 * different sector number.
 */
bool
drbd_contains_interval(struct rb_root *root, sector_t sector,
		       struct drbd_interval *interval)
{
	struct rb_node *node;

	for (node = root->rb_node; node; ) {
		struct drbd_interval *here =
			rb_entry(node, struct drbd_interval, rb);

		if (sector == here->sector && interval == here)
			return true;

		if (sector < here->sector ||
		    (sector == here->sector && interval < here))
			node = node->rb_left;
		else
			node = node->rb_right;
	}
	return false;
}
/**
 * drbd_remove_interval - remove an interval from a tree
 * @root: root of the interval tree @this is linked into
 * @this: interval to unlink
 *
 * Erases @this via rb_erase_augmented(), passing the same augment_callbacks
 * table that drbd_insert_interval() uses.
 */
void
drbd_remove_interval(struct rb_root *root, struct drbd_interval *this)
{
rb_erase_augmented(&this->rb, root, &augment_callbacks);
}
/**
 * drbd_find_overlap - search for an interval overlapping with [sector, sector + size)
 * @root: root of the interval tree
 * @sector: start sector
 * @size: size, aligned to 512 bytes
 *
 * Returns an interval overlapping with [sector, sector + size), or NULL if
 * there is none.  When there is more than one overlapping interval in the
 * tree, the interval with the lowest start sector is returned, and all other
 * overlapping intervals will be on the right side of the tree, reachable with
 * rb_next().
 */
struct drbd_interval *
drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size)
{
	const sector_t end = sector + (size >> 9);
	struct rb_node *node = root->rb_node;

	BUG_ON(!IS_ALIGNED(size, 512));

	while (node) {
		struct drbd_interval *here =
			rb_entry(node, struct drbd_interval, rb);
		struct rb_node *left = node->rb_left;

		/* Overlap if any must be on left side */
		if (left && sector < interval_end(left)) {
			node = left;
			continue;
		}

		if (here->sector < end &&
		    sector < here->sector + (here->size >> 9))
			return here;

		/* Nothing further right can overlap either */
		if (sector < here->sector)
			return NULL;

		/* Overlap if any must be on right side */
		node = node->rb_right;
	}
	return NULL;
}
/**
 * drbd_next_overlap - continue an overlap search after @i
 * @i: interval to continue from (itself not reported again)
 * @sector: start sector
 * @size: size in bytes
 *
 * Walks the successors of @i in rb_next() order and returns the first one
 * overlapping with [sector, sector + size).  Returns NULL once the tree is
 * exhausted or a successor starts at or beyond sector + (size >> 9).
 */
struct drbd_interval *
drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size)
{
	const sector_t end = sector + (size >> 9);
	struct rb_node *next;

	for (next = rb_next(&i->rb); next; next = rb_next(next)) {
		struct drbd_interval *cand =
			rb_entry(next, struct drbd_interval, rb);

		if (cand->sector >= end)
			break;
		if (sector < cand->sector + (cand->size >> 9))
			return cand;
	}
	return NULL;
}

View File

@ -0,0 +1,40 @@
#ifndef __DRBD_INTERVAL_H
#define __DRBD_INTERVAL_H
#include <linux/types.h>
#include <linux/rbtree.h>
/*
 * struct drbd_interval - a sector range that can be linked into an rb-tree
 * augmented with the highest interval end per subtree (see drbd_interval.c).
 *
 * The flag bit-fields are unsigned on purpose: a signed one-bit bit-field can
 * only represent 0 and -1, so storing 1/true and later comparing against 1
 * would silently fail.
 */
struct drbd_interval {
	struct rb_node rb;
	sector_t sector;		/* start sector of the interval */
	unsigned int size;		/* size in bytes */
	sector_t end;			/* highest interval end in subtree */
	unsigned int local:1;		/* local or remote request? */
	unsigned int waiting:1;		/* NOTE(review): set/cleared by users outside this header -- confirm meaning there */
};
/*
 * Reset the rb-tree linkage of @i via RB_CLEAR_NODE().
 * Presumably this is what makes drbd_interval_empty() report true for an
 * interval that is not in any tree -- confirm against <linux/rbtree.h>.
 */
static inline void drbd_clear_interval(struct drbd_interval *i)
{
RB_CLEAR_NODE(&i->rb);
}
/*
 * Returns the result of RB_EMPTY_NODE() on the rb node embedded in @i.
 * Presumably true while @i is not linked into a tree (e.g. right after
 * drbd_clear_interval()) -- confirm against <linux/rbtree.h>.
 */
static inline bool drbd_interval_empty(struct drbd_interval *i)
{
return RB_EMPTY_NODE(&i->rb);
}
/* Interval tree operations, implemented in drbd_interval.c. */

/* Link an interval into the tree; false if that very interval is already in it. */
extern bool drbd_insert_interval(struct rb_root *, struct drbd_interval *);
/* Check by address whether the given interval is in the tree at the given start sector. */
extern bool drbd_contains_interval(struct rb_root *, sector_t,
struct drbd_interval *);
/* Unlink an interval from the tree. */
extern void drbd_remove_interval(struct rb_root *, struct drbd_interval *);
/* First interval overlapping [sector, sector + size), or NULL; size is in bytes. */
extern struct drbd_interval *drbd_find_overlap(struct rb_root *, sector_t,
unsigned int);
/* Next overlapping interval after the given one, or NULL. */
extern struct drbd_interval *drbd_next_overlap(struct drbd_interval *, sector_t,
unsigned int);
/*
 * Iterate @i over every interval in @root overlapping
 * [sector, sector + size): starts with drbd_find_overlap() and advances with
 * drbd_next_overlap() until that returns NULL.
 * Note that @sector and @size are expanded in both the init and the step
 * expression, so they are evaluated more than once; pass side-effect free
 * expressions.
 */
#define drbd_for_each_overlap(i, root, sector, size) \
for (i = drbd_find_overlap(root, sector, size); \
i; \
i = drbd_next_overlap(i, sector, size))
#endif /* __DRBD_INTERVAL_H */

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,55 @@
#include "drbd_wrappers.h"
#include <linux/kernel.h>
#include <net/netlink.h>
#include <linux/drbd_genl_api.h>
#include "drbd_nla.h"
/*
 * Walk the attributes nested inside @nla.  For every attribute carrying the
 * DRBD_GENLA_F_MANDATORY flag, strip the flag in place and fail with
 * -EOPNOTSUPP if the remaining attribute type exceeds @maxtype.
 *
 * Returns 0 if no flagged attribute is beyond @maxtype.
 */
static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla)
{
	struct nlattr *attr;
	int rem;

	/*
	 * validate_nla (called from nla_parse_nested) ignores attributes
	 * beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag.
	 * In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY
	 * flag set also, check and remove that flag before calling
	 * nla_parse_nested.
	 */
	nla_for_each_attr(attr, nla_data(nla), nla_len(nla), rem) {
		if (!(attr->nla_type & DRBD_GENLA_F_MANDATORY))
			continue;

		attr->nla_type &= ~DRBD_GENLA_F_MANDATORY;
		if (nla_type(attr) > maxtype)
			return -EOPNOTSUPP;
	}
	return 0;
}
/*
 * nla_parse_nested() preceded by drbd_nla_check_mandatory().
 *
 * Returns the error from the mandatory-flag check if that fails, otherwise
 * the result of nla_parse_nested().
 */
int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
			  const struct nla_policy *policy)
{
	int err = drbd_nla_check_mandatory(maxtype, nla);

	if (err)
		return err;
	return nla_parse_nested(tb, maxtype, nla, policy);
}
/*
 * Like nla_find_nested(), but first runs drbd_nla_check_mandatory() on @nla.
 *
 * If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and is
 * not known to us (type above @maxtype), all the nested attributes are
 * rejected: the error is returned encoded with ERR_PTR().  Otherwise the
 * result of nla_find_nested() for @attrtype is returned.
 */
struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype)
{
	const int rc = drbd_nla_check_mandatory(maxtype, nla);

	if (rc != 0)
		return ERR_PTR(rc);

	return nla_find_nested(nla, attrtype);
}

View File

@ -0,0 +1,8 @@
#ifndef __DRBD_NLA_H
#define __DRBD_NLA_H
extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
const struct nla_policy *policy);
extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype);
#endif /* __DRBD_NLA_H */

View File

@ -167,18 +167,24 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
* we convert to sectors in the display below. */
unsigned long bm_bits = drbd_bm_bits(mdev);
unsigned long bit_pos;
unsigned long long stop_sector = 0;
if (mdev->state.conn == C_VERIFY_S ||
mdev->state.conn == C_VERIFY_T)
mdev->state.conn == C_VERIFY_T) {
bit_pos = bm_bits - mdev->ov_left;
else
if (verify_can_do_stop_sector(mdev))
stop_sector = mdev->ov_stop_sector;
} else
bit_pos = mdev->bm_resync_fo;
/* Total sectors may be slightly off for oddly
* sized devices. So what. */
seq_printf(seq,
"\t%3d%% sector pos: %llu/%llu\n",
"\t%3d%% sector pos: %llu/%llu",
(int)(bit_pos / (bm_bits/100+1)),
(unsigned long long)bit_pos * BM_SECT_PER_BIT,
(unsigned long long)bm_bits * BM_SECT_PER_BIT);
if (stop_sector != 0 && stop_sector != ULLONG_MAX)
seq_printf(seq, " stop sector: %llu", stop_sector);
seq_printf(seq, "\n");
}
}
@ -194,9 +200,11 @@ static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
static int drbd_seq_show(struct seq_file *seq, void *v)
{
int i, hole = 0;
int i, prev_i = -1;
const char *sn;
struct drbd_conf *mdev;
struct net_conf *nc;
char wp;
static char write_ordering_chars[] = {
[WO_none] = 'n',
@ -227,16 +235,11 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
oos .. known out-of-sync kB
*/
for (i = 0; i < minor_count; i++) {
mdev = minor_to_mdev(i);
if (!mdev) {
hole = 1;
continue;
}
if (hole) {
hole = 0;
rcu_read_lock();
idr_for_each_entry(&minors, mdev, i) {
if (prev_i != i - 1)
seq_printf(seq, "\n");
}
prev_i = i;
sn = drbd_conn_str(mdev->state.conn);
@ -248,6 +251,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
/* reset mdev->congestion_reason */
bdi_rw_congested(&mdev->rq_queue->backing_dev_info);
nc = rcu_dereference(mdev->tconn->net_conf);
wp = nc ? nc->wire_protocol - DRBD_PROT_A + 'A' : ' ';
seq_printf(seq,
"%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n"
" ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
@ -257,9 +262,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
drbd_role_str(mdev->state.peer),
drbd_disk_str(mdev->state.disk),
drbd_disk_str(mdev->state.pdsk),
(mdev->net_conf == NULL ? ' ' :
(mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
is_susp(mdev->state) ? 's' : 'r',
wp,
drbd_suspended(mdev) ? 's' : 'r',
mdev->state.aftr_isp ? 'a' : '-',
mdev->state.peer_isp ? 'p' : '-',
mdev->state.user_isp ? 'u' : '-',
@ -276,8 +280,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
atomic_read(&mdev->rs_pending_cnt),
atomic_read(&mdev->unacked_cnt),
atomic_read(&mdev->ap_bio_cnt),
mdev->epochs,
write_ordering_chars[mdev->write_ordering]
mdev->tconn->epochs,
write_ordering_chars[mdev->tconn->write_ordering]
);
seq_printf(seq, " oos:%llu\n",
Bit2KB((unsigned long long)
@ -302,6 +306,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
}
}
}
rcu_read_unlock();
return 0;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -77,40 +77,41 @@
*/
enum drbd_req_event {
created,
to_be_send,
to_be_submitted,
CREATED,
TO_BE_SENT,
TO_BE_SUBMITTED,
/* XXX yes, now I am inconsistent...
* these are not "events" but "actions"
* oh, well... */
queue_for_net_write,
queue_for_net_read,
queue_for_send_oos,
QUEUE_FOR_NET_WRITE,
QUEUE_FOR_NET_READ,
QUEUE_FOR_SEND_OOS,
send_canceled,
send_failed,
handed_over_to_network,
oos_handed_to_network,
connection_lost_while_pending,
read_retry_remote_canceled,
recv_acked_by_peer,
write_acked_by_peer,
write_acked_by_peer_and_sis, /* and set_in_sync */
conflict_discarded_by_peer,
neg_acked,
barrier_acked, /* in protocol A and B */
data_received, /* (remote read) */
SEND_CANCELED,
SEND_FAILED,
HANDED_OVER_TO_NETWORK,
OOS_HANDED_TO_NETWORK,
CONNECTION_LOST_WHILE_PENDING,
READ_RETRY_REMOTE_CANCELED,
RECV_ACKED_BY_PEER,
WRITE_ACKED_BY_PEER,
WRITE_ACKED_BY_PEER_AND_SIS, /* and set_in_sync */
CONFLICT_RESOLVED,
POSTPONE_WRITE,
NEG_ACKED,
BARRIER_ACKED, /* in protocol A and B */
DATA_RECEIVED, /* (remote read) */
read_completed_with_error,
read_ahead_completed_with_error,
write_completed_with_error,
abort_disk_io,
completed_ok,
resend,
fail_frozen_disk_io,
restart_frozen_disk_io,
nothing, /* for tracing only */
READ_COMPLETED_WITH_ERROR,
READ_AHEAD_COMPLETED_WITH_ERROR,
WRITE_COMPLETED_WITH_ERROR,
ABORT_DISK_IO,
COMPLETED_OK,
RESEND,
FAIL_FROZEN_DISK_IO,
RESTART_FROZEN_DISK_IO,
NOTHING,
};
/* encoding of request states for now. we don't actually need that many bits.
@ -142,8 +143,8 @@ enum drbd_req_state_bits {
* recv_ack (B) or implicit "ack" (A),
* still waiting for the barrier ack.
* master_bio may already be completed and invalidated.
* 11100: write_acked (C),
* data_received (for remote read, any protocol)
* 11100: write acked (C),
* data received (for remote read, any protocol)
* or finally the barrier ack has arrived (B,A)...
* request can be freed
* 01100: neg-acked (write, protocol C)
@ -198,6 +199,22 @@ enum drbd_req_state_bits {
/* Should call drbd_al_complete_io() for this request... */
__RQ_IN_ACT_LOG,
/* The peer has sent a retry ACK */
__RQ_POSTPONED,
/* would have been completed,
* but was not, because of drbd_suspended() */
__RQ_COMPLETION_SUSP,
/* We expect a receive ACK (wire proto B) */
__RQ_EXP_RECEIVE_ACK,
/* We expect a write ACK (wire proto C) */
__RQ_EXP_WRITE_ACK,
/* waiting for a barrier ack, did an extra kref_get */
__RQ_EXP_BARR_ACK,
};
#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
@ -219,56 +236,16 @@ enum drbd_req_state_bits {
#define RQ_WRITE (1UL << __RQ_WRITE)
#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
#define RQ_POSTPONED (1UL << __RQ_POSTPONED)
#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP)
#define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK)
#define RQ_EXP_WRITE_ACK (1UL << __RQ_EXP_WRITE_ACK)
#define RQ_EXP_BARR_ACK (1UL << __RQ_EXP_BARR_ACK)
/* For waking up the frozen transfer log mod_req() has to return if the request
should be counted in the epoch object*/
#define MR_WRITE_SHIFT 0
#define MR_WRITE (1 << MR_WRITE_SHIFT)
#define MR_READ_SHIFT 1
#define MR_READ (1 << MR_READ_SHIFT)
/* epoch entries */
static inline
struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
{
BUG_ON(mdev->ee_hash_s == 0);
return mdev->ee_hash +
((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s);
}
/* transfer log (drbd_request objects) */
static inline
struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector)
{
BUG_ON(mdev->tl_hash_s == 0);
return mdev->tl_hash +
((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s);
}
/* application reads (drbd_request objects) */
static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector)
{
return mdev->app_reads_hash
+ ((unsigned int)(sector) % APP_R_HSIZE);
}
/* when we receive the answer for a read request,
* verify that we actually know about it */
static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
u64 id, sector_t sector)
{
struct hlist_head *slot = ar_hash_slot(mdev, sector);
struct hlist_node *n;
struct drbd_request *req;
hlist_for_each_entry(req, n, slot, collision) {
if ((unsigned long)req == (unsigned long)id) {
D_ASSERT(req->sector == sector);
return req;
}
}
return NULL;
}
#define MR_WRITE 1
#define MR_READ 2
static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
{
@ -278,41 +255,10 @@ static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bi
req->private_bio = bio;
bio->bi_private = req;
bio->bi_end_io = drbd_endio_pri;
bio->bi_end_io = drbd_request_endio;
bio->bi_next = NULL;
}
static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
struct bio *bio_src)
{
struct drbd_request *req =
mempool_alloc(drbd_request_mempool, GFP_NOIO);
if (likely(req)) {
drbd_req_make_private_bio(req, bio_src);
req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
req->mdev = mdev;
req->master_bio = bio_src;
req->epoch = 0;
req->sector = bio_src->bi_sector;
req->size = bio_src->bi_size;
INIT_HLIST_NODE(&req->collision);
INIT_LIST_HEAD(&req->tl_requests);
INIT_LIST_HEAD(&req->w.list);
}
return req;
}
static inline void drbd_req_free(struct drbd_request *req)
{
mempool_free(req, drbd_request_mempool);
}
static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
{
return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
}
/* Short lived temporary struct on the stack.
* We could squirrel the error to be returned into
* bio->bi_size, or similar. But that would be too ugly. */
@ -321,6 +267,7 @@ struct bio_and_error {
int error;
};
extern void drbd_req_destroy(struct kref *kref);
extern void _req_may_be_done(struct drbd_request *req,
struct bio_and_error *m);
extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
@ -328,13 +275,17 @@ extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
extern void complete_master_bio(struct drbd_conf *mdev,
struct bio_and_error *m);
extern void request_timer_fn(unsigned long data);
extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what);
extern void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what);
extern void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what);
/* this is in drbd_main.c */
extern void drbd_restart_request(struct drbd_request *req);
/* use this if you don't want to deal with calling complete_master_bio()
* outside the spinlock, e.g. when walking some list on cleanup. */
static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
{
struct drbd_conf *mdev = req->mdev;
struct drbd_conf *mdev = req->w.mdev;
struct bio_and_error m;
int rv;
@ -354,13 +305,13 @@ static inline int req_mod(struct drbd_request *req,
enum drbd_req_event what)
{
unsigned long flags;
struct drbd_conf *mdev = req->mdev;
struct drbd_conf *mdev = req->w.mdev;
struct bio_and_error m;
int rv;
spin_lock_irqsave(&mdev->req_lock, flags);
spin_lock_irqsave(&mdev->tconn->req_lock, flags);
rv = __req_mod(req, what, &m);
spin_unlock_irqrestore(&mdev->req_lock, flags);
spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
if (m.bio)
complete_master_bio(mdev, &m);
@ -368,7 +319,7 @@ static inline int req_mod(struct drbd_request *req,
return rv;
}
static inline bool drbd_should_do_remote(union drbd_state s)
static inline bool drbd_should_do_remote(union drbd_dev_state s)
{
return s.pdsk == D_UP_TO_DATE ||
(s.pdsk >= D_INCONSISTENT &&
@ -378,7 +329,7 @@ static inline bool drbd_should_do_remote(union drbd_state s)
That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
states. */
}
static inline bool drbd_should_send_oos(union drbd_state s)
static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s)
{
return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
/* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,161 @@
#ifndef DRBD_STATE_H
#define DRBD_STATE_H
struct drbd_conf;
struct drbd_tconn;
/**
* DOC: DRBD State macros
*
* These macros are used to express state changes in easily readable form.
*
* The NS macros expand to a mask and a value, that can be bit ored onto the
* current state as soon as the spinlock (req_lock) was taken.
*
* The _NS macros are used for state functions that get called with the
* spinlock. These macros expand directly to the new state value.
*
* Besides the basic forms NS() and _NS() additional _?NS[23] are defined
* to express state changes that affect more than one aspect of the state.
*
* E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
* Means that the network connection was established and that the peer
* is in secondary role.
*/
#define role_MASK R_MASK
#define peer_MASK R_MASK
#define disk_MASK D_MASK
#define pdsk_MASK D_MASK
#define conn_MASK C_MASK
#define susp_MASK 1
#define user_isp_MASK 1
#define aftr_isp_MASK 1
#define susp_nod_MASK 1
#define susp_fen_MASK 1
/*
 * NS(), NS2(), NS3(): each expands to TWO comma-separated expressions, a
 * mask and a value.  Both are a union drbd_state that is zeroed via .i and
 * then has only the named field(s) set: the mask gets <field>_MASK, the
 * value gets the given state.  The number suffix is the number of
 * field/state pairs.
 */
#define NS(T, S) \
({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
({ union drbd_state val; val.i = 0; val.T = (S); val; })
#define NS2(T1, S1, T2, S2) \
({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
mask.T2 = T2##_MASK; mask; }), \
({ union drbd_state val; val.i = 0; val.T1 = (S1); \
val.T2 = (S2); val; })
#define NS3(T1, S1, T2, S2, T3, S3) \
({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
({ union drbd_state val; val.i = 0; val.T1 = (S1); \
val.T2 = (S2); val.T3 = (S3); val; })
/*
 * _NS(), _NS2(), _NS3(): each expands to "D, <state>", where <state> is the
 * result of drbd_read_state(D) with the named field(s) overwritten.
 * Note that D appears twice in the expansion.
 */
#define _NS(D, T, S) \
D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T = (S); __ns; })
#define _NS2(D, T1, S1, T2, S2) \
D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
__ns.T2 = (S2); __ns; })
#define _NS3(D, T1, S1, T2, S2, T3, S3) \
D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
__ns.T2 = (S2); __ns.T3 = (S3); __ns; })
/*
 * Flag bits passed to the state change functions.  Every plain flag is a
 * distinct single bit; CS_ORDERED and CS_DC_MASK are combinations.
 */
enum chg_state_flags {
	CS_HARD			= 0x001,
	CS_VERBOSE		= 0x002,
	CS_WAIT_COMPLETE	= 0x004,
	CS_SERIALIZE		= 0x008,
	CS_ORDERED		= CS_WAIT_COMPLETE | CS_SERIALIZE,
	/* Do not consider a device pair wide state change */
	CS_LOCAL_ONLY		= 0x010,

	/* DC = display as connection state change */
	CS_DC_ROLE		= 0x020,
	CS_DC_PEER		= 0x040,
	CS_DC_CONN		= 0x080,
	CS_DC_DISK		= 0x100,
	CS_DC_PDSK		= 0x200,
	CS_DC_SUSP		= 0x400,
	/* Note: CS_DC_SUSP is not part of this mask */
	CS_DC_MASK		= CS_DC_ROLE | CS_DC_PEER | CS_DC_CONN |
				  CS_DC_DISK | CS_DC_PDSK,

	CS_IGN_OUTD_FAIL	= 0x800,
};
/* drbd_dev_state and drbd_state are different types. This is to stress the
small difference. There is no suspended flag (.susp), and no suspended
while fence handler runs flag (susp_fen). */
/*
 * Device state as a set of bitfields overlaid with the raw integer @i.
 * The fields add up to 32 bits (2+2+5+4+4+1+1+1+1+11).  The two branches
 * list the same fields in exactly reversed order, selected by
 * __LITTLE_ENDIAN_BITFIELD / __BIG_ENDIAN_BITFIELD; any other configuration
 * is a compile error.
 */
union drbd_dev_state {
struct {
#if defined(__LITTLE_ENDIAN_BITFIELD)
unsigned role:2 ; /* 3/4 primary/secondary/unknown */
unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
unsigned conn:5 ; /* 17/32 cstates */
unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned _unused:1 ;
unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
unsigned peer_isp:1 ;
unsigned user_isp:1 ;
unsigned _pad:11; /* 0 unused */
#elif defined(__BIG_ENDIAN_BITFIELD)
unsigned _pad:11;
unsigned user_isp:1 ;
unsigned peer_isp:1 ;
unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
unsigned _unused:1 ;
unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned conn:5 ; /* 17/32 cstates */
unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
unsigned role:2 ; /* 3/4 primary/secondary/unknown */
#else
# error "this endianess is not supported"
#endif
};
/* all of the above viewed as one integer */
unsigned int i;
};
extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev,
enum chg_state_flags f,
union drbd_state mask,
union drbd_state val);
extern void drbd_force_state(struct drbd_conf *, union drbd_state,
union drbd_state);
extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *,
union drbd_state,
union drbd_state,
enum chg_state_flags);
extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state,
enum chg_state_flags,
struct completion *done);
extern void print_st_err(struct drbd_conf *, union drbd_state,
union drbd_state, int);
enum drbd_state_rv
_conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val,
enum chg_state_flags flags);
enum drbd_state_rv
conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val,
enum chg_state_flags flags);
extern void drbd_resume_al(struct drbd_conf *mdev);
extern bool conn_all_vols_unconf(struct drbd_tconn *tconn);
/**
* drbd_request_state() - Reqest a state change
* @mdev: DRBD device.
* @mask: mask of state bits to change.
* @val: value of new state bits.
*
* This is the most graceful way of requesting a state change. It is verbose
* quite verbose in case the state change is not possible, and all those
* state changes are globally serialized.
*/
static inline int drbd_request_state(struct drbd_conf *mdev,
union drbd_state mask,
union drbd_state val)
{
return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
}
enum drbd_role conn_highest_role(struct drbd_tconn *tconn);
enum drbd_role conn_highest_peer(struct drbd_tconn *tconn);
enum drbd_disk_state conn_highest_disk(struct drbd_tconn *tconn);
enum drbd_disk_state conn_lowest_disk(struct drbd_tconn *tconn);
enum drbd_disk_state conn_highest_pdsk(struct drbd_tconn *tconn);
enum drbd_conns conn_lowest_conn(struct drbd_tconn *tconn);
#endif

View File

@ -89,6 +89,7 @@ static const char *drbd_state_sw_errors[] = {
[-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
[-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
[-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
[-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config",
};
const char *drbd_conn_str(enum drbd_conns s)

File diff suppressed because it is too large Load Diff

View File

@ -3,6 +3,7 @@
#include <linux/ctype.h>
#include <linux/mm.h>
#include "drbd_int.h"
/* see get_sb_bdev and bd_claim */
extern char *drbd_sec_holder;
@ -20,8 +21,8 @@ static inline void drbd_set_my_capacity(struct drbd_conf *mdev,
/* bi_end_io handlers */
extern void drbd_md_io_complete(struct bio *bio, int error);
extern void drbd_endio_sec(struct bio *bio, int error);
extern void drbd_endio_pri(struct bio *bio, int error);
extern void drbd_peer_request_endio(struct bio *bio, int error);
extern void drbd_request_endio(struct bio *bio, int error);
/*
* used to submit our private bio
@ -45,12 +46,6 @@ static inline void drbd_generic_make_request(struct drbd_conf *mdev,
generic_make_request(bio);
}
static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm)
{
return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK)
== CRYPTO_ALG_TYPE_HASH;
}
#ifndef __CHECKER__
# undef __cond_lock
# define __cond_lock(x,c) (c)

View File

@ -463,6 +463,7 @@ out:
*/
static void loop_add_bio(struct loop_device *lo, struct bio *bio)
{
/*
 * Keep lo_bio_count in step with lo_bio_list; loop_make_request()
 * compares the counter against the queue's nr_congestion_on/off
 * thresholds to throttle submitters.
 */
lo->lo_bio_count++;
bio_list_add(&lo->lo_bio_list, bio);
}
@ -471,6 +472,7 @@ static void loop_add_bio(struct loop_device *lo, struct bio *bio)
*/
static struct bio *loop_get_bio(struct loop_device *lo)
{
/*
 * The counter is decremented unconditionally, i.e. without checking
 * whether bio_list_pop() actually returns a bio; loop_thread() treats
 * a NULL result as a bug (BUG_ON(!bio)).
 */
lo->lo_bio_count--;
return bio_list_pop(&lo->lo_bio_list);
}
@ -489,6 +491,10 @@ static void loop_make_request(struct request_queue *q, struct bio *old_bio)
goto out;
if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY)))
goto out;
if (lo->lo_bio_count >= q->nr_congestion_on)
wait_event_lock_irq(lo->lo_req_wait,
lo->lo_bio_count < q->nr_congestion_off,
lo->lo_lock);
loop_add_bio(lo, old_bio);
wake_up(&lo->lo_event);
spin_unlock_irq(&lo->lo_lock);
@ -546,6 +552,8 @@ static int loop_thread(void *data)
continue;
spin_lock_irq(&lo->lo_lock);
bio = loop_get_bio(lo);
if (lo->lo_bio_count < lo->lo_queue->nr_congestion_off)
wake_up(&lo->lo_req_wait);
spin_unlock_irq(&lo->lo_lock);
BUG_ON(!bio);
@ -873,6 +881,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
lo->transfer = transfer_none;
lo->ioctl = NULL;
lo->lo_sizelimit = 0;
lo->lo_bio_count = 0;
lo->old_gfp_mask = mapping_gfp_mask(mapping);
mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
@ -1673,6 +1682,7 @@ static int loop_add(struct loop_device **l, int i)
lo->lo_number = i;
lo->lo_thread = NULL;
init_waitqueue_head(&lo->lo_event);
init_waitqueue_head(&lo->lo_req_wait);
spin_lock_init(&lo->lo_lock);
disk->major = LOOP_MAJOR;
disk->first_minor = i << part_shift;

View File

@ -39,6 +39,7 @@
#include <linux/list.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/bitmap.h>
#include <xen/events.h>
#include <xen/page.h>
@ -79,6 +80,7 @@ struct pending_req {
unsigned short operation;
int status;
struct list_head free_list;
DECLARE_BITMAP(unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
};
#define BLKBACK_INVALID_HANDLE (~0)
@ -98,6 +100,36 @@ struct xen_blkbk {
static struct xen_blkbk *blkbk;
/*
* Maximum number of grant pages that can be mapped in blkback.
* BLKIF_MAX_SEGMENTS_PER_REQUEST * RING_SIZE is the maximum number of
* pages that blkback will persistently map.
* Currently, this is:
* RING_SIZE = 32 (for all known ring types)
* BLKIF_MAX_SEGMENTS_PER_REQUEST = 11
* sizeof(struct persistent_gnt) = 48
* So the maximum memory used to store the grants is:
* 32 * 11 * 48 = 16896 bytes
*/
/*
 * Upper bound on the number of grant pages blkback keeps mapped for an
 * interface speaking @protocol: the ring size of that protocol multiplied by
 * BLKIF_MAX_SEGMENTS_PER_REQUEST.  An unknown protocol triggers BUG(); should
 * BUG() return, the result is 0.
 */
static inline unsigned int max_mapped_grant_pages(enum blkif_protocol protocol)
{
	unsigned int ring_entries = 0;

	switch (protocol) {
	case BLKIF_PROTOCOL_NATIVE:
		ring_entries = __CONST_RING_SIZE(blkif, PAGE_SIZE);
		break;
	case BLKIF_PROTOCOL_X86_32:
		ring_entries = __CONST_RING_SIZE(blkif_x86_32, PAGE_SIZE);
		break;
	case BLKIF_PROTOCOL_X86_64:
		ring_entries = __CONST_RING_SIZE(blkif_x86_64, PAGE_SIZE);
		break;
	default:
		BUG();
	}

	return ring_entries * BLKIF_MAX_SEGMENTS_PER_REQUEST;
}
/*
* Little helpful macro to figure out the index and virtual address of the
* pending_pages[..]. For each 'pending_req' we have up to
@ -129,6 +161,90 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
static void make_response(struct xen_blkif *blkif, u64 id,
unsigned short op, int st);
/*
 * Visit every entry of an rb-tree in rb_first()/rb_next() order.
 * @pos:    loop cursor, a pointer to the structure embedding the rb_node
 * @rbtree: struct rb_root * to walk
 * @node:   name of the struct rb_node member inside *@pos
 *
 * The loop terminates when &(pos)->node is NULL, which is what results from
 * applying container_of() to a NULL returned by rb_first()/rb_next().
 * The step expression calls rb_next(&(pos)->node) AFTER the body has run, so
 * the current entry must still be allocated and linked at that point.
 */
#define foreach_grant(pos, rbtree, node) \
for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node); \
&(pos)->node != NULL; \
(pos) = container_of(rb_next(&(pos)->node), typeof(*(pos)), node))
/*
 * Insert @persistent_gnt into the rb-tree @root, which is ordered by the
 * ->gnt field.  Finding an entry with the same ->gnt already in the tree is
 * treated as a bug: an alert is printed and BUG() is called.
 */
static void add_persistent_gnt(struct rb_root *root,
			       struct persistent_gnt *persistent_gnt)
{
	struct rb_node **link = &root->rb_node;
	struct rb_node *parent = NULL;

	/* Descend to the empty link where the new node belongs. */
	while (*link) {
		struct persistent_gnt *cur =
			container_of(*link, struct persistent_gnt, node);

		parent = *link;
		if (persistent_gnt->gnt < cur->gnt) {
			link = &(*link)->rb_left;
		} else if (persistent_gnt->gnt > cur->gnt) {
			link = &(*link)->rb_right;
		} else {
			pr_alert(DRV_PFX " trying to add a gref that's already in the tree\n");
			BUG();
		}
	}

	/* Hook the node in below @parent, then let the tree rebalance. */
	rb_link_node(&persistent_gnt->node, parent, link);
	rb_insert_color(&persistent_gnt->node, root);
}
/*
 * Look up the entry whose ->gnt equals @gref in the rb-tree @root.
 * Returns the matching persistent_gnt, or NULL if there is none.
 */
static struct persistent_gnt *get_persistent_gnt(struct rb_root *root,
						 grant_ref_t gref)
{
	struct rb_node *cursor;

	for (cursor = root->rb_node; cursor; ) {
		struct persistent_gnt *entry =
			container_of(cursor, struct persistent_gnt, node);

		if (gref == entry->gnt)
			return entry;

		cursor = (gref < entry->gnt) ? cursor->rb_left
					     : cursor->rb_right;
	}

	return NULL;
}
static void free_persistent_gnts(struct rb_root *root, unsigned int num)
{
struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
struct persistent_gnt *persistent_gnt;
int ret = 0;
int segs_to_unmap = 0;
foreach_grant(persistent_gnt, root, node) {
BUG_ON(persistent_gnt->handle ==
BLKBACK_INVALID_HANDLE);
gnttab_set_unmap_op(&unmap[segs_to_unmap],
(unsigned long) pfn_to_kaddr(page_to_pfn(
persistent_gnt->page)),
GNTMAP_host_map,
persistent_gnt->handle);
pages[segs_to_unmap] = persistent_gnt->page;
rb_erase(&persistent_gnt->node, root);
kfree(persistent_gnt);
num--;
if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
!rb_next(&persistent_gnt->node)) {
ret = gnttab_unmap_refs(unmap, NULL, pages,
segs_to_unmap);
BUG_ON(ret);
segs_to_unmap = 0;
}
}
BUG_ON(num != 0);
}
/*
* Retrieve from the 'pending_reqs' a free pending_req structure to be used.
*/
@ -302,6 +418,14 @@ int xen_blkif_schedule(void *arg)
print_stats(blkif);
}
/* Free all persistent grant pages */
if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
free_persistent_gnts(&blkif->persistent_gnts,
blkif->persistent_gnt_c);
BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
blkif->persistent_gnt_c = 0;
if (log_stats)
print_stats(blkif);
@ -328,6 +452,8 @@ static void xen_blkbk_unmap(struct pending_req *req)
int ret;
for (i = 0; i < req->nr_pages; i++) {
if (!test_bit(i, req->unmap_seg))
continue;
handle = pending_handle(req, i);
if (handle == BLKBACK_INVALID_HANDLE)
continue;
@ -344,12 +470,26 @@ static void xen_blkbk_unmap(struct pending_req *req)
static int xen_blkbk_map(struct blkif_request *req,
struct pending_req *pending_req,
struct seg_buf seg[])
struct seg_buf seg[],
struct page *pages[])
{
struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
int i;
struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST];
struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
struct persistent_gnt *persistent_gnt = NULL;
struct xen_blkif *blkif = pending_req->blkif;
phys_addr_t addr = 0;
int i, j;
bool new_map;
int nseg = req->u.rw.nr_segments;
int segs_to_map = 0;
int ret = 0;
int use_persistent_gnts;
use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
BUG_ON(blkif->persistent_gnt_c >
max_mapped_grant_pages(pending_req->blkif->blk_protocol));
/*
* Fill out preq.nr_sects with proper amount of sectors, and setup
@ -359,36 +499,146 @@ static int xen_blkbk_map(struct blkif_request *req,
for (i = 0; i < nseg; i++) {
uint32_t flags;
flags = GNTMAP_host_map;
if (pending_req->operation != BLKIF_OP_READ)
flags |= GNTMAP_readonly;
gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
req->u.rw.seg[i].gref,
pending_req->blkif->domid);
if (use_persistent_gnts)
persistent_gnt = get_persistent_gnt(
&blkif->persistent_gnts,
req->u.rw.seg[i].gref);
if (persistent_gnt) {
/*
* We are using persistent grants and
* the grant is already mapped
*/
new_map = false;
} else if (use_persistent_gnts &&
blkif->persistent_gnt_c <
max_mapped_grant_pages(blkif->blk_protocol)) {
/*
* We are using persistent grants, the grant is
* not mapped but we have room for it
*/
new_map = true;
persistent_gnt = kmalloc(
sizeof(struct persistent_gnt),
GFP_KERNEL);
if (!persistent_gnt)
return -ENOMEM;
persistent_gnt->page = alloc_page(GFP_KERNEL);
if (!persistent_gnt->page) {
kfree(persistent_gnt);
return -ENOMEM;
}
persistent_gnt->gnt = req->u.rw.seg[i].gref;
persistent_gnt->handle = BLKBACK_INVALID_HANDLE;
pages_to_gnt[segs_to_map] =
persistent_gnt->page;
addr = (unsigned long) pfn_to_kaddr(
page_to_pfn(persistent_gnt->page));
add_persistent_gnt(&blkif->persistent_gnts,
persistent_gnt);
blkif->persistent_gnt_c++;
pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
persistent_gnt->gnt, blkif->persistent_gnt_c,
max_mapped_grant_pages(blkif->blk_protocol));
} else {
/*
* We are either using persistent grants and
* hit the maximum limit of grants mapped,
* or we are not using persistent grants.
*/
if (use_persistent_gnts &&
!blkif->vbd.overflow_max_grants) {
blkif->vbd.overflow_max_grants = 1;
pr_alert(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
blkif->domid, blkif->vbd.handle);
}
new_map = true;
pages[i] = blkbk->pending_page(pending_req, i);
addr = vaddr(pending_req, i);
pages_to_gnt[segs_to_map] =
blkbk->pending_page(pending_req, i);
}
if (persistent_gnt) {
pages[i] = persistent_gnt->page;
persistent_gnts[i] = persistent_gnt;
} else {
persistent_gnts[i] = NULL;
}
if (new_map) {
flags = GNTMAP_host_map;
if (!persistent_gnt &&
(pending_req->operation != BLKIF_OP_READ))
flags |= GNTMAP_readonly;
gnttab_set_map_op(&map[segs_to_map++], addr,
flags, req->u.rw.seg[i].gref,
blkif->domid);
}
}
ret = gnttab_map_refs(map, NULL, &blkbk->pending_page(pending_req, 0), nseg);
BUG_ON(ret);
if (segs_to_map) {
ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map);
BUG_ON(ret);
}
/*
* Now swizzle the MFN in our domain with the MFN from the other domain
* so that when we access vaddr(pending_req,i) it has the contents of
* the page from the other domain.
*/
for (i = 0; i < nseg; i++) {
if (unlikely(map[i].status != 0)) {
pr_debug(DRV_PFX "invalid buffer -- could not remap it\n");
map[i].handle = BLKBACK_INVALID_HANDLE;
ret |= 1;
bitmap_zero(pending_req->unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
for (i = 0, j = 0; i < nseg; i++) {
if (!persistent_gnts[i] ||
persistent_gnts[i]->handle == BLKBACK_INVALID_HANDLE) {
/* This is a newly mapped grant */
BUG_ON(j >= segs_to_map);
if (unlikely(map[j].status != 0)) {
pr_debug(DRV_PFX "invalid buffer -- could not remap it\n");
map[j].handle = BLKBACK_INVALID_HANDLE;
ret |= 1;
if (persistent_gnts[i]) {
rb_erase(&persistent_gnts[i]->node,
&blkif->persistent_gnts);
blkif->persistent_gnt_c--;
kfree(persistent_gnts[i]);
persistent_gnts[i] = NULL;
}
}
}
if (persistent_gnts[i]) {
if (persistent_gnts[i]->handle ==
BLKBACK_INVALID_HANDLE) {
/*
* If this is a new persistent grant
* save the handler
*/
persistent_gnts[i]->handle = map[j].handle;
persistent_gnts[i]->dev_bus_addr =
map[j++].dev_bus_addr;
}
pending_handle(pending_req, i) =
persistent_gnts[i]->handle;
pending_handle(pending_req, i) = map[i].handle;
if (ret)
continue;
if (ret)
continue;
seg[i].buf = persistent_gnts[i]->dev_bus_addr |
(req->u.rw.seg[i].first_sect << 9);
} else {
pending_handle(pending_req, i) = map[j].handle;
bitmap_set(pending_req->unmap_seg, i, 1);
seg[i].buf = map[i].dev_bus_addr |
(req->u.rw.seg[i].first_sect << 9);
if (ret) {
j++;
continue;
}
seg[i].buf = map[j++].dev_bus_addr |
(req->u.rw.seg[i].first_sect << 9);
}
}
return ret;
}
@ -591,6 +841,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
int operation;
struct blk_plug plug;
bool drain = false;
struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
switch (req->operation) {
case BLKIF_OP_READ:
@ -677,7 +928,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
* the hypercall to unmap the grants - that is all done in
* xen_blkbk_unmap.
*/
if (xen_blkbk_map(req, pending_req, seg))
if (xen_blkbk_map(req, pending_req, seg, pages))
goto fail_flush;
/*
@ -689,7 +940,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
for (i = 0; i < nseg; i++) {
while ((bio == NULL) ||
(bio_add_page(bio,
blkbk->pending_page(pending_req, i),
pages[i],
seg[i].nsec << 9,
seg[i].buf & ~PAGE_MASK) == 0)) {

View File

@ -34,6 +34,7 @@
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/io.h>
#include <linux/rbtree.h>
#include <asm/setup.h>
#include <asm/pgalloc.h>
#include <asm/hypervisor.h>
@ -160,10 +161,21 @@ struct xen_vbd {
sector_t size;
unsigned int flush_support:1;
unsigned int discard_secure:1;
unsigned int feature_gnt_persistent:1;
unsigned int overflow_max_grants:1;
};
struct backend_info;
/*
 * One persistently mapped grant.  Entries are linked through @node into the
 * rb-tree xen_blkif.persistent_gnts, which blkback orders and searches by
 * @gnt.
 */
struct persistent_gnt {
/* page backing the mapping */
struct page *page;
/* grant reference taken from the request segment; rb-tree key */
grant_ref_t gnt;
/* handle of the mapping; blkback sets BLKBACK_INVALID_HANDLE until mapped */
grant_handle_t handle;
/* dev_bus_addr reported by the map operation for this grant */
uint64_t dev_bus_addr;
struct rb_node node;
};
struct xen_blkif {
/* Unique identifier for this interface. */
domid_t domid;
@ -190,6 +202,10 @@ struct xen_blkif {
struct task_struct *xenblkd;
unsigned int waiting_reqs;
/* tree to store persistent grants */
struct rb_root persistent_gnts;
unsigned int persistent_gnt_c;
/* statistics */
unsigned long st_print;
int st_rd_req;

View File

@ -117,6 +117,7 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
atomic_set(&blkif->drain, 0);
blkif->st_print = jiffies;
init_waitqueue_head(&blkif->waiting_to_free);
blkif->persistent_gnts.rb_node = NULL;
return blkif;
}
@ -672,6 +673,13 @@ again:
xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support);
err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u", 1);
if (err) {
xenbus_dev_fatal(dev, err, "writing %s/feature-persistent",
dev->nodename);
goto abort;
}
err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
(unsigned long long)vbd_sz(&be->blkif->vbd));
if (err) {
@ -720,6 +728,7 @@ static int connect_ring(struct backend_info *be)
struct xenbus_device *dev = be->dev;
unsigned long ring_ref;
unsigned int evtchn;
unsigned int pers_grants;
char protocol[64] = "";
int err;
@ -749,8 +758,18 @@ static int connect_ring(struct backend_info *be)
xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
return -1;
}
pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s)\n",
ring_ref, evtchn, be->blkif->blk_protocol, protocol);
err = xenbus_gather(XBT_NIL, dev->otherend,
"feature-persistent", "%u",
&pers_grants, NULL);
if (err)
pers_grants = 0;
be->blkif->vbd.feature_gnt_persistent = pers_grants;
be->blkif->vbd.overflow_max_grants = 0;
pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s) %s\n",
ring_ref, evtchn, be->blkif->blk_protocol, protocol,
pers_grants ? "persistent grants" : "");
/* Map the shared frame, irq etc. */
err = xen_blkif_map(be->blkif, ring_ref, evtchn);

View File

@ -44,6 +44,7 @@
#include <linux/mutex.h>
#include <linux/scatterlist.h>
#include <linux/bitmap.h>
#include <linux/llist.h>
#include <xen/xen.h>
#include <xen/xenbus.h>
@ -64,10 +65,17 @@ enum blkif_state {
BLKIF_STATE_SUSPENDED,
};
/*
 * One persistent grant owned by the frontend.
 *
 * blkif_queue_request() allocates a record together with a fresh page when
 * no free persistent grant is available, blkif_completion() pushes the
 * records used by a finished request back onto info->persistent_gnts, and
 * blkif_free() ends foreign access, frees the page and frees the record.
 */
struct grant {
/* grant reference claimed for (and kept with) this page */
grant_ref_t gref;
/* pfn of the page that is shared with the backend through gref */
unsigned long pfn;
/* linkage on the lock-less list blkfront_info.persistent_gnts */
struct llist_node node;
};
/*
 * Frontend-private shadow of one ring request, stored in
 * blkfront_info.shadow[] and indexed by request id.
 */
struct blk_shadow {
/* private copy of the ring request, kept so it can be reissued on recovery */
struct blkif_request req;
/* block-layer request this ring request was built from */
struct request *request;
/* per-segment pfn, stored as mfn_to_pfn(buffer_mfn) when the request is queued */
unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
/*
 * per-segment persistent grant used by this request; blkif_completion()
 * copies read data out of these pages and returns the grants to the
 * free list
 */
struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST];
};
static DEFINE_MUTEX(blkfront_mutex);
@ -97,6 +105,8 @@ struct blkfront_info
struct work_struct work;
struct gnttab_free_callback callback;
struct blk_shadow shadow[BLK_RING_SIZE];
struct llist_head persistent_gnts;
unsigned int persistent_gnts_c;
unsigned long shadow_free;
unsigned int feature_flush;
unsigned int flush_op;
@ -104,6 +114,7 @@ struct blkfront_info
unsigned int feature_secdiscard:1;
unsigned int discard_granularity;
unsigned int discard_alignment;
unsigned int feature_persistent:1;
int is_ready;
};
@ -287,21 +298,36 @@ static int blkif_queue_request(struct request *req)
unsigned long id;
unsigned int fsect, lsect;
int i, ref;
/*
* Used to store if we are able to queue the request by just using
* existing persistent grants, or if we have to get new grants,
* as there are not sufficiently many free.
*/
bool new_persistent_gnts;
grant_ref_t gref_head;
struct page *granted_page;
struct grant *gnt_list_entry = NULL;
struct scatterlist *sg;
if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
return 1;
if (gnttab_alloc_grant_references(
BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
gnttab_request_free_callback(
&info->callback,
blkif_restart_queue_callback,
info,
BLKIF_MAX_SEGMENTS_PER_REQUEST);
return 1;
}
/* Check if we have enough grants to allocate a request */
if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) {
new_persistent_gnts = 1;
if (gnttab_alloc_grant_references(
BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c,
&gref_head) < 0) {
gnttab_request_free_callback(
&info->callback,
blkif_restart_queue_callback,
info,
BLKIF_MAX_SEGMENTS_PER_REQUEST);
return 1;
}
} else
new_persistent_gnts = 0;
/* Fill out a communications ring structure. */
ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
@ -341,18 +367,73 @@ static int blkif_queue_request(struct request *req)
BLKIF_MAX_SEGMENTS_PER_REQUEST);
for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) {
buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg)));
fsect = sg->offset >> 9;
lsect = fsect + (sg->length >> 9) - 1;
/* install a grant reference. */
ref = gnttab_claim_grant_reference(&gref_head);
BUG_ON(ref == -ENOSPC);
gnttab_grant_foreign_access_ref(
ref,
if (info->persistent_gnts_c) {
BUG_ON(llist_empty(&info->persistent_gnts));
gnt_list_entry = llist_entry(
llist_del_first(&info->persistent_gnts),
struct grant, node);
ref = gnt_list_entry->gref;
buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn);
info->persistent_gnts_c--;
} else {
ref = gnttab_claim_grant_reference(&gref_head);
BUG_ON(ref == -ENOSPC);
gnt_list_entry =
kmalloc(sizeof(struct grant),
GFP_ATOMIC);
if (!gnt_list_entry)
return -ENOMEM;
granted_page = alloc_page(GFP_ATOMIC);
if (!granted_page) {
kfree(gnt_list_entry);
return -ENOMEM;
}
gnt_list_entry->pfn =
page_to_pfn(granted_page);
gnt_list_entry->gref = ref;
buffer_mfn = pfn_to_mfn(page_to_pfn(
granted_page));
gnttab_grant_foreign_access_ref(ref,
info->xbdev->otherend_id,
buffer_mfn,
rq_data_dir(req));
buffer_mfn, 0);
}
info->shadow[id].grants_used[i] = gnt_list_entry;
if (rq_data_dir(req)) {
char *bvec_data;
void *shared_data;
BUG_ON(sg->offset + sg->length > PAGE_SIZE);
shared_data = kmap_atomic(
pfn_to_page(gnt_list_entry->pfn));
bvec_data = kmap_atomic(sg_page(sg));
/*
* this does not wipe data stored outside the
* range sg->offset..sg->offset+sg->length.
* Therefore, blkback *could* see data from
* previous requests. This is OK as long as
* persistent grants are shared with just one
* domain. It may need refactoring if this
* changes
*/
memcpy(shared_data + sg->offset,
bvec_data + sg->offset,
sg->length);
kunmap_atomic(bvec_data);
kunmap_atomic(shared_data);
}
info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
ring_req->u.rw.seg[i] =
@ -368,7 +449,8 @@ static int blkif_queue_request(struct request *req)
/* Keep a private copy so we can reissue requests when recovering. */
info->shadow[id].req = *ring_req;
gnttab_free_grant_references(gref_head);
if (new_persistent_gnts)
gnttab_free_grant_references(gref_head);
return 0;
}
@ -480,12 +562,13 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
static void xlvbd_flush(struct blkfront_info *info)
{
blk_queue_flush(info->rq, info->feature_flush);
printk(KERN_INFO "blkfront: %s: %s: %s\n",
printk(KERN_INFO "blkfront: %s: %s: %s %s\n",
info->gd->disk_name,
info->flush_op == BLKIF_OP_WRITE_BARRIER ?
"barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
"flush diskcache" : "barrier or flush"),
info->feature_flush ? "enabled" : "disabled");
info->feature_flush ? "enabled" : "disabled",
info->feature_persistent ? "using persistent grants" : "");
}
static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
@ -707,6 +790,9 @@ static void blkif_restart_queue(struct work_struct *work)
static void blkif_free(struct blkfront_info *info, int suspend)
{
struct llist_node *all_gnts;
struct grant *persistent_gnt;
/* Prevent new requests being issued until we fix things up. */
spin_lock_irq(&info->io_lock);
info->connected = suspend ?
@ -714,6 +800,18 @@ static void blkif_free(struct blkfront_info *info, int suspend)
/* No more blkif_request(). */
if (info->rq)
blk_stop_queue(info->rq);
/* Remove all persistent grants */
if (info->persistent_gnts_c) {
all_gnts = llist_del_all(&info->persistent_gnts);
llist_for_each_entry(persistent_gnt, all_gnts, node) {
gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
__free_page(pfn_to_page(persistent_gnt->pfn));
kfree(persistent_gnt);
}
info->persistent_gnts_c = 0;
}
/* No more gnttab callback work. */
gnttab_cancel_free_callback(&info->callback);
spin_unlock_irq(&info->io_lock);
@ -734,13 +832,43 @@ static void blkif_free(struct blkfront_info *info, int suspend)
}
static void blkif_completion(struct blk_shadow *s)
static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
struct blkif_response *bret)
{
int i;
/* Do not let BLKIF_OP_DISCARD as nr_segment is in the same place
* flag. */
for (i = 0; i < s->req.u.rw.nr_segments; i++)
gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL);
struct bio_vec *bvec;
struct req_iterator iter;
unsigned long flags;
char *bvec_data;
void *shared_data;
unsigned int offset = 0;
if (bret->operation == BLKIF_OP_READ) {
/*
* Copy the data received from the backend into the bvec.
* Since bv_offset can be different than 0, and bv_len different
* than PAGE_SIZE, we have to keep track of the current offset,
* to be sure we are copying the data from the right shared page.
*/
rq_for_each_segment(bvec, s->request, iter) {
BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE);
i = offset >> PAGE_SHIFT;
BUG_ON(i >= s->req.u.rw.nr_segments);
shared_data = kmap_atomic(
pfn_to_page(s->grants_used[i]->pfn));
bvec_data = bvec_kmap_irq(bvec, &flags);
memcpy(bvec_data, shared_data + bvec->bv_offset,
bvec->bv_len);
bvec_kunmap_irq(bvec_data, &flags);
kunmap_atomic(shared_data);
offset += bvec->bv_len;
}
}
/* Add the persistent grant into the list of free grants */
for (i = 0; i < s->req.u.rw.nr_segments; i++) {
llist_add(&s->grants_used[i]->node, &info->persistent_gnts);
info->persistent_gnts_c++;
}
}
static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@ -783,7 +911,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
req = info->shadow[id].request;
if (bret->operation != BLKIF_OP_DISCARD)
blkif_completion(&info->shadow[id]);
blkif_completion(&info->shadow[id], info, bret);
if (add_id_to_freelist(info, id)) {
WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
@ -942,6 +1070,11 @@ again:
message = "writing protocol";
goto abort_transaction;
}
err = xenbus_printf(xbt, dev->nodename,
"feature-persistent", "%u", 1);
if (err)
dev_warn(&dev->dev,
"writing persistent grants feature to xenbus");
err = xenbus_transaction_end(xbt, 0);
if (err) {
@ -1029,6 +1162,8 @@ static int blkfront_probe(struct xenbus_device *dev,
spin_lock_init(&info->io_lock);
info->xbdev = dev;
info->vdevice = vdevice;
init_llist_head(&info->persistent_gnts);
info->persistent_gnts_c = 0;
info->connected = BLKIF_STATE_DISCONNECTED;
INIT_WORK(&info->work, blkif_restart_queue);
@ -1093,7 +1228,7 @@ static int blkif_recover(struct blkfront_info *info)
req->u.rw.seg[j].gref,
info->xbdev->otherend_id,
pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]),
rq_data_dir(info->shadow[req->u.rw.id].request));
0);
}
info->shadow[req->u.rw.id].req = *req;
@ -1225,7 +1360,7 @@ static void blkfront_connect(struct blkfront_info *info)
unsigned long sector_size;
unsigned int binfo;
int err;
int barrier, flush, discard;
int barrier, flush, discard, persistent;
switch (info->connected) {
case BLKIF_STATE_CONNECTED:
@ -1303,6 +1438,14 @@ static void blkfront_connect(struct blkfront_info *info)
if (!err && discard)
blkfront_setup_discard(info);
err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
"feature-persistent", "%u", &persistent,
NULL);
if (err)
info->feature_persistent = 0;
else
info->feature_persistent = persistent;
err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
if (err) {
xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",

View File

@ -452,7 +452,7 @@ void md_flush_request(struct mddev *mddev, struct bio *bio)
spin_lock_irq(&mddev->write_lock);
wait_event_lock_irq(mddev->sb_wait,
!mddev->flush_bio,
mddev->write_lock, /*nothing*/);
mddev->write_lock);
mddev->flush_bio = bio;
spin_unlock_irq(&mddev->write_lock);

View File

@ -551,32 +551,6 @@ struct md_thread {
#define THREAD_WAKEUP 0
#define __wait_event_lock_irq(wq, condition, lock, cmd) \
do { \
wait_queue_t __wait; \
init_waitqueue_entry(&__wait, current); \
\
add_wait_queue(&wq, &__wait); \
for (;;) { \
set_current_state(TASK_UNINTERRUPTIBLE); \
if (condition) \
break; \
spin_unlock_irq(&lock); \
cmd; \
schedule(); \
spin_lock_irq(&lock); \
} \
current->state = TASK_RUNNING; \
remove_wait_queue(&wq, &__wait); \
} while (0)
#define wait_event_lock_irq(wq, condition, lock, cmd) \
do { \
if (condition) \
break; \
__wait_event_lock_irq(wq, condition, lock, cmd); \
} while (0)
static inline void safe_put_page(struct page *p)
{
if (p) put_page(p);

View File

@ -822,7 +822,7 @@ static void raise_barrier(struct r1conf *conf)
/* Wait until no block IO is waiting */
wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
conf->resync_lock, );
conf->resync_lock);
/* block any new IO from starting */
conf->barrier++;
@ -830,7 +830,7 @@ static void raise_barrier(struct r1conf *conf)
/* Now wait for all pending IO to complete */
wait_event_lock_irq(conf->wait_barrier,
!conf->nr_pending && conf->barrier < RESYNC_DEPTH,
conf->resync_lock, );
conf->resync_lock);
spin_unlock_irq(&conf->resync_lock);
}
@ -864,8 +864,7 @@ static void wait_barrier(struct r1conf *conf)
(conf->nr_pending &&
current->bio_list &&
!bio_list_empty(current->bio_list)),
conf->resync_lock,
);
conf->resync_lock);
conf->nr_waiting--;
}
conf->nr_pending++;
@ -898,10 +897,10 @@ static void freeze_array(struct r1conf *conf)
spin_lock_irq(&conf->resync_lock);
conf->barrier++;
conf->nr_waiting++;
wait_event_lock_irq(conf->wait_barrier,
conf->nr_pending == conf->nr_queued+1,
conf->resync_lock,
flush_pending_writes(conf));
wait_event_lock_irq_cmd(conf->wait_barrier,
conf->nr_pending == conf->nr_queued+1,
conf->resync_lock,
flush_pending_writes(conf));
spin_unlock_irq(&conf->resync_lock);
}
static void unfreeze_array(struct r1conf *conf)

View File

@ -952,7 +952,7 @@ static void raise_barrier(struct r10conf *conf, int force)
/* Wait until no block IO is waiting (unless 'force') */
wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
conf->resync_lock, );
conf->resync_lock);
/* block any new IO from starting */
conf->barrier++;
@ -960,7 +960,7 @@ static void raise_barrier(struct r10conf *conf, int force)
/* Now wait for all pending IO to complete */
wait_event_lock_irq(conf->wait_barrier,
!conf->nr_pending && conf->barrier < RESYNC_DEPTH,
conf->resync_lock, );
conf->resync_lock);
spin_unlock_irq(&conf->resync_lock);
}
@ -993,8 +993,7 @@ static void wait_barrier(struct r10conf *conf)
(conf->nr_pending &&
current->bio_list &&
!bio_list_empty(current->bio_list)),
conf->resync_lock,
);
conf->resync_lock);
conf->nr_waiting--;
}
conf->nr_pending++;
@ -1027,10 +1026,10 @@ static void freeze_array(struct r10conf *conf)
spin_lock_irq(&conf->resync_lock);
conf->barrier++;
conf->nr_waiting++;
wait_event_lock_irq(conf->wait_barrier,
conf->nr_pending == conf->nr_queued+1,
conf->resync_lock,
flush_pending_writes(conf));
wait_event_lock_irq_cmd(conf->wait_barrier,
conf->nr_pending == conf->nr_queued+1,
conf->resync_lock,
flush_pending_writes(conf));
spin_unlock_irq(&conf->resync_lock);
}

View File

@ -466,7 +466,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
do {
wait_event_lock_irq(conf->wait_for_stripe,
conf->quiesce == 0 || noquiesce,
conf->device_lock, /* nothing */);
conf->device_lock);
sh = __find_stripe(conf, sector, conf->generation - previous);
if (!sh) {
if (!conf->inactive_blocked)
@ -480,8 +480,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
(atomic_read(&conf->active_stripes)
< (conf->max_nr_stripes *3/4)
|| !conf->inactive_blocked),
conf->device_lock,
);
conf->device_lock);
conf->inactive_blocked = 0;
} else
init_stripe(sh, sector, previous);
@ -1646,8 +1645,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
spin_lock_irq(&conf->device_lock);
wait_event_lock_irq(conf->wait_for_stripe,
!list_empty(&conf->inactive_list),
conf->device_lock,
);
conf->device_lock);
osh = get_free_stripe(conf);
spin_unlock_irq(&conf->device_lock);
atomic_set(&nsh->count, 1);
@ -4003,7 +4001,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
spin_lock_irq(&conf->device_lock);
wait_event_lock_irq(conf->wait_for_stripe,
conf->quiesce == 0,
conf->device_lock, /* nothing */);
conf->device_lock);
atomic_inc(&conf->active_aligned_reads);
spin_unlock_irq(&conf->device_lock);
@ -6095,7 +6093,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
wait_event_lock_irq(conf->wait_for_stripe,
atomic_read(&conf->active_stripes) == 0 &&
atomic_read(&conf->active_aligned_reads) == 0,
conf->device_lock, /* nothing */);
conf->device_lock);
conf->quiesce = 1;
spin_unlock_irq(&conf->device_lock);
/* allow reshape to continue */

View File

@ -51,12 +51,11 @@
#endif
extern const char *drbd_buildtag(void);
#define REL_VERSION "8.3.13"
#define API_VERSION 88
#define REL_VERSION "8.4.2"
#define API_VERSION 1
#define PRO_VERSION_MIN 86
#define PRO_VERSION_MAX 96
#define PRO_VERSION_MAX 101
enum drbd_io_error_p {
@ -66,7 +65,8 @@ enum drbd_io_error_p {
};
enum drbd_fencing_p {
FP_DONT_CARE,
FP_NOT_AVAIL = -1, /* Not a policy */
FP_DONT_CARE = 0,
FP_RESOURCE,
FP_STONITH
};
@ -102,6 +102,20 @@ enum drbd_on_congestion {
OC_DISCONNECT,
};
/*
 * Read-balancing policy identifiers.
 *
 * The enumerators carry no explicit values, so they are numbered 0..10 in
 * declaration order.
 * NOTE(review): presumably this is the value space of the u32
 * "read_balancing" field of disk_conf (default DRBD_READ_BALANCING_DEF);
 * if so, inserting or reordering entries would change values exchanged
 * with userspace -- confirm before touching the order.
 * NOTE(review): the exact behaviour of each policy is implemented elsewhere;
 * the names suggest local/remote preference, round robin, load based
 * selection and striping at the given granularity -- confirm against the
 * request path.
 */
enum drbd_read_balancing {
RB_PREFER_LOCAL,
RB_PREFER_REMOTE,
RB_ROUND_ROBIN,
RB_LEAST_PENDING,
RB_CONGESTED_REMOTE,
RB_32K_STRIPING,
RB_64K_STRIPING,
RB_128K_STRIPING,
RB_256K_STRIPING,
RB_512K_STRIPING,
RB_1M_STRIPING,
};
/* KEEP the order, do not delete or insert. Only append. */
enum drbd_ret_code {
ERR_CODE_BASE = 100,
@ -122,7 +136,7 @@ enum drbd_ret_code {
ERR_AUTH_ALG = 120,
ERR_AUTH_ALG_ND = 121,
ERR_NOMEM = 122,
ERR_DISCARD = 123,
ERR_DISCARD_IMPOSSIBLE = 123,
ERR_DISK_CONFIGURED = 124,
ERR_NET_CONFIGURED = 125,
ERR_MANDATORY_TAG = 126,
@ -130,8 +144,8 @@ enum drbd_ret_code {
ERR_INTR = 129, /* EINTR */
ERR_RESIZE_RESYNC = 130,
ERR_NO_PRIMARY = 131,
ERR_SYNC_AFTER = 132,
ERR_SYNC_AFTER_CYCLE = 133,
ERR_RESYNC_AFTER = 132,
ERR_RESYNC_AFTER_CYCLE = 133,
ERR_PAUSE_IS_SET = 134,
ERR_PAUSE_IS_CLEAR = 135,
ERR_PACKET_NR = 137,
@ -155,6 +169,14 @@ enum drbd_ret_code {
ERR_CONG_NOT_PROTO_A = 155,
ERR_PIC_AFTER_DEP = 156,
ERR_PIC_PEER_DEP = 157,
ERR_RES_NOT_KNOWN = 158,
ERR_RES_IN_USE = 159,
ERR_MINOR_CONFIGURED = 160,
ERR_MINOR_EXISTS = 161,
ERR_INVALID_REQUEST = 162,
ERR_NEED_APV_100 = 163,
ERR_NEED_ALLOW_TWO_PRI = 164,
ERR_MD_UNCLEAN = 165,
/* insert new ones above this line */
AFTER_LAST_ERR_CODE
@ -296,7 +318,8 @@ enum drbd_state_rv {
SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */
SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */
SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */
SS_AFTER_LAST_ERROR = -20, /* Keep this at bottom */
SS_O_VOL_PEER_PRI = -20,
SS_AFTER_LAST_ERROR = -21, /* Keep this at bottom */
};
/* from drbd_strings.c */
@ -313,7 +336,9 @@ extern const char *drbd_set_st_err_str(enum drbd_state_rv);
#define MDF_FULL_SYNC (1 << 3)
#define MDF_WAS_UP_TO_DATE (1 << 4)
#define MDF_PEER_OUT_DATED (1 << 5)
#define MDF_CRASHED_PRIMARY (1 << 6)
#define MDF_CRASHED_PRIMARY (1 << 6)
#define MDF_AL_CLEAN (1 << 7)
#define MDF_AL_DISABLED (1 << 8)
enum drbd_uuid_index {
UI_CURRENT,
@ -333,37 +358,23 @@ enum drbd_timeout_flag {
#define UUID_JUST_CREATED ((__u64)4)
/* magic numbers used in meta data and network packets */
#define DRBD_MAGIC 0x83740267
#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC)
#define DRBD_MAGIC_BIG 0x835a
#define BE_DRBD_MAGIC_BIG __constant_cpu_to_be16(DRBD_MAGIC_BIG)
#define DRBD_MAGIC_100 0x8620ec20
#define DRBD_MD_MAGIC_07 (DRBD_MAGIC+3)
#define DRBD_MD_MAGIC_08 (DRBD_MAGIC+4)
#define DRBD_MD_MAGIC_84_UNCLEAN (DRBD_MAGIC+5)
/* how I came up with this magic?
* base64 decode "actlog==" ;) */
#define DRBD_AL_MAGIC 0x69cb65a2
/* these are of type "int" */
#define DRBD_MD_INDEX_INTERNAL -1
#define DRBD_MD_INDEX_FLEX_EXT -2
#define DRBD_MD_INDEX_FLEX_INT -3
/* Start of the new netlink/connector stuff */
#define DRBD_NL_CREATE_DEVICE 0x01
#define DRBD_NL_SET_DEFAULTS 0x02
/* For searching a vacant cn_idx value */
#define CN_IDX_STEP 6977
struct drbd_nl_cfg_req {
int packet_type;
unsigned int drbd_minor;
int flags;
unsigned short tag_list[];
};
struct drbd_nl_cfg_reply {
int packet_type;
unsigned int minor;
int ret_code; /* enum ret_code or set_st_err_t */
unsigned short tag_list[]; /* only used with get_* calls */
};
#endif

378
include/linux/drbd_genl.h Normal file
View File

@ -0,0 +1,378 @@
/*
* General overview:
* full generic netlink message:
* |nlmsghdr|genlmsghdr|<payload>
*
* payload:
* |optional fixed size family header|<sequence of netlink attributes>
*
* sequence of netlink attributes:
* I chose to have all "top level" attributes NLA_NESTED,
* corresponding to some real struct.
* So we have a sequence of |tla, len|<nested nla sequence>
*
* nested nla sequence:
* may be empty, or contain a sequence of netlink attributes
* representing the struct fields.
*
* The tag number of any field (regardless of containing struct)
* will be available as T_ ## field_name,
* so you cannot have the same field name in two different structs.
*
* The tag numbers themselves are per struct, though,
* so should always begin at 1 (not 0, that is the special "NLA_UNSPEC" type,
* which we won't use here).
* The tag numbers are used as index in the respective nla_policy array.
*
* GENL_struct(tag_name, tag_number, struct name, struct fields) - struct and policy
* genl_magic_struct.h
* generates the struct declaration,
* generates an entry in the tla enum,
* genl_magic_func.h
* generates an entry in the static tla policy
* with .type = NLA_NESTED
* generates the static <struct_name>_nl_policy definition,
* and static conversion functions
*
* genl_magic_func.h
*
* GENL_mc_group(group)
* genl_magic_struct.h
* does nothing
* genl_magic_func.h
* defines and registers the mcast group,
* and provides a send helper
*
* GENL_notification(op_name, op_num, mcast_group, tla list)
* These are notifications to userspace.
*
* genl_magic_struct.h
* generates an entry in the genl_ops enum,
* genl_magic_func.h
* does nothing
*
* mcast group: the name of the mcast group this notification should be
* expected on
* tla list: the list of expected top level attributes,
* for documentation and sanity checking.
*
* GENL_op(op_name, op_num, flags and handler, tla list) - "genl operations"
* These are requests from userspace.
*
* _op and _notification share the same "number space",
* op_nr will be assigned to "genlmsghdr->cmd"
*
* genl_magic_struct.h
* generates an entry in the genl_ops enum,
* genl_magic_func.h
* generates an entry in the static genl_ops array,
* and static register/unregister functions to
* genl_register_family_with_ops().
*
* flags and handler:
* GENL_op_init( .doit = x, .dumpit = y, .flags = something)
* GENL_doit(x) => .dumpit = NULL, .flags = GENL_ADMIN_PERM
* tla list: the list of expected top level attributes,
* for documentation and sanity checking.
*/
/*
* STRUCTS
*/
/* this is sent kernel -> userland on various error conditions, and contains
* informational textual info, which is supposedly human readable.
* The computer relevant return code is in the drbd_genlmsghdr.
*/
GENL_struct(DRBD_NLA_CFG_REPLY, 1, drbd_cfg_reply,
/* "arbitrary" size strings, nla_policy.len = 0 */
__str_field(1, DRBD_GENLA_F_MANDATORY, info_text, 0)
)
/* Configuration requests typically need a context to operate on.
* Possible keys are device minor (fits in the drbd_genlmsghdr),
* the replication link (aka connection) name,
* and/or the replication group (aka resource) name,
* and the volume id within the resource. */
GENL_struct(DRBD_NLA_CFG_CONTEXT, 2, drbd_cfg_context,
__u32_field(1, DRBD_GENLA_F_MANDATORY, ctx_volume)
__str_field(2, DRBD_GENLA_F_MANDATORY, ctx_resource_name, 128)
__bin_field(3, DRBD_GENLA_F_MANDATORY, ctx_my_addr, 128)
__bin_field(4, DRBD_GENLA_F_MANDATORY, ctx_peer_addr, 128)
)
GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf,
__str_field(1, DRBD_F_REQUIRED | DRBD_F_INVARIANT, backing_dev, 128)
__str_field(2, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev, 128)
__s32_field(3, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev_idx)
/* use the resize command to try and change the disk_size */
__u64_field(4, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, disk_size)
/* we could change the max_bio_bvecs,
* but it won't propagate through the stack */
__u32_field(5, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, max_bio_bvecs)
__u32_field_def(6, DRBD_GENLA_F_MANDATORY, on_io_error, DRBD_ON_IO_ERROR_DEF)
__u32_field_def(7, DRBD_GENLA_F_MANDATORY, fencing, DRBD_FENCING_DEF)
__u32_field_def(8, DRBD_GENLA_F_MANDATORY, resync_rate, DRBD_RESYNC_RATE_DEF)
__s32_field_def(9, DRBD_GENLA_F_MANDATORY, resync_after, DRBD_MINOR_NUMBER_DEF)
__u32_field_def(10, DRBD_GENLA_F_MANDATORY, al_extents, DRBD_AL_EXTENTS_DEF)
__u32_field_def(11, DRBD_GENLA_F_MANDATORY, c_plan_ahead, DRBD_C_PLAN_AHEAD_DEF)
__u32_field_def(12, DRBD_GENLA_F_MANDATORY, c_delay_target, DRBD_C_DELAY_TARGET_DEF)
__u32_field_def(13, DRBD_GENLA_F_MANDATORY, c_fill_target, DRBD_C_FILL_TARGET_DEF)
__u32_field_def(14, DRBD_GENLA_F_MANDATORY, c_max_rate, DRBD_C_MAX_RATE_DEF)
__u32_field_def(15, DRBD_GENLA_F_MANDATORY, c_min_rate, DRBD_C_MIN_RATE_DEF)
__flg_field_def(16, DRBD_GENLA_F_MANDATORY, disk_barrier, DRBD_DISK_BARRIER_DEF)
__flg_field_def(17, DRBD_GENLA_F_MANDATORY, disk_flushes, DRBD_DISK_FLUSHES_DEF)
__flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF)
__flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF)
__u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF)
__u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF)
/* 9: __u32_field_def(22, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) */
__flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF)
)
GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts,
__str_field_def(1, DRBD_GENLA_F_MANDATORY, cpu_mask, 32)
__u32_field_def(2, DRBD_GENLA_F_MANDATORY, on_no_data, DRBD_ON_NO_DATA_DEF)
)
GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf,
__str_field_def(1, DRBD_GENLA_F_MANDATORY | DRBD_F_SENSITIVE,
shared_secret, SHARED_SECRET_MAX)
__str_field_def(2, DRBD_GENLA_F_MANDATORY, cram_hmac_alg, SHARED_SECRET_MAX)
__str_field_def(3, DRBD_GENLA_F_MANDATORY, integrity_alg, SHARED_SECRET_MAX)
__str_field_def(4, DRBD_GENLA_F_MANDATORY, verify_alg, SHARED_SECRET_MAX)
__str_field_def(5, DRBD_GENLA_F_MANDATORY, csums_alg, SHARED_SECRET_MAX)
__u32_field_def(6, DRBD_GENLA_F_MANDATORY, wire_protocol, DRBD_PROTOCOL_DEF)
__u32_field_def(7, DRBD_GENLA_F_MANDATORY, connect_int, DRBD_CONNECT_INT_DEF)
__u32_field_def(8, DRBD_GENLA_F_MANDATORY, timeout, DRBD_TIMEOUT_DEF)
__u32_field_def(9, DRBD_GENLA_F_MANDATORY, ping_int, DRBD_PING_INT_DEF)
__u32_field_def(10, DRBD_GENLA_F_MANDATORY, ping_timeo, DRBD_PING_TIMEO_DEF)
__u32_field_def(11, DRBD_GENLA_F_MANDATORY, sndbuf_size, DRBD_SNDBUF_SIZE_DEF)
__u32_field_def(12, DRBD_GENLA_F_MANDATORY, rcvbuf_size, DRBD_RCVBUF_SIZE_DEF)
__u32_field_def(13, DRBD_GENLA_F_MANDATORY, ko_count, DRBD_KO_COUNT_DEF)
__u32_field_def(14, DRBD_GENLA_F_MANDATORY, max_buffers, DRBD_MAX_BUFFERS_DEF)
__u32_field_def(15, DRBD_GENLA_F_MANDATORY, max_epoch_size, DRBD_MAX_EPOCH_SIZE_DEF)
__u32_field_def(16, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF)
__u32_field_def(17, DRBD_GENLA_F_MANDATORY, after_sb_0p, DRBD_AFTER_SB_0P_DEF)
__u32_field_def(18, DRBD_GENLA_F_MANDATORY, after_sb_1p, DRBD_AFTER_SB_1P_DEF)
__u32_field_def(19, DRBD_GENLA_F_MANDATORY, after_sb_2p, DRBD_AFTER_SB_2P_DEF)
__u32_field_def(20, DRBD_GENLA_F_MANDATORY, rr_conflict, DRBD_RR_CONFLICT_DEF)
__u32_field_def(21, DRBD_GENLA_F_MANDATORY, on_congestion, DRBD_ON_CONGESTION_DEF)
__u32_field_def(22, DRBD_GENLA_F_MANDATORY, cong_fill, DRBD_CONG_FILL_DEF)
__u32_field_def(23, DRBD_GENLA_F_MANDATORY, cong_extents, DRBD_CONG_EXTENTS_DEF)
__flg_field_def(24, DRBD_GENLA_F_MANDATORY, two_primaries, DRBD_ALLOW_TWO_PRIMARIES_DEF)
__flg_field(25, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, discard_my_data)
__flg_field_def(26, DRBD_GENLA_F_MANDATORY, tcp_cork, DRBD_TCP_CORK_DEF)
__flg_field_def(27, DRBD_GENLA_F_MANDATORY, always_asbp, DRBD_ALWAYS_ASBP_DEF)
__flg_field(28, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, tentative)
__flg_field_def(29, DRBD_GENLA_F_MANDATORY, use_rle, DRBD_USE_RLE_DEF)
/* 9: __u32_field_def(30, DRBD_GENLA_F_MANDATORY, fencing_policy, DRBD_FENCING_DEF) */
)
GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms,
__flg_field(1, DRBD_GENLA_F_MANDATORY, assume_uptodate)
)
GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms,
__u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size)
__flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force)
__flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync)
)
GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info,
/* the reason of the broadcast,
* if this is an event triggered broadcast. */
__u32_field(1, DRBD_GENLA_F_MANDATORY, sib_reason)
__u32_field(2, DRBD_F_REQUIRED, current_state)
__u64_field(3, DRBD_GENLA_F_MANDATORY, capacity)
__u64_field(4, DRBD_GENLA_F_MANDATORY, ed_uuid)
/* These are for broadcast from after state change work.
* prev_state and new_state are from the moment the state change took
* place, new_state is not necessarily the same as current_state,
* there may have been more state changes since. Which will be
* broadcasted soon, in their respective after state change work. */
__u32_field(5, DRBD_GENLA_F_MANDATORY, prev_state)
__u32_field(6, DRBD_GENLA_F_MANDATORY, new_state)
/* if we have a local disk: */
__bin_field(7, DRBD_GENLA_F_MANDATORY, uuids, (UI_SIZE*sizeof(__u64)))
__u32_field(8, DRBD_GENLA_F_MANDATORY, disk_flags)
__u64_field(9, DRBD_GENLA_F_MANDATORY, bits_total)
__u64_field(10, DRBD_GENLA_F_MANDATORY, bits_oos)
/* and in case resync or online verify is active */
__u64_field(11, DRBD_GENLA_F_MANDATORY, bits_rs_total)
__u64_field(12, DRBD_GENLA_F_MANDATORY, bits_rs_failed)
/* for pre and post notifications of helper execution */
__str_field(13, DRBD_GENLA_F_MANDATORY, helper, 32)
__u32_field(14, DRBD_GENLA_F_MANDATORY, helper_exit_code)
__u64_field(15, 0, send_cnt)
__u64_field(16, 0, recv_cnt)
__u64_field(17, 0, read_cnt)
__u64_field(18, 0, writ_cnt)
__u64_field(19, 0, al_writ_cnt)
__u64_field(20, 0, bm_writ_cnt)
__u32_field(21, 0, ap_bio_cnt)
__u32_field(22, 0, ap_pending_cnt)
__u32_field(23, 0, rs_pending_cnt)
)
GENL_struct(DRBD_NLA_START_OV_PARMS, 9, start_ov_parms,
__u64_field(1, DRBD_GENLA_F_MANDATORY, ov_start_sector)
__u64_field(2, DRBD_GENLA_F_MANDATORY, ov_stop_sector)
)
GENL_struct(DRBD_NLA_NEW_C_UUID_PARMS, 10, new_c_uuid_parms,
__flg_field(1, DRBD_GENLA_F_MANDATORY, clear_bm)
)
GENL_struct(DRBD_NLA_TIMEOUT_PARMS, 11, timeout_parms,
__u32_field(1, DRBD_F_REQUIRED, timeout_type)
)
GENL_struct(DRBD_NLA_DISCONNECT_PARMS, 12, disconnect_parms,
__flg_field(1, DRBD_GENLA_F_MANDATORY, force_disconnect)
)
GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms,
__flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach)
)
/*
* Notifications and commands (genlmsghdr->cmd)
*/
GENL_mc_group(events)
/* kernel -> userspace announcement of changes */
GENL_notification(
DRBD_EVENT, 1, events,
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_STATE_INFO, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_GENLA_F_MANDATORY)
GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_GENLA_F_MANDATORY)
GENL_tla_expected(DRBD_NLA_SYNCER_CONF, DRBD_GENLA_F_MANDATORY)
)
/* query kernel for specific or all info */
GENL_op(
DRBD_ADM_GET_STATUS, 2,
GENL_op_init(
.doit = drbd_adm_get_status,
.dumpit = drbd_adm_get_status_all,
/* anyone may ask for the status,
* it is broadcasted anyways */
),
/* To select the object .doit.
* Or a subset of objects in .dumpit. */
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
)
/* add DRBD minor devices as volumes to resources */
GENL_op(DRBD_ADM_NEW_MINOR, 5, GENL_doit(drbd_adm_add_minor),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_DEL_MINOR, 6, GENL_doit(drbd_adm_delete_minor),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
/* add or delete resources */
GENL_op(DRBD_ADM_NEW_RESOURCE, 7, GENL_doit(drbd_adm_new_resource),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_DEL_RESOURCE, 8, GENL_doit(drbd_adm_del_resource),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_RESOURCE_OPTS, 9,
GENL_doit(drbd_adm_resource_opts),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_RESOURCE_OPTS, DRBD_GENLA_F_MANDATORY)
)
GENL_op(
DRBD_ADM_CONNECT, 10,
GENL_doit(drbd_adm_connect),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_F_REQUIRED)
)
GENL_op(
DRBD_ADM_CHG_NET_OPTS, 29,
GENL_doit(drbd_adm_net_opts),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_F_REQUIRED)
)
GENL_op(DRBD_ADM_DISCONNECT, 11, GENL_doit(drbd_adm_disconnect),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_ATTACH, 12,
GENL_doit(drbd_adm_attach),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_F_REQUIRED)
)
GENL_op(DRBD_ADM_CHG_DISK_OPTS, 28,
GENL_doit(drbd_adm_disk_opts),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_DISK_OPTS, DRBD_F_REQUIRED)
)
GENL_op(
DRBD_ADM_RESIZE, 13,
GENL_doit(drbd_adm_resize),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_RESIZE_PARMS, DRBD_GENLA_F_MANDATORY)
)
GENL_op(
DRBD_ADM_PRIMARY, 14,
GENL_doit(drbd_adm_set_role),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_SET_ROLE_PARMS, DRBD_F_REQUIRED)
)
GENL_op(
DRBD_ADM_SECONDARY, 15,
GENL_doit(drbd_adm_set_role),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_SET_ROLE_PARMS, DRBD_F_REQUIRED)
)
GENL_op(
DRBD_ADM_NEW_C_UUID, 16,
GENL_doit(drbd_adm_new_c_uuid),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_NEW_C_UUID_PARMS, DRBD_GENLA_F_MANDATORY)
)
GENL_op(
DRBD_ADM_START_OV, 17,
GENL_doit(drbd_adm_start_ov),
GENL_tla_expected(DRBD_NLA_START_OV_PARMS, DRBD_GENLA_F_MANDATORY)
)
GENL_op(DRBD_ADM_DETACH, 18, GENL_doit(drbd_adm_detach),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_DETACH_PARMS, DRBD_GENLA_F_MANDATORY))
GENL_op(DRBD_ADM_INVALIDATE, 19, GENL_doit(drbd_adm_invalidate),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_INVAL_PEER, 20, GENL_doit(drbd_adm_invalidate_peer),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_PAUSE_SYNC, 21, GENL_doit(drbd_adm_pause_sync),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_RESUME_SYNC, 22, GENL_doit(drbd_adm_resume_sync),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_SUSPEND_IO, 23, GENL_doit(drbd_adm_suspend_io),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_RESUME_IO, 24, GENL_doit(drbd_adm_resume_io),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_OUTDATE, 25, GENL_doit(drbd_adm_outdate),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_GET_TIMEOUT_TYPE, 26, GENL_doit(drbd_adm_get_timeout_type),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))

View File

@ -0,0 +1,55 @@
#ifndef DRBD_GENL_STRUCT_H
#define DRBD_GENL_STRUCT_H
/**
* struct drbd_genlmsghdr - DRBD specific header used in NETLINK_GENERIC requests
* @minor:
* For admin requests (user -> kernel): which minor device to operate on.
* For (unicast) replies or informational (broadcast) messages
* (kernel -> user): which minor device the information is about.
* If we do not operate on minors, but on connections or resources,
* the minor value shall be (~0), and the attribute DRBD_NLA_CFG_CONTEXT
* is used instead.
* @flags: possible operation modifiers (relevant only for user->kernel):
* DRBD_GENL_F_SET_DEFAULTS
* @volume:
* When creating a new minor (adding it to a resource), the resource needs
* to know which volume number within the resource this is supposed to be.
* The volume number corresponds to the same volume number on the remote side,
* whereas the minor number on the remote side may be different
* (union with flags).
* NOTE(review): the struct below declares no member named "volume"; the
* anonymous union holds only @flags and @ret_code. Confirm whether this
* entry is stale or the member is missing.
* @ret_code: kernel->userland unicast cfg reply return code (union with flags);
*
* The header is two 32 bit words: @minor, followed by a word that is read
* either as @flags or as @ret_code.
*/
struct drbd_genlmsghdr {
__u32 minor; /* minor device number, or ~0 when a cfg context is used */
union {
__u32 flags; /* user -> kernel: operation modifiers */
__s32 ret_code; /* kernel -> user: reply return code */
};
};
/* To be used in drbd_genlmsghdr.flags */
enum {
DRBD_GENL_F_SET_DEFAULTS = 1, /* the only flag value defined here */
};
/*
* Reason codes for state info broadcasts (going by the enum name).
* All values are assigned explicitly and start at 1, so 0 is not a
* valid reason.
* NOTE(review): presumably these numbers are seen by userspace and must
* therefore stay stable -- confirm before renumbering.
*/
enum drbd_state_info_bcast_reason {
SIB_GET_STATUS_REPLY = 1,
SIB_STATE_CHANGE = 2,
SIB_HELPER_PRE = 3,
SIB_HELPER_POST = 4,
SIB_SYNC_PROGRESS = 5,
};
/* hack around predefined gcc/cpp "linux=1",
* we cannot possibly include <1/drbd_genl.h> */
#undef linux
#include <linux/drbd.h>
#define GENL_MAGIC_VERSION API_VERSION
#define GENL_MAGIC_FAMILY drbd
#define GENL_MAGIC_FAMILY_HDRSZ sizeof(struct drbd_genlmsghdr)
#define GENL_MAGIC_INCLUDE_FILE <linux/drbd_genl.h>
#include <linux/genl_magic_struct.h>
#endif

View File

@ -16,29 +16,37 @@
#define DEBUG_RANGE_CHECK 0
#define DRBD_MINOR_COUNT_MIN 1
#define DRBD_MINOR_COUNT_MAX 256
#define DRBD_MINOR_COUNT_MAX 255
#define DRBD_MINOR_COUNT_DEF 32
#define DRBD_MINOR_COUNT_SCALE '1'
#define DRBD_VOLUME_MAX 65535
#define DRBD_DIALOG_REFRESH_MIN 0
#define DRBD_DIALOG_REFRESH_MAX 600
#define DRBD_DIALOG_REFRESH_SCALE '1'
/* valid port number */
#define DRBD_PORT_MIN 1
#define DRBD_PORT_MAX 0xffff
#define DRBD_PORT_SCALE '1'
/* startup { */
/* if you want more than 3.4 days, disable */
#define DRBD_WFC_TIMEOUT_MIN 0
#define DRBD_WFC_TIMEOUT_MAX 300000
#define DRBD_WFC_TIMEOUT_DEF 0
#define DRBD_WFC_TIMEOUT_SCALE '1'
#define DRBD_DEGR_WFC_TIMEOUT_MIN 0
#define DRBD_DEGR_WFC_TIMEOUT_MAX 300000
#define DRBD_DEGR_WFC_TIMEOUT_DEF 0
#define DRBD_DEGR_WFC_TIMEOUT_SCALE '1'
#define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0
#define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000
#define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0
#define DRBD_OUTDATED_WFC_TIMEOUT_SCALE '1'
/* }*/
/* net { */
@ -47,75 +55,91 @@
#define DRBD_TIMEOUT_MIN 1
#define DRBD_TIMEOUT_MAX 600
#define DRBD_TIMEOUT_DEF 60 /* 6 seconds */
#define DRBD_TIMEOUT_SCALE '1'
/* If backing disk takes longer than disk_timeout, mark the disk as failed */
#define DRBD_DISK_TIMEOUT_MIN 0 /* 0 = disabled */
#define DRBD_DISK_TIMEOUT_MAX 6000 /* 10 Minutes */
#define DRBD_DISK_TIMEOUT_DEF 0 /* disabled */
#define DRBD_DISK_TIMEOUT_SCALE '1'
/* active connection retries when C_WF_CONNECTION */
#define DRBD_CONNECT_INT_MIN 1
#define DRBD_CONNECT_INT_MAX 120
#define DRBD_CONNECT_INT_DEF 10 /* seconds */
#define DRBD_CONNECT_INT_SCALE '1'
/* keep-alive probes when idle */
#define DRBD_PING_INT_MIN 1
#define DRBD_PING_INT_MAX 120
#define DRBD_PING_INT_DEF 10
#define DRBD_PING_INT_SCALE '1'
/* timeout for the ping packets.*/
#define DRBD_PING_TIMEO_MIN 1
#define DRBD_PING_TIMEO_MAX 300
#define DRBD_PING_TIMEO_DEF 5
#define DRBD_PING_TIMEO_SCALE '1'
/* max number of write requests between write barriers */
#define DRBD_MAX_EPOCH_SIZE_MIN 1
#define DRBD_MAX_EPOCH_SIZE_MAX 20000
#define DRBD_MAX_EPOCH_SIZE_DEF 2048
#define DRBD_MAX_EPOCH_SIZE_SCALE '1'
/* I don't think that a tcp send buffer of more than 10M is useful */
#define DRBD_SNDBUF_SIZE_MIN 0
#define DRBD_SNDBUF_SIZE_MAX (10<<20)
#define DRBD_SNDBUF_SIZE_DEF 0
#define DRBD_SNDBUF_SIZE_SCALE '1'
#define DRBD_RCVBUF_SIZE_MIN 0
#define DRBD_RCVBUF_SIZE_MAX (10<<20)
#define DRBD_RCVBUF_SIZE_DEF 0
#define DRBD_RCVBUF_SIZE_SCALE '1'
/* @4k PageSize -> 128kB - 512MB */
#define DRBD_MAX_BUFFERS_MIN 32
#define DRBD_MAX_BUFFERS_MAX 131072
#define DRBD_MAX_BUFFERS_DEF 2048
#define DRBD_MAX_BUFFERS_SCALE '1'
/* @4k PageSize -> 4kB - 512MB */
#define DRBD_UNPLUG_WATERMARK_MIN 1
#define DRBD_UNPLUG_WATERMARK_MAX 131072
#define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16)
#define DRBD_UNPLUG_WATERMARK_SCALE '1'
/* 0 is disabled.
* 200 should be more than enough even for very short timeouts */
#define DRBD_KO_COUNT_MIN 0
#define DRBD_KO_COUNT_MAX 200
#define DRBD_KO_COUNT_DEF 0
#define DRBD_KO_COUNT_DEF 7
#define DRBD_KO_COUNT_SCALE '1'
/* } */
/* syncer { */
/* FIXME allow rate to be zero? */
#define DRBD_RATE_MIN 1
#define DRBD_RESYNC_RATE_MIN 1
/* channel bonding 10 GbE, or other hardware */
#define DRBD_RATE_MAX (4 << 20)
#define DRBD_RATE_DEF 250 /* kb/second */
#define DRBD_RESYNC_RATE_MAX (4 << 20)
#define DRBD_RESYNC_RATE_DEF 250
#define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */
/* less than 7 would hit performance unnecessarily.
* 3833 is the largest prime that still does fit
* into 64 sectors of activity log */
* 919 slots context information per transaction,
* 32k activity log, 4k transaction size,
* one transaction in flight:
* 919 * 7 = 6433 */
#define DRBD_AL_EXTENTS_MIN 7
#define DRBD_AL_EXTENTS_MAX 3833
#define DRBD_AL_EXTENTS_DEF 127
#define DRBD_AL_EXTENTS_MAX 6433
#define DRBD_AL_EXTENTS_DEF 1237
#define DRBD_AL_EXTENTS_SCALE '1'
#define DRBD_AFTER_MIN -1
#define DRBD_AFTER_MAX 255
#define DRBD_AFTER_DEF -1
#define DRBD_MINOR_NUMBER_MIN -1
#define DRBD_MINOR_NUMBER_MAX ((1 << 20) - 1)
#define DRBD_MINOR_NUMBER_DEF -1
#define DRBD_MINOR_NUMBER_SCALE '1'
/* } */
@ -124,11 +148,12 @@
* the upper limit with 64bit kernel, enough ram and flexible meta data
* is 1 PiB, currently. */
/* DRBD_MAX_SECTORS */
#define DRBD_DISK_SIZE_SECT_MIN 0
#define DRBD_DISK_SIZE_SECT_MAX (1 * (2LLU << 40))
#define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... */
#define DRBD_DISK_SIZE_MIN 0
#define DRBD_DISK_SIZE_MAX (1 * (2LLU << 40))
#define DRBD_DISK_SIZE_DEF 0 /* = disabled = no user size... */
#define DRBD_DISK_SIZE_SCALE 's' /* sectors */
#define DRBD_ON_IO_ERROR_DEF EP_PASS_ON
#define DRBD_ON_IO_ERROR_DEF EP_DETACH
#define DRBD_FENCING_DEF FP_DONT_CARE
#define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT
#define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT
@ -136,38 +161,59 @@
#define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT
#define DRBD_ON_NO_DATA_DEF OND_IO_ERROR
#define DRBD_ON_CONGESTION_DEF OC_BLOCK
#define DRBD_READ_BALANCING_DEF RB_PREFER_LOCAL
#define DRBD_MAX_BIO_BVECS_MIN 0
#define DRBD_MAX_BIO_BVECS_MAX 128
#define DRBD_MAX_BIO_BVECS_DEF 0
#define DRBD_MAX_BIO_BVECS_SCALE '1'
#define DRBD_C_PLAN_AHEAD_MIN 0
#define DRBD_C_PLAN_AHEAD_MAX 300
#define DRBD_C_PLAN_AHEAD_DEF 0 /* RS rate controller disabled by default */
#define DRBD_C_PLAN_AHEAD_DEF 20
#define DRBD_C_PLAN_AHEAD_SCALE '1'
#define DRBD_C_DELAY_TARGET_MIN 1
#define DRBD_C_DELAY_TARGET_MAX 100
#define DRBD_C_DELAY_TARGET_DEF 10
#define DRBD_C_DELAY_TARGET_SCALE '1'
#define DRBD_C_FILL_TARGET_MIN 0
#define DRBD_C_FILL_TARGET_MAX (1<<20) /* 500MByte in sec */
#define DRBD_C_FILL_TARGET_DEF 0 /* By default disabled -> controlled by delay_target */
#define DRBD_C_FILL_TARGET_DEF 100 /* Try to place 50KiB in socket send buffer during resync */
#define DRBD_C_FILL_TARGET_SCALE 's' /* sectors */
#define DRBD_C_MAX_RATE_MIN 250 /* kByte/sec */
#define DRBD_C_MAX_RATE_MIN 250
#define DRBD_C_MAX_RATE_MAX (4 << 20)
#define DRBD_C_MAX_RATE_DEF 102400
#define DRBD_C_MAX_RATE_SCALE 'k' /* kilobytes */
#define DRBD_C_MIN_RATE_MIN 0 /* kByte/sec */
#define DRBD_C_MIN_RATE_MIN 0
#define DRBD_C_MIN_RATE_MAX (4 << 20)
#define DRBD_C_MIN_RATE_DEF 4096
#define DRBD_C_MIN_RATE_DEF 250
#define DRBD_C_MIN_RATE_SCALE 'k' /* kilobytes */
#define DRBD_CONG_FILL_MIN 0
#define DRBD_CONG_FILL_MAX (10<<21) /* 10GByte in sectors */
#define DRBD_CONG_FILL_DEF 0
#define DRBD_CONG_FILL_SCALE 's' /* sectors */
#define DRBD_CONG_EXTENTS_MIN DRBD_AL_EXTENTS_MIN
#define DRBD_CONG_EXTENTS_MAX DRBD_AL_EXTENTS_MAX
#define DRBD_CONG_EXTENTS_DEF DRBD_AL_EXTENTS_DEF
#define DRBD_CONG_EXTENTS_SCALE DRBD_AL_EXTENTS_SCALE
#define DRBD_PROTOCOL_DEF DRBD_PROT_C
#define DRBD_DISK_BARRIER_DEF 0
#define DRBD_DISK_FLUSHES_DEF 1
#define DRBD_DISK_DRAIN_DEF 1
#define DRBD_MD_FLUSHES_DEF 1
#define DRBD_TCP_CORK_DEF 1
#define DRBD_AL_UPDATES_DEF 1
#define DRBD_ALLOW_TWO_PRIMARIES_DEF 0
#define DRBD_ALWAYS_ASBP_DEF 0
#define DRBD_USE_RLE_DEF 1
#undef RANGE
#endif

View File

@ -1,163 +0,0 @@
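/*
NL_PACKET( name, number,
NL_<TYPE>( pn, pr, member )
...
)
where NL_<TYPE> is one of NL_INTEGER, NL_INT64, NL_BIT or NL_STRING
(NL_STRING takes the maximum length as an additional last argument).
You may never reissue one of the pn arguments
*/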
#if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64)
#error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined"
#endif
NL_PACKET(primary, 1,
NL_BIT( 1, T_MAY_IGNORE, primary_force)
)
NL_PACKET(secondary, 2, )
NL_PACKET(disk_conf, 3,
NL_INT64( 2, T_MAY_IGNORE, disk_size)
NL_STRING( 3, T_MANDATORY, backing_dev, 128)
NL_STRING( 4, T_MANDATORY, meta_dev, 128)
NL_INTEGER( 5, T_MANDATORY, meta_dev_idx)
NL_INTEGER( 6, T_MAY_IGNORE, on_io_error)
NL_INTEGER( 7, T_MAY_IGNORE, fencing)
NL_BIT( 37, T_MAY_IGNORE, use_bmbv)
NL_BIT( 53, T_MAY_IGNORE, no_disk_flush)
NL_BIT( 54, T_MAY_IGNORE, no_md_flush)
/* 55 max_bio_size was available in 8.2.6rc2 */
NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs)
NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier)
NL_BIT( 58, T_MAY_IGNORE, no_disk_drain)
NL_INTEGER( 89, T_MAY_IGNORE, disk_timeout)
)
NL_PACKET(detach, 4,
NL_BIT( 88, T_MANDATORY, detach_force)
)
NL_PACKET(net_conf, 5,
NL_STRING( 8, T_MANDATORY, my_addr, 128)
NL_STRING( 9, T_MANDATORY, peer_addr, 128)
NL_STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX)
NL_STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX)
NL_STRING( 44, T_MAY_IGNORE, integrity_alg, SHARED_SECRET_MAX)
NL_INTEGER( 14, T_MAY_IGNORE, timeout)
NL_INTEGER( 15, T_MANDATORY, wire_protocol)
NL_INTEGER( 16, T_MAY_IGNORE, try_connect_int)
NL_INTEGER( 17, T_MAY_IGNORE, ping_int)
NL_INTEGER( 18, T_MAY_IGNORE, max_epoch_size)
NL_INTEGER( 19, T_MAY_IGNORE, max_buffers)
NL_INTEGER( 20, T_MAY_IGNORE, unplug_watermark)
NL_INTEGER( 21, T_MAY_IGNORE, sndbuf_size)
NL_INTEGER( 22, T_MAY_IGNORE, ko_count)
NL_INTEGER( 24, T_MAY_IGNORE, after_sb_0p)
NL_INTEGER( 25, T_MAY_IGNORE, after_sb_1p)
NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p)
NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict)
NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo)
NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size)
NL_INTEGER( 81, T_MAY_IGNORE, on_congestion)
NL_INTEGER( 82, T_MAY_IGNORE, cong_fill)
NL_INTEGER( 83, T_MAY_IGNORE, cong_extents)
/* 59 addr_family was available in GIT, never released */
NL_BIT( 60, T_MANDATORY, mind_af)
NL_BIT( 27, T_MAY_IGNORE, want_lose)
NL_BIT( 28, T_MAY_IGNORE, two_primaries)
NL_BIT( 41, T_MAY_IGNORE, always_asbp)
NL_BIT( 61, T_MAY_IGNORE, no_cork)
NL_BIT( 62, T_MANDATORY, auto_sndbuf_size)
NL_BIT( 70, T_MANDATORY, dry_run)
)
NL_PACKET(disconnect, 6,
NL_BIT( 84, T_MAY_IGNORE, force)
)
NL_PACKET(resize, 7,
NL_INT64( 29, T_MAY_IGNORE, resize_size)
NL_BIT( 68, T_MAY_IGNORE, resize_force)
NL_BIT( 69, T_MANDATORY, no_resync)
)
NL_PACKET(syncer_conf, 8,
NL_INTEGER( 30, T_MAY_IGNORE, rate)
NL_INTEGER( 31, T_MAY_IGNORE, after)
NL_INTEGER( 32, T_MAY_IGNORE, al_extents)
/* NL_INTEGER( 71, T_MAY_IGNORE, dp_volume)
* NL_INTEGER( 72, T_MAY_IGNORE, dp_interval)
* NL_INTEGER( 73, T_MAY_IGNORE, throttle_th)
* NL_INTEGER( 74, T_MAY_IGNORE, hold_off_th)
* feature will be reimplemented differently with 8.3.9 */
NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX)
NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32)
NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX)
NL_BIT( 65, T_MAY_IGNORE, use_rle)
NL_INTEGER( 75, T_MAY_IGNORE, on_no_data)
NL_INTEGER( 76, T_MAY_IGNORE, c_plan_ahead)
NL_INTEGER( 77, T_MAY_IGNORE, c_delay_target)
NL_INTEGER( 78, T_MAY_IGNORE, c_fill_target)
NL_INTEGER( 79, T_MAY_IGNORE, c_max_rate)
NL_INTEGER( 80, T_MAY_IGNORE, c_min_rate)
)
NL_PACKET(invalidate, 9, )
NL_PACKET(invalidate_peer, 10, )
NL_PACKET(pause_sync, 11, )
NL_PACKET(resume_sync, 12, )
NL_PACKET(suspend_io, 13, )
NL_PACKET(resume_io, 14, )
NL_PACKET(outdate, 15, )
NL_PACKET(get_config, 16, )
NL_PACKET(get_state, 17,
NL_INTEGER( 33, T_MAY_IGNORE, state_i)
)
NL_PACKET(get_uuids, 18,
NL_STRING( 34, T_MAY_IGNORE, uuids, (UI_SIZE*sizeof(__u64)))
NL_INTEGER( 35, T_MAY_IGNORE, uuids_flags)
)
NL_PACKET(get_timeout_flag, 19,
NL_BIT( 36, T_MAY_IGNORE, use_degraded)
)
NL_PACKET(call_helper, 20,
NL_STRING( 38, T_MAY_IGNORE, helper, 32)
)
/* Tag nr 42 already allocated in drbd-8.1 development. */
NL_PACKET(sync_progress, 23,
NL_INTEGER( 43, T_MAY_IGNORE, sync_progress)
)
NL_PACKET(dump_ee, 24,
NL_STRING( 45, T_MAY_IGNORE, dump_ee_reason, 32)
NL_STRING( 46, T_MAY_IGNORE, seen_digest, SHARED_SECRET_MAX)
NL_STRING( 47, T_MAY_IGNORE, calc_digest, SHARED_SECRET_MAX)
NL_INT64( 48, T_MAY_IGNORE, ee_sector)
NL_INT64( 49, T_MAY_IGNORE, ee_block_id)
NL_STRING( 50, T_MAY_IGNORE, ee_data, 32 << 10)
)
NL_PACKET(start_ov, 25,
NL_INT64( 66, T_MAY_IGNORE, start_sector)
)
NL_PACKET(new_c_uuid, 26,
NL_BIT( 63, T_MANDATORY, clear_bm)
)
#ifdef NL_RESPONSE
NL_RESPONSE(return_code_only, 27)
#endif
#undef NL_PACKET
#undef NL_INTEGER
#undef NL_INT64
#undef NL_BIT
#undef NL_STRING
#undef NL_RESPONSE

View File

@ -1,84 +0,0 @@
#ifndef DRBD_TAG_MAGIC_H
#define DRBD_TAG_MAGIC_H
#define TT_END 0
#define TT_REMOVED 0xE000
/* declare packet_type enums */
enum packet_types {
#define NL_PACKET(name, number, fields) P_ ## name = number,
#define NL_RESPONSE(name, number) P_ ## name = number,
#define NL_INTEGER(pn, pr, member)
#define NL_INT64(pn, pr, member)
#define NL_BIT(pn, pr, member)
#define NL_STRING(pn, pr, member, len)
#include <linux/drbd_nl.h>
P_nl_after_last_packet,
};
/* These struct are used to deduce the size of the tag lists: */
#define NL_PACKET(name, number, fields) \
struct name ## _tag_len_struct { fields };
#define NL_INTEGER(pn, pr, member) \
int member; int tag_and_len ## member;
#define NL_INT64(pn, pr, member) \
__u64 member; int tag_and_len ## member;
#define NL_BIT(pn, pr, member) \
unsigned char member:1; int tag_and_len ## member;
#define NL_STRING(pn, pr, member, len) \
unsigned char member[len]; int member ## _len; \
int tag_and_len ## member;
#include <linux/drbd_nl.h>
/* declare tag-list-sizes */
static const int tag_list_sizes[] = {
#define NL_PACKET(name, number, fields) 2 fields ,
#define NL_INTEGER(pn, pr, member) + 4 + 4
#define NL_INT64(pn, pr, member) + 4 + 8
#define NL_BIT(pn, pr, member) + 4 + 1
#define NL_STRING(pn, pr, member, len) + 4 + (len)
#include <linux/drbd_nl.h>
};
/* The two highest bits are used for the tag type */
#define TT_MASK 0xC000
#define TT_INTEGER 0x0000
#define TT_INT64 0x4000
#define TT_BIT 0x8000
#define TT_STRING 0xC000
/* The next bit indicates if processing of the tag is mandatory */
#define T_MANDATORY 0x2000
#define T_MAY_IGNORE 0x0000
#define TN_MASK 0x1fff
/* The remaining 13 bits are used to enumerate the tags */
#define tag_type(T) ((T) & TT_MASK)
#define tag_number(T) ((T) & TN_MASK)
/* declare tag enums */
#define NL_PACKET(name, number, fields) fields
enum drbd_tags {
#define NL_INTEGER(pn, pr, member) T_ ## member = pn | TT_INTEGER | pr ,
#define NL_INT64(pn, pr, member) T_ ## member = pn | TT_INT64 | pr ,
#define NL_BIT(pn, pr, member) T_ ## member = pn | TT_BIT | pr ,
#define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING | pr ,
#include <linux/drbd_nl.h>
};
/*
* Description of a single tag. The tag_descriptions[] table in this header
* holds one such entry per tag number (pn).
*/
struct tag {
const char *name; /* member name as a string (#member) */
int type_n_flags; /* TT_* type bits OR-ed with the pr flag (T_MANDATORY / T_MAY_IGNORE) */
int max_len; /* size in bytes: sizeof() of the scalar, or (len) for strings */
};
/* declare tag names */
#define NL_PACKET(name, number, fields) fields
static const struct tag tag_descriptions[] = {
#define NL_INTEGER(pn, pr, member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) },
#define NL_INT64(pn, pr, member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) },
#define NL_BIT(pn, pr, member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) },
#define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING | pr, (len) },
#include <linux/drbd_nl.h>
};
#endif

View File

@ -88,10 +88,14 @@ struct disk_stats {
};
#define PARTITION_META_INFO_VOLNAMELTH 64
#define PARTITION_META_INFO_UUIDLTH 16
/*
* Enough for the string representation of any kind of UUID plus the
* terminating NUL byte.
* EFI UUID is 36 characters. MSDOS UUID is 11 characters.
*/
#define PARTITION_META_INFO_UUIDLTH 37
struct partition_meta_info {
u8 uuid[PARTITION_META_INFO_UUIDLTH]; /* always big endian */
char uuid[PARTITION_META_INFO_UUIDLTH];
u8 volname[PARTITION_META_INFO_VOLNAMELTH];
};

View File

@ -0,0 +1,422 @@
#ifndef GENL_MAGIC_FUNC_H
#define GENL_MAGIC_FUNC_H
#include <linux/genl_magic_struct.h>
/*
* Magic: declare tla policy {{{1
* Magic: declare nested policies
* {{{2
*/
#undef GENL_mc_group
#define GENL_mc_group(group)
#undef GENL_notification
#define GENL_notification(op_name, op_num, mcast_group, tla_list)
#undef GENL_op
#define GENL_op(op_name, op_num, handler, tla_list)
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
[tag_name] = { .type = NLA_NESTED },
static struct nla_policy CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy)[] = {
#include GENL_MAGIC_INCLUDE_FILE
};
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
static struct nla_policy s_name ## _nl_policy[] __read_mostly = \
{ s_fields };
#undef __field
#define __field(attr_nr, attr_flag, name, nla_type, _type, __get, \
__put, __is_signed) \
[attr_nr] = { .type = nla_type },
#undef __array
#define __array(attr_nr, attr_flag, name, nla_type, _type, maxlen, \
__get, __put, __is_signed) \
[attr_nr] = { .type = nla_type, \
.len = maxlen - (nla_type == NLA_NUL_STRING) },
#include GENL_MAGIC_INCLUDE_FILE
#ifndef __KERNEL__
#ifndef pr_info
/*
 * Userspace fallback for the kernel's pr_info(): print to stderr.
 *
 * The expansion deliberately carries no trailing semicolon; the caller
 * supplies it. With a semicolon baked into the macro,
 *	if (cond) pr_info(...); else ...
 * would expand to two statements and fail to compile.
 */
#define pr_info(args...) fprintf(stderr, args)
#endif
#endif
#ifdef GENL_MAGIC_DEBUG
/*
 * Debug helper: log one scalar attribute with pr_info().
 *
 * @dir:      direction marker that is printed first
 * @nla_type: NLA_U8, NLA_U16, NLA_U32, NLA_U64 or NLA_FLAG; any other
 *            type is ignored silently
 * @name:     attribute name to print
 * @valp:     pointer to the value, or NULL, in which case the value 1
 *            is reported
 *
 * The value is loaded with the width that belongs to @nla_type. Loading
 * 32 bits unconditionally and truncating afterwards reads past the end of
 * 8 and 16 bit objects and yields the wrong bytes on big-endian machines.
 */
static void dprint_field(const char *dir, int nla_type,
		const char *name, void *valp)
{
	__u64 val = 1;	/* reported when no value pointer is given */

	switch (nla_type) {
	case NLA_U8:
	case NLA_U16:
	case NLA_U32:
		if (valp) {
			if (nla_type == NLA_U8)
				val = *(__u8 *)valp;
			else if (nla_type == NLA_U16)
				val = *(__u16 *)valp;
			else
				val = *(__u32 *)valp;
		}
		pr_info("%s attr %s: %d 0x%08x\n", dir,
			name, (int)val, (unsigned)val);
		break;
	case NLA_U64:
		/* Same NULL handling as the narrower types. */
		if (valp)
			val = *(__u64 *)valp;
		pr_info("%s attr %s: %lld 0x%08llx\n", dir,
			name, (long long)val, (unsigned long long)val);
		break;
	case NLA_FLAG:
		/*
		 * NOTE(review): the storage width of a flag value is not
		 * visible from here; the 32 bit load is kept unchanged.
		 */
		if (valp)
			val = *(__u32 *)valp;
		if (val)
			pr_info("%s attr %s: set\n", dir, name);
		break;
	default:
		break;
	}
}
/*
 * Debug helper: log one string or binary attribute with pr_info().
 *
 * @dir:      direction marker that is printed first
 * @nla_type: NLA_NUL_STRING prints the text; every other type prints the
 *            first four bytes as hex
 * @name:     attribute name to print
 * @val:      start of the data
 * @len:      length of the data in bytes; for strings, one trailing NUL
 *            byte is not counted in the printed length
 */
static void dprint_array(const char *dir, int nla_type,
		const char *name, const char *val, unsigned len)
{
	/*
	 * Plain char may be signed; passing such a byte to %02x would
	 * sign-extend it and print "ffffff80" instead of "80".
	 */
	const unsigned char *bytes = (const unsigned char *)val;

	switch (nla_type) {
	case NLA_NUL_STRING:
		if (len && val[len-1] == '\0')
			len--;
		pr_info("%s attr %s: [len:%u] '%s'\n", dir, name, len, val);
		break;
	default:
		/* We can always show 4 bytes,
		 * that is what nlattrs are aligned to. */
		pr_info("%s attr %s: [len:%u] %02x%02x%02x%02x ...\n",
			dir, name, len,
			bytes[0], bytes[1], bytes[2], bytes[3]);
		break;
	}
}
#define DPRINT_TLA(a, op, b) pr_info("%s %s %s\n", a, op, b);
/* Name is a member field name of the struct s.
* If s is NULL (only parsing, no copy requested in *_from_attrs()),
* nla is supposed to point to the attribute containing the information
* corresponding to that struct member. */
#define DPRINT_FIELD(dir, nla_type, name, s, nla) \
do { \
if (s) \
dprint_field(dir, nla_type, #name, &s->name); \
else if (nla) \
dprint_field(dir, nla_type, #name, \
(nla_type == NLA_FLAG) ? NULL \
: nla_data(nla)); \
} while (0)
#define DPRINT_ARRAY(dir, nla_type, name, s, nla) \
do { \
if (s) \
dprint_array(dir, nla_type, #name, \
s->name, s->name ## _len); \
else if (nla) \
dprint_array(dir, nla_type, #name, \
nla_data(nla), nla_len(nla)); \
} while (0)
#else
#define DPRINT_TLA(a, op, b) do {} while (0)
#define DPRINT_FIELD(dir, nla_type, name, s, nla) do {} while (0)
#define DPRINT_ARRAY(dir, nla_type, name, s, nla) do {} while (0)
#endif
/*
* Magic: provide conversion functions {{{1
* populate struct from attribute table:
* {{{2
*/
/* processing of generic netlink messages is serialized.
* use one static buffer for parsing of nested attributes */
static struct nlattr *nested_attr_tb[128];
#ifndef BUILD_BUG_ON
/* Force a compilation error if condition is true */
#define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition))
/* Force a compilation error if condition is true, but also produce a
result (of value 0 and type size_t), so the expression can be used
e.g. in a structure initializer (or where-ever else comma expressions
aren't permitted). */
#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); }))
#endif
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
/* *_from_attrs functions are static, but potentially unused */ \
static int __ ## s_name ## _from_attrs(struct s_name *s, \
struct genl_info *info, bool exclude_invariants) \
{ \
const int maxtype = ARRAY_SIZE(s_name ## _nl_policy)-1; \
struct nlattr *tla = info->attrs[tag_number]; \
struct nlattr **ntb = nested_attr_tb; \
struct nlattr *nla; \
int err; \
BUILD_BUG_ON(ARRAY_SIZE(s_name ## _nl_policy) > ARRAY_SIZE(nested_attr_tb)); \
if (!tla) \
return -ENOMSG; \
DPRINT_TLA(#s_name, "<=-", #tag_name); \
err = drbd_nla_parse_nested(ntb, maxtype, tla, s_name ## _nl_policy); \
if (err) \
return err; \
\
s_fields \
return 0; \
} __attribute__((unused)) \
static int s_name ## _from_attrs(struct s_name *s, \
struct genl_info *info) \
{ \
return __ ## s_name ## _from_attrs(s, info, false); \
} __attribute__((unused)) \
static int s_name ## _from_attrs_for_change(struct s_name *s, \
struct genl_info *info) \
{ \
return __ ## s_name ## _from_attrs(s, info, true); \
} __attribute__((unused)) \
/*
 * Per-attribute step used inside the generated __<struct>_from_attrs()
 * functions. It relies on the locals "nla", "ntb" and "exclude_invariants"
 * of the enclosing function and may return from it.
 *
 *  - attribute present, invariant, and exclude_invariants is set:
 *    log and return -EEXIST
 *  - attribute present otherwise: run "assignment"
 *  - attribute absent, invariant, and exclude_invariants is set:
 *    nothing to do, its absence is expected
 *  - attribute absent and flagged DRBD_F_REQUIRED: log and return -ENOMSG
 *  - attribute absent otherwise: silently skipped
 */
#define __assign(attr_nr, attr_flag, name, nla_type, type, assignment...) \
nla = ntb[attr_nr]; \
if (nla) { \
if (exclude_invariants && ((attr_flag) & DRBD_F_INVARIANT)) { \
pr_info("<< must not change invariant attr: %s\n", #name); \
return -EEXIST; \
} \
assignment; \
} else if (exclude_invariants && ((attr_flag) & DRBD_F_INVARIANT)) { \
/* attribute missing from payload, */ \
/* which was expected */ \
} else if ((attr_flag) & DRBD_F_REQUIRED) { \
pr_info("<< missing attr: %s\n", #name); \
return -ENOMSG; \
}
#undef __field
#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \
__is_signed) \
__assign(attr_nr, attr_flag, name, nla_type, type, \
if (s) \
s->name = __get(nla); \
DPRINT_FIELD("<<", nla_type, name, s, nla))
/* validate_nla() already checked nla_len <= maxlen appropriately. */
#undef __array
#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \
__get, __put, __is_signed) \
__assign(attr_nr, attr_flag, name, nla_type, type, \
if (s) \
s->name ## _len = \
__get(s->name, nla, maxlen); \
DPRINT_ARRAY("<<", nla_type, name, s, nla))
#include GENL_MAGIC_INCLUDE_FILE
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields)
/*
* Magic: define op number to op name mapping {{{1
* {{{2
*/
/*
 * <family>_genl_cmd_to_str() - map an operation number to its name.
 * @cmd: operation number
 *
 * The switch body is generated: each GENL_op() entry of the include file
 * becomes "case op_num: return "op_name";". Entries declared with
 * GENL_notification() expand to nothing at this point, so their numbers
 * fall through to the default.
 *
 * Returns the operation name as a string literal, or "unknown".
 */
const char *CONCAT_(GENL_MAGIC_FAMILY, _genl_cmd_to_str)(__u8 cmd)
{
switch (cmd) {
#undef GENL_op
#define GENL_op(op_name, op_num, handler, tla_list) \
case op_num: return #op_name;
#include GENL_MAGIC_INCLUDE_FILE
default:
return "unknown";
}
}
#ifdef __KERNEL__
#include <linux/stringify.h>
/*
* Magic: define genl_ops {{{1
* {{{2
*/
#undef GENL_op
#define GENL_op(op_name, op_num, handler, tla_list) \
{ \
handler \
.cmd = op_name, \
.policy = CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy), \
},
#define ZZZ_genl_ops CONCAT_(GENL_MAGIC_FAMILY, _genl_ops)
static struct genl_ops ZZZ_genl_ops[] __read_mostly = {
#include GENL_MAGIC_INCLUDE_FILE
};
#undef GENL_op
#define GENL_op(op_name, op_num, handler, tla_list)
/*
* Define the genl_family, multicast groups, {{{1
* and provide register/unregister functions.
* {{{2
*/
#define ZZZ_genl_family CONCAT_(GENL_MAGIC_FAMILY, _genl_family)
/*
 * The genl_family object for GENL_MAGIC_FAMILY.
 *
 * .name is the stringified family name and .version is GENL_MAGIC_VERSION.
 * .hdrsize is only set when the user of this header defined
 * GENL_MAGIC_FAMILY_HDRSZ.
 * .maxattr is the highest index of the top level attribute policy table.
 * The table name is derived from GENL_MAGIC_FAMILY, the same way the table
 * itself is declared, so that this header works for any family name and
 * not only for "drbd".
 */
static struct genl_family ZZZ_genl_family __read_mostly = {
	.id = GENL_ID_GENERATE,
	.name = __stringify(GENL_MAGIC_FAMILY),
	.version = GENL_MAGIC_VERSION,
#ifdef GENL_MAGIC_FAMILY_HDRSZ
	.hdrsize = NLA_ALIGN(GENL_MAGIC_FAMILY_HDRSZ),
#endif
	.maxattr = ARRAY_SIZE(CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy))-1,
};
/*
* Magic: define multicast groups
* Magic: define multicast group registration helper
*/
#undef GENL_mc_group
#define GENL_mc_group(group) \
static struct genl_multicast_group \
CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group) __read_mostly = { \
.name = #group, \
}; \
static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)( \
struct sk_buff *skb, gfp_t flags) \
{ \
unsigned int group_id = \
CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id; \
if (!group_id) \
return -EINVAL; \
return genlmsg_multicast(skb, 0, group_id, flags); \
}
#include GENL_MAGIC_INCLUDE_FILE
/*
 * <family>_genl_register() - register the family, its ops and its
 * multicast groups.
 *
 * Returns 0 on success. If registering the family fails, that error is
 * returned directly. If registering any multicast group fails, the family
 * is unregistered again and the group's error is returned.
 */
int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void)
{
int err = genl_register_family_with_ops(&ZZZ_genl_family,
ZZZ_genl_ops, ARRAY_SIZE(ZZZ_genl_ops));
if (err)
return err;
/*
 * Generated code: one registration attempt per GENL_mc_group() entry of
 * the include file. The log line prints the group name first, then the
 * family name, then the id now stored in the group object.
 */
#undef GENL_mc_group
#define GENL_mc_group(group) \
err = genl_register_mc_group(&ZZZ_genl_family, \
&CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group)); \
if (err) \
goto fail; \
else \
pr_info("%s: mcg %s: %u\n", #group, \
__stringify(GENL_MAGIC_FAMILY), \
CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id);
#include GENL_MAGIC_INCLUDE_FILE
/* Make GENL_mc_group() expand to nothing for later inclusions. */
#undef GENL_mc_group
#define GENL_mc_group(group)
return 0;
fail:
/* Roll back the family registration done at the top. */
genl_unregister_family(&ZZZ_genl_family);
return err;
}
/*
 * <family>_genl_unregister() - counterpart of <family>_genl_register().
 * Unregisters the family object; the return value of
 * genl_unregister_family() is not checked.
 */
void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void)
{
genl_unregister_family(&ZZZ_genl_family);
}
/*
* Magic: provide conversion functions {{{1
* populate skb from struct.
* {{{2
*/
#undef GENL_op
#define GENL_op(op_name, op_num, handler, tla_list)
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
static int s_name ## _to_skb(struct sk_buff *skb, struct s_name *s, \
const bool exclude_sensitive) \
{ \
struct nlattr *tla = nla_nest_start(skb, tag_number); \
if (!tla) \
goto nla_put_failure; \
DPRINT_TLA(#s_name, "-=>", #tag_name); \
s_fields \
nla_nest_end(skb, tla); \
return 0; \
\
nla_put_failure: \
if (tla) \
nla_nest_cancel(skb, tla); \
return -EMSGSIZE; \
} \
static inline int s_name ## _to_priv_skb(struct sk_buff *skb, \
struct s_name *s) \
{ \
return s_name ## _to_skb(skb, s, 0); \
} \
static inline int s_name ## _to_unpriv_skb(struct sk_buff *skb, \
struct s_name *s) \
{ \
return s_name ## _to_skb(skb, s, 1); \
}
#undef __field
#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \
__is_signed) \
if (!exclude_sensitive || !((attr_flag) & DRBD_F_SENSITIVE)) { \
DPRINT_FIELD(">>", nla_type, name, s, NULL); \
if (__put(skb, attr_nr, s->name)) \
goto nla_put_failure; \
}
#undef __array
#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \
__get, __put, __is_signed) \
if (!exclude_sensitive || !((attr_flag) & DRBD_F_SENSITIVE)) { \
DPRINT_ARRAY(">>",nla_type, name, s, NULL); \
if (__put(skb, attr_nr, min_t(int, maxlen, \
s->name ## _len + (nla_type == NLA_NUL_STRING)),\
s->name)) \
goto nla_put_failure; \
}
#include GENL_MAGIC_INCLUDE_FILE
/* Functions for initializing structs to default values. */
#undef __field
#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \
__is_signed)
#undef __array
#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \
__get, __put, __is_signed)
#undef __u32_field_def
#define __u32_field_def(attr_nr, attr_flag, name, default) \
x->name = default;
#undef __s32_field_def
#define __s32_field_def(attr_nr, attr_flag, name, default) \
x->name = default;
#undef __flg_field_def
#define __flg_field_def(attr_nr, attr_flag, name, default) \
x->name = default;
#undef __str_field_def
#define __str_field_def(attr_nr, attr_flag, name, maxlen) \
memset(x->name, 0, sizeof(x->name)); \
x->name ## _len = 0;
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
static void set_ ## s_name ## _defaults(struct s_name *x) __attribute__((unused)); \
static void set_ ## s_name ## _defaults(struct s_name *x) { \
s_fields \
}
#include GENL_MAGIC_INCLUDE_FILE
#endif /* __KERNEL__ */
/* }}}1 */
#endif /* GENL_MAGIC_FUNC_H */
/* vim: set foldmethod=marker foldlevel=1 nofoldenable : */

View File

@ -0,0 +1,277 @@
#ifndef GENL_MAGIC_STRUCT_H
#define GENL_MAGIC_STRUCT_H
#ifndef GENL_MAGIC_FAMILY
# error "you need to define GENL_MAGIC_FAMILY before inclusion"
#endif
#ifndef GENL_MAGIC_VERSION
# error "you need to define GENL_MAGIC_VERSION before inclusion"
#endif
#ifndef GENL_MAGIC_INCLUDE_FILE
# error "you need to define GENL_MAGIC_INCLUDE_FILE before inclusion"
#endif
#include <linux/genetlink.h>
#include <linux/types.h>
#define CONCAT__(a,b) a ## b
#define CONCAT_(a,b) CONCAT__(a,b)
extern int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void);
extern void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void);
/*
* Extension of genl attribute validation policies {{{2
*/
/*
* @DRBD_GENLA_F_MANDATORY: By default, netlink ignores attributes it does not
* know about. This flag can be set in nlattr->nla_type to indicate that this
* attribute must not be ignored.
*
* We check and remove this flag in drbd_nla_check_mandatory() before
* validating the attribute types and lengths via nla_parse_nested().
*/
#define DRBD_GENLA_F_MANDATORY (1 << 14)
/*
* Flags specific to drbd and not visible at the netlink layer, used in
* <struct>_from_attrs and <struct>_to_skb:
*
* @DRBD_F_REQUIRED: Attribute is required; a request without this attribute is
* invalid.
*
* @DRBD_F_SENSITIVE: Attribute includes sensitive information and must not be
* included in unprivileged get requests or broadcasts.
*
* @DRBD_F_INVARIANT: Attribute is set when an object is initially created, but
* cannot subsequently be changed.
*/
#define DRBD_F_REQUIRED (1 << 0)
#define DRBD_F_SENSITIVE (1 << 1)
#define DRBD_F_INVARIANT (1 << 2)
#define __nla_type(x) ((__u16)((x) & NLA_TYPE_MASK & ~DRBD_GENLA_F_MANDATORY))
/* }}}1
* MAGIC
* multi-include macro expansion magic starts here
*/
/* MAGIC helpers {{{2 */
/* possible field types */
#define __flg_field(attr_nr, attr_flag, name) \
__field(attr_nr, attr_flag, name, NLA_U8, char, \
nla_get_u8, nla_put_u8, false)
#define __u8_field(attr_nr, attr_flag, name) \
__field(attr_nr, attr_flag, name, NLA_U8, unsigned char, \
nla_get_u8, nla_put_u8, false)
#define __u16_field(attr_nr, attr_flag, name) \
__field(attr_nr, attr_flag, name, NLA_U16, __u16, \
nla_get_u16, nla_put_u16, false)
#define __u32_field(attr_nr, attr_flag, name) \
__field(attr_nr, attr_flag, name, NLA_U32, __u32, \
nla_get_u32, nla_put_u32, false)
#define __s32_field(attr_nr, attr_flag, name) \
__field(attr_nr, attr_flag, name, NLA_U32, __s32, \
nla_get_u32, nla_put_u32, true)
#define __u64_field(attr_nr, attr_flag, name) \
__field(attr_nr, attr_flag, name, NLA_U64, __u64, \
nla_get_u64, nla_put_u64, false)
#define __str_field(attr_nr, attr_flag, name, maxlen) \
__array(attr_nr, attr_flag, name, NLA_NUL_STRING, char, maxlen, \
nla_strlcpy, nla_put, false)
#define __bin_field(attr_nr, attr_flag, name, maxlen) \
__array(attr_nr, attr_flag, name, NLA_BINARY, char, maxlen, \
nla_memcpy, nla_put, false)
/* fields with default values */
#define __flg_field_def(attr_nr, attr_flag, name, default) \
__flg_field(attr_nr, attr_flag, name)
#define __u32_field_def(attr_nr, attr_flag, name, default) \
__u32_field(attr_nr, attr_flag, name)
#define __s32_field_def(attr_nr, attr_flag, name, default) \
__s32_field(attr_nr, attr_flag, name)
#define __str_field_def(attr_nr, attr_flag, name, maxlen) \
__str_field(attr_nr, attr_flag, name, maxlen)
#define GENL_op_init(args...) args
#define GENL_doit(handler) \
.doit = handler, \
.flags = GENL_ADMIN_PERM,
#define GENL_dumpit(handler) \
.dumpit = handler, \
.flags = GENL_ADMIN_PERM,
/* }}}1
* Magic: define the enum symbols for genl_ops
* Magic: define the enum symbols for top level attributes
* Magic: define the enum symbols for nested attributes
* {{{2
*/
/*
 * Operation symbols: every GENL_op() and GENL_notification() entry of the
 * included file expands to "op_name = op_num," inside the anonymous enum
 * below.  GENL_struct() and GENL_mc_group() expand to nothing in this pass.
 */
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields)
#undef GENL_mc_group
#define GENL_mc_group(group)
#undef GENL_notification
#define GENL_notification(op_name, op_num, mcast_group, tla_list) \
op_name = op_num,
#undef GENL_op
#define GENL_op(op_name, op_num, handler, tla_list) \
op_name = op_num,
enum {
#include GENL_MAGIC_INCLUDE_FILE
};
/*
 * Top level attribute symbols: operations and notifications now expand to
 * nothing, while every GENL_struct() entry contributes
 * "tag_name = tag_number," to the anonymous enum below.
 */
#undef GENL_notification
#define GENL_notification(op_name, op_num, mcast_group, tla_list)
#undef GENL_op
#define GENL_op(op_name, op_num, handler, attr_list)
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
tag_name = tag_number,
enum {
#include GENL_MAGIC_INCLUDE_FILE
};
/*
 * Nested attribute symbols: each GENL_struct() wraps its field list in an
 * anonymous enum of its own, and each __field()/__array() inside it defines
 * T_<name>.  The value is attr_nr OR-ed with the DRBD_GENLA_F_MANDATORY bit
 * of attr_flag (all other flag bits are masked off), cast to __u16.
 */
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
enum { \
s_fields \
};
#undef __field
#define __field(attr_nr, attr_flag, name, nla_type, type, \
__get, __put, __is_signed) \
T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)),
#undef __array
#define __array(attr_nr, attr_flag, name, nla_type, type, \
maxlen, __get, __put, __is_signed) \
T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)),
#include GENL_MAGIC_INCLUDE_FILE
/* }}}1
* Magic: compile time assert unique numbers for operations
* Magic: -"- unique numbers for top level attributes
* Magic: -"- unique numbers for nested attributes
* {{{2
*/
/*
 * Every operation and notification becomes a "case op_name:" label inside
 * the switch (0) below.  Two entries with the same number would produce
 * duplicate case labels, which the compiler refuses, so uniqueness is
 * checked at build time.  The function exists only for that check; the
 * lone ';' gives the last label a statement to attach to.
 */
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields)
#undef GENL_op
#define GENL_op(op_name, op_num, handler, attr_list) \
case op_name:
#undef GENL_notification
#define GENL_notification(op_name, op_num, mcast_group, tla_list) \
case op_name:
static inline void ct_assert_unique_operations(void)
{
switch (0) {
#include GENL_MAGIC_INCLUDE_FILE
;
}
}
/*
 * Same duplicate-case-label check, this time over the top level attributes:
 * operations and notifications expand to nothing and every GENL_struct()
 * entry becomes "case tag_number:".
 */
#undef GENL_op
#define GENL_op(op_name, op_num, handler, attr_list)
#undef GENL_notification
#define GENL_notification(op_name, op_num, mcast_group, tla_list)
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
case tag_number:
static inline void ct_assert_unique_top_level_attributes(void)
{
switch (0) {
#include GENL_MAGIC_INCLUDE_FILE
;
}
}
/*
 * Duplicate-case-label check per struct: each GENL_struct() generates its
 * own ct_assert_unique_<s_name>_attributes() function, and every
 * __field()/__array() of that struct becomes "case attr_nr:" inside it.
 * Attribute numbers therefore only have to be unique within one struct.
 */
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
static inline void ct_assert_unique_ ## s_name ## _attributes(void) \
{ \
switch (0) { \
s_fields \
; \
} \
}
#undef __field
#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \
__is_signed) \
case attr_nr:
#undef __array
#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \
__get, __put, __is_signed) \
case attr_nr:
#include GENL_MAGIC_INCLUDE_FILE
/* }}}1
* Magic: declare structs
* struct <name> {
* fields
* };
* {{{2
*/
/*
 * Each GENL_struct() becomes "struct s_name { ... };".  A __field() turns
 * into a plain member "type name;", an __array() into "type name[maxlen];"
 * followed by a companion "__u32 name_len;" member.
 */
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
struct s_name { s_fields };
#undef __field
#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \
__is_signed) \
type name;
#undef __array
#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \
__get, __put, __is_signed) \
type name[maxlen]; \
__u32 name ## _len;
#include GENL_MAGIC_INCLUDE_FILE
/*
 * Record the signedness of every field as an enum constant:
 * F_<name>_IS_SIGNED takes the value of the last argument handed to
 * __field()/__array() by the field type macros.  One anonymous enum is
 * emitted per GENL_struct().
 */
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
enum { \
s_fields \
};
#undef __field
#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \
is_signed) \
F_ ## name ## _IS_SIGNED = is_signed,
#undef __array
#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \
__get, __put, is_signed) \
F_ ## name ## _IS_SIGNED = is_signed,
#include GENL_MAGIC_INCLUDE_FILE
/* }}}1 */
#endif /* GENL_MAGIC_STRUCT_H */
/* vim: set foldmethod=marker nofoldenable : */

View File

@ -152,4 +152,15 @@ void ida_simple_remove(struct ida *ida, unsigned int id);
void __init idr_init_cache(void);
/**
 * idr_for_each_entry - iterate over an idr's elements of a given type
 * @idp: idr handle
 * @entry: pointer variable used as cursor; each value returned by
 *         idr_get_next() is cast to typeof(@entry) and stored here
 * @id: lvalue holding the current key; it is set to 0 before the first
 *      lookup and incremented before each following one.  Its address is
 *      handed to idr_get_next(), which is expected to store the key of
 *      the entry it returns there.
 *
 * The loop ends when idr_get_next() returns NULL.  @entry and @id are
 * evaluated several times, so they must not have side effects.
 */
#define idr_for_each_entry(idp, entry, id) \
for (id = 0, entry = (typeof(entry))idr_get_next((idp), &(id)); \
entry != NULL; \
++id, entry = (typeof(entry))idr_get_next((idp), &(id)))
#endif /* __IDR_H__ */

View File

@ -53,10 +53,13 @@ struct loop_device {
spinlock_t lo_lock;
struct bio_list lo_bio_list;
unsigned int lo_bio_count;
int lo_state;
struct mutex lo_ctl_mutex;
struct task_struct *lo_thread;
wait_queue_head_t lo_event;
/* wait queue for incoming requests */
wait_queue_head_t lo_req_wait;
struct request_queue *lo_queue;
struct gendisk *lo_disk;

View File

@ -166,9 +166,11 @@ struct lc_element {
/* if we want to track a larger set of objects,
* it needs to become arch independent u64 */
unsigned lc_number;
/* special label when on free list */
#define LC_FREE (~0U)
/* for pending changes */
unsigned lc_new_number;
};
struct lru_cache {
@ -176,6 +178,7 @@ struct lru_cache {
struct list_head lru;
struct list_head free;
struct list_head in_use;
struct list_head to_be_changed;
/* the pre-created kmem cache to allocate the objects from */
struct kmem_cache *lc_cache;
@ -186,7 +189,7 @@ struct lru_cache {
size_t element_off;
/* number of elements (indices) */
unsigned int nr_elements;
unsigned int nr_elements;
/* Arbitrary limit on maximum tracked objects. Practical limit is much
* lower due to allocation failures, probably. For typical use cases,
* nr_elements should be a few thousand at most.
@ -194,18 +197,19 @@ struct lru_cache {
* 8 high bits of .lc_index to be overloaded with flags in the future. */
#define LC_MAX_ACTIVE (1<<24)
/* allow to accumulate a few (index:label) changes,
* but no more than max_pending_changes */
unsigned int max_pending_changes;
/* number of elements currently on to_be_changed list */
unsigned int pending_changes;
/* statistics */
unsigned used; /* number of lelements currently on in_use list */
unsigned long hits, misses, starving, dirty, changed;
unsigned used; /* number of elements currently on in_use list */
unsigned long hits, misses, starving, locked, changed;
/* see below: flag-bits for lru_cache */
unsigned long flags;
/* when changing the label of an index element */
unsigned int new_number;
/* for paranoia when changing the label of an index element */
struct lc_element *changing_element;
void *lc_private;
const char *name;
@ -221,10 +225,15 @@ enum {
/* debugging aid, to catch concurrent access early.
* user needs to guarantee exclusive access by proper locking! */
__LC_PARANOIA,
/* if we need to change the set, but currently there is a changing
* transaction pending, we are "dirty", and must deferr further
* changing requests */
/* annotate that the set is "dirty", possibly accumulating further
* changes, until a transaction is finally triggered */
__LC_DIRTY,
/* Locked, no further changes allowed.
* Also used to serialize changing transactions. */
__LC_LOCKED,
/* if we need to change the set, but currently there is no free nor
* unused element available, we are "starving", and must not give out
* further references, to guarantee that eventually some refcnt will
@ -236,9 +245,11 @@ enum {
};
#define LC_PARANOIA (1<<__LC_PARANOIA)
#define LC_DIRTY (1<<__LC_DIRTY)
#define LC_LOCKED (1<<__LC_LOCKED)
#define LC_STARVING (1<<__LC_STARVING)
extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
unsigned max_pending_changes,
unsigned e_count, size_t e_size, size_t e_off);
extern void lc_reset(struct lru_cache *lc);
extern void lc_destroy(struct lru_cache *lc);
@ -249,7 +260,7 @@ extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr);
extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr);
extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr);
extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e);
extern void lc_changed(struct lru_cache *lc, struct lc_element *e);
extern void lc_committed(struct lru_cache *lc);
struct seq_file;
extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc);
@ -258,16 +269,28 @@ extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char
void (*detail) (struct seq_file *, struct lc_element *));
/**
* lc_try_lock - can be used to stop lc_get() from changing the tracked set
* lc_try_lock_for_transaction - can be used to stop lc_get() from changing the tracked set
* @lc: the lru cache to operate on
*
* Allows (expects) the set to be "dirty". Note that the reference counts and
* order on the active and lru lists may still change. Used to serialize
* changing transactions. Returns true if we acquired the lock.
*/
static inline int lc_try_lock_for_transaction(struct lru_cache *lc)
{
	/* A non-zero result from test_and_set_bit() means __LC_LOCKED was
	 * already set, i.e. we did not get the lock. */
	if (test_and_set_bit(__LC_LOCKED, &lc->flags))
		return 0;

	return 1;
}
/**
* lc_try_lock - variant to stop lc_get() from changing the tracked set
* @lc: the lru cache to operate on
*
* Note that the reference counts and order on the active and lru lists may
* still change. Returns true if we acquired the lock.
* still change. Only works on a "clean" set. Returns true if we acquired the
* lock, which means there are no pending changes, and any further attempt to
* change the set will not succeed until the next lc_unlock().
*/
static inline int lc_try_lock(struct lru_cache *lc)
{
return !test_and_set_bit(__LC_DIRTY, &lc->flags);
}
extern int lc_try_lock(struct lru_cache *lc);
/**
* lc_unlock - unlock @lc, allow lc_get() to change the set again
@ -276,14 +299,10 @@ static inline int lc_try_lock(struct lru_cache *lc)
static inline void lc_unlock(struct lru_cache *lc)
{
clear_bit(__LC_DIRTY, &lc->flags);
smp_mb__after_clear_bit();
clear_bit_unlock(__LC_LOCKED, &lc->flags);
}
static inline int lc_is_used(struct lru_cache *lc, unsigned int enr)
{
struct lc_element *e = lc_find(lc, enr);
return e && e->refcnt;
}
extern bool lc_is_used(struct lru_cache *lc, unsigned int enr);
#define lc_entry(ptr, type, member) \
container_of(ptr, type, member)

View File

@ -550,6 +550,170 @@ do { \
__ret; \
})
/*
 * Slow path shared by wait_event_lock_irq() and wait_event_lock_irq_cmd().
 *
 * Expects @lock to be held on entry.  Each round calls prepare_to_wait()
 * on @wq with TASK_UNINTERRUPTIBLE and then tests @condition while the
 * lock is still held.  If the condition is false, the lock is dropped
 * with spin_unlock_irq(), @cmd is executed, schedule() is called and the
 * lock is retaken with spin_lock_irq() before the next round.  The loop
 * is left, with the lock held, as soon as @condition is true.
 */
#define __wait_event_lock_irq(wq, condition, lock, cmd) \
do { \
DEFINE_WAIT(__wait); \
\
for (;;) { \
prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \
if (condition) \
break; \
spin_unlock_irq(&lock); \
cmd; \
schedule(); \
spin_lock_irq(&lock); \
} \
finish_wait(&wq, &__wait); \
} while (0)
/**
 * wait_event_lock_irq_cmd - wait, lock held, until a condition becomes true,
 *			     running a command before every sleep
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for; it is always
 *	       evaluated with @lock held and may be evaluated several times
 * @lock: a locked spinlock_t; it is released before @cmd and schedule()
 *	  and taken again afterwards
 * @cmd: a command executed outside the critical section, right before
 *	 going to sleep
 *
 * The caller must hold @lock.  If @condition is already true nothing else
 * happens.  Otherwise the process sleeps (TASK_UNINTERRUPTIBLE) and
 * re-checks @condition each time the waitqueue @wq is woken up, until it
 * evaluates to true.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 */
#define wait_event_lock_irq_cmd(wq, condition, lock, cmd)		\
do {									\
	if (!(condition))						\
		__wait_event_lock_irq(wq, condition, lock, cmd);	\
} while (0)
/**
 * wait_event_lock_irq - wait, lock held, until a condition becomes true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for; it is always
 *	       evaluated with @lock held and may be evaluated several times
 * @lock: a locked spinlock_t; it is released before schedule() and taken
 *	  again afterwards
 *
 * The caller must hold @lock.  If @condition is already true nothing else
 * happens.  Otherwise the process sleeps (TASK_UNINTERRUPTIBLE) and
 * re-checks @condition each time the waitqueue @wq is woken up, until it
 * evaluates to true.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 */
#define wait_event_lock_irq(wq, condition, lock)			\
do {									\
	if (!(condition))						\
		__wait_event_lock_irq(wq, condition, lock, );		\
} while (0)
/*
 * Slow path shared by wait_event_interruptible_lock_irq() and
 * wait_event_interruptible_lock_irq_cmd().
 *
 * Expects @lock to be held on entry.  Each round calls prepare_to_wait()
 * on @wq with TASK_INTERRUPTIBLE, then, still under the lock, leaves the
 * loop if @condition is true, or stores -ERESTARTSYS in @ret and leaves
 * the loop if a signal is pending for the current task.  Otherwise the
 * lock is dropped, @cmd is executed, schedule() is called and the lock is
 * retaken.  @ret is only written on the signal path, so the caller has to
 * initialize it.  The lock is held on both exits.
 */
#define __wait_event_interruptible_lock_irq(wq, condition, \
lock, ret, cmd) \
do { \
DEFINE_WAIT(__wait); \
\
for (;;) { \
prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \
if (condition) \
break; \
if (signal_pending(current)) { \
ret = -ERESTARTSYS; \
break; \
} \
spin_unlock_irq(&lock); \
cmd; \
schedule(); \
spin_lock_irq(&lock); \
} \
finish_wait(&wq, &__wait); \
} while (0)
/**
 * wait_event_interruptible_lock_irq_cmd - sleep until a condition becomes true.
 * The condition is checked under the lock. This is expected to
 * be called with the lock taken.
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for; it may be
 * evaluated several times
 * @lock: a locked spinlock_t, which will be released before cmd and
 * schedule() and reacquired afterwards.
 * @cmd: a command which is invoked outside the critical section before
 * sleep
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received. The @condition is
 * checked each time the waitqueue @wq is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before invoking the cmd and going to sleep and is reacquired
 * afterwards.
 *
 * The macro will return -ERESTARTSYS if it was interrupted by a signal
 * and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_lock_irq_cmd(wq, condition, lock, cmd) \
({ \
int __ret = 0; \
\
if (!(condition)) \
__wait_event_interruptible_lock_irq(wq, condition, \
lock, __ret, cmd); \
__ret; \
})
/**
 * wait_event_interruptible_lock_irq - sleep until a condition becomes true.
 * The condition is checked under the lock. This is expected
 * to be called with the lock taken.
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for; it may be
 * evaluated several times
 * @lock: a locked spinlock_t, which will be released before schedule()
 * and reacquired afterwards.
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received. The @condition is
 * checked each time the waitqueue @wq is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before going to sleep and is reacquired afterwards.
 *
 * The macro will return -ERESTARTSYS if it was interrupted by a signal
 * and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_lock_irq(wq, condition, lock) \
({ \
int __ret = 0; \
\
if (!(condition)) \
__wait_event_interruptible_lock_irq(wq, condition, \
lock, __ret, ); \
__ret; \
})
/*
* These are the old interfaces to sleep waiting for an event.
* They are racy. DO NOT use them, use the wait_event* interfaces above.

View File

@ -69,23 +69,28 @@ __setup("ro", readonly);
__setup("rw", readwrite);
#ifdef CONFIG_BLOCK
/* Search key handed to match_dev_by_uuid() via class_find_device(). */
struct uuidcmp {
/* string compared (ignoring case) against a partition's info->uuid */
const char *uuid;
/* number of leading characters of @uuid that take part in the compare */
int len;
};
/**
* match_dev_by_uuid - callback for finding a partition using its uuid
* @dev: device passed in by the caller
* @data: opaque pointer to a 36 byte char array with a UUID
* @data: opaque pointer to the desired struct uuidcmp to match
*
* Returns 1 if the device matches, and 0 otherwise.
*/
static int match_dev_by_uuid(struct device *dev, void *data)
{
u8 *uuid = data;
struct uuidcmp *cmp = data;
struct hd_struct *part = dev_to_part(dev);
if (!part->info)
goto no_match;
if (memcmp(uuid, part->info->uuid, sizeof(part->info->uuid)))
goto no_match;
if (strncasecmp(cmp->uuid, part->info->uuid, cmp->len))
goto no_match;
return 1;
no_match:
@ -95,7 +100,7 @@ no_match:
/**
* devt_from_partuuid - looks up the dev_t of a partition by its UUID
* @uuid: min 36 byte char array containing a hex ascii UUID
* @uuid: char array containing ascii UUID
*
* The function will return the first partition which contains a matching
* UUID value in its partition_meta_info struct. This does not search
@ -106,38 +111,41 @@ no_match:
*
* Returns the matching dev_t on success or 0 on failure.
*/
static dev_t devt_from_partuuid(char *uuid_str)
static dev_t devt_from_partuuid(const char *uuid_str)
{
dev_t res = 0;
struct uuidcmp cmp;
struct device *dev = NULL;
u8 uuid[16];
struct gendisk *disk;
struct hd_struct *part;
int offset = 0;
bool clear_root_wait = false;
char *slash;
if (strlen(uuid_str) < 36)
goto done;
cmp.uuid = uuid_str;
slash = strchr(uuid_str, '/');
/* Check for optional partition number offset attributes. */
if (uuid_str[36]) {
if (slash) {
char c = 0;
/* Explicitly fail on poor PARTUUID syntax. */
if (sscanf(&uuid_str[36],
"/PARTNROFF=%d%c", &offset, &c) != 1) {
printk(KERN_ERR "VFS: PARTUUID= is invalid.\n"
"Expected PARTUUID=<valid-uuid-id>[/PARTNROFF=%%d]\n");
if (root_wait)
printk(KERN_ERR
"Disabling rootwait; root= is invalid.\n");
root_wait = 0;
if (sscanf(slash + 1,
"PARTNROFF=%d%c", &offset, &c) != 1) {
clear_root_wait = true;
goto done;
}
cmp.len = slash - uuid_str;
} else {
cmp.len = strlen(uuid_str);
}
/* Pack the requested UUID in the expected format. */
part_pack_uuid(uuid_str, uuid);
if (!cmp.len) {
clear_root_wait = true;
goto done;
}
dev = class_find_device(&block_class, NULL, uuid, &match_dev_by_uuid);
dev = class_find_device(&block_class, NULL, &cmp,
&match_dev_by_uuid);
if (!dev)
goto done;
@ -158,6 +166,13 @@ static dev_t devt_from_partuuid(char *uuid_str)
no_offset:
put_device(dev);
done:
if (clear_root_wait) {
pr_err("VFS: PARTUUID= is invalid.\n"
"Expected PARTUUID=<valid-uuid-id>[/PARTNROFF=%%d]\n");
if (root_wait)
pr_err("Disabling rootwait; root= is invalid.\n");
root_wait = 0;
}
return res;
}
#endif
@ -174,6 +189,10 @@ done:
* used when disk name of partitioned disk ends on a digit.
* 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the
* unique id of a partition if the partition table provides it.
* The UUID may be either an EFI/GPT UUID, or refer to an MSDOS
* partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero-
* filled hex representation of the 32-bit "NT disk signature", and PP
* is a zero-filled hex representation of the 1-based partition number.
* 7) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation to
* a partition with a known unique id.
*

View File

@ -44,8 +44,8 @@ MODULE_LICENSE("GPL");
} while (0)
#define RETURN(x...) do { \
clear_bit(__LC_PARANOIA, &lc->flags); \
smp_mb__after_clear_bit(); return x ; } while (0)
clear_bit_unlock(__LC_PARANOIA, &lc->flags); \
return x ; } while (0)
/* BUG() if e is not one of the elements tracked by lc */
#define PARANOIA_LC_ELEMENT(lc, e) do { \
@ -55,9 +55,40 @@ MODULE_LICENSE("GPL");
BUG_ON(i >= lc_->nr_elements); \
BUG_ON(lc_->lc_element[i] != e_); } while (0)
/* We need to atomically
* - try to grab the lock (set LC_LOCKED)
* - only if there is no pending transaction
* (neither LC_DIRTY nor LC_STARVING is set)
* Because of PARANOIA_ENTRY() above abusing lc->flags as well,
* it is not sufficient to just say
* return 0 == cmpxchg(&lc->flags, 0, LC_LOCKED);
*/
int lc_try_lock(struct lru_cache *lc)
{
	unsigned long seen;

	/*
	 * Try to switch the flags from "nothing set" to "only LC_LOCKED set".
	 * If the only thing in our way is LC_PARANOIA, someone is merely
	 * inside a PARANOIA_ENTRY()/RETURN() section: spin until they are
	 * out.  Any other non-zero value means we do not get the lock.
	 *
	 * A possible alternative would be to keep the current LC_PARANOIA
	 * state in both the expected and the new value of the cmpxchg(), and
	 * to retry only if that bit flipped underneath us.
	 */
	seen = cmpxchg(&lc->flags, 0, LC_LOCKED);
	while (unlikely(seen == LC_PARANOIA))
		seen = cmpxchg(&lc->flags, 0, LC_LOCKED);

	return seen == 0;
}
/**
* lc_create - prepares to track objects in an active set
* @name: descriptive name only used in lc_seq_printf_stats and lc_seq_dump_details
* @max_pending_changes: maximum changes to accumulate until a transaction is required
* @e_count: number of elements allowed to be active simultaneously
* @e_size: size of the tracked objects
* @e_off: offset to the &struct lc_element member in a tracked object
@ -66,6 +97,7 @@ MODULE_LICENSE("GPL");
* or NULL on (allocation) failure.
*/
struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
unsigned max_pending_changes,
unsigned e_count, size_t e_size, size_t e_off)
{
struct hlist_head *slot = NULL;
@ -98,12 +130,13 @@ struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
INIT_LIST_HEAD(&lc->in_use);
INIT_LIST_HEAD(&lc->lru);
INIT_LIST_HEAD(&lc->free);
INIT_LIST_HEAD(&lc->to_be_changed);
lc->name = name;
lc->element_size = e_size;
lc->element_off = e_off;
lc->nr_elements = e_count;
lc->new_number = LC_FREE;
lc->max_pending_changes = max_pending_changes;
lc->lc_cache = cache;
lc->lc_element = element;
lc->lc_slot = slot;
@ -117,6 +150,7 @@ struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
e = p + e_off;
e->lc_index = i;
e->lc_number = LC_FREE;
e->lc_new_number = LC_FREE;
list_add(&e->list, &lc->free);
element[i] = e;
}
@ -175,15 +209,15 @@ void lc_reset(struct lru_cache *lc)
INIT_LIST_HEAD(&lc->in_use);
INIT_LIST_HEAD(&lc->lru);
INIT_LIST_HEAD(&lc->free);
INIT_LIST_HEAD(&lc->to_be_changed);
lc->used = 0;
lc->hits = 0;
lc->misses = 0;
lc->starving = 0;
lc->dirty = 0;
lc->locked = 0;
lc->changed = 0;
lc->pending_changes = 0;
lc->flags = 0;
lc->changing_element = NULL;
lc->new_number = LC_FREE;
memset(lc->lc_slot, 0, sizeof(struct hlist_head) * lc->nr_elements);
for (i = 0; i < lc->nr_elements; i++) {
@ -194,6 +228,7 @@ void lc_reset(struct lru_cache *lc)
/* re-init it */
e->lc_index = i;
e->lc_number = LC_FREE;
e->lc_new_number = LC_FREE;
list_add(&e->list, &lc->free);
}
}
@ -208,14 +243,14 @@ size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc)
/* NOTE:
* total calls to lc_get are
* (starving + hits + misses)
* misses include "dirty" count (update from another thread in
* misses include "locked" count (update from another thread in
* progress) and "changed", when this in fact led to a successful
* update of the cache.
*/
return seq_printf(seq, "\t%s: used:%u/%u "
"hits:%lu misses:%lu starving:%lu dirty:%lu changed:%lu\n",
"hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n",
lc->name, lc->used, lc->nr_elements,
lc->hits, lc->misses, lc->starving, lc->dirty, lc->changed);
lc->hits, lc->misses, lc->starving, lc->locked, lc->changed);
}
static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr)
@ -224,6 +259,27 @@ static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr)
}
/*
 * Look up @enr in its hash chain.  Elements are hashed by lc_new_number;
 * for an element that is not pending a change, lc_number and lc_new_number
 * are equal.  The first element whose lc_new_number matches decides the
 * outcome: it is returned if it is not pending a change or if
 * @include_changing is set, otherwise the result is NULL.
 */
static struct lc_element *__lc_find(struct lru_cache *lc, unsigned int enr,
		bool include_changing)
{
	struct hlist_node *pos;
	struct lc_element *el;

	BUG_ON(!lc);
	BUG_ON(!lc->nr_elements);
	hlist_for_each_entry(el, pos, lc_hash_slot(lc, enr), colision) {
		if (el->lc_new_number == enr) {
			bool committed = el->lc_new_number == el->lc_number;

			/* "about to be changed" elements are only handed
			 * out on explicit request */
			return (committed || include_changing) ? el : NULL;
		}
	}
	return NULL;
}
/**
* lc_find - find element by label, if present in the hash table
* @lc: The lru_cache object
@ -232,38 +288,28 @@ static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr)
* Returns the pointer to an element, if the element with the requested
* "label" or element number is present in the hash table,
* or NULL if not found. Does not change the refcnt.
* Ignores elements that are "about to be used", i.e. not yet in the active
* set, but still pending transaction commit.
*/
struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr)
{
struct hlist_node *n;
struct lc_element *e;
BUG_ON(!lc);
BUG_ON(!lc->nr_elements);
hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) {
if (e->lc_number == enr)
return e;
}
return NULL;
return __lc_find(lc, enr, 0);
}
/* returned element will be "recycled" immediately */
static struct lc_element *lc_evict(struct lru_cache *lc)
/**
* lc_is_used - find element by label
* @lc: The lru_cache object
* @enr: element number
*
* Returns true, if the element with the requested "label" or element number is
* present in the hash table, and is used (refcnt > 0).
* Also finds elements that are not _currently_ used but only "about to be
* used", i.e. on the "to_be_changed" list, pending transaction commit.
*/
bool lc_is_used(struct lru_cache *lc, unsigned int enr)
{
struct list_head *n;
struct lc_element *e;
if (list_empty(&lc->lru))
return NULL;
n = lc->lru.prev;
e = list_entry(n, struct lc_element, list);
PARANOIA_LC_ELEMENT(lc, e);
list_del(&e->list);
hlist_del(&e->colision);
return e;
struct lc_element *e = __lc_find(lc, enr, 1);
return e && e->refcnt;
}
/**
@ -280,22 +326,34 @@ void lc_del(struct lru_cache *lc, struct lc_element *e)
PARANOIA_LC_ELEMENT(lc, e);
BUG_ON(e->refcnt);
e->lc_number = LC_FREE;
e->lc_number = e->lc_new_number = LC_FREE;
hlist_del_init(&e->colision);
list_move(&e->list, &lc->free);
RETURN();
}
static struct lc_element *lc_get_unused_element(struct lru_cache *lc)
static struct lc_element *lc_prepare_for_change(struct lru_cache *lc, unsigned new_number)
{
struct list_head *n;
struct lc_element *e;
if (list_empty(&lc->free))
return lc_evict(lc);
if (!list_empty(&lc->free))
n = lc->free.next;
else if (!list_empty(&lc->lru))
n = lc->lru.prev;
else
return NULL;
n = lc->free.next;
list_del(n);
return list_entry(n, struct lc_element, list);
e = list_entry(n, struct lc_element, list);
PARANOIA_LC_ELEMENT(lc, e);
e->lc_new_number = new_number;
if (!hlist_unhashed(&e->colision))
__hlist_del(&e->colision);
hlist_add_head(&e->colision, lc_hash_slot(lc, new_number));
list_move(&e->list, &lc->to_be_changed);
return e;
}
static int lc_unused_element_available(struct lru_cache *lc)
@ -308,6 +366,75 @@ static int lc_unused_element_available(struct lru_cache *lc)
return 0;
}
/*
 * __lc_get - common implementation of lc_get() and lc_try_get()
 * @lc: the lru cache to operate on
 * @enr: the label to look up
 * @may_change: if false, only an element already committed under @enr is
 *	returned and the active set is never modified
 *
 * On a hit, the element's refcnt is raised and it is moved to the in_use
 * list.  On a miss with @may_change set, an element is taken via
 * lc_prepare_for_change(), gets refcnt 1 and is counted in
 * lc->pending_changes.
 *
 * Returns NULL if
 *  - LC_STARVING is set,
 *  - @enr is not in the active set and @may_change is false,
 *  - @enr is already pending on the "to_be_changed" list,
 *  - __LC_LOCKED is set,
 *  - no unused element is available (__LC_STARVING gets set), or
 *  - max_pending_changes changes are already pending.
 */
static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool may_change)
{
struct lc_element *e;
PARANOIA_ENTRY();
if (lc->flags & LC_STARVING) {
++lc->starving;
RETURN(NULL);
}
e = __lc_find(lc, enr, 1);
/* if lc_new_number != lc_number,
* this enr is currently being pulled in already,
* and will be available once the pending transaction
* has been committed. */
if (e && e->lc_new_number == e->lc_number) {
++lc->hits;
if (e->refcnt++ == 0)
lc->used++;
list_move(&e->list, &lc->in_use); /* Not evictable... */
RETURN(e);
}
++lc->misses;
if (!may_change)
RETURN(NULL);
/* It has been found above, but on the "to_be_changed" list, not yet
* committed. Don't pull it in twice, wait for the transaction, then
* try again */
if (e)
RETURN(NULL);
/* To avoid races with lc_try_lock(), first, mark us dirty
* (using test_and_set_bit, as it implies memory barriers), ... */
test_and_set_bit(__LC_DIRTY, &lc->flags);
/* ... only then check if it is locked anyways. If lc_unlock clears
* the dirty bit again, that's not a problem, we will come here again.
*/
if (test_bit(__LC_LOCKED, &lc->flags)) {
++lc->locked;
RETURN(NULL);
}
/* In case there is nothing available and we can not kick out
* the LRU element, we have to wait ...
*/
if (!lc_unused_element_available(lc)) {
__set_bit(__LC_STARVING, &lc->flags);
RETURN(NULL);
}
/* It was not present in the active set. We are going to recycle an
* unused (or even "free") element, but we won't accumulate more than
* max_pending_changes changes. */
if (lc->pending_changes >= lc->max_pending_changes)
RETURN(NULL);
e = lc_prepare_for_change(lc, enr);
BUG_ON(!e);
clear_bit(__LC_STARVING, &lc->flags);
BUG_ON(++e->refcnt != 1);
lc->used++;
lc->pending_changes++;
RETURN(e);
}
/**
* lc_get - get element by label, maybe change the active set
@ -336,110 +463,65 @@ static int lc_unused_element_available(struct lru_cache *lc)
* pointer to an UNUSED element with some different element number,
* where that different number may also be %LC_FREE.
*
* In this case, the cache is marked %LC_DIRTY (blocking further changes),
* and the returned element pointer is removed from the lru list and
* hash collision chains. The user now should do whatever housekeeping
* is necessary.
* Then he must call lc_changed(lc,element_pointer), to finish
* the change.
* In this case, the cache is marked %LC_DIRTY,
* so lc_try_lock() will no longer succeed.
* The returned element pointer is moved to the "to_be_changed" list,
* and registered with the new element number on the hash collision chains,
* so it is possible to pick it up from lc_is_used().
* Up to "max_pending_changes" (see lc_create()) can be accumulated.
* The user now should do whatever housekeeping is necessary,
* typically serialize on lc_try_lock_for_transaction(), then call
* lc_committed(lc) and lc_unlock(), to finish the change.
*
* NOTE: The user needs to check the lc_number on EACH use, so he recognizes
* any cache set change.
*/
struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr)
{
struct lc_element *e;
PARANOIA_ENTRY();
if (lc->flags & LC_STARVING) {
++lc->starving;
RETURN(NULL);
}
e = lc_find(lc, enr);
if (e) {
++lc->hits;
if (e->refcnt++ == 0)
lc->used++;
list_move(&e->list, &lc->in_use); /* Not evictable... */
RETURN(e);
}
++lc->misses;
/* In case there is nothing available and we can not kick out
* the LRU element, we have to wait ...
*/
if (!lc_unused_element_available(lc)) {
__set_bit(__LC_STARVING, &lc->flags);
RETURN(NULL);
}
/* it was not present in the active set.
* we are going to recycle an unused (or even "free") element.
* user may need to commit a transaction to record that change.
* we serialize on flags & TF_DIRTY */
if (test_and_set_bit(__LC_DIRTY, &lc->flags)) {
++lc->dirty;
RETURN(NULL);
}
e = lc_get_unused_element(lc);
BUG_ON(!e);
clear_bit(__LC_STARVING, &lc->flags);
BUG_ON(++e->refcnt != 1);
lc->used++;
lc->changing_element = e;
lc->new_number = enr;
RETURN(e);
}
/* similar to lc_get,
* but only gets a new reference on an existing element.
* you either get the requested element, or NULL.
* will be consolidated into one function.
*/
struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr)
{
struct lc_element *e;
PARANOIA_ENTRY();
if (lc->flags & LC_STARVING) {
++lc->starving;
RETURN(NULL);
}
e = lc_find(lc, enr);
if (e) {
++lc->hits;
if (e->refcnt++ == 0)
lc->used++;
list_move(&e->list, &lc->in_use); /* Not evictable... */
}
RETURN(e);
return __lc_get(lc, enr, 1);
}
/**
* lc_changed - tell @lc that the change has been recorded
* lc_try_get - get element by label, if present; do not change the active set
* @lc: the lru cache to operate on
* @e: the element pending label change
* @enr: the label to look up
*
* Finds an element in the cache, increases its usage count,
* "touches" and returns it.
*
* Return values:
* NULL
* The cache was marked %LC_STARVING,
* or the requested label was not in the active set
*
* pointer to the element with the REQUESTED element number.
* In this case, it can be used right away
*/
void lc_changed(struct lru_cache *lc, struct lc_element *e)
struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr)
{
return __lc_get(lc, enr, 0);
}
/**
* lc_committed - tell @lc that pending changes have been recorded
* @lc: the lru cache to operate on
*
* User is expected to serialize on explicit lc_try_lock_for_transaction()
* before the transaction is started, and later needs to lc_unlock() explicitly
* as well.
*/
void lc_committed(struct lru_cache *lc)
{
struct lc_element *e, *tmp;
PARANOIA_ENTRY();
BUG_ON(e != lc->changing_element);
PARANOIA_LC_ELEMENT(lc, e);
++lc->changed;
e->lc_number = lc->new_number;
list_add(&e->list, &lc->in_use);
hlist_add_head(&e->colision, lc_hash_slot(lc, lc->new_number));
lc->changing_element = NULL;
lc->new_number = LC_FREE;
clear_bit(__LC_DIRTY, &lc->flags);
smp_mb__after_clear_bit();
list_for_each_entry_safe(e, tmp, &lc->to_be_changed, list) {
/* count number of changes, not number of transactions */
++lc->changed;
e->lc_number = e->lc_new_number;
list_move(&e->list, &lc->in_use);
}
lc->pending_changes = 0;
RETURN();
}
@ -458,13 +540,12 @@ unsigned int lc_put(struct lru_cache *lc, struct lc_element *e)
PARANOIA_ENTRY();
PARANOIA_LC_ELEMENT(lc, e);
BUG_ON(e->refcnt == 0);
BUG_ON(e == lc->changing_element);
BUG_ON(e->lc_number != e->lc_new_number);
if (--e->refcnt == 0) {
/* move it to the front of LRU. */
list_move(&e->list, &lc->lru);
lc->used--;
clear_bit(__LC_STARVING, &lc->flags);
smp_mb__after_clear_bit();
clear_bit_unlock(__LC_STARVING, &lc->flags);
}
RETURN(e->refcnt);
}
@ -504,16 +585,24 @@ unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e)
void lc_set(struct lru_cache *lc, unsigned int enr, int index)
{
struct lc_element *e;
struct list_head *lh;
if (index < 0 || index >= lc->nr_elements)
return;
e = lc_element_by_index(lc, index);
e->lc_number = enr;
BUG_ON(e->lc_number != e->lc_new_number);
BUG_ON(e->refcnt != 0);
e->lc_number = e->lc_new_number = enr;
hlist_del_init(&e->colision);
hlist_add_head(&e->colision, lc_hash_slot(lc, enr));
list_move(&e->list, e->refcnt ? &lc->in_use : &lc->lru);
if (enr == LC_FREE)
lh = &lc->free;
else {
hlist_add_head(&e->colision, lc_hash_slot(lc, enr));
lh = &lc->lru;
}
list_move(&e->list, lh);
}
/**
@ -553,8 +642,10 @@ EXPORT_SYMBOL(lc_try_get);
EXPORT_SYMBOL(lc_find);
EXPORT_SYMBOL(lc_get);
EXPORT_SYMBOL(lc_put);
EXPORT_SYMBOL(lc_changed);
EXPORT_SYMBOL(lc_committed);
EXPORT_SYMBOL(lc_element_by_index);
EXPORT_SYMBOL(lc_index_of);
EXPORT_SYMBOL(lc_seq_printf_stats);
EXPORT_SYMBOL(lc_seq_dump_details);
EXPORT_SYMBOL(lc_try_lock);
EXPORT_SYMBOL(lc_is_used);