From f775738f6fba9c7f6deaa540860d6fb7e2d28445 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Fri, 30 Mar 2012 15:14:27 +0800 Subject: [PATCH 01/37] btrfs/ctree.c: remove the unnecessary 'return -1;' at the end of bin_search The code path should not reach there. Remove it. Signed-off-by: Wang Sheng-Hui --- fs/btrfs/ctree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4106264fbc6..26847999c64 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -854,20 +854,18 @@ static noinline int generic_bin_search(struct extent_buffer *eb, static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot) { - if (level == 0) { + if (level == 0) return generic_bin_search(eb, offsetof(struct btrfs_leaf, items), sizeof(struct btrfs_item), key, btrfs_header_nritems(eb), slot); - } else { + else return generic_bin_search(eb, offsetof(struct btrfs_node, ptrs), sizeof(struct btrfs_key_ptr), key, btrfs_header_nritems(eb), slot); - } - return -1; } int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, From 1b303fc0545b4bfbb2b8a69eec89e6f077f69b56 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Fri, 6 Apr 2012 14:35:18 +0800 Subject: [PATCH 02/37] Btrfs: cleanup the comment for clear_state_bit in extent_io.c No 'delete' arg is used for clear_state_bit. Cleanup the comment. Signed-off-by: Wang Sheng-Hui --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c9018a05036..aeb98ceda51 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -413,7 +413,7 @@ static struct extent_state *next_state(struct extent_state *state) /* * utility function to clear some bits in an extent state struct. - * it will optionally wake up any one waiting on this state (wake == 1) + * it will optionally wake up any one waiting on this state (wake == 1). 
* * If no bits are set on the state struct after clearing things, the * struct is freed and removed from the tree From 39bab87ba6f4d8cce2b70c19e60233ad8030d7b4 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Fri, 6 Apr 2012 14:35:31 +0800 Subject: [PATCH 03/37] Btrfs: fix btrfs_release_extent_buffer_page with the right usage of num_extent_pages num_extent_pages returns the number of pages in the specific range, not the index of the last page in the eb range. btrfs_release_extent_buffer_page is currently always called with start_idx set to 0, so this is not a problem yet, but the logic is indeed wrong. Fix it here. Signed-off-by: Wang Sheng-Hui --- fs/btrfs/extent_io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index aeb98ceda51..455daec2b6c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3981,11 +3981,13 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, unsigned long start_idx) { unsigned long index; + unsigned long num_pages; struct page *page; BUG_ON(extent_buffer_under_io(eb)); - index = num_extent_pages(eb->start, eb->len); + num_pages = num_extent_pages(eb->start, eb->len); + index = start_idx + num_pages; if (start_idx >= index) return; From 477d7eafa9585ded87ee1c6f69638a6baf9d8922 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Fri, 6 Apr 2012 14:35:47 +0800 Subject: [PATCH 04/37] Btrfs: fix the comment for find_first_extent_bit find_first_extent_bit only ever returns 0 or 1, never a value < 0: it returns 0 if something was found and 1 if nothing was found. Fix the comment. 
Signed-off-by: Wang Sheng-Hui --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 455daec2b6c..fe91305e12f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1293,7 +1293,7 @@ out: * returned if we find something, and *start_ret and *end_ret are * set to reflect the state struct that was found. * - * If nothing was found, 1 is returned, < 0 on error + * If nothing was found, 1 is returned. If found something, return 0. */ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, int bits) From fd5e62a37cef5b212318c522eac0ecd394b50a19 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Fri, 6 Apr 2012 14:35:05 +0800 Subject: [PATCH 05/37] Btrfs: remove the useless assignment to *entry in function tree_insert of file extent_io.c In tree_insert, var *entry is used in the loop only, and is useless out of the loop. Remove the useless assignment after the loop. Signed-off-by: Wang Sheng-Hui --- fs/btrfs/extent_io.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fe91305e12f..82b4829f00b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -186,7 +186,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset, return parent; } - entry = rb_entry(node, struct tree_entry, rb_node); rb_link_node(node, parent, p); rb_insert_color(node, root); return NULL; From e06baab4184509bdfddd294efc6cae7a410c6f07 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Thu, 12 Apr 2012 12:53:40 +0200 Subject: [PATCH 06/37] Btrfs: change integrity checker to support big blocks The integrity checker used to be coded for nodesize == leafsize == sectorsize == PAGE_CACHE_SIZE. This is now changed to support sizes for nodesize and leafsize which are N * PAGE_CACHE_SIZE. 
Signed-off-by: Stefan Behrens --- fs/btrfs/check-integrity.c | 563 +++++++++++++++++++++++++++---------- 1 file changed, 416 insertions(+), 147 deletions(-) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index c053e90f200..7f6cc359e44 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -103,8 +103,6 @@ #define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 #define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, * excluding " [...]" */ -#define BTRFSIC_BLOCK_SIZE PAGE_SIZE - #define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) /* @@ -210,8 +208,9 @@ struct btrfsic_block_data_ctx { u64 dev_bytenr; /* physical bytenr on device */ u32 len; struct btrfsic_dev_state *dev; - char *data; - struct buffer_head *bh; /* do not use if set to NULL */ + char **datav; + struct page **pagev; + void *mem_to_free; }; /* This structure is used to implement recursion without occupying @@ -243,6 +242,8 @@ struct btrfsic_state { struct btrfs_root *root; u64 max_superblock_generation; struct btrfsic_block *latest_superblock; + u32 metablock_size; + u32 datablock_size; }; static void btrfsic_block_init(struct btrfsic_block *b); @@ -290,8 +291,10 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, static int btrfsic_process_metablock(struct btrfsic_state *state, struct btrfsic_block *block, struct btrfsic_block_data_ctx *block_ctx, - struct btrfs_header *hdr, int limit_nesting, int force_iodone_flag); +static void btrfsic_read_from_block_data( + struct btrfsic_block_data_ctx *block_ctx, + void *dst, u32 offset, size_t len); static int btrfsic_create_link_to_next_block( struct btrfsic_state *state, struct btrfsic_block *block, @@ -318,12 +321,13 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); static int btrfsic_read_block(struct btrfsic_state *state, struct btrfsic_block_data_ctx *block_ctx); static void btrfsic_dump_database(struct btrfsic_state *state); +static void 
btrfsic_complete_bio_end_io(struct bio *bio, int err); static int btrfsic_test_for_metadata(struct btrfsic_state *state, - const u8 *data, unsigned int size); + char **datav, unsigned int num_pages); static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, u8 *mapped_data, - unsigned int len, struct bio *bio, - int *bio_is_patched, + u64 dev_bytenr, char **mapped_datav, + unsigned int num_pages, + struct bio *bio, int *bio_is_patched, struct buffer_head *bh, int submit_bio_bh_rw); static int btrfsic_process_written_superblock( @@ -375,7 +379,7 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup( static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, u64 bytenr, struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, char *data); + u64 dev_bytenr); static struct mutex btrfsic_mutex; static int btrfsic_is_initialized; @@ -651,7 +655,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, int pass; BUG_ON(NULL == state); - selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS); + selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); if (NULL == selected_super) { printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); return -1; @@ -718,7 +722,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); + next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", (unsigned long long)next_bytenr, num_copies); @@ -727,9 +731,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, struct btrfsic_block *next_block; struct btrfsic_block_data_ctx tmp_next_block_ctx; struct btrfsic_block_link *l; - struct btrfs_header *hdr; - ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + ret = btrfsic_map_block(state, next_bytenr, + state->metablock_size, &tmp_next_block_ctx, 
mirror_num); if (ret) { @@ -758,7 +762,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, BUG_ON(NULL == l); ret = btrfsic_read_block(state, &tmp_next_block_ctx); - if (ret < (int)BTRFSIC_BLOCK_SIZE) { + if (ret < (int)PAGE_CACHE_SIZE) { printk(KERN_INFO "btrfsic: read @logical %llu failed!\n", (unsigned long long) @@ -768,11 +772,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, return -1; } - hdr = (struct btrfs_header *)tmp_next_block_ctx.data; ret = btrfsic_process_metablock(state, next_block, &tmp_next_block_ctx, - hdr, BTRFS_MAX_LEVEL + 3, 1); btrfsic_release_block_ctx(&tmp_next_block_ctx); } @@ -799,7 +801,10 @@ static int btrfsic_process_superblock_dev_mirror( /* super block bytenr is always the unmapped device bytenr */ dev_bytenr = btrfs_sb_offset(superblock_mirror_num); - bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096); + if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) + return -1; + bh = __bread(superblock_bdev, dev_bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); if (NULL == bh) return -1; super_tmp = (struct btrfs_super_block *) @@ -808,7 +813,10 @@ static int btrfsic_process_superblock_dev_mirror( if (btrfs_super_bytenr(super_tmp) != dev_bytenr || strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC, sizeof(super_tmp->magic)) || - memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) { + memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || + btrfs_super_nodesize(super_tmp) != state->metablock_size || + btrfs_super_leafsize(super_tmp) != state->metablock_size || + btrfs_super_sectorsize(super_tmp) != state->datablock_size) { brelse(bh); return 0; } @@ -893,7 +901,7 @@ static int btrfsic_process_superblock_dev_mirror( num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); + next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", (unsigned long 
long)next_bytenr, num_copies); @@ -902,7 +910,8 @@ static int btrfsic_process_superblock_dev_mirror( struct btrfsic_block_data_ctx tmp_next_block_ctx; struct btrfsic_block_link *l; - if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + if (btrfsic_map_block(state, next_bytenr, + state->metablock_size, &tmp_next_block_ctx, mirror_num)) { printk(KERN_INFO "btrfsic: btrfsic_map_block(" @@ -966,13 +975,15 @@ static int btrfsic_process_metablock( struct btrfsic_state *state, struct btrfsic_block *const first_block, struct btrfsic_block_data_ctx *const first_block_ctx, - struct btrfs_header *const first_hdr, int first_limit_nesting, int force_iodone_flag) { struct btrfsic_stack_frame initial_stack_frame = { 0 }; struct btrfsic_stack_frame *sf; struct btrfsic_stack_frame *next_stack; + struct btrfs_header *const first_hdr = + (struct btrfs_header *)first_block_ctx->datav[0]; + BUG_ON(!first_hdr); sf = &initial_stack_frame; sf->error = 0; sf->i = -1; @@ -1012,21 +1023,47 @@ continue_with_current_leaf_stack_frame: } if (sf->i < sf->nr) { - struct btrfs_item *disk_item = leafhdr->items + sf->i; - struct btrfs_disk_key *disk_key = &disk_item->key; + struct btrfs_item disk_item; + u32 disk_item_offset = + (uintptr_t)(leafhdr->items + sf->i) - + (uintptr_t)leafhdr; + struct btrfs_disk_key *disk_key; u8 type; - const u32 item_offset = le32_to_cpu(disk_item->offset); + u32 item_offset; + if (disk_item_offset + sizeof(struct btrfs_item) > + sf->block_ctx->len) { +leaf_item_out_of_bounce_error: + printk(KERN_INFO + "btrfsic: leaf item out of bounce at logical %llu, dev %s\n", + sf->block_ctx->start, + sf->block_ctx->dev->name); + goto one_stack_frame_backwards; + } + btrfsic_read_from_block_data(sf->block_ctx, + &disk_item, + disk_item_offset, + sizeof(struct btrfs_item)); + item_offset = le32_to_cpu(disk_item.offset); + disk_key = &disk_item.key; type = disk_key->type; if (BTRFS_ROOT_ITEM_KEY == type) { - const struct btrfs_root_item *const root_item = - (struct btrfs_root_item 
*) - (sf->block_ctx->data + - offsetof(struct btrfs_leaf, items) + - item_offset); - const u64 next_bytenr = - le64_to_cpu(root_item->bytenr); + struct btrfs_root_item root_item; + u32 root_item_offset; + u64 next_bytenr; + + root_item_offset = item_offset + + offsetof(struct btrfs_leaf, items); + if (root_item_offset + + sizeof(struct btrfs_root_item) > + sf->block_ctx->len) + goto leaf_item_out_of_bounce_error; + btrfsic_read_from_block_data( + sf->block_ctx, &root_item, + root_item_offset, + sizeof(struct btrfs_root_item)); + next_bytenr = le64_to_cpu(root_item.bytenr); sf->error = btrfsic_create_link_to_next_block( @@ -1041,7 +1078,7 @@ continue_with_current_leaf_stack_frame: &sf->num_copies, &sf->mirror_num, disk_key, - le64_to_cpu(root_item-> + le64_to_cpu(root_item. generation)); if (sf->error) goto one_stack_frame_backwards; @@ -1049,7 +1086,7 @@ continue_with_current_leaf_stack_frame: if (NULL != sf->next_block) { struct btrfs_header *const next_hdr = (struct btrfs_header *) - sf->next_block_ctx.data; + sf->next_block_ctx.datav[0]; next_stack = btrfsic_stack_frame_alloc(); @@ -1111,10 +1148,24 @@ continue_with_current_node_stack_frame: } if (sf->i < sf->nr) { - struct btrfs_key_ptr *disk_key_ptr = - nodehdr->ptrs + sf->i; - const u64 next_bytenr = - le64_to_cpu(disk_key_ptr->blockptr); + struct btrfs_key_ptr key_ptr; + u32 key_ptr_offset; + u64 next_bytenr; + + key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) - + (uintptr_t)nodehdr; + if (key_ptr_offset + sizeof(struct btrfs_key_ptr) > + sf->block_ctx->len) { + printk(KERN_INFO + "btrfsic: node item out of bounce at logical %llu, dev %s\n", + sf->block_ctx->start, + sf->block_ctx->dev->name); + goto one_stack_frame_backwards; + } + btrfsic_read_from_block_data( + sf->block_ctx, &key_ptr, key_ptr_offset, + sizeof(struct btrfs_key_ptr)); + next_bytenr = le64_to_cpu(key_ptr.blockptr); sf->error = btrfsic_create_link_to_next_block( state, @@ -1127,15 +1178,15 @@ continue_with_current_node_stack_frame: 
force_iodone_flag, &sf->num_copies, &sf->mirror_num, - &disk_key_ptr->key, - le64_to_cpu(disk_key_ptr->generation)); + &key_ptr.key, + le64_to_cpu(key_ptr.generation)); if (sf->error) goto one_stack_frame_backwards; if (NULL != sf->next_block) { struct btrfs_header *const next_hdr = (struct btrfs_header *) - sf->next_block_ctx.data; + sf->next_block_ctx.datav[0]; next_stack = btrfsic_stack_frame_alloc(); if (NULL == next_stack) @@ -1181,6 +1232,35 @@ one_stack_frame_backwards: return sf->error; } +static void btrfsic_read_from_block_data( + struct btrfsic_block_data_ctx *block_ctx, + void *dstv, u32 offset, size_t len) +{ + size_t cur; + size_t offset_in_page; + char *kaddr; + char *dst = (char *)dstv; + size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT; + + WARN_ON(offset + len > block_ctx->len); + offset_in_page = (start_offset + offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page)); + BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT); + kaddr = block_ctx->datav[i]; + memcpy(dst, kaddr + offset_in_page, cur); + + dst += cur; + len -= cur; + offset_in_page = 0; + i++; + } +} + static int btrfsic_create_link_to_next_block( struct btrfsic_state *state, struct btrfsic_block *block, @@ -1204,7 +1284,7 @@ static int btrfsic_create_link_to_next_block( if (0 == *num_copiesp) { *num_copiesp = btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); + next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", (unsigned long long)next_bytenr, *num_copiesp); @@ -1219,7 +1299,7 @@ static int btrfsic_create_link_to_next_block( "btrfsic_create_link_to_next_block(mirror_num=%d)\n", *mirror_nump); ret = btrfsic_map_block(state, next_bytenr, - BTRFSIC_BLOCK_SIZE, + state->metablock_size, 
next_block_ctx, *mirror_nump); if (ret) { printk(KERN_INFO @@ -1314,7 +1394,7 @@ static int btrfsic_create_link_to_next_block( if (limit_nesting > 0 && did_alloc_block_link) { ret = btrfsic_read_block(state, next_block_ctx); - if (ret < (int)BTRFSIC_BLOCK_SIZE) { + if (ret < (int)next_block_ctx->len) { printk(KERN_INFO "btrfsic: read block @logical %llu failed!\n", (unsigned long long)next_bytenr); @@ -1339,43 +1419,55 @@ static int btrfsic_handle_extent_data( u32 item_offset, int force_iodone_flag) { int ret; - struct btrfs_file_extent_item *file_extent_item = - (struct btrfs_file_extent_item *)(block_ctx->data + - offsetof(struct btrfs_leaf, - items) + item_offset); - u64 next_bytenr = - le64_to_cpu(file_extent_item->disk_bytenr) + - le64_to_cpu(file_extent_item->offset); - u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes); - u64 generation = le64_to_cpu(file_extent_item->generation); + struct btrfs_file_extent_item file_extent_item; + u64 file_extent_item_offset; + u64 next_bytenr; + u64 num_bytes; + u64 generation; struct btrfsic_block_link *l; + file_extent_item_offset = offsetof(struct btrfs_leaf, items) + + item_offset; + if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) > + block_ctx->len) { + printk(KERN_INFO + "btrfsic: file item out of bounce at logical %llu, dev %s\n", + block_ctx->start, block_ctx->dev->name); + return -1; + } + btrfsic_read_from_block_data(block_ctx, &file_extent_item, + file_extent_item_offset, + sizeof(struct btrfs_file_extent_item)); + next_bytenr = le64_to_cpu(file_extent_item.disk_bytenr) + + le64_to_cpu(file_extent_item.offset); + generation = le64_to_cpu(file_extent_item.generation); + num_bytes = le64_to_cpu(file_extent_item.num_bytes); + generation = le64_to_cpu(file_extent_item.generation); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu," " offset = %llu, num_bytes = %llu\n", - file_extent_item->type, + file_extent_item.type, 
(unsigned long long) - le64_to_cpu(file_extent_item->disk_bytenr), - (unsigned long long) - le64_to_cpu(file_extent_item->offset), - (unsigned long long) - le64_to_cpu(file_extent_item->num_bytes)); - if (BTRFS_FILE_EXTENT_REG != file_extent_item->type || - ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr)) + le64_to_cpu(file_extent_item.disk_bytenr), + (unsigned long long)le64_to_cpu(file_extent_item.offset), + (unsigned long long)num_bytes); + if (BTRFS_FILE_EXTENT_REG != file_extent_item.type || + ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) return 0; while (num_bytes > 0) { u32 chunk_len; int num_copies; int mirror_num; - if (num_bytes > BTRFSIC_BLOCK_SIZE) - chunk_len = BTRFSIC_BLOCK_SIZE; + if (num_bytes > state->datablock_size) + chunk_len = state->datablock_size; else chunk_len = num_bytes; num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); + next_bytenr, state->datablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", (unsigned long long)next_bytenr, num_copies); @@ -1475,8 +1567,9 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, block_ctx_out->dev_bytenr = multi->stripes[0].physical; block_ctx_out->start = bytenr; block_ctx_out->len = len; - block_ctx_out->data = NULL; - block_ctx_out->bh = NULL; + block_ctx_out->datav = NULL; + block_ctx_out->pagev = NULL; + block_ctx_out->mem_to_free = NULL; if (0 == ret) kfree(multi); @@ -1496,8 +1589,9 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, block_ctx_out->dev_bytenr = bytenr; block_ctx_out->start = bytenr; block_ctx_out->len = len; - block_ctx_out->data = NULL; - block_ctx_out->bh = NULL; + block_ctx_out->datav = NULL; + block_ctx_out->pagev = NULL; + block_ctx_out->mem_to_free = NULL; if (NULL != block_ctx_out->dev) { return 0; } else { @@ -1508,38 +1602,127 @@ static int btrfsic_map_superblock(struct btrfsic_state 
*state, u64 bytenr, static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) { - if (NULL != block_ctx->bh) { - brelse(block_ctx->bh); - block_ctx->bh = NULL; + if (block_ctx->mem_to_free) { + unsigned int num_pages; + + BUG_ON(!block_ctx->datav); + BUG_ON(!block_ctx->pagev); + num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + while (num_pages > 0) { + num_pages--; + if (block_ctx->datav[num_pages]) { + kunmap(block_ctx->pagev[num_pages]); + block_ctx->datav[num_pages] = NULL; + } + if (block_ctx->pagev[num_pages]) { + __free_page(block_ctx->pagev[num_pages]); + block_ctx->pagev[num_pages] = NULL; + } + } + + kfree(block_ctx->mem_to_free); + block_ctx->mem_to_free = NULL; + block_ctx->pagev = NULL; + block_ctx->datav = NULL; } } static int btrfsic_read_block(struct btrfsic_state *state, struct btrfsic_block_data_ctx *block_ctx) { - block_ctx->bh = NULL; - if (block_ctx->dev_bytenr & 4095) { + unsigned int num_pages; + unsigned int i; + u64 dev_bytenr; + int ret; + + BUG_ON(block_ctx->datav); + BUG_ON(block_ctx->pagev); + BUG_ON(block_ctx->mem_to_free); + if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) { printk(KERN_INFO "btrfsic: read_block() with unaligned bytenr %llu\n", (unsigned long long)block_ctx->dev_bytenr); return -1; } - if (block_ctx->len > 4096) { - printk(KERN_INFO - "btrfsic: read_block() with too huge size %d\n", - block_ctx->len); + + num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) + + sizeof(*block_ctx->pagev)) * + num_pages, GFP_NOFS); + if (!block_ctx->mem_to_free) return -1; + block_ctx->datav = block_ctx->mem_to_free; + block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); + for (i = 0; i < num_pages; i++) { + block_ctx->pagev[i] = alloc_page(GFP_NOFS); + if (!block_ctx->pagev[i]) + return -1; } - block_ctx->bh = __bread(block_ctx->dev->bdev, - block_ctx->dev_bytenr >> 12, 4096); 
- if (NULL == block_ctx->bh) - return -1; - block_ctx->data = block_ctx->bh->b_data; + dev_bytenr = block_ctx->dev_bytenr; + for (i = 0; i < num_pages;) { + struct bio *bio; + unsigned int j; + DECLARE_COMPLETION_ONSTACK(complete); + + bio = bio_alloc(GFP_NOFS, num_pages - i); + if (!bio) { + printk(KERN_INFO + "btrfsic: bio_alloc() for %u pages failed!\n", + num_pages - i); + return -1; + } + bio->bi_bdev = block_ctx->dev->bdev; + bio->bi_sector = dev_bytenr >> 9; + bio->bi_end_io = btrfsic_complete_bio_end_io; + bio->bi_private = &complete; + + for (j = i; j < num_pages; j++) { + ret = bio_add_page(bio, block_ctx->pagev[j], + PAGE_CACHE_SIZE, 0); + if (PAGE_CACHE_SIZE != ret) + break; + } + if (j == i) { + printk(KERN_INFO + "btrfsic: error, failed to add a single page!\n"); + return -1; + } + submit_bio(READ, bio); + + /* this will also unplug the queue */ + wait_for_completion(&complete); + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + printk(KERN_INFO + "btrfsic: read error at logical %llu dev %s!\n", + block_ctx->start, block_ctx->dev->name); + bio_put(bio); + return -1; + } + bio_put(bio); + dev_bytenr += (j - i) * PAGE_CACHE_SIZE; + i = j; + } + for (i = 0; i < num_pages; i++) { + block_ctx->datav[i] = kmap(block_ctx->pagev[i]); + if (!block_ctx->datav[i]) { + printk(KERN_INFO "btrfsic: kmap() failed (dev %s)!\n", + block_ctx->dev->name); + return -1; + } + } return block_ctx->len; } +static void btrfsic_complete_bio_end_io(struct bio *bio, int err) +{ + complete((struct completion *)bio->bi_private); +} + static void btrfsic_dump_database(struct btrfsic_state *state) { struct list_head *elem_all; @@ -1617,32 +1800,39 @@ static void btrfsic_dump_database(struct btrfsic_state *state) * (note that this test fails for the super block) */ static int btrfsic_test_for_metadata(struct btrfsic_state *state, - const u8 *data, unsigned int size) + char **datav, unsigned int num_pages) { struct btrfs_header *h; u8 csum[BTRFS_CSUM_SIZE]; u32 crc = ~(u32)0; - int 
fail = 0; - int crc_fail = 0; + unsigned int i; - h = (struct btrfs_header *)data; + if (num_pages * PAGE_CACHE_SIZE < state->metablock_size) + return 1; /* not metadata */ + num_pages = state->metablock_size >> PAGE_CACHE_SHIFT; + h = (struct btrfs_header *)datav[0]; if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) - fail++; + return 1; - crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE); + for (i = 0; i < num_pages; i++) { + u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); + size_t sublen = i ? PAGE_CACHE_SIZE : + (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE); + + crc = crc32c(crc, data, sublen); + } btrfs_csum_final(crc, csum); if (memcmp(csum, h->csum, state->csum_size)) - crc_fail++; + return 1; - return fail || crc_fail; + return 0; /* is metadata */ } static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, - u8 *mapped_data, unsigned int len, - struct bio *bio, - int *bio_is_patched, + u64 dev_bytenr, char **mapped_datav, + unsigned int num_pages, + struct bio *bio, int *bio_is_patched, struct buffer_head *bh, int submit_bio_bh_rw) { @@ -1652,12 +1842,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, int ret; struct btrfsic_state *state = dev_state->state; struct block_device *bdev = dev_state->bdev; + unsigned int processed_len; - WARN_ON(len > PAGE_SIZE); - is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len)); if (NULL != bio_is_patched) *bio_is_patched = 0; +again: + if (num_pages == 0) + return; + + processed_len = 0; + is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav, + num_pages)); + block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, &state->block_hashtable); if (NULL != block) { @@ -1667,8 +1864,16 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, if (block->is_superblock) { bytenr = le64_to_cpu(((struct btrfs_super_block *) - mapped_data)->bytenr); + 
mapped_datav[0])->bytenr); + if (num_pages * PAGE_CACHE_SIZE < + BTRFS_SUPER_INFO_SIZE) { + printk(KERN_INFO + "btrfsic: cannot work with too short bios!\n"); + return; + } is_metadata = 1; + BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1)); + processed_len = BTRFS_SUPER_INFO_SIZE; if (state->print_mask & BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { printk(KERN_INFO @@ -1678,12 +1883,18 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, } if (is_metadata) { if (!block->is_superblock) { + if (num_pages * PAGE_CACHE_SIZE < + state->metablock_size) { + printk(KERN_INFO + "btrfsic: cannot work with too short bios!\n"); + return; + } + processed_len = state->metablock_size; bytenr = le64_to_cpu(((struct btrfs_header *) - mapped_data)->bytenr); + mapped_datav[0])->bytenr); btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, - dev_bytenr, - mapped_data); + dev_bytenr); } if (block->logical_bytenr != bytenr) { printk(KERN_INFO @@ -1710,6 +1921,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, block->mirror_num, btrfsic_get_block_type(state, block)); } else { + if (num_pages * PAGE_CACHE_SIZE < + state->datablock_size) { + printk(KERN_INFO + "btrfsic: cannot work with too short bios!\n"); + return; + } + processed_len = state->datablock_size; bytenr = block->logical_bytenr; if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO @@ -1747,7 +1965,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, le64_to_cpu(block->disk_key.offset), (unsigned long long) le64_to_cpu(((struct btrfs_header *) - mapped_data)->generation), + mapped_datav[0])->generation), (unsigned long long) state->max_superblock_generation); btrfsic_dump_tree(state); @@ -1765,10 +1983,10 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, (unsigned long long)block->generation, (unsigned long long) le64_to_cpu(((struct btrfs_header *) - mapped_data)->generation)); 
+ mapped_datav[0])->generation)); /* it would not be safe to go on */ btrfsic_dump_tree(state); - return; + goto continue_loop; } /* @@ -1796,18 +2014,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, } if (block->is_superblock) - ret = btrfsic_map_superblock(state, bytenr, len, + ret = btrfsic_map_superblock(state, bytenr, + processed_len, bdev, &block_ctx); else - ret = btrfsic_map_block(state, bytenr, len, + ret = btrfsic_map_block(state, bytenr, processed_len, &block_ctx, 0); if (ret) { printk(KERN_INFO "btrfsic: btrfsic_map_block(root @%llu)" " failed!\n", (unsigned long long)bytenr); - return; + goto continue_loop; } - block_ctx.data = mapped_data; + block_ctx.datav = mapped_datav; /* the following is required in case of writes to mirrors, * use the same that was used for the lookup */ block_ctx.dev = dev_state; @@ -1863,11 +2082,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, block->logical_bytenr = bytenr; block->is_metadata = 1; if (block->is_superblock) { + BUG_ON(PAGE_CACHE_SIZE != + BTRFS_SUPER_INFO_SIZE); ret = btrfsic_process_written_superblock( state, block, (struct btrfs_super_block *) - mapped_data); + mapped_datav[0]); if (state->print_mask & BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { printk(KERN_INFO @@ -1880,8 +2101,6 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, state, block, &block_ctx, - (struct btrfs_header *) - block_ctx.data, 0, 0); } if (ret) @@ -1912,26 +2131,30 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, u64 bytenr; if (!is_metadata) { + processed_len = state->datablock_size; if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "Written block (%s/%llu/?)" " !found in hash table, D.\n", dev_state->name, (unsigned long long)dev_bytenr); - if (!state->include_extent_data) - return; /* ignore that written D block */ + if (!state->include_extent_data) { + /* ignore that written D 
block */ + goto continue_loop; + } /* this is getting ugly for the * include_extent_data case... */ bytenr = 0; /* unknown */ block_ctx.start = bytenr; - block_ctx.len = len; - block_ctx.bh = NULL; + block_ctx.len = processed_len; + block_ctx.mem_to_free = NULL; + block_ctx.pagev = NULL; } else { + processed_len = state->metablock_size; bytenr = le64_to_cpu(((struct btrfs_header *) - mapped_data)->bytenr); + mapped_datav[0])->bytenr); btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, - dev_bytenr, - mapped_data); + dev_bytenr); if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "Written block @%llu (%s/%llu/?)" @@ -1940,17 +2163,17 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, dev_state->name, (unsigned long long)dev_bytenr); - ret = btrfsic_map_block(state, bytenr, len, &block_ctx, - 0); + ret = btrfsic_map_block(state, bytenr, processed_len, + &block_ctx, 0); if (ret) { printk(KERN_INFO "btrfsic: btrfsic_map_block(root @%llu)" " failed!\n", (unsigned long long)dev_bytenr); - return; + goto continue_loop; } } - block_ctx.data = mapped_data; + block_ctx.datav = mapped_datav; /* the following is required in case of writes to mirrors, * use the same that was used for the lookup */ block_ctx.dev = dev_state; @@ -1960,7 +2183,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, if (NULL == block) { printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); btrfsic_release_block_ctx(&block_ctx); - return; + goto continue_loop; } block->dev_state = dev_state; block->dev_bytenr = dev_bytenr; @@ -2020,9 +2243,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, if (is_metadata) { ret = btrfsic_process_metablock(state, block, - &block_ctx, - (struct btrfs_header *) - block_ctx.data, 0, 0); + &block_ctx, 0, 0); if (ret) printk(KERN_INFO "btrfsic: process_metablock(root @%llu)" @@ -2031,6 +2252,13 @@ static void btrfsic_process_written_block(struct 
btrfsic_dev_state *dev_state, } btrfsic_release_block_ctx(&block_ctx); } + +continue_loop: + BUG_ON(!processed_len); + dev_bytenr += processed_len; + mapped_datav += processed_len >> PAGE_CACHE_SHIFT; + num_pages -= processed_len >> PAGE_CACHE_SHIFT; + goto again; } static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) @@ -2213,7 +2441,7 @@ static int btrfsic_process_written_superblock( num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); + next_bytenr, BTRFS_SUPER_INFO_SIZE); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", (unsigned long long)next_bytenr, num_copies); @@ -2224,7 +2452,8 @@ static int btrfsic_process_written_superblock( printk(KERN_INFO "btrfsic_process_written_superblock(" "mirror_num=%d)\n", mirror_num); - ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + ret = btrfsic_map_block(state, next_bytenr, + BTRFS_SUPER_INFO_SIZE, &tmp_next_block_ctx, mirror_num); if (ret) { @@ -2689,7 +2918,7 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add( static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, u64 bytenr, struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, char *data) + u64 dev_bytenr) { int num_copies; int mirror_num; @@ -2698,10 +2927,10 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, int match = 0; num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, - bytenr, PAGE_SIZE); + bytenr, state->metablock_size); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, + ret = btrfsic_map_block(state, bytenr, state->metablock_size, &block_ctx, mirror_num); if (ret) { printk(KERN_INFO "btrfsic:" @@ -2727,7 +2956,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, (unsigned long long)bytenr, dev_state->name, (unsigned long long)dev_bytenr); for (mirror_num = 1; mirror_num <= 
num_copies; mirror_num++) { - ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, + ret = btrfsic_map_block(state, bytenr, + state->metablock_size, &block_ctx, mirror_num); if (ret) continue; @@ -2781,13 +3011,13 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh) (unsigned long)bh->b_size, bh->b_data, bh->b_bdev); btrfsic_process_written_block(dev_state, dev_bytenr, - bh->b_data, bh->b_size, NULL, + &bh->b_data, 1, NULL, NULL, bh, rw); } else if (NULL != dev_state && (rw & REQ_FLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO - "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n", + "submit_bh(rw=0x%x FLUSH, bdev=%p)\n", rw, bh->b_bdev); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & @@ -2836,6 +3066,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio) unsigned int i; u64 dev_bytenr; int bio_is_patched; + char **mapped_datav; dev_bytenr = 512 * bio->bi_sector; bio_is_patched = 0; @@ -2848,35 +3079,46 @@ void btrfsic_submit_bio(int rw, struct bio *bio) (unsigned long long)dev_bytenr, bio->bi_bdev); + mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt, + GFP_NOFS); + if (!mapped_datav) + goto leave; for (i = 0; i < bio->bi_vcnt; i++) { - u8 *mapped_data; - - mapped_data = kmap(bio->bi_io_vec[i].bv_page); + BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE); + mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page); + if (!mapped_datav[i]) { + while (i > 0) { + i--; + kunmap(bio->bi_io_vec[i].bv_page); + } + kfree(mapped_datav); + goto leave; + } if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | BTRFSIC_PRINT_MASK_VERBOSE) == (dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | BTRFSIC_PRINT_MASK_VERBOSE))) printk(KERN_INFO - "#%u: page=%p, mapped=%p, len=%u," - " offset=%u\n", + "#%u: page=%p, len=%u, offset=%u\n", i, bio->bi_io_vec[i].bv_page, - mapped_data, bio->bi_io_vec[i].bv_len, bio->bi_io_vec[i].bv_offset); - btrfsic_process_written_block(dev_state, dev_bytenr, - 
mapped_data, - bio->bi_io_vec[i].bv_len, - bio, &bio_is_patched, - NULL, rw); - kunmap(bio->bi_io_vec[i].bv_page); - dev_bytenr += bio->bi_io_vec[i].bv_len; } + btrfsic_process_written_block(dev_state, dev_bytenr, + mapped_datav, bio->bi_vcnt, + bio, &bio_is_patched, + NULL, rw); + while (i > 0) { + i--; + kunmap(bio->bi_io_vec[i].bv_page); + } + kfree(mapped_datav); } else if (NULL != dev_state && (rw & REQ_FLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO - "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n", + "submit_bio(rw=0x%x FLUSH, bdev=%p)\n", rw, bio->bi_bdev); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & @@ -2903,6 +3145,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio) bio->bi_end_io = btrfsic_bio_end_io; } } +leave: mutex_unlock(&btrfsic_mutex); submit_bio(rw, bio); @@ -2917,6 +3160,30 @@ int btrfsic_mount(struct btrfs_root *root, struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; + if (root->nodesize != root->leafsize) { + printk(KERN_INFO + "btrfsic: cannot handle nodesize %d != leafsize %d!\n", + root->nodesize, root->leafsize); + return -1; + } + if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) { + printk(KERN_INFO + "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", + root->nodesize, (unsigned long)PAGE_CACHE_SIZE); + return -1; + } + if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) { + printk(KERN_INFO + "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", + root->leafsize, (unsigned long)PAGE_CACHE_SIZE); + return -1; + } + if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) { + printk(KERN_INFO + "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", + root->sectorsize, (unsigned long)PAGE_CACHE_SIZE); + return -1; + } state = kzalloc(sizeof(*state), GFP_NOFS); if (NULL == state) { printk(KERN_INFO "btrfs check-integrity: kmalloc() 
failed!\n"); @@ -2933,6 +3200,8 @@ int btrfsic_mount(struct btrfs_root *root, state->print_mask = print_mask; state->include_extent_data = including_extent_data; state->csum_size = 0; + state->metablock_size = root->nodesize; + state->datablock_size = root->sectorsize; INIT_LIST_HEAD(&state->all_blocks_list); btrfsic_block_hashtable_init(&state->block_hashtable); btrfsic_block_link_hashtable_init(&state->block_link_hashtable); From a25c75d5ad04df0a7abd09585231b4021a91a358 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 18 Apr 2012 09:59:29 +0300 Subject: [PATCH 07/37] Btrfs: cleanup: use consistent lock naming It confuses Smatch that we use two names for the same lock. Plus the shorter name is nicer. This doesn't change how the code works, it's just a cleanup. Signed-off-by: Dan Carpenter --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 49fd7b66d57..59ae191d4f9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3578,7 +3578,7 @@ again: space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); out: - mutex_unlock(&extent_root->fs_info->chunk_mutex); + mutex_unlock(&fs_info->chunk_mutex); return ret; } From 0c4d2d95d06e920e0c61707e62c7fffc9c57f63a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 5 Apr 2012 15:03:02 -0400 Subject: [PATCH 08/37] Btrfs: use i_version instead of our own sequence We've been keeping around the inode sequence number in hopes that somebody would use it, but nobody uses it and people actually use i_version which serves the same purpose, so use i_version where we used the incore inode's sequence number and that way the sequence is updated properly across the board, and not just in file write. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 3 --- fs/btrfs/delayed-inode.c | 4 ++-- fs/btrfs/file.c | 1 - fs/btrfs/inode.c | 16 +++++++++++++--- fs/btrfs/ioctl.c | 2 ++ fs/btrfs/super.c | 2 +- fs/btrfs/xattr.c | 1 + 7 files changed, 19 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 9b9b15fd520..3771b8543a7 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -83,9 +83,6 @@ struct btrfs_inode { */ u64 generation; - /* sequence number for NFS changes */ - u64 sequence; - /* * transid of the trans_handle that last modified this inode */ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 03e3748d84d..bcd40c7109f 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1706,7 +1706,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); btrfs_set_stack_inode_generation(inode_item, BTRFS_I(inode)->generation); - btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence); + btrfs_set_stack_inode_sequence(inode_item, inode->i_version); btrfs_set_stack_inode_transid(inode_item, trans->transid); btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); @@ -1754,7 +1754,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); - BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); + inode->i_version = btrfs_stack_inode_sequence(inode_item); inode->i_rdev = 0; *rdev = btrfs_stack_inode_rdev(inode_item); BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 53bf2d764bb..8aa8d7fe74d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1409,7 +1409,6 
@@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, mutex_unlock(&inode->i_mutex); goto out; } - BTRFS_I(inode)->sequence++; start_pos = round_down(pos, root->sectorsize); if (start_pos > i_size_read(inode)) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 61b16c641ce..41a62e6954c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2510,7 +2510,7 @@ static void btrfs_read_locked_inode(struct inode *inode) inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); - BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); + inode->i_version = btrfs_inode_sequence(leaf, inode_item); inode->i_generation = BTRFS_I(inode)->generation; inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); @@ -2594,7 +2594,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); - btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); + btrfs_set_inode_sequence(leaf, item, inode->i_version); btrfs_set_inode_transid(leaf, item, trans->transid); btrfs_set_inode_rdev(leaf, item, inode->i_rdev); btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); @@ -2752,6 +2752,8 @@ err: goto out; btrfs_i_size_write(dir, dir->i_size - name_len * 2); + inode_inc_iversion(inode); + inode_inc_iversion(dir); inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; btrfs_update_inode(trans, root, dir); out: @@ -3089,6 +3091,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } btrfs_i_size_write(dir, dir->i_size - name_len * 2); + inode_inc_iversion(dir); dir->i_mtime = dir->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, dir); if (ret) @@ -3638,6 +3641,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid) { setattr_copy(inode, attr); + inode_inc_iversion(inode); err = 
btrfs_dirty_inode(inode); if (!err && attr->ia_valid & ATTR_MODE) @@ -4730,6 +4734,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, btrfs_i_size_write(parent_inode, parent_inode->i_size + name_len * 2); + inode_inc_iversion(parent_inode); parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, parent_inode); if (ret) @@ -4937,6 +4942,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } btrfs_inc_nlink(inode); + inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; ihold(inode); @@ -6894,7 +6900,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->root = NULL; ei->space_info = NULL; ei->generation = 0; - ei->sequence = 0; ei->last_trans = 0; ei->last_sub_trans = 0; ei->logged_trans = 0; @@ -7193,6 +7198,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode)) btrfs_add_ordered_operation(trans, root, old_inode); + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); + inode_inc_iversion(old_inode); old_dir->i_ctime = old_dir->i_mtime = ctime; new_dir->i_ctime = new_dir->i_mtime = ctime; old_inode->i_ctime = ctime; @@ -7219,6 +7227,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, } if (new_inode) { + inode_inc_iversion(new_inode); new_inode->i_ctime = CURRENT_TIME; if (unlikely(btrfs_ino(new_inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { @@ -7490,6 +7499,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, cur_offset += ins.offset; *alloc_hint = ins.objectid + ins.offset; + inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 14f8e1faa46..fccde7402cf 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -261,6 +261,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } 
btrfs_update_iflags(inode); + inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, inode); @@ -2622,6 +2623,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); + inode_inc_iversion(inode); inode->i_mtime = inode->i_ctime = CURRENT_TIME; /* diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index c5f8fca4195..bd6d143cea8 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -769,7 +769,7 @@ static int btrfs_fill_super(struct super_block *sb, #ifdef CONFIG_BTRFS_FS_POSIX_ACL sb->s_flags |= MS_POSIXACL; #endif - + sb->s_flags |= MS_I_VERSION; err = open_ctree(sb, fs_devices, (char *)data); if (err) { printk("btrfs: open_ctree failed\n"); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index e7a5659087e..3f4e2d69e83 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -196,6 +196,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, if (ret) goto out; + inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); From 30f8fe3e47c5bb5715aa80b2a2fa0cab8b218fae Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 23 Apr 2012 13:55:30 -0400 Subject: [PATCH 09/37] Btrfs: cache no acl on new inodes When running compilebench I noticed we were spending some time looking up acls on new inodes, which shouldn't be happening since there were no acls. This is because when we init acls on the inode after creating them we don't cache the fact there are no acls if there aren't any. Doing this adds a little bit of a bump to my compilebench runs. 
Thanks, Btrfs: cache no acl on new inodes Signed-off-by: Josef Bacik --- fs/btrfs/acl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 89b156d85d6..761e2cd8fed 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -227,7 +227,11 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans, if (ret > 0) { /* we need an acl */ ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS); + } else { + cache_no_acl(inode); } + } else { + cache_no_acl(inode); } failed: posix_acl_release(acl); From d7dbe9e7f64e72ec6548658857c5d92327a73633 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 23 Apr 2012 14:00:51 -0400 Subject: [PATCH 10/37] Btrfs: fix compile warnings in extent_io.c These warnings are bogus since we will always have at least one page in an eb, but to make the compiler happy just set ret = 0 in these two cases. Thanks, Btrfs: fix compile warnings in extent_io.c These warnings are bogus since we will always have at least one page in an eb, but to make the compiler happy just set ret = 0 in these two cases. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 82b4829f00b..836fc37a437 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3163,7 +3163,7 @@ static int write_one_eb(struct extent_buffer *eb, u64 offset = eb->start; unsigned long i, num_pages; int rw = (epd->sync_io ? WRITE_SYNC : WRITE); - int ret; + int ret = 0; clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); From 551ebb2d34304ee2abfe6b00d39ec65d5e4e8266 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 23 Apr 2012 14:41:09 -0400 Subject: [PATCH 11/37] Btrfs: remove useless waiting and extra filemap work In btrfs_wait_ordered_range we have been calling filemap_fdata_write() twice because compression does strange things and then waiting. 
Then we look up ordered extents and if we find any we will always schedule_timeout(); once and then loop back around and do it all again. We will even check to see if there is delalloc pages on this range and loop again. So this patch gets rid of the multiple fdata_write() calls and just does filemap_write_and_wait(). In the case of compression we will still find the ordered extents and start those individually if we need to so that is ok, but in the normal buffered case we avoid all this weird overhead. Then in the case of the schedule_timeout(1), we don't need it. All callers either 1) don't care, they just want to make sure what they just wrote makes it to disk or 2) are doing the lock()->lookup ordered->unlock->flush thing in which case it will lock and check for ordered extents _anyway_ so get back to them as quickly as possible. The delalloc check is simply not needed, this only catches the case where we write to the file again since doing the filemap_write_and_wait() and if the caller truly cares about that it will take care of everything itself. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ordered-data.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index bbf6d0d9aeb..9807750c625 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -621,19 +621,11 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) if (orig_end > INT_LIMIT(loff_t)) orig_end = INT_LIMIT(loff_t); } -again: + /* start IO across the range first to instantiate any delalloc * extents */ - filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - - /* The compression code will leave pages locked but return from - * writepage without setting the page writeback. Starting again - * with WB_SYNC_ALL will end up waiting for the IO to actually start.
- */ - filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - - filemap_fdatawait_range(inode->i_mapping, start, orig_end); + filemap_write_and_wait_range(inode->i_mapping, start, orig_end); end = orig_end; found = 0; @@ -657,11 +649,6 @@ again: break; end--; } - if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, - EXTENT_DELALLOC, 0, NULL)) { - schedule_timeout(1); - goto again; - } } /* From 0885ef5b5601e9b007c383e77c172769b1f214fd Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 23 Apr 2012 15:09:39 -0400 Subject: [PATCH 12/37] Btrfs: do not do filemap_write_and_wait_range in fsync We already do the btrfs_wait_ordered_range which will do this for us, so just remove this call so we don't call it twice. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8aa8d7fe74d..cfc0ab915d0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1497,14 +1497,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_btrfs_sync_file(file, datasync); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; mutex_lock(&inode->i_mutex); - /* we wait first, since the writeback may change the inode */ + /* + * we wait first, since the writeback may change the inode, also wait + * ordered range does a filemape_write_and_wait_range which is why we + * don't do it above like other file systems. 
+ */ root->log_batch++; - btrfs_wait_ordered_range(inode, 0, (u64)-1); + btrfs_wait_ordered_range(inode, start, end); root->log_batch++; /* From 0d2450abfa359ff94a2bee64a7daeba68c346c81 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Tue, 24 Apr 2012 22:59:16 +0300 Subject: [PATCH 13/37] btrfs: allow changing 'thread_pool' size at remount time Changing 'mount -oremount,thread_pool=2 /' didn't have any effect: maximum amount of worker threads is specified in 2 places: - in 'struct btrfs_fs_info::thread_pool_size' - in each worker struct: 'struct btrfs_workers::max_workers' 'mount -oremount' updated only 'btrfs_fs_info::thread_pool_size'. Fix it by pushing new maximum value to all created worker structures as well. Cc: Josef Bacik Cc: Chris Mason Reviewed-by: Josef Bacik Signed-off-by: Sergei Trofimovich --- fs/btrfs/super.c | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index bd6d143cea8..2cd32175753 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -435,11 +435,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_thread_pool: intarg = 0; match_int(&args[0], &intarg); - if (intarg) { + if (intarg) info->thread_pool_size = intarg; - printk(KERN_INFO "btrfs: thread pool %d\n", - info->thread_pool_size); - } break; case Opt_max_inline: num = match_strdup(&args[0]); @@ -1118,6 +1115,40 @@ error_fs_info: return ERR_PTR(error); } +static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit) +{ + spin_lock_irq(&workers->lock); + workers->max_workers = new_limit; + spin_unlock_irq(&workers->lock); +} + +static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, + int new_pool_size, int old_pool_size) +{ + if (new_pool_size == old_pool_size) + return; + + fs_info->thread_pool_size = new_pool_size; + + printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n", + old_pool_size, new_pool_size); + +
btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); + btrfs_set_max_workers(&fs_info->workers, new_pool_size); + btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); + btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size); +} + static int btrfs_remount(struct super_block *sb, int *flags, char *data) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -1137,6 +1168,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } + btrfs_resize_thread_pool(fs_info, + fs_info->thread_pool_size, old_thread_pool_size); + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; @@ -1180,7 +1214,8 @@ restore: fs_info->compress_type = old_compress_type; fs_info->max_inline = old_max_inline; fs_info->alloc_start = old_alloc_start; - fs_info->thread_pool_size = old_thread_pool_size; + btrfs_resize_thread_pool(fs_info, + old_thread_pool_size, fs_info->thread_pool_size); fs_info->metadata_ratio = old_metadata_ratio; return ret; } From 2eec6c8102c62c540c637176271cfdb13d828d7b Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Thu, 26 Apr 2012 00:37:14 +0800 Subject: [PATCH 14/37] Fix minor type issues Address some minor type issues identified by sparse checker. 
Signed-off-by: Daniel J Blueman --- fs/btrfs/ioctl.c | 2 +- fs/btrfs/ulist.c | 4 ++-- fs/btrfs/ulist.h | 5 ++--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fccde7402cf..9ebb2c7145a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2916,7 +2916,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) up_read(&info->groups_sem); } - user_dest = (struct btrfs_ioctl_space_info *) + user_dest = (struct btrfs_ioctl_space_info __user *) (arg + sizeof(struct btrfs_ioctl_space_args)); if (copy_to_user(user_dest, dest_orig, alloc_size)) diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 12f5147bd2b..ad993bc2df9 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -95,7 +95,7 @@ EXPORT_SYMBOL(ulist_reinit); * * The allocated ulist will be returned in an initialized state. */ -struct ulist *ulist_alloc(unsigned long gfp_mask) +struct ulist *ulist_alloc(gfp_t gfp_mask) { struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask); @@ -144,7 +144,7 @@ EXPORT_SYMBOL(ulist_free); * unaltered. 
*/ int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, - unsigned long gfp_mask) + gfp_t gfp_mask) { int i; diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 2e25dec58ec..6568c352773 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -59,10 +59,9 @@ struct ulist { void ulist_init(struct ulist *ulist); void ulist_fini(struct ulist *ulist); void ulist_reinit(struct ulist *ulist); -struct ulist *ulist_alloc(unsigned long gfp_mask); +struct ulist *ulist_alloc(gfp_t gfp_mask); void ulist_free(struct ulist *ulist); -int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, - unsigned long gfp_mask); +int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, gfp_t gfp_mask); struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev); #endif From f07c9a79f06cd33b1c9c2c4eacb60bafa7e3f310 Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Thu, 26 Apr 2012 18:35:12 +0200 Subject: [PATCH 15/37] Btrfs: avoid buffer overrun in btrfs_printk The buffer read-overrun would be triggered by a printk format starting with <N>, where N is a single digit. NUL-terminate after strncpy. Use memcpy, not strncpy, since we know the string we're copying fits in the destination buffer and contains no NUL byte. Signed-off-by: Jim Meyering --- fs/btrfs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2cd32175753..46b26650415 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -188,7 +188,8 @@ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
va_start(args, fmt); if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') { - strncpy(lvl, fmt, 3); + memcpy(lvl, fmt, 3); + lvl[3] = '\0'; fmt += 3; type = logtypes[fmt[1] - '0']; } else From a27202fbe92b12eec895c36644440175de01d7a6 Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Thu, 26 Apr 2012 18:36:56 +0200 Subject: [PATCH 16/37] Btrfs: NUL-terminate path buffer in DEV_INFO ioctl result A device with name of length BTRFS_DEVICE_PATH_NAME_MAX or longer would not be NUL-terminated in the DEV_INFO ioctl result buffer. Signed-off-by: Jim Meyering --- fs/btrfs/ioctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9ebb2c7145a..3d8ab27622c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2263,10 +2263,12 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) di_args->bytes_used = dev->bytes_used; di_args->total_bytes = dev->total_bytes; memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); - if (dev->name) + if (dev->name) { strncpy(di_args->path, dev->name, sizeof(di_args->path)); - else + di_args->path[sizeof(di_args->path) - 1] = 0; + } else { di_args->path[0] = '\0'; + } out: if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) From f60d16a8923201bb27ad7c09016abab2818cf8ce Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Wed, 25 Apr 2012 21:24:17 +0200 Subject: [PATCH 17/37] Btrfs: avoid buffer overrun in mount option handling There is an off-by-one error: allocating room for a maximal result string but without room for a trailing NUL. That, can lead to returning a transformed string that is not NUL-terminated, and then to a caller reading beyond end of the malloc'd buffer. Rewrite to s/kzalloc/kmalloc/, remove unwarranted use of strncpy (the result is guaranteed to fit), remove dead strlen at end, and change a few variable names and comments. 
Reviewed-by: Josef Bacik Signed-off-by: Jim Meyering --- fs/btrfs/super.c | 67 +++++++++++++++++++----------------------------- 1 file changed, 26 insertions(+), 41 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 46b26650415..96eb9fef7bd 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -923,63 +923,48 @@ static inline int is_subvolume_inode(struct inode *inode) */ static char *setup_root_args(char *args) { - unsigned copied = 0; - unsigned len = strlen(args) + 2; - char *pos; - char *ret; + unsigned len = strlen(args) + 2 + 1; + char *src, *dst, *buf; /* - * We need the same args as before, but minus + * We need the same args as before, but with this substitution: + * s!subvol=[^,]+!subvolid=0! * - * subvol=a - * - * and add - * - * subvolid=0 - * - * which is a difference of 2 characters, so we allocate strlen(args) + - * 2 characters. + * Since the replacement string is up to 2 bytes longer than the + * original, allocate strlen(args) + 2 + 1 bytes. */ - ret = kzalloc(len * sizeof(char), GFP_NOFS); - if (!ret) - return NULL; - pos = strstr(args, "subvol="); + src = strstr(args, "subvol="); /* This shouldn't happen, but just in case.. */ - if (!pos) { - kfree(ret); + if (!src) + return NULL; + + buf = dst = kmalloc(len, GFP_NOFS); + if (!buf) return NULL; - } /* - * The subvol=<> arg is not at the front of the string, copy everybody - * up to that into ret. + * If the subvol= arg is not at the start of the string, + * copy whatever precedes it into buf. */ - if (pos != args) { - *pos = '\0'; - strcpy(ret, args); - copied += strlen(args); - pos++; + if (src != args) { + *src++ = '\0'; + strcpy(buf, args); + dst += strlen(args); } - strncpy(ret + copied, "subvolid=0", len - copied); - - /* Length of subvolid=0 */ - copied += 10; + strcpy(dst, "subvolid=0"); + dst += strlen("subvolid=0"); /* - * If there is no , after the subvol= option then we know there's no - * other options and we can just return. 
+ * If there is a "," after the original subvol=... string, + * copy that suffix into our buffer. Otherwise, we're done. */ - pos = strchr(pos, ','); - if (!pos) - return ret; + src = strchr(src, ','); + if (src) + strcpy(dst, src); - /* Copy the rest of the arguments into our buffer */ - strncpy(ret + copied, pos, len - copied); - copied += strlen(pos); - - return ret; + return buf; } static struct dentry *mount_subvol(const char *subvol_name, int flags, From 4e89915220e2f1341c757b610d0f0c3821f3a65f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 2 May 2012 13:52:09 -0400 Subject: [PATCH 18/37] Btrfs: do not check delalloc when updating disk_i_size We are checking delalloc to see if it is ok to update the i_size. There are 2 cases it stops us from updating 1) If there is delalloc between our current disk_i_size and this ordered extent 2) If there is delalloc between our current ordered extent and the next ordered extent These tests are racy however since we can set delalloc for these ranges at any time. Also for the first case if we notice there is delalloc between disk_i_size and our ordered extent we will not update disk_i_size and assume that when that delalloc bit gets written out it will update everything properly. However if we crash before that we will have file extents outside of our i_size, which is not good, so this test is dangerous as well as racy. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ordered-data.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 9807750c625..9565c028916 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -751,7 +751,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered) { struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; u64 disk_i_size; u64 new_i_size; u64 i_size_test; @@ -784,14 +783,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, goto out; } - /* - * we can't update the disk_isize if there are delalloc bytes - * between disk_i_size and this ordered extent - */ - if (test_range_bit(io_tree, disk_i_size, offset - 1, - EXTENT_DELALLOC, 0, NULL)) { - goto out; - } /* * walk backward from this ordered extent to disk_i_size. * if we find an ordered extent then we can't update disk i_size @@ -853,15 +844,11 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, /* * i_size_test is the end of a region after this ordered - * extent where there are no ordered extents. As long as there - * are no delalloc bytes in this area, it is safe to update - * disk_i_size to the end of the region. + * extent where there are no ordered extents, we can safely set + * disk_i_size to this. 
*/ - if (i_size_test > offset && - !test_range_bit(io_tree, offset, i_size_test - 1, - EXTENT_DELALLOC, 0, NULL)) { + if (i_size_test > offset) new_i_size = min_t(u64, i_size_test, i_size); - } BTRFS_I(inode)->disk_i_size = new_i_size; ret = 0; out: From 5fd02043553b02867b29de1ac9fff2ec16b84def Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 2 May 2012 14:00:54 -0400 Subject: [PATCH 19/37] Btrfs: finish ordered extents in their own thread We noticed that the ordered extent completion doesn't really rely on having a page and that it could be done independently of ending the writeback on a page. This patch makes us not do the threaded endio stuff for normal buffered writes and direct writes so we can end page writeback as soon as possible (in irq context) and only start threads to do the ordered work when it is actually done. Compression needs to be reworked some to take advantage of this as well, but atm it has to do a find_get_page in its endio handler so it must be done in its own thread. This makes direct writes quite a bit faster. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 12 --- fs/btrfs/extent_io.c | 15 +-- fs/btrfs/extent_io.h | 5 +- fs/btrfs/free-space-cache.c | 4 +- fs/btrfs/inode.c | 177 ++++++++++++++++-------------------- fs/btrfs/ordered-data.c | 129 ++++++++++++++------------ fs/btrfs/ordered-data.h | 13 ++- 7 files changed, 164 insertions(+), 191 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a7ffc88a7db..19f5b450f40 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3671,17 +3671,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) return 0; } -static int btree_writepage_io_failed_hook(struct bio *bio, struct page *page, - u64 start, u64 end, - struct extent_state *state) -{ - struct super_block *sb = page->mapping->host->i_sb; - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - btrfs_error(fs_info, -EIO, - "Error occured while writing out btree at %llu", start); - return -EIO; -} - static struct extent_io_ops btree_extent_io_ops = { .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, @@ -3689,5 +3678,4 @@ static struct extent_io_ops btree_extent_io_ops = { .submit_bio_hook = btree_submit_bio_hook, /* note we're sharing with inode.c for the merge bio hook */ .merge_bio_hook = btrfs_merge_bio_hook, - .writepage_io_failed_hook = btree_writepage_io_failed_hook, }; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 836fc37a437..7af93435cee 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1172,9 +1172,8 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, cached_state, mask); } -static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, - gfp_t mask) +int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask) { return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, cached_state, mask); @@ 
-2221,17 +2220,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) uptodate = 0; } - if (!uptodate && tree->ops && - tree->ops->writepage_io_failed_hook) { - ret = tree->ops->writepage_io_failed_hook(NULL, page, - start, end, NULL); - /* Writeback already completed */ - if (ret == 0) - return 1; - } - if (!uptodate) { - clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); ClearPageUptodate(page); SetPageError(page); } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b516c3b8dec..4d8124b6457 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -75,9 +75,6 @@ struct extent_io_ops { unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); - int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, - struct extent_state *state); int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int mirror); int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, @@ -225,6 +222,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); +int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask); int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 202008ec367..cecf8df6248 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -972,9 +972,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, goto out; - ret = filemap_write_and_wait(inode->i_mapping); - if (ret) - goto out; + 
btrfs_wait_ordered_range(inode, 0, (u64)-1); key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 41a62e6954c..9a1b96fd672 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -89,7 +89,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { static int btrfs_setsize(struct inode *inode, loff_t newsize); static int btrfs_truncate(struct inode *inode); -static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); +static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, @@ -1572,11 +1572,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, if (btrfs_is_free_space_inode(root, inode)) metadata = 2; - ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); - if (ret) - return ret; - if (!(rw & REQ_WRITE)) { + ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); + if (ret) + return ret; + if (bio_flags & EXTENT_BIO_COMPRESSED) { return btrfs_submit_compressed_read(inode, bio, mirror_num, bio_flags); @@ -1815,25 +1815,24 @@ out: * an ordered extent if the range of bytes in the file it covers are * fully written. 
*/ -static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) +static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) { + struct inode *inode = ordered_extent->inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans = NULL; - struct btrfs_ordered_extent *ordered_extent = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; int compress_type = 0; int ret; bool nolock; - ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, - end - start + 1); - if (!ret) - return 0; - BUG_ON(!ordered_extent); /* Logic error */ - nolock = btrfs_is_free_space_inode(root, inode); + if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { + ret = -EIO; + goto out; + } + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); @@ -1889,12 +1888,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->file_offset, ordered_extent->len); } - unlock_extent_cached(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1, &cached_state, GFP_NOFS); + if (ret < 0) { btrfs_abort_transaction(trans, root, ret); - goto out; + goto out_unlock; } add_pending_csums(trans, inode, ordered_extent->file_offset, @@ -1905,10 +1902,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, root, ret); - goto out; + goto out_unlock; } } ret = 0; +out_unlock: + unlock_extent_cached(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1, &cached_state, GFP_NOFS); out: if (root != root->fs_info->tree_root) btrfs_delalloc_release_metadata(inode, ordered_extent->len); @@ -1919,26 
+1920,57 @@ out: btrfs_end_transaction(trans, root); } + if (ret) + clear_extent_uptodate(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1, NULL, GFP_NOFS); + + /* + * This needs to be done to make sure anybody waiting knows we are done + * updating everything for this ordered extent. + */ + btrfs_remove_ordered_extent(inode, ordered_extent); + /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); - return 0; -out_unlock: - unlock_extent_cached(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1, &cached_state, GFP_NOFS); - goto out; + return ret; +} + +static void finish_ordered_fn(struct btrfs_work *work) +{ + struct btrfs_ordered_extent *ordered_extent; + ordered_extent = container_of(work, struct btrfs_ordered_extent, work); + btrfs_finish_ordered_io(ordered_extent); } static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate) { + struct inode *inode = page->mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ordered_extent *ordered_extent = NULL; + struct btrfs_workers *workers; + trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); ClearPagePrivate2(page); - return btrfs_finish_ordered_io(page->mapping->host, start, end); + if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, + end - start + 1, uptodate)) + return 0; + + ordered_extent->work.func = finish_ordered_fn; + ordered_extent->work.flags = 0; + + if (btrfs_is_free_space_inode(root, inode)) + workers = &root->fs_info->endio_freespace_worker; + else + workers = &root->fs_info->endio_write_workers; + btrfs_queue_worker(workers, &ordered_extent->work); + + return 0; } /* @@ -5909,9 +5941,7 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) struct btrfs_dio_private *dip = bio->bi_private; struct inode *inode = dip->inode; 
struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; struct btrfs_ordered_extent *ordered = NULL; - struct extent_state *cached_state = NULL; u64 ordered_offset = dip->logical_offset; u64 ordered_bytes = dip->bytes; int ret; @@ -5921,73 +5951,14 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, - ordered_bytes); + ordered_bytes, !err); if (!ret) goto out_test; - BUG_ON(!ordered); - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - err = -ENOMEM; - goto out; - } - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - - if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { - ret = btrfs_ordered_update_i_size(inode, 0, ordered); - if (!ret) - err = btrfs_update_inode_fallback(trans, root, inode); - goto out; - } - - lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset, - ordered->file_offset + ordered->len - 1, 0, - &cached_state); - - if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { - ret = btrfs_mark_extent_written(trans, inode, - ordered->file_offset, - ordered->file_offset + - ordered->len); - if (ret) { - err = ret; - goto out_unlock; - } - } else { - ret = insert_reserved_file_extent(trans, inode, - ordered->file_offset, - ordered->start, - ordered->disk_len, - ordered->len, - ordered->len, - 0, 0, 0, - BTRFS_FILE_EXTENT_REG); - unpin_extent_cache(&BTRFS_I(inode)->extent_tree, - ordered->file_offset, ordered->len); - if (ret) { - err = ret; - WARN_ON(1); - goto out_unlock; - } - } - - add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); - ret = btrfs_ordered_update_i_size(inode, 0, ordered); - if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) - btrfs_update_inode_fallback(trans, root, inode); - ret = 0; -out_unlock: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, - ordered->file_offset + ordered->len - 1, - &cached_state, GFP_NOFS); -out: - 
btrfs_delalloc_release_metadata(inode, ordered->len); - btrfs_end_transaction(trans, root); - ordered_offset = ordered->file_offset + ordered->len; - btrfs_put_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - + ordered->work.func = finish_ordered_fn; + ordered->work.flags = 0; + btrfs_queue_worker(&root->fs_info->endio_write_workers, + &ordered->work); out_test: /* * our bio might span multiple ordered extents. If we haven't @@ -5996,12 +5967,12 @@ out_test: if (ordered_offset < dip->logical_offset + dip->bytes) { ordered_bytes = dip->logical_offset + dip->bytes - ordered_offset; + ordered = NULL; goto again; } out_done: bio->bi_private = dip->private; - kfree(dip->csums); kfree(dip); /* If we had an error make sure to clear the uptodate flag */ @@ -6069,9 +6040,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, int ret; bio_get(bio); - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - if (ret) - goto err; + + if (!write) { + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + if (ret) + goto err; + } if (skip_sum) goto map; @@ -6491,13 +6465,13 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) static void btrfs_invalidatepage(struct page *page, unsigned long offset) { + struct inode *inode = page->mapping->host; struct extent_io_tree *tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_CACHE_SIZE - 1; - /* * we have the page locked, so new writeback can't start, * and the dirty bit won't be cleared while we are here. 
@@ -6507,13 +6481,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) */ wait_on_page_writeback(page); - tree = &BTRFS_I(page->mapping->host)->io_tree; + tree = &BTRFS_I(inode)->io_tree; if (offset) { btrfs_releasepage(page, GFP_NOFS); return; } lock_extent_bits(tree, page_start, page_end, 0, &cached_state); - ordered = btrfs_lookup_ordered_extent(page->mapping->host, + ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); if (ordered) { /* @@ -6528,9 +6502,10 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) * whoever cleared the private bit is responsible * for the finish_ordered_io */ - if (TestClearPagePrivate2(page)) { - btrfs_finish_ordered_io(page->mapping->host, - page_start, page_end); + if (TestClearPagePrivate2(page) && + btrfs_dec_test_ordered_pending(inode, &ordered, page_start, + PAGE_CACHE_SIZE, 1)) { + btrfs_finish_ordered_io(ordered); } btrfs_put_ordered_extent(ordered); cached_state = NULL; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 9565c028916..9e138cdc36c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -196,7 +196,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->len = len; entry->disk_len = disk_len; entry->bytes_left = len; - entry->inode = inode; + entry->inode = igrab(inode); entry->compress_type = compress_type; if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); @@ -212,12 +212,12 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, trace_btrfs_ordered_extent_add(inode, entry); - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); node = tree_insert(&tree->tree, file_offset, &entry->rb_node); if (node) ordered_data_tree_panic(inode, -EEXIST, file_offset); - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); list_add_tail(&entry->root_extent_list, 
@@ -264,9 +264,9 @@ void btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_inode_tree *tree; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); list_add_tail(&sum->list, &entry->list); - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); } /* @@ -283,18 +283,19 @@ void btrfs_add_ordered_sum(struct inode *inode, */ int btrfs_dec_test_first_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, - u64 *file_offset, u64 io_size) + u64 *file_offset, u64 io_size, int uptodate) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; int ret; + unsigned long flags; u64 dec_end; u64 dec_start; u64 to_dec; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irqsave(&tree->lock, flags); node = tree_search(tree, *file_offset); if (!node) { ret = 1; @@ -323,6 +324,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, (unsigned long long)to_dec); } entry->bytes_left -= to_dec; + if (!uptodate) + set_bit(BTRFS_ORDERED_IOERR, &entry->flags); + if (entry->bytes_left == 0) ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); else @@ -332,7 +336,7 @@ out: *cached = entry; atomic_inc(&entry->refs); } - spin_unlock(&tree->lock); + spin_unlock_irqrestore(&tree->lock, flags); return ret == 0; } @@ -347,15 +351,21 @@ out: */ int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size) + u64 file_offset, u64 io_size, int uptodate) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; + unsigned long flags; int ret; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irqsave(&tree->lock, flags); + if (cached && *cached) { + entry = *cached; + goto have_entry; + } + node = tree_search(tree, file_offset); if (!node) { ret = 1; @@ -363,6 +373,7 @@ int 
btrfs_dec_test_ordered_pending(struct inode *inode, } entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); +have_entry: if (!offset_in_entry(entry, file_offset)) { ret = 1; goto out; @@ -374,6 +385,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, (unsigned long long)io_size); } entry->bytes_left -= io_size; + if (!uptodate) + set_bit(BTRFS_ORDERED_IOERR, &entry->flags); + if (entry->bytes_left == 0) ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); else @@ -383,7 +397,7 @@ out: *cached = entry; atomic_inc(&entry->refs); } - spin_unlock(&tree->lock); + spin_unlock_irqrestore(&tree->lock, flags); return ret == 0; } @@ -399,6 +413,8 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) trace_btrfs_ordered_extent_put(entry->inode, entry); if (atomic_dec_and_test(&entry->refs)) { + if (entry->inode) + btrfs_add_delayed_iput(entry->inode); while (!list_empty(&entry->list)) { cur = entry->list.next; sum = list_entry(cur, struct btrfs_ordered_sum, list); @@ -411,21 +427,22 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) /* * remove an ordered extent from the tree. No references are dropped - * and you must wake_up entry->wait. You must hold the tree lock - * while you call this function. + * and waiters are woken up. 
*/ -static void __btrfs_remove_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry) +void btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) { struct btrfs_ordered_inode_tree *tree; struct btrfs_root *root = BTRFS_I(inode)->root; struct rb_node *node; tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); node = &entry->rb_node; rb_erase(node, &tree->tree); tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + spin_unlock_irq(&tree->lock); spin_lock(&root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); @@ -442,21 +459,6 @@ static void __btrfs_remove_ordered_extent(struct inode *inode, list_del_init(&BTRFS_I(inode)->ordered_operations); } spin_unlock(&root->fs_info->ordered_extent_lock); -} - -/* - * remove an ordered extent from the tree. No references are dropped - * but any waiters are woken. - */ -void btrfs_remove_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry) -{ - struct btrfs_ordered_inode_tree *tree; - - tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); - __btrfs_remove_ordered_extent(inode, entry); - spin_unlock(&tree->lock); wake_up(&entry->wait); } @@ -663,7 +665,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry = NULL; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) goto out; @@ -674,7 +676,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, if (entry) atomic_inc(&entry->refs); out: - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); return entry; } @@ -690,7 +692,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, struct btrfs_ordered_extent *entry = NULL; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); node = tree_search(tree, 
file_offset); if (!node) { node = tree_search(tree, file_offset + len); @@ -715,7 +717,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, out: if (entry) atomic_inc(&entry->refs); - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); return entry; } @@ -731,7 +733,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) struct btrfs_ordered_extent *entry = NULL; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) goto out; @@ -739,7 +741,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); atomic_inc(&entry->refs); out: - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); return entry; } @@ -765,7 +767,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, else offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); disk_i_size = BTRFS_I(inode)->disk_i_size; /* truncate file */ @@ -803,15 +805,18 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, } node = prev; } - while (node) { + for (; node; node = rb_prev(node)) { test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + + /* We treat this entry as if it doesnt exist */ + if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) + continue; if (test->file_offset + test->len <= disk_i_size) break; if (test->file_offset >= i_size) break; if (test->file_offset >= disk_i_size) goto out; - node = rb_prev(node); } new_i_size = min_t(u64, offset, i_size); @@ -829,17 +834,27 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, else node = rb_first(&tree->tree); } - i_size_test = 0; - if (node) { - /* - * do we have an area where IO might have finished - * between our ordered extent and the next one. 
- */ + + /* + * We are looking for an area between our current extent and the next + * ordered extent to update the i_size to. There are 3 cases here + * + * 1) We don't actually have anything and we can update to i_size. + * 2) We have stuff but they already did their i_size update so again we + * can just update to i_size. + * 3) We have an outstanding ordered extent so the most we can update + * our disk_i_size to is the start of the next offset. + */ + i_size_test = i_size; + for (; node; node = rb_next(node)) { test = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (test->file_offset > offset) + + if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) + continue; + if (test->file_offset > offset) { i_size_test = test->file_offset; - } else { - i_size_test = i_size; + break; + } } /* @@ -853,15 +868,15 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, ret = 0; out: /* - * we need to remove the ordered extent with the tree lock held - * so that other people calling this function don't find our fully - * processed ordered entry and skip updating the i_size + * We need to do this because we can't remove ordered extents until + * after the i_disk_size has been updated and then the inode has been + * updated to reflect the change, so we need to tell anybody who finds + * this ordered extent that we've already done all the real work, we + * just haven't completed all the other work. 
*/ if (ordered) - __btrfs_remove_ordered_extent(inode, ordered); - spin_unlock(&tree->lock); - if (ordered) - wake_up(&ordered->wait); + set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags); + spin_unlock_irq(&tree->lock); return ret; } @@ -886,7 +901,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, if (!ordered) return 1; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { if (disk_bytenr >= ordered_sum->bytenr) { num_sectors = ordered_sum->len / sectorsize; @@ -901,7 +916,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, } } out: - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); btrfs_put_ordered_extent(ordered); return ret; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index c355ad4dc1a..e03c560d299 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -74,6 +74,12 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ +#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */ + +#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent + * has done its due diligence in updating + * the isize. 
*/ + struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; @@ -113,6 +119,8 @@ struct btrfs_ordered_extent { /* a per root list of all the pending ordered extents */ struct list_head root_extent_list; + + struct btrfs_work work; }; @@ -143,10 +151,11 @@ void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry); int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size); + u64 file_offset, u64 io_size, int uptodate); int btrfs_dec_test_first_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, - u64 *file_offset, u64 io_size); + u64 *file_offset, u64 io_size, + int uptodate); int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type); int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, From f8c5d0b443ff87c43ba690fa2b5bd2c9387d8624 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 10 May 2012 18:10:38 +0800 Subject: [PATCH 20/37] Btrfs: fix wrong error returned by adding a device Reproduce: $ mkfs.btrfs /dev/sdb7 $ mount /dev/sdb7 /mnt/btrfs -o ro $ btrfs dev add /dev/sdb8 /mnt/btrfs ERROR: error adding the device '/dev/sdb8' - Invalid argument Since we mount with readonly options, and /dev/sdb7 is not a seeding one, a readonly notification is preferred. 
Signed-off-by: Liu Bo Reviewed-by: Josef Bacik --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1411b99555a..48a06d1fc06 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1633,7 +1633,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) int ret = 0; if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) - return -EINVAL; + return -EROFS; bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, root->fs_info->bdev_holder); From d1ac6e41d5437385957fd708e285defd0b1a430c Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 10 May 2012 18:10:39 +0800 Subject: [PATCH 21/37] Btrfs: use fastpath in extent state ops as much as possible Fully utilize our extent state's new helper functions to use fastpath as much as possible. Signed-off-by: Liu Bo Reviewed-by: Josef Bacik --- fs/btrfs/extent_io.c | 44 ++++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7af93435cee..69a527c7a0b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -569,10 +569,8 @@ hit_next: if (err) goto out; if (state->end <= end) { - clear_state_bit(tree, state, &bits, wake); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; + state = clear_state_bit(tree, state, &bits, wake); + goto next; } goto search_again; } @@ -780,7 +778,6 @@ hit_next: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - struct rb_node *next_node; if (state->state & exclusive_bits) { *failed_start = state->start; err = -EEXIST; @@ -788,20 +785,15 @@ hit_next: } set_state_bits(tree, state, &bits); - cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) goto out; - start = last_end + 1; - next_node = rb_next(&state->rb_node); - if (next_node && start < end && prealloc && !need_resched()) { - state = 
rb_entry(next_node, struct extent_state, - rb_node); - if (state->start == start) - goto hit_next; - } + state = next_state(state); + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; goto search_again; } @@ -844,6 +836,10 @@ hit_next: if (last_end == (u64)-1) goto out; start = last_end + 1; + state = next_state(state); + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; } goto search_again; } @@ -993,21 +989,14 @@ hit_next: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - struct rb_node *next_node; - set_state_bits(tree, state, &bits); - clear_state_bit(tree, state, &clear_bits, 0); + state = clear_state_bit(tree, state, &clear_bits, 0); if (last_end == (u64)-1) goto out; - start = last_end + 1; - next_node = rb_next(&state->rb_node); - if (next_node && start < end && prealloc && !need_resched()) { - state = rb_entry(next_node, struct extent_state, - rb_node); - if (state->start == start) - goto hit_next; - } + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; goto search_again; } @@ -1041,10 +1030,13 @@ hit_next: goto out; if (state->end <= end) { set_state_bits(tree, state, &bits); - clear_state_bit(tree, state, &clear_bits, 0); + state = clear_state_bit(tree, state, &clear_bits, 0); if (last_end == (u64)-1) goto out; start = last_end + 1; + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; } goto search_again; } From 9ba1f6e44ed7a1fa52d3f292508bf921b5054172 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 11 May 2012 18:11:26 +0800 Subject: [PATCH 22/37] Btrfs: do not do balance in readonly mode In normal cases, we would not be allowed to do balance in RO mode. 
However, when we're using a seeding device and adding another device to sprout, things will change: $ mkfs.btrfs /dev/sdb7 $ btrfstune -S 1 /dev/sdb7 $ mount /dev/sdb7 /mnt/btrfs -o ro $ btrfs fi bal /mnt/btrfs -----------------------> fail. $ btrfs dev add /dev/sdb8 /mnt/btrfs $ btrfs fi bal /mnt/btrfs -----------------------> works! It should not be designed as an exception, and we'd better add another check for mnt flags. Signed-off-by: Liu Bo Reviewed-by: Josef Bacik --- fs/btrfs/ioctl.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3d8ab27622c..15baf945630 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3216,8 +3216,9 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, } } -static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_balance(struct file *file, void __user *arg) { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; @@ -3229,6 +3230,10 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) if (fs_info->sb->s_flags & MS_RDONLY) return -EROFS; + ret = mnt_want_write(file->f_path.mnt); + if (ret) + return ret; + mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); @@ -3295,6 +3300,7 @@ out_bargs: out: mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); + mnt_drop_write(file->f_path.mnt); return ret; } @@ -3390,7 +3396,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(root, argp); case BTRFS_IOC_BALANCE: - return btrfs_ioctl_balance(root, NULL); + return btrfs_ioctl_balance(file, NULL); case BTRFS_IOC_CLONE: return btrfs_ioctl_clone(file, arg, 0, 0, 0); case BTRFS_IOC_CLONE_RANGE: @@ -3423,7 +3429,7 @@ long btrfs_ioctl(struct file *file, unsigned 
int case BTRFS_IOC_SCRUB_PROGRESS: return btrfs_ioctl_scrub_progress(root, argp); case BTRFS_IOC_BALANCE_V2: - return btrfs_ioctl_balance(root, argp); + return btrfs_ioctl_balance(file, argp); case BTRFS_IOC_BALANCE_CTL: return btrfs_ioctl_balance_ctl(root, arg); case BTRFS_IOC_BALANCE_PROGRESS: From cd023e7b17fe86c530475da210b3348421c40e5f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 14 May 2012 10:06:40 -0400 Subject: [PATCH 23/37] Btrfs: merge contiguous regions when loading free space cache When we write out the free space cache we will write out everything that is in our in memory tree, and then we will just walk the pinned extents tree and write anything we see there. The problem with this is that during normal operations the pinned extents will be merged back into the free space tree normally, and then we can allocate space from the merged areas and commit them to the tree log. If we crash and replay the tree log we will crash again because the tree log will try to free up space from what looks like 2 separate but contiguous entries, since one entry is from the original free space cache and the other was a pinned extent that was merged back. To fix this we just need to walk the free space tree after we load it and merge contiguous entries back together. This will keep the tree log stuff from breaking and it will make the allocator behave more nicely.
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 41 +++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cecf8df6248..19a0d85b451 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -33,6 +33,8 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); +static void unlink_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info); static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_path *path, @@ -584,6 +586,44 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl, return 0; } +/* + * Since we attach pinned extents after the fact we can have contiguous sections + * of free space that are split up in entries. This poses a problem with the + * tree logging stuff since it could have allocated across what appears to be 2 + * entries since we would have merged the entries when adding the pinned extents + * back to the free space cache. So run through the space cache that we just + * loaded and merge contiguous entries. This will make the log replay stuff not + * blow up and it will make for nicer allocator behavior. 
+ */ +static void merge_space_tree(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *e, *prev = NULL; + struct rb_node *n; + +again: + spin_lock(&ctl->tree_lock); + for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { + e = rb_entry(n, struct btrfs_free_space, offset_index); + if (!prev) + goto next; + if (e->bitmap || prev->bitmap) + goto next; + if (prev->offset + prev->bytes == e->offset) { + unlink_free_space(ctl, prev); + unlink_free_space(ctl, e); + prev->bytes += e->bytes; + kmem_cache_free(btrfs_free_space_cachep, e); + link_free_space(ctl, prev); + prev = NULL; + spin_unlock(&ctl->tree_lock); + goto again; + } +next: + prev = e; + } + spin_unlock(&ctl->tree_lock); +} + int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_path *path, u64 offset) @@ -726,6 +766,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, } io_ctl_drop_pages(&io_ctl); + merge_space_tree(ctl); ret = 1; out: io_ctl_free(&io_ctl); From 72ac3c0d7921f943d92d1ef42a549fb52e56817d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 23 May 2012 14:13:11 -0400 Subject: [PATCH 24/37] Btrfs: convert the inode bit field to use the actual bit operations Miao pointed this out while I was working on an orphan problem that messing with a bitfield where different ranges are protected by different locks doesn't work out right. Turns out we've been doing this forever where we have different parts of the bit field protected by either no lock at all or different locks which could cause all sorts of weird problems including the issue I was hitting. So instead make a runtime_flags thing that we use the normal bit operations on that are all atomic so we can keep having our no/different locking for the different flags and then make force_compress it's own thing so it can be treated normally. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 30 ++++++++++++++++-------------- fs/btrfs/delayed-inode.c | 4 ++-- fs/btrfs/disk-io.c | 3 ++- fs/btrfs/extent-tree.c | 11 ++++++----- fs/btrfs/file.c | 12 ++++++------ fs/btrfs/inode.c | 28 ++++++++++++---------------- 6 files changed, 44 insertions(+), 44 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 3771b8543a7..6265edb219e 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -24,6 +24,19 @@ #include "ordered-data.h" #include "delayed-inode.h" +/* + * ordered_data_close is set by truncate when a file that used + * to have good data has been truncated to zero. When it is set + * the btrfs file release call will add this inode to the + * ordered operations list so that we make sure to flush out any + * new data the application may have written before commit. + */ +#define BTRFS_INODE_ORDERED_DATA_CLOSE 0 +#define BTRFS_INODE_ORPHAN_META_RESERVED 1 +#define BTRFS_INODE_DUMMY 2 +#define BTRFS_INODE_IN_DEFRAG 3 +#define BTRFS_INODE_DELALLOC_META_RESERVED 4 + /* in memory btrfs inode */ struct btrfs_inode { /* which subvolume this inode belongs to */ @@ -78,6 +91,8 @@ struct btrfs_inode { /* the space_info for where this inode's data allocations are done */ struct btrfs_space_info *space_info; + unsigned long runtime_flags; + /* full 64 bit generation number, struct vfs_inode doesn't have a big * enough field for this. */ @@ -141,23 +156,10 @@ struct btrfs_inode { unsigned outstanding_extents; unsigned reserved_extents; - /* - * ordered_data_close is set by truncate when a file that used - * to have good data has been truncated to zero. When it is set - * the btrfs file release call will add this inode to the - * ordered operations list so that we make sure to flush out any - * new data the application may have written before commit. 
- */ - unsigned ordered_data_close:1; - unsigned orphan_meta_reserved:1; - unsigned dummy_inode:1; - unsigned in_defrag:1; - unsigned delalloc_meta_reserved:1; - /* * always compress this one file */ - unsigned force_compress:4; + unsigned force_compress; struct btrfs_delayed_node *delayed_node; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index bcd40c7109f..c18d0442ae6 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -669,8 +669,8 @@ static int btrfs_delayed_inode_reserve_metadata( return ret; } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { spin_lock(&BTRFS_I(inode)->lock); - if (BTRFS_I(inode)->delalloc_meta_reserved) { - BTRFS_I(inode)->delalloc_meta_reserved = 0; + if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) { spin_unlock(&BTRFS_I(inode)->lock); release = true; goto migrate; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 19f5b450f40..0cf8ef2b5b1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2001,7 +2001,8 @@ int open_ctree(struct super_block *sb, BTRFS_I(fs_info->btree_inode)->root = tree_root; memset(&BTRFS_I(fs_info->btree_inode)->location, 0, sizeof(struct btrfs_key)); - BTRFS_I(fs_info->btree_inode)->dummy_inode = 1; + set_bit(BTRFS_INODE_DUMMY, + &BTRFS_I(fs_info->btree_inode)->runtime_flags); insert_inode_hash(fs_info->btree_inode); spin_lock_init(&fs_info->block_group_cache_lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 59ae191d4f9..1902726fa70 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4355,10 +4355,9 @@ static unsigned drop_outstanding_extent(struct inode *inode) BTRFS_I(inode)->outstanding_extents--; if (BTRFS_I(inode)->outstanding_extents == 0 && - BTRFS_I(inode)->delalloc_meta_reserved) { + test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) drop_inode_space = 1; - BTRFS_I(inode)->delalloc_meta_reserved = 0; - } /* * If we have 
more or the same amount of outsanding extents than we have @@ -4465,7 +4464,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) * Add an item to reserve for updating the inode when we complete the * delalloc io. */ - if (!BTRFS_I(inode)->delalloc_meta_reserved) { + if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) { nr_extents++; extra_reserve = 1; } @@ -4511,7 +4511,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) spin_lock(&BTRFS_I(inode)->lock); if (extra_reserve) { - BTRFS_I(inode)->delalloc_meta_reserved = 1; + set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags); nr_extents--; } BTRFS_I(inode)->reserved_extents += nr_extents; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index cfc0ab915d0..c9005f21697 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -103,7 +103,7 @@ static void __btrfs_add_inode_defrag(struct inode *inode, goto exists; } } - BTRFS_I(inode)->in_defrag = 1; + set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); rb_link_node(&defrag->rb_node, parent, p); rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); return; @@ -131,7 +131,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, if (btrfs_fs_closing(root->fs_info)) return 0; - if (BTRFS_I(inode)->in_defrag) + if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) return 0; if (trans) @@ -148,7 +148,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, defrag->root = root->root_key.objectid; spin_lock(&root->fs_info->defrag_inodes_lock); - if (!BTRFS_I(inode)->in_defrag) + if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) __btrfs_add_inode_defrag(inode, defrag); else kfree(defrag); @@ -252,7 +252,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) goto next; /* do a chunk of defrag */ - BTRFS_I(inode)->in_defrag = 0; + clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 
range.start = defrag->last_offset; num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, defrag_batch); @@ -1465,8 +1465,8 @@ int btrfs_release_file(struct inode *inode, struct file *filp) * flush down new bytes that may have been written if the * application were using truncate to replace a file in place. */ - if (BTRFS_I(inode)->ordered_data_close) { - BTRFS_I(inode)->ordered_data_close = 0; + if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + &BTRFS_I(inode)->runtime_flags)) { btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) filemap_flush(inode->i_mapping); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9a1b96fd672..91ad6390175 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2182,10 +2182,9 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) insert = 1; } - if (!BTRFS_I(inode)->orphan_meta_reserved) { - BTRFS_I(inode)->orphan_meta_reserved = 1; + if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) reserve = 1; - } spin_unlock(&root->orphan_lock); /* grab metadata reservation from transaction handle */ @@ -2233,10 +2232,9 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) delete_item = 1; } - if (BTRFS_I(inode)->orphan_meta_reserved) { - BTRFS_I(inode)->orphan_meta_reserved = 0; + if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) release_rsv = 1; - } spin_unlock(&root->orphan_lock); if (trans && delete_item) { @@ -3642,7 +3640,8 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) * any new writes get down to disk quickly. 
*/ if (newsize == 0) - BTRFS_I(inode)->ordered_data_close = 1; + set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + &BTRFS_I(inode)->runtime_flags); /* we don't support swapfiles, so vmtruncate shouldn't fail */ truncate_setsize(inode, newsize); @@ -4102,7 +4101,7 @@ static struct inode *new_simple_dir(struct super_block *s, BTRFS_I(inode)->root = root; memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); - BTRFS_I(inode)->dummy_inode = 1; + set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; inode->i_op = &btrfs_dir_ro_inode_operations; @@ -4406,7 +4405,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) int ret = 0; bool nolock = false; - if (BTRFS_I(inode)->dummy_inode) + if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) return 0; if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode)) @@ -4439,7 +4438,7 @@ int btrfs_dirty_inode(struct inode *inode) struct btrfs_trans_handle *trans; int ret; - if (BTRFS_I(inode)->dummy_inode) + if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) return 0; trans = btrfs_join_transaction(root); @@ -6752,7 +6751,8 @@ static int btrfs_truncate(struct inode *inode) * using truncate to replace the contents of the file will * end up with a zero length file after a crash. 
*/ - if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) + if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + &BTRFS_I(inode)->runtime_flags)) btrfs_add_ordered_operation(trans, root, inode); while (1) { @@ -6889,11 +6889,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->outstanding_extents = 0; ei->reserved_extents = 0; - ei->ordered_data_close = 0; - ei->orphan_meta_reserved = 0; - ei->dummy_inode = 0; - ei->in_defrag = 0; - ei->delalloc_meta_reserved = 0; + ei->runtime_flags = 0; ei->force_compress = BTRFS_COMPRESS_NONE; ei->delayed_node = NULL; From 8a35d95ff4680a456d3ce47df9638f33d4f54f20 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 23 May 2012 14:26:42 -0400 Subject: [PATCH 25/37] Btrfs: fix how we deal with the orphan block rsv Ceph was hitting this race where we would remove an inode from the per-root orphan list before we would release the space we had reserved for the inode. We actually don't need a list or anything, we just need to make sure the root doesn't try to free up the orphan reserve until after the inodes have released their reservations. So use an atomic counter instead of a list on the root and only decrement the counter after we've released our reservation. I've tested this as well as several others and we no longer see the warnings that you would see while running ceph. Thanks, Btrfs: fix how we deal with the orphan block rsv Ceph was hitting this race where we would remove an inode from the per-root orphan list before we would release the space we had reserved for the inode. We actually don't need a list or anything, we just need to make sure the root doesn't try to free up the orphan reserve until after the inodes have released their reservations. So use an atomic counter instead of a list on the root and only decrement the counter after we've released our reservation. I've tested this as well as several others and we no longer see the warnings that you would see while running ceph. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 4 +--- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/inode.c | 38 +++++++++++++++++++++----------------- 4 files changed, 24 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 6265edb219e..ce2c9d60031 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -36,6 +36,7 @@ #define BTRFS_INODE_DUMMY 2 #define BTRFS_INODE_IN_DEFRAG 3 #define BTRFS_INODE_DELALLOC_META_RESERVED 4 +#define BTRFS_INODE_HAS_ORPHAN_ITEM 5 /* in memory btrfs inode */ struct btrfs_inode { @@ -70,9 +71,6 @@ struct btrfs_inode { /* used to order data wrt metadata */ struct btrfs_ordered_inode_tree ordered_tree; - /* for keeping track of orphaned inodes */ - struct list_head i_orphan; - /* list of all the delalloc inodes in the FS. There are times we need * to write all the delalloc pages to disk, and this list is used * to walk them all. diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8fd72331d60..aad2600718a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1375,7 +1375,7 @@ struct btrfs_root { struct list_head root_list; spinlock_t orphan_lock; - struct list_head orphan_list; + atomic_t orphan_inodes; struct btrfs_block_rsv *orphan_block_rsv; int orphan_item_inserted; int orphan_cleanup_state; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0cf8ef2b5b1..297e5a8ed93 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1153,7 +1153,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->orphan_block_rsv = NULL; INIT_LIST_HEAD(&root->dirty_list); - INIT_LIST_HEAD(&root->orphan_list); INIT_LIST_HEAD(&root->root_list); spin_lock_init(&root->orphan_lock); spin_lock_init(&root->inode_lock); @@ -1166,6 +1165,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->log_commit[0], 0); atomic_set(&root->log_commit[1], 0); atomic_set(&root->log_writers, 0); + 
atomic_set(&root->orphan_inodes, 0); root->log_batch = 0; root->log_transid = 0; root->last_log_commit = 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 91ad6390175..029892887fc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2104,12 +2104,12 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *block_rsv; int ret; - if (!list_empty(&root->orphan_list) || + if (atomic_read(&root->orphan_inodes) || root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) return; spin_lock(&root->orphan_lock); - if (!list_empty(&root->orphan_list)) { + if (atomic_read(&root->orphan_inodes)) { spin_unlock(&root->orphan_lock); return; } @@ -2166,8 +2166,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) block_rsv = NULL; } - if (list_empty(&BTRFS_I(inode)->i_orphan)) { - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags)) { #if 0 /* * For proper ENOSPC handling, we should do orphan @@ -2180,6 +2180,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) insert = 1; #endif insert = 1; + atomic_inc(&root->orphan_inodes); } if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, @@ -2197,6 +2198,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) if (insert >= 1) { ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); if (ret && ret != -EEXIST) { + clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags); btrfs_abort_transaction(trans, root, ret); return ret; } @@ -2227,10 +2230,9 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) int ret = 0; spin_lock(&root->orphan_lock); - if (!list_empty(&BTRFS_I(inode)->i_orphan)) { - list_del_init(&BTRFS_I(inode)->i_orphan); + if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags)) delete_item = 1; - } if
(test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, &BTRFS_I(inode)->runtime_flags)) @@ -2242,8 +2244,10 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ } - if (release_rsv) + if (release_rsv) { btrfs_orphan_release_metadata(inode); + atomic_dec(&root->orphan_inodes); + } return 0; } @@ -2371,6 +2375,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = PTR_ERR(trans); goto out; } + printk(KERN_ERR "auto deleting %Lu\n", + found_key.objectid); ret = btrfs_del_orphan_item(trans, root, found_key.objectid); BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ @@ -2382,9 +2388,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * add this inode to the orphan list so btrfs_orphan_del does * the proper thing when we hit it */ - spin_lock(&root->orphan_lock); - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); - spin_unlock(&root->orphan_lock); + set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags); /* if we have links, this was a truncate, lets do that */ if (inode->i_nlink) { @@ -3706,7 +3711,8 @@ void btrfs_evict_inode(struct inode *inode) btrfs_wait_ordered_range(inode, 0, (u64)-1); if (root->fs_info->log_root_recovering) { - BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); + BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags)); goto no_delete; } @@ -6903,7 +6909,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) mutex_init(&ei->log_mutex); mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); - INIT_LIST_HEAD(&ei->i_orphan); INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->ordered_operations); RB_CLEAR_NODE(&ei->rb_node); @@ -6948,13 +6953,12 @@ void btrfs_destroy_inode(struct inode *inode) spin_unlock(&root->fs_info->ordered_extent_lock); } - spin_lock(&root->orphan_lock); - if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, +
&BTRFS_I(inode)->runtime_flags)) { printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n", (unsigned long long)btrfs_ino(inode)); - list_del_init(&BTRFS_I(inode)->i_orphan); + atomic_dec(&root->orphan_inodes); } - spin_unlock(&root->orphan_lock); while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); From 2adcac1a7331d93a17285804819caa96070b231f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 23 May 2012 16:10:14 -0400 Subject: [PATCH 26/37] Btrfs: fall back to non-inline if we don't have enough space If cow_file_range_inline fails with ENOSPC we abort the transaction which isn't very nice. This really shouldn't be happening anyways but there's no sense in making it a horrible error when we can easily just go allocate normal data space for this stuff. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 029892887fc..92df0a5d1d9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -257,10 +257,13 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, ret = insert_inline_extent(trans, root, inode, start, inline_len, compressed_size, compress_type, compressed_pages); - if (ret) { + if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); return ret; + } else if (ret == -ENOSPC) { + return 1; } + btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); return 0; From 762f2263260d576504aeb23d20f90120acdb025f Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 24 May 2012 18:58:27 +0800 Subject: [PATCH 27/37] Btrfs: fix the same inode id problem when doing auto defragment Two files in different subvolumes may have the same inode id, so the rb-tree that is used to manage the defragment objects must take the subvolume (root) into account as well. This patch fixes this problem.
Signed-off-by: Miao Xie --- fs/btrfs/file.c | 49 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c9005f21697..2e63cdc2b09 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -65,6 +65,21 @@ struct inode_defrag { int cycled; }; +static int __compare_inode_defrag(struct inode_defrag *defrag1, + struct inode_defrag *defrag2) +{ + if (defrag1->root > defrag2->root) + return 1; + else if (defrag1->root < defrag2->root) + return -1; + else if (defrag1->ino > defrag2->ino) + return 1; + else if (defrag1->ino < defrag2->ino) + return -1; + else + return 0; +} + /* pop a record for an inode into the defrag tree. The lock * must be held already * @@ -81,15 +96,17 @@ static void __btrfs_add_inode_defrag(struct inode *inode, struct inode_defrag *entry; struct rb_node **p; struct rb_node *parent = NULL; + int ret; p = &root->fs_info->defrag_inodes.rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct inode_defrag, rb_node); - if (defrag->ino < entry->ino) + ret = __compare_inode_defrag(defrag, entry); + if (ret < 0) p = &parent->rb_left; - else if (defrag->ino > entry->ino) + else if (ret > 0) p = &parent->rb_right; else { /* if we're reinserting an entry for @@ -159,28 +176,35 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, /* * must be called with the defrag_inodes lock held */ -struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino, +struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, + u64 root, u64 ino, struct rb_node **next) { struct inode_defrag *entry = NULL; + struct inode_defrag tmp; struct rb_node *p; struct rb_node *parent = NULL; + int ret; + + tmp.ino = ino; + tmp.root = root; p = info->defrag_inodes.rb_node; while (p) { parent = p; entry = rb_entry(parent, struct inode_defrag, rb_node); - if (ino < entry->ino) + ret = __compare_inode_defrag(&tmp, entry); + if (ret < 0) p = 
parent->rb_left; - else if (ino > entry->ino) + else if (ret > 0) p = parent->rb_right; else return entry; } if (next) { - while (parent && ino > entry->ino) { + while (parent && __compare_inode_defrag(&tmp, entry) > 0) { parent = rb_next(parent); entry = rb_entry(parent, struct inode_defrag, rb_node); } @@ -202,6 +226,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) struct btrfs_key key; struct btrfs_ioctl_defrag_range_args range; u64 first_ino = 0; + u64 root_objectid = 0; int num_defrag; int defrag_batch = 1024; @@ -214,11 +239,14 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) n = NULL; /* find an inode to defrag */ - defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n); + defrag = btrfs_find_defrag_inode(fs_info, root_objectid, + first_ino, &n); if (!defrag) { - if (n) - defrag = rb_entry(n, struct inode_defrag, rb_node); - else if (first_ino) { + if (n) { + defrag = rb_entry(n, struct inode_defrag, + rb_node); + } else if (root_objectid || first_ino) { + root_objectid = 0; first_ino = 0; continue; } else { @@ -228,6 +256,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) /* remove it from the rbtree */ first_ino = defrag->ino + 1; + root_objectid = defrag->root; rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); if (btrfs_fs_closing(fs_info)) From d07eb9117050c9ed3f78296ebcc06128b52693be Mon Sep 17 00:00:00 2001 From: Asias He Date: Fri, 25 May 2012 11:10:21 +0800 Subject: [PATCH 28/37] btrfs: Drop unused function btrfs_abort_devices() 1) This function is not used anywhere. 2) Using the blk_abort_queue() to abort the queue seems not correct. blk_abort_queue() is used for timeout handling (block/blk-timeout.c). 
Cc: Chris Mason Cc: linux-btrfs@vger.kernel.org Cc: Jens Axboe Cc: linux-kernel@vger.kernel.org Signed-off-by: Asias He --- fs/btrfs/disk-io.c | 13 ------------- fs/btrfs/disk-io.h | 1 - 2 files changed, 14 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 297e5a8ed93..0f788c05906 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2903,19 +2903,6 @@ int write_ctree_super(struct btrfs_trans_handle *trans, return ret; } -/* Kill all outstanding I/O */ -void btrfs_abort_devices(struct btrfs_root *root) -{ - struct list_head *head; - struct btrfs_device *dev; - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - head = &root->fs_info->fs_devices->devices; - list_for_each_entry_rcu(dev, head, dev_list) { - blk_abort_queue(dev->bdev->bd_disk->queue); - } - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); -} - void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { spin_lock(&fs_info->fs_roots_radix_lock); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index ab1830aaf0e..05b3fab39f7 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -89,7 +89,6 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, int btrfs_cleanup_transaction(struct btrfs_root *root); void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, struct btrfs_root *root); -void btrfs_abort_devices(struct btrfs_root *root); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); From 442a4f6308e694e0fa6025708bd5e4e424bbf51c Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 25 May 2012 16:06:08 +0200 Subject: [PATCH 29/37] Btrfs: add device counters for detected IO and checksum errors The goal is to detect when drives start to get an increased error rate, when drives should be replaced soon. Therefore statistic counters are added that count IO errors (read, write and flush). Additionally, the software detected errors like checksum errors and corrupted blocks are counted. 
Signed-off-by: Stefan Behrens --- fs/btrfs/disk-io.c | 13 ++++-- fs/btrfs/extent_io.c | 18 ++++++++- fs/btrfs/ioctl.h | 19 +++++++++ fs/btrfs/scrub.c | 65 ++++++++++++++++++++++-------- fs/btrfs/volumes.c | 94 +++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.h | 45 +++++++++++++++++++++ 6 files changed, 230 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0f788c05906..46d474e74aa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2557,18 +2557,19 @@ recovery_tree_root: static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) { - char b[BDEVNAME_SIZE]; - if (uptodate) { set_buffer_uptodate(bh); } else { + struct btrfs_device *device = (struct btrfs_device *) + bh->b_private; + printk_ratelimited(KERN_WARNING "lost page write due to " - "I/O error on %s\n", - bdevname(bh->b_bdev, b)); + "I/O error on %s\n", device->name); /* note, we dont' set_buffer_write_io_error because we have * our own ways of dealing with the IO errors */ clear_buffer_uptodate(bh); + btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS); } unlock_buffer(bh); put_bh(bh); @@ -2683,6 +2684,7 @@ static int write_dev_supers(struct btrfs_device *device, set_buffer_uptodate(bh); lock_buffer(bh); bh->b_end_io = btrfs_end_buffer_write_sync; + bh->b_private = device; } /* @@ -2741,6 +2743,9 @@ static int write_dev_flush(struct btrfs_device *device, int wait) } if (!bio_flagged(bio, BIO_UPTODATE)) { ret = -EIO; + if (!bio_flagged(bio, BIO_EOPNOTSUPP)) + btrfs_dev_stat_inc_and_print(device, + BTRFS_DEV_STAT_FLUSH_ERRS); } /* drop the reference from the wait == 0 run */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 69a527c7a0b..b3692c1373a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1913,6 +1913,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { /* try to remap that extent elsewhere? 
*/ bio_put(bio); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); return -EIO; } @@ -2327,10 +2328,23 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { ret = tree->ops->readpage_end_io_hook(page, start, end, state, mirror); - if (ret) + if (ret) { + /* no IO indicated but software detected errors + * in the block, either checksum errors or + * issues with the contents */ + struct btrfs_root *root = + BTRFS_I(page->mapping->host)->root; + struct btrfs_device *device; + uptodate = 0; - else + device = btrfs_find_device_for_logical( + root, start, mirror); + if (device) + btrfs_dev_stat_inc_and_print(device, + BTRFS_DEV_STAT_CORRUPTION_ERRS); + } else { clean_io_failure(start, page); + } } if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 086e6bdae1c..5bf05e28b82 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -266,6 +266,25 @@ struct btrfs_ioctl_logical_ino_args { __u64 inodes; }; +enum btrfs_dev_stat_values { + /* disk I/O failure stats */ + BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */ + BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */ + BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */ + + /* stats for indirect indications for I/O failures */ + BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or + * contents is illegal: this is an + * indication that the block was damaged + * during read or write, or written to + * wrong location or read from wrong + * location */ + BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not + * been written */ + + BTRFS_DEV_STAT_VALUES_MAX +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2f3d6f917fb..a38cfa4f251 100644 --- 
a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -50,7 +50,7 @@ struct scrub_dev; struct scrub_page { struct scrub_block *sblock; struct page *page; - struct block_device *bdev; + struct btrfs_device *dev; u64 flags; /* extent flags */ u64 generation; u64 logical; @@ -86,6 +86,7 @@ struct scrub_block { unsigned int header_error:1; unsigned int checksum_error:1; unsigned int no_io_error_seen:1; + unsigned int generation_error:1; /* also sets header_error */ }; }; @@ -675,6 +676,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sdev->stat.read_errors++; sdev->stat.uncorrectable_errors++; spin_unlock(&sdev->stat_lock); + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_READ_ERRS); goto out; } @@ -686,6 +689,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sdev->stat.read_errors++; sdev->stat.uncorrectable_errors++; spin_unlock(&sdev->stat_lock); + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_READ_ERRS); goto out; } BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); @@ -699,6 +704,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sdev->stat.read_errors++; sdev->stat.uncorrectable_errors++; spin_unlock(&sdev->stat_lock); + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_READ_ERRS); goto out; } @@ -725,12 +732,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_unlock(&sdev->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("i/o error", sblock_to_check); + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_READ_ERRS); } else if (sblock_bad->checksum_error) { spin_lock(&sdev->stat_lock); sdev->stat.csum_errors++; spin_unlock(&sdev->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("checksum error", sblock_to_check); + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_CORRUPTION_ERRS); } else if (sblock_bad->header_error) { spin_lock(&sdev->stat_lock); sdev->stat.verify_errors++; @@ -738,6 +749,12 @@ 
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (__ratelimit(&_rs)) scrub_print_warning("checksum/header error", sblock_to_check); + if (sblock_bad->generation_error) + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_GENERATION_ERRS); + else + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_CORRUPTION_ERRS); } if (sdev->readonly) @@ -998,8 +1015,8 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev, page = sblock->pagev + page_index; page->logical = logical; page->physical = bbio->stripes[mirror_index].physical; - /* for missing devices, bdev is NULL */ - page->bdev = bbio->stripes[mirror_index].dev->bdev; + /* for missing devices, dev->bdev is NULL */ + page->dev = bbio->stripes[mirror_index].dev; page->mirror_num = mirror_index + 1; page->page = alloc_page(GFP_NOFS); if (!page->page) { @@ -1043,7 +1060,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_page *page = sblock->pagev + page_num; DECLARE_COMPLETION_ONSTACK(complete); - if (page->bdev == NULL) { + if (page->dev->bdev == NULL) { page->io_error = 1; sblock->no_io_error_seen = 0; continue; @@ -1053,7 +1070,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, bio = bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; - bio->bi_bdev = page->bdev; + bio->bi_bdev = page->dev->bdev; bio->bi_sector = page->physical >> 9; bio->bi_end_io = scrub_complete_bio_end_io; bio->bi_private = &complete; @@ -1102,11 +1119,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, h = (struct btrfs_header *)mapped_buffer; if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || - generation != le64_to_cpu(h->generation) || memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, - BTRFS_UUID_SIZE)) + BTRFS_UUID_SIZE)) { sblock->header_error = 1; + } else if (generation != le64_to_cpu(h->generation)) { + sblock->header_error = 1; + sblock->generation_error = 1; + } 
csum = h->csum; } else { if (!have_csum) @@ -1182,7 +1202,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, bio = bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; - bio->bi_bdev = page_bad->bdev; + bio->bi_bdev = page_bad->dev->bdev; bio->bi_sector = page_bad->physical >> 9; bio->bi_end_io = scrub_complete_bio_end_io; bio->bi_private = &complete; @@ -1196,6 +1216,12 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, /* this will also unplug the queue */ wait_for_completion(&complete); + if (!bio_flagged(bio, BIO_UPTODATE)) { + btrfs_dev_stat_inc_and_print(page_bad->dev, + BTRFS_DEV_STAT_WRITE_ERRS); + bio_put(bio); + return -EIO; + } bio_put(bio); } @@ -1352,7 +1378,8 @@ static int scrub_checksum_super(struct scrub_block *sblock) u64 mapped_size; void *p; u32 crc = ~(u32)0; - int fail = 0; + int fail_gen = 0; + int fail_cor = 0; u64 len; int index; @@ -1363,13 +1390,13 @@ static int scrub_checksum_super(struct scrub_block *sblock) memcpy(on_disk_csum, s->csum, sdev->csum_size); if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) - ++fail; + ++fail_cor; if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) - ++fail; + ++fail_gen; if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) - ++fail; + ++fail_cor; len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; @@ -1394,9 +1421,9 @@ static int scrub_checksum_super(struct scrub_block *sblock) btrfs_csum_final(crc, calculated_csum); if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) - ++fail; + ++fail_cor; - if (fail) { + if (fail_cor + fail_gen) { /* * if we find an error in a super block, we just report it. 
* They will get written with the next transaction commit @@ -1405,9 +1432,15 @@ static int scrub_checksum_super(struct scrub_block *sblock) spin_lock(&sdev->stat_lock); ++sdev->stat.super_errors; spin_unlock(&sdev->stat_lock); + if (fail_cor) + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_CORRUPTION_ERRS); + else + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_GENERATION_ERRS); } - return fail; + return fail_cor + fail_gen; } static void scrub_block_get(struct scrub_block *sblock) @@ -1551,7 +1584,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, return -ENOMEM; } spage->sblock = sblock; - spage->bdev = sdev->dev->bdev; + spage->dev = sdev->dev; spage->flags = flags; spage->generation = gen; spage->logical = logical; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 48a06d1fc06..2915521f44e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include "compat.h" @@ -4001,13 +4002,58 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } +static void *merge_stripe_index_into_bio_private(void *bi_private, + unsigned int stripe_index) +{ + /* + * with single, dup, RAID0, RAID1 and RAID10, stripe_index is + * at most 1. + * The alternative solution (instead of stealing bits from the + * pointer) would be to allocate an intermediate structure + * that contains the old private pointer plus the stripe_index. 
+ */ + BUG_ON((((uintptr_t)bi_private) & 3) != 0); + BUG_ON(stripe_index > 3); + return (void *)(((uintptr_t)bi_private) | stripe_index); +} + +static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private) +{ + return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3)); +} + +static unsigned int extract_stripe_index_from_bio_private(void *bi_private) +{ + return (unsigned int)((uintptr_t)bi_private) & 3; +} + static void btrfs_end_bio(struct bio *bio, int err) { - struct btrfs_bio *bbio = bio->bi_private; + struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private); int is_orig_bio = 0; - if (err) + if (err) { atomic_inc(&bbio->error); + if (err == -EIO || err == -EREMOTEIO) { + unsigned int stripe_index = + extract_stripe_index_from_bio_private( + bio->bi_private); + struct btrfs_device *dev; + + BUG_ON(stripe_index >= bbio->num_stripes); + dev = bbio->stripes[stripe_index].dev; + if (bio->bi_rw & WRITE) + btrfs_dev_stat_inc(dev, + BTRFS_DEV_STAT_WRITE_ERRS); + else + btrfs_dev_stat_inc(dev, + BTRFS_DEV_STAT_READ_ERRS); + if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH) + btrfs_dev_stat_inc(dev, + BTRFS_DEV_STAT_FLUSH_ERRS); + btrfs_dev_stat_print_on_error(dev); + } + } if (bio == bbio->orig_bio) is_orig_bio = 1; @@ -4149,6 +4195,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, bio = first_bio; } bio->bi_private = bbio; + bio->bi_private = merge_stripe_index_into_bio_private( + bio->bi_private, (unsigned int)dev_nr); bio->bi_end_io = btrfs_end_bio; bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; dev = bbio->stripes[dev_nr].dev; @@ -4509,6 +4557,28 @@ int btrfs_read_sys_array(struct btrfs_root *root) return ret; } +struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, + u64 logical, int mirror_num) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + int ret; + u64 map_length = 0; + struct btrfs_bio *bbio = NULL; + struct btrfs_device *device; + + 
BUG_ON(mirror_num == 0); + ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio, + mirror_num); + if (ret) { + BUG_ON(bbio != NULL); + return NULL; + } + BUG_ON(mirror_num != bbio->mirror_num); + device = bbio->stripes[mirror_num - 1].dev; + kfree(bbio); + return device; +} + int btrfs_read_chunk_tree(struct btrfs_root *root) { struct btrfs_path *path; @@ -4583,3 +4653,23 @@ error: btrfs_free_path(path); return ret; } + +void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) +{ + btrfs_dev_stat_inc(dev, index); + btrfs_dev_stat_print_on_error(dev); +} + +void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) +{ + printk_ratelimited(KERN_ERR + "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + dev->name, + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), + btrfs_dev_stat_read(dev, + BTRFS_DEV_STAT_CORRUPTION_ERRS), + btrfs_dev_stat_read(dev, + BTRFS_DEV_STAT_GENERATION_ERRS)); +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index bb6b03f97aa..193b2835e6a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -22,6 +22,7 @@ #include #include #include "async-thread.h" +#include "ioctl.h" #define BTRFS_STRIPE_LEN (64 * 1024) @@ -106,6 +107,10 @@ struct btrfs_device { struct completion flush_wait; int nobarriers; + /* disk I/O failure stats. 
For detailed description refer to + * enum btrfs_dev_stat_values in ioctl.h */ + int dev_stats_dirty; /* counters need to be written to disk */ + atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; }; struct btrfs_fs_devices { @@ -281,4 +286,44 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); +struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, + u64 logical, int mirror_num); +void btrfs_dev_stat_print_on_error(struct btrfs_device *device); +void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); + +static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, + int index) +{ + atomic_inc(dev->dev_stat_values + index); + dev->dev_stats_dirty = 1; +} + +static inline int btrfs_dev_stat_read(struct btrfs_device *dev, + int index) +{ + return atomic_read(dev->dev_stat_values + index); +} + +static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev, + int index) +{ + int ret; + + ret = atomic_xchg(dev->dev_stat_values + index, 0); + dev->dev_stats_dirty = 1; + return ret; +} + +static inline void btrfs_dev_stat_set(struct btrfs_device *dev, + int index, unsigned long val) +{ + atomic_set(dev->dev_stat_values + index, val); + dev->dev_stats_dirty = 1; +} + +static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, + int index) +{ + btrfs_dev_stat_set(dev, index, 0); +} #endif From c11d2c236cc260b36ef644700fbe99bcc7e7da33 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 25 May 2012 16:06:09 +0200 Subject: [PATCH 30/37] Btrfs: add ioctl to get and reset the device stats An ioctl interface is added to get the device statistic counters. A second ioctl is added to atomically get and reset these counters. 
Signed-off-by: Stefan Behrens --- fs/btrfs/ioctl.c | 26 ++++++++++++++++++++++++++ fs/btrfs/ioctl.h | 14 ++++++++++++++ fs/btrfs/volumes.c | 34 ++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 3 +++ 4 files changed, 77 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 15baf945630..0f8c354c4c7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3046,6 +3046,28 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, return ret; } +static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root, + void __user *arg, int reset_after_read) +{ + struct btrfs_ioctl_get_dev_stats *sa; + int ret; + + if (reset_after_read && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + ret = btrfs_get_dev_stats(root, sa, reset_after_read); + + if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + kfree(sa); + return ret; +} + static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) { int ret = 0; @@ -3434,6 +3456,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_balance_ctl(root, arg); case BTRFS_IOC_BALANCE_PROGRESS: return btrfs_ioctl_balance_progress(root, argp); + case BTRFS_IOC_GET_DEV_STATS: + return btrfs_ioctl_get_dev_stats(root, argp, 0); + case BTRFS_IOC_GET_AND_RESET_DEV_STATS: + return btrfs_ioctl_get_dev_stats(root, argp, 1); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 5bf05e28b82..497c530724c 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -285,6 +285,16 @@ enum btrfs_dev_stat_values { BTRFS_DEV_STAT_VALUES_MAX }; +struct btrfs_ioctl_get_dev_stats { + __u64 devid; /* in */ + __u64 nr_items; /* in/out */ + + /* out values: */ + __u64 values[BTRFS_DEV_STAT_VALUES_MAX]; + + __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */ +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG 
_IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -349,5 +359,9 @@ enum btrfs_dev_stat_values { struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ struct btrfs_ioctl_ino_path_args) +#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ + struct btrfs_ioctl_get_dev_stats) +#define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \ + struct btrfs_ioctl_get_dev_stats) #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2915521f44e..a112b758822 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4673,3 +4673,37 @@ void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); } + +int btrfs_get_dev_stats(struct btrfs_root *root, + struct btrfs_ioctl_get_dev_stats *stats, + int reset_after_read) +{ + struct btrfs_device *dev; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + int i; + + mutex_lock(&fs_devices->device_list_mutex); + dev = btrfs_find_device(root, stats->devid, NULL, NULL); + mutex_unlock(&fs_devices->device_list_mutex); + + if (!dev) { + printk(KERN_WARNING + "btrfs: get dev_stats failed, device not found\n"); + return -ENODEV; + } else if (reset_after_read) { + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { + if (stats->nr_items > i) + stats->values[i] = + btrfs_dev_stat_read_and_reset(dev, i); + else + btrfs_dev_stat_reset(dev, i); + } + } else { + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + if (stats->nr_items > i) + stats->values[i] = btrfs_dev_stat_read(dev, i); + } + if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) + stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; + return 0; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 193b2835e6a..6798f8674b1 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -290,6 +290,9 @@ struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, u64 logical, int mirror_num); void btrfs_dev_stat_print_on_error(struct btrfs_device 
*device); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); +int btrfs_get_dev_stats(struct btrfs_root *root, + struct btrfs_ioctl_get_dev_stats *stats, + int reset_after_read); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) From 733f4fbbc1083aa343da739f46ee839705d6cfe3 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 25 May 2012 16:06:10 +0200 Subject: [PATCH 31/37] Btrfs: read device stats on mount, write modified ones during commit The device statistics are written into the device tree with each transaction commit. Only modified statistics are written. When a filesystem is mounted, the device statistics for each involved device are read from the device tree and used to initialize the counters. Signed-off-by: Stefan Behrens --- fs/btrfs/ctree.h | 38 +++++++++ fs/btrfs/disk-io.c | 7 ++ fs/btrfs/print-tree.c | 3 + fs/btrfs/transaction.c | 4 + fs/btrfs/volumes.c | 176 +++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 4 + 6 files changed, 232 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index aad2600718a..e176f8c551f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -823,6 +823,14 @@ struct btrfs_csum_item { u8 csum; } __attribute__ ((__packed__)); +struct btrfs_dev_stats_item { + /* + * grow this item struct at the end for future enhancements and keep + * the existing values unchanged + */ + __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; +} __attribute__ ((__packed__)); + /* different types of block groups (and chunks) */ #define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) #define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) @@ -1507,6 +1515,12 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_BALANCE_ITEM_KEY 248 +/* + * Persistently stores the io stats in the device tree. + * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid). + */ +#define BTRFS_DEV_STATS_KEY 249 + /* * string items are for debugging.
They just store a short string of * data in the FS @@ -2415,6 +2429,30 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, return btrfs_item_size(eb, e) - offset; } +/* btrfs_dev_stats_item */ +static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb, + struct btrfs_dev_stats_item *ptr, + int index) +{ + u64 val; + + read_extent_buffer(eb, &val, + offsetof(struct btrfs_dev_stats_item, values) + + ((unsigned long)ptr) + (index * sizeof(u64)), + sizeof(val)); + return val; +} + +static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb, + struct btrfs_dev_stats_item *ptr, + int index, u64 val) +{ + write_extent_buffer(eb, &val, + offsetof(struct btrfs_dev_stats_item, values) + + ((unsigned long)ptr) + (index * sizeof(u64)), + sizeof(val)); +} + static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 46d474e74aa..b0d49e21b0b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2354,6 +2354,13 @@ retry_root_backup: fs_info->generation = generation; fs_info->last_trans_committed = generation; + ret = btrfs_init_dev_stats(fs_info); + if (ret) { + printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n", + ret); + goto fail_block_groups; + } + ret = btrfs_init_space_info(fs_info); if (ret) { printk(KERN_ERR "Failed to initial space info: %d\n", ret); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index f38e452486b..5e23684887e 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -294,6 +294,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_dev_extent_chunk_offset(l, dev_extent), (unsigned long long) btrfs_dev_extent_length(l, dev_extent)); + case BTRFS_DEV_STATS_KEY: + printk(KERN_INFO "\t\tdevice stats\n"); + break; }; } } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 36422254ef6..82b03afcbd9 100644 --- a/fs/btrfs/transaction.c +++ 
b/fs/btrfs/transaction.c @@ -28,6 +28,7 @@ #include "locking.h" #include "tree-log.h" #include "inode-map.h" +#include "volumes.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -758,6 +759,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, if (ret) return ret; + ret = btrfs_run_dev_stats(trans, root->fs_info); + BUG_ON(ret); + while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; list_del_init(next); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a112b758822..7782020996f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -40,6 +40,8 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); static int btrfs_relocate_sys_chunks(struct btrfs_root *root); +static void __btrfs_reset_dev_stats(struct btrfs_device *dev); +static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); static DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); @@ -362,6 +364,7 @@ static noinline int device_list_add(const char *path, return -ENOMEM; } device->devid = devid; + device->dev_stats_valid = 0; device->work.func = pending_bios_fn; memcpy(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); @@ -4654,6 +4657,162 @@ error: return ret; } +static void __btrfs_reset_dev_stats(struct btrfs_device *dev) +{ + int i; + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_dev_stat_reset(dev, i); +} + +int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) +{ + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct extent_buffer *eb; + int slot; + int ret = 0; + struct btrfs_device *device; + struct btrfs_path *path = NULL; + int i; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, 
dev_list) { + int item_size; + struct btrfs_dev_stats_item *ptr; + + key.objectid = 0; + key.type = BTRFS_DEV_STATS_KEY; + key.offset = device->devid; + ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); + if (ret) { + printk(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n", + device->name, (unsigned long long)device->devid); + __btrfs_reset_dev_stats(device); + device->dev_stats_valid = 1; + btrfs_release_path(path); + continue; + } + slot = path->slots[0]; + eb = path->nodes[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + item_size = btrfs_item_size_nr(eb, slot); + + ptr = btrfs_item_ptr(eb, slot, + struct btrfs_dev_stats_item); + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { + if (item_size >= (1 + i) * sizeof(__le64)) + btrfs_dev_stat_set(device, i, + btrfs_dev_stats_value(eb, ptr, i)); + else + btrfs_dev_stat_reset(device, i); + } + + device->dev_stats_valid = 1; + btrfs_dev_stat_print_on_load(device); + btrfs_release_path(path); + } + mutex_unlock(&fs_devices->device_list_mutex); + +out: + btrfs_free_path(path); + return ret < 0 ? 
ret : 0; +} + +static int update_dev_stat_item(struct btrfs_trans_handle *trans, + struct btrfs_root *dev_root, + struct btrfs_device *device) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_dev_stats_item *ptr; + int ret; + int i; + + key.objectid = 0; + key.type = BTRFS_DEV_STATS_KEY; + key.offset = device->devid; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); + if (ret < 0) { + printk(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n", + ret, device->name); + goto out; + } + + if (ret == 0 && + btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + /* need to delete old one and insert a new one */ + ret = btrfs_del_item(trans, dev_root, path); + if (ret != 0) { + printk(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n", + device->name, ret); + goto out; + } + ret = 1; + } + + if (ret == 1) { + /* need to insert a new item */ + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, dev_root, path, + &key, sizeof(*ptr)); + if (ret < 0) { + printk(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n", + device->name, ret); + goto out; + } + } + + eb = path->nodes[0]; + ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_set_dev_stats_value(eb, ptr, i, + btrfs_dev_stat_read(device, i)); + btrfs_mark_buffer_dirty(eb); + +out: + btrfs_free_path(path); + return ret; +} + +/* + * called from commit_transaction. Writes all changed device stats to disk. 
+ */ +int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + int ret = 0; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->dev_stats_valid || !device->dev_stats_dirty) + continue; + + ret = update_dev_stat_item(trans, dev_root, device); + if (!ret) + device->dev_stats_dirty = 0; + } + mutex_unlock(&fs_devices->device_list_mutex); + + return ret; +} + void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) { btrfs_dev_stat_inc(dev, index); @@ -4662,6 +4821,8 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) { + if (!dev->dev_stats_valid) + return; printk_ratelimited(KERN_ERR "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", dev->name, @@ -4674,6 +4835,17 @@ void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) BTRFS_DEV_STAT_GENERATION_ERRS)); } +static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) +{ + printk(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + dev->name, + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); +} + int btrfs_get_dev_stats(struct btrfs_root *root, struct btrfs_ioctl_get_dev_stats *stats, int reset_after_read) @@ -4690,6 +4862,10 @@ int btrfs_get_dev_stats(struct btrfs_root *root, printk(KERN_WARNING "btrfs: get dev_stats failed, device not found\n"); return -ENODEV; + } else if (!dev->dev_stats_valid) { + printk(KERN_WARNING + "btrfs: get dev_stats failed, not yet valid\n"); + return 
-ENODEV; } else if (reset_after_read) { for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { if (stats->nr_items > i) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6798f8674b1..3406a88ca83 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -109,6 +109,7 @@ struct btrfs_device { /* disk I/O failure stats. For detailed description refer to * enum btrfs_dev_stat_values in ioctl.h */ + int dev_stats_valid; int dev_stats_dirty; /* counters need to be written to disk */ atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; }; @@ -293,6 +294,9 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_root *root, struct btrfs_ioctl_get_dev_stats *stats, int reset_after_read); +int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); +int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) From 018642a1f197887058e97291460b890d296e8953 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Tue, 29 May 2012 18:10:13 +0900 Subject: [PATCH 32/37] Btrfs: return value of btrfs_read_buffer is checked correctly btrfs_read_buffer() has the possibility of returning the error. Therefore, I add the code in which the return value of btrfs_read_buffer() is checked. 
Signed-off-by: Tsutomu Itoh --- fs/btrfs/ctree.c | 6 +++++- fs/btrfs/tree-log.c | 16 +++++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 26847999c64..99fcad631a2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -739,7 +739,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (!cur) return -EIO; } else if (!uptodate) { - btrfs_read_buffer(cur, gen); + err = btrfs_read_buffer(cur, gen); + if (err) { + free_extent_buffer(cur); + return err; + } } } if (search_start == 0) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index eb1ae908582..6f22a4fca8d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1628,7 +1628,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, int i; int ret; - btrfs_read_buffer(eb, gen); + ret = btrfs_read_buffer(eb, gen); + if (ret) + return ret; level = btrfs_header_level(eb); @@ -1749,7 +1751,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, path->slots[*level]++; if (wc->free) { - btrfs_read_buffer(next, ptr_gen); + ret = btrfs_read_buffer(next, ptr_gen); + if (ret) { + free_extent_buffer(next); + return ret; + } btrfs_tree_lock(next); btrfs_set_lock_blocking(next); @@ -1766,7 +1772,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, free_extent_buffer(next); continue; } - btrfs_read_buffer(next, ptr_gen); + ret = btrfs_read_buffer(next, ptr_gen); + if (ret) { + free_extent_buffer(next); + return ret; + } WARN_ON(*level <= 0); if (path->nodes[*level-1]) From 22ee6985de7d3e81ec0cef9c6ba01b45ad1bafeb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 29 May 2012 16:57:49 -0400 Subject: [PATCH 33/37] Btrfs: check to see if the inode is in the log before fsyncing We have this check down in the actual logging code, but this is after we start a transaction and all that good stuff. 
So move the helper inode_in_log() out so we can call it in fsync() and avoid starting a transaction altogether and just exit if we've already fsync()'ed this file recently. You would notice this issue if you fsync()'ed a file over and over again until the transaction committed. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 13 +++++++++++++ fs/btrfs/file.c | 3 ++- fs/btrfs/tree-log.c | 17 +---------------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ce2c9d60031..e616f8872e6 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -199,4 +199,17 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_root *root, return false; } +static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + mutex_lock(&root->log_mutex); + if (BTRFS_I(inode)->logged_trans == generation && + BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) + ret = 1; + mutex_unlock(&root->log_mutex); + return ret; +} + #endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2e63cdc2b09..876cddd6b2f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1552,7 +1552,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * syncing */ smp_mb(); - if (BTRFS_I(inode)->last_trans <= + if (btrfs_inode_in_log(inode, root->fs_info->generation) || + BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed) { BTRFS_I(inode)->last_trans = 0; mutex_unlock(&inode->i_mutex); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6f22a4fca8d..425014bdc6a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3038,21 +3038,6 @@ out: return ret; } -static int inode_in_log(struct btrfs_trans_handle *trans, - struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; - - mutex_lock(&root->log_mutex); - if (BTRFS_I(inode)->logged_trans == 
trans->transid && - BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) - ret = 1; - mutex_unlock(&root->log_mutex); - return ret; -} - - /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. A minimal inode and backref @@ -3093,7 +3078,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - if (inode_in_log(trans, inode)) { + if (btrfs_inode_in_log(inode, trans->transid)) { ret = BTRFS_NO_LOG_SYNC; goto end_no_trans; } From 5bdbeb2187a99d690b374a8c5ec9911fcbcfe739 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 29 May 2012 16:59:49 -0400 Subject: [PATCH 34/37] Btrfs: fix return code in drop_objectid_items So dpkg fsync()'s the file and the directory containing the file whenever it writes to a file which is really slow in btrfs. This is partly because fsync()'ing a directory _always_ committed the transaction instead of just going to the tree log. This is because drop_objectid_items() would return 1 since it does a btrfs_search_slot() which returns 1. In tree-log jargon this means that we have to commit the transaction to be safe. So just check if ret is greater than 0 and set it to 0 if it does. With this patch we now use the tree-log instead of committing the entire transaction, which is twice as fast on my box. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 425014bdc6a..2017d0ff511 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2667,6 +2667,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, btrfs_release_path(path); } btrfs_release_path(path); + if (ret > 0) + ret = 0; return ret; } From 3d136a1131c66f7d26fb171e4c5b0b8baacd3129 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 3 Feb 2012 11:20:04 +0100 Subject: [PATCH 35/37] Btrfs: set ioprio of scrub readahead to idle Reduce ioprio class of scrub readahead threads to idle priority. This setting is fixed. This priority has shown the best performance during all measurements. Signed-off-by: Stefan Behrens --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/reada.c | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e176f8c551f..1c665ebe47e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -173,6 +173,9 @@ static int btrfs_csum_sizes[] = { 4, 0 }; #define BTRFS_FT_XATTR 8 #define BTRFS_FT_MAX 9 +/* ioprio of readahead is set to idle */ +#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) + /* * The key defines the order in the tree, and so it also defines (optimal) * block layout. 
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index ac5d0108588..48a4882d8ad 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -718,13 +718,18 @@ static void reada_start_machine_worker(struct btrfs_work *work) { struct reada_machine_work *rmw; struct btrfs_fs_info *fs_info; + int old_ioprio; rmw = container_of(work, struct reada_machine_work, work); fs_info = rmw->fs_info; kfree(rmw); + old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current), + task_nice_ioprio(current)); + set_task_ioprio(current, BTRFS_IOPRIO_READA); __reada_start_machine(fs_info); + set_task_ioprio(current, old_ioprio); } static void __reada_start_machine(struct btrfs_fs_info *fs_info) From 86ff7ffce0b93aed14df4c8dcedd05bb5e2fdfbc Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 24 Apr 2012 18:10:16 +0200 Subject: [PATCH 36/37] Btrfs: fix runtime warning in check-integrity check data mode If a file_extent_item was located at the very end of a leaf and there was not enough space to hold a full item, but there was enough space to hold one of type BTRFS_FILE_EXTENT_INLINE or PREALLOC, and it was only such a short item, a warning was printed anyway. This check is now fixed. 
Signed-off-by: Stefan Behrens --- fs/btrfs/check-integrity.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 7f6cc359e44..ed761838932 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1428,6 +1428,28 @@ static int btrfsic_handle_extent_data( file_extent_item_offset = offsetof(struct btrfs_leaf, items) + item_offset; + if (file_extent_item_offset + + offsetof(struct btrfs_file_extent_item, disk_num_bytes) > + block_ctx->len) { + printk(KERN_INFO + "btrfsic: file item out of bounce at logical %llu, dev %s\n", + block_ctx->start, block_ctx->dev->name); + return -1; + } + + btrfsic_read_from_block_data(block_ctx, &file_extent_item, + file_extent_item_offset, + offsetof(struct btrfs_file_extent_item, disk_num_bytes)); + if (BTRFS_FILE_EXTENT_REG != file_extent_item.type || + ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) { + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n", + file_extent_item.type, + (unsigned long long) + le64_to_cpu(file_extent_item.disk_bytenr)); + return 0; + } + if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) > block_ctx->len) { printk(KERN_INFO @@ -1452,9 +1474,6 @@ static int btrfsic_handle_extent_data( le64_to_cpu(file_extent_item.disk_bytenr), (unsigned long long)le64_to_cpu(file_extent_item.offset), (unsigned long long)num_bytes); - if (BTRFS_FILE_EXTENT_REG != file_extent_item.type || - ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) - return 0; while (num_bytes > 0) { u32 chunk_len; int num_copies; From 48235a68a3d1db579fc20d9915815228a1825757 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Wed, 23 May 2012 17:57:49 +0200 Subject: [PATCH 37/37] Btrfs: fix false positive in check-integrity on unmount During unmount, it could happen that the integrity checker printed a warning message "attempt to free 
... on umount which is not yet iodone" which turned out to be a false positive. Signed-off-by: Stefan Behrens --- fs/btrfs/check-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index ed761838932..9cebb1fd6a3 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -3337,7 +3337,7 @@ void btrfsic_unmount(struct btrfs_root *root, btrfsic_block_link_free(l); } - if (b_all->is_iodone) + if (b_all->is_iodone || b_all->never_written) btrfsic_block_free(b_all); else printk(KERN_INFO "btrfs: attempt to free %c-block"