From 6606bb97e146a387932efee263745b7240a11193 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 31 Jul 2009 11:03:58 -0400
Subject: [PATCH 1/5] Btrfs: fix btrfs_remove_from_free_space corner case

Yan Zheng hit a problem where we tried to remove some free space but failed
because we couldn't find the free space entry.  This is because the free space
was held within a bitmap that had a starting offset well before the actual
offset of the free space, and there were free space extents that were in the
same range as that offset, so tree_search_offset returned with NULL because we
couldn't find a free space extent that had that offset.  This is fixed by
making sure that if we fail to find the entry, we re-search again with
bitmap_only set to 1 and do an offset_to_bitmap so we can get the appropriate
bitmap.  A similar problem happens in btrfs_alloc_from_bitmap for the
clustering code, but that is not as bad since we will just go and redo our
cluster allocation.

Also this adds some debugging checks to make sure that the free space we are
trying to remove from the bitmap is in fact there.  This can probably go away
after a while, but since this code is only used by the tree-logging stuff it
would be nice to run with it for a while to make sure there are no problems.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/free-space-cache.c | 73 ++++++++++++++++++++++++++++++++-----
 1 file changed, 64 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index af99b78b288..5edcee3a617 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -414,11 +414,29 @@ static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_gro
 			      u64 *offset, u64 *bytes)
 {
 	u64 end;
+	u64 search_start, search_bytes;
+	int ret;
 
 again:
 	end = bitmap_info->offset +
 		(u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
 
+	/*
+	 * XXX - this can go away after a few releases.
+	 *
+	 * since the only user of btrfs_remove_free_space is the tree logging
+	 * stuff, and the only way to test that is under crash conditions, we
+	 * want to have this debug stuff here just in case somethings not
+	 * working.  Search the bitmap for the space we are trying to use to
+	 * make sure its actually there.  If its not there then we need to stop
+	 * because something has gone wrong.
+	 */
+	search_start = *offset;
+	search_bytes = *bytes;
+	ret = search_bitmap(block_group, bitmap_info, &search_start,
+			    &search_bytes);
+	BUG_ON(ret < 0 || search_start != *offset);
+
 	if (*offset > bitmap_info->offset && *offset + *bytes > end) {
 		bitmap_clear_bits(block_group, bitmap_info, *offset,
 				  end - *offset + 1);
@@ -430,6 +448,7 @@ again:
 	}
 
 	if (*bytes) {
+		struct rb_node *next = rb_next(&bitmap_info->offset_index);
 		if (!bitmap_info->bytes) {
 			unlink_free_space(block_group, bitmap_info);
 			kfree(bitmap_info->bitmap);
@@ -438,16 +457,36 @@ again:
 			recalculate_thresholds(block_group);
 		}
 
-		bitmap_info = tree_search_offset(block_group,
-						 offset_to_bitmap(block_group,
-								  *offset),
-						 1, 0);
-		if (!bitmap_info)
+		/*
+		 * no entry after this bitmap, but we still have bytes to
+		 * remove, so something has gone wrong.
+		 */
+		if (!next)
 			return -EINVAL;
 
+		bitmap_info = rb_entry(next, struct btrfs_free_space,
+				       offset_index);
+
+		/*
+		 * if the next entry isn't a bitmap we need to return to let the
+		 * extent stuff do its work.
+		 */
 		if (!bitmap_info->bitmap)
 			return -EAGAIN;
 
+		/*
+		 * Ok the next item is a bitmap, but it may not actually hold
+		 * the information for the rest of this free space stuff, so
+		 * look for it, and if we don't find it return so we can try
+		 * everything over again.
+		 */
+		search_start = *offset;
+		search_bytes = *bytes;
+		ret = search_bitmap(block_group, bitmap_info, &search_start,
+				    &search_bytes);
+		if (ret < 0 || search_start != *offset)
+			return -EAGAIN;
+
 		goto again;
 	} else if (!bitmap_info->bytes) {
 		unlink_free_space(block_group, bitmap_info);
@@ -644,8 +683,17 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 again:
 	info = tree_search_offset(block_group, offset, 0, 0);
 	if (!info) {
-		WARN_ON(1);
-		goto out_lock;
+		/*
+		 * oops didn't find an extent that matched the space we wanted
+		 * to remove, look for a bitmap instead
+		 */
+		info = tree_search_offset(block_group,
+					  offset_to_bitmap(block_group, offset),
+					  1, 0);
+		if (!info) {
+			WARN_ON(1);
+			goto out_lock;
+		}
 	}
 
 	if (info->bytes < bytes && rb_next(&info->offset_index)) {
@@ -957,8 +1005,15 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
 	if (cluster->block_group != block_group)
 		goto out;
 
-	entry = tree_search_offset(block_group, search_start, 0, 0);
-
+	/*
+	 * search_start is the beginning of the bitmap, but at some point it may
+	 * be a good idea to point to the actual start of the free area in the
+	 * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
+	 * to 1 to make sure we get the bitmap entry
+	 */
+	entry = tree_search_offset(block_group,
+				   offset_to_bitmap(block_group, search_start),
+				   1, 0);
 	if (!entry || !entry->bitmap)
 		goto out;
 

From 013f1b12f4fc479f697acae2f31bad220162cd03 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 31 Jul 2009 14:57:55 -0400
Subject: [PATCH 2/5] Btrfs: make sure the async caching thread advances the
 key

The async caching thread can end up looping forever if a given
search puts it at the last key in a leaf.  It will end up calling
btrfs_next_leaf and then checking if it needs to politely drop
the read semaphore.

Most of the time this looping isn't noticed because it is able to
make progress the next time around.  But, during log replay,
we wait on the async caching thread to finish, and the async thread
is waiting on the commit, and no progress is really made.

The fix used here is to copy the key out of the next leaf,
that way our search lands there properly.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index dc84daee6bc..72a2b9c28e9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -265,10 +265,6 @@ static int caching_kthread(void *data)
 
 	atomic_inc(&block_group->space_info->caching_threads);
 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
-again:
-	/* need to make sure the commit_root doesn't disappear */
-	down_read(&fs_info->extent_commit_sem);
-
 	/*
 	 * We don't want to deadlock with somebody trying to allocate a new
 	 * extent for the extent root while also trying to search the extent
@@ -282,6 +278,10 @@ again:
 	key.objectid = last;
 	key.offset = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+again:
+	/* need to make sure the commit_root doesn't disappear */
+	down_read(&fs_info->extent_commit_sem);
+
 	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
@@ -304,6 +304,19 @@ again:
 
 			if (need_resched() ||
 			    btrfs_transaction_in_commit(fs_info)) {
+				leaf = path->nodes[0];
+
+				/* this shouldn't happen, but if the
+				 * leaf is empty just move on.
+				 */
+				if (btrfs_header_nritems(leaf) == 0)
+					break;
+				/*
+				 * we need to copy the key out so that
+				 * we are sure the next search advances
+				 * us forward in the btree.
+				 */
+				btrfs_item_key_to_cpu(leaf, &key, 0);
 				btrfs_release_path(fs_info->extent_root, path);
 				up_read(&fs_info->extent_commit_sem);
 				schedule_timeout(1);

From 4baf8c9201e88546918cbfa61ea8062c38bc1644 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 7 Aug 2009 13:47:08 -0400
Subject: [PATCH 3/5] Btrfs: remove superfluous NULL pointer check in
 btrfs_rename()

This takes care of the following entry from Dan's list:

fs/btrfs/inode.c +4788 btrfs_rename(36) warning: variable derefenced before check 'old_inode'

Reported-by: Dan Carpenter <error27@gmail.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Eugene Teo <eteo@redhat.com>
Cc: Julia Lawall <julia@diku.dk>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3ea827ddf0f..04b53b5ebe5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4806,8 +4806,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 * and the replacement file is large.  Start IO on it now so
 	 * we don't add too much work to the end of the transaction
 	 */
-	if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
-	    new_inode->i_size &&
+	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
 	    old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
 		filemap_flush(old_inode->i_mapping);
 

From 60f2e8f8a07331097a57ec4abcdc680405579377 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Fri, 7 Aug 2009 13:51:33 -0400
Subject: [PATCH 4/5] Btrfs: correct error-handling zlib error handling

find_zlib_workspace returns an ERR_PTR value in an error case instead of NULL.

A simplified version of the semantic match that finds this problem is as
follows: (http://coccinelle.lip6.fr/)

// <smpl>
@match exists@
expression x, E;
statement S1, S2;
@@

x = find_zlib_workspace(...)
... when != x = E
(
*  if (x == NULL || ...) S1 else S2
|
*  if (x == NULL && ...) S1 else S2
)
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/zlib.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index ecfbce836d3..3e2b90eaa23 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -208,7 +208,7 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
 	*total_in = 0;
 
 	workspace = find_zlib_workspace();
-	if (!workspace)
+	if (IS_ERR(workspace))
 		return -1;
 
 	if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
@@ -366,7 +366,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 	char *kaddr;
 
 	workspace = find_zlib_workspace();
-	if (!workspace)
+	if (IS_ERR(workspace))
 		return -ENOMEM;
 
 	data_in = kmap(pages_in[page_in_index]);
@@ -547,7 +547,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
 		return -ENOMEM;
 
 	workspace = find_zlib_workspace();
-	if (!workspace)
+	if (IS_ERR(workspace))
 		return -ENOMEM;
 
 	workspace->inf_strm.next_in = data_in;

From ceab36edd3d3ad3ffd01d41d6d1e05ac1ff8357e Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Fri, 7 Aug 2009 13:51:33 -0400
Subject: [PATCH 5/5] Btrfs: fix balancing oops when invalidate_inode_pages2
 returns EBUSY

invalidate_inode_pages2_range may return -EBUSY occasionally
which results Oops. This patch fixes the issue by moving
invalidate_inode_pages2_range into a loop and keeping calling
it until the return value is not -EBUSY.

The EBUSY return is temporary, and can happen when the btrfs release page
function is unable to release a page because the EXTENT_LOCK
bit is set.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/relocation.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e71264d1c2c..c04f7f21260 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2553,8 +2553,13 @@ int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
 	last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
 
 	/* make sure the dirty trick played by the caller work */
-	ret = invalidate_inode_pages2_range(inode->i_mapping,
-					    first_index, last_index);
+	while (1) {
+		ret = invalidate_inode_pages2_range(inode->i_mapping,
+						    first_index, last_index);
+		if (ret != -EBUSY)
+			break;
+		schedule_timeout(HZ/10);
+	}
 	if (ret)
 		goto out_unlock;