dect
/
linux-2.6
Archived
13
0
Fork 0

[PATCH] OCFS2: The Second Oracle Cluster Filesystem

The OCFS2 file system module.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
This commit is contained in:
Mark Fasheh 2005-12-15 14:31:24 -08:00 committed by Joel Becker
parent 8df08c89c6
commit ccd979bdbc
55 changed files with 24504 additions and 0 deletions

View File

@ -36,6 +36,8 @@ ntfs.txt
- info and mount options for the NTFS filesystem (Windows NT).
proc.txt
- info on Linux's /proc filesystem.
ocfs2.txt
- info and mount options for the OCFS2 clustered filesystem.
romfs.txt
- Description of the ROMFS filesystem.
smbfs.txt

View File

@ -0,0 +1,55 @@
OCFS2 filesystem
==================
OCFS2 is a general purpose extent based shared disk cluster file
system with many similarities to ext3. It supports 64 bit inode
numbers, and has automatically extending metadata groups which may
also make it attractive for non-clustered use.
You'll want to install the ocfs2-tools package in order to at least
get "mount.ocfs2" and "ocfs2_hb_ctl".
Project web page: http://oss.oracle.com/projects/ocfs2
Tools web page: http://oss.oracle.com/projects/ocfs2-tools
OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
All code copyright 2005 Oracle except when otherwise noted.
CREDITS:
Lots of code taken from ext3 and other projects.
Authors in alphabetical order:
Joel Becker <joel.becker@oracle.com>
Zach Brown <zach.brown@oracle.com>
Mark Fasheh <mark.fasheh@oracle.com>
Kurt Hackel <kurt.hackel@oracle.com>
Sunil Mushran <sunil.mushran@oracle.com>
Manish Singh <manish.singh@oracle.com>
Caveats
=======
Features which OCFS2 does not support yet:
- sparse files
- extended attributes
- shared writeable mmap
- loopback is supported, but data written will not
be cluster coherent.
- quotas
- cluster aware flock
- Directory change notification (F_NOTIFY)
- Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
- POSIX ACLs
- readpages / writepages (not user visible)
Mount options
=============
OCFS2 supports the following mount options:
(*) == default
barrier=1 This enables/disables barriers. barrier=0 disables it,
barrier=1 enables it.
errors=remount-ro(*) Remount the filesystem read-only on an error.
errors=panic Panic and halt the machine if an error occurs.
intr (*) Allow signals to interrupt cluster operations.
nointr Do not allow signals to interrupt cluster
operations.

View File

@ -1905,6 +1905,15 @@ M: ajoshi@shell.unixbox.com
L: linux-nvidia@lists.surfsouth.com
S: Maintained
ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
P: Mark Fasheh
M: mark.fasheh@oracle.com
P: Kurt Hackel
M: kurt.hackel@oracle.com
L: ocfs2-devel@oss.oracle.com
W: http://oss.oracle.com/projects/ocfs2/
S: Supported
OLYMPIC NETWORK DRIVER
P: Peter De Shrijver
M: p2@ace.ulyssis.student.kuleuven.ac.be

33
fs/ocfs2/Makefile Normal file
View File

@ -0,0 +1,33 @@
EXTRA_CFLAGS += -Ifs/ocfs2
EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
obj-$(CONFIG_OCFS2_FS) += ocfs2.o
ocfs2-objs := \
alloc.o \
aops.o \
buffer_head_io.o \
dcache.o \
dir.o \
dlmglue.o \
export.o \
extent_map.o \
file.o \
heartbeat.o \
inode.o \
journal.o \
localalloc.o \
mmap.o \
namei.o \
slot_map.o \
suballoc.o \
super.o \
symlink.o \
sysfile.o \
uptodate.o \
ver.o \
vote.o
obj-$(CONFIG_OCFS2_FS) += cluster/
obj-$(CONFIG_OCFS2_FS) += dlm/

2040
fs/ocfs2/alloc.c Normal file

File diff suppressed because it is too large Load Diff

82
fs/ocfs2/alloc.h Normal file
View File

@ -0,0 +1,82 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* alloc.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_ALLOC_H
#define OCFS2_ALLOC_H
struct ocfs2_alloc_context;
int ocfs2_insert_extent(struct ocfs2_super *osb,
struct ocfs2_journal_handle *handle,
struct inode *inode,
struct buffer_head *fe_bh,
u64 blkno,
u32 new_clusters,
struct ocfs2_alloc_context *meta_ac);
int ocfs2_num_free_extents(struct ocfs2_super *osb,
struct inode *inode,
struct ocfs2_dinode *fe);
/* how many new metadata chunks would an allocation need at maximum? */
static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
{
/*
* Rather than do all the work of determining how much we need
* (involves a ton of reads and locks), just ask for the
* maximal limit. That's a tree depth shift. So, one block for
* level of the tree (current l_tree_depth), one block for the
* new tree_depth==0 extent_block, and one block at the new
* top-of-the tree.
*/
return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
}
int ocfs2_truncate_log_init(struct ocfs2_super *osb);
void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);
void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
int cancel);
int ocfs2_flush_truncate_log(struct ocfs2_super *osb);
int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
int slot_num,
struct ocfs2_dinode **tl_copy);
int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
struct ocfs2_dinode *tl_copy);
struct ocfs2_truncate_context {
struct inode *tc_ext_alloc_inode;
struct buffer_head *tc_ext_alloc_bh;
int tc_ext_alloc_locked; /* is it cluster locked? */
/* these get destroyed once it's passed to ocfs2_commit_truncate. */
struct buffer_head *tc_last_eb_bh;
};
int ocfs2_prepare_truncate(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh,
struct ocfs2_truncate_context **tc);
int ocfs2_commit_truncate(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh,
struct ocfs2_truncate_context *tc);
#endif /* OCFS2_ALLOC_H */

643
fs/ocfs2/aops.c Normal file
View File

@ -0,0 +1,643 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <asm/byteorder.h>
#define MLOG_MASK_PREFIX ML_FILE_IO
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "file.h"
#include "inode.h"
#include "journal.h"
#include "super.h"
#include "symlink.h"
#include "buffer_head_io.h"
static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
int err = -EIO;
int status;
struct ocfs2_dinode *fe = NULL;
struct buffer_head *bh = NULL;
struct buffer_head *buffer_cache_bh = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
void *kaddr;
mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
(unsigned long long)iblock, bh_result, create);
BUG_ON(ocfs2_inode_is_fast_symlink(inode));
if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
(unsigned long long)iblock);
goto bail;
}
status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
OCFS2_I(inode)->ip_blkno,
&bh, OCFS2_BH_CACHED, inode);
if (status < 0) {
mlog_errno(status);
goto bail;
}
fe = (struct ocfs2_dinode *) bh->b_data;
if (!OCFS2_IS_VALID_DINODE(fe)) {
mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
fe->i_blkno, 7, fe->i_signature);
goto bail;
}
if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
le32_to_cpu(fe->i_clusters))) {
mlog(ML_ERROR, "block offset is outside the allocated size: "
"%llu\n", (unsigned long long)iblock);
goto bail;
}
/* We don't use the page cache to create symlink data, so if
* need be, copy it over from the buffer cache. */
if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
iblock;
buffer_cache_bh = sb_getblk(osb->sb, blkno);
if (!buffer_cache_bh) {
mlog(ML_ERROR, "couldn't getblock for symlink!\n");
goto bail;
}
/* we haven't locked out transactions, so a commit
* could've happened. Since we've got a reference on
* the bh, even if it commits while we're doing the
* copy, the data is still good. */
if (buffer_jbd(buffer_cache_bh)
&& ocfs2_inode_is_new(inode)) {
kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
if (!kaddr) {
mlog(ML_ERROR, "couldn't kmap!\n");
goto bail;
}
memcpy(kaddr + (bh_result->b_size * iblock),
buffer_cache_bh->b_data,
bh_result->b_size);
kunmap_atomic(kaddr, KM_USER0);
set_buffer_uptodate(bh_result);
}
brelse(buffer_cache_bh);
}
map_bh(bh_result, inode->i_sb,
le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
err = 0;
bail:
if (bh)
brelse(bh);
mlog_exit(err);
return err;
}
static int ocfs2_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
int err = 0;
u64 p_blkno, past_eof;
mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
(unsigned long long)iblock, bh_result, create);
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
inode, inode->i_ino);
if (S_ISLNK(inode->i_mode)) {
/* this always does I/O for some reason. */
err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
goto bail;
}
/* this can happen if another node truncs after our extend! */
spin_lock(&OCFS2_I(inode)->ip_lock);
if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
OCFS2_I(inode)->ip_clusters))
err = -EIO;
spin_unlock(&OCFS2_I(inode)->ip_lock);
if (err)
goto bail;
err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
NULL);
if (err) {
mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
"%"MLFu64", NULL)\n", err, inode,
(unsigned long long)iblock, p_blkno);
goto bail;
}
map_bh(bh_result, inode->i_sb, p_blkno);
if (bh_result->b_blocknr == 0) {
err = -EIO;
mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" "
"blkno=(%"MLFu64")\n", (unsigned long long)iblock,
p_blkno, OCFS2_I(inode)->ip_blkno);
}
past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof);
if (create && (iblock >= past_eof))
set_buffer_new(bh_result);
bail:
if (err < 0)
err = -EIO;
mlog_exit(err);
return err;
}
static int ocfs2_readpage(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
int ret, unlock = 1;
mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
if (ret != 0) {
if (ret == AOP_TRUNCATED_PAGE)
unlock = 0;
mlog_errno(ret);
goto out;
}
down_read(&OCFS2_I(inode)->ip_alloc_sem);
/*
* i_size might have just been updated as we grabed the meta lock. We
* might now be discovering a truncate that hit on another node.
* block_read_full_page->get_block freaks out if it is asked to read
* beyond the end of a file, so we check here. Callers
* (generic_file_read, fault->nopage) are clever enough to check i_size
* and notice that the page they just read isn't needed.
*
* XXX sys_readahead() seems to get that wrong?
*/
if (start >= i_size_read(inode)) {
char *addr = kmap(page);
memset(addr, 0, PAGE_SIZE);
flush_dcache_page(page);
kunmap(page);
SetPageUptodate(page);
ret = 0;
goto out_alloc;
}
ret = ocfs2_data_lock_with_page(inode, 0, page);
if (ret != 0) {
if (ret == AOP_TRUNCATED_PAGE)
unlock = 0;
mlog_errno(ret);
goto out_alloc;
}
ret = block_read_full_page(page, ocfs2_get_block);
unlock = 0;
ocfs2_data_unlock(inode, 0);
out_alloc:
up_read(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_meta_unlock(inode, 0);
out:
if (unlock)
unlock_page(page);
mlog_exit(ret);
return ret;
}
/* Note: Because we don't support holes, our allocation has
* already happened (allocation writes zeros to the file data)
* so we don't have to worry about ordered writes in
* ocfs2_writepage.
*
* ->writepage is called during the process of invalidating the page cache
* during blocked lock processing. It can't block on any cluster locks
* to during block mapping. It's relying on the fact that the block
* mapping can't have disappeared under the dirty pages that it is
* being asked to write back.
*/
static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
{
int ret;
mlog_entry("(0x%p)\n", page);
ret = block_write_full_page(page, ocfs2_get_block, wbc);
mlog_exit(ret);
return ret;
}
/*
* ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
* from loopback. It must be able to perform its own locking around
* ocfs2_get_block().
*/
int ocfs2_prepare_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
struct inode *inode = page->mapping->host;
int ret;
mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
if (ret != 0) {
mlog_errno(ret);
goto out;
}
down_read(&OCFS2_I(inode)->ip_alloc_sem);
ret = block_prepare_write(page, from, to, ocfs2_get_block);
up_read(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_meta_unlock(inode, 0);
out:
mlog_exit(ret);
return ret;
}
/* Taken from ext3. We don't necessarily need the full blown
* functionality yet, but IMHO it's better to cut and paste the whole
* thing so we can avoid introducing our own bugs (and easily pick up
* their fixes when they happen) --Mark */
static int walk_page_buffers( handle_t *handle,
struct buffer_head *head,
unsigned from,
unsigned to,
int *partial,
int (*fn)( handle_t *handle,
struct buffer_head *bh))
{
struct buffer_head *bh;
unsigned block_start, block_end;
unsigned blocksize = head->b_size;
int err, ret = 0;
struct buffer_head *next;
for ( bh = head, block_start = 0;
ret == 0 && (bh != head || !block_start);
block_start = block_end, bh = next)
{
next = bh->b_this_page;
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
if (partial && !buffer_uptodate(bh))
*partial = 1;
continue;
}
err = (*fn)(handle, bh);
if (!ret)
ret = err;
}
return ret;
}
struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
struct page *page,
unsigned from,
unsigned to)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_journal_handle *handle = NULL;
int ret = 0;
handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
if (!handle) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
if (ocfs2_should_order_data(inode)) {
ret = walk_page_buffers(handle->k_handle,
page_buffers(page),
from, to, NULL,
ocfs2_journal_dirty_data);
if (ret < 0)
mlog_errno(ret);
}
out:
if (ret) {
if (handle)
ocfs2_commit_trans(handle);
handle = ERR_PTR(ret);
}
return handle;
}
static int ocfs2_commit_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
int ret, extending = 0, locklevel = 0;
loff_t new_i_size;
struct buffer_head *di_bh = NULL;
struct inode *inode = page->mapping->host;
struct ocfs2_journal_handle *handle = NULL;
mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
* us to sample inode->i_size here without the metadata lock:
*
* 1) We're currently holding the inode alloc lock, so no
* nodes can change it underneath us.
*
* 2) We've had to take the metadata lock at least once
* already to check for extending writes, hence insuring
* that our current copy is also up to date.
*/
new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
if (new_i_size > i_size_read(inode)) {
extending = 1;
locklevel = 1;
}
ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
if (ret != 0) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_data_lock_with_page(inode, 1, page);
if (ret != 0) {
mlog_errno(ret);
goto out_unlock_meta;
}
if (extending) {
handle = ocfs2_start_walk_page_trans(inode, page, from, to);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
handle = NULL;
goto out_unlock_data;
}
/* Mark our buffer early. We'd rather catch this error up here
* as opposed to after a successful commit_write which would
* require us to set back inode->i_size. */
ret = ocfs2_journal_access(handle, inode, di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out_commit;
}
}
/* might update i_size */
ret = generic_commit_write(file, page, from, to);
if (ret < 0) {
mlog_errno(ret);
goto out_commit;
}
if (extending) {
loff_t size = (u64) i_size_read(inode);
struct ocfs2_dinode *di =
(struct ocfs2_dinode *)di_bh->b_data;
/* ocfs2_mark_inode_dirty is too heavy to use here. */
inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
di->i_size = cpu_to_le64(size);
di->i_ctime = di->i_mtime =
cpu_to_le64(inode->i_mtime.tv_sec);
di->i_ctime_nsec = di->i_mtime_nsec =
cpu_to_le32(inode->i_mtime.tv_nsec);
ret = ocfs2_journal_dirty(handle, di_bh);
if (ret < 0) {
mlog_errno(ret);
goto out_commit;
}
}
BUG_ON(extending && (i_size_read(inode) != new_i_size));
out_commit:
if (handle)
ocfs2_commit_trans(handle);
out_unlock_data:
ocfs2_data_unlock(inode, 1);
out_unlock_meta:
ocfs2_meta_unlock(inode, locklevel);
out:
if (di_bh)
brelse(di_bh);
mlog_exit(ret);
return ret;
}
static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
{
sector_t status;
u64 p_blkno = 0;
int err = 0;
struct inode *inode = mapping->host;
mlog_entry("(block = %llu)\n", (unsigned long long)block);
/* We don't need to lock journal system files, since they aren't
* accessed concurrently from multiple nodes.
*/
if (!INODE_JOURNAL(inode)) {
err = ocfs2_meta_lock(inode, NULL, NULL, 0);
if (err) {
if (err != -ENOENT)
mlog_errno(err);
goto bail;
}
down_read(&OCFS2_I(inode)->ip_alloc_sem);
}
err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
NULL);
if (!INODE_JOURNAL(inode)) {
up_read(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_meta_unlock(inode, 0);
}
if (err) {
mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
(unsigned long long)block);
mlog_errno(err);
goto bail;
}
bail:
status = err ? 0 : p_blkno;
mlog_exit((int)status);
return status;
}
/*
* TODO: Make this into a generic get_blocks function.
*
* From do_direct_io in direct-io.c:
* "So what we do is to permit the ->get_blocks function to populate
* bh.b_size with the size of IO which is permitted at this offset and
* this i_blkbits."
*
* This function is called directly from get_more_blocks in direct-io.c.
*
* called like this: dio->get_blocks(dio->inode, fs_startblk,
* fs_count, map_bh, dio->rw == WRITE);
*/
static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
unsigned long max_blocks,
struct buffer_head *bh_result, int create)
{
int ret;
u64 vbo_max; /* file offset, max_blocks from iblock */
u64 p_blkno;
int contig_blocks;
unsigned char blocksize_bits;
if (!inode || !bh_result) {
mlog(ML_ERROR, "inode or bh_result is null\n");
return -EIO;
}
blocksize_bits = inode->i_sb->s_blocksize_bits;
/* This function won't even be called if the request isn't all
* nicely aligned and of the right size, so there's no need
* for us to check any of that. */
vbo_max = ((u64)iblock + max_blocks) << blocksize_bits;
spin_lock(&OCFS2_I(inode)->ip_lock);
if ((iblock + max_blocks) >
ocfs2_clusters_to_blocks(inode->i_sb,
OCFS2_I(inode)->ip_clusters)) {
spin_unlock(&OCFS2_I(inode)->ip_lock);
ret = -EIO;
goto bail;
}
spin_unlock(&OCFS2_I(inode)->ip_lock);
/* This figures out the size of the next contiguous block, and
* our logical offset */
ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
&contig_blocks);
if (ret) {
mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
(unsigned long long)iblock);
ret = -EIO;
goto bail;
}
map_bh(bh_result, inode->i_sb, p_blkno);
/* make sure we don't map more than max_blocks blocks here as
that's all the kernel will handle at this point. */
if (max_blocks < contig_blocks)
contig_blocks = max_blocks;
bh_result->b_size = contig_blocks << blocksize_bits;
bail:
return ret;
}
/*
* ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
* particularly interested in the aio/dio case. Like the core uses
* i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
* truncation on another.
*/
static void ocfs2_dio_end_io(struct kiocb *iocb,
loff_t offset,
ssize_t bytes,
void *private)
{
struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
/* this io's submitter should not have unlocked this before we could */
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
ocfs2_iocb_clear_rw_locked(iocb);
up_read(&inode->i_alloc_sem);
ocfs2_rw_unlock(inode, 0);
}
static ssize_t ocfs2_direct_IO(int rw,
struct kiocb *iocb,
const struct iovec *iov,
loff_t offset,
unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
int ret;
mlog_entry_void();
ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
inode->i_sb->s_bdev, iov, offset,
nr_segs,
ocfs2_direct_IO_get_blocks,
ocfs2_dio_end_io);
mlog_exit(ret);
return ret;
}
struct address_space_operations ocfs2_aops = {
.readpage = ocfs2_readpage,
.writepage = ocfs2_writepage,
.prepare_write = ocfs2_prepare_write,
.commit_write = ocfs2_commit_write,
.bmap = ocfs2_bmap,
.sync_page = block_sync_page,
.direct_IO = ocfs2_direct_IO
};

41
fs/ocfs2/aops.h Normal file
View File

@ -0,0 +1,41 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_AOPS_H
#define OCFS2_AOPS_H
int ocfs2_prepare_write(struct file *file, struct page *page,
unsigned from, unsigned to);
struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
struct page *page,
unsigned from,
unsigned to);
/* all ocfs2_dio_end_io()'s fault */
#define ocfs2_iocb_is_rw_locked(iocb) \
test_bit(0, (unsigned long *)&iocb->private)
#define ocfs2_iocb_set_rw_locked(iocb) \
set_bit(0, (unsigned long *)&iocb->private)
#define ocfs2_iocb_clear_rw_locked(iocb) \
clear_bit(0, (unsigned long *)&iocb->private)
#endif /* OCFS2_FILE_H */

232
fs/ocfs2/buffer_head_io.c Normal file
View File

@ -0,0 +1,232 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* io.c
*
* Buffer cache handling
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "inode.h"
#include "journal.h"
#include "uptodate.h"
#include "buffer_head_io.h"
int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
struct inode *inode)
{
int ret = 0;
mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n",
(unsigned long long)bh->b_blocknr, inode);
BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
BUG_ON(buffer_jbd(bh));
/* No need to check for a soft readonly file system here. non
* journalled writes are only ever done on system files which
* can get modified during recovery even if read-only. */
if (ocfs2_is_hard_readonly(osb)) {
ret = -EROFS;
goto out;
}
down(&OCFS2_I(inode)->ip_io_sem);
lock_buffer(bh);
set_buffer_uptodate(bh);
/* remove from dirty list before I/O. */
clear_buffer_dirty(bh);
get_bh(bh); /* for end_buffer_write_sync() */
bh->b_end_io = end_buffer_write_sync;
submit_bh(WRITE, bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh)) {
ocfs2_set_buffer_uptodate(inode, bh);
} else {
/* We don't need to remove the clustered uptodate
* information for this bh as it's not marked locally
* uptodate. */
ret = -EIO;
brelse(bh);
}
up(&OCFS2_I(inode)->ip_io_sem);
out:
mlog_exit(ret);
return ret;
}
int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
struct buffer_head *bhs[], int flags,
struct inode *inode)
{
int status = 0;
struct super_block *sb;
int i, ignore_cache = 0;
struct buffer_head *bh;
mlog_entry("(block=(%"MLFu64"), nr=(%d), flags=%d, inode=%p)\n",
block, nr, flags, inode);
if (osb == NULL || osb->sb == NULL || bhs == NULL) {
status = -EINVAL;
mlog_errno(status);
goto bail;
}
if (nr < 0) {
mlog(ML_ERROR, "asked to read %d blocks!\n", nr);
status = -EINVAL;
mlog_errno(status);
goto bail;
}
if (nr == 0) {
mlog(ML_BH_IO, "No buffers will be read!\n");
status = 0;
goto bail;
}
sb = osb->sb;
if (flags & OCFS2_BH_CACHED && !inode)
flags &= ~OCFS2_BH_CACHED;
if (inode)
down(&OCFS2_I(inode)->ip_io_sem);
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
bhs[i] = sb_getblk(sb, block++);
if (bhs[i] == NULL) {
if (inode)
up(&OCFS2_I(inode)->ip_io_sem);
status = -EIO;
mlog_errno(status);
goto bail;
}
}
bh = bhs[i];
ignore_cache = 0;
if (flags & OCFS2_BH_CACHED &&
!ocfs2_buffer_uptodate(inode, bh)) {
mlog(ML_UPTODATE,
"bh (%llu), inode %"MLFu64" not uptodate\n",
(unsigned long long)bh->b_blocknr,
OCFS2_I(inode)->ip_blkno);
ignore_cache = 1;
}
/* XXX: Can we ever get this and *not* have the cached
* flag set? */
if (buffer_jbd(bh)) {
if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
mlog(ML_BH_IO, "trying to sync read a jbd "
"managed bh (blocknr = %llu)\n",
(unsigned long long)bh->b_blocknr);
continue;
}
if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
if (buffer_dirty(bh)) {
/* This should probably be a BUG, or
* at least return an error. */
mlog(ML_BH_IO, "asking me to sync read a dirty "
"buffer! (blocknr = %llu)\n",
(unsigned long long)bh->b_blocknr);
continue;
}
lock_buffer(bh);
if (buffer_jbd(bh)) {
#ifdef CATCH_BH_JBD_RACES
mlog(ML_ERROR, "block %llu had the JBD bit set "
"while I was in lock_buffer!",
(unsigned long long)bh->b_blocknr);
BUG();
#else
unlock_buffer(bh);
continue;
#endif
}
clear_buffer_uptodate(bh);
get_bh(bh); /* for end_buffer_read_sync() */
bh->b_end_io = end_buffer_read_sync;
if (flags & OCFS2_BH_READAHEAD)
submit_bh(READA, bh);
else
submit_bh(READ, bh);
continue;
}
}
status = 0;
for (i = (nr - 1); i >= 0; i--) {
bh = bhs[i];
/* We know this can't have changed as we hold the
* inode sem. Avoid doing any work on the bh if the
* journal has it. */
if (!buffer_jbd(bh))
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
/* Status won't be cleared from here on out,
* so we can safely record this and loop back
* to cleanup the other buffers. Don't need to
* remove the clustered uptodate information
* for this bh as it's not marked locally
* uptodate. */
status = -EIO;
brelse(bh);
bhs[i] = NULL;
continue;
}
if (inode)
ocfs2_set_buffer_uptodate(inode, bh);
}
if (inode)
up(&OCFS2_I(inode)->ip_io_sem);
mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr,
(!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");
bail:
mlog_exit(status);
return status;
}

73
fs/ocfs2/buffer_head_io.h Normal file
View File

@ -0,0 +1,73 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* ocfs2_buffer_head.h
*
* Buffer cache handling functions defined
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_BUFFER_HEAD_IO_H
#define OCFS2_BUFFER_HEAD_IO_H
#include <linux/buffer_head.h>
void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
int uptodate);
static inline int ocfs2_read_block(struct ocfs2_super *osb,
u64 off,
struct buffer_head **bh,
int flags,
struct inode *inode);
int ocfs2_write_block(struct ocfs2_super *osb,
struct buffer_head *bh,
struct inode *inode);
int ocfs2_read_blocks(struct ocfs2_super *osb,
u64 block,
int nr,
struct buffer_head *bhs[],
int flags,
struct inode *inode);
#define OCFS2_BH_CACHED 1
#define OCFS2_BH_READAHEAD 8 /* use this to pass READA down to submit_bh */
static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
struct buffer_head **bh, int flags,
struct inode *inode)
{
int status = 0;
if (bh == NULL) {
printk("ocfs2: bh == NULL\n");
status = -EINVAL;
goto bail;
}
status = ocfs2_read_blocks(osb, off, 1, bh,
flags, inode);
bail:
return status;
}
#endif /* OCFS2_BUFFER_HEAD_IO_H */

91
fs/ocfs2/dcache.c Normal file
View File

@ -0,0 +1,91 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dcache.c
*
* dentry cache handling code
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/namei.h>
#define MLOG_MASK_PREFIX ML_DCACHE
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "dcache.h"
#include "file.h"
#include "inode.h"
static int ocfs2_dentry_revalidate(struct dentry *dentry,
struct nameidata *nd)
{
struct inode *inode = dentry->d_inode;
int ret = 0; /* if all else fails, just return false */
struct ocfs2_super *osb;
mlog_entry("(0x%p, '%.*s')\n", dentry,
dentry->d_name.len, dentry->d_name.name);
/* Never trust a negative dentry - force a new lookup. */
if (inode == NULL) {
mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
dentry->d_name.name);
goto bail;
}
osb = OCFS2_SB(inode->i_sb);
BUG_ON(!osb);
if (inode != osb->root_inode) {
spin_lock(&OCFS2_I(inode)->ip_lock);
/* did we or someone else delete this inode? */
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
spin_unlock(&OCFS2_I(inode)->ip_lock);
mlog(0, "inode (%"MLFu64") deleted, returning false\n",
OCFS2_I(inode)->ip_blkno);
goto bail;
}
spin_unlock(&OCFS2_I(inode)->ip_lock);
if (!inode->i_nlink) {
mlog(0, "Inode %"MLFu64" orphaned, returning false "
"dir = %d\n", OCFS2_I(inode)->ip_blkno,
S_ISDIR(inode->i_mode));
goto bail;
}
}
ret = 1;
bail:
mlog_exit(ret);
return ret;
}
struct dentry_operations ocfs2_dentry_ops = {
.d_revalidate = ocfs2_dentry_revalidate,
};

31
fs/ocfs2/dcache.h Normal file
View File

@ -0,0 +1,31 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dcache.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_DCACHE_H
#define OCFS2_DCACHE_H
extern struct dentry_operations ocfs2_dentry_ops;
#endif /* OCFS2_DCACHE_H */

618
fs/ocfs2/dir.c Normal file
View File

@ -0,0 +1,618 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dir.c
*
* Creates, reads, walks and deletes directory-nodes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* Portions of this code from linux/fs/ext3/dir.c
*
* Copyright (C) 1992, 1993, 1994, 1995
* Remy Card (card@masi.ibp.fr)
* Laboratoire MASI - Institut Blaise pascal
* Universite Pierre et Marie Curie (Paris VI)
*
* from
*
* linux/fs/minix/dir.c
*
* Copyright (C) 1991, 1992 Linux Torvalds
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#define MLOG_MASK_PREFIX ML_NAMEI
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "dir.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "file.h"
#include "inode.h"
#include "journal.h"
#include "namei.h"
#include "suballoc.h"
#include "uptodate.h"
#include "buffer_head_io.h"
static unsigned char ocfs2_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
static int ocfs2_extend_dir(struct ocfs2_super *osb,
struct inode *dir,
struct buffer_head *parent_fe_bh,
struct buffer_head **new_de_bh);
/*
* ocfs2_readdir()
*
*/
int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
int error = 0;
unsigned long offset, blk;
int i, num, stored;
struct buffer_head * bh, * tmp;
struct ocfs2_dir_entry * de;
int err;
struct inode *inode = filp->f_dentry->d_inode;
struct super_block * sb = inode->i_sb;
int have_disk_lock = 0;
mlog_entry("dirino=%"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
stored = 0;
bh = NULL;
error = ocfs2_meta_lock(inode, NULL, NULL, 0);
if (error < 0) {
if (error != -ENOENT)
mlog_errno(error);
/* we haven't got any yet, so propagate the error. */
stored = error;
goto bail;
}
have_disk_lock = 1;
offset = filp->f_pos & (sb->s_blocksize - 1);
while (!error && !stored && filp->f_pos < i_size_read(inode)) {
blk = (filp->f_pos) >> sb->s_blocksize_bits;
bh = ocfs2_bread(inode, blk, &err, 0);
if (!bh) {
mlog(ML_ERROR, "directory #%"MLFu64" contains a hole "
"at offset %lld\n",
OCFS2_I(inode)->ip_blkno,
filp->f_pos);
filp->f_pos += sb->s_blocksize - offset;
continue;
}
/*
* Do the readahead (8k)
*/
if (!offset) {
for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0;
i > 0; i--) {
tmp = ocfs2_bread(inode, ++blk, &err, 1);
if (tmp)
brelse(tmp);
}
}
revalidate:
/* If the dir block has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
* to make sure. */
if (filp->f_version != inode->i_version) {
for (i = 0; i < sb->s_blocksize && i < offset; ) {
de = (struct ocfs2_dir_entry *) (bh->b_data + i);
/* It's too expensive to do a full
* dirent test each time round this
* loop, but we do have to test at
* least that it is non-zero. A
* failure will be detected in the
* dirent test below. */
if (le16_to_cpu(de->rec_len) <
OCFS2_DIR_REC_LEN(1))
break;
i += le16_to_cpu(de->rec_len);
}
offset = i;
filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
| offset;
filp->f_version = inode->i_version;
}
while (!error && filp->f_pos < i_size_read(inode)
&& offset < sb->s_blocksize) {
de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
/* On error, skip the f_pos to the
next block. */
filp->f_pos = (filp->f_pos |
(sb->s_blocksize - 1)) + 1;
brelse(bh);
goto bail;
}
offset += le16_to_cpu(de->rec_len);
if (le64_to_cpu(de->inode)) {
/* We might block in the next section
* if the data destination is
* currently swapped out. So, use a
* version stamp to detect whether or
* not the directory has been modified
* during the copy operation.
*/
unsigned long version = filp->f_version;
unsigned char d_type = DT_UNKNOWN;
if (de->file_type < OCFS2_FT_MAX)
d_type = ocfs2_filetype_table[de->file_type];
error = filldir(dirent, de->name,
de->name_len,
filp->f_pos,
ino_from_blkno(sb, le64_to_cpu(de->inode)),
d_type);
if (error)
break;
if (version != filp->f_version)
goto revalidate;
stored ++;
}
filp->f_pos += le16_to_cpu(de->rec_len);
}
offset = 0;
brelse(bh);
}
stored = 0;
bail:
if (have_disk_lock)
ocfs2_meta_unlock(inode, 0);
mlog_exit(stored);
return stored;
}
/*
* NOTE: this should always be called with parent dir i_sem taken.
*/
int ocfs2_find_files_on_disk(const char *name,
int namelen,
u64 *blkno,
struct inode *inode,
struct buffer_head **dirent_bh,
struct ocfs2_dir_entry **dirent)
{
int status = -ENOENT;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
mlog_entry("(osb=%p, parent=%"MLFu64", name='%.*s', blkno=%p, "
"inode=%p)\n",
osb, OCFS2_I(inode)->ip_blkno, namelen, name, blkno, inode);
*dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
if (!*dirent_bh || !*dirent) {
status = -ENOENT;
goto leave;
}
*blkno = le64_to_cpu((*dirent)->inode);
status = 0;
leave:
if (status < 0) {
*dirent = NULL;
if (*dirent_bh) {
brelse(*dirent_bh);
*dirent_bh = NULL;
}
}
mlog_exit(status);
return status;
}
/* Check for a name within a directory.
*
* Return 0 if the name does not exist
* Return -EEXIST if the directory contains the name
*
* Callers should have i_sem + a cluster lock on dir
*/
int ocfs2_check_dir_for_entry(struct inode *dir,
const char *name,
int namelen)
{
int ret;
struct buffer_head *dirent_bh = NULL;
struct ocfs2_dir_entry *dirent = NULL;
mlog_entry("dir %"MLFu64", name '%.*s'\n", OCFS2_I(dir)->ip_blkno,
namelen, name);
ret = -EEXIST;
dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent);
if (dirent_bh)
goto bail;
ret = 0;
bail:
if (dirent_bh)
brelse(dirent_bh);
mlog_exit(ret);
return ret;
}
/*
* routine to check that the specified directory is empty (for rmdir)
*/
int ocfs2_empty_dir(struct inode *inode)
{
unsigned long offset;
struct buffer_head * bh;
struct ocfs2_dir_entry * de, * de1;
struct super_block * sb;
int err;
sb = inode->i_sb;
if ((i_size_read(inode) <
(OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) ||
!(bh = ocfs2_bread(inode, 0, &err, 0))) {
mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
"no data block\n",
OCFS2_I(inode)->ip_blkno);
return 1;
}
de = (struct ocfs2_dir_entry *) bh->b_data;
de1 = (struct ocfs2_dir_entry *)
((char *)de + le16_to_cpu(de->rec_len));
if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) ||
!le64_to_cpu(de1->inode) ||
strcmp(".", de->name) ||
strcmp("..", de1->name)) {
mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
"no `.' or `..'\n",
OCFS2_I(inode)->ip_blkno);
brelse(bh);
return 1;
}
offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len));
while (offset < i_size_read(inode) ) {
if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) {
brelse(bh);
bh = ocfs2_bread(inode,
offset >> sb->s_blocksize_bits, &err, 0);
if (!bh) {
mlog(ML_ERROR, "directory #%"MLFu64" contains "
"a hole at offset %lu\n",
OCFS2_I(inode)->ip_blkno, offset);
offset += sb->s_blocksize;
continue;
}
de = (struct ocfs2_dir_entry *) bh->b_data;
}
if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
brelse(bh);
return 1;
}
if (le64_to_cpu(de->inode)) {
brelse(bh);
return 0;
}
offset += le16_to_cpu(de->rec_len);
de = (struct ocfs2_dir_entry *)
((char *)de + le16_to_cpu(de->rec_len));
}
brelse(bh);
return 1;
}
/* returns a bh of the 1st new block in the allocation. */
int ocfs2_do_extend_dir(struct super_block *sb,
struct ocfs2_journal_handle *handle,
struct inode *dir,
struct buffer_head *parent_fe_bh,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head **new_bh)
{
int status;
int extend;
u64 p_blkno;
spin_lock(&OCFS2_I(dir)->ip_lock);
extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
spin_unlock(&OCFS2_I(dir)->ip_lock);
if (extend) {
status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
parent_fe_bh, handle,
data_ac, meta_ac, NULL);
BUG_ON(status == -EAGAIN);
if (status < 0) {
mlog_errno(status);
goto bail;
}
}
status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
(sb->s_blocksize_bits - 9)),
1, &p_blkno, NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
}
*new_bh = sb_getblk(sb, p_blkno);
if (!*new_bh) {
status = -EIO;
mlog_errno(status);
goto bail;
}
status = 0;
bail:
mlog_exit(status);
return status;
}
/* assumes you already have a cluster lock on the directory. */
static int ocfs2_extend_dir(struct ocfs2_super *osb,
struct inode *dir,
struct buffer_head *parent_fe_bh,
struct buffer_head **new_de_bh)
{
int status = 0;
int credits, num_free_extents;
loff_t dir_i_size;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
struct ocfs2_alloc_context *data_ac = NULL;
struct ocfs2_alloc_context *meta_ac = NULL;
struct ocfs2_journal_handle *handle = NULL;
struct buffer_head *new_bh = NULL;
struct ocfs2_dir_entry * de;
struct super_block *sb = osb->sb;
mlog_entry_void();
dir_i_size = i_size_read(dir);
mlog(0, "extending dir %"MLFu64" (i_size = %lld)\n",
OCFS2_I(dir)->ip_blkno, dir_i_size);
handle = ocfs2_alloc_handle(osb);
if (handle == NULL) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
/* dir->i_size is always block aligned. */
spin_lock(&OCFS2_I(dir)->ip_lock);
if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
spin_unlock(&OCFS2_I(dir)->ip_lock);
num_free_extents = ocfs2_num_free_extents(osb, dir, fe);
if (num_free_extents < 0) {
status = num_free_extents;
mlog_errno(status);
goto bail;
}
if (!num_free_extents) {
status = ocfs2_reserve_new_metadata(osb, handle,
fe, &meta_ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
goto bail;
}
}
status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
goto bail;
}
credits = ocfs2_calc_extend_credits(sb, fe, 1);
} else {
spin_unlock(&OCFS2_I(dir)->ip_lock);
credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
}
handle = ocfs2_start_trans(osb, handle, credits);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
mlog_errno(status);
goto bail;
}
status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh,
data_ac, meta_ac, &new_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
ocfs2_set_new_buffer_uptodate(dir, new_bh);
status = ocfs2_journal_access(handle, dir, new_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
memset(new_bh->b_data, 0, sb->s_blocksize);
de = (struct ocfs2_dir_entry *) new_bh->b_data;
de->inode = 0;
de->rec_len = cpu_to_le16(sb->s_blocksize);
status = ocfs2_journal_dirty(handle, new_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
dir_i_size += dir->i_sb->s_blocksize;
i_size_write(dir, dir_i_size);
dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size);
status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
*new_de_bh = new_bh;
get_bh(*new_de_bh);
bail:
if (handle)
ocfs2_commit_trans(handle);
if (data_ac)
ocfs2_free_alloc_context(data_ac);
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
if (new_bh)
brelse(new_bh);
mlog_exit(status);
return status;
}
/*
* Search the dir for a good spot, extending it if necessary. The
* block containing an appropriate record is returned in ret_de_bh.
*/
int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
struct inode *dir,
struct buffer_head *parent_fe_bh,
const char *name,
int namelen,
struct buffer_head **ret_de_bh)
{
unsigned long offset;
struct buffer_head * bh = NULL;
unsigned short rec_len;
struct ocfs2_dinode *fe;
struct ocfs2_dir_entry *de;
struct super_block *sb;
int status;
mlog_entry_void();
mlog(0, "getting ready to insert namelen %d into dir %"MLFu64"\n",
namelen, OCFS2_I(dir)->ip_blkno);
BUG_ON(!S_ISDIR(dir->i_mode));
fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir));
sb = dir->i_sb;
if (!namelen) {
status = -EINVAL;
mlog_errno(status);
goto bail;
}
bh = ocfs2_bread(dir, 0, &status, 0);
if (!bh) {
mlog_errno(status);
goto bail;
}
rec_len = OCFS2_DIR_REC_LEN(namelen);
offset = 0;
de = (struct ocfs2_dir_entry *) bh->b_data;
while (1) {
if ((char *)de >= sb->s_blocksize + bh->b_data) {
brelse(bh);
bh = NULL;
if (i_size_read(dir) <= offset) {
status = ocfs2_extend_dir(osb,
dir,
parent_fe_bh,
&bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
BUG_ON(!bh);
*ret_de_bh = bh;
get_bh(*ret_de_bh);
goto bail;
}
bh = ocfs2_bread(dir,
offset >> sb->s_blocksize_bits,
&status,
0);
if (!bh) {
mlog_errno(status);
goto bail;
}
/* move to next block */
de = (struct ocfs2_dir_entry *) bh->b_data;
}
if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
status = -ENOENT;
goto bail;
}
if (ocfs2_match(namelen, name, de)) {
status = -EEXIST;
goto bail;
}
if (((le64_to_cpu(de->inode) == 0) &&
(le16_to_cpu(de->rec_len) >= rec_len)) ||
(le16_to_cpu(de->rec_len) >=
(OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
/* Ok, we found a spot. Return this bh and let
* the caller actually fill it in. */
*ret_de_bh = bh;
get_bh(*ret_de_bh);
status = 0;
goto bail;
}
offset += le16_to_cpu(de->rec_len);
de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
}
status = 0;
bail:
if (bh)
brelse(bh);
mlog_exit(status);
return status;
}

54
fs/ocfs2/dir.h Normal file
View File

@ -0,0 +1,54 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dir.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_DIR_H
#define OCFS2_DIR_H
int ocfs2_check_dir_for_entry(struct inode *dir,
const char *name,
int namelen);
int ocfs2_empty_dir(struct inode *inode); /* FIXME: to namei.c */
int ocfs2_find_files_on_disk(const char *name,
int namelen,
u64 *blkno,
struct inode *inode,
struct buffer_head **dirent_bh,
struct ocfs2_dir_entry **dirent);
int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
struct inode *dir,
struct buffer_head *parent_fe_bh,
const char *name,
int namelen,
struct buffer_head **ret_de_bh);
struct ocfs2_alloc_context;
int ocfs2_do_extend_dir(struct super_block *sb,
struct ocfs2_journal_handle *handle,
struct inode *dir,
struct buffer_head *parent_fe_bh,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head **new_bh);
#endif /* OCFS2_DIR_H */

2904
fs/ocfs2/dlmglue.c Normal file

File diff suppressed because it is too large Load Diff

111
fs/ocfs2/dlmglue.h Normal file
View File

@ -0,0 +1,111 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmglue.h
*
* description here
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef DLMGLUE_H
#define DLMGLUE_H
#define OCFS2_LVB_VERSION 2
struct ocfs2_meta_lvb {
__be32 lvb_version;
__be32 lvb_iclusters;
__be32 lvb_iuid;
__be32 lvb_igid;
__be64 lvb_iatime_packed;
__be64 lvb_ictime_packed;
__be64 lvb_imtime_packed;
__be64 lvb_isize;
__be16 lvb_imode;
__be16 lvb_inlink;
__be32 lvb_reserved[3];
};
/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */
/* don't wait on recovery. */
#define OCFS2_META_LOCK_RECOVERY (0x01)
/* Instruct the dlm not to queue ourselves on the other node. */
#define OCFS2_META_LOCK_NOQUEUE (0x02)
/* don't block waiting for the vote thread, instead return -EAGAIN */
#define OCFS2_LOCK_NONBLOCK (0x04)
int ocfs2_dlm_init(struct ocfs2_super *osb);
void ocfs2_dlm_shutdown(struct ocfs2_super *osb);
void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
enum ocfs2_lock_type type,
struct inode *inode);
void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
int ocfs2_create_new_inode_locks(struct inode *inode);
int ocfs2_drop_inode_locks(struct inode *inode);
int ocfs2_data_lock_full(struct inode *inode,
int write,
int arg_flags);
#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
int ocfs2_data_lock_with_page(struct inode *inode,
int write,
struct page *page);
void ocfs2_data_unlock(struct inode *inode,
int write);
int ocfs2_rw_lock(struct inode *inode, int write);
void ocfs2_rw_unlock(struct inode *inode, int write);
int ocfs2_meta_lock_full(struct inode *inode,
struct ocfs2_journal_handle *handle,
struct buffer_head **ret_bh,
int ex,
int arg_flags);
int ocfs2_meta_lock_with_page(struct inode *inode,
struct ocfs2_journal_handle *handle,
struct buffer_head **ret_bh,
int ex,
struct page *page);
/* 99% of the time we don't want to supply any additional flags --
* those are for very specific cases only. */
#define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_full(i, h, b, e, 0)
void ocfs2_meta_unlock(struct inode *inode,
int ex);
int ocfs2_super_lock(struct ocfs2_super *osb,
int ex);
void ocfs2_super_unlock(struct ocfs2_super *osb,
int ex);
int ocfs2_rename_lock(struct ocfs2_super *osb);
void ocfs2_rename_unlock(struct ocfs2_super *osb);
void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
/* for the vote thread */
void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
/* aids in debugging and tracking lvbs */
void ocfs2_dump_meta_lvb_info(u64 level,
const char *function,
unsigned int line,
struct ocfs2_lock_res *lockres);
#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
#endif /* DLMGLUE_H */

45
fs/ocfs2/endian.h Normal file
View File

@ -0,0 +1,45 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* Copyright (C) 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_ENDIAN_H
#define OCFS2_ENDIAN_H
static inline void le16_add_cpu(__le16 *var, u16 val)
{
*var = cpu_to_le16(le16_to_cpu(*var) + val);
}
static inline void le32_add_cpu(__le32 *var, u32 val)
{
*var = cpu_to_le32(le32_to_cpu(*var) + val);
}
static inline void le32_and_cpu(__le32 *var, u32 val)
{
*var = cpu_to_le32(le32_to_cpu(*var) & val);
}
static inline void be32_add_cpu(__be32 *var, u32 val)
{
*var = cpu_to_be32(be32_to_cpu(*var) + val);
}
#endif /* OCFS2_ENDIAN_H */

248
fs/ocfs2/export.c Normal file
View File

@ -0,0 +1,248 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* export.c
*
* Functions to facilitate NFS exporting
*
* Copyright (C) 2002, 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/types.h>
#define MLOG_MASK_PREFIX ML_EXPORT
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "dir.h"
#include "dlmglue.h"
#include "export.h"
#include "inode.h"
#include "buffer_head_io.h"
struct ocfs2_inode_handle
{
u64 ih_blkno;
u32 ih_generation;
};
static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp)
{
struct ocfs2_inode_handle *handle = vobjp;
struct inode *inode;
struct dentry *result;
mlog_entry("(0x%p, 0x%p)\n", sb, handle);
if (handle->ih_blkno == 0) {
mlog_errno(-ESTALE);
return ERR_PTR(-ESTALE);
}
inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno);
if (IS_ERR(inode)) {
mlog_errno(PTR_ERR(inode));
return (void *)inode;
}
if (handle->ih_generation != inode->i_generation) {
iput(inode);
mlog_errno(-ESTALE);
return ERR_PTR(-ESTALE);
}
result = d_alloc_anon(inode);
if (!result) {
iput(inode);
mlog_errno(-ENOMEM);
return ERR_PTR(-ENOMEM);
}
mlog_exit_ptr(result);
return result;
}
static struct dentry *ocfs2_get_parent(struct dentry *child)
{
int status;
u64 blkno;
struct dentry *parent;
struct inode *inode;
struct inode *dir = child->d_inode;
struct buffer_head *dirent_bh = NULL;
struct ocfs2_dir_entry *dirent;
mlog_entry("(0x%p, '%.*s')\n", child,
child->d_name.len, child->d_name.name);
mlog(0, "find parent of directory %"MLFu64"\n",
OCFS2_I(dir)->ip_blkno);
status = ocfs2_meta_lock(dir, NULL, NULL, 0);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
parent = ERR_PTR(status);
goto bail;
}
status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh,
&dirent);
if (status < 0) {
parent = ERR_PTR(-ENOENT);
goto bail_unlock;
}
inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
if (IS_ERR(inode)) {
mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
parent = ERR_PTR(-EACCES);
goto bail_unlock;
}
parent = d_alloc_anon(inode);
if (!parent) {
iput(inode);
parent = ERR_PTR(-ENOMEM);
}
bail_unlock:
ocfs2_meta_unlock(dir, 0);
if (dirent_bh)
brelse(dirent_bh);
bail:
mlog_exit_ptr(parent);
return parent;
}
static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
int connectable)
{
struct inode *inode = dentry->d_inode;
int len = *max_len;
int type = 1;
u64 blkno;
u32 generation;
mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
dentry->d_name.len, dentry->d_name.name,
fh, len, connectable);
if (len < 3 || (connectable && len < 6)) {
mlog(ML_ERROR, "fh buffer is too small for encoding\n");
type = 255;
goto bail;
}
blkno = OCFS2_I(inode)->ip_blkno;
generation = inode->i_generation;
mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n",
blkno, generation);
len = 3;
fh[0] = cpu_to_le32((u32)(blkno >> 32));
fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
fh[2] = cpu_to_le32(generation);
if (connectable && !S_ISDIR(inode->i_mode)) {
struct inode *parent;
spin_lock(&dentry->d_lock);
parent = dentry->d_parent->d_inode;
blkno = OCFS2_I(parent)->ip_blkno;
generation = parent->i_generation;
fh[3] = cpu_to_le32((u32)(blkno >> 32));
fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
fh[5] = cpu_to_le32(generation);
spin_unlock(&dentry->d_lock);
len = 6;
type = 2;
mlog(0, "Encoding parent: blkno: %"MLFu64", generation: %u\n",
blkno, generation);
}
*max_len = len;
bail:
mlog_exit(type);
return type;
}
static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh,
int fh_len, int fileid_type,
int (*acceptable)(void *context,
struct dentry *de),
void *context)
{
struct ocfs2_inode_handle handle, parent;
struct dentry *ret = NULL;
mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n",
sb, fh, fh_len, fileid_type, acceptable, context);
if (fh_len < 3 || fileid_type > 2)
goto bail;
if (fileid_type == 2) {
if (fh_len < 6)
goto bail;
parent.ih_blkno = (u64)le32_to_cpu(fh[3]) << 32;
parent.ih_blkno |= (u64)le32_to_cpu(fh[4]);
parent.ih_generation = le32_to_cpu(fh[5]);
mlog(0, "Decoding parent: blkno: %"MLFu64", generation: %u\n",
parent.ih_blkno, parent.ih_generation);
}
handle.ih_blkno = (u64)le32_to_cpu(fh[0]) << 32;
handle.ih_blkno |= (u64)le32_to_cpu(fh[1]);
handle.ih_generation = le32_to_cpu(fh[2]);
mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n",
handle.ih_blkno, handle.ih_generation);
ret = ocfs2_export_ops.find_exported_dentry(sb, &handle, &parent,
acceptable, context);
bail:
mlog_exit_ptr(ret);
return ret;
}
struct export_operations ocfs2_export_ops = {
.decode_fh = ocfs2_decode_fh,
.encode_fh = ocfs2_encode_fh,
.get_parent = ocfs2_get_parent,
.get_dentry = ocfs2_get_dentry,
};

31
fs/ocfs2/export.h Normal file
View File

@ -0,0 +1,31 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* export.h
*
* Function prototypes
*
* Copyright (C) 2002, 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_EXPORT_H
#define OCFS2_EXPORT_H
extern struct export_operations ocfs2_export_ops;
#endif /* OCFS2_EXPORT_H */

994
fs/ocfs2/extent_map.c Normal file
View File

@ -0,0 +1,994 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* extent_map.c
*
* In-memory extent map for OCFS2. Man, this code was prettier in
* the library.
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License, version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#define MLOG_MASK_PREFIX ML_EXTENT_MAP
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "extent_map.h"
#include "inode.h"
#include "super.h"
#include "buffer_head_io.h"
/*
* SUCK SUCK SUCK
* Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
*/
struct ocfs2_extent_map_entry {
struct rb_node e_node;
int e_tree_depth;
struct ocfs2_extent_rec e_rec;
};
struct ocfs2_em_insert_context {
int need_left;
int need_right;
struct ocfs2_extent_map_entry *new_ent;
struct ocfs2_extent_map_entry *old_ent;
struct ocfs2_extent_map_entry *left_ent;
struct ocfs2_extent_map_entry *right_ent;
};
static kmem_cache_t *ocfs2_em_ent_cachep = NULL;
static struct ocfs2_extent_map_entry *
ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
u32 cpos, u32 clusters,
struct rb_node ***ret_p,
struct rb_node **ret_parent);
static int ocfs2_extent_map_insert(struct inode *inode,
struct ocfs2_extent_rec *rec,
int tree_depth);
static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
struct ocfs2_extent_map_entry *ent);
static int ocfs2_extent_map_find_leaf(struct inode *inode,
u32 cpos, u32 clusters,
struct ocfs2_extent_list *el);
static int ocfs2_extent_map_lookup_read(struct inode *inode,
u32 cpos, u32 clusters,
struct ocfs2_extent_map_entry **ret_ent);
static int ocfs2_extent_map_try_insert(struct inode *inode,
struct ocfs2_extent_rec *rec,
int tree_depth,
struct ocfs2_em_insert_context *ctxt);
/* returns 1 only if the rec contains all the given clusters -- that is that
* rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
* clusters) is >= the argument's endpoint */
static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
u32 cpos, u32 clusters)
{
if (le32_to_cpu(rec->e_cpos) > cpos)
return 0;
if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
le32_to_cpu(rec->e_clusters))
return 0;
return 1;
}
/*
* Find an entry in the tree that intersects the region passed in.
* Note that this will find straddled intervals, it is up to the
* callers to enforce any boundary conditions.
*
* Callers must hold ip_lock. This lookup is not guaranteed to return
* a tree_depth 0 match, and as such can race inserts if the lock
* were not held.
*
* The rb_node garbage lets insertion share the search. Trivial
* callers pass NULL.
*/
static struct ocfs2_extent_map_entry *
ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
u32 cpos, u32 clusters,
struct rb_node ***ret_p,
struct rb_node **ret_parent)
{
struct rb_node **p = &em->em_extents.rb_node;
struct rb_node *parent = NULL;
struct ocfs2_extent_map_entry *ent = NULL;
while (*p)
{
parent = *p;
ent = rb_entry(parent, struct ocfs2_extent_map_entry,
e_node);
if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
p = &(*p)->rb_left;
ent = NULL;
} else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
le32_to_cpu(ent->e_rec.e_clusters))) {
p = &(*p)->rb_right;
ent = NULL;
} else
break;
}
if (ret_p != NULL)
*ret_p = p;
if (ret_parent != NULL)
*ret_parent = parent;
return ent;
}
/*
* Find the leaf containing the interval we want. While we're on our
* way down the tree, fill in every record we see at any depth, because
* we might want it later.
*
* Note that this code is run without ip_lock. That's because it
* sleeps while reading. If someone is also filling the extent list at
* the same time we are, we might have to restart.
*/
static int ocfs2_extent_map_find_leaf(struct inode *inode,
u32 cpos, u32 clusters,
struct ocfs2_extent_list *el)
{
int i, ret;
struct buffer_head *eb_bh = NULL;
u64 blkno;
u32 rec_end;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_rec *rec;
/*
* The bh data containing the el cannot change here, because
* we hold alloc_sem. So we can do this without other
* locks.
*/
while (el->l_tree_depth)
{
blkno = 0;
for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
rec = &el->l_recs[i];
rec_end = (le32_to_cpu(rec->e_cpos) +
le32_to_cpu(rec->e_clusters));
ret = -EBADR;
if (rec_end > OCFS2_I(inode)->ip_clusters) {
mlog_errno(ret);
goto out_free;
}
if (rec_end <= cpos) {
ret = ocfs2_extent_map_insert(inode, rec,
le16_to_cpu(el->l_tree_depth));
if (ret && (ret != -EEXIST)) {
mlog_errno(ret);
goto out_free;
}
continue;
}
if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
ret = ocfs2_extent_map_insert(inode, rec,
le16_to_cpu(el->l_tree_depth));
if (ret && (ret != -EEXIST)) {
mlog_errno(ret);
goto out_free;
}
continue;
}
/*
* We've found a record that matches our
* interval. We don't insert it because we're
* about to traverse it.
*/
/* Check to see if we're stradling */
ret = -ESRCH;
if (!ocfs2_extent_rec_contains_clusters(rec,
cpos,
clusters)) {
mlog_errno(ret);
goto out_free;
}
/*
* If we've already found a record, the el has
* two records covering the same interval.
* EEEK!
*/
ret = -EBADR;
if (blkno) {
mlog_errno(ret);
goto out_free;
}
blkno = le64_to_cpu(rec->e_blkno);
}
/*
* We don't support holes, and we're still up
* in the branches, so we'd better have found someone
*/
ret = -EBADR;
if (!blkno) {
mlog_errno(ret);
goto out_free;
}
if (eb_bh) {
brelse(eb_bh);
eb_bh = NULL;
}
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
blkno, &eb_bh, OCFS2_BH_CACHED,
inode);
if (ret) {
mlog_errno(ret);
goto out_free;
}
eb = (struct ocfs2_extent_block *)eb_bh->b_data;
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
ret = -EIO;
goto out_free;
}
el = &eb->h_list;
}
if (el->l_tree_depth)
BUG();
for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
rec = &el->l_recs[i];
ret = ocfs2_extent_map_insert(inode, rec,
le16_to_cpu(el->l_tree_depth));
if (ret) {
mlog_errno(ret);
goto out_free;
}
}
ret = 0;
out_free:
if (eb_bh)
brelse(eb_bh);
return ret;
}
/*
* This lookup actually will read from disk. It has one invariant:
* It will never re-traverse blocks. This means that all inserts should
* be new regions or more granular regions (both allowed by insert).
*/
static int ocfs2_extent_map_lookup_read(struct inode *inode,
u32 cpos,
u32 clusters,
struct ocfs2_extent_map_entry **ret_ent)
{
int ret;
u64 blkno;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_map_entry *ent;
struct buffer_head *bh = NULL;
struct ocfs2_extent_block *eb;
struct ocfs2_dinode *di;
struct ocfs2_extent_list *el;
spin_lock(&OCFS2_I(inode)->ip_lock);
ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
if (ent) {
if (!ent->e_tree_depth) {
spin_unlock(&OCFS2_I(inode)->ip_lock);
*ret_ent = ent;
return 0;
}
blkno = le64_to_cpu(ent->e_rec.e_blkno);
spin_unlock(&OCFS2_I(inode)->ip_lock);
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
OCFS2_BH_CACHED, inode);
if (ret) {
mlog_errno(ret);
if (bh)
brelse(bh);
return ret;
}
eb = (struct ocfs2_extent_block *)bh->b_data;
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
brelse(bh);
return -EIO;
}
el = &eb->h_list;
} else {
spin_unlock(&OCFS2_I(inode)->ip_lock);
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
OCFS2_I(inode)->ip_blkno, &bh,
OCFS2_BH_CACHED, inode);
if (ret) {
mlog_errno(ret);
if (bh)
brelse(bh);
return ret;
}
di = (struct ocfs2_dinode *)bh->b_data;
if (!OCFS2_IS_VALID_DINODE(di)) {
brelse(bh);
OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
return -EIO;
}
el = &di->id2.i_list;
}
ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
brelse(bh);
if (ret) {
mlog_errno(ret);
return ret;
}
ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
if (!ent) {
ret = -ESRCH;
mlog_errno(ret);
return ret;
}
if (ent->e_tree_depth)
BUG(); /* FIXME: Make sure this isn't a corruption */
*ret_ent = ent;
return 0;
}
/*
* Callers must hold ip_lock. This can insert pieces of the tree,
* thus racing lookup if the lock weren't held.
*/
static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
struct ocfs2_extent_map_entry *ent)
{
struct rb_node **p, *parent;
struct ocfs2_extent_map_entry *old_ent;
old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
le32_to_cpu(ent->e_rec.e_clusters),
&p, &parent);
if (old_ent)
return -EEXIST;
rb_link_node(&ent->e_node, parent, p);
rb_insert_color(&ent->e_node, &em->em_extents);
return 0;
}
/*
* Simple rule: on any return code other than -EAGAIN, anything left
* in the insert_context will be freed.
*/
static int ocfs2_extent_map_try_insert(struct inode *inode,
struct ocfs2_extent_rec *rec,
int tree_depth,
struct ocfs2_em_insert_context *ctxt)
{
int ret;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_map_entry *old_ent;
ctxt->need_left = 0;
ctxt->need_right = 0;
ctxt->old_ent = NULL;
spin_lock(&OCFS2_I(inode)->ip_lock);
ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
if (!ret) {
ctxt->new_ent = NULL;
goto out_unlock;
}
old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
le32_to_cpu(rec->e_clusters), NULL,
NULL);
if (!old_ent)
BUG();
ret = -EEXIST;
if (old_ent->e_tree_depth < tree_depth)
goto out_unlock;
if (old_ent->e_tree_depth == tree_depth) {
if (!memcmp(rec, &old_ent->e_rec,
sizeof(struct ocfs2_extent_rec)))
ret = 0;
/* FIXME: Should this be ESRCH/EBADR??? */
goto out_unlock;
}
/*
* We do it in this order specifically so that no actual tree
* changes occur until we have all the pieces we need. We
* don't want malloc failures to leave an inconsistent tree.
* Whenever we drop the lock, another process could be
* inserting. Also note that, if another process just beat us
* to an insert, we might not need the same pieces we needed
* the first go round. In the end, the pieces we need will
* be used, and the pieces we don't will be freed.
*/
ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
le32_to_cpu(old_ent->e_rec.e_cpos));
ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
le32_to_cpu(old_ent->e_rec.e_clusters)) >
(le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
ret = -EAGAIN;
if (ctxt->need_left) {
if (!ctxt->left_ent)
goto out_unlock;
*(ctxt->left_ent) = *old_ent;
ctxt->left_ent->e_rec.e_clusters =
cpu_to_le32(le32_to_cpu(rec->e_cpos) -
le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
}
if (ctxt->need_right) {
if (!ctxt->right_ent)
goto out_unlock;
*(ctxt->right_ent) = *old_ent;
ctxt->right_ent->e_rec.e_cpos =
cpu_to_le32(le32_to_cpu(rec->e_cpos) +
le32_to_cpu(rec->e_clusters));
ctxt->right_ent->e_rec.e_clusters =
cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
le32_to_cpu(old_ent->e_rec.e_clusters)) -
le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
}
rb_erase(&old_ent->e_node, &em->em_extents);
/* Now that he's erased, set him up for deletion */
ctxt->old_ent = old_ent;
if (ctxt->need_left) {
ret = ocfs2_extent_map_insert_entry(em,
ctxt->left_ent);
if (ret)
goto out_unlock;
ctxt->left_ent = NULL;
}
if (ctxt->need_right) {
ret = ocfs2_extent_map_insert_entry(em,
ctxt->right_ent);
if (ret)
goto out_unlock;
ctxt->right_ent = NULL;
}
ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
if (!ret)
ctxt->new_ent = NULL;
out_unlock:
spin_unlock(&OCFS2_I(inode)->ip_lock);
return ret;
}
static int ocfs2_extent_map_insert(struct inode *inode,
struct ocfs2_extent_rec *rec,
int tree_depth)
{
int ret;
struct ocfs2_em_insert_context ctxt = {0, };
if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
OCFS2_I(inode)->ip_map.em_clusters) {
ret = -EBADR;
mlog_errno(ret);
return ret;
}
/* Zero e_clusters means a truncated tail record. It better be EOF */
if (!rec->e_clusters) {
if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
OCFS2_I(inode)->ip_map.em_clusters) {
ret = -EBADR;
mlog_errno(ret);
return ret;
}
/* Ignore the truncated tail */
return 0;
}
ret = -ENOMEM;
ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
GFP_KERNEL);
if (!ctxt.new_ent) {
mlog_errno(ret);
return ret;
}
ctxt.new_ent->e_rec = *rec;
ctxt.new_ent->e_tree_depth = tree_depth;
do {
ret = -ENOMEM;
if (ctxt.need_left && !ctxt.left_ent) {
ctxt.left_ent =
kmem_cache_alloc(ocfs2_em_ent_cachep,
GFP_KERNEL);
if (!ctxt.left_ent)
break;
}
if (ctxt.need_right && !ctxt.right_ent) {
ctxt.right_ent =
kmem_cache_alloc(ocfs2_em_ent_cachep,
GFP_KERNEL);
if (!ctxt.right_ent)
break;
}
ret = ocfs2_extent_map_try_insert(inode, rec,
tree_depth, &ctxt);
} while (ret == -EAGAIN);
if (ret < 0)
mlog_errno(ret);
if (ctxt.left_ent)
kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
if (ctxt.right_ent)
kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
if (ctxt.old_ent)
kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
if (ctxt.new_ent)
kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
return ret;
}
/*
* Append this record to the tail of the extent map. It must be
* tree_depth 0. The record might be an extension of an existing
* record, and as such that needs to be handled. eg:
*
* Existing record in the extent map:
*
* cpos = 10, len = 10
* |---------|
*
* New Record:
*
* cpos = 10, len = 20
* |------------------|
*
* The passed record is the new on-disk record. The new_clusters value
* is how many clusters were added to the file. If the append is a
* contiguous append, the new_clusters has been added to
* rec->e_clusters. If the append is an entirely new extent, then
* rec->e_clusters is == new_clusters.
*/
int ocfs2_extent_map_append(struct inode *inode,
struct ocfs2_extent_rec *rec,
u32 new_clusters)
{
int ret;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_map_entry *ent;
struct ocfs2_extent_rec *old;
BUG_ON(!new_clusters);
BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
/*
* Size changed underneath us on disk. Drop any
* straddling records and update our idea of
* i_clusters
*/
ocfs2_extent_map_drop(inode, em->em_clusters - 1);
em->em_clusters = OCFS2_I(inode)->ip_clusters;
}
mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
le32_to_cpu(rec->e_clusters)) !=
(em->em_clusters + new_clusters),
"Inode %"MLFu64":\n"
"rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
"em->em_clusters = %u + new_clusters = %u = %u\n",
OCFS2_I(inode)->ip_blkno,
le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
em->em_clusters, new_clusters,
em->em_clusters + new_clusters);
em->em_clusters += new_clusters;
ret = -ENOENT;
if (le32_to_cpu(rec->e_clusters) > new_clusters) {
/* This is a contiguous append */
ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
NULL, NULL);
if (ent) {
old = &ent->e_rec;
BUG_ON((le32_to_cpu(rec->e_cpos) +
le32_to_cpu(rec->e_clusters)) !=
(le32_to_cpu(old->e_cpos) +
le32_to_cpu(old->e_clusters) +
new_clusters));
if (ent->e_tree_depth == 0) {
BUG_ON(le32_to_cpu(old->e_cpos) !=
le32_to_cpu(rec->e_cpos));
BUG_ON(le64_to_cpu(old->e_blkno) !=
le64_to_cpu(rec->e_blkno));
ret = 0;
}
/*
* Let non-leafs fall through as -ENOENT to
* force insertion of the new leaf.
*/
le32_add_cpu(&old->e_clusters, new_clusters);
}
}
if (ret == -ENOENT)
ret = ocfs2_extent_map_insert(inode, rec, 0);
if (ret < 0)
mlog_errno(ret);
return ret;
}
#if 0
/* Code here is included but defined out as it completes the extent
* map api and may be used in the future. */
/*
* Look up the record containing this cluster offset. This record is
* part of the extent map. Do not free it. Any changes you make to
* it will reflect in the extent map. So, if your last extent
* is (cpos = 10, clusters = 10) and you truncate the file by 5
* clusters, you can do:
*
* ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec);
* rec->e_clusters -= 5;
*
* The lookup does not read from disk. If the map isn't filled in for
* an entry, you won't find it.
*
* Also note that the returned record is valid until alloc_sem is
* dropped. After that, truncate and extend can happen. Caveat Emptor.
*/
int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
struct ocfs2_extent_rec **rec,
int *tree_depth)
{
int ret = -ENOENT;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_map_entry *ent;
*rec = NULL;
if (cpos >= OCFS2_I(inode)->ip_clusters)
return -EINVAL;
if (cpos >= em->em_clusters) {
/*
* Size changed underneath us on disk. Drop any
* straddling records and update our idea of
* i_clusters
*/
ocfs2_extent_map_drop(inode, em->em_clusters - 1);
em->em_clusters = OCFS2_I(inode)->ip_clusters ;
}
ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
NULL, NULL);
if (ent) {
*rec = &ent->e_rec;
if (tree_depth)
*tree_depth = ent->e_tree_depth;
ret = 0;
}
return ret;
}
int ocfs2_extent_map_get_clusters(struct inode *inode,
u32 v_cpos, int count,
u32 *p_cpos, int *ret_count)
{
int ret;
u32 coff, ccount;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_map_entry *ent = NULL;
*p_cpos = ccount = 0;
if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
return -EINVAL;
if ((v_cpos + count) > em->em_clusters) {
/*
* Size changed underneath us on disk. Drop any
* straddling records and update our idea of
* i_clusters
*/
ocfs2_extent_map_drop(inode, em->em_clusters - 1);
em->em_clusters = OCFS2_I(inode)->ip_clusters;
}
ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
if (ret)
return ret;
if (ent) {
/* We should never find ourselves straddling an interval */
if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
v_cpos,
count))
return -ESRCH;
coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
*p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
le64_to_cpu(ent->e_rec.e_blkno)) +
coff;
if (ret_count)
*ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
return 0;
}
return -ENOENT;
}
#endif /* 0 */
int ocfs2_extent_map_get_blocks(struct inode *inode,
u64 v_blkno, int count,
u64 *p_blkno, int *ret_count)
{
int ret;
u64 boff;
u32 cpos, clusters;
int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
struct ocfs2_extent_map_entry *ent = NULL;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_rec *rec;
*p_blkno = 0;
cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
clusters = ocfs2_blocks_to_clusters(inode->i_sb,
(u64)count + bpc - 1);
if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
ret = -EINVAL;
mlog_errno(ret);
return ret;
}
if ((cpos + clusters) > em->em_clusters) {
/*
* Size changed underneath us on disk. Drop any
* straddling records and update our idea of
* i_clusters
*/
ocfs2_extent_map_drop(inode, em->em_clusters - 1);
em->em_clusters = OCFS2_I(inode)->ip_clusters;
}
ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
if (ret) {
mlog_errno(ret);
return ret;
}
if (ent)
{
rec = &ent->e_rec;
/* We should never find ourselves straddling an interval */
if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
ret = -ESRCH;
mlog_errno(ret);
return ret;
}
boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
le32_to_cpu(rec->e_cpos));
boff += (v_blkno & (u64)(bpc - 1));
*p_blkno = le64_to_cpu(rec->e_blkno) + boff;
if (ret_count) {
*ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
le32_to_cpu(rec->e_clusters)) - boff;
}
return 0;
}
return -ENOENT;
}
int ocfs2_extent_map_init(struct inode *inode)
{
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
em->em_extents = RB_ROOT;
em->em_clusters = 0;
return 0;
}
/* Needs the lock */
static void __ocfs2_extent_map_drop(struct inode *inode,
u32 new_clusters,
struct rb_node **free_head,
struct ocfs2_extent_map_entry **tail_ent)
{
struct rb_node *node, *next;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_map_entry *ent;
*free_head = NULL;
ent = NULL;
node = rb_last(&em->em_extents);
while (node)
{
next = rb_prev(node);
ent = rb_entry(node, struct ocfs2_extent_map_entry,
e_node);
if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
break;
rb_erase(&ent->e_node, &em->em_extents);
node->rb_right = *free_head;
*free_head = node;
ent = NULL;
node = next;
}
/* Do we have an entry straddling new_clusters? */
if (tail_ent) {
if (ent &&
((le32_to_cpu(ent->e_rec.e_cpos) +
le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
*tail_ent = ent;
else
*tail_ent = NULL;
}
}
static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
{
struct rb_node *node;
struct ocfs2_extent_map_entry *ent;
while (free_head) {
node = free_head;
free_head = node->rb_right;
ent = rb_entry(node, struct ocfs2_extent_map_entry,
e_node);
kmem_cache_free(ocfs2_em_ent_cachep, ent);
}
}
/*
* Remove all entries past new_clusters, inclusive of an entry that
* contains new_clusters. This is effectively a cache forget.
*
* If you want to also clip the last extent by some number of clusters,
* you need to call ocfs2_extent_map_trunc().
* This code does not check or modify ip_clusters.
*/
int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
{
struct rb_node *free_head = NULL;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_map_entry *ent;
spin_lock(&OCFS2_I(inode)->ip_lock);
__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
if (ent) {
rb_erase(&ent->e_node, &em->em_extents);
ent->e_node.rb_right = free_head;
free_head = &ent->e_node;
}
spin_unlock(&OCFS2_I(inode)->ip_lock);
if (free_head)
__ocfs2_extent_map_drop_cleanup(free_head);
return 0;
}
/*
* Remove all entries past new_clusters and also clip any extent
* straddling new_clusters, if there is one. This does not check
* or modify ip_clusters
*/
int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
{
struct rb_node *free_head = NULL;
struct ocfs2_extent_map_entry *ent = NULL;
spin_lock(&OCFS2_I(inode)->ip_lock);
__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
if (ent)
ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
le32_to_cpu(ent->e_rec.e_cpos));
OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
spin_unlock(&OCFS2_I(inode)->ip_lock);
if (free_head)
__ocfs2_extent_map_drop_cleanup(free_head);
return 0;
}
int __init init_ocfs2_extent_maps(void)
{
ocfs2_em_ent_cachep =
kmem_cache_create("ocfs2_em_ent",
sizeof(struct ocfs2_extent_map_entry),
0, SLAB_HWCACHE_ALIGN, NULL, NULL);
if (!ocfs2_em_ent_cachep)
return -ENOMEM;
return 0;
}
void __exit exit_ocfs2_extent_maps(void)
{
kmem_cache_destroy(ocfs2_em_ent_cachep);
}

46
fs/ocfs2/extent_map.h Normal file
View File

@ -0,0 +1,46 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* extent_map.h
*
* In-memory file extent mappings for OCFS2.
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License, version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef _EXTENT_MAP_H
#define _EXTENT_MAP_H
int init_ocfs2_extent_maps(void);
void exit_ocfs2_extent_maps(void);
/*
* EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
* to be held. The allocation cannot change at all while the map is
* in the process of being updated.
*/
int ocfs2_extent_map_init(struct inode *inode);
int ocfs2_extent_map_append(struct inode *inode,
struct ocfs2_extent_rec *rec,
u32 new_clusters);
int ocfs2_extent_map_get_blocks(struct inode *inode,
u64 v_blkno, int count,
u64 *p_blkno, int *ret_count);
int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);
int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);
#endif /* _EXTENT_MAP_H */

1237
fs/ocfs2/file.c Normal file

File diff suppressed because it is too large Load Diff

57
fs/ocfs2/file.h Normal file
View File

@ -0,0 +1,57 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* file.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_FILE_H
#define OCFS2_FILE_H
extern struct file_operations ocfs2_fops;
extern struct file_operations ocfs2_dops;
extern struct inode_operations ocfs2_file_iops;
extern struct inode_operations ocfs2_special_file_iops;
struct ocfs2_alloc_context;
enum ocfs2_alloc_restarted {
RESTART_NONE = 0,
RESTART_TRANS,
RESTART_META
};
int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
struct inode *inode,
u32 clusters_to_add,
struct buffer_head *fe_bh,
struct ocfs2_journal_handle *handle,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
struct inode *inode,
struct buffer_head *fe_bh,
u64 new_i_size);
#endif /* OCFS2_FILE_H */

378
fs/ocfs2/heartbeat.c Normal file
View File

@ -0,0 +1,378 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* heartbeat.c
*
* Register ourselves with the heartbaet service, keep our node maps
* up to date, and fire off recovery when needed.
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/kmod.h>
#include <cluster/heartbeat.h>
#include <cluster/nodemanager.h>
#include <dlm/dlmapi.h>
#define MLOG_MASK_PREFIX ML_SUPER
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "heartbeat.h"
#include "inode.h"
#include "journal.h"
#include "vote.h"
#include "buffer_head_io.h"
#define OCFS2_HB_NODE_DOWN_PRI (0x0000002)
#define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI
static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
int bit);
static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
int bit);
static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);
static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
struct ocfs2_node_map *from);
static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
struct ocfs2_node_map *from);
void ocfs2_init_node_maps(struct ocfs2_super *osb)
{
spin_lock_init(&osb->node_map_lock);
ocfs2_node_map_init(&osb->mounted_map);
ocfs2_node_map_init(&osb->recovery_map);
ocfs2_node_map_init(&osb->umount_map);
}
static void ocfs2_do_node_down(int node_num,
struct ocfs2_super *osb)
{
BUG_ON(osb->node_num == node_num);
mlog(0, "ocfs2: node down event for %d\n", node_num);
if (!osb->dlm) {
/*
* No DLM means we're not even ready to participate yet.
* We check the slots after the DLM comes up, so we will
* notice the node death then. We can safely ignore it
* here.
*/
return;
}
if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
/* If a node is in the umount map, then we've been
* expecting him to go down and we know ahead of time
* that recovery is not necessary. */
ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
return;
}
ocfs2_recovery_thread(osb, node_num);
ocfs2_remove_node_from_vote_queues(osb, node_num);
}
static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
int node_num,
void *data)
{
ocfs2_do_node_down(node_num, (struct ocfs2_super *) data);
}
/* Called from the dlm when it's about to evict a node. We may also
* get a heartbeat callback later. */
static void ocfs2_dlm_eviction_cb(int node_num,
void *data)
{
struct ocfs2_super *osb = (struct ocfs2_super *) data;
struct super_block *sb = osb->sb;
mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n",
MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num);
ocfs2_do_node_down(node_num, osb);
}
static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
int node_num,
void *data)
{
struct ocfs2_super *osb = data;
BUG_ON(osb->node_num == node_num);
mlog(0, "node up event for %d\n", node_num);
ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
}
void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
{
o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
ocfs2_hb_node_down_cb, osb,
OCFS2_HB_NODE_DOWN_PRI);
o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
/* Not exactly a heartbeat callback, but leads to essentially
* the same path so we set it up here. */
dlm_setup_eviction_cb(&osb->osb_eviction_cb,
ocfs2_dlm_eviction_cb,
osb);
}
/* Most functions here are just stubs for now... */
int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
{
int status;
status = o2hb_register_callback(&osb->osb_hb_down);
if (status < 0) {
mlog_errno(status);
goto bail;
}
status = o2hb_register_callback(&osb->osb_hb_up);
if (status < 0)
mlog_errno(status);
bail:
return status;
}
void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
{
int status;
status = o2hb_unregister_callback(&osb->osb_hb_down);
if (status < 0)
mlog_errno(status);
status = o2hb_unregister_callback(&osb->osb_hb_up);
if (status < 0)
mlog_errno(status);
}
void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
{
int ret;
char *argv[5], *envp[3];
if (!osb->uuid_str) {
/* This can happen if we don't get far enough in mount... */
mlog(0, "No UUID with which to stop heartbeat!\n\n");
return;
}
argv[0] = (char *)o2nm_get_hb_ctl_path();
argv[1] = "-K";
argv[2] = "-u";
argv[3] = osb->uuid_str;
argv[4] = NULL;
mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
/* minimal command environment taken from cpu_run_sbin_hotplug */
envp[0] = "HOME=/";
envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
envp[2] = NULL;
ret = call_usermodehelper(argv[0], argv, envp, 1);
if (ret < 0)
mlog_errno(ret);
}
/* special case -1 for now
* TODO: should *really* make sure the calling func never passes -1!! */
void ocfs2_node_map_init(struct ocfs2_node_map *map)
{
map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
sizeof(unsigned long));
}
static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
int bit)
{
set_bit(bit, map->map);
}
void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit)
{
if (bit==-1)
return;
BUG_ON(bit >= map->num_nodes);
spin_lock(&osb->node_map_lock);
__ocfs2_node_map_set_bit(map, bit);
spin_unlock(&osb->node_map_lock);
}
static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
int bit)
{
clear_bit(bit, map->map);
}
void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit)
{
if (bit==-1)
return;
BUG_ON(bit >= map->num_nodes);
spin_lock(&osb->node_map_lock);
__ocfs2_node_map_clear_bit(map, bit);
spin_unlock(&osb->node_map_lock);
}
int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit)
{
int ret;
if (bit >= map->num_nodes) {
mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes);
BUG();
}
spin_lock(&osb->node_map_lock);
ret = test_bit(bit, map->map);
spin_unlock(&osb->node_map_lock);
return ret;
}
static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map)
{
int bit;
bit = find_next_bit(map->map, map->num_nodes, 0);
if (bit < map->num_nodes)
return 0;
return 1;
}
int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
struct ocfs2_node_map *map)
{
int ret;
BUG_ON(map->num_nodes == 0);
spin_lock(&osb->node_map_lock);
ret = __ocfs2_node_map_is_empty(map);
spin_unlock(&osb->node_map_lock);
return ret;
}
static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
struct ocfs2_node_map *from)
{
BUG_ON(from->num_nodes == 0);
ocfs2_node_map_init(target);
__ocfs2_node_map_set(target, from);
}
/* returns 1 if bit is the only bit set in target, 0 otherwise */
int ocfs2_node_map_is_only(struct ocfs2_super *osb,
struct ocfs2_node_map *target,
int bit)
{
struct ocfs2_node_map temp;
int ret;
spin_lock(&osb->node_map_lock);
__ocfs2_node_map_dup(&temp, target);
__ocfs2_node_map_clear_bit(&temp, bit);
ret = __ocfs2_node_map_is_empty(&temp);
spin_unlock(&osb->node_map_lock);
return ret;
}
static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
struct ocfs2_node_map *from)
{
int num_longs, i;
BUG_ON(target->num_nodes != from->num_nodes);
BUG_ON(target->num_nodes == 0);
num_longs = BITS_TO_LONGS(target->num_nodes);
for (i = 0; i < num_longs; i++)
target->map[i] = from->map[i];
}
/* Returns whether the recovery bit was actually set - it may not be
* if a node is still marked as needing recovery */
int ocfs2_recovery_map_set(struct ocfs2_super *osb,
int num)
{
int set = 0;
spin_lock(&osb->node_map_lock);
__ocfs2_node_map_clear_bit(&osb->mounted_map, num);
if (!test_bit(num, osb->recovery_map.map)) {
__ocfs2_node_map_set_bit(&osb->recovery_map, num);
set = 1;
}
spin_unlock(&osb->node_map_lock);
return set;
}
void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
int num)
{
ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num);
}
int ocfs2_node_map_iterate(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int idx)
{
int i = idx;
idx = O2NM_INVALID_NODE_NUM;
spin_lock(&osb->node_map_lock);
if ((i != O2NM_INVALID_NODE_NUM) &&
(i >= 0) &&
(i < map->num_nodes)) {
while(i < map->num_nodes) {
if (test_bit(i, map->map)) {
idx = i;
break;
}
i++;
}
}
spin_unlock(&osb->node_map_lock);
return idx;
}

67
fs/ocfs2/heartbeat.h Normal file
View File

@ -0,0 +1,67 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* heartbeat.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_HEARTBEAT_H
#define OCFS2_HEARTBEAT_H
void ocfs2_init_node_maps(struct ocfs2_super *osb);
void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
/* node map functions - used to keep track of mounted and in-recovery
* nodes. */
void ocfs2_node_map_init(struct ocfs2_node_map *map);
int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
struct ocfs2_node_map *map);
void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit);
void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit);
int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit);
int ocfs2_node_map_iterate(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int idx);
static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map)
{
return ocfs2_node_map_iterate(osb, map, 0);
}
int ocfs2_recovery_map_set(struct ocfs2_super *osb,
int num);
void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
int num);
/* returns 1 if bit is the only bit set in target, 0 otherwise */
int ocfs2_node_map_is_only(struct ocfs2_super *osb,
struct ocfs2_node_map *target,
int bit);
#endif /* OCFS2_HEARTBEAT_H */

1140
fs/ocfs2/inode.c Normal file

File diff suppressed because it is too large Load Diff

145
fs/ocfs2/inode.h Normal file
View File

@ -0,0 +1,145 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* inode.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_INODE_H
#define OCFS2_INODE_H
/* OCFS2 Inode Private Data */
struct ocfs2_inode_info
{
u64 ip_blkno;
struct ocfs2_lock_res ip_rw_lockres;
struct ocfs2_lock_res ip_meta_lockres;
struct ocfs2_lock_res ip_data_lockres;
/* protects allocation changes on this inode. */
struct rw_semaphore ip_alloc_sem;
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
u32 ip_open_count;
u32 ip_clusters;
struct ocfs2_extent_map ip_map;
struct list_head ip_io_markers;
int ip_orphaned_slot;
struct semaphore ip_io_sem;
/* Used by the journalling code to attach an inode to a
* handle. These are protected by ip_io_sem in order to lock
* out other I/O to the inode until we either commit or
* abort. */
struct list_head ip_handle_list;
struct ocfs2_journal_handle *ip_handle;
u32 ip_flags; /* see below */
/* protected by recovery_lock. */
struct inode *ip_next_orphan;
u32 ip_dir_start_lookup;
/* next two are protected by trans_inc_lock */
/* which transaction were we created on? Zero if none. */
unsigned long ip_created_trans;
/* last transaction we were a part of. */
unsigned long ip_last_trans;
struct ocfs2_caching_info ip_metadata_cache;
struct inode vfs_inode;
};
/*
* Flags for the ip_flags field
*/
/* System file inodes */
#define OCFS2_INODE_SYSTEM_FILE 0x00000001
#define OCFS2_INODE_JOURNAL 0x00000002
#define OCFS2_INODE_BITMAP 0x00000004
/* This inode has been wiped from disk */
#define OCFS2_INODE_DELETED 0x00000008
/* Another node is deleting, so our delete is a nop */
#define OCFS2_INODE_SKIP_DELETE 0x00000010
/* Has the inode been orphaned on another node?
*
* This hints to ocfs2_drop_inode that it should clear i_nlink before
* continuing.
*
* We *only* set this on unlink vote from another node. If the inode
* was locally orphaned, then we're sure of the state and don't need
* to twiddle i_nlink later - it's either zero or not depending on
* whether our unlink succeeded. Otherwise we got this from a node
* whose intention was to orphan the inode, however he may have
* crashed, failed etc, so we let ocfs2_drop_inode zero the value and
* rely on ocfs2_delete_inode to sort things out under the proper
* cluster locks.
*/
#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020
/* Does someone have the file open O_DIRECT */
#define OCFS2_INODE_OPEN_DIRECT 0x00000040
/* Indicates that the metadata cache should be used as an array. */
#define OCFS2_INODE_CACHE_INLINE 0x00000080
static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
{
return container_of(inode, struct ocfs2_inode_info, vfs_inode);
}
#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)
extern kmem_cache_t *ocfs2_inode_cache;
extern struct address_space_operations ocfs2_aops;
struct buffer_head *ocfs2_bread(struct inode *inode, int block,
int *err, int reada);
void ocfs2_clear_inode(struct inode *inode);
void ocfs2_delete_inode(struct inode *inode);
void ocfs2_drop_inode(struct inode *inode);
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff);
struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
u64 blkno,
int delete_vote);
int ocfs2_inode_init_private(struct inode *inode);
int ocfs2_inode_revalidate(struct dentry *dentry);
int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
int create_ino);
void ocfs2_read_inode(struct inode *inode);
void ocfs2_read_inode2(struct inode *inode, void *opaque);
ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
size_t size, loff_t *offp);
void ocfs2_sync_blockdev(struct super_block *sb);
void ocfs2_refresh_inode(struct inode *inode,
struct ocfs2_dinode *fe);
int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
struct inode *inode,
struct buffer_head *bh);
int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
#endif /* OCFS2_INODE_H */

1652
fs/ocfs2/journal.c Normal file

File diff suppressed because it is too large Load Diff

457
fs/ocfs2/journal.h Normal file
View File

@ -0,0 +1,457 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* journal.h
*
* Defines journalling api and structures.
*
* Copyright (C) 2003, 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_JOURNAL_H
#define OCFS2_JOURNAL_H
#include <linux/fs.h>
#include <linux/jbd.h>
#define OCFS2_CHECKPOINT_INTERVAL (8 * HZ)
enum ocfs2_journal_state {
OCFS2_JOURNAL_FREE = 0,
OCFS2_JOURNAL_LOADED,
OCFS2_JOURNAL_IN_SHUTDOWN,
};
struct ocfs2_super;
struct ocfs2_dinode;
struct ocfs2_journal_handle;
struct ocfs2_journal {
enum ocfs2_journal_state j_state; /* Journals current state */
journal_t *j_journal; /* The kernels journal type */
struct inode *j_inode; /* Kernel inode pointing to
* this journal */
struct ocfs2_super *j_osb; /* pointer to the super
* block for the node
* we're currently
* running on -- not
* necessarily the super
* block from the node
* which we usually run
* from (recovery,
* etc) */
struct buffer_head *j_bh; /* Journal disk inode block */
atomic_t j_num_trans; /* Number of transactions
* currently in the system. */
unsigned long j_trans_id;
struct rw_semaphore j_trans_barrier;
wait_queue_head_t j_checkpointed;
spinlock_t j_lock;
struct list_head j_la_cleanups;
struct work_struct j_recovery_work;
};
extern spinlock_t trans_inc_lock;
/* wrap j_trans_id so we never have it equal to zero. */
static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j)
{
unsigned long old_id;
spin_lock(&trans_inc_lock);
old_id = j->j_trans_id++;
if (unlikely(!j->j_trans_id))
j->j_trans_id = 1;
spin_unlock(&trans_inc_lock);
return old_id;
}
static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal,
struct inode *inode)
{
spin_lock(&trans_inc_lock);
OCFS2_I(inode)->ip_last_trans = journal->j_trans_id;
spin_unlock(&trans_inc_lock);
}
/* Used to figure out whether it's safe to drop a metadata lock on an
* inode. Returns true if all the inodes changes have been
* checkpointed to disk. You should be holding the spinlock on the
* metadata lock while calling this to be sure that nobody can take
* the lock and put it on another transaction. */
static inline int ocfs2_inode_fully_checkpointed(struct inode *inode)
{
int ret;
struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal;
spin_lock(&trans_inc_lock);
ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans);
spin_unlock(&trans_inc_lock);
return ret;
}
/* convenience function to check if an inode is still new (has never
* hit disk) Will do you a favor and set created_trans = 0 when you've
* been checkpointed. returns '1' if the inode is still new. */
static inline int ocfs2_inode_is_new(struct inode *inode)
{
int ret;
/* System files are never "new" as they're written out by
* mkfs. This helps us early during mount, before we have the
* journal open and j_trans_id could be junk. */
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
return 0;
spin_lock(&trans_inc_lock);
ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id,
OCFS2_I(inode)->ip_created_trans));
if (!ret)
OCFS2_I(inode)->ip_created_trans = 0;
spin_unlock(&trans_inc_lock);
return ret;
}
static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
struct inode *inode)
{
spin_lock(&trans_inc_lock);
OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id;
spin_unlock(&trans_inc_lock);
}
extern kmem_cache_t *ocfs2_lock_cache;
struct ocfs2_journal_lock {
struct inode *jl_inode;
struct list_head jl_lock_list;
};
struct ocfs2_journal_handle {
handle_t *k_handle; /* kernel handle. */
struct ocfs2_journal *journal;
u32 flags; /* see flags below. */
int max_buffs; /* Buffs reserved by this handle */
/* The following two fields are for ocfs2_handle_add_lock */
int num_locks;
struct list_head locks; /* A bunch of locks to
* release on commit. This
* should be a list_head */
struct list_head inode_list;
};
#define OCFS2_HANDLE_STARTED 1
/* should we sync-commit this handle? */
#define OCFS2_HANDLE_SYNC 2
static inline int ocfs2_handle_started(struct ocfs2_journal_handle *handle)
{
return handle->flags & OCFS2_HANDLE_STARTED;
}
static inline void ocfs2_handle_set_sync(struct ocfs2_journal_handle *handle, int sync)
{
if (sync)
handle->flags |= OCFS2_HANDLE_SYNC;
else
handle->flags &= ~OCFS2_HANDLE_SYNC;
}
/* Exported only for the journal struct init code in super.c. Do not call. */
void ocfs2_complete_recovery(void *data);
/*
* Journal Control:
* Initialize, Load, Shutdown, Wipe a journal.
*
* ocfs2_journal_init - Initialize journal structures in the OSB.
* ocfs2_journal_load - Load the given journal off disk. Replay it if
* there's transactions still in there.
* ocfs2_journal_shutdown - Shutdown a journal, this will flush all
* uncommitted, uncheckpointed transactions.
* ocfs2_journal_wipe - Wipe transactions from a journal. Optionally
* zero out each block.
* ocfs2_recovery_thread - Perform recovery on a node. osb is our own osb.
* ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat
* event on.
* ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
*/
void ocfs2_set_journal_params(struct ocfs2_super *osb);
int ocfs2_journal_init(struct ocfs2_journal *journal,
int *dirty);
void ocfs2_journal_shutdown(struct ocfs2_super *osb);
int ocfs2_journal_wipe(struct ocfs2_journal *journal,
int full);
int ocfs2_journal_load(struct ocfs2_journal *journal);
int ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
void ocfs2_recovery_thread(struct ocfs2_super *osb,
int node_num);
int ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
void ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
{
atomic_set(&osb->needs_checkpoint, 1);
wake_up(&osb->checkpoint_event);
}
static inline void ocfs2_checkpoint_inode(struct inode *inode)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (!ocfs2_inode_fully_checkpointed(inode)) {
/* WARNING: This only kicks off a single
* checkpoint. If someone races you and adds more
* metadata to the journal, you won't know, and will
* wind up waiting *alot* longer than necessary. Right
* now we only use this in clear_inode so that's
* OK. */
ocfs2_start_checkpoint(osb);
wait_event(osb->journal->j_checkpointed,
ocfs2_inode_fully_checkpointed(inode));
}
}
/*
* Transaction Handling:
* Manage the lifetime of a transaction handle.
*
* ocfs2_alloc_handle - Only allocate a handle so we can start putting
* cluster locks on it. To actually change blocks,
* call ocfs2_start_trans with the handle returned
* from this function. You may call ocfs2_commit_trans
* at any time in the lifetime of a handle.
* ocfs2_start_trans - Begin a transaction. Give it an upper estimate of
* the number of blocks that will be changed during
* this handle.
* ocfs2_commit_trans - Complete a handle.
* ocfs2_extend_trans - Extend a handle by nblocks credits. This may
* commit the handle to disk in the process, but will
* not release any locks taken during the transaction.
* ocfs2_journal_access - Notify the handle that we want to journal this
* buffer. Will have to call ocfs2_journal_dirty once
* we've actually dirtied it. Type is one of . or .
* ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
* ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
* the current handle commits.
* ocfs2_handle_add_lock - Sometimes we need to delay lock release
* until after a transaction has been completed. Use
* ocfs2_handle_add_lock to indicate that a lock needs
* to be released at the end of that handle. Locks
* will be released in the order that they are added.
* ocfs2_handle_add_inode - Add a locked inode to a transaction.
*/
/* You must always start_trans with a number of buffs > 0, but it's
* perfectly legal to go through an entire transaction without having
* dirtied any buffers. */
struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb);
struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
struct ocfs2_journal_handle *handle,
int max_buffs);
void ocfs2_commit_trans(struct ocfs2_journal_handle *handle);
int ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
int nblocks);
/*
* Create access is for when we get a newly created buffer and we're
* not gonna read it off disk, but rather fill it ourselves. Right
* now, we don't do anything special with this (it turns into a write
* request), but this is a good placeholder in case we do...
*
* Write access is for when we read a block off disk and are going to
* modify it. This way the journalling layer knows it may need to make
* a copy of that block (if it's part of another, uncommitted
* transaction) before we do so.
*/
#define OCFS2_JOURNAL_ACCESS_CREATE 0
#define OCFS2_JOURNAL_ACCESS_WRITE 1
#define OCFS2_JOURNAL_ACCESS_UNDO 2
int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
struct inode *inode,
struct buffer_head *bh,
int type);
/*
* A word about the journal_access/journal_dirty "dance". It is
* entirely legal to journal_access a buffer more than once (as long
* as the access type is the same -- I'm not sure what will happen if
* access type is different but this should never happen anyway) It is
* also legal to journal_dirty a buffer more than once. In fact, you
* can even journal_access a buffer after you've done a
* journal_access/journal_dirty pair. The only thing you cannot do
* however, is journal_dirty a buffer which you haven't yet passed to
* journal_access at least once.
*
* That said, 99% of the time this doesn't matter and this is what the
* path looks like:
*
* <read a bh>
* ocfs2_journal_access(handle, bh, OCFS2_JOURNAL_ACCESS_WRITE);
* <modify the bh>
* ocfs2_journal_dirty(handle, bh);
*/
int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
struct buffer_head *bh);
int ocfs2_journal_dirty_data(handle_t *handle,
struct buffer_head *bh);
int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
struct inode *inode);
/*
* Use this to protect from other processes reading buffer state while
* it's in flight.
*/
void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
struct inode *inode);
/*
* Credit Macros:
* Convenience macros to calculate number of credits needed.
*
* For convenience sake, I have a set of macros here which calculate
* the *maximum* number of sectors which will be changed for various
* metadata updates.
*/
/* simple file updates like chmod, etc. */
#define OCFS2_INODE_UPDATE_CREDITS 1
/* get one bit out of a suballocator: dinode + group descriptor +
* prev. group desc. if we relink. */
#define OCFS2_SUBALLOC_ALLOC (3)
/* dinode + group descriptor update. We don't relink on free yet. */
#define OCFS2_SUBALLOC_FREE (2)
#define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS
#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
+ OCFS2_TRUNCATE_LOG_UPDATE)
/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
* bitmap block for the new bit) */
#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
* group descriptor + mkdir/symlink blocks */
#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \
+ OCFS2_DIR_LINK_ADDITIONAL_CREDITS)
/* local alloc metadata change + main bitmap updates */
#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \
+ OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE)
/* used when we don't need an allocation change for a dir extend. One
* for the dinode, one for the new block. */
#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
/* file update (nlink, etc) + dir entry block */
#define OCFS2_LINK_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
* dir inode link */
#define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \
+ OCFS2_LINK_CREDITS)
/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
* inode alloc group descriptor */
#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1)
/* dinode update, old dir dinode update, new dir dinode update, old
* dir dir entry, new dir dir entry, dir entry update for renaming
* directory + target unlink */
#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \
+ OCFS2_UNLINK_CREDITS)
static inline int ocfs2_calc_extend_credits(struct super_block *sb,
struct ocfs2_dinode *fe,
u32 bits_wanted)
{
int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks;
/* bitmap dinode, group desc. + relinked group. */
bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
/* we might need to shift tree depth so lets assume an
* absolute worst case of complete fragmentation. Even with
* that, we only need one update for the dinode, and then
* however many metadata chunks needed * a remaining suballoc
* alloc. */
sysfile_bitmap_blocks = 1 +
(OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe);
/* this does not include *new* metadata blocks, which are
* accounted for in sysfile_bitmap_blocks. fe +
* prev. last_eb_blk + blocks along edge of tree.
* calc_symlink_credits passes because we just need 1
* credit for the dinode there. */
dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth);
return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks;
}
static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
{
int blocks = OCFS2_MKNOD_CREDITS;
/* links can be longer than one block so we may update many
* within our single allocated extent. */
blocks += ocfs2_clusters_to_blocks(sb, 1);
return blocks;
}
static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
unsigned int cpg)
{
int blocks;
int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1;
/* parent inode update + new block group header + bitmap inode update
+ bitmap blocks affected */
blocks = 1 + 1 + 1 + bitmap_blocks;
return blocks;
}
static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
unsigned int clusters_to_del,
struct ocfs2_dinode *fe,
struct ocfs2_extent_list *last_el)
{
/* for dinode + all headers in this pass + update to next leaf */
u16 next_free = le16_to_cpu(last_el->l_next_free_rec);
u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth);
int credits = 1 + tree_depth + 1;
int i;
i = next_free - 1;
BUG_ON(i < 0);
/* We may be deleting metadata blocks, so metadata alloc dinode +
one desc. block for each possible delete. */
if (tree_depth && next_free == 1 &&
le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del)
credits += 1 + tree_depth;
/* update to the truncate log. */
credits += OCFS2_TRUNCATE_LOG_UPDATE;
return credits;
}
#endif /* OCFS2_JOURNAL_H */

983
fs/ocfs2/localalloc.c Normal file
View File

@ -0,0 +1,983 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* localalloc.c
*
* Node local data allocation
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/bitops.h>
#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "dlmglue.h"
#include "inode.h"
#include "journal.h"
#include "localalloc.h"
#include "suballoc.h"
#include "super.h"
#include "sysfile.h"
#include "buffer_head_io.h"
#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab))
static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb);
static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
struct ocfs2_dinode *alloc,
u32 numbits);
static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
struct ocfs2_journal_handle *handle,
struct ocfs2_dinode *alloc,
struct inode *main_bm_inode,
struct buffer_head *main_bm_bh);
static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
struct ocfs2_journal_handle *handle,
struct ocfs2_alloc_context **ac,
struct inode **bitmap_inode,
struct buffer_head **bitmap_bh);
static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
struct ocfs2_journal_handle *handle,
struct ocfs2_alloc_context *ac);
static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
struct inode *local_alloc_inode);
/*
* Determine how large our local alloc window should be, in bits.
*
* These values (and the behavior in ocfs2_alloc_should_use_local) have
* been chosen so that most allocations, including new block groups go
* through local alloc.
*/
static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
{
BUG_ON(osb->s_clustersize_bits < 12);
return 2048 >> (osb->s_clustersize_bits - 12);
}
/*
* Tell us whether a given allocation should use the local alloc
* file. Otherwise, it has to go to the main bitmap.
*/
int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
{
int la_bits = ocfs2_local_alloc_window_bits(osb);
if (osb->local_alloc_state != OCFS2_LA_ENABLED)
return 0;
/* la_bits should be at least twice the size (in clusters) of
* a new block group. We want to be sure block group
* allocations go through the local alloc, so allow an
* allocation to take up to half the bitmap. */
if (bits > (la_bits / 2))
return 0;
return 1;
}
int ocfs2_load_local_alloc(struct ocfs2_super *osb)
{
int status = 0;
struct ocfs2_dinode *alloc = NULL;
struct buffer_head *alloc_bh = NULL;
u32 num_used;
struct inode *inode = NULL;
struct ocfs2_local_alloc *la;
mlog_entry_void();
/* read the alloc off disk */
inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
osb->slot_num);
if (!inode) {
status = -EINVAL;
mlog_errno(status);
goto bail;
}
status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
&alloc_bh, 0, inode);
if (status < 0) {
mlog_errno(status);
goto bail;
}
alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
la = OCFS2_LOCAL_ALLOC(alloc);
if (!(le32_to_cpu(alloc->i_flags) &
(OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) {
mlog(ML_ERROR, "Invalid local alloc inode, %"MLFu64"\n",
OCFS2_I(inode)->ip_blkno);
status = -EINVAL;
goto bail;
}
if ((la->la_size == 0) ||
(le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) {
mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n",
le16_to_cpu(la->la_size));
status = -EINVAL;
goto bail;
}
/* do a little verification. */
num_used = ocfs2_local_alloc_count_bits(alloc);
/* hopefully the local alloc has always been recovered before
* we load it. */
if (num_used
|| alloc->id1.bitmap1.i_used
|| alloc->id1.bitmap1.i_total
|| la->la_bm_off)
mlog(ML_ERROR, "Local alloc hasn't been recovered!\n"
"found = %u, set = %u, taken = %u, off = %u\n",
num_used, le32_to_cpu(alloc->id1.bitmap1.i_used),
le32_to_cpu(alloc->id1.bitmap1.i_total),
OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
osb->local_alloc_bh = alloc_bh;
osb->local_alloc_state = OCFS2_LA_ENABLED;
bail:
if (status < 0)
if (alloc_bh)
brelse(alloc_bh);
if (inode)
iput(inode);
mlog_exit(status);
return status;
}
/*
* return any unused bits to the bitmap and write out a clean
* local_alloc.
*
* local_alloc_bh is optional. If not passed, we will simply use the
* one off osb. If you do pass it however, be warned that it *will* be
* returned brelse'd and NULL'd out.*/
void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
{
int status;
struct ocfs2_journal_handle *handle = NULL;
struct inode *local_alloc_inode = NULL;
struct buffer_head *bh = NULL;
struct buffer_head *main_bm_bh = NULL;
struct inode *main_bm_inode = NULL;
struct ocfs2_dinode *alloc_copy = NULL;
struct ocfs2_dinode *alloc = NULL;
mlog_entry_void();
if (osb->local_alloc_state == OCFS2_LA_UNUSED)
goto bail;
local_alloc_inode =
ocfs2_get_system_file_inode(osb,
LOCAL_ALLOC_SYSTEM_INODE,
osb->slot_num);
if (!local_alloc_inode) {
status = -ENOENT;
mlog_errno(status);
goto bail;
}
osb->local_alloc_state = OCFS2_LA_DISABLED;
handle = ocfs2_alloc_handle(osb);
if (!handle) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
if (!main_bm_inode) {
status = -EINVAL;
mlog_errno(status);
goto bail;
}
ocfs2_handle_add_inode(handle, main_bm_inode);
status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
if (status < 0) {
mlog_errno(status);
goto bail;
}
/* WINDOW_MOVE_CREDITS is a bit heavy... */
handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
if (IS_ERR(handle)) {
mlog_errno(PTR_ERR(handle));
handle = NULL;
goto bail;
}
bh = osb->local_alloc_bh;
alloc = (struct ocfs2_dinode *) bh->b_data;
alloc_copy = kmalloc(bh->b_size, GFP_KERNEL);
if (!alloc_copy) {
status = -ENOMEM;
goto bail;
}
memcpy(alloc_copy, alloc, bh->b_size);
status = ocfs2_journal_access(handle, local_alloc_inode, bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
ocfs2_clear_local_alloc(alloc);
status = ocfs2_journal_dirty(handle, bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
brelse(bh);
osb->local_alloc_bh = NULL;
osb->local_alloc_state = OCFS2_LA_UNUSED;
status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
main_bm_inode, main_bm_bh);
if (status < 0)
mlog_errno(status);
bail:
if (handle)
ocfs2_commit_trans(handle);
if (main_bm_bh)
brelse(main_bm_bh);
if (main_bm_inode)
iput(main_bm_inode);
if (local_alloc_inode)
iput(local_alloc_inode);
if (alloc_copy)
kfree(alloc_copy);
mlog_exit_void();
}
/*
* We want to free the bitmap bits outside of any recovery context as
* we'll need a cluster lock to do so, but we must clear the local
* alloc before giving up the recovered nodes journal. To solve this,
* we kmalloc a copy of the local alloc before it's change for the
* caller to process with ocfs2_complete_local_alloc_recovery
*/
int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
int slot_num,
struct ocfs2_dinode **alloc_copy)
{
int status = 0;
struct buffer_head *alloc_bh = NULL;
struct inode *inode = NULL;
struct ocfs2_dinode *alloc;
mlog_entry("(slot_num = %d)\n", slot_num);
*alloc_copy = NULL;
inode = ocfs2_get_system_file_inode(osb,
LOCAL_ALLOC_SYSTEM_INODE,
slot_num);
if (!inode) {
status = -EINVAL;
mlog_errno(status);
goto bail;
}
down(&inode->i_sem);
status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
&alloc_bh, 0, inode);
if (status < 0) {
mlog_errno(status);
goto bail;
}
*alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL);
if (!(*alloc_copy)) {
status = -ENOMEM;
goto bail;
}
memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size);
alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
ocfs2_clear_local_alloc(alloc);
status = ocfs2_write_block(osb, alloc_bh, inode);
if (status < 0)
mlog_errno(status);
bail:
if ((status < 0) && (*alloc_copy)) {
kfree(*alloc_copy);
*alloc_copy = NULL;
}
if (alloc_bh)
brelse(alloc_bh);
if (inode) {
up(&inode->i_sem);
iput(inode);
}
mlog_exit(status);
return status;
}
/*
* Step 2: By now, we've completed the journal recovery, we've stamped
* a clean local alloc on disk and dropped the node out of the
* recovery map. Dlm locks will no longer stall, so lets clear out the
* main bitmap.
*/
int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
struct ocfs2_dinode *alloc)
{
int status;
struct ocfs2_journal_handle *handle = NULL;
struct buffer_head *main_bm_bh = NULL;
struct inode *main_bm_inode = NULL;
mlog_entry_void();
handle = ocfs2_alloc_handle(osb);
if (!handle) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
if (!main_bm_inode) {
status = -EINVAL;
mlog_errno(status);
goto bail;
}
ocfs2_handle_add_inode(handle, main_bm_inode);
status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
if (status < 0) {
mlog_errno(status);
goto bail;
}
handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
mlog_errno(status);
goto bail;
}
/* we want the bitmap change to be recorded on disk asap */
ocfs2_handle_set_sync(handle, 1);
status = ocfs2_sync_local_to_main(osb, handle, alloc,
main_bm_inode, main_bm_bh);
if (status < 0)
mlog_errno(status);
bail:
if (handle)
ocfs2_commit_trans(handle);
if (main_bm_bh)
brelse(main_bm_bh);
if (main_bm_inode)
iput(main_bm_inode);
mlog_exit(status);
return status;
}
/*
* make sure we've got at least bitswanted contiguous bits in the
* local alloc. You lose them when you drop i_sem.
*
* We will add ourselves to the transaction passed in, but may start
* our own in order to shift windows.
*/
int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
struct ocfs2_journal_handle *passed_handle,
u32 bits_wanted,
struct ocfs2_alloc_context *ac)
{
int status;
struct ocfs2_dinode *alloc;
struct inode *local_alloc_inode;
unsigned int free_bits;
mlog_entry_void();
BUG_ON(!passed_handle);
BUG_ON(!ac);
BUG_ON(passed_handle->flags & OCFS2_HANDLE_STARTED);
local_alloc_inode =
ocfs2_get_system_file_inode(osb,
LOCAL_ALLOC_SYSTEM_INODE,
osb->slot_num);
if (!local_alloc_inode) {
status = -ENOENT;
mlog_errno(status);
goto bail;
}
ocfs2_handle_add_inode(passed_handle, local_alloc_inode);
if (osb->local_alloc_state != OCFS2_LA_ENABLED) {
status = -ENOSPC;
goto bail;
}
if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) {
mlog(0, "Asking for more than my max window size!\n");
status = -ENOSPC;
goto bail;
}
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
ocfs2_local_alloc_count_bits(alloc)) {
ocfs2_error(osb->sb, "local alloc inode %"MLFu64" says it has "
"%u free bits, but a count shows %u",
le64_to_cpu(alloc->i_blkno),
le32_to_cpu(alloc->id1.bitmap1.i_used),
ocfs2_local_alloc_count_bits(alloc));
status = -EIO;
goto bail;
}
free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
le32_to_cpu(alloc->id1.bitmap1.i_used);
if (bits_wanted > free_bits) {
/* uhoh, window change time. */
status =
ocfs2_local_alloc_slide_window(osb, local_alloc_inode);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
goto bail;
}
}
ac->ac_inode = igrab(local_alloc_inode);
get_bh(osb->local_alloc_bh);
ac->ac_bh = osb->local_alloc_bh;
ac->ac_which = OCFS2_AC_USE_LOCAL;
status = 0;
bail:
if (local_alloc_inode)
iput(local_alloc_inode);
mlog_exit(status);
return status;
}
int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
struct ocfs2_journal_handle *handle,
struct ocfs2_alloc_context *ac,
u32 min_bits,
u32 *bit_off,
u32 *num_bits)
{
int status, start;
struct inode *local_alloc_inode;
u32 bits_wanted;
void *bitmap;
struct ocfs2_dinode *alloc;
struct ocfs2_local_alloc *la;
mlog_entry_void();
BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
local_alloc_inode = ac->ac_inode;
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
la = OCFS2_LOCAL_ALLOC(alloc);
start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
if (start == -1) {
/* TODO: Shouldn't we just BUG here? */
status = -ENOSPC;
mlog_errno(status);
goto bail;
}
bitmap = la->la_bitmap;
*bit_off = le32_to_cpu(la->la_bm_off) + start;
/* local alloc is always contiguous by nature -- we never
* delete bits from it! */
*num_bits = bits_wanted;
status = ocfs2_journal_access(handle, local_alloc_inode,
osb->local_alloc_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
while(bits_wanted--)
ocfs2_set_bit(start++, bitmap);
alloc->id1.bitmap1.i_used = cpu_to_le32(*num_bits +
le32_to_cpu(alloc->id1.bitmap1.i_used));
status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
status = 0;
bail:
mlog_exit(status);
return status;
}
static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
{
int i;
u8 *buffer;
u32 count = 0;
struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
mlog_entry_void();
buffer = la->la_bitmap;
for (i = 0; i < le16_to_cpu(la->la_size); i++)
count += hweight8(buffer[i]);
mlog_exit(count);
return count;
}
static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
struct ocfs2_dinode *alloc,
u32 numbits)
{
int numfound, bitoff, left, startoff, lastzero;
void *bitmap = NULL;
mlog_entry("(numbits wanted = %u)\n", numbits);
if (!alloc->id1.bitmap1.i_total) {
mlog(0, "No bits in my window!\n");
bitoff = -1;
goto bail;
}
bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
numfound = bitoff = startoff = 0;
lastzero = -1;
left = le32_to_cpu(alloc->id1.bitmap1.i_total);
while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) {
if (bitoff == left) {
/* mlog(0, "bitoff (%d) == left", bitoff); */
break;
}
/* mlog(0, "Found a zero: bitoff = %d, startoff = %d, "
"numfound = %d\n", bitoff, startoff, numfound);*/
/* Ok, we found a zero bit... is it contig. or do we
* start over?*/
if (bitoff == startoff) {
/* we found a zero */
numfound++;
startoff++;
} else {
/* got a zero after some ones */
numfound = 1;
startoff = bitoff+1;
}
/* we got everything we needed */
if (numfound == numbits) {
/* mlog(0, "Found it all!\n"); */
break;
}
}
mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
numfound);
if (numfound == numbits)
bitoff = startoff - numfound;
else
bitoff = -1;
bail:
mlog_exit(bitoff);
return bitoff;
}
static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc)
{
struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
int i;
mlog_entry_void();
alloc->id1.bitmap1.i_total = 0;
alloc->id1.bitmap1.i_used = 0;
la->la_bm_off = 0;
for(i = 0; i < le16_to_cpu(la->la_size); i++)
la->la_bitmap[i] = 0;
mlog_exit_void();
}
#if 0
/* turn this on and uncomment below to aid debugging window shifts. */
static void ocfs2_verify_zero_bits(unsigned long *bitmap,
unsigned int start,
unsigned int count)
{
unsigned int tmp = count;
while(tmp--) {
if (ocfs2_test_bit(start + tmp, bitmap)) {
printk("ocfs2_verify_zero_bits: start = %u, count = "
"%u\n", start, count);
printk("ocfs2_verify_zero_bits: bit %u is set!",
start + tmp);
BUG();
}
}
}
#endif
/*
* sync the local alloc to main bitmap.
*
* assumes you've already locked the main bitmap -- the bitmap inode
* passed is used for caching.
*/
static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
struct ocfs2_journal_handle *handle,
struct ocfs2_dinode *alloc,
struct inode *main_bm_inode,
struct buffer_head *main_bm_bh)
{
int status = 0;
int bit_off, left, count, start;
u64 la_start_blk;
u64 blkno;
void *bitmap;
struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
mlog_entry("total = %u, COUNT = %u, used = %u\n",
le32_to_cpu(alloc->id1.bitmap1.i_total),
ocfs2_local_alloc_count_bits(alloc),
le32_to_cpu(alloc->id1.bitmap1.i_used));
if (!alloc->id1.bitmap1.i_total) {
mlog(0, "nothing to sync!\n");
goto bail;
}
if (le32_to_cpu(alloc->id1.bitmap1.i_used) ==
le32_to_cpu(alloc->id1.bitmap1.i_total)) {
mlog(0, "all bits were taken!\n");
goto bail;
}
la_start_blk = ocfs2_clusters_to_blocks(osb->sb,
le32_to_cpu(la->la_bm_off));
bitmap = la->la_bitmap;
start = count = bit_off = 0;
left = le32_to_cpu(alloc->id1.bitmap1.i_total);
while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start))
!= -1) {
if ((bit_off < left) && (bit_off == start)) {
count++;
start++;
continue;
}
if (count) {
blkno = la_start_blk +
ocfs2_clusters_to_blocks(osb->sb,
start - count);
mlog(0, "freeing %u bits starting at local "
"alloc bit %u (la_start_blk = %"MLFu64", "
"blkno = %"MLFu64")\n", count, start - count,
la_start_blk, blkno);
status = ocfs2_free_clusters(handle, main_bm_inode,
main_bm_bh, blkno, count);
if (status < 0) {
mlog_errno(status);
goto bail;
}
}
if (bit_off >= left)
break;
count = 1;
start = bit_off + 1;
}
bail:
mlog_exit(status);
return status;
}
static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
struct ocfs2_journal_handle *handle,
struct ocfs2_alloc_context **ac,
struct inode **bitmap_inode,
struct buffer_head **bitmap_bh)
{
int status;
*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
if (!(*ac)) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
(*ac)->ac_handle = handle;
(*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb);
status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
goto bail;
}
*bitmap_inode = (*ac)->ac_inode;
igrab(*bitmap_inode);
*bitmap_bh = (*ac)->ac_bh;
get_bh(*bitmap_bh);
status = 0;
bail:
if ((status < 0) && *ac) {
ocfs2_free_alloc_context(*ac);
*ac = NULL;
}
mlog_exit(status);
return status;
}
/*
* pass it the bitmap lock in lock_bh if you have it.
*/
static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
struct ocfs2_journal_handle *handle,
struct ocfs2_alloc_context *ac)
{
int status = 0;
u32 cluster_off, cluster_count;
struct ocfs2_dinode *alloc = NULL;
struct ocfs2_local_alloc *la;
mlog_entry_void();
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
la = OCFS2_LOCAL_ALLOC(alloc);
if (alloc->id1.bitmap1.i_total)
mlog(0, "asking me to alloc a new window over a non-empty "
"one\n");
mlog(0, "Allocating %u clusters for a new window.\n",
ocfs2_local_alloc_window_bits(osb));
/* we used the generic suballoc reserve function, but we set
* everything up nicely, so there's no reason why we can't use
* the more specific cluster api to claim bits. */
status = ocfs2_claim_clusters(osb, handle, ac,
ocfs2_local_alloc_window_bits(osb),
&cluster_off, &cluster_count);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
goto bail;
}
la->la_bm_off = cpu_to_le32(cluster_off);
alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count);
/* just in case... In the future when we find space ourselves,
* we don't have to get all contiguous -- but we'll have to
* set all previously used bits in bitmap and update
* la_bits_set before setting the bits in the main bitmap. */
alloc->id1.bitmap1.i_used = 0;
memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
le16_to_cpu(la->la_size));
mlog(0, "New window allocated:\n");
mlog(0, "window la_bm_off = %u\n",
OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total));
bail:
mlog_exit(status);
return status;
}
/* Note that we do *NOT* lock the local alloc inode here as
* it's been locked already for us. */
static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
struct inode *local_alloc_inode)
{
int status = 0;
struct buffer_head *main_bm_bh = NULL;
struct inode *main_bm_inode = NULL;
struct ocfs2_journal_handle *handle = NULL;
struct ocfs2_dinode *alloc;
struct ocfs2_dinode *alloc_copy = NULL;
struct ocfs2_alloc_context *ac = NULL;
mlog_entry_void();
handle = ocfs2_alloc_handle(osb);
if (!handle) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
/* This will lock the main bitmap for us. */
status = ocfs2_local_alloc_reserve_for_window(osb,
handle,
&ac,
&main_bm_inode,
&main_bm_bh);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
goto bail;
}
handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
mlog_errno(status);
goto bail;
}
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
/* We want to clear the local alloc before doing anything
* else, so that if we error later during this operation,
* local alloc shutdown won't try to double free main bitmap
* bits. Make a copy so the sync function knows which bits to
* free. */
alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_KERNEL);
if (!alloc_copy) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
status = ocfs2_journal_access(handle, local_alloc_inode,
osb->local_alloc_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
ocfs2_clear_local_alloc(alloc);
status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
main_bm_inode, main_bm_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
status = ocfs2_local_alloc_new_window(osb, handle, ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
goto bail;
}
atomic_inc(&osb->alloc_stats.moves);
status = 0;
bail:
if (handle)
ocfs2_commit_trans(handle);
if (main_bm_bh)
brelse(main_bm_bh);
if (main_bm_inode)
iput(main_bm_inode);
if (alloc_copy)
kfree(alloc_copy);
if (ac)
ocfs2_free_alloc_context(ac);
mlog_exit(status);
return status;
}

56
fs/ocfs2/localalloc.h Normal file
View File

@ -0,0 +1,56 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* localalloc.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_LOCALALLOC_H
#define OCFS2_LOCALALLOC_H
int ocfs2_load_local_alloc(struct ocfs2_super *osb);
void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
int node_num,
struct ocfs2_dinode **alloc_copy);
int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
struct ocfs2_dinode *alloc);
int ocfs2_alloc_should_use_local(struct ocfs2_super *osb,
u64 bits);
struct ocfs2_alloc_context;
int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
struct ocfs2_journal_handle *passed_handle,
u32 bits_wanted,
struct ocfs2_alloc_context *ac);
int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
struct ocfs2_journal_handle *handle,
struct ocfs2_alloc_context *ac,
u32 min_bits,
u32 *bit_off,
u32 *num_bits);
#endif /* OCFS2_LOCALALLOC_H */

102
fs/ocfs2/mmap.c Normal file
View File

@ -0,0 +1,102 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* mmap.c
*
* Code to deal with the mess that is clustered mmap.
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
#include <linux/signal.h>
#include <linux/rbtree.h>
#define MLOG_MASK_PREFIX ML_FILE_IO
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "dlmglue.h"
#include "file.h"
#include "inode.h"
#include "mmap.h"
static struct page *ocfs2_nopage(struct vm_area_struct * area,
unsigned long address,
int *type)
{
struct inode *inode = area->vm_file->f_dentry->d_inode;
struct page *page = NOPAGE_SIGBUS;
sigset_t blocked, oldset;
int ret;
mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address);
/* The best way to deal with signals in this path is
* to block them upfront, rather than allowing the
* locking paths to return -ERESTARTSYS. */
sigfillset(&blocked);
/* We should technically never get a bad ret return
* from sigprocmask */
ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
page = filemap_nopage(area, address, type);
ret = sigprocmask(SIG_SETMASK, &oldset, NULL);
if (ret < 0)
mlog_errno(ret);
out:
mlog_exit_ptr(page);
return page;
}
static struct vm_operations_struct ocfs2_file_vm_ops = {
.nopage = ocfs2_nopage,
};
int ocfs2_mmap(struct file *file,
struct vm_area_struct *vma)
{
struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
struct inode *inode = mapping->host;
/* We don't want to support shared writable mappings yet. */
if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE))
&& ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
/* This is -EINVAL because generic_file_readonly_mmap
* returns it in a similar situation. */
return -EINVAL;
}
update_atime(inode);
vma->vm_ops = &ocfs2_file_vm_ops;
return 0;
}

6
fs/ocfs2/mmap.h Normal file
View File

@ -0,0 +1,6 @@
#ifndef OCFS2_MMAP_H
#define OCFS2_MMAP_H
int ocfs2_mmap(struct file *file, struct vm_area_struct *vma);
#endif /* OCFS2_MMAP_H */

2264
fs/ocfs2/namei.c Normal file

File diff suppressed because it is too large Load Diff

58
fs/ocfs2/namei.h Normal file
View File

@ -0,0 +1,58 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* namei.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_NAMEI_H
#define OCFS2_NAMEI_H
extern struct inode_operations ocfs2_dir_iops;
struct dentry *ocfs2_get_parent(struct dentry *child);
int ocfs2_check_dir_entry (struct inode *dir,
struct ocfs2_dir_entry *de,
struct buffer_head *bh,
unsigned long offset);
struct buffer_head *ocfs2_find_entry(const char *name,
int namelen,
struct inode *dir,
struct ocfs2_dir_entry **res_dir);
int ocfs2_orphan_del(struct ocfs2_super *osb,
struct ocfs2_journal_handle *handle,
struct inode *orphan_dir_inode,
struct inode *inode,
struct buffer_head *orphan_dir_bh);
static inline int ocfs2_match(int len,
const char * const name,
struct ocfs2_dir_entry *de)
{
if (len != de->name_len)
return 0;
if (!de->inode)
return 0;
return !memcmp(name, de->name, len);
}
#endif /* OCFS2_NAMEI_H */

109
fs/ocfs2/ocfs1_fs_compat.h Normal file
View File

@ -0,0 +1,109 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* ocfs1_fs_compat.h
*
* OCFS1 volume header definitions. OCFS2 creates valid but unmountable
* OCFS1 volume headers on the first two sectors of an OCFS2 volume.
* This allows an OCFS1 volume to see the partition and cleanly fail to
* mount it.
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License, version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef _OCFS1_FS_COMPAT_H
#define _OCFS1_FS_COMPAT_H
#define OCFS1_MAX_VOL_SIGNATURE_LEN 128
#define OCFS1_MAX_MOUNT_POINT_LEN 128
#define OCFS1_MAX_VOL_ID_LENGTH 16
#define OCFS1_MAX_VOL_LABEL_LEN 64
#define OCFS1_MAX_CLUSTER_NAME_LEN 64
#define OCFS1_MAJOR_VERSION (2)
#define OCFS1_MINOR_VERSION (0)
#define OCFS1_VOLUME_SIGNATURE "OracleCFS"
/*
* OCFS1 superblock. Lives at sector 0.
*/
struct ocfs1_vol_disk_hdr
{
/*00*/ __u32 minor_version;
__u32 major_version;
/*08*/ __u8 signature[OCFS1_MAX_VOL_SIGNATURE_LEN];
/*88*/ __u8 mount_point[OCFS1_MAX_MOUNT_POINT_LEN];
/*108*/ __u64 serial_num;
/*110*/ __u64 device_size;
__u64 start_off;
/*120*/ __u64 bitmap_off;
__u64 publ_off;
/*130*/ __u64 vote_off;
__u64 root_bitmap_off;
/*140*/ __u64 data_start_off;
__u64 root_bitmap_size;
/*150*/ __u64 root_off;
__u64 root_size;
/*160*/ __u64 cluster_size;
__u64 num_nodes;
/*170*/ __u64 num_clusters;
__u64 dir_node_size;
/*180*/ __u64 file_node_size;
__u64 internal_off;
/*190*/ __u64 node_cfg_off;
__u64 node_cfg_size;
/*1A0*/ __u64 new_cfg_off;
__u32 prot_bits;
__s32 excl_mount;
/*1B0*/
};
struct ocfs1_disk_lock
{
/*00*/ __u32 curr_master;
__u8 file_lock;
__u8 compat_pad[3]; /* Not in orignal definition. Used to
make the already existing alignment
explicit */
__u64 last_write_time;
/*10*/ __u64 last_read_time;
__u32 writer_node_num;
__u32 reader_node_num;
/*20*/ __u64 oin_node_map;
__u64 dlock_seq_num;
/*30*/
};
/*
* OCFS1 volume label. Lives at sector 1.
*/
struct ocfs1_vol_label
{
/*00*/ struct ocfs1_disk_lock disk_lock;
/*30*/ __u8 label[OCFS1_MAX_VOL_LABEL_LEN];
/*70*/ __u16 label_len;
/*72*/ __u8 vol_id[OCFS1_MAX_VOL_ID_LENGTH];
/*82*/ __u16 vol_id_len;
/*84*/ __u8 cluster_name[OCFS1_MAX_CLUSTER_NAME_LEN];
/*A4*/ __u16 cluster_name_len;
/*A6*/
};
#endif /* _OCFS1_FS_COMPAT_H */

464
fs/ocfs2/ocfs2.h Normal file
View File

@ -0,0 +1,464 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* ocfs2.h
*
* Defines macros and structures used in OCFS2
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_H
#define OCFS2_H
#include <linux/spinlock.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/workqueue.h>
#include <linux/kref.h>
#include "cluster/nodemanager.h"
#include "cluster/heartbeat.h"
#include "cluster/tcp.h"
#include "dlm/dlmapi.h"
#include "ocfs2_fs.h"
#include "endian.h"
#include "ocfs2_lockid.h"
struct ocfs2_extent_map {
u32 em_clusters;
struct rb_root em_extents;
};
/* Most user visible OCFS2 inodes will have very few pieces of
* metadata, but larger files (including bitmaps, etc) must be taken
* into account when designing an access scheme. We allow a small
* amount of inlined blocks to be stored on an array and grow the
* structure into a rb tree when necessary. */
#define OCFS2_INODE_MAX_CACHE_ARRAY 2
struct ocfs2_caching_info {
unsigned int ci_num_cached;
union {
sector_t ci_array[OCFS2_INODE_MAX_CACHE_ARRAY];
struct rb_root ci_tree;
} ci_cache;
};
/* this limits us to 256 nodes
* if we need more, we can do a kmalloc for the map */
#define OCFS2_NODE_MAP_MAX_NODES 256
struct ocfs2_node_map {
u16 num_nodes;
unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)];
};
enum ocfs2_ast_action {
OCFS2_AST_INVALID = 0,
OCFS2_AST_ATTACH,
OCFS2_AST_CONVERT,
OCFS2_AST_DOWNCONVERT,
};
/* actions for an unlockast function to take. */
enum ocfs2_unlock_action {
OCFS2_UNLOCK_INVALID = 0,
OCFS2_UNLOCK_CANCEL_CONVERT,
OCFS2_UNLOCK_DROP_LOCK,
};
/* ocfs2_lock_res->l_flags flags. */
#define OCFS2_LOCK_ATTACHED (0x00000001) /* have we initialized
* the lvb */
#define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in
* dlm_lock */
#define OCFS2_LOCK_BLOCKED (0x00000004) /* blocked waiting to
* downconvert*/
#define OCFS2_LOCK_LOCAL (0x00000008) /* newly created inode */
#define OCFS2_LOCK_NEEDS_REFRESH (0x00000010)
#define OCFS2_LOCK_REFRESHING (0x00000020)
#define OCFS2_LOCK_INITIALIZED (0x00000040) /* track initialization
* for shutdown paths */
#define OCFS2_LOCK_FREEING (0x00000080) /* help dlmglue track
* when to skip queueing
* a lock because it's
* about to be
* dropped. */
#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */
struct ocfs2_lock_res_ops;
typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
struct ocfs2_lock_res {
void *l_priv;
struct ocfs2_lock_res_ops *l_ops;
spinlock_t l_lock;
struct list_head l_blocked_list;
struct list_head l_mask_waiters;
enum ocfs2_lock_type l_type;
unsigned long l_flags;
char l_name[OCFS2_LOCK_ID_MAX_LEN];
int l_level;
unsigned int l_ro_holders;
unsigned int l_ex_holders;
struct dlm_lockstatus l_lksb;
/* used from AST/BAST funcs. */
enum ocfs2_ast_action l_action;
enum ocfs2_unlock_action l_unlock_action;
int l_requested;
int l_blocking;
wait_queue_head_t l_event;
struct list_head l_debug_list;
};
struct ocfs2_dlm_debug {
struct kref d_refcnt;
struct dentry *d_locking_state;
struct list_head d_lockres_tracking;
};
enum ocfs2_vol_state
{
VOLUME_INIT = 0,
VOLUME_MOUNTED,
VOLUME_DISMOUNTED,
VOLUME_DISABLED
};
struct ocfs2_alloc_stats
{
atomic_t moves;
atomic_t local_data;
atomic_t bitmap_data;
atomic_t bg_allocs;
atomic_t bg_extends;
};
enum ocfs2_local_alloc_state
{
OCFS2_LA_UNUSED = 0,
OCFS2_LA_ENABLED,
OCFS2_LA_DISABLED
};
enum ocfs2_mount_options
{
OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */
OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */
OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
#ifdef OCFS2_ORACORE_WORKAROUNDS
OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */
#endif
};
#define OCFS2_OSB_SOFT_RO 0x0001
#define OCFS2_OSB_HARD_RO 0x0002
#define OCFS2_OSB_ERROR_FS 0x0004
struct ocfs2_journal;
struct ocfs2_journal_handle;
struct ocfs2_super
{
u32 osb_id; /* id used by the proc interface */
struct task_struct *commit_task;
struct super_block *sb;
struct inode *root_inode;
struct inode *sys_root_inode;
struct inode *system_inodes[NUM_SYSTEM_INODES];
struct ocfs2_slot_info *slot_info;
spinlock_t node_map_lock;
struct ocfs2_node_map mounted_map;
struct ocfs2_node_map recovery_map;
struct ocfs2_node_map umount_map;
u32 num_clusters;
u64 root_blkno;
u64 system_dir_blkno;
u64 bitmap_blkno;
u32 bitmap_cpg;
u8 *uuid;
char *uuid_str;
u8 *vol_label;
u64 first_cluster_group_blkno;
u32 fs_generation;
u32 s_feature_compat;
u32 s_feature_incompat;
u32 s_feature_ro_compat;
/* Protects s_next_generaion, osb_flags. Could protect more on
* osb as it's very short lived. */
spinlock_t osb_lock;
u32 s_next_generation;
unsigned long osb_flags;
unsigned long s_mount_opt;
u16 max_slots;
u16 num_nodes;
s16 node_num;
s16 slot_num;
int s_sectsize_bits;
int s_clustersize;
int s_clustersize_bits;
struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */
atomic_t vol_state;
struct semaphore recovery_lock;
struct task_struct *recovery_thread_task;
int disable_recovery;
wait_queue_head_t checkpoint_event;
atomic_t needs_checkpoint;
struct ocfs2_journal *journal;
enum ocfs2_local_alloc_state local_alloc_state;
struct buffer_head *local_alloc_bh;
/* Next two fields are for local node slot recovery during
* mount. */
int dirty;
struct ocfs2_dinode *local_alloc_copy;
struct ocfs2_alloc_stats alloc_stats;
char dev_str[20]; /* "major,minor" of the device */
struct dlm_ctxt *dlm;
struct ocfs2_lock_res osb_super_lockres;
struct ocfs2_lock_res osb_rename_lockres;
struct dlm_eviction_cb osb_eviction_cb;
struct ocfs2_dlm_debug *osb_dlm_debug;
struct dentry *osb_debug_root;
wait_queue_head_t recovery_event;
spinlock_t vote_task_lock;
struct task_struct *vote_task;
wait_queue_head_t vote_event;
unsigned long vote_wake_sequence;
unsigned long vote_work_sequence;
struct list_head blocked_lock_list;
unsigned long blocked_lock_count;
struct list_head vote_list;
int vote_count;
u32 net_key;
spinlock_t net_response_lock;
unsigned int net_response_ids;
struct list_head net_response_list;
struct o2hb_callback_func osb_hb_up;
struct o2hb_callback_func osb_hb_down;
struct list_head osb_net_handlers;
wait_queue_head_t osb_mount_event;
/* Truncate log info */
struct inode *osb_tl_inode;
struct buffer_head *osb_tl_bh;
struct work_struct osb_truncate_log_wq;
};
#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
#define OCFS2_MAX_OSB_ID 65536
static inline int ocfs2_should_order_data(struct inode *inode)
{
if (!S_ISREG(inode->i_mode))
return 0;
if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)
return 0;
return 1;
}
/* set / clear functions because cluster events can make these happen
* in parallel so we want the transitions to be atomic. this also
* means that any future flags osb_flags must be protected by spinlock
* too! */
static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
unsigned long flag)
{
spin_lock(&osb->osb_lock);
osb->osb_flags |= flag;
spin_unlock(&osb->osb_lock);
}
static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
int hard)
{
spin_lock(&osb->osb_lock);
osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO);
if (hard)
osb->osb_flags |= OCFS2_OSB_HARD_RO;
else
osb->osb_flags |= OCFS2_OSB_SOFT_RO;
spin_unlock(&osb->osb_lock);
}
static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb)
{
int ret;
spin_lock(&osb->osb_lock);
ret = osb->osb_flags & OCFS2_OSB_HARD_RO;
spin_unlock(&osb->osb_lock);
return ret;
}
static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
{
int ret;
spin_lock(&osb->osb_lock);
ret = osb->osb_flags & OCFS2_OSB_SOFT_RO;
spin_unlock(&osb->osb_lock);
return ret;
}
#define OCFS2_IS_VALID_DINODE(ptr) \
(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di) do { \
typeof(__di) ____di = (__di); \
ocfs2_error((__sb), \
"Dinode # %"MLFu64" has bad signature %.*s", \
(____di)->i_blkno, 7, \
(____di)->i_signature); \
} while (0);
#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \
(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb) do { \
typeof(__eb) ____eb = (__eb); \
ocfs2_error((__sb), \
"Extent Block # %"MLFu64" has bad signature %.*s", \
(____eb)->h_blkno, 7, \
(____eb)->h_signature); \
} while (0);
#define OCFS2_IS_VALID_GROUP_DESC(ptr) \
(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd) do { \
typeof(__gd) ____gd = (__gd); \
ocfs2_error((__sb), \
"Group Descriptor # %"MLFu64" has bad signature %.*s", \
(____gd)->bg_blkno, 7, \
(____gd)->bg_signature); \
} while (0);
static inline unsigned long ino_from_blkno(struct super_block *sb,
u64 blkno)
{
return (unsigned long)(blkno & (u64)ULONG_MAX);
}
static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
u32 clusters)
{
int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits -
sb->s_blocksize_bits;
return (u64)clusters << c_to_b_bits;
}
static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb,
u64 blocks)
{
int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits -
sb->s_blocksize_bits;
return (u32)(blocks >> b_to_c_bits);
}
static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
u64 bytes)
{
int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
unsigned int clusters;
bytes += OCFS2_SB(sb)->s_clustersize - 1;
/* OCFS2 just cannot have enough clusters to overflow this */
clusters = (unsigned int)(bytes >> cl_bits);
return clusters;
}
static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
u64 bytes)
{
bytes += sb->s_blocksize - 1;
return bytes >> sb->s_blocksize_bits;
}
static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
u32 clusters)
{
return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
}
static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
u64 bytes)
{
int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
unsigned int clusters;
clusters = ocfs2_clusters_for_bytes(sb, bytes);
return (u64)clusters << cl_bits;
}
static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb,
u64 bytes)
{
u64 blocks;
blocks = ocfs2_blocks_for_bytes(sb, bytes);
return blocks << sb->s_blocksize_bits;
}
static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
{
return (unsigned long)((bytes + 511) >> 9);
}
#define ocfs2_set_bit ext2_set_bit
#define ocfs2_clear_bit ext2_clear_bit
#define ocfs2_test_bit ext2_test_bit
#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
#endif /* OCFS2_H */

638
fs/ocfs2/ocfs2_fs.h Normal file
View File

@ -0,0 +1,638 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* ocfs2_fs.h
*
* On-disk structures for OCFS2.
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License, version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef _OCFS2_FS_H
#define _OCFS2_FS_H
/* Version */
#define OCFS2_MAJOR_REV_LEVEL 0
#define OCFS2_MINOR_REV_LEVEL 90
/*
* An OCFS2 volume starts this way:
* Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS.
* Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS.
* Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock.
*
* All other structures are found from the superblock information.
*
* OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors. eg, for a
* blocksize of 2K, it is 4096 bytes into disk.
*/
#define OCFS2_SUPER_BLOCK_BLKNO 2
/*
* Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could
* grow if needed.
*/
#define OCFS2_MIN_CLUSTERSIZE 4096
#define OCFS2_MAX_CLUSTERSIZE 1048576
/*
* Blocks cannot be bigger than clusters, so the maximum blocksize is the
* minimum cluster size.
*/
#define OCFS2_MIN_BLOCKSIZE 512
#define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE
/* Filesystem magic number */
#define OCFS2_SUPER_MAGIC 0x7461636f
/* Object signatures */
#define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2"
#define OCFS2_INODE_SIGNATURE "INODE01"
#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
/* Compatibility flags */
#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
( OCFS2_SB(sb)->s_feature_compat & (mask) )
#define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask) \
( OCFS2_SB(sb)->s_feature_ro_compat & (mask) )
#define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask) \
( OCFS2_SB(sb)->s_feature_incompat & (mask) )
#define OCFS2_SET_COMPAT_FEATURE(sb,mask) \
OCFS2_SB(sb)->s_feature_compat |= (mask)
#define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask) \
OCFS2_SB(sb)->s_feature_ro_compat |= (mask)
#define OCFS2_SET_INCOMPAT_FEATURE(sb,mask) \
OCFS2_SB(sb)->s_feature_incompat |= (mask)
#define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask) \
OCFS2_SB(sb)->s_feature_compat &= ~(mask)
#define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \
OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask)
#define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \
OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
#define OCFS2_FEATURE_COMPAT_SUPP 0
#define OCFS2_FEATURE_INCOMPAT_SUPP 0
#define OCFS2_FEATURE_RO_COMPAT_SUPP 0
/*
* Heartbeat-only devices are missing journals and other files. The
* filesystem driver can't load them, but the library can. Never put
* this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*.
*/
#define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV 0x0002
/*
* Flags on ocfs2_dinode.i_flags
*/
#define OCFS2_VALID_FL (0x00000001) /* Inode is valid */
#define OCFS2_UNUSED2_FL (0x00000002)
#define OCFS2_ORPHANED_FL (0x00000004) /* On the orphan list */
#define OCFS2_UNUSED3_FL (0x00000008)
/* System inode flags */
#define OCFS2_SYSTEM_FL (0x00000010) /* System inode */
#define OCFS2_SUPER_BLOCK_FL (0x00000020) /* Super block */
#define OCFS2_LOCAL_ALLOC_FL (0x00000040) /* Slot local alloc bitmap */
#define OCFS2_BITMAP_FL (0x00000080) /* Allocation bitmap */
#define OCFS2_JOURNAL_FL (0x00000100) /* Slot local journal */
#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */
#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
/*
* Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
*/
#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
/*
* superblock s_state flags
*/
#define OCFS2_ERROR_FS (0x00000001) /* FS saw errors */
/* Limit of space in ocfs2_dir_entry */
#define OCFS2_MAX_FILENAME_LEN 255
/* Maximum slots on an ocfs2 file system */
#define OCFS2_MAX_SLOTS 255
/* Slot map indicator for an empty slot */
#define OCFS2_INVALID_SLOT -1
#define OCFS2_VOL_UUID_LEN 16
#define OCFS2_MAX_VOL_LABEL_LEN 64
/* Journal limits (in bytes) */
#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
#define OCFS2_MAX_JOURNAL_SIZE (500 * 1024 * 1024)
struct ocfs2_system_inode_info {
char *si_name;
int si_iflags;
int si_mode;
};
/* System file index */
enum {
BAD_BLOCK_SYSTEM_INODE = 0,
GLOBAL_INODE_ALLOC_SYSTEM_INODE,
SLOT_MAP_SYSTEM_INODE,
#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
HEARTBEAT_SYSTEM_INODE,
GLOBAL_BITMAP_SYSTEM_INODE,
#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE
ORPHAN_DIR_SYSTEM_INODE,
EXTENT_ALLOC_SYSTEM_INODE,
INODE_ALLOC_SYSTEM_INODE,
JOURNAL_SYSTEM_INODE,
LOCAL_ALLOC_SYSTEM_INODE,
TRUNCATE_LOG_SYSTEM_INODE,
NUM_SYSTEM_INODES
};
static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
/* Global system inodes (single copy) */
/* The first two are only used from userspace mfks/tunefs */
[BAD_BLOCK_SYSTEM_INODE] = { "bad_blocks", 0, S_IFREG | 0644 },
[GLOBAL_INODE_ALLOC_SYSTEM_INODE] = { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
/* These are used by the running filesystem */
[SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 },
[HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
[GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 },
/* Slot-specific system inodes (one copy per slot) */
[ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
[EXTENT_ALLOC_SYSTEM_INODE] = { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
[INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
[JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
[LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
[TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }
};
/* Parameter passed from mount.ocfs2 to module */
#define OCFS2_HB_NONE "heartbeat=none"
#define OCFS2_HB_LOCAL "heartbeat=local"
/*
* OCFS2 directory file types. Only the low 3 bits are used. The
* other bits are reserved for now.
*/
#define OCFS2_FT_UNKNOWN 0
#define OCFS2_FT_REG_FILE 1
#define OCFS2_FT_DIR 2
#define OCFS2_FT_CHRDEV 3
#define OCFS2_FT_BLKDEV 4
#define OCFS2_FT_FIFO 5
#define OCFS2_FT_SOCK 6
#define OCFS2_FT_SYMLINK 7
#define OCFS2_FT_MAX 8
/*
* OCFS2_DIR_PAD defines the directory entries boundaries
*
* NOTE: It must be a multiple of 4
*/
#define OCFS2_DIR_PAD 4
#define OCFS2_DIR_ROUND (OCFS2_DIR_PAD - 1)
#define OCFS2_DIR_MEMBER_LEN offsetof(struct ocfs2_dir_entry, name)
#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \
OCFS2_DIR_ROUND) & \
~OCFS2_DIR_ROUND)
#define OCFS2_LINK_MAX 32000
#define S_SHIFT 12
static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE,
[S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR,
[S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV,
[S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV,
[S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO,
[S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK,
[S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK,
};
/*
* Convenience casts
*/
#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super))
/*
* On disk extent record for OCFS2
* It describes a range of clusters on disk.
*/
struct ocfs2_extent_rec {
/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */
__le32 e_clusters; /* Clusters covered by this extent */
__le64 e_blkno; /* Physical disk offset, in blocks */
/*10*/
};
struct ocfs2_chain_rec {
__le32 c_free; /* Number of free bits in this chain. */
__le32 c_total; /* Number of total bits in this chain */
__le64 c_blkno; /* Physical disk offset (blocks) of 1st group */
};
struct ocfs2_truncate_rec {
__le32 t_start; /* 1st cluster in this log */
__le32 t_clusters; /* Number of total clusters covered */
};
/*
* On disk extent list for OCFS2 (node in the tree). Note that this
* is contained inside ocfs2_dinode or ocfs2_extent_block, so the
* offsets are relative to ocfs2_dinode.id2.i_list or
* ocfs2_extent_block.h_list, respectively.
*/
struct ocfs2_extent_list {
/*00*/ __le16 l_tree_depth; /* Extent tree depth from this
point. 0 means data extents
hang directly off this
header (a leaf) */
__le16 l_count; /* Number of extent records */
__le16 l_next_free_rec; /* Next unused extent slot */
__le16 l_reserved1;
__le64 l_reserved2; /* Pad to
sizeof(ocfs2_extent_rec) */
/*10*/ struct ocfs2_extent_rec l_recs[0]; /* Extent records */
};
/*
* On disk allocation chain list for OCFS2. Note that this is
* contained inside ocfs2_dinode, so the offsets are relative to
* ocfs2_dinode.id2.i_chain.
*/
struct ocfs2_chain_list {
/*00*/ __le16 cl_cpg; /* Clusters per Block Group */
__le16 cl_bpc; /* Bits per cluster */
__le16 cl_count; /* Total chains in this list */
__le16 cl_next_free_rec; /* Next unused chain slot */
__le64 cl_reserved1;
/*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */
};
/*
* On disk deallocation log for OCFS2. Note that this is
* contained inside ocfs2_dinode, so the offsets are relative to
* ocfs2_dinode.id2.i_dealloc.
*/
struct ocfs2_truncate_log {
/*00*/ __le16 tl_count; /* Total records in this log */
__le16 tl_used; /* Number of records in use */
__le32 tl_reserved1;
/*08*/ struct ocfs2_truncate_rec tl_recs[0]; /* Truncate records */
};
/*
* On disk extent block (indirect block) for OCFS2
*/
struct ocfs2_extent_block
{
/*00*/ __u8 h_signature[8]; /* Signature for verification */
__le64 h_reserved1;
/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this
extent_header belongs to */
__le16 h_suballoc_bit; /* Bit offset in suballocator
block group */
__le32 h_fs_generation; /* Must match super block */
__le64 h_blkno; /* Offset on disk, in blocks */
/*20*/ __le64 h_reserved3;
__le64 h_next_leaf_blk; /* Offset on disk, in blocks,
of next leaf header pointing
to data */
/*30*/ struct ocfs2_extent_list h_list; /* Extent record list */
/* Actual on-disk size is one block */
};
/*
* On disk superblock for OCFS2
* Note that it is contained inside an ocfs2_dinode, so all offsets
* are relative to the start of ocfs2_dinode.id2.
*/
struct ocfs2_super_block {
/*00*/ __le16 s_major_rev_level;
__le16 s_minor_rev_level;
__le16 s_mnt_count;
__le16 s_max_mnt_count;
__le16 s_state; /* File system state */
__le16 s_errors; /* Behaviour when detecting errors */
__le32 s_checkinterval; /* Max time between checks */
/*10*/ __le64 s_lastcheck; /* Time of last check */
__le32 s_creator_os; /* OS */
__le32 s_feature_compat; /* Compatible feature set */
/*20*/ __le32 s_feature_incompat; /* Incompatible feature set */
__le32 s_feature_ro_compat; /* Readonly-compatible feature set */
__le64 s_root_blkno; /* Offset, in blocks, of root directory
dinode */
/*30*/ __le64 s_system_dir_blkno; /* Offset, in blocks, of system
directory dinode */
__le32 s_blocksize_bits; /* Blocksize for this fs */
__le32 s_clustersize_bits; /* Clustersize for this fs */
/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts
before tunefs required */
__le16 s_reserved1;
__le32 s_reserved2;
__le64 s_first_cluster_group; /* Block offset of 1st cluster
* group header */
/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
/*A0*/
};
/*
* Local allocation bitmap for OCFS2 slots
* Note that it exists inside an ocfs2_dinode, so all offsets are
* relative to the start of ocfs2_dinode.id2.
*/
struct ocfs2_local_alloc
{
/*00*/ __le32 la_bm_off; /* Starting bit offset in main bitmap */
__le16 la_size; /* Size of included bitmap, in bytes */
__le16 la_reserved1;
__le64 la_reserved2;
/*10*/ __u8 la_bitmap[0];
};
/*
* On disk inode for OCFS2
*/
struct ocfs2_dinode {
/*00*/ __u8 i_signature[8]; /* Signature for validation */
__le32 i_generation; /* Generation number */
__le16 i_suballoc_slot; /* Slot suballocator this inode
belongs to */
__le16 i_suballoc_bit; /* Bit offset in suballocator
block group */
/*10*/ __le32 i_reserved0;
__le32 i_clusters; /* Cluster count */
__le32 i_uid; /* Owner UID */
__le32 i_gid; /* Owning GID */
/*20*/ __le64 i_size; /* Size in bytes */
__le16 i_mode; /* File mode */
__le16 i_links_count; /* Links count */
__le32 i_flags; /* File flags */
/*30*/ __le64 i_atime; /* Access time */
__le64 i_ctime; /* Creation time */
/*40*/ __le64 i_mtime; /* Modification time */
__le64 i_dtime; /* Deletion time */
/*50*/ __le64 i_blkno; /* Offset on disk, in blocks */
__le64 i_last_eb_blk; /* Pointer to last extent
block */
/*60*/ __le32 i_fs_generation; /* Generation per fs-instance */
__le32 i_atime_nsec;
__le32 i_ctime_nsec;
__le32 i_mtime_nsec;
/*70*/ __le64 i_reserved1[9];
/*B8*/ union {
__le64 i_pad1; /* Generic way to refer to this
64bit union */
struct {
__le64 i_rdev; /* Device number */
} dev1;
struct { /* Info for bitmap system
inodes */
__le32 i_used; /* Bits (ie, clusters) used */
__le32 i_total; /* Total bits (clusters)
available */
} bitmap1;
struct { /* Info for journal system
inodes */
__le32 ij_flags; /* Mounted, version, etc. */
__le32 ij_pad;
} journal1;
} id1; /* Inode type dependant 1 */
/*C0*/ union {
struct ocfs2_super_block i_super;
struct ocfs2_local_alloc i_lab;
struct ocfs2_chain_list i_chain;
struct ocfs2_extent_list i_list;
struct ocfs2_truncate_log i_dealloc;
__u8 i_symlink[0];
} id2;
/* Actual on-disk size is one block */
};
/*
* On-disk directory entry structure for OCFS2
*
* Packed as this structure could be accessed unaligned on 64-bit platforms
*/
struct ocfs2_dir_entry {
/*00*/ __le64 inode; /* Inode number */
__le16 rec_len; /* Directory entry length */
__u8 name_len; /* Name length */
__u8 file_type;
/*0C*/ char name[OCFS2_MAX_FILENAME_LEN]; /* File name */
/* Actual on-disk length specified by rec_len */
} __attribute__ ((packed));
/*
* On disk allocator group structure for OCFS2
*/
struct ocfs2_group_desc
{
/*00*/ __u8 bg_signature[8]; /* Signature for validation */
__le16 bg_size; /* Size of included bitmap in
bytes. */
__le16 bg_bits; /* Bits represented by this
group. */
__le16 bg_free_bits_count; /* Free bits count */
__le16 bg_chain; /* What chain I am in. */
/*10*/ __le32 bg_generation;
__le32 bg_reserved1;
__le64 bg_next_group; /* Next group in my list, in
blocks */
/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in
blocks */
__le64 bg_blkno; /* Offset on disk, in blocks */
/*30*/ __le64 bg_reserved2[2];
/*40*/ __u8 bg_bitmap[0];
};
#ifdef __KERNEL__
static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
{
return sb->s_blocksize -
offsetof(struct ocfs2_dinode, id2.i_symlink);
}
static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
{
int size;
size = sb->s_blocksize -
offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
return size / sizeof(struct ocfs2_extent_rec);
}
static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
{
int size;
size = sb->s_blocksize -
offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
return size / sizeof(struct ocfs2_chain_rec);
}
static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
{
int size;
size = sb->s_blocksize -
offsetof(struct ocfs2_extent_block, h_list.l_recs);
return size / sizeof(struct ocfs2_extent_rec);
}
static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
{
u16 size;
size = sb->s_blocksize -
offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
return size;
}
static inline int ocfs2_group_bitmap_size(struct super_block *sb)
{
int size;
size = sb->s_blocksize -
offsetof(struct ocfs2_group_desc, bg_bitmap);
return size;
}
static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb)
{
int size;
size = sb->s_blocksize -
offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
return size / sizeof(struct ocfs2_truncate_rec);
}
#else
static inline int ocfs2_fast_symlink_chars(int blocksize)
{
return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
}
static inline int ocfs2_extent_recs_per_inode(int blocksize)
{
int size;
size = blocksize -
offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
return size / sizeof(struct ocfs2_extent_rec);
}
static inline int ocfs2_chain_recs_per_inode(int blocksize)
{
int size;
size = blocksize -
offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
return size / sizeof(struct ocfs2_chain_rec);
}
static inline int ocfs2_extent_recs_per_eb(int blocksize)
{
int size;
size = blocksize -
offsetof(struct ocfs2_extent_block, h_list.l_recs);
return size / sizeof(struct ocfs2_extent_rec);
}
static inline int ocfs2_local_alloc_size(int blocksize)
{
int size;
size = blocksize -
offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
return size;
}
static inline int ocfs2_group_bitmap_size(int blocksize)
{
int size;
size = blocksize -
offsetof(struct ocfs2_group_desc, bg_bitmap);
return size;
}
static inline int ocfs2_truncate_recs_per_inode(int blocksize)
{
int size;
size = blocksize -
offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
return size / sizeof(struct ocfs2_truncate_rec);
}
#endif /* __KERNEL__ */
static inline int ocfs2_system_inode_is_global(int type)
{
return ((type >= 0) &&
(type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE));
}
static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
int type, int slot)
{
int chars;
/*
* Global system inodes can only have one copy. Everything
* after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode
* list has a copy per slot.
*/
if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)
chars = snprintf(buf, len,
ocfs2_system_inodes[type].si_name);
else
chars = snprintf(buf, len,
ocfs2_system_inodes[type].si_name,
slot);
return chars;
}
static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
umode_t mode)
{
de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
}
#endif /* _OCFS2_FS_H */

73
fs/ocfs2/ocfs2_lockid.h Normal file
View File

@ -0,0 +1,73 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* ocfs2_lockid.h
*
* Defines OCFS2 lockid bits.
*
* Copyright (C) 2002, 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_LOCKID_H
#define OCFS2_LOCKID_H
/* lock ids are made up in the following manner:
* name[0] --> type
* name[1-6] --> 6 pad characters, reserved for now
* name[7-22] --> block number, expressed in hex as 16 chars
* name[23-30] --> i_generation, expressed in hex 8 chars
* name[31] --> '\0' */
#define OCFS2_LOCK_ID_MAX_LEN 32
#define OCFS2_LOCK_ID_PAD "000000"
enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_META = 0,
OCFS2_LOCK_TYPE_DATA,
OCFS2_LOCK_TYPE_SUPER,
OCFS2_LOCK_TYPE_RENAME,
OCFS2_LOCK_TYPE_RW,
OCFS2_NUM_LOCK_TYPES
};
static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
{
char c;
switch (type) {
case OCFS2_LOCK_TYPE_META:
c = 'M';
break;
case OCFS2_LOCK_TYPE_DATA:
c = 'D';
break;
case OCFS2_LOCK_TYPE_SUPER:
c = 'S';
break;
case OCFS2_LOCK_TYPE_RENAME:
c = 'R';
break;
case OCFS2_LOCK_TYPE_RW:
c = 'W';
break;
default:
c = '\0';
}
return c;
}
#endif /* OCFS2_LOCKID_H */

303
fs/ocfs2/slot_map.c Normal file
View File

@ -0,0 +1,303 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* slot_map.c
*
*
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#define MLOG_MASK_PREFIX ML_SUPER
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "heartbeat.h"
#include "inode.h"
#include "slot_map.h"
#include "super.h"
#include "sysfile.h"
#include "buffer_head_io.h"
static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
s16 global);
static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
s16 slot_num,
s16 node_num);
/* Use the slot information we've collected to create a map of mounted
* nodes. Should be holding an EX on super block. assumes slot info is
* up to date. Note that we call this *after* we find a slot, so our
* own node should be set in the map too... */
void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
{
int i;
struct ocfs2_slot_info *si = osb->slot_info;
spin_lock(&si->si_lock);
for (i = 0; i < si->si_size; i++)
if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
ocfs2_node_map_set_bit(osb, &osb->mounted_map,
si->si_global_node_nums[i]);
spin_unlock(&si->si_lock);
}
/* post the slot information on disk into our slot_info struct. */
void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
{
int i;
__le16 *disk_info;
/* we don't read the slot block here as ocfs2_super_lock
* should've made sure we have the most recent copy. */
spin_lock(&si->si_lock);
disk_info = (__le16 *) si->si_bh->b_data;
for (i = 0; i < si->si_size; i++)
si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
spin_unlock(&si->si_lock);
}
/* post the our slot info stuff into it's destination bh and write it
* out. */
int ocfs2_update_disk_slots(struct ocfs2_super *osb,
struct ocfs2_slot_info *si)
{
int status, i;
__le16 *disk_info = (__le16 *) si->si_bh->b_data;
spin_lock(&si->si_lock);
for (i = 0; i < si->si_size; i++)
disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
spin_unlock(&si->si_lock);
status = ocfs2_write_block(osb, si->si_bh, si->si_inode);
if (status < 0)
mlog_errno(status);
return status;
}
/* try to find global node in the slot info. Returns
* OCFS2_INVALID_SLOT if nothing is found. */
static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
s16 global)
{
int i;
s16 ret = OCFS2_INVALID_SLOT;
for(i = 0; i < si->si_num_slots; i++) {
if (global == si->si_global_node_nums[i]) {
ret = (s16) i;
break;
}
}
return ret;
}
static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si)
{
int i;
s16 ret = OCFS2_INVALID_SLOT;
for(i = 0; i < si->si_num_slots; i++) {
if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
ret = (s16) i;
break;
}
}
return ret;
}
s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
s16 global)
{
s16 ret;
spin_lock(&si->si_lock);
ret = __ocfs2_node_num_to_slot(si, global);
spin_unlock(&si->si_lock);
return ret;
}
static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
s16 slot_num,
s16 node_num)
{
BUG_ON(slot_num == OCFS2_INVALID_SLOT);
BUG_ON(slot_num >= si->si_num_slots);
BUG_ON((node_num != O2NM_INVALID_NODE_NUM) &&
(node_num >= O2NM_MAX_NODES));
si->si_global_node_nums[slot_num] = node_num;
}
void ocfs2_clear_slot(struct ocfs2_slot_info *si,
s16 slot_num)
{
spin_lock(&si->si_lock);
__ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT);
spin_unlock(&si->si_lock);
}
int ocfs2_init_slot_info(struct ocfs2_super *osb)
{
int status, i;
u64 blkno;
struct inode *inode = NULL;
struct buffer_head *bh = NULL;
struct ocfs2_slot_info *si;
si = kcalloc(1, sizeof(struct ocfs2_slot_info), GFP_KERNEL);
if (!si) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
spin_lock_init(&si->si_lock);
si->si_num_slots = osb->max_slots;
si->si_size = OCFS2_MAX_SLOTS;
for(i = 0; i < si->si_num_slots; i++)
si->si_global_node_nums[i] = OCFS2_INVALID_SLOT;
inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
if (!inode) {
status = -EINVAL;
mlog_errno(status);
goto bail;
}
status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
}
status = ocfs2_read_block(osb, blkno, &bh, 0, inode);
if (status < 0) {
mlog_errno(status);
goto bail;
}
si->si_inode = inode;
si->si_bh = bh;
osb->slot_info = si;
bail:
if (status < 0 && si)
ocfs2_free_slot_info(si);
return status;
}
void ocfs2_free_slot_info(struct ocfs2_slot_info *si)
{
if (si->si_inode)
iput(si->si_inode);
if (si->si_bh)
brelse(si->si_bh);
kfree(si);
}
int ocfs2_find_slot(struct ocfs2_super *osb)
{
int status;
s16 slot;
struct ocfs2_slot_info *si;
mlog_entry_void();
si = osb->slot_info;
ocfs2_update_slot_info(si);
spin_lock(&si->si_lock);
/* search for ourselves first and take the slot if it already
* exists. Perhaps we need to mark this in a variable for our
* own journal recovery? Possibly not, though we certainly
* need to warn to the user */
slot = __ocfs2_node_num_to_slot(si, osb->node_num);
if (slot == OCFS2_INVALID_SLOT) {
/* if no slot yet, then just take 1st available
* one. */
slot = __ocfs2_find_empty_slot(si);
if (slot == OCFS2_INVALID_SLOT) {
spin_unlock(&si->si_lock);
mlog(ML_ERROR, "no free slots available!\n");
status = -EINVAL;
goto bail;
}
} else
mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
slot);
__ocfs2_fill_slot(si, slot, osb->node_num);
osb->slot_num = slot;
spin_unlock(&si->si_lock);
mlog(ML_NOTICE, "taking node slot %d\n", osb->slot_num);
status = ocfs2_update_disk_slots(osb, si);
if (status < 0)
mlog_errno(status);
bail:
mlog_exit(status);
return status;
}
void ocfs2_put_slot(struct ocfs2_super *osb)
{
int status;
struct ocfs2_slot_info *si = osb->slot_info;
if (!si)
return;
ocfs2_update_slot_info(si);
spin_lock(&si->si_lock);
__ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT);
osb->slot_num = OCFS2_INVALID_SLOT;
spin_unlock(&si->si_lock);
status = ocfs2_update_disk_slots(osb, si);
if (status < 0) {
mlog_errno(status);
goto bail;
}
bail:
osb->slot_info = NULL;
ocfs2_free_slot_info(si);
}

66
fs/ocfs2/slot_map.h Normal file
View File

@ -0,0 +1,66 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* slotmap.h
*
* description here
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef SLOTMAP_H
#define SLOTMAP_H
struct ocfs2_slot_info {
spinlock_t si_lock;
struct inode *si_inode;
struct buffer_head *si_bh;
unsigned int si_num_slots;
unsigned int si_size;
s16 si_global_node_nums[OCFS2_MAX_SLOTS];
};
int ocfs2_init_slot_info(struct ocfs2_super *osb);
void ocfs2_free_slot_info(struct ocfs2_slot_info *si);
int ocfs2_find_slot(struct ocfs2_super *osb);
void ocfs2_put_slot(struct ocfs2_super *osb);
void ocfs2_update_slot_info(struct ocfs2_slot_info *si);
int ocfs2_update_disk_slots(struct ocfs2_super *osb,
struct ocfs2_slot_info *si);
s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
s16 global);
void ocfs2_clear_slot(struct ocfs2_slot_info *si,
s16 slot_num);
void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
int slot_num)
{
BUG_ON(slot_num == OCFS2_INVALID_SLOT);
assert_spin_locked(&si->si_lock);
return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT;
}
#endif

1651
fs/ocfs2/suballoc.c Normal file

File diff suppressed because it is too large Load Diff

132
fs/ocfs2/suballoc.h Normal file
View File

@ -0,0 +1,132 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* suballoc.h
*
* Defines sub allocator api
*
* Copyright (C) 2003, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef _CHAINALLOC_H_
#define _CHAINALLOC_H_
typedef int (group_search_t)(struct inode *,
struct buffer_head *,
u32,
u32,
u16 *,
u16 *);
struct ocfs2_alloc_context {
struct inode *ac_inode; /* which bitmap are we allocating from? */
struct buffer_head *ac_bh; /* file entry bh */
u32 ac_bits_wanted;
u32 ac_bits_given;
#define OCFS2_AC_USE_LOCAL 1
#define OCFS2_AC_USE_MAIN 2
#define OCFS2_AC_USE_INODE 3
#define OCFS2_AC_USE_META 4
u32 ac_which;
struct ocfs2_journal_handle *ac_handle;
/* these are used by the chain search */
u16 ac_chain;
int ac_allow_chain_relink;
group_search_t *ac_group_search;
};
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
{
return ac->ac_bits_wanted - ac->ac_bits_given;
}
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
struct ocfs2_journal_handle *handle,
struct ocfs2_dinode *fe,
struct ocfs2_alloc_context **ac);
int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
struct ocfs2_journal_handle *handle,
struct ocfs2_alloc_context **ac);
int ocfs2_reserve_clusters(struct ocfs2_super *osb,
struct ocfs2_journal_handle *handle,
u32 bits_wanted,
struct ocfs2_alloc_context **ac);
int ocfs2_claim_metadata(struct ocfs2_super *osb,
struct ocfs2_journal_handle *handle,
struct ocfs2_alloc_context *ac,
u32 bits_wanted,
u16 *suballoc_bit_start,
u32 *num_bits,
u64 *blkno_start);
int ocfs2_claim_new_inode(struct ocfs2_super *osb,
struct ocfs2_journal_handle *handle,
struct ocfs2_alloc_context *ac,
u16 *suballoc_bit,
u64 *fe_blkno);
int ocfs2_claim_clusters(struct ocfs2_super *osb,
struct ocfs2_journal_handle *handle,
struct ocfs2_alloc_context *ac,
u32 min_clusters,
u32 *cluster_start,
u32 *num_clusters);
int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
struct inode *inode_alloc_inode,
struct buffer_head *inode_alloc_bh,
struct ocfs2_dinode *di);
int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
struct inode *eb_alloc_inode,
struct buffer_head *eb_alloc_bh,
struct ocfs2_extent_block *eb);
int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
struct inode *bitmap_inode,
struct buffer_head *bitmap_bh,
u64 start_blk,
unsigned int num_clusters);
static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb,
u64 bg_blkno)
{
/* This should work for all block group descriptors as only
* the 1st group descriptor of the cluster bitmap is
* different. */
if (bg_blkno == osb->first_cluster_group_blkno)
return 0;
/* the rest of the block groups are located at the beginning
* of their 1st cluster, so a direct translation just
* works. */
return ocfs2_blocks_to_clusters(osb->sb, bg_blkno);
}
static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
return osb->bitmap_blkno == OCFS2_I(inode)->ip_blkno;
}
/* This is for local alloc ONLY. Others should use the task-specific
* apis above. */
int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
struct ocfs2_alloc_context *ac);
#endif /* _CHAINALLOC_H_ */

1733
fs/ocfs2/super.c Normal file

File diff suppressed because it is too large Load Diff

44
fs/ocfs2/super.h Normal file
View File

@ -0,0 +1,44 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* super.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_SUPER_H
#define OCFS2_SUPER_H
extern struct workqueue_struct *ocfs2_wq;
int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
int node_num);
void __ocfs2_error(struct super_block *sb,
const char *function,
const char *fmt, ...);
#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
void __ocfs2_abort(struct super_block *sb,
const char *function,
const char *fmt, ...);
#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
#endif /* OCFS2_SUPER_H */

180
fs/ocfs2/symlink.c Normal file
View File

@ -0,0 +1,180 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* linux/cluster/ssi/cfs/symlink.c
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE
* or NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Questions/Comments/Bugfixes to ssic-linux-devel@lists.sourceforge.net
*
* Copyright (C) 1992 Rick Sladkey
*
* Optimization changes Copyright (C) 1994 Florian La Roche
*
* Jun 7 1999, cache symlink lookups in the page cache. -DaveM
*
* Portions Copyright (C) 2001 Compaq Computer Corporation
*
* ocfs2 symlink handling code.
*
* Copyright (C) 2004, 2005 Oracle.
*
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/utsname.h>
#define MLOG_MASK_PREFIX ML_NAMEI
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "file.h"
#include "inode.h"
#include "journal.h"
#include "symlink.h"
#include "buffer_head_io.h"
static char *ocfs2_page_getlink(struct dentry * dentry,
struct page **ppage);
static char *ocfs2_fast_symlink_getlink(struct inode *inode,
struct buffer_head **bh);
/* get the link contents into pagecache */
static char *ocfs2_page_getlink(struct dentry * dentry,
struct page **ppage)
{
struct page * page;
struct address_space *mapping = dentry->d_inode->i_mapping;
page = read_cache_page(mapping, 0,
(filler_t *)mapping->a_ops->readpage, NULL);
if (IS_ERR(page))
goto sync_fail;
wait_on_page_locked(page);
if (!PageUptodate(page))
goto async_fail;
*ppage = page;
return kmap(page);
async_fail:
page_cache_release(page);
return ERR_PTR(-EIO);
sync_fail:
return (char*)page;
}
static char *ocfs2_fast_symlink_getlink(struct inode *inode,
struct buffer_head **bh)
{
int status;
char *link = NULL;
struct ocfs2_dinode *fe;
mlog_entry_void();
status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
OCFS2_I(inode)->ip_blkno,
bh,
OCFS2_BH_CACHED,
inode);
if (status < 0) {
mlog_errno(status);
link = ERR_PTR(status);
goto bail;
}
fe = (struct ocfs2_dinode *) (*bh)->b_data;
link = (char *) fe->id2.i_symlink;
bail:
mlog_exit(status);
return link;
}
static int ocfs2_readlink(struct dentry *dentry,
char __user *buffer,
int buflen)
{
int ret;
char *link;
struct buffer_head *bh = NULL;
struct inode *inode = dentry->d_inode;
mlog_entry_void();
link = ocfs2_fast_symlink_getlink(inode, &bh);
if (IS_ERR(link)) {
ret = PTR_ERR(link);
goto out;
}
ret = vfs_readlink(dentry, buffer, buflen, link);
brelse(bh);
out:
mlog_exit(ret);
return ret;
}
static void *ocfs2_follow_link(struct dentry *dentry,
struct nameidata *nd)
{
int status;
char *link;
struct inode *inode = dentry->d_inode;
struct page *page = NULL;
struct buffer_head *bh = NULL;
if (ocfs2_inode_is_fast_symlink(inode))
link = ocfs2_fast_symlink_getlink(inode, &bh);
else
link = ocfs2_page_getlink(dentry, &page);
if (IS_ERR(link)) {
status = PTR_ERR(link);
mlog_errno(status);
goto bail;
}
status = vfs_follow_link(nd, link);
if (status)
mlog_errno(status);
bail:
if (page) {
kunmap(page);
page_cache_release(page);
}
if (bh)
brelse(bh);
return ERR_PTR(status);
}
struct inode_operations ocfs2_symlink_inode_operations = {
.readlink = page_readlink,
.follow_link = ocfs2_follow_link,
.getattr = ocfs2_getattr,
};
struct inode_operations ocfs2_fast_symlink_inode_operations = {
.readlink = ocfs2_readlink,
.follow_link = ocfs2_follow_link,
.getattr = ocfs2_getattr,
};

42
fs/ocfs2/symlink.h Normal file
View File

@ -0,0 +1,42 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* symlink.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_SYMLINK_H
#define OCFS2_SYMLINK_H
extern struct inode_operations ocfs2_symlink_inode_operations;
extern struct inode_operations ocfs2_fast_symlink_inode_operations;
/*
* Test whether an inode is a fast symlink.
*/
static inline int ocfs2_inode_is_fast_symlink(struct inode *inode)
{
return (S_ISLNK(inode->i_mode) &&
inode->i_blocks == 0);
}
#endif /* OCFS2_SYMLINK_H */

131
fs/ocfs2/sysfile.c Normal file
View File

@ -0,0 +1,131 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* sysfile.c
*
* Initialize, read, write, etc. system files.
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include "ocfs2.h"
#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
#include "alloc.h"
#include "dir.h"
#include "inode.h"
#include "journal.h"
#include "sysfile.h"
#include "buffer_head_io.h"
static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
int type,
u32 slot);
static inline int is_global_system_inode(int type);
static inline int is_in_system_inode_array(struct ocfs2_super *osb,
int type,
u32 slot);
static inline int is_global_system_inode(int type)
{
return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE &&
type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
}
static inline int is_in_system_inode_array(struct ocfs2_super *osb,
int type,
u32 slot)
{
return slot == osb->slot_num || is_global_system_inode(type);
}
struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
int type,
u32 slot)
{
struct inode *inode = NULL;
struct inode **arr = NULL;
/* avoid the lookup if cached in local system file array */
if (is_in_system_inode_array(osb, type, slot))
arr = &(osb->system_inodes[type]);
if (arr && ((inode = *arr) != NULL)) {
/* get a ref in addition to the array ref */
inode = igrab(inode);
if (!inode)
BUG();
return inode;
}
/* this gets one ref thru iget */
inode = _ocfs2_get_system_file_inode(osb, type, slot);
/* add one more if putting into array for first time */
if (arr && inode) {
*arr = igrab(inode);
if (!*arr)
BUG();
}
return inode;
}
static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
int type,
u32 slot)
{
char namebuf[40];
struct inode *inode = NULL;
u64 blkno;
struct buffer_head *dirent_bh = NULL;
struct ocfs2_dir_entry *de = NULL;
int status = 0;
ocfs2_sprintf_system_inode_name(namebuf,
sizeof(namebuf),
type, slot);
status = ocfs2_find_files_on_disk(namebuf, strlen(namebuf),
&blkno, osb->sys_root_inode,
&dirent_bh, &de);
if (status < 0) {
goto bail;
}
inode = ocfs2_iget(osb, blkno);
if (IS_ERR(inode)) {
mlog_errno(PTR_ERR(inode));
inode = NULL;
goto bail;
}
bail:
if (dirent_bh)
brelse(dirent_bh);
return inode;
}

33
fs/ocfs2/sysfile.h Normal file
View File

@ -0,0 +1,33 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* sysfile.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_SYSFILE_H
#define OCFS2_SYSFILE_H
struct inode * ocfs2_get_system_file_inode(struct ocfs2_super *osb,
int type,
u32 slot);
#endif /* OCFS2_SYSFILE_H */

544
fs/ocfs2/uptodate.c Normal file
View File

@ -0,0 +1,544 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* uptodate.c
*
* Tracking the up-to-date-ness of a local buffer_head with respect to
* the cluster.
*
* Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
* Standard buffer head caching flags (uptodate, etc) are insufficient
* in a clustered environment - a buffer may be marked up to date on
* our local node but could have been modified by another cluster
* member. As a result an additional (and performant) caching scheme
* is required. A further requirement is that we consume as little
* memory as possible - we never pin buffer_head structures in order
* to cache them.
*
* We track the existence of up to date buffers on the inodes which
* are associated with them. Because we don't want to pin
* buffer_heads, this is only a (strong) hint and several other checks
* are made in the I/O path to ensure that we don't use a stale or
* invalid buffer without going to disk:
* - buffer_jbd is used liberally - if a bh is in the journal on
* this node then it *must* be up to date.
* - the standard buffer_uptodate() macro is used to detect buffers
* which may be invalid (even if we have an up to date tracking
* item for them)
*
* For a full understanding of how this code works together, one
* should read the callers in dlmglue.c, the I/O functions in
* buffer_head_io.c and ocfs2_journal_access in journal.c
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/buffer_head.h>
#include <linux/rbtree.h>
#include <linux/jbd.h>
#define MLOG_MASK_PREFIX ML_UPTODATE
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "inode.h"
#include "uptodate.h"
struct ocfs2_meta_cache_item {
struct rb_node c_node;
sector_t c_block;
};
static kmem_cache_t *ocfs2_uptodate_cachep = NULL;
void ocfs2_metadata_cache_init(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
oi->ip_flags |= OCFS2_INODE_CACHE_INLINE;
ci->ci_num_cached = 0;
}
/* No lock taken here as 'root' is not expected to be visible to other
* processes. */
static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
{
unsigned int purged = 0;
struct rb_node *node;
struct ocfs2_meta_cache_item *item;
while ((node = rb_last(root)) != NULL) {
item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);
mlog(0, "Purge item %llu\n",
(unsigned long long) item->c_block);
rb_erase(&item->c_node, root);
kmem_cache_free(ocfs2_uptodate_cachep, item);
purged++;
}
return purged;
}
/* Called from locking and called from ocfs2_clear_inode. Dump the
* cache for a given inode.
*
* This function is a few more lines longer than necessary due to some
* accounting done here, but I think it's worth tracking down those
* bugs sooner -- Mark */
void ocfs2_metadata_cache_purge(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
unsigned int tree, to_purge, purged;
struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
struct rb_root root = RB_ROOT;
spin_lock(&oi->ip_lock);
tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
to_purge = ci->ci_num_cached;
mlog(0, "Purge %u %s items from Inode %"MLFu64"\n", to_purge,
tree ? "array" : "tree", oi->ip_blkno);
/* If we're a tree, save off the root so that we can safely
* initialize the cache. We do the work to free tree members
* without the spinlock. */
if (tree)
root = ci->ci_cache.ci_tree;
ocfs2_metadata_cache_init(inode);
spin_unlock(&oi->ip_lock);
purged = ocfs2_purge_copied_metadata_tree(&root);
/* If possible, track the number wiped so that we can more
* easily detect counting errors. Unfortunately, this is only
* meaningful for trees. */
if (tree && purged != to_purge)
mlog(ML_ERROR, "Inode %"MLFu64", count = %u, purged = %u\n",
oi->ip_blkno, to_purge, purged);
}
/* Returns the index in the cache array, -1 if not found.
* Requires ip_lock. */
static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci,
sector_t item)
{
int i;
for (i = 0; i < ci->ci_num_cached; i++) {
if (item == ci->ci_cache.ci_array[i])
return i;
}
return -1;
}
/* Returns the cache item if found, otherwise NULL.
* Requires ip_lock. */
static struct ocfs2_meta_cache_item *
ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
sector_t block)
{
struct rb_node * n = ci->ci_cache.ci_tree.rb_node;
struct ocfs2_meta_cache_item *item = NULL;
while (n) {
item = rb_entry(n, struct ocfs2_meta_cache_item, c_node);
if (block < item->c_block)
n = n->rb_left;
else if (block > item->c_block)
n = n->rb_right;
else
return item;
}
return NULL;
}
static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
struct buffer_head *bh)
{
int index = -1;
struct ocfs2_meta_cache_item *item = NULL;
spin_lock(&oi->ip_lock);
mlog(0, "Inode %"MLFu64", query block %llu (inline = %u)\n",
oi->ip_blkno, (unsigned long long) bh->b_blocknr,
!!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE));
if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE)
index = ocfs2_search_cache_array(&oi->ip_metadata_cache,
bh->b_blocknr);
else
item = ocfs2_search_cache_tree(&oi->ip_metadata_cache,
bh->b_blocknr);
spin_unlock(&oi->ip_lock);
mlog(0, "index = %d, item = %p\n", index, item);
return (index != -1) || (item != NULL);
}
/* Warning: even if it returns true, this does *not* guarantee that
* the block is stored in our inode metadata cache. */
int ocfs2_buffer_uptodate(struct inode *inode,
struct buffer_head *bh)
{
/* Doesn't matter if the bh is in our cache or not -- if it's
* not marked uptodate then we know it can't have correct
* data. */
if (!buffer_uptodate(bh))
return 0;
/* OCFS2 does not allow multiple nodes to be changing the same
* block at the same time. */
if (buffer_jbd(bh))
return 1;
/* Ok, locally the buffer is marked as up to date, now search
* our cache to see if we can trust that. */
return ocfs2_buffer_cached(OCFS2_I(inode), bh);
}
/* Requires ip_lock */
static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
sector_t block)
{
BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY);
mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
ci->ci_num_cached);
ci->ci_cache.ci_array[ci->ci_num_cached] = block;
ci->ci_num_cached++;
}
/* By now the caller should have checked that the item does *not*
* exist in the tree.
* Requires ip_lock. */
static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
struct ocfs2_meta_cache_item *new)
{
sector_t block = new->c_block;
struct rb_node *parent = NULL;
struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
struct ocfs2_meta_cache_item *tmp;
mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block,
ci->ci_num_cached);
while(*p) {
parent = *p;
tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node);
if (block < tmp->c_block)
p = &(*p)->rb_left;
else if (block > tmp->c_block)
p = &(*p)->rb_right;
else {
/* This should never happen! */
mlog(ML_ERROR, "Duplicate block %llu cached!\n",
(unsigned long long) block);
BUG();
}
}
rb_link_node(&new->c_node, parent, p);
rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree);
ci->ci_num_cached++;
}
static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi,
struct ocfs2_caching_info *ci)
{
assert_spin_locked(&oi->ip_lock);
return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) &&
(ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY);
}
/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the
* pointers in tree after we use them - this allows caller to detect
* when to free in case of error. */
static void ocfs2_expand_cache(struct ocfs2_inode_info *oi,
struct ocfs2_meta_cache_item **tree)
{
int i;
struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY,
"Inode %"MLFu64", num cached = %u, should be %u\n",
oi->ip_blkno, ci->ci_num_cached,
OCFS2_INODE_MAX_CACHE_ARRAY);
mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
"Inode %"MLFu64" not marked as inline anymore!\n",
oi->ip_blkno);
assert_spin_locked(&oi->ip_lock);
/* Be careful to initialize the tree members *first* because
* once the ci_tree is used, the array is junk... */
for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
tree[i]->c_block = ci->ci_cache.ci_array[i];
oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE;
ci->ci_cache.ci_tree = RB_ROOT;
/* this will be set again by __ocfs2_insert_cache_tree */
ci->ci_num_cached = 0;
for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
__ocfs2_insert_cache_tree(ci, tree[i]);
tree[i] = NULL;
}
mlog(0, "Expanded %"MLFu64" to a tree cache: flags 0x%x, num = %u\n",
oi->ip_blkno, oi->ip_flags, ci->ci_num_cached);
}
/* Slow path function - memory allocation is necessary. See the
* comment above ocfs2_set_buffer_uptodate for more information. */
static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
sector_t block,
int expand_tree)
{
int i;
struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
struct ocfs2_meta_cache_item *new = NULL;
struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] =
{ NULL, };
mlog(0, "Inode %"MLFu64", block %llu, expand = %d\n",
oi->ip_blkno, (unsigned long long) block, expand_tree);
new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_KERNEL);
if (!new) {
mlog_errno(-ENOMEM);
return;
}
new->c_block = block;
if (expand_tree) {
/* Do *not* allocate an array here - the removal code
* has no way of tracking that. */
for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
GFP_KERNEL);
if (!tree[i]) {
mlog_errno(-ENOMEM);
goto out_free;
}
/* These are initialized in ocfs2_expand_cache! */
}
}
spin_lock(&oi->ip_lock);
if (ocfs2_insert_can_use_array(oi, ci)) {
mlog(0, "Someone cleared the tree underneath us\n");
/* Ok, items were removed from the cache in between
* locks. Detect this and revert back to the fast path */
ocfs2_append_cache_array(ci, block);
spin_unlock(&oi->ip_lock);
goto out_free;
}
if (expand_tree)
ocfs2_expand_cache(oi, tree);
__ocfs2_insert_cache_tree(ci, new);
spin_unlock(&oi->ip_lock);
new = NULL;
out_free:
if (new)
kmem_cache_free(ocfs2_uptodate_cachep, new);
/* If these were used, then ocfs2_expand_cache re-set them to
* NULL for us. */
if (tree[0]) {
for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
if (tree[i])
kmem_cache_free(ocfs2_uptodate_cachep,
tree[i]);
}
}
/* Item insertion is guarded by ip_io_sem, so the insertion path takes
* advantage of this by not rechecking for a duplicate insert during
* the slow case. Additionally, if the cache needs to be bumped up to
* a tree, the code will not recheck after acquiring the lock --
* multiple paths cannot be expanding to a tree at the same time.
*
* The slow path takes into account that items can be removed
* (including the whole tree wiped and reset) when this process it out
* allocating memory. In those cases, it reverts back to the fast
* path.
*
* Note that this function may actually fail to insert the block if
* memory cannot be allocated. This is not fatal however (but may
* result in a performance penalty) */
void ocfs2_set_buffer_uptodate(struct inode *inode,
struct buffer_head *bh)
{
int expand;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
/* The block may very well exist in our cache already, so avoid
* doing any more work in that case. */
if (ocfs2_buffer_cached(oi, bh))
return;
mlog(0, "Inode %"MLFu64", inserting block %llu\n", oi->ip_blkno,
(unsigned long long) bh->b_blocknr);
/* No need to recheck under spinlock - insertion is guarded by
* ip_io_sem */
spin_lock(&oi->ip_lock);
if (ocfs2_insert_can_use_array(oi, ci)) {
/* Fast case - it's an array and there's a free
* spot. */
ocfs2_append_cache_array(ci, bh->b_blocknr);
spin_unlock(&oi->ip_lock);
return;
}
expand = 0;
if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
/* We need to bump things up to a tree. */
expand = 1;
}
spin_unlock(&oi->ip_lock);
__ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand);
}
/* Called against a newly allocated buffer. Most likely nobody should
* be able to read this sort of metadata while it's still being
* allocated, but this is careful to take ip_io_sem anyway. */
void ocfs2_set_new_buffer_uptodate(struct inode *inode,
struct buffer_head *bh)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
/* This should definitely *not* exist in our cache */
BUG_ON(ocfs2_buffer_cached(oi, bh));
set_buffer_uptodate(bh);
down(&oi->ip_io_sem);
ocfs2_set_buffer_uptodate(inode, bh);
up(&oi->ip_io_sem);
}
/* Requires ip_lock. */
static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
int index)
{
sector_t *array = ci->ci_cache.ci_array;
int bytes;
BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY);
BUG_ON(index >= ci->ci_num_cached);
BUG_ON(!ci->ci_num_cached);
mlog(0, "remove index %d (num_cached = %u\n", index,
ci->ci_num_cached);
ci->ci_num_cached--;
/* don't need to copy if the array is now empty, or if we
* removed at the tail */
if (ci->ci_num_cached && index < ci->ci_num_cached) {
bytes = sizeof(sector_t) * (ci->ci_num_cached - index);
memmove(&array[index], &array[index + 1], bytes);
}
}
/* Requires ip_lock. */
static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
struct ocfs2_meta_cache_item *item)
{
mlog(0, "remove block %llu from tree\n",
(unsigned long long) item->c_block);
rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
ci->ci_num_cached--;
}
/* Called when we remove a chunk of metadata from an inode. We don't
* bother reverting things to an inlined array in the case of a remove
* which moves us back under the limit. */
void ocfs2_remove_from_cache(struct inode *inode,
struct buffer_head *bh)
{
int index;
sector_t block = bh->b_blocknr;
struct ocfs2_meta_cache_item *item = NULL;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
spin_lock(&oi->ip_lock);
mlog(0, "Inode %"MLFu64", remove %llu, items = %u, array = %u\n",
oi->ip_blkno, (unsigned long long) block, ci->ci_num_cached,
oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
index = ocfs2_search_cache_array(ci, block);
if (index != -1)
ocfs2_remove_metadata_array(ci, index);
} else {
item = ocfs2_search_cache_tree(ci, block);
if (item)
ocfs2_remove_metadata_tree(ci, item);
}
spin_unlock(&oi->ip_lock);
if (item)
kmem_cache_free(ocfs2_uptodate_cachep, item);
}
int __init init_ocfs2_uptodate_cache(void)
{
ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
sizeof(struct ocfs2_meta_cache_item),
0, SLAB_HWCACHE_ALIGN, NULL, NULL);
if (!ocfs2_uptodate_cachep)
return -ENOMEM;
mlog(0, "%u inlined cache items per inode.\n",
OCFS2_INODE_MAX_CACHE_ARRAY);
return 0;
}
void __exit exit_ocfs2_uptodate_cache(void)
{
if (ocfs2_uptodate_cachep)
kmem_cache_destroy(ocfs2_uptodate_cachep);
}

44
fs/ocfs2/uptodate.h Normal file
View File

@ -0,0 +1,44 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* uptodate.h
*
* Cluster uptodate tracking
*
* Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_UPTODATE_H
#define OCFS2_UPTODATE_H
int __init init_ocfs2_uptodate_cache(void);
void __exit exit_ocfs2_uptodate_cache(void);
void ocfs2_metadata_cache_init(struct inode *inode);
void ocfs2_metadata_cache_purge(struct inode *inode);
int ocfs2_buffer_uptodate(struct inode *inode,
struct buffer_head *bh);
void ocfs2_set_buffer_uptodate(struct inode *inode,
struct buffer_head *bh);
void ocfs2_set_new_buffer_uptodate(struct inode *inode,
struct buffer_head *bh);
void ocfs2_remove_from_cache(struct inode *inode,
struct buffer_head *bh);
#endif /* OCFS2_UPTODATE_H */

43
fs/ocfs2/ver.c Normal file
View File

@ -0,0 +1,43 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* ver.c
*
* version string
*
* Copyright (C) 2002, 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/module.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include "ver.h"
#define OCFS2_BUILD_VERSION "1.3.3"
#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
void ocfs2_print_version(void)
{
printk(KERN_INFO "%s\n", VERSION_STR);
}
MODULE_DESCRIPTION(VERSION_STR);
MODULE_VERSION(OCFS2_BUILD_VERSION);

31
fs/ocfs2/ver.h Normal file
View File

@ -0,0 +1,31 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* ver.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_VER_H
#define OCFS2_VER_H
void ocfs2_print_version(void);
#endif /* OCFS2_VER_H */

1202
fs/ocfs2/vote.c Normal file

File diff suppressed because it is too large Load Diff

56
fs/ocfs2/vote.h Normal file
View File

@ -0,0 +1,56 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* vote.h
*
* description here
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef VOTE_H
#define VOTE_H
int ocfs2_vote_thread(void *arg);
static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
{
spin_lock(&osb->vote_task_lock);
/* make sure the voting thread gets a swipe at whatever changes
* the caller may have made to the voting state */
osb->vote_wake_sequence++;
spin_unlock(&osb->vote_task_lock);
wake_up(&osb->vote_event);
}
int ocfs2_request_delete_vote(struct inode *inode);
int ocfs2_request_unlink_vote(struct inode *inode,
struct dentry *dentry,
unsigned int nlink);
int ocfs2_request_rename_vote(struct inode *inode,
struct dentry *dentry);
int ocfs2_request_mount_vote(struct ocfs2_super *osb);
int ocfs2_request_umount_vote(struct ocfs2_super *osb);
int ocfs2_register_net_handlers(struct ocfs2_super *osb);
void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
void ocfs2_mark_inode_remotely_deleted(struct inode *inode);
void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
int node_num);
#endif